# Introduction
## Data
* The input data consists of files stored in folders, each folder being named after a class label.
* The textual data are emails written from one journalist to another or to their source, regarding a story.
* Emails can be seen labelled under multiple classes which would mislead the model.

# Methodology
* To clean our data and obtain some meaningful insights, we first need to make sure our data is properly labelled and stored
* To do so:
    1. Read the text from each file and while doing so, make sure we are not reading duplicate data.
    2. Load the textual data into DataFrame for easier analysis.
    3. Clean the raw text by:
        * Removing special characters, punctuation, pronouns and stopwords.
        * Tokenizing each data point, i.e. segmenting the text into sentences and then into words.
        * Normalize the text by converting it into lower case.
        * Extract the lemma for each word. Ex: Lemma(swimming) -> swim.

In [None]:
#------------------------------------------Libraries---------------------------------------------------------------#
####################################################################################################################
#-------------------------------------Boiler Plate Imports---------------------------------------------------------#
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
#---------------------------------------Text Processing------------------------------------------------------------#
import regex
from wordcloud import WordCloud
from nltk.corpus import stopwords 
from nltk.tokenize import WordPunctTokenizer
from string import punctuation
from nltk.stem import WordNetLemmatizer
#####################################################################################################################

## Read Data from the '.txt' files

In [None]:
base = '/kaggle/input/topic-modelling-on-emails/Data/'
# Every entry directly under `base` that is not a regular file is taken to be
# a class folder; its name is used as the class label.
with os.scandir(base) as entries:
    names = [entry.name for entry in entries if not entry.is_file()]
names

In [None]:
# Sort the class names in place so the folders are always processed in the
# same (alphabetical) order.
names.sort()

In [None]:
unique = []
# Map every class name to the alphabetically sorted list of file names found
# directly inside that class folder (sub-directories are ignored).
files = {}
for name in names:
    path = base + name+'/'
    with os.scandir(path) as entries:
        files[name] = sorted(entry.name for entry in entries if entry.is_file())

In [None]:
# Print the number of files currently listed under each class.
for label, members in files.items():
    print(label, len(members))

* We now know how many files are labelled under each class. Our job now is to remove the data points that are labelled under other classes. Ex: 14147.txt is labelled under 'Crime', 'Entertainment' and 'Science', so we will be removing the entry from 'Entertainment' and 'Science'.
* The risk here is that we might have removed the entry from a correctly labelled class. Ex: 14147.txt may be labelled as 'Science' initially and was repeated in other classes, by removing it from 'Science' we are mislabelling the data as 'Crime'.
* However, 'Science' already contains the largest number of entries, so the model can still be trained easily for that class. Hence, this approach should not materially affect the analysis.

In [None]:
# Display the class names in their current order.
names

In [None]:
# Keep every file name only in the first class (in `names` order) that lists
# it; any later occurrence is treated as a cross-class duplicate and dropped.
# A set of already-claimed names makes each membership test constant-time,
# avoiding repeated scans of the later classes' lists for every file.
claimed = set()
for name in names:
    # The set is updated only after the whole class has been filtered, so the
    # contents of a single class are never compared against each other.
    files[name] = [fname for fname in files[name] if fname not in claimed]
    claimed.update(files[name])

In [None]:
# Print the per-class file counts again to see the effect of the clean-up.
for label, members in files.items():
    print(label, len(members))

* From the above result it is clearly implied that the class 'Entertainment' had no data unique to its class. 
* By not considering it, we are also eliminating any variance that can be caused by the duplicate data.

In [None]:
# Read every listed file into one [text, class] row.
rows = []
for genre, texts in files.items():
    for text in texts:
        path = base + genre + '/' + text
        # latin1 assigns a character to every byte value, so decoding cannot
        # fail on stray non-UTF-8 bytes.
        with open(path, "r", encoding = "latin1") as file:
            # Lines keep their trailing newline and are joined with a single
            # space.
            rows.append([" ".join(file.readlines()), genre])

# Naming the columns up front also yields a valid (empty) frame when no files
# were found, instead of failing on a later column assignment.
data = pd.DataFrame(rows, columns=['Text', 'Class'])
print(data.shape)
data.head()

In [None]:
# Print the column dtypes and non-null counts of the loaded frame.
data.info()

In [None]:
# Count the missing values in each column.
data.isna().sum()

A few duplicate texts still exist, possibly as a result of poor data management or of the same email being sent multiple times.

In [None]:
# Distinct email bodies, in order of first appearance.
unique = data["Text"].unique().tolist()
len(unique)

In [None]:
# Plain dict view of the frame: column name -> column Series.
dic = dict(data)

In [None]:
# Keep only the first row seen for each distinct text. Tracking the texts
# already emitted in a set gives constant-time lookups and avoids rescanning a
# list for every row.
uni = {}
emitted = set()
for body, label in zip(dic['Text'], dic['Class']):
    if body in emitted:
        continue
    emitted.add(body)
    # Keys are consecutive integers so they can serve as the row index later.
    uni[len(uni)] = [body, label]

In [None]:
# `uni` maps row number -> [text, class]; building a frame from it puts one
# entry per column, so transpose to get one entry per row.
data = pd.DataFrame(uni).transpose()
print(data.shape)
data.columns = ['Text', 'Class']
data.head()

In [None]:
plt.figure(figsize=(10,5))
# Pass the column explicitly as `x=`: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as
# the variable to count.
ax = sns.countplot(x=data.Class, palette = sns.color_palette("mako"))

In [None]:
def make_wordcloud(words,title):
    """Render a word cloud for `words` and show it under `title`.

    Parameters:
        words: a single string of space-separated words to visualise.
        title: text drawn above the image.
    """
    generator = WordCloud(
        width=1920,
        height=1080,
        max_font_size=200,
        max_words=300,
        background_color="white",
    )
    image = generator.generate(words)

    plt.figure(figsize=(20,20))
    plt.imshow(image, interpolation="gaussian")
    # The axes carry no information for an image, so hide them.
    plt.axis("off")
    plt.title(title, fontsize=60)
    plt.show()

Now that we have loaded our data frame, we can move on to the next step, i.e. cleaning the data.

In [None]:
wordnet_lemmatizer = WordNetLemmatizer()

# English stop words plus every ASCII punctuation character.
stop = stopwords.words('english')
stop.extend(punctuation)

# Built once and reused by every filter_text call.
_word_tokenizer = WordPunctTokenizer()
# Raw string: `\p` is not a valid Python string escape. The pattern matches
# any single character that does not belong to the Latin script.
_non_latin = regex.compile(r'\p{^Latin}')

def filter_text(text, stop_words):
    """Return a cleaned, lemmatized version of `text`.

    The text is lower-cased and tokenized; only purely alphabetic tokens
    longer than three characters are kept. Non-Latin characters are then
    stripped from each token, tokens left empty by that step and tokens
    found in `stop_words` are dropped, and the rest are lemmatized as verbs.

    Parameters:
        text: the raw string to clean.
        stop_words: iterable of lower-case tokens to discard.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # Set membership is constant-time; a list would be scanned per token.
    stop_set = set(stop_words)
    word_tokens = _word_tokenizer.tokenize(text.lower())
    # The length test runs before stripping, so it applies to the raw token.
    stripped = (_non_latin.sub('', w) for w in word_tokens if w.isalpha() and len(w) > 3)
    # `w and ...` discards tokens that consisted solely of non-Latin letters,
    # which would otherwise show up as empty words in the result.
    lemmas = [wordnet_lemmatizer.lemmatize(w, pos="v") for w in stripped if w and w not in stop_set]
    return " ".join(lemmas)

In [None]:
# Store the cleaned version of every email next to its raw text.
data["filtered_text"] = data["Text"].map(lambda raw: filter_text(raw, stop))
data.head()

We can now gain some useful insights into the data set by constructing word clouds and finding the term frequencies in each class.

### Crime

In [None]:
# Concatenate the cleaned text of every 'Crime' email into one string.
all_text = " ".join(data.loc[data["Class"] == "Crime", "filtered_text"])
make_wordcloud(all_text, "Crime")

### Top 10 words in the Crime Category

In [None]:
count = pd.DataFrame(all_text.split(), columns = ['words'])
# The ten most frequent words, then only the rows holding one of them.
top_words = count['words'].value_counts().index[:10]
top_10 = count[count['words'].isin(top_words)]
# Compute the frequency table once and use it for both axes.
word_freq = top_10['words'].value_counts()
plt.figure(figsize=(10,5))
sns.barplot(x = word_freq.index, y = word_freq, palette = sns.color_palette("mako"))

### Politics

In [None]:
# Concatenate the cleaned text of every 'Politics' email into one string.
all_text = " ".join(data.loc[data["Class"] == "Politics", "filtered_text"])
make_wordcloud(all_text, "Politics")

### Top 10 words in the Politics Category

In [None]:
count = pd.DataFrame(all_text.split(), columns = ['words'])
# The ten most frequent words, then only the rows holding one of them.
top_words = count['words'].value_counts().index[:10]
top_10 = count[count['words'].isin(top_words)]
# Compute the frequency table once and use it for both axes.
word_freq = top_10['words'].value_counts()
plt.figure(figsize=(10,5))
sns.barplot(x = word_freq.index, y = word_freq, palette = sns.color_palette("mako"))

### Science

In [None]:
# Concatenate the cleaned text of every 'Science' email into one string.
all_text = " ".join(data.loc[data["Class"] == "Science", "filtered_text"])
make_wordcloud(all_text, "Science")

### Top 10 words in the Science Category

In [None]:
count = pd.DataFrame(all_text.split(), columns = ['words'])
# The ten most frequent words, then only the rows holding one of them.
top_words = count['words'].value_counts().index[:10]
top_10 = count[count['words'].isin(top_words)]
# Compute the frequency table once and use it for both axes.
word_freq = top_10['words'].value_counts()
plt.figure(figsize=(10,5))
sns.barplot(x = word_freq.index, y = word_freq, palette = sns.color_palette("mako"))

In [None]:
# Number of rows per class before any resampling.
data['Class'].value_counts()

# Oversampling The Data

In [None]:
# Oversample so that every class contributes the same number of rows. Rows
# are drawn with replacement, so smaller classes will contain repeated emails.
# NOTE(review): 1095 is presumably the size of the largest class — confirm
# against the value_counts() output.
# NOTE(review): resampling happens before train_test_split, so copies of one
# email can end up in both the training and the test set.
SAMPLES_PER_CLASS = 1095
# Sampling each group's full sub-frame (instead of groupby.apply) keeps the
# 'Class' column regardless of how pandas treats grouping columns in apply;
# the fixed seed makes the resampled frame reproducible.
data = pd.concat(
    [
        group.sample(SAMPLES_PER_CLASS, replace=True, random_state=196)
        for _, group in data.groupby('Class')
    ]
)

In [None]:
plt.figure(figsize=(10,5))
# Pass the column explicitly as `x=`: recent seaborn releases treat the first
# positional argument as `data`, so a positional Series is no longer read as
# the variable to count.
ax = sns.countplot(x=data.Class, palette = sns.color_palette("mako"))

# XLNET

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sns
import transformers

import nltk
import re


from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

# The bundled seaborn style sheets were renamed with a "seaborn-v0_8" prefix in
# newer matplotlib releases; use the new name when it exists and fall back to
# the old one otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Report the TensorFlow version and the GPUs it can see.
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

In [None]:
from transformers import TFXLNetModel, XLNetTokenizer

In [None]:
# Name of the pretrained checkpoint; the same name is later passed to
# create_xlnet so tokenizer and model come from one checkpoint.
xlnet_model = 'xlnet-large-cased'
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_model)

In [None]:
# Width of the model's output layer (read by create_xlnet).
# NOTE(review): the targets come from a LabelEncoder over data['Class'], i.e.
# one integer code per class. A single sigmoid unit trained with
# binary_crossentropy can only separate two classes, so with more than two
# classes this needs a multi-class head (one unit per class, softmax,
# sparse_categorical_crossentropy) and argmax-based prediction analysis.
# Changing this value alone is not enough — the loss and the analysis cells
# must change with it.
number_of_classes = 1 #len(names)
number_of_classes

In [None]:
def create_xlnet(mname, max_len=120, num_classes=None):
    """Build and compile an XLNet classifier.

    The model is the pretrained XLNet main block followed by a small
    classification head (dropout + dense layer).

    Parameters:
        mname: name or path of the pretrained XLNet checkpoint.
        max_len: number of token ids per example the model expects. The
            arrays passed to fit/predict should be tokenized to this length.
        num_classes: width of the output layer. Defaults to the module-level
            `number_of_classes`. With 1 the head is a single sigmoid unit
            trained with binary cross-entropy; with more it is a softmax
            trained with sparse categorical cross-entropy on integer labels.

    Returns:
        The compiled tf.keras.Model.
    """
    if num_classes is None:
        num_classes = number_of_classes

    # Define token ids as inputs
    word_inputs = tf.keras.Input(shape=(max_len,), name='word_inputs', dtype='int32')

    # Call XLNet model; element 0 of its output is the last hidden state
    xlnet = TFXLNetModel.from_pretrained(mname)
    xlnet_encodings = xlnet(word_inputs)[0]

    # CLASSIFICATION HEAD
    # Collect the last step of the last hidden state. XLNet places its
    # classification token at the end of the sequence, so this position is
    # presumably the CLS summary — confirm the tokenizer pads on the left.
    doc_encoding = tf.squeeze(xlnet_encodings[:, -1:, :], axis=1)
    # Apply dropout for regularization
    doc_encoding = tf.keras.layers.Dropout(.1)(doc_encoding)

    if num_classes == 1:
        activation = 'sigmoid'
        loss = 'binary_crossentropy'
        metrics = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    else:
        activation = 'softmax'
        loss = 'sparse_categorical_crossentropy'
        # Precision/Recall compare thresholded scores with binary targets and
        # do not fit integer class labels, so only accuracy is tracked here.
        metrics = ['accuracy']

    # Final output
    outputs = tf.keras.layers.Dense(num_classes, activation=activation, name='outputs')(doc_encoding)

    # Compile model. `learning_rate` is the supported keyword; `lr` is a
    # deprecated alias.
    model = tf.keras.Model(inputs=[word_inputs], outputs=[outputs])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5), loss=loss, metrics=metrics)

    return model

In [None]:
# Build and compile the classifier from the chosen pretrained checkpoint.
xlnet = create_xlnet(xlnet_model)

In [None]:
# Print the layer-by-layer summary of the compiled model.
xlnet.summary()

In [None]:
from sklearn import preprocessing

# Learn the class-name -> integer-code mapping from the class column, then
# encode every row's class with it.
le = preprocessing.LabelEncoder().fit(data['Class'])
y = le.transform(data['Class'])

In [None]:
labels = data['Class']
text = data['filtered_text']

# Hold out 15% of the rows for testing; the fixed seed keeps the split stable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    y,
    test_size=0.15,
    random_state=196,
)

In [None]:
def get_inputs(text, tokenizer, max_len=512):
    """Encode every string in `text` and return the encodings as arrays.

    Parameters:
        text: iterable of strings to encode.
        tokenizer: object whose `encode_plus` returns a mapping with
            'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: value forwarded to `encode_plus` as `max_length`.

    Returns:
        A tuple (input ids, attention masks, token type ids), each a numpy
        array with one row per input string.
    """
    encoded = []
    for sample in text:
        encoded.append(
            tokenizer.encode_plus(
                sample,
                max_length=max_len,
                pad_to_max_length=True,
                add_special_tokens=True,
            )
        )

    def _stack(field):
        # One row per encoded sample for the requested field.
        return np.array([item[field] for item in encoded])

    return _stack('input_ids'), _stack('attention_mask'), _stack('token_type_ids')

In [None]:
def warmup(epoch, lr):
    """Learning-rate schedule: raise `lr` by 1e-6, never going below 2e-5.

    Parameters:
        epoch: index of the epoch about to start (not used).
        lr: the current learning rate.

    Returns:
        `lr + 1e-6`, or 2e-5 when that sum is smaller than 2e-5.
    """
    floor = 2e-5
    bumped = lr + 1e-6
    return floor if floor > bumped else bumped

def plot_metrics(pred, true_labels):
    """Draw a ROC curve whose title reports the AUC and the accuracy.

    Parameters:
        pred: numpy array of predicted scores; values >= 0.5 count as class 1
            when computing the accuracy.
        true_labels: the ground-truth labels.

    Returns:
        The matplotlib figure holding the plot.
    """
    hard_labels = (pred.flatten() >= .5).astype('int')
    acc = accuracy_score(true_labels, hard_labels)
    fpr, tpr, _ = roc_curve(true_labels, pred)
    auc = roc_auc_score(true_labels, pred)

    fig, ax = plt.subplots(1, figsize=(8,8))
    ax.plot(fpr, tpr, color='red')
    # Diagonal reference line of a random classifier.
    ax.plot([0,1], [0,1], color='black', linestyle='--')
    ax.set_title(f"AUC: {auc}\nACC: {acc}")
    return fig

In [None]:
# Tokenize the training texts; only `inp_tok` is passed to the model later.
# NOTE(review): get_inputs defaults to max_len=512, while create_xlnet declares
# a 120-token input — confirm which length is intended and make them agree.
inp_tok, ids, segments = get_inputs(X_train, xlnet_tokenizer)

In [None]:
# Callbacks handed to fit():
#   * EarlyStopping monitors val_accuracy (patience 4, min_delta 0.02) and is
#     configured to restore the best weights.
#   * LearningRateScheduler applies warmup() to the learning rate.
#   * ReduceLROnPlateau monitors val_accuracy (patience 2) with min_lr=1e-6.
# NOTE(review): factor=1e-6 scales the learning rate by a millionth, which
# would drop it straight to min_lr — confirm this is not meant to be ~0.1.
# NOTE(review): warmup() never returns less than 2e-5, so it presumably undoes
# any reduction on the following epoch — confirm the two are meant to coexist.
# NOTE(review): patience=4 with a 4-epoch fit looks like it can never trigger
# early stopping — confirm.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=4, min_delta=0.02, restore_best_weights=True),
    tf.keras.callbacks.LearningRateScheduler(warmup, verbose=0),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=1e-6, patience=2, verbose=0, mode='auto', min_delta=0.001, cooldown=0, min_lr=1e-6)
]

In [None]:
# Fine-tune for 4 epochs in batches of 2, holding out 15% of the training
# rows for validation; `hist` keeps the returned training history.
hist = xlnet.fit(x=inp_tok, y=y_train, epochs=4, batch_size=2, validation_split=.15, callbacks=callbacks)

# Testing

In [None]:
# Tokenize the held-out texts; this overwrites the training arrays.
# NOTE(review): get_inputs defaults to max_len=512, while create_xlnet declares
# a 120-token input — confirm which length is intended and make them agree.
inp_tok, ids, segments = get_inputs(X_test, xlnet_tokenizer)

In [None]:
# Score the held-out texts with the fine-tuned model.
preds = xlnet.predict(inp_tok, verbose=True)

In [None]:
#plot_metrics(preds, y_test);

In [None]:
# One row per held-out text: the text, the raw score and the true label.
pred_analysis_df = pd.DataFrame(
    {
        'tweet': X_test.values,
        'pred': preds.flatten(),
        'real': y_test,
    }
)
# Scores strictly above 0.5 count as class 1.
pred_analysis_df['rounded'] = (pred_analysis_df['pred'] > 0.5).astype('int')
# Rows where the thresholded prediction disagrees with the true label.
diff = pred_analysis_df.loc[pred_analysis_df['real'].ne(pred_analysis_df['rounded'])]

In [None]:
#change to see other examples
idx = 44

# Columns 0, 2 and 3 of `diff` are 'tweet', 'real' and 'rounded'.
row = diff.iloc[idx]
tweet, real, pred = row.iloc[0], row.iloc[2], row.iloc[3]
print(tweet)
print(f"PRED: {pred}")
print(f"REAL: {real}")

In [None]:
# tweets = dataf_test['clean']

# inp_tok, ids, segments = get_inputs(tweets, xlnet_tokenizer)

In [None]:
# Run prediction again on the current `inp_tok`. The re-tokenization just
# before this is commented out, so this presumably recomputes the same
# held-out predictions — confirm it is still needed.
preds = xlnet.predict(inp_tok, verbose=True)

In [None]:
# dataf_test['target'] = preds
# dataf_test['target'] = np.array(dataf_test['target'] >= 0.5, dtype='int')
# dataf_test[['id', 'target']].to_csv('submission.csv', index=False)

In [None]:
# Persist the fine-tuned weights (weights only, not the full model) to disk.
xlnet.save_weights("xlnet.h5")