# Pan nn Text Classification Test
This notebook tries to classify the age group of an author based on a written conversation.

## 1. Import libraries

In [2]:
!pip install torchtext==0.4



In [3]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import re
import os
import time

import torch
import torchtext

from torchtext.datasets import text_classification
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
import nltk
import plotly.express as px
from nltk.stem import WordNetLemmatizer
import plotly.graph_objs as go
nltk.download('wordnet')
nltk.download('stopwords')

from collections import defaultdict, OrderedDict, Counter
import operator

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 2. Load and preprocess the dataframe

In [10]:
# Read the PAN training split from the mounted Google Drive folder.
PAN_TRAIN_CSV = '/content/drive/MyDrive/Language, speech and dialogue processing/Datasets/df_pan_train.csv'
df = pd.read_csv(PAN_TRAIN_CSV)

In [11]:
# Discard the columns that are not used further on in this notebook.
df = df.drop(columns=['Unnamed: 0', 'lang', 'id'])


In [6]:
# Keep the first ten rows as a tiny sample and write it to Drive.
df_sub = df.iloc[:10]
df_sub.to_csv('/content/drive/MyDrive/Language, speech and dialogue processing/sub_PAN.csv')

In [7]:
# Split the frame into one sub-frame per age group, keyed by the group label
# and inserted in sorted label order.
ages = sorted(df["age_group"].unique().tolist())
age_dict = {age: df.loc[df['age_group'] == age] for age in ages}

print(age_dict)

{'10s':        age_group  gender                                               text
20           10s    male  Instantly you will notice people with the purc...
21           10s    male  Too many people throw in the towel early on be...
29           10s  female  he Best Site for Free Money Offers and Competi...
31           10s  female  No make a difference what your good reasons we...
32           10s  female  <strong>;Salam.\n<br />;<br />;okay, first, I ...
...          ...     ...                                                ...
401933       10s    male  You want to fix your PS3 and that's it. Anothe...
401934       10s  female  In recent decades, there are many game systems...
401935       10s  female  Super Mario brothers 3 is another major game w...
401959       10s  female  Men and women through every wander of life are...
401979       10s    male  I LIKE SURFING AND NEW! ADD ME AND WE CAN BECO...

[27428 rows x 3 columns], '20s':        age_group  gender                      

In [8]:


# Balance the dataset: gather the rows of each age group, then down-sample
# every group to the size of the smallest one and stitch them back together.
dataframes_10s = []
dataframes_20s = []
dataframes_30s = []
for key, value in age_dict.items():
  if key == '10s':
    dataframes_10s.append(value)
  elif key == '20s':
    dataframes_20s.append(value)
  elif key == '30s':
    # The bare string in `elif '30s':` was always truthy, so every label that
    # was not '10s'/'20s' ended up in the 30s bucket; compare against the key.
    dataframes_30s.append(value)

# pd.concat returns new frames, so adding the "age" column below does not
# write into slices of the original frame.
df_10s = pd.concat(dataframes_10s)
df_20s = pd.concat(dataframes_20s)
df_30s = pd.concat(dataframes_30s)

all_dataframes = [df_10s, df_20s, df_30s]

# Size of the smallest group; a separate loop name avoids clobbering `df`.
min_len = min(len(frame) for frame in all_dataframes)

# Numeric age code per group.
df_10s["age"] = 0
df_20s["age"] = 1
df_30s["age"] = 2

# NOTE: sample() is called without random_state, so the selected rows differ
# from run to run.
all_dataframes = [df_10s.sample(min_len), df_20s.sample(min_len), df_30s.sample(min_len)]

df = pd.concat(all_dataframes)

Unnamed: 0,age_group,gender,text,encoded_gender
0,20s,female,The utilization of this item of therapy gear i...,0
1,20s,female,"Before, an individual additional learn about t...",0
2,30s,male,The vending device organization is one particu...,1
3,30s,male,National Treasure - three Stars (Excellent)<br...,1
4,20s,female,"Based in Southwest Louisiana, the Law Office o...",0


401993


## 3. Create n_grams from the text (Optional)

In [21]:
# lemmatizing function
def lemmatize(text):
    """Return a list holding the WordNet lemma of every item in `text`.

    `text` is iterated item by item; each item is passed to
    `WordNetLemmatizer.lemmatize` with its default arguments.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, text))

# remove stopwords
def remove_stopwords(text):
    """Return the items of `text` that are not NLTK English stop words.

    Args:
        text: iterable of tokens (strings).

    Returns:
        A new list with the stop words filtered out, order preserved.
    """
    # A frozenset gives constant-time membership tests; the raw NLTK list
    # would be scanned from the start for every single token.
    stopword = frozenset(nltk.corpus.stopwords.words('english'))
    return [word for word in text if word not in stopword]
    
def tokenize(text):
    """Split `text` into whitespace-separated tokens, dropping markup residue.

    Args:
        text: the string to split.

    Returns:
        List of non-empty tokens, excluding the artefact tokens "urllink"
        and "nbsp".
    """
    wrong_words = ("urllink", "nbsp")
    # split() without an argument breaks on any whitespace run (spaces, tabs,
    # newlines) and never yields empty strings. Splitting on " " alone left
    # newlines and tabs glued inside tokens, which also let artefacts such as
    # "urllink\n" slip past the filter below.
    return [token for token in text.split() if token not in wrong_words]

def lowered(s):
    """Return `s` converted to lower case via its own `.lower()` method."""
    lower_cased = s.lower()
    return lower_cased

def remove_nonalph(s):
    """Replace every character that is not an ASCII letter, a digit or
    whitespace with a single space and return the resulting string."""
    return re.sub(r'[^a-zA-Z0-9\s]', ' ', s)

def generate_ngrams(s, n):
    """Return the n-grams of the token sequence `s` as space-joined strings.

    The sequence is zipped against copies of itself shifted by 0..n-1
    positions, so each zipped tuple is one window of `n` consecutive tokens.
    """
    shifted_views = [s[start:] for start in range(n)]
    joined = []
    for window in zip(*shifted_views):
        joined.append(" ".join(window))
    return joined

In [None]:
# Clean every document in three passes: lower-case it, blank out everything
# that is not a letter, digit or whitespace, then split it into tokens.
# Stop-word removal, lemmatisation and n-gram generation are optional extras
# (remove_stopwords / lemmatize / generate_ngrams) and are not applied here.
cleaned_text = df["text"]
for step in (lowered, remove_nonalph, tokenize):
    cleaned_text = cleaned_text.apply(step)
df["text"] = cleaned_text

# Show the first tokenised document, then the whole column.
print(df['text'].iloc[0])
print(df['text'])

In [None]:
# Renumber the rows 0..len(df)-1 in place.
df.index = pd.RangeIndex(len(df))

In [None]:
# Display the frame (the last expression of a cell is rendered by the notebook).
df

In [None]:
# AMOUNT_OF_CATEGORIES = 5
# df = df.assign(age_group=pd.qcut(df['age'], AMOUNT_OF_CATEGORIES, labels=[i for i in range(AMOUNT_OF_CATEGORIES)]))

In [None]:
# hist_trace = go.Histogram(x=df['age_group'])
# go.Figure(hist_trace).show()

## 4. To GPU

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## 5. Initialising Neural Network

In [None]:
class TextSentiment(nn.Module):
    """Bag-of-embeddings classifier: an `nn.EmbeddingBag` followed by one
    linear layer that maps the pooled embedding to `num_class` scores."""

    def __init__(self, vocab_size, embed_dim, num_class):
        """Create the embedding table and the output layer, then initialise them.

        Args:
            vocab_size: number of rows in the embedding table.
            embed_dim: width of each embedding vector.
            num_class: number of output scores per bag.
        """
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Draw embedding and linear weights uniformly from [-0.5, 0.5] and
        set the linear bias to zero."""
        bound = 0.5
        # Embedding first, then the linear layer, so the random draws happen
        # in that order.
        for weights in (self.embedding.weight, self.fc.weight):
            weights.data.uniform_(-bound, bound)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Pool the embeddings of each bag and return the class scores.

        Args:
            text: 1-D tensor with the token ids of all bags concatenated.
            offsets: 1-D tensor with the start position of each bag in `text`.
        """
        pooled = self.embedding(text, offsets)
        scores = self.fc(pooled)
        return scores

## 6. Classes preprocessing
Counting the number of classes and mapping each class to its own unique number.

In [None]:
class_choice = 'age_group'

# Count how many rows each class has.
classdict = Counter(df[class_choice])
print(sorted(classdict))

# Order the classes by label rather than by frequency. The class -> index
# mapping is derived from the key order of this dict, so a frequency sort made
# the indices depend on the class counts (and, for ties, on first appearance).
# That is why switching the sort changed the results: the confusion-matrix
# cells label indices 0, 1, 2 as '10s', '20s', '30s', which only holds when
# the keys are in label order.
classdict = dict(sorted(classdict.items()))
print('classdict:')
print(classdict)

# Bar chart of the number of rows per age group.
fig = go.Figure([go.Bar(x=('10s', '20s', '30s'), y=[classdict['10s'], classdict['20s'], classdict['30s']])])
fig.show()

In [None]:
# class to number mapping
classlist = list(classdict.keys())
classmap = dict([(y,x) for x,y in enumerate(classlist)])
print('classmap:')
print(classmap)
print('amount of classes')
print(len(classdict))

## 7. Vocabulary dictionary
Making a dict of all the words in the dataset and mapping each unique word to a unique number

In [None]:
# Count every lower-cased token over all documents. most_common() orders the
# pairs by descending count and keeps first-seen order among equal counts.
token_counts = Counter(n_gram.lower() for row in df['text'] for n_gram in row)
vocabdict = dict(token_counts.most_common())
print(vocabdict)

In [None]:
# WORD TO NUMBER MAPPING
l = list(vocabdict.keys())
wordmap = dict([(y,x) for x,y in enumerate(l)])
print(wordmap)

## 8. Mapping text column to numbers (tensor)

In [None]:
# Keep an untouched copy of the token lists before they are overwritten.
df_copy = df.copy()

# Replace each token list by a 1-D int64 tensor of vocabulary ids.
# df.at writes one cell of the frame directly. The former chained form
# `df['text'][i] = ...` assigns through a temporary Series, which triggers
# SettingWithCopyWarning and is silently ignored under pandas copy-on-write.
# Iterating over the real index labels also removes the assumption that the
# index is exactly 0..n-1 (labels are still expected to be unique).
for row_label, tokens in list(df['text'].items()):
  token_ids = [wordmap[token.lower()] for token in tokens]
  df.at[row_label, 'text'] = torch.tensor(token_ids, dtype=torch.int64)

In [None]:
# Pair every document with the numeric index of its age group:
# one (class index, text) tuple per row, in row order.
train_dataset = [
  (classmap[age_group], text)
  for age_group, text in zip(df['age_group'], df['text'])
]

As you can see below, the train dataset is now a list of tuples (age-group class index, text tensor).

In [None]:
# Inspect the first sample: a (class index, text) tuple.
print(train_dataset[0])

## 9. Run neural network

In [None]:
# Hyper-parameters; the vocabulary and class counts come from the dictionaries
# built in the previous sections.
BATCH_SIZE = 16
embed_dim = 32
vocab = len(vocabdict)
n_classes = len(classdict)

# Build the classifier and move it to the selected device.
model = TextSentiment(vocab, embed_dim, n_classes)
model = model.to(device)

In [None]:
def generate_batch(batch):
    """Collate (label, text tensor) samples into EmbeddingBag-style inputs.

    Args:
        batch: sequence of samples; item 0 is the label, item 1 a 1-D tensor.

    Returns:
        (text, offsets, label): all text tensors concatenated, the start
        position of each sample inside that concatenation, and the labels.
    """
    labels = torch.tensor([sample[0] for sample in batch])
    documents = [sample[1] for sample in batch]
    lengths = [len(document) for document in documents]
    # Start positions are the running sum of the preceding lengths:
    # prepend 0, drop the last length, then cumsum.
    starts = torch.tensor(([0] + lengths)[:-1]).cumsum(dim=0)
    flat_text = torch.cat(documents)
    return flat_text, starts, labels

### 9.1 Confusion matrix

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# confusion matrix plot function
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues,
                          save_path='/content/drive/MyDrive/Language, speech and dialogue processing/PAN_nn_confusion_matrix.png'):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Args:
        cm: 2-D numpy array; rows are true labels, columns predicted labels.
        classes: tick labels, one per row/column of `cm`.
        normalize: when True, every row is divided by its row sum first.
        title: figure title.
        cmap: matplotlib colormap used for the heat map.
        save_path: file the figure is written to; defaults to the Drive
            location this function has always used. Pass None to skip saving,
            or a different path to keep several plots side by side (the
            default is overwritten on every call).
    """
    import itertools
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Heat map with one tick per class on both axes.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write each cell's value on top of it; switch to white text on cells
    # above half of the maximum so the number stays readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    if save_path is not None:
        plt.savefig(save_path)
    

In [None]:
def train_func(sub_train_):
    """Run one training epoch over `sub_train_` and step the LR scheduler.

    Uses the module-level `model`, `criterion`, `optimizer`, `scheduler`,
    `device`, `BATCH_SIZE` and `generate_batch`.

    Returns:
        (loss, accuracy): the sum of the per-batch criterion values divided
        by the number of samples, and the fraction of samples whose arg-max
        prediction equals the label.
    """
    loader = DataLoader(sub_train_, batch_size=BATCH_SIZE, shuffle=True,
                        collate_fn=generate_batch)
    running_loss = 0
    n_correct = 0
    for text, offsets, cls in loader:
        text = text.to(device)
        offsets = offsets.to(device)
        cls = cls.to(device)

        optimizer.zero_grad()
        logits = model(text, offsets)
        batch_loss = criterion(logits, cls)
        batch_loss.backward()
        optimizer.step()

        # Statistics use the logits computed before this batch's update.
        running_loss += batch_loss.item()
        n_correct += (logits.argmax(1) == cls).sum().item()

    # Adjust the learning rate once per epoch.
    scheduler.step()

    n_samples = len(sub_train_)
    return running_loss / n_samples, n_correct / n_samples

def test(data_, last_epoch=False):
    """Evaluate the module-level `model` on `data_` without gradient tracking.

    Args:
        data_: dataset of (label, text tensor) samples.
        last_epoch: when falsy, return the metrics; when truthy, return the
            raw predictions and labels instead (used for the confusion matrix).

    Returns:
        (loss, accuracy) — the sum of the per-batch criterion values divided
        by the number of samples, and the fraction of correct arg-max
        predictions; or (predictions, labels) — two lists of 0-d CPU tensors.
    """
    total_loss = 0
    acc = 0
    predictions = []
    labels = []
    data = DataLoader(data_, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, offsets, cls in data:
        text, offsets, cls = text.to(device), offsets.to(device), cls.to(device)
        with torch.no_grad():
            output = model(text, offsets)
            # Keep the batch loss in its own name. Re-using `loss` for both
            # the batch value and the running total overwrote the total every
            # iteration, so only twice the last batch's loss was reported.
            batch_loss = criterion(output, cls)
        total_loss += batch_loss.item()
        pred = output.argmax(1)
        acc += (pred == cls).sum().item()
        predictions.extend(pred.to('cpu'))
        labels.extend(cls.to('cpu'))

    if last_epoch:
        return predictions, labels
    return total_loss / len(data_), acc / len(data_)

In [None]:
N_EPOCHS = 50
min_valid_loss = float('inf')  # not used in this cell

# Loss, optimiser and a scheduler that multiplies the learning rate by 0.9
# after every epoch.
criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

# 95 % of the samples for training, the remainder for validation.
train_len = int(len(train_dataset) * 0.95)
sub_train_, sub_valid_ = \
    random_split(train_dataset, [train_len, len(train_dataset) - train_len])

# Class names ordered by their numeric index, taken from `classmap` so the
# confusion-matrix rows and columns always carry the right labels. A
# hard-coded ['10s', '20s', '30s'] is only correct when the mapping happens
# to be in exactly that order.
class_names = sorted(classmap, key=classmap.get)
class_ids = [classmap[name] for name in class_names]

start_time = time.time()
for epoch in range(N_EPOCHS):

  train_loss, train_acc = train_func(sub_train_)
  valid_loss, valid_acc = test(sub_valid_)

  print('Epoch: %d' %(epoch + 1))
  print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
  print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')
  if epoch == (N_EPOCHS-1):
    # After the final epoch, collect the validation predictions and build the
    # confusion matrix (rows: true class, columns: predicted class).
    predictions, labels = test(sub_valid_, True)
    conf_matrix = confusion_matrix(np.array(labels), np.array(predictions), labels=class_ids)
    np.set_printoptions(precision=2)
    # Plot non-normalized confusion matrix
    #plt.figure()
    plot_confusion_matrix(conf_matrix, classes=class_names, title='Confusion matrix, without normalization')
total_time = int(time.time() - start_time)
print('Total time elapsed: %d seconds.' %(total_time))
torch.save(model.state_dict(), '/content/drive/MyDrive/Language, speech and dialogue processing/pan_state.pth')

In [None]:
# Row-normalised confusion matrix. The class names are read from `classmap`
# in index order, so the tick labels match the numeric class indices even when
# the mapping is not in '10s', '20s', '30s' order.
plot_confusion_matrix(conf_matrix, classes=sorted(classmap, key=classmap.get), normalize=True, title='PAN NN Confusion matrix, with normalization')