## Models for Multilabel Subtheme Classification 

This notebook trains the subtheme models used for multilabel subtheme classification of the comments on Question 1.
<br> You can connect this notebook to a GPU/TPU from *Runtime -> Change runtime type*.

In [3]:
import pandas as pd
import numpy as np

In [4]:
import os
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Activation, Concatenate
from keras.layers import Conv1D, Conv2D, MaxPooling2D, GlobalMaxPooling1D, MaxPool1D, MaxPooling1D, SpatialDropout1D, GRU, Bidirectional, AveragePooling1D, GlobalAveragePooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Input
from keras.layers.merge import concatenate
from keras.utils import to_categorical
from keras import layers
import tensorflow as tf
from tensorflow.keras import regularizers

Using TensorFlow backend.


In [5]:
import keras

In [6]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, hamming_loss, precision_recall_curve, auc

In [5]:
## Mounting Drive to this Colab notebook
## (prompts for an authorization code the first time it is run)
from google.colab import drive
drive.mount(mountpoint='/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


### Standard model functions

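Here we define the three standard model builders (a Bi-GRU, a Bi-GRU with two stacked GRU layers, and a CNN) that are used to train the subtheme models, followed by a helper that evaluates a trained model on the validation set at several decision thresholds.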

In [14]:
def bigru(max_features, max_len, n_class, weight_matrix, hidden_sequences, embed_size = 300):
  """
  Build, compile and summarise a single-layer bidirectional GRU classifier.

  Architecture: frozen embedding initialised from `weight_matrix` ->
  Bi-GRU returning the full sequence -> global max pooling and global average
  pooling over that sequence -> concatenation -> sigmoid Dense layer.

  max_features     -- input dimension of the embedding layer
  max_len          -- length of each input sequence
  n_class          -- number of sigmoid output units
  weight_matrix    -- initial (non-trainable) embedding weights
  hidden_sequences -- number of GRU units per direction
  embed_size       -- output dimension of the embedding layer

  Returns the compiled model (binary cross-entropy loss, Adam optimizer);
  the model summary is printed as a side effect.
  """
  token_ids = Input(shape=(max_len,))
  embedded = Embedding(max_features, embed_size, weights=[weight_matrix], trainable=False)(token_ids)

  encoded = Bidirectional(GRU(hidden_sequences, return_sequences=True))(embedded)

  # Max pooling first, then average pooling: same construction order as before
  pooled = [GlobalMaxPooling1D()(encoded), GlobalAveragePooling1D()(encoded)]
  merged = Concatenate()(pooled)

  scores = Dense(n_class, activation='sigmoid')(merged)

  model = Model(token_ids, scores)
  tracked_metrics = ['accuracy', 'categorical_accuracy']
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=tracked_metrics)

  model.summary()
  return model

In [8]:
def bigru_2(max_features, max_len, n_class, weight_matrix, hidden_sequences, hidden_sequences_2, embed_size = 300):
  """
  Build, compile and summarise a classifier with two stacked bidirectional GRUs.

  Architecture: frozen embedding initialised from `weight_matrix` ->
  Bi-GRU (`hidden_sequences` units per direction) -> Bi-GRU
  (`hidden_sequences_2` units per direction), both returning full sequences ->
  global max pooling and global average pooling of the second Bi-GRU's
  output -> concatenation -> sigmoid Dense layer with `n_class` units.

  max_features / embed_size are the embedding's input / output dimensions and
  max_len is the length of each input sequence.

  Returns the compiled model (binary cross-entropy loss, Adam optimizer);
  the model summary is printed as a side effect.
  """
  token_ids = Input(shape=(max_len,))
  embedded = Embedding(max_features, embed_size, weights=[weight_matrix], trainable=False)(token_ids)

  first_pass = Bidirectional(GRU(hidden_sequences, return_sequences=True))(embedded)
  second_pass = Bidirectional(GRU(hidden_sequences_2, return_sequences=True))(first_pass)

  # Both pooled views are taken from the second (top) recurrent layer
  pooled = [GlobalMaxPooling1D()(second_pass), GlobalAveragePooling1D()(second_pass)]
  merged = Concatenate()(pooled)

  scores = Dense(n_class, activation='sigmoid')(merged)

  model = Model(token_ids, scores)
  tracked_metrics = ['accuracy', 'categorical_accuracy']
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=tracked_metrics)

  model.summary()
  return model

In [9]:
def cnn(max_features, maxlen, filters, kernel_size, hidden_dims, embed_size, n_class, weight_matrix):
  """
  Build, compile and summarise a 1-D convolutional multilabel classifier.

  Layer stack, in order: trainable embedding initialised from
  `weight_matrix` -> dropout (0.2) -> two Conv1D + MaxPooling1D stages ->
  flatten -> two L2-regularised ReLU Dense layers of `hidden_dims` units ->
  sigmoid Dense layer with `n_class` units.

  Returns the compiled Sequential model (binary cross-entropy loss, Adam
  optimizer, accuracy metric); the model summary is printed as a side effect.
  """
  model = Sequential()

  layer_stack = (
      Embedding(max_features, embed_size, weights=[weight_matrix], trainable=True, input_length=maxlen),
      Dropout(0.2),
      Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
      MaxPooling1D(),
      Conv1D(filters, kernel_size, padding='valid', activation='relu'),
      MaxPooling1D(),
      Flatten(),
      # L2 regularization on both hidden dense layers
      Dense(hidden_dims, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
      Dense(hidden_dims, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001)),
      Dense(n_class, activation='sigmoid'),
  )
  for layer in layer_stack:
    model.add(layer)

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  model.summary()
  return model

In [10]:
def eval_metrics(model_name, x_valid, y_valid, thresholds=None):
  """
  Evaluate a trained model on the validation set at several decision thresholds.

  The model's predicted scores are binarised at each threshold (score >=
  threshold -> 1, otherwise 0) and compared with `y_valid`.

  model_name -- the trained model object (anything exposing `.predict`)
  x_valid    -- validation inputs passed to `model_name.predict`
  y_valid    -- binary label-indicator matrix for the validation set
  thresholds -- optional iterable of cut-offs; defaults to
                0.3, 0.4, ..., 0.9 when omitted

  Returns a DataFrame with one row per threshold and the columns
  'Threshold', 'Accuracy', 'Hamming loss', 'Precision', 'Recall' and
  'F1-measure'. Precision, recall and F1 are micro-averaged. A header line is
  printed before the DataFrame is returned.
  """
  if thresholds is None:
    thresholds = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

  # Predict once; every threshold below reuses the same scores
  pred_values = model_name.predict(x_valid)

  predictions_results = []

  for val in thresholds:
    # A single comparison yields a fresh 0/1 array and leaves pred_values untouched
    pred = (pred_values >= val).astype(int)

    predictions_results.append({
        'Threshold': val,
        'Accuracy': accuracy_score(y_valid, pred, normalize=True, sample_weight=None),
        'Hamming loss': hamming_loss(y_valid, pred),
        'Precision': precision_score(y_valid, pred, average='micro'),
        'Recall': recall_score(y_valid, pred, average='micro'),
        'F1-measure': f1_score(y_valid, pred, average='micro'),
    })

  print("Micro-average quality numbers:")
  return pd.DataFrame(predictions_results)

- Now, we will build subtheme models for each theme. Note that the precision-recall values may change slightly each time the model is run due to randomness while fitting the model.

- The first step for each subtheme model is loading the data. The files are found in **`data/interim/subthemes/<subtheme_name>`**, for example `data/interim/subthemes/CB` for theme `CB`. Upload the following files:
1. embedding_matrix.npy
2. X_train_padded.npy
3. X_valid_padded.npy
4. y_train.npy
5. y_valid.npy

*Warning: Be careful while uploading files. **Do not** upload files with sensitive information.*

- Also, since the training and validation padded data for the different subthemes are stored in different directories but under the same file names, we recommend uploading these padded documents, along with the embeddings, to Google Drive and mounting the drive to this Colab notebook for easier loading of the data.

### CB

In [8]:
## load data
# Folder holding the CB arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at e.g. 'data/interim/subthemes/CB' so this
# section cannot pick up files left over from another theme.
data_dir_cb = '.'

padded_docs_train_cb = np.load(os.path.join(data_dir_cb, 'X_train_padded.npy'))
padded_docs_valid_cb = np.load(os.path.join(data_dir_cb, 'X_valid_padded.npy'))

embedding_matrix_ft_cb = np.load(os.path.join(data_dir_cb, 'embedding_matrix.npy'))

y_train_cb = np.load(os.path.join(data_dir_cb, 'y_train.npy'))
y_valid_cb = np.load(os.path.join(data_dir_cb, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_cb) == len(y_train_cb), 'CB: X_train and y_train row counts differ'
assert len(padded_docs_valid_cb) == len(y_valid_cb), 'CB: X_valid and y_valid row counts differ'

In [9]:
# Sizes that are dictated by the loaded arrays
weight_matrix = embedding_matrix_ft_cb
max_features = weight_matrix.shape[0]
# Previously hard-coded to 300; read from the matrix so it always matches the
# width of the weights handed to the Embedding layer
embed_size = weight_matrix.shape[1]
maxlen = padded_docs_train_cb.shape[1]
n_class = y_train_cb.shape[1]

# Tunable CNN / training hyper-parameters
batch_size = 128
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 7

In [12]:
model_cb = cnn(max_features=max_features, maxlen=maxlen, filters=filters, kernel_size=kernel_size,
               hidden_dims=hidden_dims, embed_size=embed_size, n_class=n_class, weight_matrix=weight_matrix)

# class_weight='auto' was removed: Keras expects class_weight to be a dict mapping
# class indices to weights, so the string was never a valid setting.
# epochs now comes from the config cell instead of a duplicated literal.
model_cb.fit(padded_docs_train_cb, y_train_cb, batch_size=batch_size, epochs=epochs, validation_split=0.15)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 184, 300)          1502100   
_________________________________________________________________
dropout_1 (Dropout)          (None, 184, 300)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 182, 250)          225250    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 91, 250)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 89, 250)           187750    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 44, 250)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 11000)            

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 2646 samples, validate on 467 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.callbacks.History at 0x7f12ca4c0588>

In [16]:
## Saving the trained model
## model_cb.save('/content/gdrive/My Drive/Subtheme_Models/cb_model')

In [15]:
# Threshold sweep on the CB validation set
eval_metrics(model_name=model_cb, x_valid=padded_docs_valid_cb, y_valid=y_valid_cb)

Micro-average quality numbers:


Unnamed: 0,Threshold,Accuracy,Hamming loss,Precision,Recall,F1-measure
0,0.3,0.74751,0.059696,0.839565,0.861262,0.850275
1,0.4,0.761966,0.056805,0.889717,0.812024,0.849097
2,0.5,0.736267,0.06066,0.914845,0.762786,0.831924
3,0.6,0.709284,0.064033,0.930855,0.728781,0.817516
4,0.7,0.673948,0.068851,0.947231,0.68852,0.797417
5,0.8,0.61195,0.079023,0.968484,0.618607,0.75498
6,0.9,0.493093,0.100867,0.980171,0.497552,0.660051


### CPD

In [12]:
## load data
# Folder holding the CPD arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_cpd = '.'

padded_docs_train_cpd = np.load(os.path.join(data_dir_cpd, 'X_train_padded.npy'))
padded_docs_valid_cpd = np.load(os.path.join(data_dir_cpd, 'X_valid_padded.npy'))

embedding_matrix_ft_cpd = np.load(os.path.join(data_dir_cpd, 'embedding_matrix.npy'))

y_train_cpd = np.load(os.path.join(data_dir_cpd, 'y_train.npy'))
y_valid_cpd = np.load(os.path.join(data_dir_cpd, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_cpd) == len(y_train_cpd), 'CPD: X_train and y_train row counts differ'
assert len(padded_docs_valid_cpd) == len(y_valid_cpd), 'CPD: X_valid and y_valid row counts differ'

In [15]:
# CPD: single-layer Bi-GRU sized from the loaded arrays
model_cpd = bigru(
    max_features=embedding_matrix_ft_cpd.shape[0],
    max_len=padded_docs_train_cpd.shape[1],
    n_class=y_train_cpd.shape[1],
    weight_matrix=embedding_matrix_ft_cpd,
    hidden_sequences=100,
)

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 163)          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 163, 300)     1428900     input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_2 (Bidirectional) (None, 163, 200)     240600      embedding_2[0][0]                
__________________________________________________________________________________________________
global_max_pooling1d_2 (GlobalM (None, 200)          0           bidirectional_2[0][0]            
____________________________________________________________________________________________

In [16]:
# Train CPD, with 15% of the training data held out for validation
model_cpd.fit(
    x=padded_docs_train_cpd,
    y=y_train_cpd,
    batch_size=100,
    epochs=6,
    verbose=1,
    validation_split=0.15,
)

Train on 2191 samples, validate on 387 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.callbacks.History at 0x7f50503444a8>

In [17]:
# Threshold sweep on the CPD validation set
eval_metrics(model_name=model_cpd, x_valid=padded_docs_valid_cpd, y_valid=y_valid_cpd)

Micro-average quality numbers:


Unnamed: 0,Threshold,Accuracy,Hamming loss,Precision,Recall,F1-measure
0,0.3,0.613377,0.111909,0.716535,0.811293,0.760976
1,0.4,0.655791,0.099511,0.775449,0.769688,0.772558
2,0.5,0.634584,0.102121,0.807167,0.702823,0.75139
3,0.6,0.601958,0.109625,0.833663,0.625557,0.714771
4,0.7,0.536705,0.117129,0.875598,0.543834,0.670944
5,0.8,0.435563,0.130179,0.930818,0.439822,0.597376
6,0.9,0.290375,0.159543,0.95098,0.288262,0.442417


### EWC

In [21]:
## load data
# Folder holding the EWC arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_ewc = '.'

padded_docs_train_ewc = np.load(os.path.join(data_dir_ewc, 'X_train_padded.npy'))
padded_docs_valid_ewc = np.load(os.path.join(data_dir_ewc, 'X_valid_padded.npy'))

embedding_matrix_ft_ewc = np.load(os.path.join(data_dir_ewc, 'embedding_matrix.npy'))

y_train_ewc = np.load(os.path.join(data_dir_ewc, 'y_train.npy'))
y_valid_ewc = np.load(os.path.join(data_dir_ewc, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_ewc) == len(y_train_ewc), 'EWC: X_train and y_train row counts differ'
assert len(padded_docs_valid_ewc) == len(y_valid_ewc), 'EWC: X_valid and y_valid row counts differ'

In [22]:
# EWC: single-layer Bi-GRU sized from the loaded arrays
model_ewc = bigru(
    max_features=embedding_matrix_ft_ewc.shape[0],
    max_len=padded_docs_train_ewc.shape[1],
    n_class=y_train_ewc.shape[1],
    weight_matrix=embedding_matrix_ft_ewc,
    hidden_sequences=100,
)

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            (None, 220)          0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 220, 300)     1463700     input_4[0][0]                    
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, 220, 200)     240600      embedding_4[0][0]                
__________________________________________________________________________________________________
global_max_pooling1d_4 (GlobalM (None, 200)          0           bidirectional_4[0][0]            
____________________________________________________________________________________________

In [23]:
# Train EWC, with 15% of the training data held out for validation
model_ewc.fit(
    x=padded_docs_train_ewc,
    y=y_train_ewc,
    batch_size=200,
    epochs=15,
    verbose=1,
    validation_split=0.15,
)

Train on 1569 samples, validate on 277 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x7f4ffa8045c0>

In [24]:
# Threshold sweep on the EWC validation set
eval_metrics(model_name=model_ewc, x_valid=padded_docs_valid_ewc, y_valid=y_valid_ewc)

Micro-average quality numbers:


Unnamed: 0,Threshold,Accuracy,Hamming loss,Precision,Recall,F1-measure
0,0.3,0.610092,0.099006,0.726708,0.73431,0.730489
1,0.4,0.626147,0.096713,0.762238,0.6841,0.721058
2,0.5,0.594037,0.096713,0.804878,0.621339,0.701299
3,0.6,0.53211,0.100153,0.850649,0.548117,0.666667
4,0.7,0.477064,0.105122,0.894942,0.481172,0.62585
5,0.8,0.40367,0.115061,0.923445,0.403766,0.561863
6,0.9,0.25,0.139526,0.966942,0.24477,0.390651


### Exec

In [None]:
## load data
# Folder holding the Exec arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_exec = '.'

padded_docs_train_exec = np.load(os.path.join(data_dir_exec, 'X_train_padded.npy'))
padded_docs_valid_exec = np.load(os.path.join(data_dir_exec, 'X_valid_padded.npy'))

embedding_matrix_ft_exec = np.load(os.path.join(data_dir_exec, 'embedding_matrix.npy'))

y_train_exec = np.load(os.path.join(data_dir_exec, 'y_train.npy'))
y_valid_exec = np.load(os.path.join(data_dir_exec, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_exec) == len(y_train_exec), 'Exec: X_train and y_train row counts differ'
assert len(padded_docs_valid_exec) == len(y_valid_exec), 'Exec: X_valid and y_valid row counts differ'

In [None]:
# Exec: single-layer Bi-GRU sized from the loaded arrays
model_exec = bigru(
    max_features=embedding_matrix_ft_exec.shape[0],
    max_len=padded_docs_train_exec.shape[1],
    n_class=y_train_exec.shape[1],
    weight_matrix=embedding_matrix_ft_exec,
    hidden_sequences=100,
)

In [None]:
# Train Exec, with 15% of the training data held out for validation
model_exec.fit(
    x=padded_docs_train_exec,
    y=y_train_exec,
    batch_size=256,
    epochs=15,
    verbose=1,
    validation_split=0.15,
)

In [None]:
# Threshold sweep on the Exec validation set
eval_metrics(model_name=model_exec, x_valid=padded_docs_valid_exec, y_valid=y_valid_exec)

### FWE

In [None]:
## load data
# Folder holding the FWE arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_fwe = '.'

padded_docs_train_fwe = np.load(os.path.join(data_dir_fwe, 'X_train_padded.npy'))
padded_docs_valid_fwe = np.load(os.path.join(data_dir_fwe, 'X_valid_padded.npy'))

embedding_matrix_ft_fwe = np.load(os.path.join(data_dir_fwe, 'embedding_matrix.npy'))

y_train_fwe = np.load(os.path.join(data_dir_fwe, 'y_train.npy'))
y_valid_fwe = np.load(os.path.join(data_dir_fwe, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_fwe) == len(y_train_fwe), 'FWE: X_train and y_train row counts differ'
assert len(padded_docs_valid_fwe) == len(y_valid_fwe), 'FWE: X_valid and y_valid row counts differ'

In [None]:
# FWE: two stacked Bi-GRU layers sized from the loaded arrays
model_fwe = bigru_2(
    max_features=embedding_matrix_ft_fwe.shape[0],
    max_len=padded_docs_train_fwe.shape[1],
    n_class=y_train_fwe.shape[1],
    weight_matrix=embedding_matrix_ft_fwe,
    hidden_sequences=200,
    hidden_sequences_2=75,
)

In [None]:
# Train FWE, with 15% of the training data held out for validation
model_fwe.fit(
    x=padded_docs_train_fwe,
    y=y_train_fwe,
    batch_size=156,
    epochs=10,
    verbose=1,
    validation_split=0.15,
)

In [None]:
# Threshold sweep on the FWE validation set
eval_metrics(model_name=model_fwe, x_valid=padded_docs_valid_fwe, y_valid=y_valid_fwe)

### OTH

In [None]:
## load data
# Folder holding the OTH arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_oth = '.'

padded_docs_train_oth = np.load(os.path.join(data_dir_oth, 'X_train_padded.npy'))
padded_docs_valid_oth = np.load(os.path.join(data_dir_oth, 'X_valid_padded.npy'))

embedding_matrix_ft_oth = np.load(os.path.join(data_dir_oth, 'embedding_matrix.npy'))

y_train_oth = np.load(os.path.join(data_dir_oth, 'y_train.npy'))
y_valid_oth = np.load(os.path.join(data_dir_oth, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_oth) == len(y_train_oth), 'OTH: X_train and y_train row counts differ'
assert len(padded_docs_valid_oth) == len(y_valid_oth), 'OTH: X_valid and y_valid row counts differ'

In [None]:
# OTH: two stacked Bi-GRU layers sized from the loaded arrays
model_oth = bigru_2(
    max_features=embedding_matrix_ft_oth.shape[0],
    max_len=padded_docs_train_oth.shape[1],
    n_class=y_train_oth.shape[1],
    weight_matrix=embedding_matrix_ft_oth,
    hidden_sequences=100,
    hidden_sequences_2=75,
)

In [None]:
# Train OTH, with 15% of the training data held out for validation
model_oth.fit(
    x=padded_docs_train_oth,
    y=y_train_oth,
    batch_size=200,
    epochs=15,
    verbose=1,
    validation_split=0.15,
)

In [None]:
# Threshold sweep on the OTH validation set
eval_metrics(model_name=model_oth, x_valid=padded_docs_valid_oth, y_valid=y_valid_oth)

### RE

In [None]:
## load data
# Folder holding the RE arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_re = '.'

padded_docs_train_re = np.load(os.path.join(data_dir_re, 'X_train_padded.npy'))
padded_docs_valid_re = np.load(os.path.join(data_dir_re, 'X_valid_padded.npy'))

embedding_matrix_ft_re = np.load(os.path.join(data_dir_re, 'embedding_matrix.npy'))

y_train_re = np.load(os.path.join(data_dir_re, 'y_train.npy'))
y_valid_re = np.load(os.path.join(data_dir_re, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_re) == len(y_train_re), 'RE: X_train and y_train row counts differ'
assert len(padded_docs_valid_re) == len(y_valid_re), 'RE: X_valid and y_valid row counts differ'

In [None]:
# RE: two stacked Bi-GRU layers sized from the loaded arrays
model_re = bigru_2(
    max_features=embedding_matrix_ft_re.shape[0],
    max_len=padded_docs_train_re.shape[1],
    n_class=y_train_re.shape[1],
    weight_matrix=embedding_matrix_ft_re,
    hidden_sequences=200,
    hidden_sequences_2=75,
)

In [None]:
# Train RE, with 15% of the training data held out for validation
model_re.fit(
    x=padded_docs_train_re,
    y=y_train_re,
    batch_size=156,
    epochs=12,
    verbose=1,
    validation_split=0.15,
)

In [None]:
# Threshold sweep on the RE validation set
eval_metrics(model_name=model_re, x_valid=padded_docs_valid_re, y_valid=y_valid_re)

### SP

In [None]:
## load data
# Folder holding the SP arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_sp = '.'

padded_docs_train_sp = np.load(os.path.join(data_dir_sp, 'X_train_padded.npy'))
padded_docs_valid_sp = np.load(os.path.join(data_dir_sp, 'X_valid_padded.npy'))

embedding_matrix_ft_sp = np.load(os.path.join(data_dir_sp, 'embedding_matrix.npy'))

y_train_sp = np.load(os.path.join(data_dir_sp, 'y_train.npy'))
y_valid_sp = np.load(os.path.join(data_dir_sp, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_sp) == len(y_train_sp), 'SP: X_train and y_train row counts differ'
assert len(padded_docs_valid_sp) == len(y_valid_sp), 'SP: X_valid and y_valid row counts differ'

In [None]:
# SP: single-layer Bi-GRU sized from the loaded arrays
model_sp = bigru(
    max_features=embedding_matrix_ft_sp.shape[0],
    max_len=padded_docs_train_sp.shape[1],
    n_class=y_train_sp.shape[1],
    weight_matrix=embedding_matrix_ft_sp,
    hidden_sequences=100,
)

In [None]:
# Train SP, with 15% of the training data held out for validation
model_sp.fit(
    x=padded_docs_train_sp,
    y=y_train_sp,
    batch_size=256,
    epochs=15,
    verbose=1,
    validation_split=0.15,
)

In [None]:
# Threshold sweep on the SP validation set
eval_metrics(model_name=model_sp, x_valid=padded_docs_valid_sp, y_valid=y_valid_sp)

### Sup

In [None]:
## load data
# Folder holding the Sup arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_sup = '.'

padded_docs_train_sup = np.load(os.path.join(data_dir_sup, 'X_train_padded.npy'))
padded_docs_valid_sup = np.load(os.path.join(data_dir_sup, 'X_valid_padded.npy'))

embedding_matrix_ft_sup = np.load(os.path.join(data_dir_sup, 'embedding_matrix.npy'))

y_train_sup = np.load(os.path.join(data_dir_sup, 'y_train.npy'))
y_valid_sup = np.load(os.path.join(data_dir_sup, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_sup) == len(y_train_sup), 'Sup: X_train and y_train row counts differ'
assert len(padded_docs_valid_sup) == len(y_valid_sup), 'Sup: X_valid and y_valid row counts differ'

In [None]:
# Sup: single-layer Bi-GRU sized from the loaded arrays
model_sup = bigru(
    max_features=embedding_matrix_ft_sup.shape[0],
    max_len=padded_docs_train_sup.shape[1],
    n_class=y_train_sup.shape[1],
    weight_matrix=embedding_matrix_ft_sup,
    hidden_sequences=100,
)

In [None]:
# Train Sup, with 15% of the training data held out for validation
model_sup.fit(
    x=padded_docs_train_sup,
    y=y_train_sup,
    batch_size=256,
    epochs=20,
    verbose=1,
    validation_split=0.15,
)

In [None]:
# Threshold sweep on the Sup validation set
eval_metrics(model_name=model_sup, x_valid=padded_docs_valid_sup, y_valid=y_valid_sup)

### SW

In [None]:
## load data
# Folder holding the SW arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_sw = '.'

padded_docs_train_sw = np.load(os.path.join(data_dir_sw, 'X_train_padded.npy'))
padded_docs_valid_sw = np.load(os.path.join(data_dir_sw, 'X_valid_padded.npy'))

embedding_matrix_ft_sw = np.load(os.path.join(data_dir_sw, 'embedding_matrix.npy'))

y_train_sw = np.load(os.path.join(data_dir_sw, 'y_train.npy'))
y_valid_sw = np.load(os.path.join(data_dir_sw, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_sw) == len(y_train_sw), 'SW: X_train and y_train row counts differ'
assert len(padded_docs_valid_sw) == len(y_valid_sw), 'SW: X_valid and y_valid row counts differ'

In [None]:
# SW: single-layer Bi-GRU sized from the loaded arrays
model_sw = bigru(
    max_features=embedding_matrix_ft_sw.shape[0],
    max_len=padded_docs_train_sw.shape[1],
    n_class=y_train_sw.shape[1],
    weight_matrix=embedding_matrix_ft_sw,
    hidden_sequences=100,
)

In [None]:
# Train SW, with 15% of the training data held out for validation
model_sw.fit(
    x=padded_docs_train_sw,
    y=y_train_sw,
    batch_size=256,
    epochs=20,
    verbose=1,
    validation_split=0.15,
)

In [None]:
# Threshold sweep on the SW validation set
eval_metrics(model_name=model_sw, x_valid=padded_docs_valid_sw, y_valid=y_valid_sw)

### TEPE

In [None]:
## load data
# Folder holding the TEPE arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_tepe = '.'

padded_docs_train_tepe = np.load(os.path.join(data_dir_tepe, 'X_train_padded.npy'))
padded_docs_valid_tepe = np.load(os.path.join(data_dir_tepe, 'X_valid_padded.npy'))

embedding_matrix_ft_tepe = np.load(os.path.join(data_dir_tepe, 'embedding_matrix.npy'))

y_train_tepe = np.load(os.path.join(data_dir_tepe, 'y_train.npy'))
y_valid_tepe = np.load(os.path.join(data_dir_tepe, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_tepe) == len(y_train_tepe), 'TEPE: X_train and y_train row counts differ'
assert len(padded_docs_valid_tepe) == len(y_valid_tepe), 'TEPE: X_valid and y_valid row counts differ'

In [None]:
# TEPE: single-layer Bi-GRU sized from the loaded arrays
model_tepe = bigru(
    max_features=embedding_matrix_ft_tepe.shape[0],
    max_len=padded_docs_train_tepe.shape[1],
    n_class=y_train_tepe.shape[1],
    weight_matrix=embedding_matrix_ft_tepe,
    hidden_sequences=100,
)

In [None]:
# Train TEPE, with 15% of the training data held out for validation
model_tepe.fit(
    x=padded_docs_train_tepe,
    y=y_train_tepe,
    batch_size=256,
    epochs=6,
    verbose=1,
    validation_split=0.15,
)

In [None]:
# Threshold sweep on the TEPE validation set
eval_metrics(model_name=model_tepe, x_valid=padded_docs_valid_tepe, y_valid=y_valid_tepe)

### VMG

In [None]:
## load data
# Folder holding the VMG arrays. '.' keeps the original behaviour (files sitting in
# the working directory); point it at this theme's folder under
# data/interim/subthemes/ so files left over from another theme are not reused.
data_dir_vmg = '.'

padded_docs_train_vmg = np.load(os.path.join(data_dir_vmg, 'X_train_padded.npy'))
padded_docs_valid_vmg = np.load(os.path.join(data_dir_vmg, 'X_valid_padded.npy'))

embedding_matrix_ft_vmg = np.load(os.path.join(data_dir_vmg, 'embedding_matrix.npy'))

y_train_vmg = np.load(os.path.join(data_dir_vmg, 'y_train.npy'))
y_valid_vmg = np.load(os.path.join(data_dir_vmg, 'y_valid.npy'))

# Inputs and labels must be row-aligned; a mismatch means files from different themes were mixed
assert len(padded_docs_train_vmg) == len(y_train_vmg), 'VMG: X_train and y_train row counts differ'
assert len(padded_docs_valid_vmg) == len(y_valid_vmg), 'VMG: X_valid and y_valid row counts differ'

In [None]:
# VMG: two stacked Bi-GRU layers sized from the loaded arrays
model_vmg = bigru_2(
    max_features=embedding_matrix_ft_vmg.shape[0],
    max_len=padded_docs_train_vmg.shape[1],
    n_class=y_train_vmg.shape[1],
    weight_matrix=embedding_matrix_ft_vmg,
    hidden_sequences=100,
    hidden_sequences_2=75,
)

In [None]:
# Train VMG, with 15% of the training data held out for validation
model_vmg.fit(
    x=padded_docs_train_vmg,
    y=y_train_vmg,
    batch_size=256,
    epochs=15,
    verbose=1,
    validation_split=0.15,
)

In [None]:
# Threshold sweep on the VMG validation set
eval_metrics(model_name=model_vmg, x_valid=padded_docs_valid_vmg, y_valid=y_valid_vmg)