In [None]:
#load google drive
from google.colab import drive
# Use the public entry point; the underscore-prefixed `_mount` is an internal
# helper, and Colab's own message refers users to `drive.mount`.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#import the necessary libraries
import os
import csv
import numpy as np

#essential ml libraries
import tensorflow as tf 
import matplotlib.pyplot as plt

#class for preparing and cleaning the data
class samsungNLPchallange():
    """Loads the challenge data and encodes words/labels as integer indices.

    Typical call order (each step reads what the previous one stored):
    ``prepareTrainData`` -> ``prepareLabels`` -> ``prepareWordBag`` -> ``mapping``.
    """

    def __init__(self):
        # Directory holding the training files (every entry in it is read).
        self.file_path_train = '/content/drive/MyDrive/models/prometeo/train/'
        # Directory holding ``test_public.csv``.
        self.file_path_test = '/content/drive/MyDrive/models/prometeo/'
        # Parsed training samples: one ``[words, labels, ...]`` entry per line.
        self.train_x_y = []
        # Same samples with words/labels replaced by their integer indices.
        self.train_x_y_f = []

        # Sorted unique labels and sorted unique words seen in the training data.
        self.classes = []
        self.words = []

    @staticmethod
    def _first_index(items):
        """Return ``{item: position}`` keeping the first position of each item.

        Mirrors ``list.index`` semantics while giving O(1) lookups.
        """
        table = {}
        for pos, item in enumerate(items):
            table.setdefault(item, pos)
        return table

    def prepareTestData(self):
        """Print the number of whitespace-separated tokens in the test CSV.

        Reads ``test_public.csv`` from ``self.file_path_test``, skips the header
        row and tokenizes the second column of every remaining row.

        Returns:
            None. The token count is only printed.
        """
        # ``with`` guarantees the handle is closed; ``newline=''`` is what the
        # csv module expects for files it reads.
        with open(self.file_path_test+'test_public.csv', newline='') as f:
            csvreader = csv.reader(f)
            next(csvreader, None)  # skip the header row (tolerates an empty file)
            rows = list(csvreader)

        tokens = []
        for sen in rows:
            tokens.extend(sen[1].split())

        print(len(tokens))
        return None

    def prepareTrainData(self):
        """Parse every file in ``self.file_path_train`` into token lists.

        Files are read in sorted name order. Each line is split on tabs and each
        tab-separated column is split on whitespace, so a line becomes
        ``[tokens_of_column_0, tokens_of_column_1, ...]``.

        Returns:
            list: one entry per line over all files; also stored in
            ``self.train_x_y``.
        """
        samples = []
        for name in sorted(os.listdir(self.file_path_train)):
            with open(self.file_path_train+name, 'r') as f:
                # Iterating the handle stops at EOF, so no sentinel entry has to
                # be appended and stripped afterwards.
                for line in f:
                    samples.append([col.split() for col in line.split('\t')])

        self.train_x_y = samples
        return samples

    def prepareLabels(self):
        """Collect the sorted unique labels (second column) of the training data.

        Returns:
            list: sorted unique labels; also stored in ``self.classes``.
        """
        unique = sorted({v for val in self.train_x_y for v in val[1]})
        self.classes = unique
        return unique

    def prepareWordBag(self):
        """Collect the sorted unique words (first column) of the training data.

        Returns:
            list: sorted unique words; also stored in ``self.words``.
        """
        unique = sorted({v for val in self.train_x_y for v in val[0]})
        self.words = unique
        return unique

    #main mapper function (word->idx and classes->idx)
    def mapping(self):
        """Encode every training sample as ``[word_indices, class_indices]``.

        Indices are positions in ``self.words`` and ``self.classes``.

        Returns:
            list: encoded samples; also stored in ``self.train_x_y_f``.

        Raises:
            ValueError: if a word or label is missing from ``self.words`` /
                ``self.classes`` (same exception type ``list.index`` raises).
        """
        word_to_idx = self._first_index(self.words)
        class_to_idx = self._first_index(self.classes)

        encoded = []
        for x in self.train_x_y:
            try:
                w_index = [word_to_idx[xx] for xx in x[0]]
                c_index = [class_to_idx[yy] for yy in x[1]]
            except KeyError as exc:
                raise ValueError(f"{exc.args[0]!r} is not in list") from None
            encoded.append([w_index, c_index])

        self.train_x_y_f = encoded
        return encoded


if __name__ == "__main__":

    # Run the preparation steps in dependency order: each step reads the
    # attribute stored on OBJ by the previous one.
    # The names bound here (train, classes, words, final) are notebook-level
    # globals that the following cells read directly.
    OBJ     = samsungNLPchallange()
    train   = OBJ.prepareTrainData()   # parsed [words, labels] per line
    classes = OBJ.prepareLabels()      # sorted unique labels
    words   = OBJ.prepareWordBag()     # sorted unique words
    final   = OBJ.mapping()            # index-encoded copy of `train`


In [None]:
#preparing the test dataset to be released 
def mappingTest(test_, vocab=None):
  """Encode tokenized sentences as lists of vocabulary indices.

  Args:
    test_: iterable of sentences, each an iterable of word tokens.
    vocab: list of known words. Defaults to the notebook-level `words` list,
      looked up when the function is called.

  Returns:
    list of lists of ints, one index per token. A token missing from the
    vocabulary gets the index of the placeholder word ' '.

  Raises:
    ValueError: if an unknown token is met and ' ' is not in the vocabulary.
  """
  if vocab is None:
    vocab = words

  # First-occurrence lookup table: same result as list.index, without a
  # linear scan per token.
  index_of = {}
  for pos, token in enumerate(vocab):
    index_of.setdefault(token, pos)

  t_test_y = []
  for x in test_:
    w_index = []
    for xx in x:
      idx = index_of.get(xx)
      if idx is None:
        # Only a missing word triggers the fallback; other errors are no
        # longer swallowed as they were by the previous bare `except`.
        if ' ' not in index_of:
          raise ValueError("' ' is not in list")
        idx = index_of[' ']
      w_index.append(idx)
    t_test_y.append(w_index)
  return t_test_y

# Read the public test CSV. The `with` block closes the handle, which the
# previous bare open() never did; newline='' is what the csv module expects.
with open('/content/drive/MyDrive/models/prometeo/test_public.csv', newline='') as file:
  csvreader = csv.reader(file)
  header = next(csvreader, [])  # first row is the header; [] if the file is empty
  rows = list(csvreader)

# Tokenize the second column of every row on whitespace.
test = [row[1].split() for row in rows]

### At this moment we have the following data prepared:


*   `train` : Contains the supervision data (sequence of words and respective labels).
*   `test` : Contains the data to be tested on.
*   `classes` : The class labels.
*   `words`   : The vocabulary, i.e. the sorted list of unique words found in the training data.
*   `final`   : It is the numeric encoding of the words and labels in the training set.



In [None]:
print(len(words))
# ' ' is the placeholder for padding / unknown words; later cells rely on it
# being the last entry (they use len(words)-1). The guard keeps this cell
# idempotent: re-running it must not add a second ' '.
if ' ' not in words:
  words.append(' ')
print(len(words))

1769
1770


In [None]:
print(len(classes))
# '' is an extra, empty class label. The guard keeps this cell idempotent:
# re-running it must not grow len(classes), which sizes the output layer.
if '' not in classes:
  classes.append('')
print(len(classes))

33
34


In [None]:
#create a format that can be used in the tensorflow model
# Each sentence becomes a list of [word, label] pairs, pairing position k of
# the word list with position k of the label list.
new_train = [
  [[sample[0][pos], sample[1][pos]] for pos in range(len(sample[0]))]
  for sample in train
]

In [None]:
import numpy as np
# Split the index-encoded samples into parallel lists of int32 arrays:
# X holds the word indices, y the class indices of each sentence.
X = [np.array(pair[0], dtype=np.int32) for pair in final]
y = [np.array(pair[1], dtype=np.int32) for pair in final]

In [None]:
# Every sentence is brought to exactly this many positions.
max_len = 20
from keras.preprocessing.sequence import pad_sequences

# Words are padded (at the end) with the last vocabulary index and labels
# with the index of the 'o' class.
# NOTE(review): assumes the ' ' placeholder has already been appended to
# `words`, so that len(words)-1 refers to it.
pad_word_id = len(words)-1
pad_label_id = classes.index('o')
X = pad_sequences(X, maxlen=max_len, padding="post", value=pad_word_id)
y = pad_sequences(y, maxlen=max_len, padding="post", value=pad_label_id)

In [None]:
#obtain the dataset that is to be used for training the model
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Fixed seed so the 90/10 split is identical on every run.
SPLIT_SEED = 42

# One-hot encode into a new name instead of overwriting `y`: re-running this
# cell would otherwise one-hot encode data that is already one-hot.
y_onehot = [to_categorical(i, num_classes=len(classes)) for i in y]
X_tr, X_te, y_tr, y_te = train_test_split(X, y_onehot, test_size=0.1, random_state=SPLIT_SEED)

In [None]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

In [None]:
# BiLSTM tagger: embedding -> dropout -> bidirectional LSTM -> per-token softmax.
# The input tensor is named `inputs` (not `input`) so the Python builtin
# input() is not shadowed for the rest of the notebook.
inputs = Input(shape=(max_len,))
# The embedding size is set to max_len (20); the two merely share the value.
hidden = Embedding(input_dim=len(words), output_dim=max_len, input_length=max_len)(inputs)  # 20 embeddings
hidden = Dropout(0.5)(hidden)
hidden = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.5))(hidden)
out = TimeDistributed(Dense(len(classes), activation="softmax"))(hidden)  # softmax output layer

model = Model(inputs, out)



In [None]:
from keras import backend as K
#assiting code for finding the recall measure, precision measure and finally the `f1 measure`

def recall_m(y_true, y_pred):
    """Recall over all elements of the given tensors.

    Rounds ``y_true * y_pred`` and ``y_true`` (both clipped to [0, 1]) to 0/1
    and returns the ratio of their sums; ``K.epsilon()`` guards the division.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision over all elements of the given tensors.

    Rounds ``y_true * y_pred`` and ``y_pred`` (both clipped to [0, 1]) to 0/1
    and returns the ratio of their sums; ``K.epsilon()`` guards the division.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(predicted) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` in the denominator avoids division by zero when both
    precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Track accuracy and the custom F1 metric during training.
model.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=["accuracy", f1_m],
)

# Keep only the best weights seen so far in 'model_best.h5' (the training log
# shows the checkpoint tracking val_loss).
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    'model_best.h5',
    verbose=1,
    save_best_only=True,
)

# A further 10% of the training split is held out for validation.
history = model.fit(
    X_tr,
    np.array(y_tr),
    batch_size=32,
    epochs=250,
    validation_split=0.1,
    verbose=1,
    callbacks=[checkpointer],
)

Epoch 1/250
Epoch 1: val_loss improved from inf to 0.58722, saving model to model_best.h5
Epoch 2/250
Epoch 2: val_loss improved from 0.58722 to 0.56935, saving model to model_best.h5
Epoch 3/250
Epoch 3: val_loss improved from 0.56935 to 0.54776, saving model to model_best.h5
Epoch 4/250
Epoch 4: val_loss improved from 0.54776 to 0.51844, saving model to model_best.h5
Epoch 5/250
Epoch 5: val_loss improved from 0.51844 to 0.51032, saving model to model_best.h5
Epoch 6/250
Epoch 6: val_loss improved from 0.51032 to 0.48525, saving model to model_best.h5
Epoch 7/250
Epoch 7: val_loss improved from 0.48525 to 0.46663, saving model to model_best.h5
Epoch 8/250
Epoch 8: val_loss improved from 0.46663 to 0.43965, saving model to model_best.h5
Epoch 9/250
Epoch 9: val_loss improved from 0.43965 to 0.43868, saving model to model_best.h5
Epoch 10/250
Epoch 10: val_loss improved from 0.43868 to 0.41685, saving model to model_best.h5
Epoch 11/250
Epoch 11: val_loss improved from 0.41685 to 0.399

## Final testing of the public test set

The public test set is a subsample of the data the model will see after deployment. It is provided in the `test_public.csv` file. The file contains ambiguities, since the data may include unseen values that must be classified into a rejection class.

The `test` dataset needs to be used here to obtain the results.

In [None]:
# Index-encode the tokenized test sentences, then wrap each one in an int32 array.
test_final = mappingTest(test)
import numpy as np
X_test = [np.array(lst, dtype=np.int32) for lst in test_final]

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Same padding scheme as the training inputs: pad at the end with the last
# vocabulary index, up to max_len positions.
X_TEST = pad_sequences(X_test, maxlen=max_len, padding="post", value=len(words)-1)

In [None]:
import csv

# Predict every padded sentence in one batch instead of one predict() call per
# sentence, then take the most probable class at each position.
pred_ids = np.argmax(model.predict(X_TEST), axis=-1) if len(X_TEST) else []

# Index that marks padding / unknown words (assumes ' ' is the last vocabulary
# entry, as in the padding cells).
unknown_id = len(words)-1

with open('test_public_res.csv', 'w', newline='') as out_file:
  writer = csv.writer(out_file)
  writer.writerow(['id', 'Predicted'])

  idx = 0  # running token id across all sentences
  for sent_ids, sent_pred in zip(test_final, pred_ids):
    # pad_sequences keeps only the LAST max_len tokens of a longer sentence
    # (default truncating='pre'), so prediction k belongs to token offset+k.
    # Tokens cut away have no prediction and get the empty label instead of
    # raising IndexError.
    offset = max(0, len(sent_ids) - max_len)
    for pos, word_id in enumerate(sent_ids):
      if word_id == unknown_id or pos < offset:
        label = ''
      else:
        label = classes[sent_pred[pos - offset]]
      writer.writerow([idx, label])
      idx += 1











































































































































































































In [None]:
# Reload the best checkpoint; the custom metric must be supplied by name so
# the saved model can be deserialized. Both names refer to the same model.
new_model = tf.keras.models.load_model(
    'model_best.h5',
    custom_objects={"f1_m": f1_m},
)
model_new = new_model



In [None]:
test_final = mappingTest(test)
import numpy as np
X_test = [np.array(lst, dtype=np.int32) for lst in test_final]
# Re-pad here so the predictions come from the X_test built in this cell
# rather than from an X_TEST left over from an earlier cell.
X_TEST = pad_sequences(X_test, maxlen=max_len, padding="post", value=len(words)-1)

import csv

# Predict every padded sentence in one batch instead of one predict() call per
# sentence, then take the most probable class at each position.
pred_ids = np.argmax(model_new.predict(X_TEST), axis=-1) if len(X_TEST) else []

# Index that marks padding / unknown words (assumes ' ' is the last vocabulary
# entry, as in the padding cells).
unknown_id = len(words)-1

with open('/content/drive/MyDrive/models/prometeo/test_public_res_new.csv', 'w', newline='') as out_file:
  writer = csv.writer(out_file)
  writer.writerow(['id', 'Predicted'])

  idx = 0  # running token id across all sentences
  for sent_ids, sent_pred in zip(test_final, pred_ids):
    # pad_sequences keeps only the LAST max_len tokens of a longer sentence
    # (default truncating='pre'), so prediction k belongs to token offset+k.
    # Tokens cut away have no prediction and get the empty label instead of
    # raising IndexError.
    offset = max(0, len(sent_ids) - max_len)
    for pos, word_id in enumerate(sent_ids):
      if word_id == unknown_id or pos < offset:
        label = ''
      else:
        label = classes[sent_pred[pos - offset]]
      writer.writerow([idx, label])
      idx += 1











































































































































































































## Results



*   The number of epochs and the learning rate have a considerable effect on the trained model.
*   The model in `tensorflow` worked better than the models in `pytorch`, as I am more familiar with TensorFlow.
*   Epochs tested upon `64: 0.80981`, `150: 0.81682`

