In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import re
from sklearn import metrics


from sklearn.model_selection import train_test_split

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

from keras.callbacks import EarlyStopping
from keras.optimizers import SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam

In [54]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
train = pd.read_csv('/content/drive/MyDrive/nlpclass-1207-g-outputerror-master-project/project/processed_data/train_translated_cleaned.csv')
test = pd.read_csv('/content/drive/MyDrive/nlpclass-1207-g-outputerror-master-project/project/processed_data/test_translated_cleaned.csv')
test_y = pd.read_csv("/content/drive/MyDrive/nlpclass-1207-g-outputerror-master-project/project/processed_data/test_labels_cleaned.csv")

In [56]:
train['comment_text'] = train['comment_text'].apply(lambda x: np.str_(x))
test['comment_text'] = test['comment_text'].apply(lambda x: np.str_(x))

X = train.iloc[:,1]
y = train.iloc[:,2:]

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size=0.4, random_state=123)

In [57]:
bad_words = pd.read_csv("/content/drive/MyDrive/nlpclass-1207-g-outputerror-master-project/project/original_data/reference_data/bad_words.csv")
bad_words = list(bad_words.bad_words.values)

In [58]:
NUM_WORDS = 5000
tokenizer = Tokenizer(num_words=NUM_WORDS,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',lower=True)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
num_badwords = len(bad_words)
n = 0
temp_bw = bad_words
for word, i in word_index.items():
    if word in bad_words:
        temp_bw.remove(word)
        n = n+1
    if i > (NUM_WORDS-num_badwords+n):
        for bw in temp_bw:
            tokenizer.word_index[bw] = i
            i=i+1
        break           

In [59]:
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_validation = tokenizer.texts_to_sequences(X_validation)

In [60]:
X_train = pad_sequences(sequences_train,maxlen=50)
X_validation = pad_sequences(sequences_validation,maxlen=X_train.shape[1])


y_train = np.asarray(y_train)
y_validation = np.asarray(y_validation)

In [61]:
embedding_vecor_length = 300
model = Sequential()
model.add(Embedding(5000, embedding_vecor_length, input_length=X_train.shape[1]))
model.add(LSTM(256))
model.add(Dense(64, activation='relu'))
model.add(Dense(6, activation='sigmoid'))

In [66]:
opt = Adam(lr=1e-3)
model.compile(loss='binary_crossentropy',optimizer=opt)

# Fitting Model to the data
callbacks = [EarlyStopping(monitor='val_loss')]
hist_adam = model.fit(X_train, y_train, batch_size=400, epochs=20, verbose=2,
                      validation_data=(X_validation, y_validation),callbacks=callbacks)

Epoch 1/20
240/240 - 13s - loss: 0.0491 - val_loss: 0.0604
Epoch 2/20
240/240 - 12s - loss: 0.0462 - val_loss: 0.0600
Epoch 3/20
240/240 - 12s - loss: 0.0433 - val_loss: 0.0655


In [67]:
sequences_test=tokenizer.texts_to_sequences(test['comment_text'])
X_test = pad_sequences(sequences_test,maxlen=X_train.shape[1])

pred_train = model.predict(X_train)
pred_test = model.predict(X_test)
pred_val = model.predict(X_validation)

In [68]:
for j in range(0,6):
    temp = pred_train[:,j]
    for i in range(0,len(temp)):
        if temp[i] > 0.75:
            pred_train[:,j][i] = 1
        else: 
            pred_train[:,j][i] = 0
    
for j in range(0,6):
    temp = pred_test[:,j]
    for i in range(0,len(temp)):
        if temp[i] > 0.75:
            pred_test[:,j][i] = 1
        else: 
            pred_test[:,j][i] = 0
            
for j in range(0,6):
    temp = pred_val[:,j]
    for i in range(0,len(temp)):
        if temp[i] > 0.75:
            pred_val[:,j][i] = 1
        else: 
            pred_val[:,j][i] = 0

In [69]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
Accuracy = np.zeros((3,6))
for i,x in enumerate(col):
    acc = np.array([metrics.accuracy_score(y_train[:,i], pred_train[:,i]),
                    metrics.accuracy_score(y_validation[:,i], pred_val[:,i]),
                    metrics.accuracy_score(test_y[x], pred_test[:,i])])
    print(x,"Train Accuracy:",acc[0],", Val Accuracy:",acc[1],", Test Accuracy:",acc[2])
    Accuracy[:,i] = acc
    
avg_accuracy = Accuracy.mean(axis=1)
print("Average Train Accuracy:",avg_accuracy[0],
      ", Average Val Accuracy:",avg_accuracy[1],
      ", Average Test Accuracy:",avg_accuracy[2])

toxic Train Accuracy: 0.9553800839756846 , Val Accuracy: 0.9504927227435805 , Test Accuracy: 0.9377285942042577
severe_toxic Train Accuracy: 0.9905266236343506 , Val Accuracy: 0.9900828776888249 , Test Accuracy: 0.9943886961142893
obscene Train Accuracy: 0.9777631551461219 , Val Accuracy: 0.9764683764433094 , Test Accuracy: 0.9653005720716497
threat Train Accuracy: 0.9970650289319212 , Val Accuracy: 0.9969136286014194 , Test Accuracy: 0.9967019913095126
insult Train Accuracy: 0.9673706419335297 , Val Accuracy: 0.9672405959673502 , Test Accuracy: 0.9593922911000656
identity_hate Train Accuracy: 0.991247310480249 , Val Accuracy: 0.9912265584608877 , Test Accuracy: 0.9891525211791553
Average Train Accuracy: 0.9798921406836429 , Average Val Accuracy: 0.9787374599842287 , Average Test Accuracy: 0.9737774443298216
