In [9]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Bidirectional,SpatialDropout1D
from keras.models import Model
from keras.models import Sequential

In [2]:
data = pd.read_csv('preprocess_data.csv')
data.drop(['task_1','Unnamed: 0','text'], axis=1, inplace=True)
data.head()

Unnamed: 0,Unnamed: 0.1,_id,task_2,text_clean
0,4986,60c5d6bf5659ea5e55defa2c,PRFN,made amp amp onli abl start make money sustain...
1,3394,60c5d6bf5659ea5e55def461,OFFN,technic still turn back clock dick head
2,1310,60c5d6bf5659ea5e55defaad,NONE,govt stop think world media liber gang ani opt...
3,3390,60c5d6bf5659ea5e55def419,OFFN,soldier japan dick head
4,4626,60c5d6bf5659ea5e55def7fa,OFFN,would better ask think sleazi shitbag lmao


In [3]:
sentences = data['text_clean'].astype(str)
tokenizer = Tokenizer(num_words = 1500,split=' ')
tokenizer.fit_on_texts(sentences)
sequence = tokenizer.texts_to_sequences(sentences)

In [5]:
max_seq_len = 2500

index_of_words = tokenizer.word_index
print("No of unique words : ",len(index_of_words))

X = pad_sequences(sequence , maxlen = max_seq_len )
Y = data['task_2']

print(X)

No of unique words :  8255
[[   0    0    0 ...  170    3  210]
 [   0    0    0 ...   72   54   73]
 [   0    0    0 ...    3   52   13]
 ...
 [   0    0    0 ...  817   45  156]
 [   0    0    0 ...  213   99   38]
 [   0    0    0 ... 1166  236   57]]


In [6]:
embed_dim = 256
vocabSize = len(index_of_words)
lstm_out = 64

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.15, random_state = 0)
Y_true = Y_test
Y_train = pd.get_dummies(Y_train).values
Y_test = pd.get_dummies(Y_test).values

In [14]:
model = Sequential()
model.add(Embedding(vocabSize, embed_dim,input_length = 2500))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4, activation='softmax'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 2500, 256)         2113280   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 2500, 256)         0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                82176     
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 260       
Total params: 2,195,716
Trainable params: 2,195,716
Non-trainable params: 0
_________________________________________________________________
None


In [15]:
from keras.callbacks import ModelCheckpoint
checkpoint = ModelCheckpoint("hasoc_b.h5", monitor='val_loss', verbose=1, save_best_only=True,
save_weights_only=False, mode='auto')

In [22]:
print(Y_true)

1358    NONE
2200    HATE
2337    PRFN
3640    NONE
2928    PRFN
        ... 
472     PRFN
15      PRFN
1813    HATE
1721    OFFN
3690    PRFN
Name: task_2, Length: 577, dtype: object


In [23]:
print(Y_test)
classes = ['HATE','NONE','PRFN','OFFN']

[[0 1 0 0]
 [1 0 0 0]
 [0 0 0 1]
 ...
 [1 0 0 0]
 [0 0 1 0]
 [0 0 0 1]]


In [16]:
model.fit(X_train,Y_train ,batch_size = 32, epochs = 1 ,validation_data=(X_test,Y_test) , callbacks=[checkpoint])

Epoch 00001: val_loss improved from inf to 0.39771, saving model to hasoc_b.h5


<tensorflow.python.keras.callbacks.History at 0x1da15983430>

In [17]:
Y_pred = model.predict(X_test)

In [18]:
print(Y_pred)

[[0.10567279 0.78024745 0.07441134 0.03966843]
 [0.16810624 0.739315   0.06376564 0.02881311]
 [0.01539988 0.10032225 0.04804364 0.8362343 ]
 ...
 [0.57636255 0.20152661 0.20633763 0.01577327]
 [0.49971446 0.2997688  0.16714613 0.03337058]
 [0.02898476 0.2095538  0.07587679 0.68558466]]


In [25]:
pred_class = []
for i in Y_pred:
    pred_class.append(np.argmax(i))
print(pred_class)

pred_class = pd.get_dummies(pred_class).values
print(pred_class)

[1, 1, 3, 1, 3, 3, 3, 1, 3, 0, 1, 3, 0, 3, 3, 3, 1, 3, 3, 1, 1, 0, 2, 0, 1, 3, 0, 2, 1, 3, 3, 3, 1, 3, 0, 3, 2, 3, 3, 0, 1, 1, 3, 2, 1, 1, 3, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 3, 3, 1, 0, 1, 3, 3, 1, 1, 3, 3, 2, 1, 3, 3, 1, 0, 1, 3, 1, 3, 0, 3, 3, 0, 3, 1, 3, 3, 3, 3, 1, 0, 0, 0, 1, 3, 1, 1, 3, 0, 0, 1, 3, 1, 3, 3, 0, 1, 2, 3, 3, 3, 3, 1, 2, 3, 0, 1, 1, 3, 1, 3, 3, 3, 1, 1, 1, 1, 3, 1, 1, 3, 3, 3, 0, 0, 0, 3, 0, 1, 3, 1, 3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 1, 0, 1, 3, 1, 3, 3, 1, 1, 3, 1, 3, 1, 1, 0, 1, 3, 1, 3, 1, 1, 3, 1, 3, 1, 0, 1, 0, 3, 2, 1, 3, 3, 3, 1, 0, 0, 3, 1, 1, 1, 0, 3, 0, 1, 1, 3, 3, 1, 3, 3, 1, 2, 0, 3, 3, 0, 1, 1, 3, 1, 1, 1, 3, 0, 3, 1, 3, 3, 3, 3, 1, 0, 0, 0, 3, 3, 3, 1, 3, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 3, 1, 2, 3, 1, 1, 1, 3, 3, 1, 1, 3, 0, 3, 1, 3, 3, 3, 3, 1, 3, 1, 3, 1, 1, 3, 3, 1, 1, 1, 1, 1, 3, 0, 3, 3, 1, 1, 0, 1, 3, 0, 3, 1, 0, 3, 1, 0, 0, 1, 3, 1, 1, 3, 3, 1, 1, 0, 0, 2, 3, 3, 0, 0, 3, 3, 1, 1, 1, 3, 0, 1, 0, 2, 1, 0, 1, 3, 1, 0, 1, 3, 1, 2, 3, 1, 1, 1, 1, 2, 3, 0, 3, 

In [26]:
print(classification_report(Y_test , pred_class))

              precision    recall  f1-score   support

           0       0.41      0.44      0.42        96
           1       0.62      0.70      0.66       197
           2       0.35      0.09      0.14       104
           3       0.71      0.89      0.79       180

   micro avg       0.60      0.60      0.60       577
   macro avg       0.52      0.53      0.50       577
weighted avg       0.56      0.60      0.57       577
 samples avg       0.60      0.60      0.60       577

