In [1]:
# All required libraries

import pickle
import re
from string import punctuation

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GlobalMaxPool1D
from keras.models import Model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [4]:
# Read the training data (raw comments plus label columns) from the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")

In [7]:
# Tokens discarded during cleaning: NLTK's English stop words followed by
# every character of string.punctuation.
stop_nltk = stopwords.words("english")
stop_updated = [*stop_nltk, *punctuation]

# Single lemmatizer instance shared by clean_txt.
lemm = WordNetLemmatizer()

In [11]:
def clean_txt(sent):
    """Normalize a raw comment into a space-joined string of lemmas.

    The text is lower-cased, every character other than ``a``-``z`` and the
    space is replaced by a space, and the result is tokenized. Tokens found
    in ``stop_updated`` or shorter than three characters are dropped; the
    remaining tokens are lemmatized with ``lemm``.

    Parameters
    ----------
    sent : str
        Raw comment text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (empty string if no
        token survives the filters).
    """
    # str.replace() matches its first argument literally, so the character
    # class '[^a-z ]' was never applied. re.sub() performs the intended
    # regular-expression substitution.
    normalized = re.sub(r'[^a-z ]', ' ', sent.lower())
    tokens = word_tokenize(normalized)
    lemmas = [lemm.lemmatize(term) for term in tokens
              if term not in stop_updated and len(term) > 2]
    return " ".join(lemmas)

In [12]:
# Store the cleaned version of every comment in a new column beside the raw text.
train['cleaned_review'] = train['comment_text'].apply(clean_txt)

In [38]:
# Target columns, in the order the model outputs them.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

x = train['cleaned_review'].values   # model input: cleaned comment text
y = train[labels].values             # targets: one column per entry in `labels`

In [39]:
# Fit a word-index vocabulary (capped via num_words=30000) on the cleaned
# comments, then encode every comment as a sequence of word indices.
# The filter set is a raw string: in a normal literal `\]` is an invalid
# escape sequence (SyntaxWarning on recent Python versions). The runtime
# value of the string is unchanged.
tokenizer = Tokenizer(num_words=30000, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(list(x))
tokenized_train = tokenizer.texts_to_sequences(x)

# Pad or truncate every sequence to a fixed length of 200 indices.
x_t = pad_sequences(tokenized_train, maxlen=200)

In [40]:
# Serialize the fitted tokenizer to 'tokenizer.pickle'.

with open('tokenizer.pickle', mode='wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [45]:
# `maxlen` and `max_features` were never assigned in the notebook (they only
# existed as leftover kernel state), so this cell raised NameError on a fresh
# kernel. They must agree with pad_sequences(maxlen=200) and
# Tokenizer(num_words=30000) used when preparing the training data.
maxlen = 200
max_features = 30000

# Input: one padded sequence of `maxlen` word indices per sample.
inp = Input(shape=(maxlen, ))
embed_size = 128
# Map each word index to a trainable `embed_size`-dimensional vector.
x = Embedding(max_features, embed_size)(inp)

In [46]:
# 60-unit LSTM; return_sequences=True keeps the output of every timestep so
# the following pooling layer can reduce over the sequence.
x = LSTM(units=60, return_sequences=True, name='lstm_layer')(x)

In [47]:
# Max-pool over the timestep axis: (None, 200, 60) -> (None, 60).
max_pool = GlobalMaxPool1D()
x = max_pool(x)

In [48]:
# Classification head: dropout -> 50-unit ReLU layer -> dropout -> six
# sigmoid outputs, one per entry in `labels`.
x = Dropout(rate=0.2)(x)
x = Dense(units=50, activation="relu")(x)
x = Dropout(rate=0.2)(x)
x = Dense(units=6, activation="sigmoid")(x)


In [49]:
# Assemble the functional model from the input tensor and the output head.
# The former `model = Sequential()` was dead code: it was overwritten on the
# very next line and the Sequential instance was never used.
model = Model(inputs=inp, outputs=x)

# binary_crossentropy with sigmoid outputs scores each label independently.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints the layer table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 200, 128)          3840000   
_________________________________________________________________
lstm_layer (LSTM)            (None, 200, 60)           45360     
_________________________________________________________________
global_max_pooling1d_2 (Glob (None, 60)                0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 50)                3050      
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0   

In [73]:
# Checkpoint callback: save to 'wts.h5' only when val_accuracy reaches a new maximum.

filepath = 'wts.h5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)

In [None]:
# Model fitting: a single epoch, with 33% of the samples held out for
# validation so the checkpoint callback has a val_accuracy to monitor.

model.fit(
    x_t,
    y,
    batch_size=30,
    epochs=1,
    validation_split=0.33,
    verbose=0,
    callbacks=[checkpoint],
)

In [68]:
# Testing the model with unseen comments

test_comments = [
    "Never trust Udemy. It is the most pathetic, unworthy and untrustful site.",
    "When seeing all over the experience with Udemy is too bad and I never recommend anyone to go for udemy. Type of YouTube channels are better than udemy.. At least you can save your money.",
    "Many of my coworkers choose to use Udemy for continuing education. I feel it has the best selection, training and curriculum vs others I have tried. Yes, the courses may be longer than others, but they're more detailed.",
    "This bar sucks plain and simple. Dominated by hipster retro people that can not be talked to unless you know know Morrissey's new album. Pabst always a great price shit bartenders who will ignore you and make drinks like they know what they are doing. This place is great for any 20 something year old trying to fit in haha garbage place. Oh do not forget the sexist 5 dollar make charge lol get fucked",
    "An extremely helpful and informative course, especially in conjuction with multi-modal training. Training materials were well organized and provided good case studies. Instructor was extremely professional and pleasant to learn from. Dawn did an exceptional job presenting the material. She set up by explaining what she was going to teach us, summarized, and proceeded to teach, providing relevant real life examples. She found out what we handled and catered examples to us to make the course meaningful. I am a CHMM and have taken many similar courses - this was very well done, which I attribute primarily to the instructor and secondarily to the quality materials.",
]

test_df = pd.DataFrame(data=test_comments, columns=['Test Comments'])

# Apply the same cleaning function that was applied to the training comments.
test_df['Clean_test_comm'] = test_df['Test Comments'].apply(clean_txt)

test = test_df['Clean_test_comm'].values

# Encode with the tokenizer that was fitted on the training comments.
tokenized_test = tokenizer.texts_to_sequences(list(test))

# The original referenced `maxlen`, which no cell in the notebook defines.
# Take the padded length from the training matrix instead, so the test
# sequences always have the width the model was built for.
xtest = pad_sequences(tokenized_test, maxlen=x_t.shape[1])

In [69]:
# Sigmoid outputs for the test comments: one row per comment, one column per label.
ypred = model.predict(x=xtest)
ypred

array([[6.24631524e-01, 1.28108263e-03, 8.85643363e-02, 5.08612394e-03,
        2.13645548e-01, 1.09324753e-02],
       [3.02195549e-05, 0.00000000e+00, 3.63588333e-06, 1.19209290e-07,
        2.29477882e-06, 1.19209290e-07],
       [1.72734261e-04, 0.00000000e+00, 1.82688236e-05, 4.76837158e-07,
        1.15036964e-05, 6.25848770e-07],
       [6.60809457e-01, 5.06237149e-03, 5.02703011e-01, 2.50238180e-03,
        2.72275239e-01, 1.26154125e-02],
       [3.25143337e-04, 8.94069672e-08, 3.43322754e-05, 1.01327896e-06,
        2.12318319e-05, 2.63981246e-06]], dtype=float32)

In [70]:
# Round each predicted value to 0 or 1 and attach the result, column-wise,
# to the test comments.
test_df_fin = pd.concat(
    [
        test_df,
        pd.DataFrame(data=np.round(ypred), columns=labels),
    ],
    axis="columns",
)

# Show everything except the first column (the raw comment text).
test_df_fin.iloc[:, 1:]

Unnamed: 0,Clean_test_comm,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,never trust udemy pathetic unworthy untrustful...,1.0,0.0,0.0,0.0,0.0,0.0
1,seeing experience udemy bad never recommend an...,0.0,0.0,0.0,0.0,0.0,0.0
2,many coworkers choose use udemy continuing edu...,0.0,0.0,0.0,0.0,0.0,0.0
3,bar suck plain simple dominated hipster retro ...,1.0,0.0,1.0,0.0,0.0,0.0
4,extremely helpful informative course especiall...,0.0,0.0,0.0,0.0,0.0,0.0
