In [131]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import os
import re
import sys
import unicodedata
from tqdm import tqdm
import gensim

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.models import Model

In [132]:
def remove_stopwords(sentences):
    stop_words=set(stopwords.words('english'))
    
    new_sentences=[]
    
    for sentence in tqdm(sentences):
        sentence = [word for word in sentence.lower().split() if word not in stop_words]
        sentence = ' '.join(sentence)
        new_sentences.append(sentence)
    
    return new_sentences    

In [133]:
def punc(sentences):
    punctuation = dict.fromkeys([i for i in range(sys.maxunicode)
                                 if unicodedata.category(chr(i)).startswith('P')])

    new_sentences=[]
    
    for sentence in tqdm(sentences):
        sentence = [i.lower() for i in nltk.word_tokenize(sentence.translate(punctuation))]
        sentence= ' '.join(sentence)
        new_sentences.append(sentence)
    
    return new_sentences    

In [134]:
def tokenize(sentences):
    
    tokens=[]
    
    for sentence in sentences:
        sentence=nltk.word_tokenize(sentence)
        tokens.append(sentence)
    
    return tokens

In [135]:
df_train=pd.read_csv(os.path.join('data', 'insults', 'train.csv'))
df_test=pd.read_csv(os.path.join('data', 'insults', 'test.csv'))
df_valid=pd.read_csv(os.path.join('data', 'insults', 'valid.csv'))

df=[df_train, df_test, df_valid]
df=pd.concat(df)
print(df.columns)
all_comments=df.Comment.values
all_labels=df.Class.values
comments=[]

for comment in tqdm(all_comments):
    comment=re.sub(r"\\+[a-z0-9]{1,3}", ' ', comment)
    comment=re.sub(r'\@\w+', ' ', comment)
    comments.append(comment)

comments=punc(comments)
comments=remove_stopwords(comments)
comments=tokenize(comments)

print(comments[:5])

100%|██████████| 8829/8829 [00:00<00:00, 136963.01it/s]

Index(['Comment', 'Class'], dtype='object')



100%|██████████| 8829/8829 [00:02<00:00, 3983.03it/s]
100%|██████████| 8829/8829 [00:00<00:00, 122456.27it/s]


[['fuck', 'dad'], ['really', 'dont', 'understand', 'point', 'seems', 'mixing', 'apples', 'oranges'], ['majority', 'canadians', 'wrong', 'unless', 'youre', 'supportive', 'idea', 'nothing', 'full', 'proof', 'perfect', 'take', 'chances', 'inadvertently', 'kill', 'son', 'daughter', 'thems', 'breaks', 'always', 'regard', 'collateral', 'damage', 'like', 'wartime', 'sorry', 'cheques', 'mail'], ['listen', 'dont', 'wan', 'na', 'get', 'married', 'man', 'women', 'dont', 'would', 'bother', 'gay', 'people', 'got', 'married', 'stay', 'lane', 'let', 'god', 'nice', 'quick', 'judg', 'like', 'thought', 'wasnt', 'suppose', 'judge', 'people'], ['c', 'c', 'b', 'a1n', 'xu', 'd1ng', '11', 'b0', 'ddng', 'bi', 'c3u', 'nh', '2011', 'c', 'n', 'ho', 'kh', 'ng', 'c', 'c', 'ng', 'b0', 'n', 'ng', 'd3i', 'cu', '11', 'a7u', 'chi', 'e5', 'nh', 'e5c', 'c', 'n', 'ho', 'kh', 'ng', 'c', 'c', 'n', 'ng', 'n', 'gi', 'ef', '11', 'a5t', 'df', 'v', '03n', 'giang', 'c', 'a7n', 'th', 'a1', 'c', 'n', 'ho', 'kh', 'ng', 'r', 'd1t', '

In [136]:
max_words = 1000
max_len = 50
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(comments)
sequences = tok.texts_to_sequences(comments)
print(sequences[0])
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
print(sequences_matrix[0])
labels = pd.Series(all_labels).str.get_dummies()
labels=np.asarray(labels)

import pickle
with open('tok.pkl', 'wb') as f:
    pickle.dump(tok, f, pickle.HIGHEST_PROTOCOL)

[10, 657]
[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0  10 657]


In [137]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,100,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(256,name='FC1')(layer)
    layer = Activation('relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(2,name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

model = RNN()
model.summary()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 50)                0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 50, 100)           100000    
_________________________________________________________________
lstm_10 (LSTM)               (None, 64)                42240     
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
activation_19 (Activation)   (None, 256)               0         
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 2)                 514       
__________

In [138]:
model.fit(sequences_matrix,labels,batch_size=128,epochs=10)
model.save('models/model2.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [142]:
del model
del tok
pred_list=['Insult', 'Not Insult']
from keras.models import load_model

model=load_model('models/model1.h5')
with open('tok.pkl', 'rb') as f:
    tok = pickle.load(f)

text=[['what', 'the', 'dad']]
sequences = tok.texts_to_sequences(text)
print(sequences)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
print(sequences_matrix)

pred=model.predict(sequences_matrix)

print(pred_list[np.argmax(pred)])

[[657]]
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0 657]]
Not Insult


In [140]:
labels

array([[1, 0],
       [0, 1],
       [0, 1],
       ...,
       [0, 1],
       [1, 0],
       [0, 1]])