In [1]:
!pip install contractions

import pandas as pd
import re
import contractions
import en_core_web_sm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras import Sequential
from keras.layers import Embedding, Dense, Dropout, GlobalMaxPool1D
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix

Collecting contractions
  Downloading contractions-0.0.24-py2.py3-none-any.whl (3.2 kB)
Collecting textsearch
  Downloading textsearch-0.0.17-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: textsearch, contractions
Successfully installed contractions-0.0.24 textsearch-0.0.17


Using TensorFlow backend.


In [2]:
#LOAD DATA
o = pd.read_csv("../input/onion-or-not/OnionOrNot.csv")

#SHOW FIVE ROWS
o.head(5)

Unnamed: 0,text,label
0,Entire Facebook Staff Laughs As Man Tightens P...,1
1,Muslim Woman Denied Soda Can for Fear She Coul...,0
2,Bold Move: Hulu Has Announced That They’re Gon...,1
3,Despondent Jeff Bezos Realizes He’ll Have To W...,1
4,"For men looking for great single women, online...",1


In [3]:
#FIX CONTRACTIONS
o['text'] = o['text'].apply(lambda x: contractions.fix(x))

#REMOVE PUNCTUATION
o['text'] = o['text'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))

#CONVERT TO LOWERCASE

def lowerCase(input_str):
    input_str = input_str.lower()
    return input_str

o['text'] = o['text'].apply(lambda x: lowerCase(x))

#SHOW FIVE ROWS
o.head(5)

Unnamed: 0,text,label
0,entire facebook staff laughs as man tightens p...,1
1,muslim woman denied soda can for fear she coul...,0
2,bold move hulu has announced that they are goi...,1
3,despondent jeff bezos realizes he will have to...,1
4,for men looking for great single women online ...,1


In [4]:
#LEMMATIZATION
sp = en_core_web_sm.load()

def lemma(input_str):
    s = sp(input_str)
    
    input_list = []
    for word in s:
        w = word.lemma_
        input_list.append(w)
        
    output = ' '.join(input_list)
    return output

o['text'] = o['text'].apply(lambda x: lemma(x))

#SHOW FIVE ROWS
o.head(5)

Unnamed: 0,text,label
0,entire facebook staff laugh as man tighten pri...,1
1,muslim woman deny soda can for fear -PRON- cou...,0
2,bold move hulu have announce that -PRON- be go...,1
3,despondent jeff bezos realize -PRON- will have...,1
4,for man look for great single woman online dat...,1


In [5]:
#VECTORIZE
tokenizer = Tokenizer(num_words = 10000, split = ' ')
tokenizer.fit_on_texts(o['text'].values)

X = tokenizer.texts_to_sequences(o['text'].values)
X = pad_sequences(X)

y = o['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,
                                                    random_state = 42)

In [6]:
#BUILD THE MODEL
model = Sequential()

model.add(Embedding(10000, 128, input_length = X.shape[1]))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(GlobalMaxPool1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 63, 128)           1280000   
_________________________________________________________________
dense_1 (Dense)              (None, 63, 128)           16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 63, 128)           0         
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

In [7]:
#TRAIN THE MODEL
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1)

history = model.fit(X_train, y_train, 
                    epochs = 1000, batch_size = 32, verbose = 0, #YOU CAN CHANGE verbose = 1 TO SEE THE PROCESS
                    validation_data = (X_test, y_test), callbacks=[es])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 00003: early stopping


In [8]:
#SHOW ACCURACY AND CONFUSION MATRIX
y_pred = model.predict(X_test)
y_pred = y_pred > 0.5

accuracy_score(y_pred, y_test)

0.8295833333333333

In [9]:
confusion_matrix(y_pred, y_test)

array([[2538,  338],
       [ 480, 1444]])