### 2. Neural Network Classifier with Keras

In [1]:
# load libraries
import pandas as pd
import numpy as np
import json
import re
import pickle
import nltk
import sklearn

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, auc, precision_recall_fscore_support
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

In [2]:
# Read data
def read_data(file):
    '''
    Take a json file location and
    read the file into a pandas data frame
    Args: full path to file
    Returns: pandas dataframe with data from file
    '''
    
    data = []
    with open(file) as f:
        for line in f:
            data.append(json.loads(line))
    # convert to dataframe
    return pd.DataFrame(data)

In [3]:
# load data into a dataframe
con_df = pd.read_json("controversial-comments.jsonl", lines=True)

# check size, structure and categories

print('Size: ', len(con_df), '\n',
      'Shape: ', con_df.info(), '\n',
      'Categories: ', con_df.con.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950000 entries, 0 to 949999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   con     950000 non-null  int64 
 1   txt     950000 non-null  object
dtypes: int64(1), object(1)
memory usage: 14.5+ MB
Size:  950000 
 Shape:  None 
 Categories:  [0 1]


In [4]:
# pre-process data
def clean_text(text):
    """
    Remove punctuations and special characters, makes lower case
    Args: text 
    Output: text
    """
    
    text=text.lower()
    text=re.sub('</?.*?>',' <>', text)
    text=re.sub('\\d|\\W+|_',' ',text)
    text=re.sub('[^a-zA-Z]'," ", text)
    
    return text

In [5]:
# create stop word list
stop_words = stopwords.words('english')

size = 10000    # sample size
replace = True  # with replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, size, replace),:]

controversy = con_df.groupby('con', as_index=False).apply(fn)

In [6]:
# free up memory

del con_df

controversy['txt'] = controversy['txt'].apply(lambda x:clean_text(x))
controversy.reset_index(drop=True, inplace=True)

controversy.head()

Unnamed: 0,con,txt
0,0,hi drpupipance thank you for participating in ...
1,0,he was put in an impossible situation created ...
2,0,so what you re saying isn t actually indicativ...
3,0,and just after ted cruz told close friends tha...
4,0,pretty sure some of them believe this


In [8]:
# assign a variable to load into the ML model
corpus = controversy
X = corpus['txt']
y = corpus['con']

In [9]:
# load library
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# convert text to numbers
vectorizer = CountVectorizer(max_features=5000, min_df=0.5, max_df=10, stop_words=stopwords.words('english'))
X = vectorizer.fit_transform(corpus).toarray()
X

array([[1, 0],
       [0, 1]], dtype=int64)

In [11]:
# convert values into tdidf values
from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()
X

array([[1., 0.],
       [0., 1.]])

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(corpus['txt']).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [13]:
# divide the data into training (80%) and testing sets (20%)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# load libraries
import keras
from keras.models import Sequential
from keras.layers import Dense

In [15]:
#Initializing Neural Network
classifier = Sequential()

In [None]:
input_dim = X_train.shape[1]
# Adding the input layer and the first hidden layer
classifier.add(Dense(output_dim = 500, init = 'uniform', activation = 'softmax', input_dim = input_dim))

# Adding the second hidden layer
classifier.add(Dense(output_dim = 150, init = 'uniform', activation = 'softmax'))

# Adding the output layer
classifier.add(Dense(output_dim = 1, init = 'uniform', activation = 'sigmoid'))

In [17]:
# Compiling Neural Network
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [18]:
# Fitting our model 
classifier.fit(X_train, y_train, batch_size = 5, nb_epoch = 10)

  



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x15a9efc2b88>

In [19]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

In [20]:
# Creating the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [21]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[1154  840]
 [ 761 1245]]
              precision    recall  f1-score   support

           0       0.60      0.58      0.59      1994
           1       0.60      0.62      0.61      2006

    accuracy                           0.60      4000
   macro avg       0.60      0.60      0.60      4000
weighted avg       0.60      0.60      0.60      4000

0.59975


#### The accuracy score is 0.59975. Compared to 0.6085 from scikit-learn's method, they are very close.