In [0]:
import pandas as pd
import numpy as np
import csv
import sys
import nltk
from nltk.corpus import stopwords
import glob
import os
import string

In [0]:
#Load training file

# Read inside a context manager so the file handle is always closed.
# Blank lines (for example the empty string left after the final newline by
# split('\n')) are dropped; otherwise they fall through to the `else` branch
# and become empty reviews labelled 2.
with open("train1.ft.txt", "r", encoding="utf8") as text_file:
    lines = [line for line in text_file.read().split('\n') if line.strip()]

# A line starting with '__label__1' gets label 1, every other line gets label 2.
labels = [1 if item.startswith('__label__1') else 2 for item in lines]
        
def remove_label(s):
    """Return `s` without its first 11 characters.

    The callers in this notebook test `item[:10]` against '__label__1', so the
    11 characters removed are that 10-character tag plus the single character
    that follows it. Strings shorter than 11 characters come back empty.
    """
    label_prefix_width = 11
    return s[label_prefix_width:]
# Strip the label tag from every line so only the review text remains.
lines = [remove_label(s) for s in lines]

df = pd.DataFrame({'text': lines, 'label': labels})

# Draw the working subset. The seed makes the same rows come back on every run,
# and the cap stops sample() from raising when the file holds fewer than
# 120000 reviews.
df = df.sample(min(120000, len(df)), random_state=42)
print(len(df))
df.head()

120000


Unnamed: 0,text,label
3265217,JUNK / Bad Seller!!: I paid $7.63 for this ite...,1
3207238,An awesome book by a wonderful author: I met L...,2
1359854,"Too small and awkward: While cute, this little...",1
1715468,Very enjoyable: Feels like I've spent a month ...,2
2670191,Go back in time - Watch this movie: This is a ...,2


In [0]:
#Clean text

# Stored as a set: the cleaning step tests every token for membership, and a
# set lookup avoids scanning the whole stopword list each time.
stop = set(stopwords.words('english'))
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split `text` with `w_tokenizer` and return the lemma of each token as a list."""
    lemmas = []
    for token in w_tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

#lowercase and remove punctuation, remove stopwords

# Punctuation is deleted with a translation table rather than
# str.replace('[...]'): whether str.replace treats its pattern as a regex
# depends on the pandas version, and the unescaped character class only parsed
# by accident. Every character in string.punctuation (backslash included) is removed.
punctuation_table = str.maketrans('', '', string.punctuation)
stop_lookup = set(stop)  # O(1) membership test even if `stop` is a list

df['text'] = df['text'].str.lower()
df['text'] = df['text'].str.replace('-', ' ', regex=False)  # literal hyphen -> space
df['text'] = df['text'].str.split(' ')
df['text'] = df['text'].apply(lambda tokens: [tok for tok in tokens if tok not in stop_lookup])
df['text'] = df['text'].apply(' '.join)
df['text'] = df['text'].str.translate(punctuation_table)
df['text'] = df['text'].apply(lemmatize_text)
df['text'] = df['text'].apply(' '.join)
# Second pass in case lemmatisation produced a form containing punctuation.
df['text'] = df['text'].str.translate(punctuation_table)


In [0]:
#Train Word2Vec

from gensim.models import Word2Vec

# One whitespace-split token list per cleaned review; these lists are the
# training sentences. All Word2Vec hyperparameters are left at library defaults.
text = list(map(str.split, df['text']))
model_w2v = Word2Vec(sentences=text)

In [0]:
# Persist the trained Word2Vec model to 'model_w2v.bin' in the working directory.
# NOTE(review): save() presumably writes gensim's own format rather than the
# word2vec C binary format the '.bin' suffix suggests — confirm before loading elsewhere.
model_w2v.save('model_w2v.bin')

In [0]:
#Average Word2Vec Vectors for BOW

from tqdm import tqdm

# Look words up through the KeyedVectors object; indexing the model directly
# (model_w2v[word]) is deprecated in gensim.
word_vectors = model_w2v.wv
embedding_dim = word_vectors.vector_size

# One vector per review: the mean of the vectors of its in-vocabulary tokens.
# Reviews with no token in the vocabulary (including empty reviews) get a zero
# vector of the embedding size, so every entry is an ndarray of the same length.
text_avg_vec = []
for tokens in tqdm(text):
    known_vectors = [word_vectors[tok] for tok in tokens if tok in word_vectors]
    if known_vectors:
        text_avg_vec.append(np.sum(known_vectors, axis=0) / len(known_vectors))
    else:
        text_avg_vec.append(np.zeros(embedding_dim))

  # This is added back by InteractiveShellApp.init_path()
100%|██████████| 120000/120000 [00:20<00:00, 5788.63it/s]


In [0]:
# Any entry that is not exactly an ndarray is swapped for a 100-element zero
# vector. The list is updated in place.
text_avg_vec[:] = [
    vec if type(vec) == np.ndarray else np.zeros(100)
    for vec in text_avg_vec
]

In [0]:
# Stack the per-review vectors into one 2-D feature matrix.
x_train = np.c_[text_avg_vec]

# Shift labels from {1, 2} to {0, 1} for the classifiers. The shift is applied
# to a copy rather than written back into df, so re-running this cell cannot
# push the labels down a second time.
y_train = np.asarray(df['label']) - 1

In [0]:
#Logistic Regression Training

from sklearn.linear_model import LogisticRegression

# Default-hyperparameter logistic regression on the averaged review vectors.
lr = LogisticRegression()
lr = lr.fit(x_train, y_train)



In [0]:
#Naive Bayes Training

from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the same features.
clf = GaussianNB()
clf = clf.fit(x_train, y_train)

In [0]:
#Deep Neural net Training

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, load_model

# mini batches Nadam optimizer with dropout and batch normalization
epochs = 100

# Each hidden block is Dense -> ReLU -> BatchNormalization, optionally followed
# by Dropout. Entries are (units, dropout rate or None).
hidden_blocks = [(32, None), (32, 0.2), (32, None), (64, 0.3), (64, None), (64, 0.4)]

model = tf.keras.Sequential()
for block_index, (units, dropout_rate) in enumerate(hidden_blocks):
    if block_index == 0:
        # Input width follows the feature matrix instead of a hard-coded 100.
        model.add(layers.Dense(units, input_dim=x_train.shape[1]))
    else:
        model.add(layers.Dense(units))
    model.add(layers.Activation('relu'))
    model.add(layers.BatchNormalization())
    if dropout_rate is not None:
        model.add(layers.Dropout(rate=dropout_rate))

# Single sigmoid unit for the binary label.
model.add(layers.Dense(1))
model.add(layers.Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              # `learning_rate` replaces the deprecated `lr` keyword.
              optimizer=keras.optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999),
              metrics=['accuracy'])
# Save to "NN.model" only when validation accuracy improves.
checkpoint = keras.callbacks.ModelCheckpoint("NN.model", monitor='val_accuracy', verbose=1, save_best_only=True)

model.summary()
# model1 holds the object returned by fit(); 20% of the training data is used for validation.
model1 = model.fit(x_train, y_train, epochs=epochs, validation_split=0.2, callbacks=[checkpoint])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_14 (Dense)             (None, 32)                3232      
_________________________________________________________________
activation_14 (Activation)   (None, 32)                0         
_________________________________________________________________
batch_normalization_12 (Batc (None, 32)                128       
_________________________________________________________________
dense_15 (Dense)             (None, 32)                1056      
_________________________________________________________________
activation_15 (Activation)   (None, 32)                0         
_________________________________________________________________
batch_normalization_13 (Batc (None, 32)                128       
_________________________________________________________________
dropout_6 (Dropout)          (None, 32)               

In [0]:
# Add a trailing axis of size 1 for the Conv1D input. The row count is taken
# from x_train rather than hard-coded to 120000.
x_train1 = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
print(x_train1.shape)

(120000, 100, 1)


In [0]:
#CNN Training

import tensorflow as tf

shape = (x_train.shape[1], 1)

# Four stages, each two Conv1D layers (kernel 3, ReLU) followed by max pooling.
# Entries are (filters, pool size).
conv_stages = [(32, 2), (64, 2), (128, 2), (256, 5)]

model = tf.keras.models.Sequential()
for stage_index, (n_filters, pool_width) in enumerate(conv_stages):
    # Only the very first layer declares the input shape.
    first_layer_kwargs = {'input_shape': shape} if stage_index == 0 else {}
    model.add(tf.keras.layers.Conv1D(n_filters, kernel_size=3, activation=tf.nn.relu, **first_layer_kwargs))
    model.add(tf.keras.layers.Conv1D(n_filters, kernel_size=3, activation=tf.nn.relu))
    model.add(tf.keras.layers.MaxPooling1D(pool_size=pool_width))

model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.BatchNormalization())

# Dense head: 200 -> 100 -> one sigmoid unit.
for head_units in (200, 100):
    model.add(tf.keras.layers.Dense(head_units, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.sigmoid))


model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [0]:
# Write "CNN.model" only when validation accuracy improves, then train for
# 100 epochs with the last 20% split off for validation.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath="CNN.model",
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
)
model2 = model.fit(
    x=x_train1,
    y=y_train,
    epochs=100,
    validation_split=0.2,
    callbacks=[checkpoint],
)

Train on 96000 samples, validate on 24000 samples
Epoch 1/100
Epoch 00001: val_accuracy improved from -inf to 0.83017, saving model to CNN.model
INFO:tensorflow:Assets written to: CNN.model/assets
Epoch 2/100
Epoch 00002: val_accuracy did not improve from 0.83017
Epoch 3/100
Epoch 00003: val_accuracy improved from 0.83017 to 0.84704, saving model to CNN.model
INFO:tensorflow:Assets written to: CNN.model/assets
Epoch 4/100
Epoch 00004: val_accuracy did not improve from 0.84704
Epoch 5/100
Epoch 00005: val_accuracy did not improve from 0.84704
Epoch 6/100
Epoch 00006: val_accuracy did not improve from 0.84704
Epoch 7/100
Epoch 00007: val_accuracy improved from 0.84704 to 0.84800, saving model to CNN.model
INFO:tensorflow:Assets written to: CNN.model/assets
Epoch 8/100
Epoch 00008: val_accuracy improved from 0.84800 to 0.84808, saving model to CNN.model
INFO:tensorflow:Assets written to: CNN.model/assets
Epoch 9/100
Epoch 00009: val_accuracy did not improve from 0.84808
Epoch 10/100
Epoch

In [0]:
#Load and Clean test file

# Context manager closes the handle. Blank lines (such as the empty string left
# after the final newline) are dropped so they do not become label-2 rows.
with open("test1.ft.txt", "r", encoding="utf8") as text_file:
    lines = [line for line in text_file.read().split('\n') if line.strip()]

# A line starting with '__label__1' gets label 1, every other line gets label 2.
labels = [1 if item.startswith('__label__1') else 2 for item in lines]

def remove_label(s):
    """Return `s` without its first 11 characters (the 10-character label tag and the character after it)."""
    return s[11:]
lines = [remove_label(s) for s in lines]

df = pd.DataFrame()
df['text'] = lines
df['label'] = labels

# Sample before cleaning so only the 40000 rows that are actually evaluated
# are lower-cased, filtered and lemmatised.
df = df.sample(40000)

stop = set(stopwords.words('english'))  # set: O(1) membership per token
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    """Split `text` with `w_tokenizer` and return the lemma of each token as a list."""
    return [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text)]

#lowercase and remove punctuation, remove stopwords
# A translation table deletes every string.punctuation character without
# relying on how str.replace interprets its pattern, which varies by pandas version.
punctuation_table = str.maketrans('', '', string.punctuation)

df['text'] = df['text'].str.lower()
df['text'] = df['text'].str.replace('-', ' ', regex=False)  # literal hyphen -> space
df['text'] = df['text'].str.split(' ')
df['text'] = df['text'].apply(lambda tokens: [tok for tok in tokens if tok not in stop])
df['text'] = df['text'].apply(' '.join)
df['text'] = df['text'].str.translate(punctuation_table)
df['text'] = df['text'].apply(lemmatize_text)
df['text'] = df['text'].apply(' '.join)
# Second pass in case lemmatisation produced a form containing punctuation.
df['text'] = df['text'].str.translate(punctuation_table)

In [0]:
#Embed Word2Vec and BOW

from tqdm import tqdm

text = [row.split() for row in df['text']]

# Look words up through the KeyedVectors object; indexing the model directly
# (model_w2v[word]) is deprecated in gensim.
word_vectors = model_w2v.wv
embedding_dim = word_vectors.vector_size

# One vector per review: the mean of the vectors of its in-vocabulary tokens.
# Reviews with no token in the vocabulary (including empty reviews) get a zero
# vector of the embedding size, so every entry is an ndarray of the same length.
text_avg_vec = []
for tokens in tqdm(text):
    known_vectors = [word_vectors[tok] for tok in tokens if tok in word_vectors]
    if known_vectors:
        text_avg_vec.append(np.sum(known_vectors, axis=0) / len(known_vectors))
    else:
        text_avg_vec.append(np.zeros(embedding_dim))

  if sys.path[0] == '':
100%|██████████| 40000/40000 [00:07<00:00, 5480.97it/s]


In [0]:
# Any entry that is not exactly an ndarray is swapped for a 100-element zero
# vector. The list is updated in place.
text_avg_vec[:] = [
    vec if type(vec) == np.ndarray else np.zeros(100)
    for vec in text_avg_vec
]

In [0]:
# Stack the per-review vectors into the 2-D test feature matrix.
x_test = np.c_[text_avg_vec]

# Shift labels from {1, 2} to {0, 1}.
# NOTE(review): this rewrites df['label'] in place, so running the cell twice
# shifts the labels twice.
df['label'] = df['label'].sub(1)

In [0]:
#Logistic Regression Results

# Class predictions for every test row, stored beside the true labels.
predicted = lr.predict(X=x_test)
df['prediction'] = predicted

In [0]:
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class metrics for the predictions currently held in `predicted`;
# position 0 is label 0 and position 1 is label 1, as fixed by `labels`.
y_test = df['label']
precision, recall, fscore, support = score(y_test, predicted, labels=[0, 1])

for template, values in (('precision: {}', precision),
                         ('recall: {}', recall),
                         ('fscore: {}', fscore),
                         ('support: {}', support)):
    print(template.format(values))

precision: [0.84339836 0.85587053]
recall: [0.85837167 0.84068186]
fscore: [0.85081914 0.84820821]
support: [19996 20004]


In [0]:
# Vectorised counts instead of a Python-level iterrows() loop over every row.
actual = df['label']
guessed = df['prediction']

count_true = int((actual == guessed).sum())
false_pos = int(((actual == 0) & (guessed == 1)).sum())  # labelled 0, predicted 1
false_neg = int(((actual == 1) & (guessed == 0)).sum())  # labelled 1, predicted 0

# All three figures are fractions of the whole test frame.
print("Accuracy on test set: " + str(count_true/len(df)))
print("False pos: " + str(false_pos/len(df)))
print("False neg: " + str(false_neg/len(df)))

Accuracy on test set: 0.849525
False pos: 0.0708
False neg: 0.079675


In [0]:
#Naive Bayes Results

# Class predictions for every test row, stored beside the true labels.
predicted = clf.predict(X=x_test)
df['prediction'] = predicted

In [0]:
# Vectorised counts instead of a Python-level iterrows() loop over every row.
actual = df['label']
guessed = df['prediction']

count_true = int((actual == guessed).sum())
false_pos = int(((actual == 0) & (guessed == 1)).sum())  # labelled 0, predicted 1
false_neg = int(((actual == 1) & (guessed == 0)).sum())  # labelled 1, predicted 0

# All three figures are fractions of the whole test frame.
print("Accuracy on test set: " + str(count_true/len(df)))
print("False pos: " + str(false_pos/len(df)))
print("False neg: " + str(false_neg/len(df)))

Accuracy on test set: 0.72535
False pos: 0.14005
False neg: 0.1346


In [0]:
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class metrics for the predictions currently held in `predicted`;
# position 0 is label 0 and position 1 is label 1, as fixed by `labels`.
y_test = df['label']
precision, recall, fscore, support = score(y_test, predicted, labels=[0, 1])

for template, values in (('precision: {}', precision),
                         ('recall: {}', recall),
                         ('fscore: {}', fscore),
                         ('support: {}', support)):
    print(template.format(values))

precision: [0.72777834 0.72297498]
recall: [0.71984397 0.73085383]
fscore: [0.72378941 0.72689305]
support: [19996 20004]


In [0]:
#Deep neural network Results

# Reload the checkpoint written during training.
nn_model = tf.keras.models.load_model('NN.model')

# predict_classes() was removed from Keras. For a single sigmoid output the
# documented replacement is thresholding predict() at 0.5. ravel() gives a 1-D
# label array for the column assignment and the metric helpers.
predicted = (nn_model.predict(x_test) > 0.5).astype('int32').ravel()
df['prediction'] = predicted

In [0]:
# Vectorised counts instead of a Python-level iterrows() loop over every row.
actual = df['label']
guessed = df['prediction']

count_true = int((actual == guessed).sum())
false_pos = int(((actual == 0) & (guessed == 1)).sum())  # labelled 0, predicted 1
false_neg = int(((actual == 1) & (guessed == 0)).sum())  # labelled 1, predicted 0

# All three figures are fractions of the whole test frame.
print("Accuracy on test set: " + str(count_true/len(df)))
print("False pos: " + str(false_pos/len(df)))
print("False neg: " + str(false_neg/len(df)))

Accuracy on test set: 0.861675
False pos: 0.069775
False neg: 0.06855


In [0]:
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class metrics for the predictions currently held in `predicted`;
# position 0 is label 0 and position 1 is label 1, as fixed by `labels`.
y_test = df['label']
precision, recall, fscore, support = score(y_test, predicted, labels=[0, 1])

for template, values in (('precision: {}', precision),
                         ('recall: {}', recall),
                         ('fscore: {}', fscore),
                         ('support: {}', support)):
    print(template.format(values))

precision: [0.86253572 0.86081883]
recall: [0.86042208 0.86292741]
fscore: [0.86147761 0.86187183]
support: [19996 20004]


In [0]:
#CNN Results

# Add the trailing axis the Conv1D input expects. The row count is taken from
# x_test rather than hard-coded to 40000.
x_test1 = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

cnn_model = tf.keras.models.load_model('CNN.model')

# predict_classes() was removed from Keras. For a single sigmoid output the
# documented replacement is thresholding predict() at 0.5. ravel() gives a 1-D
# label array for the column assignment and the metric helpers.
predicted = (cnn_model.predict(x_test1) > 0.5).astype('int32').ravel()
df['prediction'] = predicted

In [0]:
# Vectorised counts instead of a Python-level iterrows() loop over every row.
actual = df['label']
guessed = df['prediction']

count_true = int((actual == guessed).sum())
false_pos = int(((actual == 0) & (guessed == 1)).sum())  # labelled 0, predicted 1
false_neg = int(((actual == 1) & (guessed == 0)).sum())  # labelled 1, predicted 0

# All three figures are fractions of the whole test frame.
print("Accuracy on test set: " + str(count_true/len(df)))
print("False pos: " + str(false_pos/len(df)))
print("False neg: " + str(false_neg/len(df)))

Accuracy on test set: 0.854125
False pos: 0.072325
False neg: 0.07355


In [0]:
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class metrics for the predictions currently held in `predicted`;
# position 0 is label 0 and position 1 is label 1, as fixed by `labels`.
y_test = df['label']
precision, recall, fscore, support = score(y_test, predicted, labels=[0, 1])

for template, values in (('precision: {}', precision),
                         ('recall: {}', recall),
                         ('fscore: {}', fscore),
                         ('support: {}', support)):
    print(template.format(values))

precision: [0.85323023 0.8550238 ]
recall: [0.85532106 0.85292941]
fscore: [0.85427437 0.85397532]
support: [19996 20004]


In [0]:
#Training SVM

from sklearn import svm

# Linear-kernel support vector classifier fitted on the averaged review vectors.
# NOTE(review): a kernel SVC on the full training set is probably slow — confirm
# the run time is acceptable.
svm_model = svm.SVC(kernel='linear').fit(x_train, y_train)

# Last expression so the notebook shows the fitted estimator's repr.
svm_model

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [0]:
import pickle

# save the classifier
# Serialise to bytes first, then write them to 'svm_model.pkl' in one call.
svm_bytes = pickle.dumps(svm_model)
with open('svm_model.pkl', 'wb') as fid:
    fid.write(svm_bytes)

In [0]:
#SVM Results

# Class predictions for every test row, stored beside the true labels.
predicted = svm_model.predict(X=x_test)
df['prediction'] = predicted

In [0]:
# Vectorised counts instead of a Python-level iterrows() loop over every row.
actual = df['label']
guessed = df['prediction']

count_true = int((actual == guessed).sum())
false_pos = int(((actual == 0) & (guessed == 1)).sum())  # labelled 0, predicted 1
false_neg = int(((actual == 1) & (guessed == 0)).sum())  # labelled 1, predicted 0

# All three figures are fractions of the whole test frame.
print("Accuracy on test set: " + str(count_true/len(df)))
print("False pos: " + str(false_pos/len(df)))
print("False neg: " + str(false_neg/len(df)))

Accuracy on test set: 0.849375
False pos: 0.071425
False neg: 0.0792


In [0]:
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class metrics for the predictions currently held in `predicted`;
# position 0 is label 0 and position 1 is label 1, as fixed by `labels`.
y_test = df['label']
precision, recall, fscore, support = score(y_test, predicted, labels=[0, 1])

for template, values in (('precision: {}', precision),
                         ('recall: {}', recall),
                         ('fscore: {}', fscore),
                         ('support: {}', support)):
    print(template.format(values))

precision: [0.84399468 0.85492307]
recall: [0.85712142 0.84163167]
fscore: [0.85050741 0.84822531]
support: [19996 20004]


In [0]:
#Training Random Forest

from sklearn.ensemble import RandomForestClassifier

# 500 entropy-criterion trees with a fixed seed, fitted on the averaged review vectors.
rf = RandomForestClassifier(n_estimators=500, criterion='entropy', random_state=456).fit(x_train, y_train)

# Last expression so the notebook shows the fitted estimator's repr.
rf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=456,
                       verbose=0, warm_start=False)

In [0]:
#Random Forest Results

# Class predictions for every test row, stored beside the true labels.
predicted = rf.predict(X=x_test)
df['prediction'] = predicted

In [0]:
# Vectorised counts instead of a Python-level iterrows() loop over every row.
actual = df['label']
guessed = df['prediction']

count_true = int((actual == guessed).sum())
false_pos = int(((actual == 0) & (guessed == 1)).sum())  # labelled 0, predicted 1
false_neg = int(((actual == 1) & (guessed == 0)).sum())  # labelled 1, predicted 0

# All three figures are fractions of the whole test frame.
print("Accuracy on test set: " + str(count_true/len(df)))
print("False pos: " + str(false_pos/len(df)))
print("False neg: " + str(false_neg/len(df)))

Accuracy on test set: 0.832175
False pos: 0.078425
False neg: 0.0894


In [0]:
from sklearn.metrics import precision_recall_fscore_support as score

# Per-class metrics for the predictions currently held in `predicted`;
# position 0 is label 0 and position 1 is label 1, as fixed by `labels`.
y_test = df['label']
precision, recall, fscore, support = score(y_test, predicted, labels=[0, 1])

for template, values in (('precision: {}', precision),
                         ('recall: {}', recall),
                         ('fscore: {}', fscore),
                         ('support: {}', support)):
    print(template.format(values))

precision: [0.82500612 0.83966266]
recall: [0.84311862 0.82123575]
fscore: [0.83396404 0.83034699]
support: [19996 20004]


In [0]:
#Training second layer Random Forest (Combined models)

# NOTE(review): these meta-features are predictions on the same x_train the
# base models were fitted on. The combined models' test scores reported later
# are identical to the random forest's own, which is consistent with the second
# layer simply copying rf_predict. Consider out-of-fold predictions instead.
lr_predict = lr.predict(x_train)
nb_predict = clf.predict(x_train)
svm_predict = svm_model.predict(x_train)
rf_predict = rf.predict(x_train)
# predict_classes() was removed from Keras; threshold the sigmoid output at 0.5
# and flatten to a 1-D label array.
nn_predict = (nn_model.predict(x_train) > 0.5).astype('int32').ravel()
cnn_predict = (cnn_model.predict(x_train1) > 0.5).astype('int32').ravel()

In [0]:
# One column per base model, in the order the second-layer model is trained on.
base_predictions = {
    'lr_predict': lr_predict,
    'nb_predict': nb_predict,
    'svm_predict': svm_predict,
    'rf_predict': rf_predict,
    'nn_predict': nn_predict,
    'cnn_predict': cnn_predict,
}

new_features = pd.DataFrame()
for column_name, column_values in base_predictions.items():
    new_features[column_name] = column_values

new_features.head()

Unnamed: 0,lr_predict,nb_predict,svm_predict,rf_predict,nn_predict,cnn_predict
0,0,0,0,0,0,0
1,1,1,1,1,1,1
2,1,0,1,0,1,1
3,1,1,1,1,1,1
4,1,0,1,1,1,1


In [0]:
# Second-layer forest: 500 entropy-criterion trees (no seed set) fitted on the
# base models' predictions.
rf_2 = RandomForestClassifier(n_estimators=500, criterion='entropy').fit(new_features, y_train)

# Last expression so the notebook shows the fitted estimator's repr.
rf_2

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
#Combined model results (2nd layer Random Forest)

# Base-model predictions on the test features.
lr_predict = lr.predict(x_test)
nb_predict = clf.predict(x_test)
svm_predict = svm_model.predict(x_test)
rf_predict = rf.predict(x_test)
# predict_classes() was removed from Keras; threshold the sigmoid output at 0.5
# and flatten to a 1-D label array.
nn_predict = (nn_model.predict(x_test) > 0.5).astype('int32').ravel()
cnn_predict = (cnn_model.predict(x_test1) > 0.5).astype('int32').ravel()

In [0]:
# One column per base model, matching the column order used for training.
base_predictions = {
    'lr_predict': lr_predict,
    'nb_predict': nb_predict,
    'svm_predict': svm_predict,
    'rf_predict': rf_predict,
    'nn_predict': nn_predict,
    'cnn_predict': cnn_predict,
}

new_features = pd.DataFrame()
for column_name, column_values in base_predictions.items():
    new_features[column_name] = column_values

new_features.head()

Unnamed: 0,lr_predict,nb_predict,svm_predict,rf_predict,nn_predict,cnn_predict
0,1,1,1,1,1,1
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,1,1,1,1,1,1
4,0,0,0,0,0,0


In [0]:
# Second-layer forest's class predictions, stored beside the true labels.
predicted = rf_2.predict(X=new_features)
df['prediction'] = predicted

In [0]:
# Vectorised counts instead of a Python-level iterrows() loop over every row.
actual = df['label']
guessed = df['prediction']

count_true = int((actual == guessed).sum())
false_pos = int(((actual == 0) & (guessed == 1)).sum())  # labelled 0, predicted 1
false_neg = int(((actual == 1) & (guessed == 0)).sum())  # labelled 1, predicted 0

# All three figures are fractions of the whole test frame.
print("Accuracy on test set: " + str(count_true/len(df)))
print("False pos: " + str(false_pos/len(df)))
print("False neg: " + str(false_neg/len(df)))

Accuracy on test set: 0.832175
False pos: 0.078425
False neg: 0.0894


In [0]:
# Per-class metrics for the predictions currently held in `predicted`;
# position 0 is label 0 and position 1 is label 1, as fixed by `labels`.
y_test = df['label']
precision, recall, fscore, support = score(y_test, predicted, labels=[0, 1])

for template, values in (('precision: {}', precision),
                         ('recall: {}', recall),
                         ('fscore: {}', fscore),
                         ('support: {}', support)):
    print(template.format(values))

precision: [0.82500612 0.83966266]
recall: [0.84311862 0.82123575]
fscore: [0.83396404 0.83034699]
support: [19996 20004]


In [0]:
#Trial: Training deep neural net as second layer (Combined models)

# NOTE(review): as with the second-layer forest, these are predictions on the
# data the base models were fitted on — consider out-of-fold predictions.
lr_predict_train = lr.predict(x_train)
nb_predict_train = clf.predict(x_train)
svm_predict_train = svm_model.predict(x_train)
rf_predict_train = rf.predict(x_train)
# predict_classes() was removed from Keras; threshold the sigmoid output at 0.5
# and flatten to a 1-D label array.
nn_predict_train = (nn_model.predict(x_train) > 0.5).astype('int32').ravel()
cnn_predict_train = (cnn_model.predict(x_train1) > 0.5).astype('int32').ravel()

# One column per base model; the second-layer network takes these six inputs.
new_features_train = pd.DataFrame()
new_features_train['lr_predict'] = lr_predict_train
new_features_train['nb_predict'] = nb_predict_train
new_features_train['svm_predict'] = svm_predict_train
new_features_train['rf_predict'] = rf_predict_train
new_features_train['nn_predict'] = nn_predict_train
new_features_train['cnn_predict'] = cnn_predict_train

new_features_train.head()

Unnamed: 0,lr_predict,nb_predict,svm_predict,rf_predict,nn_predict,cnn_predict
0,0,0,0,0,0,0
1,1,1,1,1,1,1
2,1,0,1,0,1,1
3,1,1,1,1,1,1
4,1,0,1,1,1,1


In [0]:
# mini batches Nadam optimizer with dropout and batch normalization
epochs = 10

# Each hidden block is Dense -> ReLU -> BatchNormalization, optionally followed
# by Dropout. Entries are (units, dropout rate or None).
hidden_blocks = [(16, None), (16, 0.2), (16, None), (32, 0.3), (32, None), (32, 0.4)]

model = tf.keras.Sequential()
for block_index, (units, dropout_rate) in enumerate(hidden_blocks):
    if block_index == 0:
        # Input width follows the meta-feature frame instead of a hard-coded 6.
        model.add(layers.Dense(units, input_dim=new_features_train.shape[1]))
    else:
        model.add(layers.Dense(units))
    model.add(layers.Activation('relu'))
    model.add(layers.BatchNormalization())
    if dropout_rate is not None:
        model.add(layers.Dropout(rate=dropout_rate))

# Single sigmoid unit for the binary label.
model.add(layers.Dense(1))
model.add(layers.Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              # `learning_rate` replaces the deprecated `lr` keyword.
              optimizer=keras.optimizers.Nadam(learning_rate=0.002, beta_1=0.9, beta_2=0.999),
              metrics=['accuracy'])
# Save to "NN2.model" only when validation accuracy improves.
checkpoint = keras.callbacks.ModelCheckpoint("NN2.model", monitor='val_accuracy', verbose=1, save_best_only=True)

model.summary()
# model1 holds the object returned by fit(); 20% of the rows are used for validation.
model1 = model.fit(new_features_train, y_train, epochs=epochs, validation_split=0.2, callbacks=[checkpoint])

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_34 (Dense)             (None, 16)                112       
_________________________________________________________________
activation_28 (Activation)   (None, 16)                0         
_________________________________________________________________
batch_normalization_26 (Batc (None, 16)                64        
_________________________________________________________________
dense_35 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_29 (Activation)   (None, 16)                0         
_________________________________________________________________
batch_normalization_27 (Batc (None, 16)                64        
_________________________________________________________________
dropout_12 (Dropout)         (None, 16)               

In [0]:
#Combined model results (Deep Neural Net 2nd Layer)

nn2_model = tf.keras.models.load_model('NN2.model')

# `new_features` is whatever frame the preceding feature-building cell left
# behind (the test-set predictions when cells run top to bottom).
# predict_classes() was removed from Keras; threshold the sigmoid output at 0.5
# and flatten to a 1-D label array.
predicted = (nn2_model.predict(new_features) > 0.5).astype('int32').ravel()
df['prediction'] = predicted



In [0]:
# Vectorised counts instead of a Python-level iterrows() loop over every row.
actual = df['label']
guessed = df['prediction']

count_true = int((actual == guessed).sum())
false_pos = int(((actual == 0) & (guessed == 1)).sum())  # labelled 0, predicted 1
false_neg = int(((actual == 1) & (guessed == 0)).sum())  # labelled 1, predicted 0

# All three figures are fractions of the whole test frame.
print("Accuracy on test set: " + str(count_true/len(df)))
print("False pos: " + str(false_pos/len(df)))
print("False neg: " + str(false_neg/len(df)))

Accuracy on test set: 0.832175
False pos: 0.078425
False neg: 0.0894


In [0]:
# Per-class metrics for the predictions currently held in `predicted`;
# position 0 is label 0 and position 1 is label 1, as fixed by `labels`.
y_test = df['label']
precision, recall, fscore, support = score(y_test, predicted, labels=[0, 1])

for template, values in (('precision: {}', precision),
                         ('recall: {}', recall),
                         ('fscore: {}', fscore),
                         ('support: {}', support)):
    print(template.format(values))

precision: [0.82500612 0.83966266]
recall: [0.84311862 0.82123575]
fscore: [0.83396404 0.83034699]
support: [19996 20004]
