In [405]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from datetime import datetime
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud, STOPWORDS
import seaborn as sns
import nltk
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score,f1_score,precision_score,roc_auc_score,recall_score,confusion_matrix
import eli5

from keras.preprocessing.text import Tokenizer                    
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input, Embedding, Conv1D, Dense, MaxPooling1D, Flatten, Dropout
from nltk.tokenize import TreebankWordTokenizer
import tensorflow as tf

In [420]:
#df.dropna(subset = ["comment"], inplace = True)
df = pd.read_csv("train-balanced.csv")
df.drop(['ups', 'downs', 'date', 'created_utc', 'parent_comment'], axis=1, inplace = True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,label,comment,author,subreddit,score
0,0,NC and NH.,Trumpbart,politics,2
1,0,You do know west teams play against west teams...,Shbshb906,nba,-4
2,0,"They were underdogs earlier today, but since G...",Creepeth,nfl,3
3,0,"This meme isn't funny none of the ""new york ni...",icebrotha,BlackPeopleTwitter,-8
4,0,I could use one of those tools.,cush2push,MaddenUltimateTeam,6


In [None]:
'''
df = pd.read_csv("train-balanced.csv")
df.dropna(subset = ["comment"], inplace = True)
df.drop(['ups', 'downs'], axis=1, inplace = True)
'''

In [417]:
from keras.layers import LSTM, SpatialDropout1D
from keras import backend as K
from keras.callbacks import EarlyStopping

In [462]:
encoded_author = np.asarray(df['author'].factorize()[0])
encoded_subreddit = np.asarray(df['subreddit'].factorize()[0])
author_reshaped = np.reshape(np.log(encoded_author+0.1)/np.log(10) - 1, (len(encoded_author), 1))
subreddit_reshaped = np.reshape(np.log(encoded_subreddit+0.1), (len(encoded_subreddit), 1))
score_reshaped = np.reshape(df['score'].values*30000/np.linalg.norm(df['score'].values), (len(df['score']), 1))
other_features = np.hstack((author_reshaped, subreddit_reshaped, score_reshaped))
xtrain, xval, ytrain, yval = train_test_split(other_features, df['label'], test_size=0.2)
xtrain

array([[ 4.23760933,  6.46630023,  0.61106807],
       [ 4.37799267,  5.07579862, 15.27670167],
       [ 3.6728097 ,  4.89110073,  0.61106807],
       ...,
       [ 4.32070511,  3.95316495,  0.61106807],
       [ 4.28832324,  8.61886491,  0.61106807],
       [ 3.47101432,  6.03810955,  4.27747647]])

In [384]:
##hyper parameters
batch_size = 64
embedding_dims = 30 #Length of the token vectors
epochs = 5
max_sequence_length = 100 #max length of comment
max_number_words = 100000 #max vocab size of tokenizer

In [385]:
def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath, 'r', encoding="utf-8") as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

In [386]:
TBtokenizer = TreebankWordTokenizer()
    
tokenized = []
for comment in df['comment']:
  temp = TBtokenizer.tokenize(comment)
  tokenized.append([word for word in temp])

In [387]:
tokenizer = Tokenizer(num_words=max_number_words)
tokenizer.fit_on_texts(tokenized)
sequences = tokenizer.texts_to_sequences(tokenized)

word_index = tokenizer.word_index
vocab_length = len(word_index) + 1
print('Found %s unique tokens.' % len(word_index))

instances_pad = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

Found 227320 unique tokens.


In [388]:
embedding_matrix = create_embedding_matrix("glove.6B.50d.txt", tokenizer.word_index, embedding_dims)

In [463]:
instances_pad_input = Input(shape=(max_sequence_length))
mode = Embedding(vocab_length, embedding_dims, weights=[embedding_matrix], input_length=max_sequence_length)(instances_pad_input)
mode = LSTM(64, recurrent_dropout=0.2, dropout=0.1, return_sequences=False)(mode)
other_features_input = Input(shape=(3))
concat = concatenate([mode, other_features_input], axis=1)
mode = Dense(64, activation='relu')(mode)
output = Dense(1, activation='sigmoid')(mode)

model = Model([instances_pad_input, other_features_input], output)
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [151]:
model2 = Sequential()

model2.add(Embedding(vocab_length, embedding_dims, weights=[embedding_matrix], input_length=max_sequence_length))
#model2.add(SpatialDropout1D(0.4))
model2.add(LSTM(64, recurrent_dropout=0.2, return_sequences=False))
model2.add(Dense(64, activation = 'relu'))
model2.add(Dense(1, activation = 'sigmoid'))
model2.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
K.set_value(model2.optimizer.learning_rate, 0.001)

In [464]:
model.summary()

Model: "model_18"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_96 (InputLayer)           [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding_69 (Embedding)        (None, 100, 30)      6819630     input_96[0][0]                   
__________________________________________________________________________________________________
lstm_51 (LSTM)                  (None, 64)           24320       embedding_69[0][0]               
__________________________________________________________________________________________________
dense_86 (Dense)                (None, 64)           4160        lstm_51[0][0]                    
___________________________________________________________________________________________

In [None]:
tf.config.run_functions_eagerly(True)

xtrain2, xval2, ytrain2, yval2 = train_test_split(instances_pad, df['label'], test_size=0.2)
callback = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3)
#history = model2.fit(xtrain2, ytrain2, epochs=20, validation_data=(xval2, yval2), batch_size=batch_size, callbacks=[callback], verbose = 1)
history = model.fit([xtrain2, xtrain], ytrain2, epochs=10, validation_data=[xval2, yval2], batch_size=batch_size, callbacks=[callback])


#plot
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

Epoch 1/10
  695/12635 [>.............................] - ETA: 2:09:52 - loss: 0.6932 - accuracy: 0.4999

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, confusion_matrix

In [None]:
y_pred = model2.predict(xval2)

In [None]:
def print_metrics(y_pred, y_val):
    y_pred2 = list(map(lambda x: 1 if x > 0.5 else 0, y_pred))
    print("Accuracy:",round(accuracy_score(y_val, y_pred2),4))
    print('Precision:', round(precision_score(y_val, y_pred2),4))
    print('F1:', round(f1_score(y_val,y_pred2),4))
    print('AUC:',round(roc_auc_score(y_val, y_pred2),4))
    print('Recall: ', round(recall_score(y_val, y_pred2), 4))

In [None]:
print_metrics(y_pred, yval2)