In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import re
import spacy
import nltk
from bs4 import BeautifulSoup
import unicodedata
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer
from tqdm import tqdm
#Feature Engineering Models
from gensim.models import Word2Vec  ## We are importing python function for training a word2vec model.

# Preparing the dataset
from sklearn.model_selection import train_test_split   # split the dataset into training and testing set
from sklearn.model_selection import cross_val_score    # Perform cross validation 
from sklearn.model_selection import StratifiedKFold    # Stratify the data in each fold
from sklearn.model_selection import KFold 
from imblearn.under_sampling import RandomUnderSampler
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.neural_network import MLPClassifier

# Evaluation Metrics
from sklearn.metrics import classification_report     # to get the performance measures
from sklearn.metrics import confusion_matrix          # To compute the false positives and false negatives
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score            # Accuracy measures

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

## Load the data.

In [None]:
# Read the training CSV, then keep a two-column selection
# (label + raw comment text) under the name `df`.
train = pd.read_csv('data/df_train_spell_hash.csv')
label_and_text_columns = ['target', 'comment_text']
df = train[label_and_text_columns]

In [None]:
# Preview the first rows of the loaded training frame.
train.head()

In [None]:
# Extract the normalised comments as a plain list; every entry is passed
# through str() so non-string values are coerced to text.
norm_text = list(map(str, train['comment_text_normalized'].tolist()))

In [None]:
# Show the first normalised comment as a sanity check.
norm_text[0]

In [None]:
# Whitespace-tokenise every comment: one list of tokens per document.
doc = list(map(str.split, norm_text))

## Train Word2Vec model

In [None]:
# To train a new model instead of loading one, enable the next line and
# disable the Word2Vec.load(...) call below.
#w2v_model=Word2Vec(sentences=doc, vector_size=300, window=7, epochs=20)
# Load the Word2Vec model that was previously saved to disk.
w2v_model = Word2Vec.load("model/word2vec.model")

In [None]:
#w2v_model.save("model/word2vec.model")  ## enable after training a new model to persist it; leave disabled when loading a saved model

In [None]:
def document_vector(doc, model=None):
    """Average the word vectors of a tokenised document.

    Tokens that are not in the model's vocabulary are ignored.

    Parameters
    ----------
    doc : iterable of str
        The tokens of one document.
    model : optional
        Object exposing a gensim-style ``.wv`` (``key_to_index`` mapping and
        list indexing). Defaults to the notebook-level ``w2v_model``.

    Returns
    -------
    numpy.ndarray or None
        The mean vector over the in-vocabulary tokens, or ``None`` when the
        document has no in-vocabulary token (callers filter these out).
    """
    keyed_vectors = (w2v_model if model is None else model).wv
    # key_to_index is a dict, so membership is O(1) per token; the previous
    # lookup in the index_to_key list scanned the whole vocabulary per token.
    vocabulary = keyed_vectors.key_to_index
    known_tokens = [word for word in doc if word in vocabulary]
    if not known_tokens:
        # Explicit "no vector" result instead of relying on a swallowed
        # exception, so genuine errors are no longer hidden.
        return None
    return np.mean(keyed_vectors[known_tokens], axis=0)

In [None]:
# Embed every tokenised comment (with a progress bar); an entry is None
# when document_vector returned no vector for that comment.
word_vecs = list(map(document_vector, tqdm(doc)))

In [None]:
# Number of documents processed (including those without a vector).
len(word_vecs)

In [None]:
# Positions of the documents for which no vector was produced.
error_indices = [position for position, vector in enumerate(word_vecs) if vector is None]
error_indices

## Drop records that threw an error

In [None]:
# Keep only the documents that actually received a vector.
X_train_w2v = list(filter(lambda vector: vector is not None, tqdm(word_vecs)))

In [None]:
# Stack the document vectors into a 2-D array and wrap it in a DataFrame.
feature_matrix = np.array(X_train_w2v)
X_train_w2v_df = pd.DataFrame(feature_matrix)

In [None]:
# (rows, columns) of the feature frame.
X_train_w2v_df.shape

In [None]:
# Build the label vector aligned with the feature rows: take the target
# column and remove the rows whose document produced no vector.
# drop() returns a new Series, so the column held by `train` is left
# untouched (the previous in-place drop mutated the Series obtained
# directly from the DataFrame).
target_all = train['target']
print(len(target_all))
y_train_w2v = target_all.drop(target_all.index[error_indices])
print(len(y_train_w2v))

In [None]:
# Persist features and labels as CSV (without the index) so the embedding
# step does not have to be repeated.
for frame, csv_path in (
    (X_train_w2v_df, 'data/X_train_w2v.csv'),
    (y_train_w2v, 'data/y_train_w2v.csv'),
):
    frame.to_csv(csv_path, index=False)

In [None]:
# Reload the cached features and labels from disk.
X_train_w2v_df, y_train_w2v = (
    pd.read_csv(csv_path)
    for csv_path in ('data/X_train_w2v.csv', 'data/y_train_w2v.csv')
)

## Split the data into two sets (80% training, 20% testing). Random undersampling of the majority class is defined below but currently disabled, so the training split is used as-is.

In [None]:
# Define the undersampling strategy ('majority' is passed as the
# sampling_strategy). NOTE(review): the only use of this sampler visible in
# the notebook is commented out.
undersample = RandomUnderSampler(sampling_strategy='majority')

In [None]:
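# Disabled: binarise the target at a 0.5 threshold (the models below regress on the raw score instead).
#Y_train_w2v = np.where(y_train_w2v >=0.5,1.,0.)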

In [None]:
# 80/20 train/test split. random_state is fixed (same seed as the KFold
# splitter used later) so the split, and every score computed on it, is
# reproducible across kernel restarts.
X_train_w2v, X_test_w2v, Y_train_w2v, Y_test_w2v = train_test_split(
    X_train_w2v_df, y_train_w2v, test_size=0.2, random_state=7
)

In [None]:
# Undersampling is switched off: the training split is used unchanged.
# To re-enable it:
#X_over_w2v, y_over_w2v = undersample.fit_resample(X_train_w2v, Y_train_w2v)
X_over_w2v = X_train_w2v
y_over_w2v = Y_train_w2v

In [None]:
# Number of training examples.
len(y_over_w2v)

In [None]:
# Distinct target values in the training labels and how often each occurs.
np.unique(y_over_w2v, return_counts=True)

## Use 5-fold Cross Validation to evaluate the training set

In [None]:
# Shuffled 5-fold splitter with a fixed seed.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=7,
)

In [None]:
# Cross-validate an XGBoost regressor on the training split. The scorer
# returns negated MAE, so the mean is sign-flipped before printing.
XGB = XGBRegressor(n_jobs=-1)
results = cross_val_score(
    estimator=XGB,
    X=X_over_w2v,
    y=y_over_w2v,
    scoring='neg_mean_absolute_error',
    cv=kfold,
)
mean_cv_mae = -1 * results.mean()
print("MAE: %.3f" % mean_cv_mae)

In [None]:
# Re-print the cross-validated MAE (sign-flipped negated score).
cv_mae = -1 * results.mean()
print("MAE: %.3f" % cv_mae)

In [None]:
# Fresh regressor for the final fit; rebinds XGB, replacing the instance
# used for cross-validation (n_jobs is not set here).
XGB=XGBRegressor(verbosity=1)

In [None]:
# Fit the regressor on the whole training split.
XGB.fit(X_over_w2v, y_over_w2v)

In [None]:
# Visualise the fitted model with xgboost's plot_tree helper (default arguments).
xgb.plot_tree(XGB)

In [None]:
# Predict on the held-out test features; the result is displayed, not stored.
XGB.predict(X_test_w2v)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Mean absolute error of the XGBoost predictions on the test split.
mean_absolute_error(Y_test_w2v, XGB.predict(X_test_w2v))

In [None]:
# Mean squared error of the XGBoost predictions on the test split.
mean_squared_error(Y_test_w2v, XGB.predict(X_test_w2v))

In [None]:
# Root mean squared error on the test split. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error is
# deprecated in scikit-learn 1.4 and removed in 1.6; this form works on
# every version.
np.sqrt(mean_squared_error(Y_test_w2v, XGB.predict(X_test_w2v)))

## Implementing MLP

### Holding out 10% as a validation set

In [None]:
# Baseline MLP regressor: six ReLU hidden layers of 64 units and a single
# linear output unit, trained with MSE loss and a 10% validation split.
nn_reg2 = Sequential()
n_hidden = 64
n_input = X_over_w2v.shape[1]

# hidden layers
nn_reg2.add(Dense(units=n_hidden, activation='relu',input_shape=(n_input,)))
nn_reg2.add(Dense(units=n_hidden, activation='relu'))
nn_reg2.add(Dense(units=n_hidden, activation='relu'))
nn_reg2.add(Dense(units=n_hidden, activation='relu'))
nn_reg2.add(Dense(units=n_hidden, activation='relu'))
nn_reg2.add(Dense(units=n_hidden, activation='relu'))

# output layer
nn_reg2.add(Dense(units=1, activation=None))
# 'rmse' is not a metric name Keras can resolve from a string, so the RMSE
# metric is passed as an object (reported under the name 'rmse').
nn_reg2.compile(loss='mean_squared_error', optimizer='adam',
                metrics=['mse', tf.keras.metrics.RootMeanSquaredError(name='rmse'), 'mae'])
nn_reg2.summary()

# Train the baseline. Without this call the model is evaluated untrained and
# `history` does not exist on a fresh kernel when the MAE curves are plotted.
# Batch size and epoch count mirror the later training cells.
batch_size = 64
n_epochs = 300
history = nn_reg2.fit(X_over_w2v, y_over_w2v,
                      epochs=n_epochs,
                      batch_size=batch_size,
                      validation_split=0.1)

In [None]:
# Training vs. validation MAE per epoch, read from the Keras history object.
fig, ax = plt.subplots(figsize=(8,5))
for history_key, curve_label in (('mae', 'Train MAE'), ('val_mae', 'Validation MAE')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title("MAE vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
# Fixed y-range chosen by the author to zoom in on the curves.
ax.set_ylim(0.0875,0.1)
ax.grid();

In [None]:
# Save the current figure as a PDF.
# NOTE(review): the file name says "LogLoss" but the plotted curves are MAE.
fig.savefig('ValidationSet_LogLoss_Plot.pdf')

In [None]:
# Score the network on the held-out test split (loss plus compiled metrics).
nn_reg2.evaluate(x=X_test_w2v, y=Y_test_w2v)

In [None]:
# Predict on the test features and display the result.
predictions = nn_reg2.predict(X_test_w2v)
predictions

### Holding out 10% as a validation set and implementing early stopping

In [None]:
# Stop training once validation MAE has not improved for `patience` epochs.
# min_delta is the smallest change that counts as an improvement. The
# previous value of 5 could never be reached by an MAE on the scale plotted
# in this notebook (about 0.09-0.1), so no epoch ever counted and training
# always stopped after exactly `patience` epochs.
early_stoping = EarlyStopping(monitor='val_mae',
                min_delta=1e-4,
                patience=20,
                verbose=1,
                mode='auto')

In [None]:
# Same architecture as the baseline MLP (six ReLU hidden layers of 64 units,
# linear output), rebuilt from scratch for the early-stopping run.
nn_reg2 = Sequential()
n_hidden = 64
n_input = X_over_w2v.shape[1]

# hidden layers
nn_reg2.add(Dense(units=n_hidden, activation='relu',input_shape=(n_input,)))
nn_reg2.add(Dense(units=n_hidden, activation='relu'))
nn_reg2.add(Dense(units=n_hidden, activation='relu'))
nn_reg2.add(Dense(units=n_hidden, activation='relu'))
nn_reg2.add(Dense(units=n_hidden, activation='relu'))
nn_reg2.add(Dense(units=n_hidden, activation='relu'))

# output layer
nn_reg2.add(Dense(units=1, activation=None))
# 'rmse' is not a metric name Keras can resolve from a string, so the RMSE
# metric is passed as an object (reported under the name 'rmse').
nn_reg2.compile(loss='mean_squared_error', optimizer='adam',
                metrics=['mse', tf.keras.metrics.RootMeanSquaredError(name='rmse'), 'mae'])
nn_reg2.summary()

In [None]:
# Train with a 10% validation split; the early-stopping callback may end
# training before the epoch budget is used up.
batch_size = 64
n_epochs = 300
history = nn_reg2.fit(
    x=X_over_w2v,
    y=y_over_w2v,
    batch_size=batch_size,
    epochs=n_epochs,
    callbacks=[early_stoping],
    validation_split=0.1,
)

In [None]:
# Training vs. validation MAE per epoch for the early-stopping run.
fig, ax = plt.subplots(figsize=(8,5))
for history_key, curve_label in (('mae', 'Train MAE'), ('val_mae', 'Validation MAE')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title("MAE vs. epochs", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
# Fixed y-range chosen by the author to zoom in on the curves.
ax.set_ylim(0.0875,0.1)
ax.grid();

In [None]:
# Save the current figure as a PDF.
# NOTE(review): the file name says "LogLoss" but the plotted curves are MAE.
fig.savefig('2 - EarlyStopping_LogLoss_Plot.pdf')

In [None]:
# Score the early-stopped network on the held-out test split.
nn_reg2.evaluate(x=X_test_w2v, y=Y_test_w2v)

In [None]:
# Predict on the test features and display the result.
predictions = nn_reg2.predict(X_test_w2v)
predictions

### Holding out 10% as a validation set and implementing early stopping & dropout

In [None]:
# MLP regressor with dropout: dropout on the input, then six ReLU hidden
# layers of 64 units each followed by dropout, and a linear output unit.
# `n_input` is reused from the earlier model-building cells.
nn_reg_dropout = Sequential()
n_hidden = 64
dropout_rate = 0.3

## Dropout applied directly to the input features
nn_reg_dropout.add(Dropout(rate=dropout_rate, input_shape=(n_input,)))

## First hidden layer (declares the input shape) + its dropout
nn_reg_dropout.add(Dense(units=n_hidden, activation='relu',input_shape=(n_input,)))
nn_reg_dropout.add(Dropout(rate=dropout_rate))

## Five more hidden layers, each followed by dropout
for hidden_block in range(5):
    nn_reg_dropout.add(Dense(units=n_hidden, activation='relu'))
    nn_reg_dropout.add(Dropout(rate=dropout_rate))

## Linear output layer
nn_reg_dropout.add(Dense(units=1, activation=None))

nn_reg_dropout.summary()

In [None]:
# Compile with Adam and MSE loss; MSE and MAE are tracked as metrics.
nn_reg_dropout.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mse', 'mae'],
)

In [None]:
# Train the dropout network with a 10% validation split and early stopping.
batch_size = 64
n_epochs = 300
# min_delta is the smallest change in validation MAE that counts as an
# improvement. The previous value of 5 could never be reached by an MAE on
# the scale seen in this notebook (about 0.09-0.1), so no epoch ever counted
# and training always stopped after exactly `patience` epochs.
early_stoping = EarlyStopping(monitor='val_mae',
                min_delta=1e-4,
                patience=40,
                verbose=1,
                mode='auto')

history = nn_reg_dropout.fit(X_over_w2v, y_over_w2v,
                            epochs=n_epochs,
                            batch_size=batch_size,
                            validation_split=0.1,
                            callbacks=[early_stoping])

In [None]:
# Training vs. validation MAE per epoch for the dropout run.
fig, ax = plt.subplots(figsize=(8,5))
for history_key, curve_label in (('mae', 'Train MAE'), ('val_mae', 'Validation MAE')):
    ax.plot(history.history[history_key], label=curve_label)
ax.set_title("MAE vs. epochs using dropout", fontsize=15)
ax.set_xlabel("epoch number", fontsize=14)
ax.legend(fontsize=12)
ax.grid();

In [None]:
# Save the current figure as a PDF.
# NOTE(review): the file name says "LogLoss" but the plotted curves are MAE.
fig.savefig('3. Dropout_LogLoss_Plot.pdf')

In [None]:
# Score the dropout network on the held-out test split.
nn_reg_dropout.evaluate(x=X_test_w2v, y=Y_test_w2v)

In [None]:
# Predict on the test features with the dropout network and display the result.
predictions = nn_reg_dropout.predict(X_test_w2v)
predictions

## Evaluating the model on new tweets pulled from Twitter

In [None]:
# Load the tweets CSV that the trained model will score.
tweets =pd.read_csv('data/tweets_normalized.csv')
# Leftover from the training section (disabled):
#df = train[['target','comment_text']]

In [None]:
# Preview the first rows of the tweets frame.
tweets.head()

In [None]:
# Extract the normalised tweets as a plain list; every entry is passed
# through str() so non-string values are coerced to text.
# Note: this rebinds `norm_text`, replacing the training comments.
norm_text = list(map(str, tweets['tweet_normalized'].tolist()))

In [None]:
# Show the first normalised tweet as a sanity check.
norm_text[0]

In [None]:
# Whitespace-tokenise every tweet (rebinds `doc`) and display the result.
doc = list(map(str.split, norm_text))
doc

In [None]:
# Embed every tokenised tweet (with a progress bar) and display the result;
# an entry is None when document_vector returned no vector for that tweet.
word_vecs = list(map(document_vector, tqdm(doc)))
word_vecs

In [None]:
# Number of tweets processed (including those without a vector).
len(word_vecs)

In [None]:
# Positions of the tweets for which no vector was produced.
error_indices = [position for position, vector in enumerate(word_vecs) if vector is None]
error_indices

In [None]:
# Keep only the tweets that actually received a vector.
X_tweets_w2v = list(filter(lambda vector: vector is not None, tqdm(word_vecs)))

In [None]:
# Stack the tweet vectors into a 2-D array and wrap it in a DataFrame.
tweet_feature_matrix = np.array(X_tweets_w2v)
X_tweets_w2v_df = pd.DataFrame(tweet_feature_matrix)

In [None]:
# (rows, columns) of the tweet feature frame.
X_tweets_w2v_df.shape

In [None]:
# Cache the tweet features on disk (index omitted).
X_tweets_w2v_df.to_csv('data/X_tweets_w2v.csv', index=False)

In [None]:
# Reload the cached tweet features from disk.
X_tweets_w2v_df =pd.read_csv('data/X_tweets_w2v.csv')

In [None]:
# Score the tweets with the dropout network and display the result.
# Note: the features come from the in-memory X_tweets_w2v list, not from
# the X_tweets_w2v_df frame reloaded from CSV.
predictions = nn_reg_dropout.predict(pd.DataFrame(X_tweets_w2v))
predictions

In [None]:
# Attach each prediction to the tweet it was computed for, then save.
# Tweets listed in `error_indices` produced no vector and were not scored,
# so `predictions` can be shorter than `tweets`. Assigning it directly would
# then fail on the length mismatch; here those rows get NaN instead.
has_vector = np.ones(len(tweets), dtype=bool)
has_vector[error_indices] = False
flat_predictions = np.ravel(predictions)
# Keep the predictions' own dtype so the written values are unchanged.
prediction_column = np.full(len(tweets), np.nan, dtype=flat_predictions.dtype)
prediction_column[has_vector] = flat_predictions
tweets['Prediction'] = prediction_column
tweets.to_csv('data/tweets_predicted.csv', index=False)

In [None]:
# Debug output: plain-text dump of the test feature frame.
print(X_test_w2v)

In [None]:
# Debug output: plain-text dump of the tweet features, for comparison with the test features.
print(pd.DataFrame(X_tweets_w2v))