In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, multilabel_confusion_matrix

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

import nltk
import tensorflow as tf

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.naive_bayes import MultinomialNB
from keras.layers import Dense, Input, LSTM, GRU, Conv1D, Dropout, Flatten, Layer, BatchNormalization
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from tensorflow import keras

from gensim.models import KeyedVectors
from gensim import models

import warnings
warnings.filterwarnings('ignore')

In [3]:
#Importing the Preprocessed data 
ei = pd.read_csv('EI.csv')
ei.drop(columns=['Unnamed: 0'],inplace=True)

In [4]:
ei.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 370832 entries, 0 to 370831
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   Posts   370832 non-null  object
 1   type    370832 non-null  object
dtypes: object(2)
memory usage: 5.7+ MB


There are in total of 370832 sample sizes which is not possible to run within a local server,
either require cloud or Pyspark to run in parallel.
<br>
This project is to show how the model works and predicts well, therefore we will divide the entire dataset into 8

In [5]:
# Since the data has been expanded: 50 posts each row -> 1 posts each row
# Pyspark might be required to process the large amount of data, but here we will simply divide the dataset into 8 portions
I = 284212
E = 10827
testing = pd.concat([ei.iloc[:35526], ei.iloc[I:I + E]])

## Bag of Word - CountVectorizer

In [6]:
bow = CountVectorizer(min_df = 25, max_df = 0.8)
bow_words = bow.fit_transform(testing['Posts'])
bow_vectorized_data = pd.DataFrame(data = bow_words.toarray(), columns = bow.get_feature_names())

In [7]:
#visualizing how the vectorized sentences look like (3099 different columns/words)
bow_vectorized_data.head()

Unnamed: 0,ability,able,absolute,absolutely,abstract,absurd,abuse,accept,acceptable,accepted,...,yesterday,yet,yo,yoga,young,younger,youtube,yup,zero,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Count Vectorizer counts the occurence of the word within a sentence and creates a sparse matrix
for i in bow_vectorized_data.iloc[0]:
    if i > 0:
        print(i)

1
1
1
1


## TF-IDF

In [9]:
tfidf_vectorizer = TfidfVectorizer(min_df = 25, max_df = 0.8)
tfidf_words = tfidf_vectorizer.fit_transform(testing["Posts"])
tfidf_vectorized_data = pd.DataFrame(
    data=tfidf_words.toarray(), columns=tfidf_vectorizer.get_feature_names()
)

#min df = infrequent word ignore (if a certain word doesnt appear in more than 25 times)
#max df = frequent word ignore (if a certain word appears in more than 80% of the document -> ignore)

In [10]:
tfidf_vectorized_data.head()

Unnamed: 0,ability,able,absolute,absolutely,abstract,absurd,abuse,accept,acceptable,accepted,...,yesterday,yet,yo,yoga,young,younger,youtube,yup,zero,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# TF-IDF provides a score to the word base on their importance 
for i in tfidf_vectorized_data.iloc[0]:
    if i > 0:
        print(i)

0.6115046370177148
0.4536504412003629
0.5388813059189429
0.36037521312604637


## Word2Vec - Pretrained

In [14]:
#Importing Pretrained Word2Vec
w2v = KeyedVectors.load_word2vec_format('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz',binary=True,limit=100000)

Because Word2Vec is designed for each word within n-th dimensional space, each sentence with k-words
will have a matrix of 300 x k (Rows represents the Dimension of Word2Vec for each word) Columns represents the total number of words for a given document
<br>
Since the typical ML models doesnt take input as (m x n x l) we need to convert the multidimensional word vector into sentnence vector, one way is to average all the values within a sentence (ex. 5 word sentence having 300 dim vectors for each word -> element wise addition then element wise division)

In [16]:
# Function that averages each sentence's word vector
def w2v_mean(sentence):
    words=[]
    
    for i in sentence:
        try:
            word = w2v[i]
            words.append(word)
        # If a certain word is not presetn within a Word2Vec, replace with vector with the same dimension but 0s
        except:
            words.append(np.zeros(300,))
    mean = np.mean(words,axis=0)
    
    return mean

In [17]:
# Tokenizing each sentence - Testing purpose
t = testing['Posts'].apply(lambda x: x.split())
t.reset_index(inplace=True, drop=True)

In [18]:
# For every sentences within a dataset perform that average function
word2vec_mean = []
for i in t:
    try:
        word2vec_mean.append(w2v_mean(i))
    except:
        break

In [19]:
embedded_sentence = pd.DataFrame(word2vec_mean)
# Each sentence is in 300 Dimension (Same as Word2Vec Pretraiend model)
embedded_sentence.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.051453,0.03064,-0.048828,0.033447,0.008301,0.035919,-0.050171,-0.05603,0.053749,0.012573,...,-0.186279,0.048126,-0.127289,-0.037422,-0.081116,0.069092,0.109131,0.027954,-0.014259,0.051376
1,0.024414,-0.009064,0.002319,0.040771,-0.029541,0.036591,0.012817,-0.056183,0.034698,-0.017212,...,-0.094849,0.015869,0.008881,0.028158,0.013916,0.05542,0.060547,-0.062378,-0.003502,0.011398
2,0.065449,0.060204,0.000253,0.112965,-0.076793,-0.004001,0.056985,-0.107077,0.157815,0.089499,...,-0.031616,0.037964,-0.101028,-0.083056,-0.052663,-0.010561,0.003953,0.01655,-0.011679,-0.023034
3,-0.0017,0.026698,-0.015302,0.135875,-0.051105,-0.044892,0.036547,0.002686,0.058603,0.046406,...,-0.083194,0.057546,-0.020444,0.001233,-0.005906,-0.004411,0.033447,0.021855,-0.006804,0.041594
4,0.057475,0.042348,0.008135,0.163468,-0.086201,-0.048431,0.136312,-0.050812,0.064033,0.021683,...,-0.097992,0.051473,-0.018185,-0.01309,-0.026467,0.001295,0.057375,-0.077337,-0.000253,-0.032067


## TF-IDF Word2Vec Hybrid

Another Novel method for transforming Word Vectors in multidimension to a sentence vector is to perform TF-IDF Word2Vec Hybrid approach.
<br>
The logic here is to multiply the TFIDF score identified for each word with the 300dim embedding vector and get the mean

In [20]:
#Extracting the TFIDF score from existing data 
def w2v_tfidf(t, i):
    tfidf_score = []
    for word in t[i]:
            if word in tfidf_vectorized_data.columns:
                score = tfidf_vectorized_data.iloc[i][word]
                tfidf_score.append(score)
            else:
                tfidf_score.append(1)
                
            
    return tfidf_score

In [21]:
# Inserting the found value into a list (For the Entire dataset)
tfidf_w2v = []
for i in range(len(t)):
    try:
        tfidf_w2v.append(w2v_tfidf(t,i))
    except:
        break

In [22]:
#Example of tfidf score for doc number 35000
tfidf_w2v[35000]

[0.23720098635613482,
 0.26584570822097026,
 0.326849302605578,
 0.4596548985978448,
 0.23369445516197762,
 0.33115997467658087,
 0.2912605569526351,
 0.3371869277762571,
 0.4383387908600179]

In [23]:
# Getting the Embedding representation of each sentence
def get_embed_vect(t,i):
    embed_vec = []
    for word in t[i]:
        try:
            embed_vec.append(w2v[i])
        except:
            embed_vec.append(np.ones(300,))
    return embed_vec

In [24]:
#The first Sentnece having 4 words in total will have 4 rows with 300 dimensions each 
np.array(get_embed_vect(t, 0)).shape

(4, 300)

In [25]:
# Retrieve Embedding vector for every sample 
embedding = []
for i in range(len(t)):
    try:
        embedding.append(get_embed_vect(t,i))
    except:
        break

In [26]:
# vector Shape
print('Embedding Vector Shape:{}'.format(np.array(embedding[2]).T.shape))
print('TFIDF Score Vector Shape{}'.format(np.array(tfidf_w2v[2]).shape))

Embedding Vector Shape:(300, 17)
TFIDF Score Vector Shape(17,)


In [27]:
# Element wise multiplication (word2vec embedded word vector * TFIDF Score of each word)
z = np.array(embedding[13]).T * np.array(tfidf_w2v[13])
#Each Word has a vector (300 Dimension - from Word2Vec) however the values are multiplied with TFIDF Score
pd.DataFrame(z, columns=[t[13]] )

Unnamed: 0,get,high,backyard,roast,eat,marshmellows,backyard.1,conversing,something,intellectual,followed,massages,kiss
0,0.018247,0.026826,0.084961,0.084961,0.033658,0.084961,0.084961,0.084961,0.020901,0.037860,0.039013,0.084961,0.040506
1,-0.020450,-0.030064,-0.095215,-0.095215,-0.037720,-0.095215,-0.095215,-0.095215,-0.023423,-0.042429,-0.043721,-0.095215,-0.045394
2,0.025588,0.037618,0.119141,0.119141,0.047198,0.119141,0.119141,0.119141,0.029309,0.053091,0.054708,0.119141,0.056801
3,0.024015,0.035306,0.111816,0.111816,0.044296,0.111816,0.111816,0.111816,0.027508,0.049827,0.051344,0.111816,0.053309
4,-0.023910,-0.035152,-0.111328,-0.111328,-0.044103,-0.111328,-0.111328,-0.111328,-0.027387,-0.049610,-0.051120,-0.111328,-0.053076
...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.006869,0.010098,0.031982,0.031982,0.012670,0.031982,0.031982,0.031982,0.007868,0.014252,0.014686,0.031982,0.015248
296,0.013633,0.020043,0.063477,0.063477,0.025146,0.063477,0.063477,0.063477,0.015616,0.028286,0.029147,0.063477,0.030263
297,-0.023386,-0.034381,-0.108887,-0.108887,-0.043136,-0.108887,-0.108887,-0.108887,-0.026787,-0.048522,-0.049999,-0.108887,-0.051912
298,0.010487,0.015417,0.048828,0.048828,0.019343,0.048828,0.048828,0.048828,0.012012,0.021759,0.022421,0.048828,0.023279


In [28]:
# computing Element wise multiplication TFIDF * Word2Vec then take the mean to get sentence vector
def word_to_sentence(tfidf_score, w2v_vector, num_sample):
    sentence_vector = []
    for sentence in range(num_sample):
        # Embedding * tfidf score (element wise multiplication)
        z = np.array(w2v_vector[sentence]).T * np.array(tfidf_score[sentence])
        mean = np.mean(z.T, axis=0)
        sentence_vector.append(mean)
    return sentence_vector

In [29]:
hybrid = word_to_sentence(tfidf_w2v, embedding, len(testing))
hybrid = pd.DataFrame(hybrid)
hybrid

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.000555,-0.000440,0.000156,0.000753,0.000543,-0.000689,-0.000015,-0.000206,-0.000283,0.000528,...,-0.000761,0.000674,-0.000030,-0.000405,0.000652,0.000570,0.000279,-0.000768,-0.000060,-0.000042
1,0.047836,0.059130,0.059795,0.042521,0.047171,-0.074079,-0.055476,-0.104973,0.014118,0.089692,...,-0.114938,-0.060459,-0.054812,0.044182,0.041690,-0.032223,-0.040029,-0.032389,0.009841,-0.042521
2,-0.004199,-0.016882,0.015925,0.022625,-0.006483,-0.022799,-0.000468,-0.025758,0.022973,0.030805,...,-0.008049,0.001327,-0.029413,0.029065,0.002828,0.000170,0.006570,0.025410,-0.012444,0.008615
3,-0.010278,-0.018485,0.054499,0.032827,-0.072028,0.020716,-0.009282,-0.058642,0.076809,0.077127,...,-0.007370,-0.010199,-0.021831,-0.013465,-0.012669,0.041751,0.013147,0.004482,0.039838,-0.096887
4,0.002761,-0.028691,0.067327,0.008846,-0.052026,0.077656,0.044184,-0.042271,0.028117,0.008177,...,-0.091427,-0.014250,-0.036724,0.071535,0.010616,0.050113,-0.009707,0.004399,0.064267,0.041888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46348,0.083499,0.077932,0.035905,0.145845,-0.011899,-0.001948,0.037296,0.100199,0.051769,-0.106322,...,-0.102425,-0.210417,-0.127475,0.084056,-0.058171,-0.194831,0.032843,-0.074592,-0.107992,0.107992
46349,-0.116711,0.248983,0.010780,0.197112,-0.095962,0.263248,0.060625,0.010293,-0.302152,0.178957,...,0.075538,0.069702,0.116711,0.328088,-0.134218,0.004985,-0.079428,0.182847,0.313823,0.057059
46350,-0.019572,-0.005799,-0.133381,0.030031,-0.235280,-0.101485,0.111012,0.083259,0.012634,0.065448,...,0.063791,-0.095686,-0.070004,-0.046186,-0.065033,0.171489,-0.021125,0.130067,0.063791,-0.140837
46351,0.070160,0.026070,0.043489,0.056945,-0.003544,-0.080732,-0.127825,-0.141281,-0.103798,0.110045,...,-0.161464,0.114370,0.010993,0.092746,0.002313,0.202791,-0.226818,-0.088901,0.015317,0.162425
