In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

OUTPUT_FOLDER = '../model/'

from enum import Enum
class CellStatus(Enum):
    RUN = 1
    SKIPPED = 0
    


In [2]:
column_names = ["target", "id", "date", "flag", "user", "text"]

# Read the CSV file with specified column names
df = pd.read_csv("../dataset/training.1600000.processed.noemoticon.csv", 
                 encoding="ISO-8859-1", names=column_names)

Columns in dataset

In [3]:
print("Columns in the original dataset:\n")
print(df.columns)

Columns in the original dataset:

Index(['target', 'id', 'date', 'flag', 'user', 'text'], dtype='object')


Example of an Row in dataset

In [4]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df.describe()


Unnamed: 0,target,id
count,1600000.0,1600000.0
mean,2.0,1998818000.0
std,2.000001,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,2.0,2002102000.0
75%,4.0,2177059000.0
max,4.0,2329206000.0


Clean data

In [6]:
df_cleaned = df.dropna()
df_cleaned = df_cleaned.drop_duplicates()
df_cleaned = df_cleaned.drop(columns=["date", "id", "flag", "user"])
df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned.describe()

Unnamed: 0,target
count,1600000.0
mean,2.0
std,2.000001
min,0.0
25%,0.0
50%,2.0
75%,4.0
max,4.0


In [7]:
df_cleaned.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Remove twitter tag and hashtag

In [8]:
import re

df_cleaned['text'] = df_cleaned['text'].apply(lambda x: re.sub(r"http\S+|@\S+|#\S+", "", x))


In [9]:
df_cleaned.head()

Unnamed: 0,target,text
0,0,"- Awww, that's a bummer. You shoulda got Da..."
1,0,is upset that he can't update his Facebook by ...
2,0,I dived many times for the ball. Managed to s...
3,0,my whole body feels itchy and like its on fire
4,0,"no, it's not behaving at all. i'm mad. why am..."


convert target back to -1 0 1


In [10]:
df_cleaned['target'] = df_cleaned['target'].map({0: -1, 2: 0, 4: 1})

df_cleaned['target'].value_counts()

target
-1    800000
 1    800000
Name: count, dtype: int64

Tokenisation

In [11]:
from gensim.utils import simple_preprocess
# Tokenize the text column to get the new column 'tokenized_text'
df_cleaned['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in df_cleaned['text']]
df_cleaned['tokenized_text'].head(10)

0    [awww, that, bummer, you, shoulda, got, david,...
1    [is, upset, that, he, can, update, his, facebo...
2    [dived, many, times, for, the, ball, managed, ...
3    [my, whole, body, feels, itchy, and, like, its...
4    [no, it, not, behaving, at, all, mad, why, am,...
5                              [not, the, whole, crew]
6                                          [need, hug]
7    [hey, long, time, no, see, yes, rains, bit, on...
8                         [nope, they, didn, have, it]
9                                     [que, me, muera]
Name: tokenized_text, dtype: object

# Reduce the size of df

In [12]:
df_percent = df_cleaned.sample(frac=0.2, random_state=42)
df_percent = df_percent.reset_index(drop=True)

# Stemming & Lemma

In [13]:
df_to_be_stemmed = df_percent.copy()

### PoterStammer

In [14]:
from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()
df_potter_stemmed = df_to_be_stemmed.copy()
# Get the stemmed_tokens
df_potter_stemmed['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] 
                                       for tokens in df_potter_stemmed['tokenized_text']]
df_potter_stemmed['stemmed_tokens'].head(10)

0                               [ahhh, hope, your, ok]
1          [cool, have, no, tweet, app, for, my, razr]
2    [know, just, famili, drama, it, lame, hei, nex...
3    [school, email, won, open, and, have, geograph...
4                             [upper, airwai, problem]
5            [go, to, miss, pastor, sermon, on, faith]
6         [on, lunch, dj, should, come, eat, with, me]
7                 [oh, why, ar, you, feel, like, that]
8    [gahh, noo, peyton, need, to, live, thi, is, h...
9    [thank, you, glad, you, like, it, there, is, p...
Name: stemmed_tokens, dtype: object

### Lancaster

In [15]:
status = CellStatus.SKIPPED
df_lancaster_stemmed = df_to_be_stemmed.copy()

if status == CellStatus.RUN:
    from nltk.stem.lancaster import LancasterStemmer
    lancaster_stemmer = LancasterStemmer()
    # Get the stemmed_tokens
    df_lancaster_stemmed['stemmed_tokens'] = [[lancaster_stemmer.stem(word) for word in tokens] 
                                            for tokens in df_to_be_stemmed['tokenized_text']]
    df_lancaster_stemmed['stemmed_tokens'].head(10)

### Snowball

In [16]:
status = CellStatus.SKIPPED
df_snowball_stemmed = df_to_be_stemmed.copy()

if status == CellStatus.RUN:
    from nltk.stem.snowball import EnglishStemmer
    snowball_stemmer = EnglishStemmer()
    # Get the stemmed_tokens
    df_snowball_stemmed['stemmed_tokens'] = [[snowball_stemmer.stem(word) for word in tokens] 
                                            for tokens in df_to_be_stemmed['tokenized_text']]
    df_snowball_stemmed['stemmed_tokens'].head(10)

### Lemmatisation

In [17]:
status = CellStatus.SKIPPED
df_lemmatized = df_to_be_stemmed.copy()

if status == CellStatus.RUN:
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    # Get the lemmatized_tokens
    df_lemmatized['lemmatized_tokens'] = [[wordnet_lemmatizer.lemmatize(word) for word in tokens] 
                                          for tokens in df_to_be_stemmed['tokenized_text']]
    df_lemmatized['lemmatized_tokens'].head(10)


In [18]:
df_lemmatized.head()

Unnamed: 0,target,text,tokenized_text
0,-1,AHHH I HOPE YOUR OK!!!,"[ahhh, hope, your, ok]"
1,-1,"cool , i have no tweet apps for my razr 2","[cool, have, no, tweet, apps, for, my, razr]"
2,-1,i know just family drama. its lame.hey next ...,"[know, just, family, drama, its, lame, hey, ne..."
3,-1,School email won't open and I have geography ...,"[school, email, won, open, and, have, geograph..."
4,-1,upper airways problem,"[upper, airways, problem]"


## Split into Train and Test Sets

- Train data ( Subset of data for training ML Model) ~70%
- Test data (Subset of data for testing ML Model trained from the train data)

In [19]:
from sklearn.model_selection import train_test_split

def split_train_test(data, sentiment_value_col, tokenised_text_col, test_size=0.3, shuffle_state=True):
    X_train, X_test, Y_train, Y_test = train_test_split( data[tokenised_text_col],
                                                        data[sentiment_value_col], 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(Y_train.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame()
    Y_train = Y_train.reset_index()
    Y_test = Y_test.to_frame()
    Y_test = Y_test.reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = split_train_test(df_potter_stemmed, 'target', 'stemmed_tokens')

Value counts for Train sentiments
target
 1    112187
-1    111813
Name: count, dtype: int64
Value counts for Test sentiments
target
 1    48319
-1    47681
Name: count, dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
    index                                     stemmed_tokens
0   68594  [try, my, hand, at, henna, think, made, it, to...
1   18222                     [it, sure, is, see, ya, there]
2  312566        [we, go, togeth, like, vegemit, and, toast]
3  105661                 [xoxo, on, love, god, bless, xoxo]
4  121600  [sox, got, cheeki, to, get, the, lowdown, on, ...


# Word2Vec 

## Save-gram approach

### Generate model

In [20]:
genModelStatus = CellStatus.RUN
from gensim.models import Word2Vec
import time
# Skip-gram model (sg = 1)
vector_size = 1000
window = 3
min_count = 1
workers = 3
sg = 1
word2vec_model_file = OUTPUT_FOLDER + 'word2vec_' + str(vector_size) + 'savegram' + '.model'
reduced_mode_file = OUTPUT_FOLDER + 'word2vec_' + str(vector_size) + 'savegram_reduced' + '.model'
if genModelStatus == CellStatus.RUN:
    start_time = time.time()
    stemmed_tokens = pd.Series(df_potter_stemmed['stemmed_tokens']).values
    # Train the Word2Vec Model
    w2v_model = Word2Vec(stemmed_tokens, min_count = min_count, vector_size = vector_size, workers = workers, window = window, sg = sg)
    w2v_model.save(reduced_mode_file)

### Load model

In [21]:
import numpy as np
# Load the model from the model file
sg_w2v_model = Word2Vec.load(reduced_mode_file)
sg_w2v_model_wv = sg_w2v_model.wv
# Unique ID of the word
print("Index of the word 'action':")
print(sg_w2v_model_wv.key_to_index["action"])
# Total number of the words 
print(len(sg_w2v_model_wv.key_to_index))
# Print the size of the word2vec vector for one word
print("Length of the vector generated for a word")
print(len(sg_w2v_model_wv['action']))
# Get the mean for the vectors for an example review
print("Print the length after taking average of all word vectors in a sentence:")
print(np.mean([sg_w2v_model_wv[token] for token in df_potter_stemmed['stemmed_tokens'][0]], axis=0))

Index of the word 'action':
1718
80036
Length of the vector generated for a word
1000
Print the length after taking average of all word vectors in a sentence:
[ 1.15445293e-01  7.10414350e-02  1.28901303e-01  9.97356847e-02
 -6.46951273e-02 -2.51880437e-02  8.22479948e-02  5.04307598e-02
  1.65715776e-02 -5.64654684e-03  2.86032427e-02 -2.19950899e-02
  2.00843383e-02  4.44456488e-02  1.82320014e-01 -2.08851602e-02
 -9.42177698e-02 -6.62700161e-02  6.79949373e-02 -2.14365363e-01
  8.09222013e-02 -6.92176893e-02 -2.97145955e-02 -2.75057480e-02
  3.15546468e-02 -1.87980995e-01  1.51497394e-01 -4.21678536e-02
 -1.55515343e-01  6.84469566e-02  1.70881599e-01  4.86952364e-02
  9.16128047e-03 -9.65458974e-02  1.05141453e-01 -1.42927051e-01
  1.84374064e-01 -8.77424609e-03 -3.12299803e-02 -1.37113035e-01
 -2.44332775e-01  7.41657913e-02 -9.23033729e-02  1.42419904e-01
 -4.39604931e-02 -4.07571308e-02 -1.10421732e-01  8.35528225e-02
 -1.14122748e-01  4.85246107e-02 -5.54377437e-02 -4.81206737e

In [22]:
saveWord2VecCSVStatus = CellStatus.RUN
if saveWord2VecCSVStatus == CellStatus.RUN:
    word2vec_filename = OUTPUT_FOLDER + 'train_review_word2vec_sg.csv'
    with open(word2vec_filename, 'w+') as word2vec_file:
        for index, row in X_train.iterrows():
            model_vector = (np.mean([sg_w2v_model_wv[token] for token in row['stemmed_tokens']], axis=0))
            v_norm = model_vector / (np.linalg.norm(model_vector) + 1e-16)
            v_norm = v_norm.tolist()
            if index == 0:
                header = ",".join(str(ele) for ele in range(1000))
                word2vec_file.write(header)
                word2vec_file.write("\n")
            # Check if the line exists else it is vector of zeros
            if type(v_norm) is list:  
                line1 = ",".join( [str(vector_element) for vector_element in v_norm] )
            else:
                line1 = ",".join([str(0) for i in range(1000)])
            word2vec_file.write(line1)
            word2vec_file.write('\n')

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


# Training Model

In [None]:
test_features_word2vec = []
for index, row in X_test.iterrows():
    model_vector = np.mean([sg_w2v_model_wv[token] for token in row['stemmed_tokens']], axis=0)
    v_norm = model_vector / (np.linalg.norm(model_vector) + 1e-16)
    v_norm = v_norm.tolist()
    if type(v_norm) is list:
        test_features_word2vec.append(v_norm)
    else:
        test_features_word2vec.append(np.array([0 for i in range(1000)]))

## Decision Tree Classifier

In [23]:
from sklearn.tree import DecisionTreeClassifier
# Load from the filename
word2vec_df = pd.read_csv("../model/train_review_word2vec_sg.csv")
#Initialize the model
clf_decision_word2vec = DecisionTreeClassifier()

# Fit the model
clf_decision_word2vec.fit(word2vec_df, Y_train['target'])

Testing the model

In [36]:
from sklearn.metrics import classification_report
test_features_word2vec = []
for index, row in X_test.iterrows():
    model_vector = np.mean([sg_w2v_model_wv[token] for token in row['stemmed_tokens']], axis=0)
    v_norm = model_vector / (np.linalg.norm(model_vector) + 1e-16)
    v_norm = v_norm.tolist()
    if type(v_norm) is list:
        test_features_word2vec.append(v_norm)
    else:
        test_features_word2vec.append(np.array([0 for i in range(1000)]))
test_predictions_word2vec = clf_decision_word2vec.predict(test_features_word2vec)

print(classification_report(Y_test['target'],test_predictions_word2vec))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


              precision    recall  f1-score   support

          -1       0.64      0.64      0.64     47681
           1       0.64      0.64      0.64     48319

    accuracy                           0.64     96000
   macro avg       0.64      0.64      0.64     96000
weighted avg       0.64      0.64      0.64     96000



In [37]:
len(test_features_word2vec)

96000

In [25]:
import joblib
joblib.dump(clf_decision_word2vec, OUTPUT_FOLDER + 'decision_tree_word2vec.pkl')

['../model/decision_tree_word2vec.pkl']

In [26]:
Y_test['predicted'] = pd.Series(test_predictions_word2vec)

## SVM

### Standard

In [27]:
from sklearn.svm import SVC
#Initialize the model
svm_classifier = SVC()

# Fit the model
svm_classifier.fit(word2vec_df, Y_train['target'])
joblib.dump(svm_classifier, OUTPUT_FOLDER + 'svm_classifier.pkl')

['../model/svm_classifier.pkl']

In [45]:
from sklearn.metrics import classification_report

test_predictions_word2vec_svm = svm_classifier.predict(test_features_word2vec)
print(len(test_features_word2vec))
print(len(test_predictions_word2vec_svm))


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [None]:
print(classification_report(Y_test['target'],test_predictions_word2vec_svm))

In [39]:
len(test_predictions_word2vec_svm)


383760

### with standard scaler

In [None]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
#Initialize the model
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))

# Fit the model
clf.fit(word2vec_df, Y_train['target'])
joblib.dump(clf, OUTPUT_FOLDER + 'svm_classifier_scl.pkl')


In [None]:
from sklearn.metrics import classification_report
        
test_predictions_word2vec_svm_scaled = clf.predict(test_features_word2vec)

print(classification_report(Y_test['target'], test_predictions_word2vec_svm_scaled))