In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

OUTPUT_FOLDER = '../model/'

from enum import Enum
class CellStatus(Enum):
    RUN = 1
    SKIPPED = 0

In [3]:
column_names = ["target", "id", "date", "flag", "user", "text"]

# Read the CSV file with specified column names
df = pd.read_csv("../dataset/training.1600000.processed.noemoticon.csv", 
                 encoding="ISO-8859-1", names=column_names)

Columns in dataset

In [4]:
print("Columns in the original dataset:\n")
print(df.columns)

Columns in the original dataset:

Index(['target', 'id', 'date', 'flag', 'user', 'text'], dtype='object')


Example of an Row in dataset

In [5]:
df.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [6]:
df.describe()


Unnamed: 0,target,id
count,1600000.0,1600000.0
mean,2.0,1998818000.0
std,2.000001,193576100.0
min,0.0,1467810000.0
25%,0.0,1956916000.0
50%,2.0,2002102000.0
75%,4.0,2177059000.0
max,4.0,2329206000.0


Clean data

In [7]:
df_cleaned = df.dropna()
df_cleaned = df_cleaned.drop_duplicates()
df_cleaned = df_cleaned.drop(columns=["date", "id", "flag", "user"])
df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned.describe()

Unnamed: 0,target
count,1600000.0
mean,2.0
std,2.000001
min,0.0
25%,0.0
50%,2.0
75%,4.0
max,4.0


In [8]:
df_cleaned.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Remove twitter tag and hashtag

In [9]:
import re

df_cleaned['text'] = df_cleaned['text'].apply(lambda x: re.sub(r"http\S+|@\S+|#\S+", "", x))


In [10]:
df_cleaned.head()

Unnamed: 0,target,text
0,0,"- Awww, that's a bummer. You shoulda got Da..."
1,0,is upset that he can't update his Facebook by ...
2,0,I dived many times for the ball. Managed to s...
3,0,my whole body feels itchy and like its on fire
4,0,"no, it's not behaving at all. i'm mad. why am..."


convert target back to -1 0 1


In [11]:
df_cleaned['target'] = df_cleaned['target'].map({0: -1, 2: 0, 4: 1})

df_cleaned['target'].value_counts()

target
-1    800000
 1    800000
Name: count, dtype: int64

Tokenisation

In [12]:
from gensim.utils import simple_preprocess
# Tokenize the text column to get the new column 'tokenized_text'
df_cleaned['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in df_cleaned['text']]
df_cleaned['tokenized_text'].head(10)

KeyboardInterrupt: 

## Stemming & Lemma

### PoterStammer

In [None]:
df_to_be_stemmed = df_cleaned.copy()

In [None]:
from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()
df_potter_stemmed = df_to_be_stemmed.copy()
# Get the stemmed_tokens
df_potter_stemmed['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] 
                                       for tokens in df_to_be_stemmed['tokenized_text']]
df_potter_stemmed['stemmed_tokens'].head(10)

0    [awww, that, bummer, you, shoulda, got, david,...
1    [is, upset, that, he, can, updat, hi, facebook...
2    [dive, mani, time, for, the, ball, manag, to, ...
3    [my, whole, bodi, feel, itchi, and, like, it, ...
4    [no, it, not, behav, at, all, mad, why, am, he...
5                              [not, the, whole, crew]
6                                          [need, hug]
7    [hei, long, time, no, see, ye, rain, bit, onli...
8                         [nope, thei, didn, have, it]
9                                     [que, me, muera]
Name: stemmed_tokens, dtype: object

### Lancaster

In [None]:
status = CellStatus.SKIPPED
df_lancaster_stemmed = df_to_be_stemmed.copy()

if status == CellStatus.RUN:
    from nltk.stem.lancaster import LancasterStemmer
    lancaster_stemmer = LancasterStemmer()
    # Get the stemmed_tokens
    df_lancaster_stemmed['stemmed_tokens'] = [[lancaster_stemmer.stem(word) for word in tokens] 
                                            for tokens in df_to_be_stemmed['tokenized_text']]
    df_lancaster_stemmed['stemmed_tokens'].head(10)

0    [awww, that, bum, you, should, got, david, car...
1    [is, upset, that, he, can, upd, his, facebook,...
2    [div, many, tim, for, the, bal, man, to, sav, ...
3    [my, whol, body, feel, itchy, and, lik, it, on...
4    [no, it, not, behav, at, al, mad, why, am, her...
5                               [not, the, whol, crew]
6                                           [nee, hug]
7    [hey, long, tim, no, see, ye, rain, bit, on, b...
8                           [nop, they, didn, hav, it]
9                                      [que, me, muer]
Name: stemmed_tokens, dtype: object

### Snowball

In [None]:
status = CellStatus.SKIPPED
df_snowball_stemmed = df_to_be_stemmed.copy()

if status == CellStatus.RUN:
    from nltk.stem.snowball import EnglishStemmer
    snowball_stemmer = EnglishStemmer()
    # Get the stemmed_tokens
    df_snowball_stemmed['stemmed_tokens'] = [[snowball_stemmer.stem(word) for word in tokens] 
                                            for tokens in df_to_be_stemmed['tokenized_text']]
    df_snowball_stemmed['stemmed_tokens'].head(10)

0    [awww, that, bummer, you, shoulda, got, david,...
1    [is, upset, that, he, can, updat, his, faceboo...
2    [dive, mani, time, for, the, ball, manag, to, ...
3    [my, whole, bodi, feel, itchi, and, like, it, ...
4    [no, it, not, behav, at, all, mad, whi, am, he...
5                              [not, the, whole, crew]
6                                          [need, hug]
7    [hey, long, time, no, see, yes, rain, bit, onl...
8                         [nope, they, didn, have, it]
9                                     [que, me, muera]
Name: stemmed_tokens, dtype: object

### Lemmatisation

In [None]:
status = CellStatus.SKIPPED
df_lemmatized = df_to_be_stemmed.copy()

if status == CellStatus.RUN:
    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    # Get the lemmatized_tokens
    df_lemmatized['lemmatized_tokens'] = [[wordnet_lemmatizer.lemmatize(word) for word in tokens] 
                                          for tokens in df_to_be_stemmed['tokenized_text']]
    df_lemmatized['lemmatized_tokens'].head(10)


0    [awww, that, bummer, you, shoulda, got, david,...
1    [is, upset, that, he, can, update, his, facebo...
2    [dived, many, time, for, the, ball, managed, t...
3    [my, whole, body, feel, itchy, and, like, it, ...
4    [no, it, not, behaving, at, all, mad, why, am,...
5                              [not, the, whole, crew]
6                                          [need, hug]
7    [hey, long, time, no, see, yes, rain, bit, onl...
8                         [nope, they, didn, have, it]
9                                     [que, me, muera]
Name: lemmatized_tokens, dtype: object

In [None]:
df_lemmatized.head()

Unnamed: 0,target,text,tokenized_text,lemmatized_tokens
0,-1,"- Awww, that's a bummer. You shoulda got Da...","[awww, that, bummer, you, shoulda, got, david,...","[awww, that, bummer, you, shoulda, got, david,..."
1,-1,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can, update, his, facebo...","[is, upset, that, he, can, update, his, facebo..."
2,-1,I dived many times for the ball. Managed to s...,"[dived, many, times, for, the, ball, managed, ...","[dived, many, time, for, the, ball, managed, t..."
3,-1,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[my, whole, body, feel, itchy, and, like, it, ..."
4,-1,"no, it's not behaving at all. i'm mad. why am...","[no, it, not, behaving, at, all, mad, why, am,...","[no, it, not, behaving, at, all, mad, why, am,..."


## Split into Train and Test Sets

- Train data ( Subset of data for training ML Model) ~70%
- Test data (Subset of data for testing ML Model trained from the train data)

In [None]:
from sklearn.model_selection import train_test_split

def split_train_test(data, sentiment_value_col, tokenised_text_col, test_size=0.3, shuffle_state=True):
    X_train, X_test, Y_train, Y_test = train_test_split( data[tokenised_text_col],
                                                        data[sentiment_value_col], 
                                                        shuffle=shuffle_state,
                                                        test_size=test_size, 
                                                        random_state=15)
    print("Value counts for Train sentiments")
    print(Y_train.value_counts())
    print("Value counts for Test sentiments")
    print(Y_test.value_counts())
    print(type(X_train))
    print(type(Y_train))
    X_train = X_train.reset_index()
    X_test = X_test.reset_index()
    Y_train = Y_train.to_frame()
    Y_train = Y_train.reset_index()
    Y_test = Y_test.to_frame()
    Y_test = Y_test.reset_index()
    print(X_train.head())
    return X_train, X_test, Y_train, Y_test

X_train, X_test, Y_train, Y_test = split_train_test(df_potter_stemmed, 'target', 'stemmed_tokens')

Value counts for Train sentiments
target
 1    560461
-1    559539
Name: count, dtype: int64
Value counts for Test sentiments
target
-1    240461
 1    239539
Name: count, dtype: int64
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
     index                                     stemmed_tokens
0  1448643  [and, listen, to, the, song, too, paper, mean,...
1  1423081  [think, im, gonna, read, the, half, blood, pri...
2  1598349  [just, woke, up, text, emili, goin, out, to, f...
3   405940  [wai, to, make, me, jealou, no, bug, in, my, g...
4  1050615  [hah, wa, read, and, found, out, he, made, old...


# Word2Vec 

## Save-gram approach

### Generate model

In [None]:
from gensim.models import Word2Vec
import time
# Skip-gram model (sg = 1)
vector_size = 1000
window = 3
min_count = 1
workers = 3
sg = 1

word2vec_model_file = OUTPUT_FOLDER + 'word2vec_' + str(vector_size) + 'savegram' + '.model'
start_time = time.time()
stemmed_tokens = pd.Series(df_potter_stemmed['stemmed_tokens']).values
# Train the Word2Vec Model
w2v_model = Word2Vec(stemmed_tokens, min_count = min_count, vector_size = vector_size, workers = workers, window = window, sg = sg)
w2v_model.save(word2vec_model_file)

### Load model

In [None]:
import numpy as np
# Load the model from the model file
sg_w2v_model = Word2Vec.load(word2vec_model_file)
sg_w2v_model_wv = sg_w2v_model.wv
# Unique ID of the word
print("Index of the word 'action':")
print(sg_w2v_model_wv.key_to_index["action"])
# Total number of the words 
print(len(sg_w2v_model_wv.key_to_index))
# Print the size of the word2vec vector for one word
print("Length of the vector generated for a word")
print(len(sg_w2v_model_wv['action']))
# Get the mean for the vectors for an example review
print("Print the length after taking average of all word vectors in a sentence:")
print(np.mean([sg_w2v_model_wv[token] for token in df_potter_stemmed['stemmed_tokens'][0]], axis=0))

Index of the word 'action':
1725
212909
Length of the vector generated for a word
1000
Print the length after taking average of all word vectors in a sentence:
[ 1.23712711e-01  2.62127500e-02  6.71377555e-02  9.57544819e-02
 -8.67230294e-04  8.37140810e-03  5.66953830e-02  5.25718667e-02
 -8.88865665e-02  1.18375942e-01 -3.54103930e-02 -2.81088352e-02
  1.31027149e-02  1.21999085e-02  1.03207193e-01 -4.61583920e-02
 -7.42727965e-02  9.92470421e-03  2.68388987e-02 -2.13900417e-01
  7.09337965e-02  1.75605398e-02  1.79810245e-02  2.84867249e-02
  8.40547681e-02 -1.13794887e-02  8.05658028e-02 -4.93748225e-02
 -2.28698328e-01  8.61635581e-02  5.02824001e-02 -9.04745888e-03
 -1.69876087e-02 -8.65495279e-02  9.35147479e-02  1.80817209e-02
  1.08934522e-01  6.13867529e-02 -1.09635241e-01 -1.32616594e-01
 -1.48112461e-01  6.82336092e-02 -7.32604414e-02  1.31077066e-01
 -4.97542098e-02 -3.25298794e-02 -1.17841065e-01  8.74581262e-02
 -1.12966195e-01  4.94225845e-02 -2.34737378e-02 -5.92518412

In [None]:
word2vec_filename = OUTPUT_FOLDER + 'train_review_word2vec_sg.csv'
with open(word2vec_filename, 'w+') as word2vec_file:
    for index, row in X_train.iterrows():
        model_vector = (np.mean([sg_w2v_model_wv[token] for token in row['stemmed_tokens']], axis=0)).tolist()
        if index == 0:
            header = ",".join(str(ele) for ele in range(1000))
            word2vec_file.write(header)
            word2vec_file.write("\n")
        # Check if the line exists else it is vector of zeros
        if type(model_vector) is list:  
            line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
        else:
            line1 = ",".join([str(0) for i in range(1000)])
        word2vec_file.write(line1)
        word2vec_file.write('\n')

NameError: name 'OUTPUT_FOLDER' is not defined