In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from gensim.models import Word2Vec
# from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import numpy as np

Dataset: https://www.kaggle.com/kazanova/sentiment140

In [2]:
# Load the raw Sentiment140 dump. The file is read without a header row, so the
# column labels are supplied here at read time.
df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    header=None,
    names=['target', 'id', 'date', 'flag', 'user', 'text'],
)

In [3]:
# Select equal number of positives and negatives.
# NOTE(review): this assumes the file is ordered with all negatives first and
# all positives second, meeting at the midpoint — confirm with value_counts().
SIZE = 100_000
HALF = df.shape[0]//2

# Take SIZE rows centred on the midpoint and keep only the label and the text.
# .copy() makes the subset an independent frame, so the relabelling below is
# not an assignment into a slice of the full 1.6M-row frame.
df = df.iloc[HALF - SIZE//2 : HALF + SIZE//2][['target', 'text']].copy()

# Convert 4 (positive sentiment) to 1.
# A single .loc call is used instead of the chained df['target'].loc[...] = 1,
# which triggers SettingWithCopyWarning and is a no-op under Copy-on-Write.
df.loc[df['target'] == 4, 'target'] = 1

In [4]:
# Sanity check: number of rows per class in the selected subset.
df['target'].value_counts()

0    50000
1    50000
Name: target, dtype: int64

#### Preprocessing:

From: https://github.com/francisbautista/cs174/blob/master/Notebooks/L04%20-%20Word2Vec.ipynb

In [5]:
# English stop words from NLTK, held in a set for fast membership tests.
stop = set(stopwords.words('english'))


def preprocess(text):
    """Turn a raw tweet into a list of lowercase tokens without stop words.

    The text is lowercased, every run of characters outside ``0-9a-z`` is
    replaced by a single space, and the result is split on whitespace.

    Args:
        text: The raw tweet as a string.

    Returns:
        list[str]: The tokens that are not in ``stop``, in original order.
    """
    normalised = re.sub('[^0-9a-z]+', ' ', text.lower())
    return [token for token in normalised.split() if token not in stop]

In [6]:
# Replace each raw tweet string with its list of tokens.
# NOTE: this overwrites the 'text' column in place, so the cell is not
# idempotent — running it a second time would pass lists to preprocess(),
# which calls text.lower(). Re-run from the loading cell instead.
df['text'] = df['text'].apply(preprocess)

#### Find word embeddings with Word2Vec

In [7]:
# Configure Word2Vec: 100-dimensional vectors, a context window of 5 tokens,
# and sg=0 (CBOW rather than skip-gram). Words seen fewer than 30 times are
# left out of the vocabulary; alpha is the initial learning rate.
model = Word2Vec(vector_size=100,window=5,min_count=30, sg=0, alpha = 0.025)
# Scan the tokenised tweets once to build the vocabulary before training.
model.build_vocab(df['text'])

In [8]:
# Train the embeddings on the tokenised tweets for 20 epochs.
# total_examples reuses the sentence count the model stored in corpus_count.
model.train(df['text'], total_examples=model.corpus_count, epochs=20)

(10628337, 15747320)

In [9]:
# Inspect the learned embedding of a single word.
model.wv['test']

array([ 0.8401327 , -0.12655145, -0.2615346 , -0.08966108, -0.2411643 ,
        0.17588767, -0.05176534, -0.77485263, -0.9294774 , -0.04847171,
       -1.0483187 ,  0.03595317,  0.6951262 ,  0.71502125, -0.80962217,
        1.4862753 , -1.1999292 , -1.9038987 ,  1.0247109 , -0.7031982 ,
       -0.3057754 ,  1.122382  ,  0.04555465,  0.4490463 ,  0.5786158 ,
       -0.18380849, -0.5258619 , -0.50375044, -0.5838184 , -1.2942679 ,
        1.7767905 , -0.10134457, -0.18711028,  0.5720447 ,  1.6598778 ,
       -0.4235221 ,  0.18281151, -1.0474579 ,  1.0910946 ,  0.9474104 ,
        0.26177794, -0.87623984, -0.64355826,  0.4212355 , -1.2992284 ,
       -1.9732645 ,  1.4332736 ,  0.97949386,  1.228907  ,  0.39390478,
       -0.03572264,  0.28478923,  0.3840845 ,  0.8590543 ,  0.15114051,
        2.1014726 ,  1.9921099 , -1.7943714 , -0.15593657,  1.31003   ,
        0.13420065,  0.44884428, -1.4568623 , -0.44079947, -1.4011059 ,
        0.8235898 ,  1.575923  , -1.8408499 ,  1.3058144 , -0.12

In [10]:
# Qualitative check: the words whose embeddings are most similar to 'test'.
model.wv.most_similar('test')

[('exam', 0.7302709817886353),
 ('math', 0.6260785460472107),
 ('maths', 0.5837710499763489),
 ('essay', 0.5488294363021851),
 ('tests', 0.5474931001663208),
 ('study', 0.517629861831665),
 ('lesson', 0.49804648756980896),
 ('classes', 0.48566314578056335),
 ('results', 0.4845176339149475),
 ('report', 0.47734901309013367)]

#### Get the average (unit-normalised) word vector for each tweet

In [11]:
# model.wv is the KeyedVectors object that holds the trained word vectors.
model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x106798dc0>

In [12]:
def get_avg_vec(list_of_words, model=None):
    """Return a unit-length vector summarising the words of one tweet.

    The embeddings of all tokens found in the model's vocabulary are summed
    and the sum is scaled to unit L2 norm. The result points in the same
    direction as the arithmetic mean of those embeddings.

    Args:
        list_of_words: Iterable of tokens.
        model: Object exposing ``model.wv[word]`` and ``model.wv.vector_size``
            (e.g. a trained Word2Vec model). Must be supplied; the ``None``
            default exists only so it can be passed by keyword.

    Returns:
        numpy.ndarray of shape ``(vector_size,)``. If no token is in the
        vocabulary (or the vectors cancel out exactly) the zero vector is
        returned instead of the NaN vector that a 0/0 division would produce.
    """
    vec = np.zeros(model.wv.vector_size)
    for word in list_of_words:
        try:
            vec += model.wv[word]
        except KeyError:
            # Out-of-vocabulary token: it contributes nothing to the sum.
            # Any other exception is a real error and is allowed to surface.
            continue
    norm = np.sqrt(vec @ vec)
    if norm == 0:
        # Nothing to normalise; avoid dividing by zero.
        return vec
    return vec / norm

In [13]:
# Compute one feature vector per tweet; apply() forwards the `model` keyword
# argument to get_avg_vec.
df['avg_vec'] = df['text'].apply(get_avg_vec, model=model)

In [14]:
# Renumber the rows from 0 (the old index is discarded), then display the frame.
df = df.reset_index(drop=True)
df

Unnamed: 0,target,text,avg_vec
0,0,"[opotopo, small, slip, tryfan, weeks, back, fe...","[0.016425926464839584, 0.12428556633569253, 0...."
1,0,"[idristwilight, post, han, want, great, still,...","[-0.026964444126323477, 0.18897072704175477, -..."
2,0,"[rose, 7, ohh, poor, jan, please, tell, cans, ...","[0.07908398287381788, 0.14591578719337936, 0.0..."
3,0,"[finally, home, work, looong, day, monday]","[0.08054435665207227, 0.10106137663281431, -0...."
4,0,"[im, sad, 4, chantelle, tom]","[0.07889970227565675, 0.03331143744581845, -0...."
...,...,...,...
99995,1,"[need, 8, followers, compleate, 1000, follow, ...","[-0.10657259495495827, -0.024300254978452996, ..."
99996,1,"[knew, explain, something, friend, said, star,...","[0.03454132665704356, 0.09592750909769618, -0...."
99997,1,"[done, tweeting, til, tomorrow]","[0.06679701033272802, -0.011243648040071, -0.0..."
99998,1,"[cmozilo, act, ii, set, pretty, breath, taking...","[0.1404100067437189, 0.1036930500500789, -0.04..."


#### Split into train, test, validation sets

* 70% = training
* 15% = test
* 15% = validation

Shuffle the dataset:

In [15]:
# Shuffle all rows (frac=1) and renumber the index from 0.
# A fixed random_state keeps the shuffle — and therefore the split built from
# it — identical across kernel restarts.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [16]:
def _vector_column(frame):
    """Return the frame's 'avg_vec' column as an (n, 1) NumPy array."""
    return frame['avg_vec'].to_numpy().reshape(-1, 1)


# Positional split of the shuffled frame: the first 70,000 rows for training,
# the next 15,000 for testing and the remainder for validation.
train_set, test_set, val_set = (
    df_shuffled.iloc[:70_000],
    df_shuffled.iloc[70_000:85_000],
    df_shuffled.iloc[85_000:],
)

X_train, X_test, X_val = (
    _vector_column(part) for part in (train_set, test_set, val_set)
)

# Only the training labels are converted to a NumPy array; the test and
# validation labels are kept as pandas Series.
y_train = train_set['target'].to_numpy()
y_test, y_val = test_set['target'], val_set['target']

In [17]:
# Each X_* arrives as an (n, 1) object array whose cells each hold one 1-D
# embedding. Stack those embeddings into a dense (n, vector_size) float matrix.
# The width comes from the vectors themselves, so nothing here needs changing
# if the Word2Vec vector_size is altered (the old code hard-coded 100).
X_train = np.stack(X_train.ravel())
X_test = np.stack(X_test.ravel())
X_val = np.stack(X_val.ravel())

In [18]:
# Check the feature matrix dimensions: (number of tweets, embedding size).
X_train.shape

(70000, 100)

In [19]:
# The label vector should have one entry per row of X_train.
y_train.shape

(70000,)

#### Classify with RandomForest

Replace NaN values with zeros:

In [20]:
# A tweet with no in-vocabulary words can yield NaN features. Replace NaN with
# zeros in every split — not only the training set — so that the test and
# validation matrices are cleaned the same way as the data the model is fit on.
X_train = np.nan_to_num(X_train)
X_test = np.nan_to_num(X_test)
X_val = np.nan_to_num(X_val)

In [21]:
# Random forest with scikit-learn's default hyper-parameters. random_state
# fixes the randomness used while growing the trees so that the fitted model
# (and its predictions) can be reproduced.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)

RandomForestClassifier()

In [23]:
# Print the prediction next to the true label for up to the first 30 test tweets.
for i in range(min(30, len(test_set))):
    row = test_set.iloc[i]
    features = row['avg_vec']
    # Skip tweets whose vector contains NaN/inf (presumably tweets with no
    # in-vocabulary words). This is checked explicitly instead of wrapping
    # predict() in a bare `except`, which would also hide genuine errors.
    if not np.isfinite(features).all():
        continue
    pred = forest.predict(features.reshape(1, -1))[0]
    print(f"Text: {row['text']}")
    print(f"Prediction: {pred}")
    print(f"Actual: {row['target']}")

    print("*"*20)

Text: ['never', 'go', 'see', 'movie', 'without', 'first', 'reading', 'matt', 'mungle', 'review', 'transformers']
Prediction: 0
Actual: 0
********************
Text: ['woop', '200', 'updates', 'god', 'boring']
Prediction: 0
Actual: 1
********************
Text: ['awake', 'neither', 'caffeinated', 'functional', 'yet', 'awake']
Prediction: 0
Actual: 1
********************
Text: ['williammm', 'lol', 'know', 'always', 'think', 'go', 'vagina', 'monoglues', 'see', 'chocolates']
Prediction: 0
Actual: 1
********************
Text: ['need', 'start', 'take', 'home', 'test']
Prediction: 0
Actual: 0
********************
Text: ['cartoonbeardy', 'oooo', 'jammy', 'git', 'ever', 'look', 'forward', 'post', 'tweets']
Prediction: 1
Actual: 1
********************
Text: ['hypnoticzexy', 'cut', 'shorter', 'imagined', 'growing', 'getting', 'thick', 'miss', 'already']
Prediction: 0
Actual: 0
********************
Text: ['another', 'power', 'day', 'way', 'home']
Prediction: 0
Actual: 1
********************
Text: ['