# Import

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import sklearn

from collections import Counter

# Load data

In [2]:
def load(f):
    data = []
    with open(f, encoding="latin-1") as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            data.append((row['SentimentText'], int(row['Sentiment'])))
    return data

In [3]:
train = load('data/train.csv')

In [4]:
labels = {
    0: "negative",
    1: "positive",
}

# Train / test split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
x = [i[0] for i in train]
y = [i[1] for i in train]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Text vectorization

In [10]:
import numpy as np

In [11]:
import gensim
w2v = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True) 

In [65]:
def get_w2v(t):
    #return [w2v[w] if w in w2v else np.zeros(300) for w in t.split()]
    return np.mean([w2v[w] if w in w2v else np.zeros(300) for w in t.split()], axis=0)

In [66]:
x_train_vec = [get_w2v(w) for w in x_train]

In [67]:
x_train_vec[:2]

[array([ 0.05782064,  0.00990194,  0.05829264,  0.12293294, -0.06476237,
         0.02371419,  0.04605306, -0.03364258,  0.04059652,  0.03964691,
        -0.07250926, -0.1015625 , -0.04216309,  0.01258952, -0.11850586,
         0.0619161 ,  0.04527181,  0.02780762,  0.04323018, -0.06141764,
        -0.03304685,  0.01449382,  0.0433431 , -0.00578461,  0.0295887 ,
        -0.02105713, -0.03505452,  0.04740295, -0.00938721,  0.04040527,
        -0.03079427,  0.03886312, -0.02034505, -0.02212524, -0.01647695,
         0.00868327,  0.04377441, -0.00889587,  0.02038981,  0.03836263,
         0.06206868, -0.05369059,  0.14355469, -0.05217082, -0.02637126,
         0.02607117, -0.00088298, -0.01057739,  0.03606364, -0.04659831,
        -0.02378845,  0.07169189, -0.02211914,  0.01251628,  0.02906087,
         0.04298503, -0.0008372 , -0.04524536,  0.05209859, -0.01716105,
         0.01054688,  0.05267334, -0.06131185, -0.02805583,  0.00571289,
        -0.01030986, -0.07556152,  0.11896973, -0.0

In [68]:
len(x_train_vec[0])

300

# Train model

In [69]:
from sklearn.svm import LinearSVC

In [70]:
model = LinearSVC()
model.fit(x_train_vec[:10], y_train[:10])

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

# Test model

In [74]:
def test_it(i):
    print(x_test[i])
    print("{} ({})".format(labels[model.predict([get_w2v(x_test[i])])[0]], labels[y_test[i]]))

In [76]:
for i in range(10):
    test_it(i)
    print()

@Allieandra wheeee! 
negative (positive)

@a02toyota Thank you for the FF! Good to meet ya 
positive (positive)

@ electricbath Eewwww. Gross! So sorry hayward hates you like that. 
positive (negative)

#followfriday - I'm a little late, but here's a special shoutout for @SomersetMarcy - my missus! 
positive (positive)

#icanhelp in shopping (deals), personal assistant, event planning!! I own GET IT TOGETHER, those are my services  jennifer.git@gmail.com
positive (positive)

 broken hearts will heal with time...
positive (negative)

..I've already listened to all the S4 commentary except the finale 
positive (negative)

&quot;Everybody make mistakes.&quot; I'm gonna go get some sleep because I have an other show tomorrow night and I want it to be peeeeerfect! 
positive (positive)

#I Believe...that if you smile at someone, friend or stranger, you will make TWO people feel good.  
positive (positive)

@andreacFOD I think I'm done at twitterland too. I will tweet David one last time tomo

# Evaluate model

In [16]:
from sklearn.metrics import classification_report

In [17]:
y_pred = model.predict(get_w2v(x_test) for i in x_test])
print(classification_report(y_test, y_pred, target_names=labels.values()))

             precision    recall  f1-score   support

   negative       0.74      0.71      0.73      8750
   positive       0.78      0.80      0.79     11248

avg / total       0.76      0.76      0.76     19998

