# Import

In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
import sklearn

from collections import Counter

In [8]:
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


# Load data

In [9]:
def load(f):
    """Read a labelled-text CSV into a list of ``(text, label)`` tuples.

    Parameters
    ----------
    f : str or path-like
        Path of a CSV file whose header row contains at least the columns
        ``SentimentText`` and ``Sentiment``. The file is decoded as latin-1.

    Returns
    -------
    list of (str, int)
        One ``(SentimentText, int(Sentiment))`` pair per data row, in file
        order. A file with only a header row yields an empty list.

    Raises
    ------
    KeyError
        If one of the two required columns is missing from the header.
    ValueError
        If a ``Sentiment`` value cannot be parsed as an integer.
    """
    # newline="" hands line-ending handling to the csv module, so line breaks
    # embedded in quoted text fields come back unchanged instead of being
    # rewritten by universal-newline translation.
    with open(f, encoding="latin-1", newline="") as csvfile:
        return [
            (row["SentimentText"], int(row["Sentiment"]))
            for row in csv.DictReader(csvfile)
        ]

In [10]:
# Load the labelled training texts as a list of (text, label) tuples.
train = load('data/train.csv')

In [11]:
# Display name for each integer class id (0 -> negative, 1 -> positive).
labels = dict(enumerate(("negative", "positive")))

# Train / test split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Separate the texts from their labels, then hold out 20 % of the rows for
# testing; the fixed random_state keeps the split repeatable.
x = [text for text, _ in train]
y = [label for _, label in train]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.20,
)

# Text vectorization

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Fit a TF-IDF vectorizer capped at 400 features on the training texts only,
# and encode those texts as a sparse feature matrix.
vectorizer = TfidfVectorizer(max_features=400)
x_train_vec = vectorizer.fit_transform(x_train)

In [16]:
# Display the matrix summary (shape, dtype, number of stored elements).
x_train_vec

<79991x400 sparse matrix of type '<class 'numpy.float64'>'
	with 595687 stored elements in Compressed Sparse Row format>

# Train model

In [17]:
# Short aliases for the two Keras classes used to build the network.
Dense, Sequential = tf.keras.layers.Dense, tf.keras.models.Sequential

In [18]:
# (n_samples, n_features) of the training matrix; the feature count is what
# sizes the network's input layer.
x_train_vec.shape

(79991, 400)

In [19]:
# Feed-forward classifier over the TF-IDF features: one ReLU hidden layer as
# wide as the input, followed by a 2-unit softmax output.
model = Sequential()
model.add(
    Dense(
        x_train_vec.shape[1],
        activation="relu",
        input_shape=(x_train_vec.shape[1],),
    )
)
# Disabled extra hidden layer, kept for reference:
#model.add(Dense(2048, activation="relu"))
model.add(Dense(2, activation="softmax"))

In [20]:
# Adam optimizer with categorical cross-entropy, which expects one-hot encoded
# targets (built with to_categorical before fitting); accuracy is reported
# alongside the loss.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [29]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 400)               160400    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 802       
Total params: 161,202
Trainable params: 161,202
Non-trainable params: 0
_________________________________________________________________


In [21]:
# One-hot encode the integer training labels into two columns, matching the
# 2-unit softmax output and the categorical cross-entropy loss.
one_hot_labels = tf.keras.utils.to_categorical(y_train, num_classes=2)

In [22]:
# NOTE: np.array() does not densify a SciPy sparse matrix -- it wraps the
# matrix in a 0-d object array (see the output below). Use .toarray() to get
# a dense copy instead.
np.array(x_train_vec[:2])

array(<2x400 sparse matrix of type '<class 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>, dtype=object)

In [28]:
# Train for 10 epochs in mini-batches of 32. The sparse TF-IDF matrix is
# converted to a dense array before it is passed to fit().
model.fit(x_train_vec.toarray(), one_hot_labels, epochs=10, batch_size=32)

Epoch 1/10

Epoch 2/10

Epoch 3/10

Epoch 4/10

Epoch 5/10

Epoch 6/10

Epoch 7/10

Epoch 8/10

Epoch 9/10

Epoch 10/10



<tensorflow.python.keras._impl.keras.callbacks.History at 0x7f39bafce240>

# Test model

In [39]:
def test_it(i):
    print(x_test[i])
    print("{} ({})".format(labels[model.predict_classes(vectorizer.transform([x_test[i]]).toarray())[0]], labels[y_test[i]]))

In [40]:
# Spot-check the first 10 held-out examples, with a blank line between them.
for i in range(10):
    test_it(i)
    print()

@Allieandra wheeee! 
positive (positive)

@a02toyota Thank you for the FF! Good to meet ya 
positive (positive)

@ electricbath Eewwww. Gross! So sorry hayward hates you like that. 
negative (negative)

#followfriday - I'm a little late, but here's a special shoutout for @SomersetMarcy - my missus! 
positive (positive)

#icanhelp in shopping (deals), personal assistant, event planning!! I own GET IT TOGETHER, those are my services  jennifer.git@gmail.com
negative (positive)

 broken hearts will heal with time...
positive (negative)

..I've already listened to all the S4 commentary except the finale 
negative (negative)

&quot;Everybody make mistakes.&quot; I'm gonna go get some sleep because I have an other show tomorrow night and I want it to be peeeeerfect! 
positive (positive)

#I Believe...that if you smile at someone, friend or stranger, you will make TWO people feel good.  
positive (positive)

@andreacFOD I think I'm done at twitterland too. I will tweet David one last time tomo

In [43]:
# Ad-hoc check on a hand-written sentence. The class id is the argmax over
# the predicted class scores; Sequential.predict_classes is avoided because
# newer TensorFlow releases no longer provide it.
sample_vec = vectorizer.transform(["love ML talk Jihlava"]).toarray()
labels[int(np.argmax(model.predict(sample_vec, verbose=0), axis=-1)[0])]

'positive'

In [44]:
# Same sentence with "hate" in place of "love". The class id is the argmax
# over the predicted class scores; Sequential.predict_classes is avoided
# because newer TensorFlow releases no longer provide it.
sample_vec = vectorizer.transform(["hate ML talk Jihlava"]).toarray()
labels[int(np.argmax(model.predict(sample_vec, verbose=0), axis=-1)[0])]

'negative'

# Evaluate model

In [45]:
from sklearn.metrics import classification_report

In [47]:
# Predict a class id for every held-out text. argmax over the class scores is
# used instead of Sequential.predict_classes, which newer TensorFlow releases
# no longer provide.
x_test_vec = vectorizer.transform(x_test).toarray()
y_pred = np.argmax(model.predict(x_test_vec, verbose=0), axis=-1)
# Order the display names by class id explicitly instead of relying on the
# dict's iteration order lining up with the sorted labels in the report.
class_names = [labels[class_id] for class_id in sorted(labels)]
print(classification_report(y_test, y_pred, target_names=class_names))

             precision    recall  f1-score   support

   negative       0.67      0.62      0.65      8750
   positive       0.72      0.76      0.74     11248

avg / total       0.70      0.70      0.70     19998

