In [8]:
import numpy as np
import pandas as pd
import spacy
import tensorflow as tf

from sklearn.model_selection import train_test_split
from tensorflow import keras

In [2]:
# Load the spaCy pipeline; its per-document `.vector` is what the models consume.
nlp = spacy.load(name='en_core_web_lg')

In [10]:
# Read the prepared training table stored under the 'id' key, then preview it.
training_df = pd.read_hdf('data/training_data.hdf', key='id')
training_df.head(5)

Unnamed: 0,text,polarity,intensity,vector
296909,She wears her school jacket open.,0.5,0.0,"[0.2938973, 0.02742757, -0.080566145, 0.054517..."
204220,"Highly recommended read.In a quite elegant, su...",1.0,1.0,"[-0.04066607, 0.19786465, -0.16860504, -0.0083..."
202973,The faculties of design are best suited to dra...,0.5,0.0,"[0.074057005, 0.081415884, -0.049602706, -0.12..."
56697,"But, unfortunately, I ordered this item and ne...",0.0,1.0,"[-0.004936785, 0.24104582, -0.12420635, -0.125..."
32884,Well thats it... Peace Out!,0.0,1.0,"[-0.02340915, 0.256462, -0.11517024, -0.132226..."


In [11]:
# Relative frequency of each distinct polarity value.
training_df.loc[:, 'polarity'].value_counts(normalize=True)

0.5    0.5006
0.0    0.2538
1.0    0.2456
Name: polarity, dtype: float64

In [12]:
# Derive 0/1 bucket columns from the two continuous scores.
#   > 0.6       -> high intensity / positive polarity
#   0.4 .. 0.6  -> medium intensity / neutral polarity (inclusive on both ends)
#   < 0.4       -> low intensity / negative polarity
# The middle bucket includes its end points so that a score of exactly 0.4 or
# 0.6 falls into exactly one bucket instead of none. The comparisons are
# vectorised over the whole column rather than applied row by row.
training_df['int_high'] = (training_df['intensity'] > 0.6).astype(int)
training_df['int_med'] = training_df['intensity'].between(0.4, 0.6).astype(int)
training_df['int_low'] = (training_df['intensity'] < 0.4).astype(int)
training_df['pol_pos'] = (training_df['polarity'] > 0.6).astype(int)
training_df['pol_neu'] = training_df['polarity'].between(0.4, 0.6).astype(int)
training_df['pol_neg'] = (training_df['polarity'] < 0.4).astype(int)

In [14]:
# Print the share of 0s and 1s for every bucket column, in creation order.
for bucket_col in ('int_high', 'int_med', 'int_low',
                   'pol_pos', 'pol_neu', 'pol_neg'):
    print(training_df[bucket_col].value_counts(normalize=True))

0    0.6664
1    0.3336
Name: int_high, dtype: float64
0    0.9382
1    0.0618
Name: int_med, dtype: float64
1    0.6046
0    0.3954
Name: int_low, dtype: float64
0    0.7544
1    0.2456
Name: pol_pos, dtype: float64
1    0.5006
0    0.4994
Name: pol_neu, dtype: float64
0    0.7462
1    0.2538
Name: pol_neg, dtype: float64


In [15]:
# Shape of the first row's embedding vector.
training_df['vector'].iloc[0].shape

(300,)

## Creating a basic model

In [37]:
# Feed-forward network: two ReLU hidden layers (300 and 32 units) followed by
# six sigmoid outputs.
model = keras.Sequential([
    keras.layers.Dense(300, activation=tf.nn.relu),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dense(6, activation=tf.nn.sigmoid),
])

In [38]:
# Binary cross-entropy over the sigmoid outputs; accuracy is reported per epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [39]:
# Stack the per-row vectors into one 2-D array with a row per sample.
input_data = np.stack(training_df['vector'].tolist(), axis=0)
input_data.shape

(10000, 300)

In [40]:
# Label matrix: the six bucket columns, in the order the model outputs are read.
label_data = training_df.loc[
    :, ['int_high', 'int_med', 'int_low', 'pol_pos', 'pol_neu', 'pol_neg']
].values
label_data.shape

(10000, 6)

In [41]:
# Hold out the leading `split_frac` share of rows for validation and train on
# the remainder; inputs and labels are cut at the same row index.
split_frac = 0.2
cutoff = int(split_frac * len(input_data))

x_val, x_train = input_data[:cutoff], input_data[cutoff:]
y_val, y_train = label_data[:cutoff], label_data[cutoff:]

In [42]:
# Train for 80 epochs in batches of 1024, evaluating on the held-out rows
# after every epoch.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=1024,
    epochs=80,
    validation_data=(x_val, y_val),
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80


Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [28]:
def readable_output(output):
    """Turn one row of model scores into a label such as "high neg".

    Only ``output[0]`` is read. Its first three entries are treated as the
    high / med / low intensity scores and the remaining entries as the
    pos / neutral / neg polarity scores. Within each group the label of the
    largest score is chosen; on a tie the earlier label wins.

    Returns:
        str: the intensity label and the polarity label separated by a space.
    """
    def _strongest(scores, leading_labels, last_label):
        # Try the leading labels in order; anything else maps to the last one.
        peak = max(scores)
        for label, score in zip(leading_labels, scores):
            if score == peak:
                return label
        return last_label

    row = output[0]
    int_str = _strongest(row[:3], ("high", "med"), "low")
    pol_str = _strongest(row[3:], ("pos", "neutral"), "neg")
    return "{} {}".format(int_str, pol_str)

In [33]:
# Spot-check the trained model on a few hand-written sentences.
docs = [
    "I really hate this",
    "I don't like this",
    "I think this is nice",
    "I love this thing so much",
    "I feel relatively indifferent",
]
for doc in docs:
    # The document vector is 1-D; reshape it into a one-row 2-D ndarray for
    # predict(). A plain ndarray is used instead of np.asmatrix because the
    # np.matrix class is no longer recommended by NumPy.
    doc_vector = np.asarray(nlp(doc).vector).reshape(1, -1)
    result = model.predict(doc_vector)
    print(result)
    print(doc, "\n", readable_output(result), "\n")

[[0.9006553  0.01879829 0.02905652 0.32951632 0.00907019 0.57859606]]
I really hate this 
 high neg 

[[0.9238026  0.05397011 0.02437352 0.41188955 0.00351907 0.7160095 ]]
I don't like this 
 high neg 

[[0.89268696 0.01106873 0.04471983 0.81357646 0.00932928 0.1418184 ]]
I think this is nice 
 high pos 

[[0.5061053  0.05087366 0.23811758 0.8225473  0.02424105 0.0759644 ]]
I love this thing so much 
 high pos 

[[0.0276257  0.00144556 0.97583145 0.00902619 0.79991555 0.1624663 ]]
I feel relatively indifferent 
 low neutral 



## Simpler Labels

In [49]:
# Same hidden layers as before (300 and 32 ReLU units), but only two sigmoid
# outputs.
model = keras.Sequential([
    keras.layers.Dense(300, activation=tf.nn.relu),
    keras.layers.Dense(32, activation=tf.nn.relu),
    keras.layers.Dense(2, activation=tf.nn.sigmoid),
])

In [53]:
# Binary cross-entropy loss; mean absolute error is reported per epoch.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['mae'],
)

In [54]:
# Stack the per-row vectors into one 2-D array with a row per sample.
input_data = np.stack(training_df['vector'].tolist(), axis=0)
input_data.shape

(10000, 300)

In [55]:
# Label matrix: the raw intensity and polarity scores, in that column order.
label_data = training_df.loc[:, ['intensity', 'polarity']].values
label_data.shape

(10000, 2)

In [56]:
# Hold out the leading `split_frac` share of rows for validation and train on
# the remainder; inputs and labels are cut at the same row index.
split_frac = 0.2
cutoff = int(split_frac * len(input_data))

x_val, x_train = input_data[:cutoff], input_data[cutoff:]
y_val, y_train = label_data[:cutoff], label_data[cutoff:]

In [57]:
# Train for 80 epochs in batches of 1024, evaluating on the held-out rows
# after every epoch.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=1024,
    epochs=80,
    validation_data=(x_val, y_val),
)

Train on 8000 samples, validate on 2000 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80


Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [58]:
# Spot-check the two-output model on the same hand-written sentences.
docs = [
    "I really hate this",
    "I don't like this",
    "I think this is nice",
    "I love this thing so much",
    "I feel relatively indifferent",
]
for doc in docs:
    # The document vector is 1-D; reshape it into a one-row 2-D ndarray for
    # predict(). A plain ndarray is used instead of np.asmatrix because the
    # np.matrix class is no longer recommended by NumPy.
    doc_vector = np.asarray(nlp(doc).vector).reshape(1, -1)
    result = model.predict(doc_vector)
    print(result)

[[0.15353452 0.02435592]]
[[0.07771567 0.00667944]]
[[0.5506569 0.7326522]]
[[0.00711581 0.6554239 ]]
[[0.00308359 0.8789249 ]]
