In [1]:
import pandas as pd
import spacy
import tensorflow as tf
from tensorflow import keras
import numpy as np


  from ._conv import register_converters as _register_converters


In [2]:
# Load spaCy's large English pipeline; later cells read `.vector` from the
# documents it produces and use those vectors as model input features.
nlp = spacy.load('en_core_web_lg')

In [46]:
# Read the pre-cleaned text dataset and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index -- consider `index_col=0` if it is not needed; confirm against
# whatever wrote the CSV.
clean_text_df = pd.read_csv('clean_text.csv')
clean_text_df.head()

Unnamed: 0,intensity,polarity,text
0,1.0,0.0,Don't buy: This company will set you back a bi...
1,0.0,0.5,"experimentation or other means), also called e..."
2,1.0,1.0,Beaugh: Excellent basic material. Well organiz...
3,0.0,0.5,"""An eye for an eye"", ""a tooth for a tooth"" or ..."
4,1.0,1.0,Can it get any better?: I have not read all th...


In [47]:
# Class balance of the polarity label, expressed as fractions of all rows.
clean_text_df['polarity'].value_counts(normalize=True)

0.5    0.5406
1.0    0.2297
0.0    0.2297
Name: polarity, dtype: float64

In [49]:
# Work on a 1% random subset of the data for the cells that follow.
# A fixed random_state makes the subset -- and every label, vector and model
# result derived from it -- reproducible across kernel restarts.
sample = clean_text_df.sample(frac=0.01, random_state=42)
print(len(sample))
# The subset's polarity balance, for comparison with the full dataset.
print(sample['polarity'].value_counts(normalize=True))
sample.head(3)

1000
0.5    0.516
0.0    0.248
1.0    0.236
Name: polarity, dtype: float64


Unnamed: 0,intensity,polarity,text
4531,1.0,0.0,I HATE THIS BOOK!: I tore this book to shreds ...
57908,0.0,0.5,mm pupil to about 3 arcminutes per line pair. ...
65921,1.0,1.0,Classic Ian Matthews: I only own the LP from w...


In [50]:
# One-hot encode intensity and polarity into three buckets each.
# The cut-offs are inclusive on their lower edge (>= 0.6 is high/positive,
# >= 0.4 is medium/neutral), so a score of exactly 0.4 or 0.6 lands in a
# bucket instead of producing an all-zero label row. Missing scores still
# match no bucket. Comparisons are vectorised rather than applied row by row.
for source_col, (high_col, mid_col, low_col) in (
    ('intensity', ('int_high', 'int_med', 'int_low')),
    ('polarity', ('pol_pos', 'pol_neu', 'pol_neg')),
):
    scores = sample[source_col]
    sample[high_col] = (scores >= 0.6).astype(int)
    sample[mid_col] = ((scores >= 0.4) & (scores < 0.6)).astype(int)
    sample[low_col] = (scores < 0.4).astype(int)

In [51]:
# Print the share of 0/1 values for every one-hot label column, in the same
# order the columns were created.
for label_col in ('int_high', 'int_med', 'int_low',
                  'pol_pos', 'pol_neu', 'pol_neg'):
    print(sample[label_col].value_counts(normalize=True))

0    0.544
1    0.456
Name: int_high, dtype: float64
0    0.972
1    0.028
Name: int_med, dtype: float64
1    0.516
0    0.484
Name: int_low, dtype: float64
0    0.764
1    0.236
Name: pol_pos, dtype: float64
1    0.516
0    0.484
Name: pol_neu, dtype: float64
0    0.752
1    0.248
Name: pol_neg, dtype: float64


In [52]:
# Embed every text as the vector of its spaCy document.
sample['vector'] = sample['text'].map(lambda text: nlp(text).vector)

In [53]:
# Shape of the first stored embedding.
sample['vector'].iloc[0].shape

(300,)

## Creating a basic model

In [54]:
# Two-layer feed-forward network: a 300-unit ReLU hidden layer followed by
# six sigmoid outputs, one per one-hot label column.
model = keras.Sequential([
    keras.layers.Dense(300, activation=tf.nn.relu),
    keras.layers.Dense(6, activation=tf.nn.sigmoid),
])

In [55]:
# Each of the six outputs is scored as an independent binary target.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [56]:
# Stack the per-row embedding vectors into one 2-D feature matrix
# (one row per sampled text).
input_data = np.stack(sample['vector'].tolist())
input_data.shape

(1000, 300)

In [57]:
# Label matrix: the three intensity columns followed by the three polarity
# columns, in the order readable_output() later slices them.
label_data = sample[[
    'int_high', 'int_med', 'int_low',
    'pol_pos', 'pol_neu', 'pol_neg',
]].values
label_data.shape

(1000, 6)

In [58]:
# Hold out the leading `split_frac` share of rows for validation and train on
# the remainder. Features and labels are cut at the same row index so they
# stay aligned.
split_frac = 0.2
cutoff = int(len(input_data) * split_frac)

x_val, x_train = np.split(input_data, [cutoff])
y_val, y_train = np.split(label_data, [cutoff])

In [59]:
# Train for 40 epochs, evaluating on the held-out rows after each epoch.
# The returned History object is kept for later inspection.
history = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    batch_size=512,
    epochs=40,
)

Train on 800 samples, validate on 200 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [66]:
def readable_output(output):
    """Turn one row of model scores into an "<intensity> <polarity>" label.

    Only ``output[0]`` is read. Its first three entries are treated as the
    high / med / low intensity scores and the remaining entries as the
    pos / neutral / neg polarity scores. Within each group the name of the
    largest score is chosen; on a tie the earlier name wins.

    Args:
        output: Indexable batch of score rows (e.g. the array returned by
            ``model.predict``); only the first row is used.

    Returns:
        A string such as ``"high neg"``.
    """
    def _winner(scores, names):
        # Check the first two slots in order and fall back to the last name,
        # mirroring an if / elif / else chain.
        top = max(scores)
        for position in (0, 1):
            if top == scores[position]:
                return names[position]
        return names[2]

    first_row = output[0]
    int_str = _winner(first_row[:3], ("high", "med", "low"))
    pol_str = _winner(first_row[3:], ("pos", "neutral", "neg"))
    return "{} {}".format(int_str, pol_str)

In [75]:
# Score a couple of ad-hoc texts with the trained model.
docs = [
    "I really hate this",
    "stuff"
]
for doc in docs:
    # predict() is given a batch of one: the flat document vector is reshaped
    # to a single row. A plain ndarray reshape replaces np.asmatrix because
    # NumPy no longer recommends the matrix class.
    features = nlp(doc).vector.reshape(1, -1)
    result = model.predict(features)
    print(result)
    print(doc, "\n", readable_output(result), "\n")

[[0.998516   0.00423395 0.0018444  0.38055375 0.00141962 0.95365673]]
I really hate this 
 high neg 

[[0.9985176  0.00747729 0.00159849 0.57308686 0.00138725 0.925542  ]]
stuff 
 high neg 



In [56]:
# Shape check: the raw document vector is 1-D, while np.asmatrix wraps it
# into a single-row 2-D matrix (see the two shapes printed below).
print(nlp("I really hate this").vector.shape)
np.asmatrix(nlp("I really hate this").vector).shape

(300,)


(1, 300)