In [0]:
%matplotlib inline
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
import nltk
import re
from scipy.spatial.distance import cdist
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, GRU, Embedding
from tensorflow.python.keras.optimizers import Adam
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

In [0]:
os.getcwd()

'/workspace/tensorflow'

In [0]:
df = pd.read_csv("./wilson_sentiments.csv", delimiter=",", header=0)

In [0]:
df = df.head(1061)

In [0]:
df = df[["id", "sentiments", "quote"]]

In [0]:
df

Unnamed: 0,id,sentiments,quote
0,0,0.0,"I'm selfish, impatient and a little insecure. ..."
1,1,1.0,Be yourself; everyone else is already taken.
2,2,0.0,Two things are infinite: the universe and huma...
3,3,0.0,"So many books, so little time."
4,4,0.0,"Be who you are and say what you feel, because ..."
...,...,...,...
1056,1056,1.0,"Remember that wherever your heart is, there yo..."
1057,1057,1.0,History will be kind to me for I intend to wri...
1058,1058,1.0,Respect other people's feelings. It might mean...
1059,1059,1.0,"Take responsibility of your own happiness, nev..."


In [0]:
from nltk.stem import WordNetLemmatizer
stemmer = WordNetLemmatizer()
# nltk.download('wordnet')

processed_quotes = []

for sen in df["quote"]:
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(sen))

    # remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    
    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document) 
    
    # Substituting multiple spaces with single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    
    # Removing prefixed 'b'
    document = re.sub(r'^b\s+', '', document)
    
    # Converting to Lowercase
    document = document.lower()
    
    # Lemmatization
    document = document.split()

    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    
    processed_quotes.append(document)
    
df["processed_quotes"] = processed_quotes
df

Unnamed: 0,id,sentiments,quote,processed_quotes
0,0,0.0,"I'm selfish, impatient and a little insecure. ...",i selfish impatient and little insecure make m...
1,1,1.0,Be yourself; everyone else is already taken.,be yourself everyone else is already taken
2,2,0.0,Two things are infinite: the universe and huma...,two thing are infinite the universe and human ...
3,3,0.0,"So many books, so little time.",so many book so little time
4,4,0.0,"Be who you are and say what you feel, because ...",be who you are and say what you feel because t...
...,...,...,...,...
1056,1056,1.0,"Remember that wherever your heart is, there yo...",remember that wherever your heart is there you...
1057,1057,1.0,History will be kind to me for I intend to wri...,history will be kind to me for intend to write it
1058,1058,1.0,Respect other people's feelings. It might mean...,respect other people feeling it might mean not...
1059,1059,1.0,"Take responsibility of your own happiness, nev...",take responsibility of your own happiness neve...


In [0]:
# df.iloc[:,3]  # take only the quotes
# df.iloc[:,1]  # take only the sentiments

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.iloc[:,3], df.iloc[:,1], test_size=0.20, random_state=42)
text = pd.concat([x_train,x_test])

In [0]:
text

72     if you can explain it to six year old you don ...
898    true love is rare and it the only thing that g...
44     yesterday is history tomorrow is mystery today...
309             i would die for you but won live for you
784    it strange because sometimes read book and thi...
                             ...                        
581         think before you speak read before you think
55     i am enough of an artist to draw freely upon m...
120    it matter not what someone is born but what th...
388    nobody ha ever measured not even poet how much...
25     insanity is doing the same thing over and over...
Name: processed_quotes, Length: 1061, dtype: object

In [0]:
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
# print(tokenizer)
tokenizer.fit_on_texts(text)
x_train_tokens = tokenizer.texts_to_sequences(x_train)
x_test_tokens = tokenizer.texts_to_sequences(x_test)

In [0]:
num_tokens = [len(tokens) for tokens in x_train_tokens + x_test_tokens]
num_tokens = np.array(num_tokens)
print("Mean length is " + str(np.mean(num_tokens)))

Mean length is 23.158341187558907


In [0]:
#Chose to 2 S.D to cover ~95% of data
max_tokens = np.mean(num_tokens) + 2*np.std(num_tokens)
max_tokens = int(max_tokens)
print(max_tokens)

77


In [0]:
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens,
                            padding='pre', truncating='pre')
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens,
                           padding='pre', truncating='pre')

In [0]:
idx = tokenizer.word_index
inverse_map = dict(zip(idx.values(), idx.keys()))

In [0]:
def tokens_to_string(tokens):
    words = [inverse_map[token] for token in tokens if token != 0]
    text = " ".join(words)
    return text

In [0]:
model = Sequential()
embedding_size = 100

model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    input_length=max_tokens,
                    name='layer_embedding'))
model.add(GRU(units=16, return_sequences=True))
model.add(GRU(units=8, return_sequences=True))
model.add(GRU(units=4))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=1e-3),
              metrics=['accuracy'])

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
model.fit(x_train_pad, y_train,
          validation_split=0.05, epochs=50, batch_size=32)

Train on 805 samples, validate on 43 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7ff287b9d668>

In [0]:
results_gru = model.evaluate(x_test_pad, y_test)
print("Accuracy: "+ str(results_gru[1]))

Accuracy: 0.6150235


In [0]:
predictions_gru = model.predict_classes(x_test_pad)

# Symbolic AI

In [0]:
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score
 
 
vader = SentimentIntensityAnalyzer()
def vader_polarity(text):
    """ Transform the output to a binary 0/1 result """
    score = vader.polarity_scores(text)
    return 1 if score['pos'] > score['neg'] else 0

predictions_vader = [vader_polarity(text) for text in x_test['processed_quotes']]

# Bert

In [0]:
!pip3 install ktrain

import ktrain
from ktrain import text

In [0]:
train_bert = pd.DataFrame(x_train, y_train, columns=['preprocessed_quotes', 'sentiments']) 

In [0]:
(x_train, y_train), preproc = text.texts_from_df(train_bert, 
                   text_column='preprocessed_quotes', \
                   label_columns = ['sentiments'], \
                   random_state=0, \
                    maxlen=500, \
                    preprocess_mode='bert',
                    classes=['0', '1']) \

In [0]:
test_bert = pd.DataFrame(x_test, y_test, columns=['preprocessed_quotes', 'sentiments'])

In [0]:
(x_test, y_test), preproc = text.texts_from_df(test_bert, 
                   text_column='preprocessed_quotes', \
                   label_columns = ['sentiments'], \
                   random_state=0, \
                    maxlen=500, \
                    preprocess_mode='bert', \
                    classes=['0', '1']) \

In [0]:
model_bert = text.text_classifier('bert', (x_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model_bert,train_data=(x_train, y_train), val_data=(x_test, y_test), batch_size=6)

In [0]:
learner.fit_onecycle(2e-5, 1)

In [0]:
predictions_bert = model_bert.predict_classes(x_test)

# Ensemble RFC

In [0]:
df_ensemble = pd.DataFrame([predictions_gru, predictions_vader], columns=['predictions_gru','predictions_vader'])

In [0]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10, random_state=0)
clf = clf.fit(df_ensemble, y_test)