# Sentiment Analysis using GRU

# Introduction

# 1) Method

## 1.1) Import and load the datasets (train+test)

In [8]:
import pandas as pd
import talos as ta
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
#from keras.layers.normalization import BatchNormalization
from tensorflow.keras.layers import BatchNormalization

from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
import nltk
# 'punkt' backs word_tokenize and 'stopwords' backs stopwords.words() below.
# Both corpora are fetched so this cell also runs on a machine with no cached NLTK data.
nltk.download('punkt')
nltk.download('stopwords')
import talos as ta
from talos.model.early_stopper import early_stopper
from talos.model.normalizers import lr_normalizer
from keras.wrappers.scikit_learn import KerasClassifier
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop words: once as a list, once as a set for fast membership tests.
stop_words = stopwords.words('english')
#%matplotlib inline
eng_stopwords = set(stop_words)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Valentin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## 1.2) Preprocessing


Read in the data and create a pandas DataFrame from it.

In [9]:
#Cov = pd.read_csv("path/to/file.txt", sep='\t', 
#                  names = ["Text", "Label"])
#Frame=pd.DataFrame([Cov], columns = ["Text", "Label"])

In [10]:
# Read the three labelled-sentence files into one list of raw lines.
# The encoding is given explicitly so decoding does not depend on the platform default.
data_paths = [
    "../input/sentimentanalysis/imdb_labelled.txt",
    "../input/sentimentanalysis/yelp_labelled.txt",
    "../input/sentimentanalysis/amazon_cells_labelled.txt",
]
lines = []
for data_path in data_paths:
    with open(data_path, "r", encoding="utf8") as text_file:
        lines += text_file.read().split('\n')

# Split every line on the tab once and keep only well-formed rows:
# exactly two fields ("sentence<TAB>label") with a non-empty label.
lines = [fields for fields in (line.split("\t") for line in lines)
         if len(fields) == 2 and fields[1] != '']
train_documents = [fields[0] for fields in lines]
train_labels = [int(fields[1]) for fields in lines]

# Sentences and labels are two parallel lists: index i in both refers to the same row.
data_full = [[document, label] for document, label in zip(train_documents, train_labels)]
from numpy import array
train_text = array(train_documents)
train_label = array(train_labels)

df = pd.DataFrame(data_full)
df_text = pd.DataFrame(train_text)
df_label = pd.DataFrame(train_label)
train_documents = array(train_documents)

FileNotFoundError: [Errno 2] No such file or directory: '../input/sentimentanalysis/imdb_labelled.txt'

In [None]:
# Encode the labels as integer class ids.
# LabelEncoder expects a 1-D array; df_label is a single-column DataFrame, so it is
# flattened first instead of relying on scikit-learn's column-vector conversion
# (which emits a DataConversionWarning).
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(df_label.values.ravel())
type(y)

In [None]:
# Hold out 20% of the documents as the test set; stratifying on y keeps the
# label proportions the same in both parts.
xtrain, xtest, ytrain, ytest = train_test_split(
    train_documents,
    y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,  # fixed seed so the split is reproducible
)


In [None]:
# Inspect the training documents after the train/test split:
# contents, container type, shape and the type of a single element.
print(
    xtrain,
    type(xtrain),
    xtrain.shape,
    type(xtrain[0]),
)

In [None]:
# Split the remaining training data again: 20% of it becomes the validation set,
# stratified on the training labels.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    xtrain,
    ytrain,
    test_size=0.2,
    shuffle=True,
    stratify=ytrain,
    random_state=42,  # fixed seed so the split is reproducible
)

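Word embeddings such as word2vec produce one dense vector per word, whereas TF-IDF produces a single score per term. Embeddings are useful for going deeper into the documents we have and help in identifying content and subsets of content, because each vector represents the contexts in which the word appears. This notebook uses pre-trained GloVe vectors (loaded below) for the same purpose.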

In [None]:
# Inspect the training documents again, now that the validation part has been split off:
# contents, container type, shape and the type of a single element.
print(
    xtrain,
    type(xtrain),
    xtrain.shape,
    type(xtrain[0]),
)

In [None]:
# we need to binarize (one-hot encode) the labels for the neural net.
# The number of classes is taken from the fitted label encoder rather than inferred
# separately per call, so the three matrices always have the same number of columns
# even if one split happened not to contain the highest class id.
n_classes = len(lbl_enc.classes_)
ytrain_enc = np_utils.to_categorical(ytrain, num_classes=n_classes)
yvalid_enc = np_utils.to_categorical(yvalid, num_classes=n_classes)
ytest_enc = np_utils.to_categorical(ytest, num_classes=n_classes)


In [None]:
# Sanity check of the encoded training labels: shape first, then the matrix itself.
print(ytrain_enc.shape, ytrain_enc)

In [None]:
# load the GloVe vectors in a dictionary: word -> float32 vector
EMBEDDING_DIM = 300  # vector width of glove.42B.300d

embeddings_index = {}
# `with` guarantees the file is closed even if a malformed line raises part-way through.
with open('../input/glove42/glove.42B.300d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if len(values) <= EMBEDDING_DIM:
            # blank or truncated line: there is no complete vector to read
            continue
        # Read the vector from the END of the line, so a token that itself contains
        # whitespace is not mistaken for the first coefficient.
        word = ' '.join(values[:-EMBEDDING_DIM])
        coefs = np.asarray(values[-EMBEDDING_DIM:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

This part prepares the input for the GRU model: the texts are tokenized into integer sequences and padded to a fixed length.

In [None]:
# Keras tokenizer with an unrestricted vocabulary; every sequence is padded/cut to max_len.
max_len = 70
token = text.Tokenizer(num_words=None)

# The vocabulary is built from all three splits together.
token.fit_on_texts([*xtrain, *xvalid, *xtest])

# Texts -> lists of integer word ids.
xtrain_seq, xvalid_seq, xtest_seq = (
    token.texts_to_sequences(documents) for documents in (xtrain, xvalid, xtest)
)

# Zero-pad the sequences to a common length.
xtrain_pad, xvalid_pad, xtest_pad = (
    sequence.pad_sequences(id_sequences, maxlen=max_len)
    for id_sequences in (xtrain_seq, xvalid_seq, xtest_seq)
)

# word -> integer id mapping learned by the tokenizer
word_index = token.word_index

In [None]:
# Shape of the padded training matrix: (number of training sequences, max_len).
print(xtrain_pad.shape)

In [None]:
# Build the embedding matrix for the words in our dataset: row i receives the GloVe
# vector of the word whose tokenizer id is i. The matrix has len(word_index) + 1 rows
# and starts out as all zeros.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # no pre-trained vector for this word: its row stays zero
        continue
    embedding_matrix[i] = embedding_vector

## 1.3) Build the model, compile and set the parameters for grid search

For this network (and GRU as well), I used Keras because I had never used it before and wanted to try it. I also chose to use Talos, a third-party package for hyperparameter grid search, to find the best parameters.

In [None]:
import talos as ta
from talos.model.early_stopper import early_stopper
from talos.model.normalizers import lr_normalizer
from keras.wrappers.scikit_learn import KerasClassifier

def create_model(xtrain_pad, ytrain_enc, xvalid_pad, yvalid_enc, params):
    """Build, compile and fit one GRU classifier for a single hyperparameter combination.

    This is the model function handed to the Talos scan. The embedding layer reads the
    notebook-level `word_index`, `embedding_matrix` and `max_len`.

    Parameters
    ----------
    xtrain_pad, ytrain_enc
        Training inputs and the 2-D training label matrix; the label matrix's second
        dimension sets the number of output units.
    xvalid_pad, yvalid_enc
        Validation inputs and labels, passed to `model.fit` as `validation_data`.
    params : dict
        One combination of hyperparameters. Keys read here: 'second_neuron',
        'third_neuron', 'dropout', 'last_activation', 'optimizer', 'lr', 'loss',
        'batch_size' and 'epochs'.

    Returns
    -------
    tuple
        `(out, model)`: the value returned by `model.fit` and the fitted model.
    """
    model = Sequential()

    # Embedding layer initialised from the pre-trained matrix and kept frozen.
    model.add(Embedding(len(word_index) + 1,
                        300,
                        weights=[embedding_matrix],
                        input_length=max_len,
                        trainable=False))
    model.add(SpatialDropout1D(0.3))

    # Two stacked GRU layers; the first returns the whole sequence so the second
    # has a sequence to consume.
    model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
    model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))

    # First tunable dense block.
    model.add(Dense(params['second_neuron'], activation='relu'))
    model.add(Dropout(params['dropout']))

    # Second tunable dense block (same dropout rate as the first).
    model.add(Dense(params['third_neuron'], activation='relu'))
    model.add(Dropout(params['dropout']))

    # Output layer: one unit per label column.
    model.add(Dense(ytrain_enc.shape[1],
                    activation=params['last_activation']))

    # params['optimizer'] is an optimizer class; lr_normalizer presumably rescales
    # params['lr'] to a range suitable for that optimizer - confirm in the Talos docs.
    model.compile(optimizer=params['optimizer'](lr=lr_normalizer(params['lr'], params['optimizer'])),
                  loss=params['loss'],
                  metrics=['acc'])

    # validation_data is passed as a tuple, the form documented by Keras (a list is not
    # accepted by every Keras version).
    # NOTE(review): with mode='moderate', Talos' early_stopper may derive the patience
    # from `epochs` and ignore `patience=3` - confirm for the installed version.
    out = model.fit(xtrain_pad, ytrain_enc,
                    batch_size=params['batch_size'],
                    epochs=params['epochs'],
                    verbose=1,
                    validation_data=(xvalid_pad, yvalid_enc),
                    callbacks=early_stopper(params['epochs'], patience=3, mode='moderate', monitor='val_loss'))

    return out, model




In [None]:
from keras.optimizers import Adam, Nadam
from keras.activations import softmax
from keras.losses import categorical_crossentropy, logcosh
# Hyperparameter search space for the Talos scan.
# Each list holds the candidate values for one hyperparameter.
# 'lr' is a tuple rather than a list - presumably a (min, max, steps) range that Talos
# expands; verify against the Talos documentation.
# 'weight_regulizer' is part of the grid but is not read by create_model.
p = dict(
    lr=(0.1, 10, 10),
    second_neuron=[800, 900, 1000, 1100, 1200, 1500],
    third_neuron=[800, 900, 1000, 1100, 1200],
    batch_size=[2000],
    epochs=[100],
    dropout=[0.3, 0.4, 0.5, 0.6, 0.7, 0.8],
    optimizer=[Adam],
    loss=['categorical_crossentropy'],
    last_activation=['softmax'],
    weight_regulizer=[None],
)

In [None]:
# Run the Talos hyperparameter scan over the grid `p`.
# The validation split prepared earlier is passed explicitly: without x_val / y_val
# Talos carves its own validation set out of the training data, and xvalid_pad /
# yvalid_enc would never be used.
# grid_downsample=0.01 restricts the scan to a fraction of the full grid.
h = ta.Scan(xtrain_pad, ytrain_enc,
            x_val=xvalid_pad, y_val=yvalid_enc,
            params=p, model=create_model, grid_downsample=0.01)

# 2) Result

In [None]:
# Results of the scan - presumably one row per evaluated parameter combination; confirm in the Talos docs.
h.data

In [None]:
# NOTE(review): going by the attribute name, a DataFrame of the peak epochs per scan round - confirm in the Talos docs.
h.peak_epochs_df

In [None]:
# access the summary details of the scan stored on the Scan object
h.details

In [None]:
# Evaluate the scan result against the held-out test data (xtest_pad / ytest_enc).
# NOTE(review): `folds=2` and `average='macro'` are forwarded to Talos' evaluate();
# presumably a 2-fold, macro-averaged score - confirm for the installed Talos version.
e = ta.Evaluate(h)
e.evaluate(xtest_pad, ytest_enc, folds=2, average='macro')

In [None]:
# use Scan object as input; the reporting helper `r` is queried in the cells below
r = ta.Reporting(h)

In [None]:
# get the highest validation accuracy ('val_acc') across the scan rounds
r.high('val_acc')

In [None]:
# get the highest training accuracy ('acc') across the scan rounds
r.high('acc')

In [None]:
# get the round with the best result (called without arguments, so Talos' default metric applies)
r.rounds2high()

In [None]:
# get the best parameters, ranked by training accuracy ('acc')
r.best_params('acc')

In [None]:
# get the best parameters, ranked by validation accuracy ('val_acc')
r.best_params('val_acc')

In [None]:
# GRU with glove embeddings and two dense layers
#model = Sequential()
#model.add(Embedding(len(word_index) + 1,
#                     300,
#                     weights=[embedding_matrix],
#                     input_length=max_len,
#                     trainable=False))
#model.add(SpatialDropout1D(0.3))
#model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3, return_sequences=True))
#model.add(GRU(300, dropout=0.3, recurrent_dropout=0.3))
#
#model.add(Dense(1024, activation='relu'))
#model.add(Dropout(0.8))

#model.add(Dense(1024, activation='relu'))
#model.add(Dropout(0.8))

#model.add(Dense(2))
#model.add(Activation('sigmoid'))

#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy'])

# Fit the model with early stopping callback
#earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='auto')
#model.fit(xtrain_pad, y=ytrain_enc, batch_size=1000, epochs=100, verbose=1, validation_data=(xvalid_pad, yvalid_enc), callbacks=[earlystop])

   

In [None]:
#e = model.evaluate(x=xtest_pad, y=ytest_enc, batch_size=1000, verbose=1, sample_weight=None, steps=None)
#print(e)