In [1]:
# importing required packages
import re
import csv
import numpy as np
import tensorflow as tf
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras import callbacks
from keras.preprocessing import sequence
from keras.models import load_model
from keras.utils.vis_utils import plot_model
from keras.utils import np_utils

# fix random seed for reproducibility
np.random.seed(7)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Loading Data

In [2]:
phrase = []
labels = []
test_phrase = []

# load training data
with open("train.tsv") as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    for row in rd:
        phrase.append(row[2])
        labels.append(row[3])

# load testing data
with open("test.tsv") as testing:
    test = csv.reader(testing, delimiter="\t", quotechar='"')
    for s in test:
        test_phrase.append(s[2])

## Data preprocessing

In [3]:
# function to preprocess the sentences of the reviews.
def clean_phrase(phrase):
    #Remove punctuation (with a regular expression) and convert to lower case
    words = (re.sub("[^a-zA-Z]", " ", phrase)).lower()
    return words

In [4]:
# remove the first row of the train dataset which is currently the header
del phrase[0]
# display first 10 rows
phrase[:10]

['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .',
 'A series of escapades demonstrating the adage that what is good for the goose',
 'A series',
 'A',
 'series',
 'of escapades demonstrating the adage that what is good for the goose',
 'of',
 'escapades demonstrating the adage that what is good for the goose',
 'escapades',
 'demonstrating the adage that what is good for the goose']

In [5]:
# remove the first row of the testing dataset which is currently the header
del test_phrase[0]
# display first 10 rows
test_phrase[:10]

['An intermittently pleasing but mostly routine effort .',
 'An intermittently pleasing but mostly routine effort',
 'An',
 'intermittently pleasing but mostly routine effort',
 'intermittently pleasing but mostly routine',
 'intermittently pleasing but',
 'intermittently pleasing',
 'intermittently',
 'pleasing',
 'but']

In [6]:
# run preprocessing function on train dataset
clean_phrases = []

for x in phrase:
    new = clean_phrase(x)
    clean_phrases.append(new)
    
# run preprocessing function  on test dataset
test_clean_phrases = []

for xw in test_phrase:
    new_test = clean_phrase(xw)
    test_clean_phrases.append(new_test)

In [7]:
# display first 10 rows
clean_phrases[:10]

['a series of escapades demonstrating the adage that what is good for the goose is also good for the gander   some of which occasionally amuses but none of which amounts to much of a story  ',
 'a series of escapades demonstrating the adage that what is good for the goose',
 'a series',
 'a',
 'series',
 'of escapades demonstrating the adage that what is good for the goose',
 'of',
 'escapades demonstrating the adage that what is good for the goose',
 'escapades',
 'demonstrating the adage that what is good for the goose']

In [8]:
# display first 10 rows
test_clean_phrases[:10]

['an intermittently pleasing but mostly routine effort  ',
 'an intermittently pleasing but mostly routine effort',
 'an',
 'intermittently pleasing but mostly routine effort',
 'intermittently pleasing but mostly routine',
 'intermittently pleasing but',
 'intermittently pleasing',
 'intermittently',
 'pleasing',
 'but']

In [9]:
# join the rows as a string with '/n' as delimiter
all_text=' /n '.join(clean_phrases)

test_all_text=' /n '.join(test_clean_phrases)

In [10]:
# display first 100 characters
all_text[:100]

'a series of escapades demonstrating the adage that what is good for the goose is also good for the g'

In [11]:
# display first 100 characters
test_all_text[:100]

'an intermittently pleasing but mostly routine effort   /n an intermittently pleasing but mostly rout'

In [12]:
# split each reviews of the training dataset and join them as a string
reviews = all_text.split(' /n ')
all_text = ' '.join(reviews)

# split each word of the training dataset in the string to a list
words = all_text.split()

In [13]:
# split each reviews of the training dataset and join them as a string
test_reviews = test_all_text.split(' /n ')
test_all_text = ' '.join(test_reviews)

# split each word of the training dataset in the string to a list
test_words = test_all_text.split()

In [14]:
words[:10]

['a',
 'series',
 'of',
 'escapades',
 'demonstrating',
 'the',
 'adage',
 'that',
 'what',
 'is']

In [15]:
test_words[:10]

['an',
 'intermittently',
 'pleasing',
 'but',
 'mostly',
 'routine',
 'effort',
 'an',
 'intermittently',
 'pleasing']

In [16]:
# print no of rows for train and test 
print("Train reviews: {}".format(len(reviews)))
print("Test reviews: {}".format(len(test_reviews)))

Train reviews: 156060
Test reviews: 66292


In [17]:
reviews[:10]

['a series of escapades demonstrating the adage that what is good for the goose is also good for the gander   some of which occasionally amuses but none of which amounts to much of a story  ',
 'a series of escapades demonstrating the adage that what is good for the goose',
 'a series',
 'a',
 'series',
 'of escapades demonstrating the adage that what is good for the goose',
 'of',
 'escapades demonstrating the adage that what is good for the goose',
 'escapades',
 'demonstrating the adage that what is good for the goose']

In [18]:
test_reviews[:10]

['an intermittently pleasing but mostly routine effort  ',
 'an intermittently pleasing but mostly routine effort',
 'an',
 'intermittently pleasing but mostly routine effort',
 'intermittently pleasing but mostly routine',
 'intermittently pleasing but',
 'intermittently pleasing',
 'intermittently',
 'pleasing',
 'but']

In [19]:
# remove the first row of the labels which is currently the header
del labels[0]

# display first 10 rows
labels[:10]

['1', '2', '2', '2', '2', '2', '2', '2', '2', '2']

In [20]:
# preprocessing on the label list
labels_cleaned = '\n'.join(labels)
labels_cleaned_last = labels_cleaned.split('\n')


len(labels_cleaned_last)

156060

In [21]:
# convert list to an array
labels_sentiment = [int(i) for i in labels_cleaned_last]
labels = np.array(labels_sentiment)
labels

array([1, 2, 2, ..., 3, 2, 2])

In [22]:
# check number of unique labels in the labels array
np.unique(labels)

array([0, 1, 2, 3, 4])

In [23]:
len(labels)

156060

In [24]:
len(words)

1072621

In [25]:
len(test_words)

423806

In [26]:
# combine the list that contains the individual words in the datasets
full_words = words + test_words

In [27]:
len(full_words)

1496427

In [28]:
#create dictionaries that map the words in the vocabulary to integers. 
#Then we can convert each of our reviews into integers so they can be passed into the network.

from collections import Counter
counts = Counter(full_words)
vocab = sorted(counts, key=counts.get, reverse=True)

#Build a dictionary that maps words to integers
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

In [29]:
#Encode the words with integers. 

reviews_ints = []
for each in reviews:
    reviews_ints.append([vocab_to_int[word] for word in each.split( )])
    
test_reviews_ints = []
for eachs in test_reviews:
    test_reviews_ints.append([vocab_to_int[word] for word in eachs.split( )])

In [30]:
# check no of unique words in the corpus
# this will be the features to be extracted
len(vocab_to_int)

17582

In [31]:
len(reviews_ints)

156060

In [32]:
len(test_reviews_ints)

66292

In [33]:
reviews_ints[:10]

[[2,
  315,
  3,
  16674,
  7795,
  1,
  8458,
  9,
  53,
  8,
  47,
  13,
  1,
  3992,
  8,
  186,
  47,
  13,
  1,
  13174,
  61,
  3,
  88,
  592,
  12055,
  19,
  617,
  3,
  88,
  2789,
  5,
  52,
  3,
  2,
  42],
 [2, 315, 3, 16674, 7795, 1, 8458, 9, 53, 8, 47, 13, 1, 3992],
 [2, 315],
 [2],
 [315],
 [3, 16674, 7795, 1, 8458, 9, 53, 8, 47, 13, 1, 3992],
 [3],
 [16674, 7795, 1, 8458, 9, 53, 8, 47, 13, 1, 3992],
 [16674],
 [7795, 1, 8458, 9, 53, 8, 47, 13, 1, 3992]]

In [34]:
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 159
Maximum review length: 48


In [35]:
# check total no of rows not having zero length reviews
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
len(non_zero_idx)

155901

In [36]:
# remove zero length reviews
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
labels = np.array([labels[ii] for ii in non_zero_idx])

In [37]:
#check again
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 0
Maximum review length: 48


In [38]:
len(reviews_ints)

155901

In [39]:
#As maximum review length too many steps for RNN. Let's truncate to 12 steps. 
#For reviews shorter than 12 steps, we'll pad with 0s. For reviews longer than 12 steps,
# we will truncate them to the first 12 characters.

max_review_length = 12
X_train = sequence.pad_sequences(reviews_ints, maxlen=max_review_length)
x_test = sequence.pad_sequences(test_reviews_ints, maxlen=max_review_length)

In [40]:
print(X_train.shape)

(155901, 12)


In [41]:
print(x_test.shape)

(66292, 12)


In [42]:
X_train[:5]

array([[  592, 12055,    19,   617,     3,    88,  2789,     5,    52,
            3,     2,    42],
       [    3, 16674,  7795,     1,  8458,     9,    53,     8,    47,
           13,     1,  3992],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     2,   315],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     2],
       [    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,   315]], dtype=int32)

In [43]:
x_test[:5]

array([[   0,    0,    0,    0,    0,   16, 2821, 1787,   19,  528, 1018,
         396],
       [   0,    0,    0,    0,    0,   16, 2821, 1787,   19,  528, 1018,
         396],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          16],
       [   0,    0,    0,    0,    0,    0, 2821, 1787,   19,  528, 1018,
         396],
       [   0,    0,    0,    0,    0,    0,    0, 2821, 1787,   19,  528,
        1018]], dtype=int32)

In [44]:
# check no of unique words in the corpus
# Adding 1 because we use 0's for padding, dictionary started at 1
# this value will be passed to the embedding layer
top_words = len(vocab_to_int) + 1
print(top_words)

17583


In [45]:
# One Hot Encoding the labels
y_train = np_utils.to_categorical(labels, 5)

In [46]:
y_train.shape

(155901, 5)

## Training

In [47]:
# Creating Callbacks
# ModelCheckpoints is used to save the model after every epoch
# EarlyStopping is used to stop training when the validation loss has not improved after 2 epochs
# Tensorboard is used tovisualize dynamic graphs of the training and test metrics
cbks = [callbacks.ModelCheckpoint(filepath='./checkpoint_model.h5', monitor='val_loss', save_best_only=True),
            callbacks.EarlyStopping(monitor='val_loss', patience=2),callbacks.TensorBoard(log_dir='./logs', histogram_freq=0, write_graph=True, write_images=False)]

In [48]:
# Final Model Architecture

# embedding layer size
embedding_vecor_length = 32

model = Sequential()
model.add(Embedding(19479, embedding_vecor_length, input_length=max_review_length, dropout=0.2))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
# 1 layer of 100 units in the hidden layers of the LSTM cells
model.add(LSTM(100))
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train,validation_split=0.20, epochs=5,verbose=1, batch_size=32,callbacks=cbks)

  import sys


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 12, 32)            623328    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 12, 32)            3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 6, 32)             0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 505       
Total params: 680,137
Trainable params: 680,137
Non-trainable params: 0
_________________________________________________________________
None
Train on 124720 samples, validate on 31181 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


<keras.callbacks.History at 0x7f9524c5d2b0>

In [None]:
#to visualize the training graphs
#run "tensorboard --logdir='./logs' "  from the command terminal

## Testing

In [2]:
# load the saved model
# returns a compiled model
model = load_model('checkpoint_model.h5')

In [3]:
# visualize model architecture
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
# run prediction
test_pred = model.predict_classes(x_test)

In [None]:
# edits the test file to input the prediction labels
test_df = pd.read_csv('test.tsv', sep='\t', header=0)

In [None]:
test_df['Sentiment'] = test_pred.reshape(-1,1) 
header = ['PhraseId', 'Sentiment']
test_df.to_csv('./final_predicted_model.csv', columns=header, index=False, header=True)