In [0]:
import tensorflow as tf
from tensorflow.contrib import rnn
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
from string import punctuation as punc
from collections import Counter

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
data=pd.read_csv('drive/My Drive/A1-Q3_Dataset/mrdata.tsv',delimiter='\t',encoding='utf-8')
data.head()
print(data.shape)

(156060, 4)


In [0]:
val = [True] * len(data)
Modified_data = data[val]
Modified_data = Modified_data.drop(columns="SentenceId")
Modified_data = Modified_data.drop(columns="PhraseId")
Modified_data = shuffle(Modified_data)
print(Modified_data.head())

                                                   Phrase  Sentiment
93831                   with an important message to tell          3
107302  by taking your expectations and twisting them ...          3
89266                                          the places          2
66099                                        Elvis person          2
48096                              your own precious life          3


In [0]:
class_labels = Modified_data.iloc[:, 1].values
content = Modified_data.iloc[:, 0].values
print(class_labels)
print(content)

[3 3 2 ... 0 2 3]
['with an important message to tell'
 'by taking your expectations and twisting them just a bit' 'the places'
 ...
 'until its absurd , contrived , overblown , and entirely implausible finale'
 'few decades' 'draws us in long']


In [0]:
data_changed = []

for i in content:
    i_cool_one = ''.join([i for i in i if i not in punc])
    data_changed.append(i_cool_one)
    
words = []
word_set = []
for i in data_changed:
    words.append(i.lower().split())
    for word in i.split():
        word_set.append(word.lower())

In [0]:
counter = Counter(word_set)
vocab = sorted(counter, key=counter.get, reverse=True)

dic_index = {word: i for i, word in enumerate(vocab, 1)}

cont_index = []
for i in words:
    cont_index.append([dic_index[word] for word in i])
    
print(i)

['draws', 'us', 'in', 'long']


In [0]:
length_consider = 17

content_mod = np.zeros((len(cont_index), length_consider), dtype=int)
for i, x in enumerate(cont_index):
    if(len(x)!=0):
        content_mod[i, -len(x):] = np.array(x)[:length_consider]
        
print(content_mod)

[[   0    0    0 ...  443    5  540]
 [   0    0    0 ...   61    2  248]
 [   0    0    0 ...    0    1  762]
 ...
 [   0    0    0 ...  563 2136 1578]
 [   0    0    0 ...    0  128 1092]
 [   0    0    0 ...   86    7  129]]


In [0]:
i,j = content_mod.shape
print(i,j)

156060 17


In [0]:
split = i * 0.8
split = int(split)

features_train = content_mod[:split]
train_labels = class_labels[:split]

features_test = content_mod[split:]
test_labels = class_labels[split:]

print(features_train)
print(features_test)

[[    0     0     0 ...   443     5   540]
 [    0     0     0 ...    61     2   248]
 [    0     0     0 ...     0     1   762]
 ...
 [    0     0     0 ...     0     0 10901]
 [    0     0     0 ...     0     3  5531]
 [    0     0     0 ...  1366    64  5690]]
[[   0    0    0 ...    0 1309 2073]
 [   0    0    0 ...    1   17  168]
 [   0    0    0 ...    0 9988    6]
 ...
 [   0    0    0 ...  563 2136 1578]
 [   0    0    0 ...    0  128 1092]
 [   0    0    0 ...   86    7  129]]


In [0]:
epochs = 10
len_batch = 1000 
word_count = len(dic_index) + 1 
eeta = 0.001 
number_of_layers = 3 
rate_of_drop = 0.8 
embed_size = 300 
number_of_hidLayer = 512

tf.reset_default_graph()

inp = tf.placeholder(tf.int32, [None, None], name='inp')
label = tf.placeholder(tf.int32, [None, None], name='label')
embeding = tf.Variable(tf.random_uniform((word_count, embed_size), -1, 1))
embed = tf.nn.embedding_lookup(embeding, inp)

print(embeding.shape)

(16404, 300)


In [0]:
hidLayer = tf.contrib.rnn.BasicLSTMCell(number_of_hidLayer)
hidLayer = tf.contrib.rnn.DropoutWrapper(hidLayer, rate_of_drop)

cell = tf.contrib.rnn.MultiRNNCell([hidLayer] * number_of_layers)
state1 = cell.zero_state(len_batch, tf.float32)
rnn_output, rnn_state = tf.nn.dynamic_rnn(cell, embed, initial_state=state1)
print(hidLayer)

<tensorflow.python.ops.rnn_cell_impl.DropoutWrapper object at 0x7f6698eada20>


In [0]:
use_to_predict = tf.layers.dense(rnn_output[:, -1], 1, activation=tf.sigmoid)
loss_fun = tf.losses.mean_squared_error(label, use_to_predict)
optimizer = tf.train.AdamOptimizer(eeta).minimize(loss_fun)
correct_classified = tf.equal(tf.cast(tf.round(use_to_predict), tf.int32), label)
accuracy = tf.reduce_mean(tf.cast(correct_classified, tf.float32))

In [0]:
session = tf.Session()
session.run(tf.global_variables_initializer())

In [0]:
for i in range(epochs):
    loss_for_epoch = []
    trainAccuracy = []
    itera = 0
    print(len(features_train))
    while itera + len_batch <= len(features_train):
        y_batch = train_labels[itera:itera+len_batch].reshape(-1, 1)
        X_batch = features_train[itera:itera+len_batch]
        a, o, _ = session.run([accuracy, loss_fun, optimizer], feed_dict={inp:X_batch, label:y_batch})
        trainAccuracy.append(a)
        loss_for_epoch.append(o)
        itera += len_batch
    print('Epoch number: {}/{}'.format(i, epochs))
    print('Loss in this epoch: {}'.format(np.mean(loss_for_epoch)))
    print('Training accuracy in this epoch: {:.5}'.format(np.mean(trainAccuracy)*100))

124848
Epoch number: 0/10
Loss in this epoch: 1.9477348327636719
Training accuracy in this epoch: 17.369
124848
Epoch number: 1/10
Loss in this epoch: 1.9322104454040527
Training accuracy in this epoch: 17.41
124848
Epoch number: 2/10
Loss in this epoch: 1.9321941137313843
Training accuracy in this epoch: 17.411
124848
Epoch number: 3/10
Loss in this epoch: 1.9321942329406738
Training accuracy in this epoch: 17.411
124848
Epoch number: 4/10
Loss in this epoch: 1.9321939945220947
Training accuracy in this epoch: 17.411
124848
Epoch number: 5/10
Loss in this epoch: 1.9321939945220947
Training accuracy in this epoch: 17.411
124848
Epoch number: 6/10
Loss in this epoch: 1.932193636894226
Training accuracy in this epoch: 17.411
124848
Epoch number: 7/10
Loss in this epoch: 1.9322062730789185
Training accuracy in this epoch: 17.41
124848
Epoch number: 8/10
Loss in this epoch: 1.932201623916626
Training accuracy in this epoch: 17.41
124848
Epoch number: 9/10
Loss in this epoch: 1.932193756103

In [0]:
test_accuracy = []
itera = 0
while itera + len_batch <= len(features_test):
    X_batch = features_test[itera:itera+len_batch]
    y_batch = test_labels[itera:itera+len_batch].reshape(-1, 1)

    a = session.run([accuracy], feed_dict={inp:X_batch, label:y_batch})
    
    test_accuracy.append(a)
    itera += len_batch
print("Test accuracy is {:.5f} %".format(np.mean(test_accuracy) * 100))

Test accuracy is 17.78387 %
