In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [0]:
# Load the tab-separated train and test splits from the mounted Drive folder.
train_data = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/sentiment-analysis-on-movie-reviews/train.tsv',
    sep='\t',
    encoding='utf-8',
)
test_data = pd.read_csv(
    '/content/drive/My Drive/Colab Notebooks/sentiment-analysis-on-movie-reviews/test.tsv',
    sep='\t',
    encoding='utf-8',
)

In [0]:
# Preview the first five training rows (PhraseId, SentenceId, Phrase, Sentiment).
train_data.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [0]:
# Preview the first five test rows (PhraseId, SentenceId, Phrase; no Sentiment column).
test_data.head(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [0]:
# Drop the id columns so each array row is [Phrase, Sentiment] for train
# and [Phrase] for test.
train = np.array(train_data.drop(columns=["PhraseId", "SentenceId"]))
test = np.array(test_data.drop(columns=["PhraseId", "SentenceId"]))

# Show both arrays, one after the other.
print(train, test, sep='\n')

[['A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .'
  1]
 ['A series of escapades demonstrating the adage that what is good for the goose'
  2]
 ['A series' 2]
 ...
 ['avuncular chortles' 3]
 ['avuncular' 2]
 ['chortles' 2]]
[['An intermittently pleasing but mostly routine effort .']
 ['An intermittently pleasing but mostly routine effort']
 ['An']
 ...
 ['A long-winded ,']
 ['A long-winded']
 ['predictable scenario']]


In [0]:
# Split the 2-D arrays into per-column Python lists with a column slice
# instead of appending element by element in an index loop.
# Column 0 holds the phrase text; column 1 (train only) holds the label.
train_sentences = list(train[:, 0])
train_labels = list(train[:, 1])
test_sentences = list(test[:, 0])
# The test array has no label column, so this stays empty; the name is kept
# in case other cells reference it.
test_labels = []

# NumPy copies of the lists; the labels array is what gets passed to model.fit.
train_sentences_final = np.array(train_sentences)
train_labels_final = np.array(train_labels)
test_sentences_final = np.array(test_sentences)

In [0]:
# Text-vectorisation settings shared by the tokenizer, the padding step and
# the model cell.
vocab_size = 10000     # passed to Tokenizer(num_words=...) and to the Embedding layer
embedding_dim = 16     # width of each token embedding (used by the Embedding layer)
max_length = 120       # every sequence is padded/truncated to this many token ids
trunc_type = 'post'    # over-long sequences lose tokens from the end
oov_tok = "<OOV>"      # placeholder token for words outside the fitted vocabulary


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fit the vocabulary on the training phrases only, then reuse the same
# tokenizer for the test phrases so both share one word -> id mapping.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(train_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(test_sentences)
# Use the same truncation side as the training data. Without `truncating=`
# pad_sequences falls back to its own default, so an over-long test phrase
# would be cut at the opposite end from an over-long training phrase.
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

In [0]:
# Invert the tokenizer vocabulary: token id -> word.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Turn a sequence of token ids back into a space-separated string.

    Each id is looked up in ``reverse_word_index``; ids with no entry there
    are rendered as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)

# Sanity check: decoded padded sequence vs. the original phrase for sample 100.
print(decode_review(padded[100]), train_sentences[100], sep='\n')

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? would have a hard time sitting through this one
would have a hard time sitting through this one .


In [0]:
# Embedding -> Flatten -> six ReLU Dense layers of shrinking width -> 5-way softmax.
# The output has five units, presumably one per Sentiment class (confirm the
# labels are integers in 0..4, as the sparse categorical loss expects class indices).
model = tf.keras.Sequential(
    [
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
        tf.keras.layers.Flatten(),
        *[
            tf.keras.layers.Dense(width, activation='relu')
            for width in (512, 256, 128, 64, 32, 16)
        ],
        tf.keras.layers.Dense(5, activation='softmax'),
    ]
)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense_15 (Dense)             (None, 512)               983552    
_________________________________________________________________
dense_16 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_17 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_18 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_19 (Dense)             (None, 32)               

In [0]:
# Train on the padded training sequences for a fixed number of epochs.
# No validation data is passed, so only training metrics are reported.
num_epochs = 10
model.fit(x=padded, y=train_labels_final, epochs=num_epochs)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f81e52f0128>

In [0]:
# Softmax scores for the *training* inputs: one row of five values per phrase.
# NOTE(review): testing_padded is prepared earlier but not used in the visible
# cells; confirm whether predictions on the test set were intended here.
model.predict(x=padded)

array([[3.2727609e-03, 9.8189443e-01, 1.4832647e-02, 1.7032865e-07,
        2.4944107e-14],
       [3.0856940e-03, 1.5084457e-01, 8.0958915e-01, 3.5837214e-02,
        6.4331008e-04],
       [2.7259058e-04, 2.5930142e-02, 9.1822851e-01, 5.5068433e-02,
        5.0027180e-04],
       ...,
       [1.1481746e-03, 3.2462247e-02, 4.5544139e-01, 4.7109449e-01,
        3.9853703e-02],
       [2.8553104e-06, 3.6054479e-03, 9.8159271e-01, 1.4783661e-02,
        1.5348205e-05],
       [8.9359604e-04, 5.9822679e-02, 8.9871752e-01, 4.0103529e-02,
        4.6271563e-04]], dtype=float32)