In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the labelled movie reviews. The previous empty-DataFrame placeholder was
# dead code: it was overwritten immediately by read_csv.
df = pd.read_csv('/content/movie_data.csv')
# Preview the first rows to check the columns and labels.
df.head(20)

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
5,"I saw this film on September 1st, 2005 in Indi...",1
6,"Maybe I'm reading into this too much, but I wo...",1
7,I felt this film did have many good qualities....,1
8,This movie is amazing because the fact that th...,1
9,"""Quitting"" may be as much about exiting a pre-...",1


In [None]:
import nltk
# Fetch the NLTK resources needed by the preprocessing cell below.
# NOTE(review): 'punkt' is presumably what word_tokenize relies on — confirm
# against the installed NLTK version.
nltk.download('punkt') # tokenizer models
# English stop-word list, read later via stopwords.words('english').
# This call is the cell's last expression, so its return value is displayed.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Clean every review into a list of lower-case, alphabetic, non-stop-word tokens.

# Loop-invariant helpers: build them once instead of once per review.
# str.maketrans('', '', chars) creates a table that deletes every character in
# `chars` (the third argument) when passed to str.translate.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

review_lines = list()
lines = df['review'].values.tolist()

for line in lines:
    tokens = word_tokenize(line)
    # convert to lower case
    tokens = [w.lower() for w in tokens]

    # remove punctuation from each word
    stripped = [w.translate(table) for w in tokens]

    # drop tokens that are not purely alphabetic (this also removes the empty
    # strings left behind by punctuation-only tokens)
    words = [word for word in stripped if word.isalpha()]

    # filter out stop words
    words = [w for w in words if w not in stop_words]
    review_lines.append(words)

In [None]:
# Sanity check: one cleaned token list should exist per review.
len(review_lines)

50000

In [None]:
import gensim # Process plain text

# Dimensionality of each learned word vector.
EMBEDDING_DIM = 100

# Train a Word2Vec model on the cleaned, tokenised reviews.
#   size      : number of dimensions of the embeddings
#   window    : sliding window size
#   min_count : minimum count of words to consider while training (default is 5)
# NOTE(review): `size` and `wv.vocab` are the gensim < 4.0 spellings; gensim 4
# renamed them to `vector_size` and `wv.key_to_index` — confirm the pinned version.
model = gensim.models.Word2Vec(
    sentences=review_lines,
    size=EMBEDDING_DIM,
    window=5,
    min_count=1,
)

# model.wv is the object that contains the mapping between words and embeddings;
# listing its vocab gives every word the model knows.
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

Vocabulary size: 134156


In [None]:
import os

# Read the saved embeddings text file into a {word: vector} dictionary.
embeddings_index = {}
# `with` guarantees the file is closed even if parsing fails part-way through;
# the encoding is explicit so the result does not depend on the platform default.
with open('/content/imdb_embedding_word2vec.txt', encoding='utf-8') as f:
    for line_no, line in enumerate(f):
        values = line.split()
        if not values:
            # tolerate blank lines instead of failing on values[0]
            continue
        # The word2vec text format starts with a "<vocab_size> <dimensions>"
        # header; skip it so it is not stored as if it were a word.
        if line_no == 0 and len(values) == 2 and all(v.isdigit() for v in values):
            continue
        word = values[0]
        # Parse the coefficients as numbers rather than keeping them as strings.
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [None]:
# Preview only a few entries; displaying the whole dictionary would print one
# vector for every word read from the embeddings file.
dict(list(embeddings_index.items())[:5])

In [None]:
# Length every review is padded/truncated to (passed as `maxlen` to pad_sequences).
# Other choices are worth trying, e.g. the mean review length.
max_length = 100

In [None]:
# Import from the public tf.keras namespace; `tensorflow.python.*` is a private
# implementation package and is not a stable import path.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fraction of the samples held out from training.
VALIDATION_SPLIT = 0.2

# Fit the word -> integer vocabulary on the cleaned token lists, then encode
# each review as a (variable-length) list of integer word ids.
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(review_lines)
sequences = tokenizer_obj.texts_to_sequences(review_lines)

In [None]:
# Pad sequences.
# pad_sequences converts a list of sequences into a 2D numpy array of shape
# (num_samples, num_timesteps), where num_timesteps is maxlen if provided,
# otherwise the length of the longest sequence. Padding and truncating happen
# at the beginning by default ("pre"); this can be changed to "post".

# Mapping from each word to its integer id.
word_index = tokenizer_obj.word_index
print('Found %s unique tokens.' % len(word_index))

# Labels and fixed-length feature matrix, row-aligned with the reviews.
sentiment = df['sentiment'].values
review_pad = pad_sequences(sequences, maxlen=max_length)

print('Shape of review tensor:', review_pad.shape)
print('Shape of sentiment tensor:', sentiment.shape)



Found 134156 unique tokens.
Shape of review tensor: (50000, 100)
Shape of sentiment tensor: (50000,)


In [None]:
# Inspect the first encoded review: word ids, left-padded with zeros up to max_length.
review_pad[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,   288,   101,     2,   128,   198, 28328,   228,
        1590,   816,  5464,    13,   524,  9564,  9037,   351,   106,
         237,  9037,   142,    30,  3111, 15552,    14,  1422,  6790,
         142,   991,  8092, 10922,  1822,     7,     2,  2953,  1147,
           5,   472,   312,   613,  2809,  2975,  1944,    22,   236,
           2,  1446,  1515,   222,   236, 11399,   613,   101,    32,
         239,  1515,    32,   259,  2172,   220,    14,   149,  2589,
         164,   135,    13,  2289,     2,    20,  1268,    53,    13,
        1474], dtype=int32)

In [None]:
# split the data into a training set and a validation set
indices = np.arange(review_pad.shape[0])
print(indices)
#print(len(indices))
np.random.shuffle(indices)
print(indices)
#print(len(indices))


[    0     1     2 ... 49997 49998 49999]
50000
[13472 39872 34300 ... 13885 10652 18104]
50000


In [None]:
# Apply the same shuffled order to features and labels so rows stay aligned.
review_pad, sentiment = review_pad[indices], sentiment[indices]

# How many rows are held out, derived from the configured split fraction.
total_samples = review_pad.shape[0]
num_validation_samples = int(VALIDATION_SPLIT * total_samples)

In [None]:
# Index of the first held-out row. An explicit boundary (instead of a negative
# offset) stays correct when num_validation_samples is 0: `x[:-0]` would be
# empty and `x[-0:]` would be the whole array.
split_at = review_pad.shape[0] - num_validation_samples

# Leading rows are used for training, the trailing rows for evaluation.
X_train_pad = review_pad[:split_at]
y_train = sentiment[:split_at]
X_test_pad = review_pad[split_at:]
y_test = sentiment[split_at:]

In [None]:
# Report the dimensions of each split as a quick sanity check.
shape_report = (
    ('Shape of X_train_pad tensor:', X_train_pad.shape),
    ('Shape of y_train tensor:', y_train.shape),
    ('Shape of X_test_pad tensor:', X_test_pad.shape),
    ('Shape of y_test tensor:', y_test.shape),
)
for caption, dims in shape_report:
    print(caption, dims)

Shape of X_train_pad tensor: (40000, 100)
Shape of y_train tensor: (40000,)
Shape of X_test_pad tensor: (10000, 100)
Shape of y_test tensor: (10000,)


In [None]:
from keras.models import load_model
from keras.models import model_from_json

In [None]:
# Rebuild the (untrained) network from its saved JSON architecture description.
with open('/content/model_architecture_Sentiment_classifier_word2vec_first_try.json', 'r') as arch_file:
    architecture_json = arch_file.read()
model = model_from_json(architecture_json)

In [None]:
# Load the saved weights (.h5 file) into the model rebuilt from JSON above.
model.load_weights('/content/Sentiment_Classifier_word2vec_first_try.h5')

In [None]:
# Hand-written reviews used to spot-check the loaded classifier.
test_samples = [
    "This movie is fantastic! I really like it because it is so good!",
    "Good movie!",
    "Maybe I like this movie.",
    "Not to my taste, will skip and watch another movie",
    "if you like action, then this movie might be good for you.",
    "Bad movie!",
    "Not a good movie!",
    "This movie really sucks! Can I get my money back please?",
]
# Keep the individual sample names available as well.
(test_sample_1, test_sample_2, test_sample_3, test_sample_4,
 test_sample_5, test_sample_6, test_sample_7, test_sample_8) = test_samples

# Encode with the tokenizer fitted on the reviews and pad to the same length.
# NOTE(review): samples 2 and 7 get the same score in the recorded output,
# presumably because "not"/"a" are absent from the tokenizer vocabulary (stop
# words were removed before fitting) — confirm.
test_samples_tokens = tokenizer_obj.texts_to_sequences(test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

# Predict: one output row per sample.
model.predict(x=test_samples_tokens_pad)

array([[0.982698  ],
       [0.77318543],
       [0.45511347],
       [0.34137696],
       [0.39119244],
       [0.08242244],
       [0.77318543],
       [0.03846601]], dtype=float32)