#### Generate Word Vector using word2Vec library on IMDB Dataset.

In [3]:
# pip install seaborn

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np 
import pandas as pd 


##### Dataset import and review

In [5]:
data=pd.read_csv("../Dataset/IMDB Dataset.csv")

In [6]:
data.shape

(50000, 2)

In [7]:
data.head(1)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive


In [8]:
x=data["review"]
y=data["sentiment"]

In [9]:
x

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. <br /><br />The...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [10]:
y

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
49995    positive
49996    negative
49997    negative
49998    negative
49999    negative
Name: sentiment, Length: 50000, dtype: object

In [11]:
y.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

#### Preprocessing

In [12]:
# Lemmatization is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.
# Eg: The word "walks" has "walk" as its lemma. This link is useful for doing sentiment analysis.
lemmatizer=WordNetLemmatizer()
lemmatizer.lemmatize("walks")

'walk'

In [13]:
# for regular expression
import re

In [14]:
# Downloads the data.
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Using the stopwords.
from nltk.corpus import stopwords

# Initialize the stopwords
stoplist = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vicky_gupta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vicky_gupta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
def preprocessing(x):
    # Convert to lower case
    x=x.lower() 
    # Remove special characters
    x=re.sub("[^a-zA-Z]"," ",x) 
    # Split the sentence into words
    x=x.split() 
    # Lemmatize the words and remove the stopwords
    x=[lemmatizer.lemmatize(word) for word in x if word not in stoplist]
    # Join the words
    x=" ".join(x)
    return x

In [16]:
para="Hello my name is VICKY and I am a stuDent of JMI University. I tRy to learn new things everYday."
preprocessing(para) 

'hello name vicky student jmi university try learn new thing everyday'

In [17]:
x=x.apply(preprocessing)

In [18]:
x[0]

'one reviewer mentioned watching oz episode hooked right exactly happened br br first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word br br called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away br br would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experi

In [19]:
data["review"][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [20]:
def preprocessing1(x):
    # Convert to lower case
    x=x.lower() 
    # Remove special characters
    x=re.sub("[^a-zA-Z]"," ",x) 
    # Split the sentence into words
    x=x.split() 
    # Lemmatize the words and remove the stopwords
    x=[lemmatizer.lemmatize(word) for word in x if word not in stoplist]
    return x

In [21]:
preprocessing1(para)

['hello',
 'name',
 'vicky',
 'student',
 'jmi',
 'university',
 'try',
 'learn',
 'new',
 'thing',
 'everyday']

In [22]:
data["new_review"]=data["review"].apply(preprocessing1)

In [23]:
data.head()

Unnamed: 0,review,sentiment,new_review
0,One of the other reviewers has mentioned that ...,positive,"[one, reviewer, mentioned, watching, oz, episo..."
1,A wonderful little production. <br /><br />The...,positive,"[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,negative,"[basically, family, little, boy, jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"[petter, mattei, love, time, money, visually, ..."


In [24]:
print(data["review"][0])
print(data["new_review"][0])

One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fac

In [25]:
y=pd.get_dummies(y,drop_first=True)
print(y)
print(y.value_counts())

       positive
0          True
1          True
2          True
3         False
4          True
...         ...
49995      True
49996     False
49997     False
49998     False
49999     False

[50000 rows x 1 columns]
positive
False       25000
True        25000
Name: count, dtype: int64


In [26]:
from sklearn.model_selection import train_test_split

In [27]:
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size=0.2, 
                                                    random_state=0)

In [28]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(40000,)
(10000,)
(40000, 1)
(10000, 1)


In [29]:
x_train[0]

'one reviewer mentioned watching oz episode hooked right exactly happened br br first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word br br called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away br br would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experi

In [30]:
corpus=[]
for i in x_train:
    word=nltk.word_tokenize(i)
    corpus.append(word)

In [31]:
corpus[0]

['redeeming',
 'quality',
 'movie',
 'otherwise',
 'insult',
 'viewer',
 'intelligence',
 'losing',
 'track',
 'time',
 'plot',
 'reason',
 'produced',
 'br',
 'br',
 'plus',
 'guy',
 'glass',
 'ever',
 'got',
 'gig',
 'hollywood',
 'beyond']

In [32]:
x_train[0]

'one reviewer mentioned watching oz episode hooked right exactly happened br br first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word br br called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far away br br would say main appeal show due fact go show dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experi

In [33]:
# pip install gensim

In [34]:
import gensim

In [35]:
vector_size=32
window=5
model= gensim.models.Word2Vec(corpus,vector_size=vector_size,window=window,min_count=1)
X_vocb= list(model.wv.key_to_index)


In [36]:
model.wv.similar_by_word("glass")

[('window', 0.8606956005096436),
 ('kitchen', 0.8442209959030151),
 ('ceiling', 0.8400110006332397),
 ('pipe', 0.8391174077987671),
 ('cigarette', 0.8386237621307373),
 ('elevator', 0.8367401361465454),
 ('tree', 0.8356301784515381),
 ('wall', 0.8335075378417969),
 ('water', 0.8298806548118591),
 ('coat', 0.8283363580703735)]

In [37]:
len(X_vocb)

81730

In [38]:
model.wv.similar_by_word("one")

[('probably', 0.7284210324287415),
 ('movie', 0.7145125269889832),
 ('many', 0.6982259154319763),
 ('easily', 0.690430223941803),
 ('definitely', 0.6824599504470825),
 ('least', 0.6812196969985962),
 ('another', 0.6773499250411987),
 ('brianiac', 0.6532714366912842),
 ('last', 0.6502249836921692),
 ('ever', 0.6498934030532837)]

In [39]:
model1= gensim.models.Word2Vec(data["new_review"])
X_vocb1= list(model1.wv.key_to_index)

In [40]:
model1.wv.similar_by_word("one")

[('probably', 0.5489322543144226),
 ('movie', 0.485360324382782),
 ('definitely', 0.47225436568260193),
 ('think', 0.45367658138275146),
 ('single', 0.448491632938385),
 ('every', 0.4408940374851227),
 ('possibly', 0.43670061230659485),
 ('twice', 0.43669435381889343),
 ('easily', 0.4335622191429138),
 ('three', 0.4325866103172302)]

In [41]:
# print the whole word vector
model1.wv["one"]

array([ 0.52834255, -0.50330436,  0.4482659 , -0.08968177,  0.681257  ,
        1.1141015 ,  0.3508549 , -0.22301328,  0.0576657 , -0.1195665 ,
        1.0177532 ,  1.3058525 ,  0.7450179 , -0.30928335,  0.5554545 ,
       -1.0633174 , -0.19704634,  0.15568288, -0.6543135 ,  0.98056823,
        0.80841964, -0.5994649 ,  1.118312  ,  1.9589052 , -0.6946888 ,
        0.6219167 ,  0.6928124 , -0.37874144, -0.46450546,  0.9417454 ,
        0.48539653, -0.79899937,  0.684559  , -0.7488109 , -0.4891625 ,
        1.0736047 ,  2.2077312 , -0.07852761, -0.20961998,  0.65336907,
        0.0959382 ,  1.3473923 ,  2.4177248 , -0.6499496 ,  0.7428712 ,
       -0.7160431 ,  0.22993961,  0.28622478, -0.6211661 , -0.40738073,
        0.06454261, -1.0743816 , -1.7360379 ,  0.39937508, -0.7381881 ,
        0.11569172,  0.4760637 ,  0.7446749 ,  1.1373228 ,  0.16065545,
        0.25688848,  0.24707776,  1.3551484 ,  0.26472256,  0.2554532 ,
       -1.027778  , -0.22240956,  1.7850578 , -1.4650702 , -0.30

In [42]:
# import tensorflow as tf
# from tensorflow.keras.layers import Dropout
# from tensorflow.keras.layers import Bidirectional
# from tensorflow.keras.optimizers import Adam
# from keras.models import Sequential

In [43]:
# lstm_model = Sequential()
# lstm_model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
# lstm_model.add(Bidirectional(LSTM(128)))
# #model.add(LSTM(128))
# lstm_model.add(Dense(64, activation = 'relu'))
# lstm_model.add(Dense(1, activation = 'sigmoid'))

# lstm_model.summary()
# lstm model


In [44]:
# lstm_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [45]:
# history = model.fit(x_train, y_train, epochs=5, verbose=1,steps_per_epoch = len(x_train)/256,batch_size = 256,validation_split=0.1)

In [46]:
# def plot_graphs(history, string):
#     plt.plot(history.history[string])
#     plt.plot(history.history['val_'+string])
#     plt.xlabel("Epochs")
#     plt.ylabel(string)
#     plt.legend([string, 'val_'+string])
#     plt.show()

# plot_graphs(history, "accuracy")
# plot_graphs(history, "loss")

In [47]:
# from sklearn.metrics import accuracy_score
# prediction = model.predict(X_test_padded)
# # Get labels based on probability 1 if p>= 0.5 else 0
# y_pred = []
# for i in prediction:
#     if i >= 0.5:
#         y_pred.append(1)
#     else:
#         y_pred.append(0)
# print("Accuracy of prediction on test set : ", accuracy_score(y_test,y_pred))