In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from bs4 import BeautifulSoup
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
print(df.shape)
df.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()
# Removing the square brackets
def remove_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

# Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_square_brackets(text)
    return text
df['review'] = df['review'].apply(denoise_text)

# Remove the special characters from the dataset
def remove_special_chars(text, remove_digits = True):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern,'',text)
    return text
df['review'] = df['review'].apply(remove_special_chars)

In [4]:
def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    out_data = ""
    for words in text:
        out_data += lemmatizer.lemmatize(words)
    return out_data
df['review'] = df['review'].apply(lemmatization)

In [5]:
df['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production The filming tech...
2        I thought this was a wonderful way to spend ti...
3        Basically theres a family where a little boy J...
4        Petter Matteis Love in the Time of Money is a ...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot bad dialogue bad acting idiotic direc...
49997    I am a Catholic taught in parochial elementary...
49998    Im going to have to disagree with the previous...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [6]:
df['review'] = list(df['review'].str.split())
df['review']

0        [One, of, the, other, reviewers, has, mentione...
1        [A, wonderful, little, production, The, filmin...
2        [I, thought, this, was, a, wonderful, way, to,...
3        [Basically, theres, a, family, where, a, littl...
4        [Petter, Matteis, Love, in, the, Time, of, Mon...
                               ...                        
49995    [I, thought, this, movie, did, a, down, right,...
49996    [Bad, plot, bad, dialogue, bad, acting, idioti...
49997    [I, am, a, Catholic, taught, in, parochial, el...
49998    [Im, going, to, have, to, disagree, with, the,...
49999    [No, one, expects, the, Star, Trek, movies, to...
Name: review, Length: 50000, dtype: object

In [7]:
model = Word2Vec(df['review'], min_count = 3, epochs = 30, seed = 42)
word_list = list(model.wv.index_to_key)

In [8]:
print(word_list[:50])

['the', 'a', 'and', 'of', 'to', 'is', 'in', 'I', 'that', 'it', 'this', 'was', 'as', 'with', 'for', 'movie', 'The', 'film', 'but', 'on', 'are', 'not', 'have', 'his', 'you', 'be', 'one', 'at', 'by', 'he', 'an', 'all', 'who', 'from', 'like', 'its', 'they', 'so', 'or', 'about', 'her', 'has', 'just', 'out', 'some', 'good', 'more', 'very', 'This', 'what']


In [9]:
print(model.wv.most_similar('king', topn = 5))

[('kingdom', 0.6062176823616028), ('devil', 0.6034606099128723), ('Ahmad', 0.5904736518859863), ('Jaffar', 0.5859519243240356), ('vizier', 0.579598605632782)]


In [10]:
print(model.wv.most_similar('queen', topn = 5))

[('goddess', 0.6358354091644287), ('maid', 0.6209036707878113), ('blonde', 0.6065131425857544), ('Arnoul', 0.5869413614273071), ('showgirl', 0.5725454092025757)]
