# IMDB Movie review 50K dataset

In [1]:
import pandas as pd

In [3]:
df= pd.read_csv('IMDB_Dataset.csv')

In [5]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.shape

(50000, 2)

In [8]:
df['review'].loc[100]

"This short film that inspired the soon-to-be full length feature - Spatula Madness - is a hilarious piece that contends against similar cartoons yielding multiple writers. The short film stars Edward the Spatula who after being fired from his job, joins in the fight against the evil spoons. This premise allows for some funny content near the beginning, but is barely present for the remainder of the feature. This film's 15-minute running time is absorbed by some odd-ball comedy and a small musical number. Unfortunately not much else lies below it. The plot that is set up doesn't really have time to show. But it's surely follows it plot better than many high-budget Hollywood films. This film is worth watching at least a few times. Take it for what it is, and don't expect a deep story."

## Data Cleaning and Preprocessing

In [9]:
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Stemming libaries

In [10]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()

In [18]:
corpus=[]

for i in range(0, len(df)):
    text= re.sub('[^a-zA-Z0-9]', ' ', df['review'][i])
    text= text.lower()
    text= text.split()

    text=[ps.stem(word) for word in text if not word in stopwords.words('english')]
    text=' '.join(text)
    corpus.append(text)

In [19]:
 corpus

['one review mention watch 1 oz episod hook right exactli happen br br first thing struck oz brutal unflinch scene violenc set right word go trust show faint heart timid show pull punch regard drug sex violenc hardcor classic use word br br call oz nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda em citi home mani aryan muslim gangsta latino christian italian irish scuffl death stare dodgi deal shadi agreement never far away br br would say main appeal show due fact goe show dare forget pretti pictur paint mainstream audienc forget charm forget romanc oz mess around first episod ever saw struck nasti surreal say readi watch develop tast oz got accustom high level graphic violenc violenc injustic crook guard sold nickel inmat kill order get away well manner middl class inmat turn prison bitch due lack street skill prison experi watch oz may becom comfort uncomfort view that get touch

## 1. Bag of Words Model

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=2500, binary=True, ngram_range=(1,2))

X= cv.fit_transform(corpus).toarray()

In [22]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [23]:
X.shape

(50000, 2500)

### Label Encoding

In [24]:
y= pd.get_dummies(df['sentiment'])
y=y.iloc[:,1].values

In [25]:
y

array([ True,  True,  True, ..., False, False, False])

### Train-Test Split

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.20, random_state=0)

In [27]:
X_train, y_train

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 1, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 array([False, False,  True, ...,  True, False,  True]))

### Using Naive Bayes for Classification

In [29]:
from sklearn.naive_bayes import MultinomialNB
sentiment_detect_model = MultinomialNB().fit(X_train, y_train)

### Predictions

In [30]:
y_pred= sentiment_detect_model.predict(X_test)

In [31]:
from sklearn.metrics import accuracy_score, classification_report

In [32]:
score= accuracy_score(y_test, y_pred)
print(score)

0.8493


### Classification Report

In [33]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       False       0.84      0.86      0.85      4966
        True       0.86      0.84      0.85      5034

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



## 2. TFIDF Model

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
tv= TfidfVectorizer(max_features=2500, ngram_range=(1,2))

X= tv.fit_transform(corpus).toarray()

### Train-Test Split

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.20, random_state=0)

### a) Naive Bayes for Classification

In [40]:
from sklearn.naive_bayes import MultinomialNB
sentiment_detect_model= MultinomialNB().fit(X_train, y_train)

#### Predictions

In [41]:
y_pred= sentiment_detect_model.predict(X_test)

In [43]:
score= accuracy_score(y_test, y_pred)
print(score)

0.8472


#### Classification Report

In [44]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

       False       0.83      0.86      0.85      4893
        True       0.86      0.84      0.85      5107

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



### b) Random Forest for Classification

In [45]:
from sklearn.ensemble import RandomForestClassifier
classifier= RandomForestClassifier()
classifier.fit(X_train, y_train)

#### Predictions

In [46]:
y_pred= classifier.predict(X_test)

In [50]:
score= accuracy_score(y_pred, y_test)
print("accuracy score is" , score)

print("classification is as follows")
print(classification_report(y_pred,y_test))
      

accuracy score is 0.8435
classification is as follows
              precision    recall  f1-score   support

       False       0.85      0.84      0.85      5096
        True       0.84      0.85      0.84      4904

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



### 3. Word2Vec Implementation

In [51]:
import gensim.downloader as api
wv= api.load('word2vec-google-news-300')

In [54]:
vec_try= wv['hello']
vec_try

array([-0.05419922,  0.01708984, -0.00527954,  0.33203125, -0.25      ,
       -0.01397705, -0.15039062, -0.265625  ,  0.01647949,  0.3828125 ,
       -0.03295898, -0.09716797, -0.16308594, -0.04443359,  0.00946045,
        0.18457031,  0.03637695,  0.16601562,  0.36328125, -0.25585938,
        0.375     ,  0.171875  ,  0.21386719, -0.19921875,  0.13085938,
       -0.07275391, -0.02819824,  0.11621094,  0.15332031,  0.09082031,
        0.06787109, -0.0300293 , -0.16894531, -0.20800781, -0.03710938,
       -0.22753906,  0.26367188,  0.012146  ,  0.18359375,  0.31054688,
       -0.10791016, -0.19140625,  0.21582031,  0.13183594, -0.03515625,
        0.18554688, -0.30859375,  0.04785156, -0.10986328,  0.14355469,
       -0.43554688, -0.0378418 ,  0.10839844,  0.140625  , -0.10595703,
        0.26171875, -0.17089844,  0.39453125,  0.12597656, -0.27734375,
       -0.28125   ,  0.14746094, -0.20996094,  0.02355957,  0.18457031,
        0.00445557, -0.27929688, -0.03637695, -0.29296875,  0.19

### Model with Wordnet lemmatizer

In [55]:
from nltk.stem import WordNetLemmatizer
lemmatizer= WordNetLemmatizer()

In [56]:
corpus[10]

'phil alien one quirki film humour base around odd everyth rather actual punchlin br br first odd pretti funni movi progress find joke odd funni anymor br br low budget film that never problem pretti interest charact eventu lost interest br br imagin film would appeal stoner current partak br br someth similar better tri brother anoth planet'

In [58]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [59]:
words=[]

for sent in corpus:
    sent_token = sent_tokenize(sent)

    for sent in sent_token:
        words.append(simple_preprocess(sent))

In [60]:
words

[['one',
  'review',
  'mention',
  'watch',
  'oz',
  'episod',
  'hook',
  'right',
  'exactli',
  'happen',
  'br',
  'br',
  'first',
  'thing',
  'struck',
  'oz',
  'brutal',
  'unflinch',
  'scene',
  'violenc',
  'set',
  'right',
  'word',
  'go',
  'trust',
  'show',
  'faint',
  'heart',
  'timid',
  'show',
  'pull',
  'punch',
  'regard',
  'drug',
  'sex',
  'violenc',
  'hardcor',
  'classic',
  'use',
  'word',
  'br',
  'br',
  'call',
  'oz',
  'nicknam',
  'given',
  'oswald',
  'maximum',
  'secur',
  'state',
  'penitentari',
  'focus',
  'mainli',
  'emerald',
  'citi',
  'experiment',
  'section',
  'prison',
  'cell',
  'glass',
  'front',
  'face',
  'inward',
  'privaci',
  'high',
  'agenda',
  'em',
  'citi',
  'home',
  'mani',
  'aryan',
  'muslim',
  'gangsta',
  'latino',
  'christian',
  'italian',
  'irish',
  'scuffl',
  'death',
  'stare',
  'dodgi',
  'deal',
  'shadi',
  'agreement',
  'never',
  'far',
  'away',
  'br',
  'br',
  'would',
  'say',

In [None]:
p