In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# importing the Dataset
import pandas as pd
reviews = pd.read_csv('/content/drive/MyDrive/NLP DataSet/imdb_master.csv',usecols=["review", "label"],
                      encoding='ISO-8859-1').sample(n=5000)
reviews.head()

Unnamed: 0,review,label
10249,Supreme Sanction is a movie about a female ass...,neg
70404,"""The Cathedral and the Bazaar"" has become a mo...",unsup
46960,Good story. Good script. Good casting. Good ac...,pos
87136,"... because all I offer is honesty here, folks...",unsup
51516,"I decided to watch this again, after reading s...",unsup


## Text Preprocessing

In [11]:
print(len(reviews))

5000


In [3]:
#Data cleaning and preprocessing
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
ps = PorterStemmer()
lemma=WordNetLemmatizer()

In [None]:
# For Stemming
corpus = []
for i in range(0, len(reviews)):
    review = re.sub('[^a-zA-Z]', ' ', str(reviews['review'].iloc[i]))
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

## Or

In [5]:
# For Lemmatizer
corpus = []
for i in range(0, len(reviews)):
    review = re.sub('[^a-zA-Z]', ' ', str(reviews['review'].iloc[i]))
    review = review.lower()
    review = review.split()

    review = [lemma.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
corpus

['supreme sanction movie female assassin work u government kill known tv reporter spare life see little daughter killed becomes next target employer br br script good although seen worse b movie hit man remorse government killing innocent people name fighting terror next alien rescuing victim supreme sanction never win award script acting better afraid better known actor michael madsen kristy swanson clearly lot bill pay therefor accepted play movie together actor probably even know camera really look like good movie either br br watch movie well got anything better watch action flick tired st rerun mc gyver team might movie want see otherwise better leave alone give',
 'cathedral bazaar become modern classic enjoyed author appearance never seen stallman torvalds screen either br br totally uninteresting possibly even devastatingly horrible excruciatingly boring anyone spent time using writing free software target audience probably life valley plus exclusive pocket geekdom major city t

## Creating the Bag of Words model

In [63]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [65]:
## Check the unique values in X
import numpy as np
#np.unique(X)
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [22]:
y=pd.get_dummies(reviews['label'])
y=y.iloc[:,1].values

In [25]:
## Check the unique values in y
np.unique(y)

array([0, 1], dtype=uint8)

In [26]:
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [27]:
from sklearn.ensemble import RandomForestClassifier
review_detect_model = RandomForestClassifier().fit(X_train, y_train)

In [28]:
#prediction
y_pred=review_detect_model .predict(X_test)

In [29]:
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(score)

0.774


In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.99      0.78      0.87       984
           1       0.04      0.56      0.07        16

    accuracy                           0.77      1000
   macro avg       0.52      0.67      0.47      1000
weighted avg       0.98      0.77      0.86      1000



## Creating the TFIDF model

In [31]:
# Creating the TFIDF model
from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer(max_features=2500)
X = tv.fit_transform(corpus).toarray()

In [32]:
y=pd.get_dummies(reviews['label'])
y=y.iloc[:,1].values

In [33]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [34]:
## Check the unique values in X
import numpy as np
np.unique(X)

array([0.        , 0.00689724, 0.00785087, ..., 0.86647268, 0.87238608,
       0.90280216])

In [35]:
from sklearn.ensemble import RandomForestClassifier
review_detect_model = RandomForestClassifier().fit(X_train, y_train)

In [None]:
#prediction
y_pred=review_detect_model .predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(classification_report(y_pred,y_test))
print(score)

              precision    recall  f1-score   support

           0       0.99      0.77      0.87       986
           1       0.03      0.43      0.05        14

    accuracy                           0.77      1000
   macro avg       0.51      0.60      0.46      1000
weighted avg       0.98      0.77      0.86      1000

0.77


## Word2Vec Continious Bag of Words (CBOW)


In [7]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300') # google news data has 300 dimension

In [8]:
import gensim
# Train CBOW Word2Vec model
model=gensim.models.Word2Vec(corpus,window=5,min_count=2)



In [9]:
model.corpus_count

5000

In [10]:
# Convert each document into a fixed-size vector using the average of word vectors
X = [
    sum(model.wv[word] for word in doc) / len(doc) if len(doc) > 0 else [0.0] * 100
    for doc in corpus
]

In [12]:
X

[array([ 0.07314971,  0.00518676,  0.0163745 , -0.03605983, -0.06226183,
        -0.00817439, -0.02244153,  0.0182607 , -0.01050325, -0.00888691,
         0.02426183,  0.01167689, -0.07286569,  0.0071978 , -0.07490083,
        -0.0071519 ,  0.02194975,  0.02683548,  0.00220551, -0.1502659 ,
         0.05433727,  0.00631379,  0.06298013,  0.03834402,  0.02091741,
         0.02829142, -0.07593052,  0.04157894,  0.00833813, -0.02726018,
        -0.02518992,  0.00272079,  0.08149832, -0.09664772,  0.02679966,
        -0.12835501,  0.04828342, -0.02442318, -0.00589346, -0.06836576,
        -0.08325405, -0.03377907, -0.04893961,  0.02796131, -0.02790685,
        -0.09411175, -0.01187099,  0.03390525,  0.01427784,  0.03410356,
         0.01046823,  0.00938503,  0.00142718,  0.01668299,  0.0009025 ,
        -0.07239992,  0.00889532, -0.07066662, -0.03923742,  0.04741824,
        -0.00632869, -0.01255624,  0.04652043,  0.04168387, -0.04438337,
         0.09196402,  0.09910731,  0.06791317, -0.0

In [13]:
y=pd.get_dummies(reviews['label'])
y=y.iloc[:,1].values

In [14]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [15]:
from sklearn.ensemble import RandomForestClassifier
review_detect_model_cbow = RandomForestClassifier().fit(X_train, y_train)

In [17]:
#prediction
y_pred=review_detect_model_cbow.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(classification_report(y_pred,y_test))
print(score)

              precision    recall  f1-score   support

           0       0.98      0.76      0.86       980
           1       0.03      0.40      0.06        20

    accuracy                           0.75      1000
   macro avg       0.51      0.58      0.46      1000
weighted avg       0.97      0.75      0.84      1000

0.749


## AvgWord2Vec

In [19]:
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)

    return np.mean([model.wv[word] for word in doc if word in model.wv.index_to_key],axis=0)
                #or [np.zeros(len(model.wv.index_to_key))], axis=0)

In [22]:
from tqdm import tqdm
import numpy as np
#apply for the entire sentences
X_avg=[]
for i in tqdm(range(len(corpus))):
    X_avg.append(avg_word2vec(corpus[i]))

100%|██████████| 5000/5000 [00:11<00:00, 451.44it/s]


In [23]:
X_avg[0]

array([ 0.07314971,  0.00518676,  0.0163745 , -0.03605983, -0.06226183,
       -0.00817439, -0.02244153,  0.0182607 , -0.01050325, -0.00888691,
        0.02426183,  0.01167689, -0.07286569,  0.0071978 , -0.07490083,
       -0.0071519 ,  0.02194975,  0.02683548,  0.00220551, -0.1502659 ,
        0.05433727,  0.00631379,  0.06298013,  0.03834402,  0.02091741,
        0.02829142, -0.07593052,  0.04157894,  0.00833813, -0.02726018,
       -0.02518992,  0.00272079,  0.08149832, -0.09664772,  0.02679966,
       -0.12835501,  0.04828342, -0.02442318, -0.00589346, -0.06836576,
       -0.08325405, -0.03377907, -0.04893961,  0.02796131, -0.02790685,
       -0.09411175, -0.01187099,  0.03390525,  0.01427784,  0.03410356,
        0.01046823,  0.00938503,  0.00142718,  0.01668299,  0.0009025 ,
       -0.07239992,  0.00889532, -0.07066662, -0.03923742,  0.04741824,
       -0.00632869, -0.01255624,  0.04652043,  0.04168387, -0.04438337,
        0.09196402,  0.09910731,  0.06791317, -0.09623503,  0.08

In [24]:
# Train Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_avg, y, test_size = 0.20, random_state = 0)

In [25]:
from sklearn.ensemble import RandomForestClassifier
review_detect_model_avgwd2vc = RandomForestClassifier().fit(X_train, y_train)

In [26]:
#prediction
y_pred=review_detect_model_avgwd2vc.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score,classification_report
score=accuracy_score(y_test,y_pred)
print(classification_report(y_pred,y_test))
print(score)

              precision    recall  f1-score   support

           0       0.99      0.76      0.86       984
           1       0.04      0.56      0.07        16

    accuracy                           0.76      1000
   macro avg       0.51      0.66      0.46      1000
weighted avg       0.98      0.76      0.85      1000

0.755
