In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,regexp_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score,confusion_matrix
from joblib import dump, load


#### *loading dataset*

In [147]:
raw_df = pd.read_csv('../../ds_csv/imdb_reviews/IMDB Dataset.csv',dtype={'sentiment':'category'})

In [148]:
sentiment_label = {'positive':1,'negative':0}
raw_df['sentiment'] = raw_df['sentiment'].map(sentiment_label)

#### *dataset analysis*

In [149]:
raw_df.shape,raw_df.columns

((50000, 2), Index(['review', 'sentiment'], dtype='object'))

In [150]:
raw_df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
5,"Probably my all-time favorite movie, a story o...",1
6,I sure would like to see a resurrection of a u...,1
7,"This show was an amazing, fresh & innovative i...",0
8,Encouraged by the positive comments about this...,0
9,If you like original gut wrenching laughter yo...,1


In [151]:
raw_df.isnull().sum()
# no null value found 

review       0
sentiment    0
dtype: int64

In [152]:
# distribution of positive and negative review 
raw_df['sentiment'].value_counts()
# both (positive,negative) are equal distributed 

sentiment
0    25000
1    25000
Name: count, dtype: int64

#### *text preprocessing*

In [153]:
# before preprocessing text 
raw_df['review'].loc[1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [154]:
en_stopwords = stopwords.words('english')
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});') # HTML filtering 
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(text):
    # convert to lower case 
    text = text.lower()
    
    # filter using regex 
    text = re.sub('^a-zA-Z',' ',text)
    text = re.sub('[0-9]','',text)
    text = re.sub(CLEANR, '', text) 
 
    # tokenized 
    text = word_tokenize(text)
    
    # removing punctuation
    text = [word for word in text if word.isalnum()]

    # removing stopword 
    text = [word for word in text if word not in set(en_stopwords)]

    # lemmatization or stemming
    text = [lemmatizer.lemmatize(word) for word in text]
    # text = [stemmer.stem(word) for word in text]
    
    return ' '.join(text)

In [155]:
raw_df['review'] = raw_df['review'].apply(preprocess)

In [156]:
raw_df.to_csv('preprocessed_imdb_reviews.csv',index=False)

In [157]:
# after preprocessing text 
raw_df['review'].loc[0]

'one reviewer mentioned watching oz episode hooked right exactly happened first thing struck oz brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use called oz nickname given oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy dealing shady agreement never far would say main appeal show due fact go show would dare forget pretty picture painted mainstream audience forget charm forget romance oz mess around first episode ever saw struck nasty surreal could say ready watched developed taste oz got accustomed high level graphic violence violence injustice crooked guard sold nickel inmate kill order get away well mannered middle class inmate turned prison bitch due lack street skill prison experience watching oz

#### *loading preprocessed dataset*

In [25]:
preprocessed_df = pd.read_csv('preprocessed_imdb_reviews.csv')

In [26]:
preprocessed_df.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching oz episode hoo...,1
1,wonderful little production filming technique ...,1
2,thought wonderful way spend time hot summer we...,1
3,basically family little boy jake think zombie ...,0
4,petter mattei love time money visually stunnin...,1


In [27]:
X = preprocessed_df.iloc[:25000,0]
y = preprocessed_df.iloc[:25000,-1].values

In [28]:
X.shape,y.shape,type(X),type(y)

((25000,), (25000,), pandas.core.series.Series, numpy.ndarray)

In [29]:
preprocessed_df.info(verbose=False,memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Columns: 2 entries, review to sentiment
dtypes: int64(1), object(1)
memory usage: 39.8 MB


In [40]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [32]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((20000,), (20000,), (5000,), (5000,))

In [41]:
tfv = TfidfVectorizer(max_features=10000)
X_train = tfv.fit_transform(X_train).toarray()
X_test = tfv.transform(X_test).toarray()

In [43]:
X_train.shape,X_test.shape,type(X_train)

((20000, 10000), (5000, 10000), numpy.ndarray)

#### *Training Classification Model*

In [44]:
# RandomForest Model
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100,random_state=1)
rf_classifier.fit(X_train,y_train)
dump(rf_classifier, 'RandomForestClassifier.joblib') 

['RandomForestClassifier.joblib']

In [45]:
# evaluation & accuracy_score for RandomForestClassifier 
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_pred) # True,Pred
print(f"RandomForest Accuracy: {accuracy*100:.2f}%")

RandomForest Accuracy: 85.26%


In [18]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
tree_classifier = DecisionTreeClassifier(criterion='gini',random_state=1)
tree_classifier.fit(X_train,y_train)

In [19]:
# evaluation & accuracy_score for DecisionTree 
y_pred = tree_classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_pred) # True,Pred
print(f"DecisonTree Accuracy: {accuracy*100:.2f}%")

DecisonTree Accuracy: 68.25%


In [46]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

In [47]:
# evaluation & accuracy_score for Bayes 
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_pred) # True,Pred
print(f"Navie Bayes Accuracy: {accuracy*100:.2f}%")

Navie Bayes Accuracy: 75.80%


In [16]:
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors=20)
knn_classifier.fit(X_train,y_train)

In [17]:
# evaluation & accuracy_score for KNN 
y_pred = knn_classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_pred) # True,Pred
print(f"KNN Accuracy: {accuracy*100:.2f}%")

KNN Accuracy: 73.00%


In [13]:
# SVC Model
from sklearn.svm import SVC
svc_classifier = SVC(random_state=0,kernel='linear')
svc_classifier.fit(X_train,y_train)

In [15]:
# evaluation & accuracy_score for SVC
y_pred = svc_classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_pred) # True,Pred
print(f"SVC Accuracy : {accuracy*100:.2f}%")

SVC Accuracy : 83.25%
