In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [2]:
df = pd.read_csv('IMDB Dataset.csv')
df.sample(5)

Unnamed: 0,review,sentiment
7544,"I'll admit that I liked the first one, and was...",negative
33106,The movie starts out fine. Widower out with ne...,negative
43102,"I enjoy movies like this for their spirit, no ...",positive
3059,"Pretty twisted Horror film, that has a few goo...",negative
31388,"Larry Bishop, the Writer/Actor/Director, focus...",negative


In [3]:
df.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [4]:
df.sentiment = df.sentiment.apply(lambda x: 1 if x =='positive' else 0 )

In [6]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [7]:
xtrain, xtest, ytrain, ytest = train_test_split(df.review, df.sentiment, test_size=0.2)

In [8]:
xtrain.shape

(40000,)

In [35]:
ytrain.shape

(40000,)

In [None]:
#1. create a pipeline object
clf = Pipeline([
    ('vectorizer', CountVectorizer()),                                                    #initializing the vectorizer
    ('random_forest', (RandomForestClassifier(n_estimators=50, criterion='entropy')))      #using the RandomForest classifier
])



#2. fit with X_train and y_train
clf.fit(xtrain, ytrain)


#3. get the predictions for X_test and store it in y_pred
y_pred = clf.predict(xtest)




In [37]:
#4. print the classfication report
print(classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.85      0.84      4974
           1       0.85      0.84      0.84      5026

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [40]:
clf1 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('KNN', KNeighborsClassifier(n_neighbors=10, metric='euclidean'))
])
clf1

In [41]:
clf1.fit(xtrain, ytrain)

y_pred1 = clf1.predict(xtest)

In [42]:
print(classification_report(y_pred1, ytest))

              precision    recall  f1-score   support

           0       0.66      0.65      0.65      5071
           1       0.64      0.66      0.65      4929

    accuracy                           0.65     10000
   macro avg       0.65      0.65      0.65     10000
weighted avg       0.65      0.65      0.65     10000



In [43]:
clf3 = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('NB', MultinomialNB())
])
clf3

In [44]:
clf3.fit(xtrain, ytrain)

y_pred3 = clf3.predict(xtest)

In [45]:
print(classification_report(y_pred3, ytest))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85      5319
           1       0.81      0.87      0.84      4681

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

