In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [2]:
#1. read the data provided in the same directory with name 'movies_sentiment_data.csv' and store it in df variable
df = pd.read_csv("IMDB Dataset.csv")

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(50000, 2)

In [5]:
df['category'] = df['sentiment'].apply(lambda x: 1 if x == 'positive'  else 0)

In [6]:
df.head()

Unnamed: 0,review,sentiment,category
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [7]:
df['category'].value_counts() 

category
1    25000
0    25000
Name: count, dtype: int64

In [8]:
X_train,X_test,y_train,y_test = train_test_split(df.review,df.category, test_size = 0.2)

In [9]:
X_train.shape,X_test.shape

((40000,), (10000,))

In [10]:
clf = Pipeline([('vectorizer',CountVectorizer()),
               ('rf',RandomForestClassifier(n_estimators=50, criterion='entropy'))
               ])

In [11]:
clf.fit(X_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('rf',
                 RandomForestClassifier(criterion='entropy', n_estimators=50))])

In [12]:
y_pred = clf.predict(X_test)

In [13]:
classification_report_rf = classification_report(y_test,y_pred)

In [14]:
print(classification_report_rf)

              precision    recall  f1-score   support

           0       0.85      0.84      0.84      5128
           1       0.83      0.84      0.83      4872

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



In [18]:
X_test[:2]

20163    Emanuele Crialese did a fantastic job with one...
13227    kite runner is undoubtedly one of the most ama...
Name: review, dtype: object

In [19]:
X_test[20163]

KeyError: 26389

In [20]:
vectorizer = CountVectorizer()
v = vectorizer.fit(X_train)
text_review = vectorizer.transform([X_test[20163]])
print(text_review)

  (0, 564)	1
  (0, 1695)	1
  (0, 1804)	1
  (0, 1957)	1
  (0, 2107)	2
  (0, 2507)	1
  (0, 2700)	4
  (0, 3347)	7
  (0, 3484)	3
  (0, 3626)	2
  (0, 3698)	1
  (0, 3796)	1
  (0, 3809)	1
  (0, 3884)	1
  (0, 4087)	25
  (0, 4425)	1
  (0, 4649)	1
  (0, 4652)	1
  (0, 5135)	4
  (0, 5355)	1
  (0, 5396)	1
  (0, 5560)	7
  (0, 5916)	1
  (0, 6166)	1
  (0, 6536)	1
  :	:
  (0, 88074)	7
  (0, 88151)	1
  (0, 88415)	1
  (0, 88739)	1
  (0, 89405)	3
  (0, 89491)	1
  (0, 89521)	1
  (0, 89854)	2
  (0, 89918)	1
  (0, 90056)	1
  (0, 90119)	1
  (0, 90318)	2
  (0, 90326)	2
  (0, 90551)	2
  (0, 90646)	1
  (0, 90869)	9
  (0, 91080)	1
  (0, 91196)	1
  (0, 91201)	1
  (0, 91239)	1
  (0, 91852)	1
  (0, 91864)	1
  (0, 92090)	1
  (0, 92094)	1
  (0, 92107)	1


In [15]:
clf.predict([X_test[20163]])

KeyError: 26389

In [53]:
df.iloc[26389]

review       Walter Matthau is best remembered for the long...
sentiment                                             positive
category                                                     1
Name: 26389, dtype: object

In [48]:
v.get_feature_names_out().shape

(92681,)

In [49]:
v.vocabulary_

{'jackie': 42888,
 'chan': 14377,
 'is': 42585,
 'considered': 17815,
 'by': 12421,
 'many': 50555,
 'film': 30248,
 'and': 4040,
 'martial': 50919,
 'arts': 5515,
 'movie': 54715,
 'fans': 29274,
 'as': 5547,
 'one': 58250,
 'of': 57920,
 'the': 82009,
 'greatest': 35272,
 'action': 2029,
 'stars': 77763,
 'ever': 28041,
 'to': 82972,
 'grace': 34987,
 'silver': 74506,
 'screen': 72090,
 'police': 62848,
 'story': 78400,
 'cemented': 14089,
 'his': 38403,
 'reputation': 68165,
 'likely': 47958,
 'successor': 79195,
 'late': 46848,
 'great': 35269,
 'bruce': 11635,
 'lee': 47249,
 'if': 40144,
 'enter': 27178,
 'dragon': 24630,
 'bared': 7413,
 'so': 75901,
 'called': 12725,
 'bench': 8487,
 'mark': 50779,
 'greatness': 35276,
 'in': 40721,
 '70s': 1099,
 'then': 82079,
 'same': 70839,
 'can': 12901,
 'be': 7892,
 'said': 70645,
 'about': 1614,
 '80s': 1148,
 'br': 10864,
 'forget': 31493,
 'rush': 70294,
 'hour': 39298,
 'trilogy': 84260,
 'or': 58512,
 'any': 4612,
 'us': 87119,
 'ef

In [None]:
clf_knn = Pipeline[((knn))]