In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import re
# Fetch the NLTK resources used below: the English stop-word list, the
# 'punkt' tokenizer models (for word_tokenize) and WordNet (for
# WordNetLemmatizer).
# NOTE(review): recent NLTK releases may additionally need the 'punkt_tab'
# resource for word_tokenize — confirm against the installed version.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load the IMDB review dataset; the path is relative to the working directory.
DATASET_CSV='IMDB Dataset.csv'
data=pd.read_csv(DATASET_CSV)

In [None]:
# Peek at the first rows to confirm the data loaded as expected.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Check the class balance of the target column.
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
# Map the string sentiment labels to integer class ids for the classifier
# (the recorded output shows negative -> 0, positive -> 1).
le=LabelEncoder().fit(data['sentiment'])
data['sentiment_en']=le.transform(data['sentiment'])

In [None]:
# TF-IDF vectorizer restricted to single-word terms (unigrams).
tf_id=TfidfVectorizer(ngram_range=(1,1))


In [None]:
# Lazily-built resources shared by every preprocess() call.
_PREPROCESS_RESOURCES={}


def _get_preprocess_resources():
  """Return the (stop-word set, lemmatizer) pair, building it on first use."""
  if not _PREPROCESS_RESOURCES:
    # A frozenset gives constant-time membership tests; building the list and
    # the lemmatizer once avoids repeating that work for every review.
    _PREPROCESS_RESOURCES['stop_words']=frozenset(stopwords.words('english'))
    _PREPROCESS_RESOURCES['lemmatizer']=WordNetLemmatizer()
  return _PREPROCESS_RESOURCES['stop_words'],_PREPROCESS_RESOURCES['lemmatizer']


def preprocess(text):
  """Normalize one review into a space-joined string of lemmatized tokens.

  Steps: lowercase the text, replace HTML line breaks with a space, tokenize
  with NLTK's word_tokenize, drop English stop words, and lemmatize the
  remaining tokens with WordNetLemmatizer.

  Args:
    text: The raw review text (a str).

  Returns:
    The cleaned review as a single string with tokens separated by spaces.
  """
  text=text.lower()
  # Replace every <br>, <br/> or <br /> with a space. Substituting an empty
  # string would glue the words on either side of the break together, and
  # matching only the doubled form would leave single breaks in the text.
  text=re.sub(r'<br\s*/?>',' ',text)
  stop_words,lemmatizer=_get_preprocess_resources()
  text_tokens=[
    lemmatizer.lemmatize(word)
    for word in word_tokenize(text)
    if word not in stop_words
  ]
  return ' '.join(text_tokens)



In [None]:
# Clean every review with preprocess(). NOTE: this overwrites the raw text in
# place, so re-running this cell feeds already-cleaned text through
# preprocess() again.
data['review']=data['review'].apply(preprocess)

In [None]:
# Inspect the cleaned reviews alongside the encoded labels.
data.head()

Unnamed: 0,review,sentiment,sentiment_en
0,one reviewer mentioned watching 1 oz episode '...,positive,1
1,wonderful little production . filming techniqu...,positive,1
2,thought wonderful way spend time hot summer we...,positive,1
3,basically 's family little boy ( jake ) think ...,negative,0
4,petter mattei 's `` love time money '' visuall...,positive,1


In [None]:
# Learn the vocabulary and IDF weights, then build the document-term matrix.
# NOTE(review): the vectorizer is fitted on all reviews here, before the
# train/test split further down, so its IDF statistics also see the test
# documents.
review_vec=tf_id.fit_transform(data['review'])

In [None]:
# Report the matrix dimensions and a slice of the learned vocabulary.
vocabulary=tf_id.get_feature_names_out()
print(review_vec.shape)
print(vocabulary[1000:1500])

(50000, 97538)
['330am' '330mins' '332960073452' '333' '336th' '338' '33bc' '33it' '33m'
 '33rd' '34' '345' '346' '3462' '347' '34c' '34th' '35' '350' '3500' '351'
 '3516' '356' '357' '358' '35c' '35mins' '35mm' '35pm' '35s' '35th' '35yr'
 '36' '360' '360remake' '360s' '362' '36310' '365' '367' '36th' '37' '370'
 '3714' '372' '372nd' '373' '37449ing' '378' '37ad' '37c' '37pm' '38'
 '3850' '387' '38k' '38s' '38th' '39' '392' '395' '39811' '39d' '39th'
 '3a' '3am' '3bs' '3colours' '3d' '3dfx' '3dvd' '3h' '3hr' '3k' '3lbs'
 '3m' '3mins' '3p' '3p0' '3pm' '3po' '3rd' '3rds' '3rg' '3s' '3th' '3who'
 '3x' '3x5' '3yrs' '40' '400' '4000' '40000' '400000' '400lb' '401k' '405'
 '406' '409' '40am' '40k' '40kms' '40min' '40mins' '40mph' '40s'
 '40something' '40th' '40yr' '41' '42' '420' '425' '426' '428' '42m'
 '42nd' '43' '430' '44' '440' '4400' '442nd' '44c' '44mb' '44megabytes'
 '44yrs' '45' '450' '4500' '451' '454' '45am' '45ish' '45m' '45min'
 '45mins' '45pm' '45rpm' '45s' '45th' '46' '460' '4

In [None]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train,X_test,y_train,y_test=train_test_split(
    review_vec,
    data['sentiment_en'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
model=MultinomialNB()
model.fit(X_train,y_train)

In [None]:
# Predict sentiment labels for the held-out reviews.
y_pred=model.predict(X_test)

In [None]:
# Overall accuracy on the held-out set, plus the per-class metrics as a dict
# (output_dict=True) rather than a formatted string.
acc=accuracy_score(y_test,y_pred)
report=classification_report(y_test,y_pred,output_dict=True)

In [None]:
# Print a coarse verdict based on a 0.8 accuracy cut-off.
verdict=(
    "The model performs well with good accuracy."
    if acc > 0.8
    else "The model could be improved; consider tuning hyperparameters or using a different algorithm."
)
print(verdict)

The model performs well with good accuracy.


In [None]:
# Show the held-out accuracy rounded to two decimal places.
print(f"Accuracy: {acc:.2f}")

Accuracy: 0.87


In [None]:
# Per-class precision, recall and F1 on the held-out set.
print("Classification Report:",classification_report(y_test, y_pred),sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.87      4961
           1       0.88      0.85      0.86      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [None]:
# Classify an unseen, positive-leaning review. The text must go through the
# same preprocess() step as the training reviews before it is vectorized;
# otherwise its tokens do not line up with the vocabulary tf_id learned from
# the cleaned reviews.
sample_text="There’s something special about this film that stays with you long after the credits roll. It starts off quietly, but before you know it, you're fully absorbed in the world it creates. The characters feel so real, like people you might have known at some point, and their journeys are both heartwarming and thought-provoking. Without relying on big, flashy moments, the story manages to unfold in a way that's both surprising and satisfying. There's a kind of magic in the way the visuals and music work together, effortlessly drawing you in. By the end, you feel like you've experienced something truly memorable, the kind of film that lingers in your mind."
sample_tes=preprocess(sample_text)
# Vectorize the cleaned text (previously the raw text was passed here and the
# preprocess() result was never used).
vec_sample=tf_id.transform([sample_tes])
y_sample_pred=model.predict(vec_sample)
# predict() returns one label per input row; 1 is the encoded "positive" class.
if y_sample_pred[0]==1:
  print("Positive")
else:
  print("Negative")

Positive


In [None]:
# Classify an unseen, negative-leaning review. The text must go through the
# same preprocess() step as the training reviews before it is vectorized;
# otherwise its tokens do not line up with the vocabulary tf_id learned from
# the cleaned reviews.
sample_neg_text="While I had high hopes for this film based on its intriguing premise and the talent involved, it ultimately left me feeling a bit underwhelmed. The story had potential but seemed to meander without ever fully developing its characters or themes. At times, I found myself lost in the plot, struggling to connect with the motivations of the characters. The pacing felt uneven, with some scenes dragging on while others rushed by too quickly. Although there were moments of visual beauty, they couldn't quite compensate for the lack of emotional depth. Overall, it felt more like a collection of scenes than a cohesive narrative, and I walked away wishing it had delivered more"
sample_neg_tes=preprocess(sample_neg_text)
# Vectorize the cleaned text (previously the raw text was passed here and the
# preprocess() result was never used).
vec_sample_neg=tf_id.transform([sample_neg_tes])
y_sample_neg_pred=model.predict(vec_sample_neg)
# predict() returns one label per input row; 1 is the encoded "positive" class.
if y_sample_neg_pred[0]==1:
  print("Positive")
else:
  print("Negative")

Negative
