In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import pickle
import re

In [2]:
# Load the review dataset; later cells read its "review" and "sentiment" columns.
# The path is written with forward slashes, which resolve on Windows as well as
# on Linux/macOS. The previous backslash-separated raw string only worked on Windows.
df = pd.read_csv('../dataset/imbd_dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Print the raw text of the reviews labelled 1 and 1000, separated by a dashed rule.
print(df.loc[1, "review"], '-' * 20, df.loc[1000, "review"], sep='\n')

A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional 'dream' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell's murals decorating every surface) are terribly well done.
-

Convert the "sentiment" column of the dataframe `df` from the string labels "positive" and "negative" to the integers 1 and 0, respectively.


In [4]:
# Encode the sentiment labels as integers: "positive" -> 1, "negative" -> 0.
sentiment_codes = {'positive': 1, 'negative': 0}

# Only encode while the column still holds the text labels. Re-running this cell
# on an already-encoded column would otherwise turn every value into NaN, because
# 1 and 0 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    encoded = df['sentiment'].map(sentiment_codes)
    # Series.map yields NaN for values missing from the mapping; fail here with a
    # clear message instead of passing NaN labels on to model training.
    if encoded.isna().any():
        unexpected = df.loc[encoded.isna(), 'sentiment'].unique().tolist()
        raise ValueError(f"Unexpected sentiment labels: {unexpected}")
    df['sentiment'] = encoded.astype('int64')

df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


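Perform basic text cleaning on the `review` column of the dataframe `df`: strip HTML markup, remove punctuation and other non-letter characters, and convert the text to lowercase.
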

In [5]:
# Pre-compiled patterns for the cleaning steps. They are raw strings: the earlier
# non-raw '[,\.!?:()"]' pattern contained the invalid escape "\.", which Python
# reports as a SyntaxWarning from version 3.12 on.
_MARKUP_RE = re.compile(r'<.*?>')
_PUNCTUATION_RE = re.compile(r'[,.!?:()"]')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')


def clean_review(text):
    """Normalise one review to lowercase ASCII letters separated by spaces.

    Steps, in order:
      1. Replace each ``<...>`` markup tag (e.g. ``<br />``) with a space.
      2. Delete the punctuation characters ``, . ! ? : ( ) "``.
      3. Replace every remaining non-letter character (digits, apostrophes,
         hyphens, ...) with a space.
      4. Strip leading/trailing whitespace and lowercase the result.

    Stripping is done last so that spaces introduced by step 3 at either end
    of the text are removed as well.

    NOTE(review): step 2 deletes punctuation instead of replacing it with a
    space, so words written without a separating space ("good.bad") are glued
    together ("goodbad"). Kept as-is so the tokens match the recorded results.

    Args:
        text: The raw review string.

    Returns:
        The cleaned review string.
    """
    text = _MARKUP_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = _NON_LETTER_RE.sub(' ', text)
    return text.strip().lower()


# Clean every review in a single pass over the column.
df['review'] = df['review'].apply(clean_review)

In [6]:
# Print the reviews labelled 1 and 1000 again to inspect the effect of the cleaning.
print(df.loc[1, "review"], '-' * 20, df.loc[1000, "review"], sep='\n')

a wonderful little production   the filming technique is very unassuming  very old time bbc fashion and gives a comforting and sometimes discomforting sense of realism to the entire piece   the actors are extremely well chosen  michael sheen not only has got all the polari but he has all the voices down pat too you can truly see the seamless editing guided by the references to williams  diary entries not only is it well worth the watching but it is a terrificly written and performed piece a masterful production about one of the great master s of comedy and his life   the realism really comes home with the little things the fantasy of the guard which rather than use the traditional  dream  techniques remains solid then disappears it plays on our knowledge and our senses particularly with the scenes concerning orton and halliwell and the sets particularly of their flat with halliwell s murals decorating every surface are terribly well done
--------------------
this movie is awful i can t

In [7]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

# TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [8]:
# Train model
model = MultinomialNB()
model.fit(X_train_vec, y_train)

In [9]:
# Evaluate
y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.85      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [10]:
# Save model and vectorizer
with open(r'..\models\naive_bayes.pkl', 'wb') as f:
    pickle.dump(model, f)

with open(r'..\models\nb_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [11]:
# Load the saved model and vectorizer
with open(r'..\models\naive_bayes.pkl', 'rb') as f:
    model = pickle.load(f)

with open(r'..\models\nb_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# Predict
text = ["This movie was fantastic!"]
X = vectorizer.transform(text)
prediction = model.predict(X)
print(prediction)

[1]


In [12]:
# Score one sentence and report the predicted class with its probability.
user_input = 'This movie was fantastic!'

# Vectorize once and reuse the features for both the class prediction and the
# class probabilities (previously the same input was transformed twice).
user_features = vectorizer.transform([user_input])
proba = model.predict_proba(user_features)
label = model.predict(user_features)[0]

# Confidence is the highest class probability, expressed as a percentage.
confidence = round(float(proba[0].max()) * 100, 2)
print(f'Predicted sentiment: {label}')
print(f'Confidence: {confidence}%')

Predicted sentiment: 1
Confidence: 79.01%


In [13]:
# Load the saved model and vectorizer
with open(r'..\models\naive_bayes.pkl', 'rb') as f:
    model = pickle.load(f)

with open(r'..\models\nb_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# Predict with confidence score
text = ["This movie was terrible!"]
X = vectorizer.transform(text)

prediction = model.predict(X)[0]
confidence = model.predict_proba(X).max()

# Display results
label = "Positive" if prediction == 1 else "Negative"
print(f"Label: {label} | Confidence: {confidence:.2%}")

Label: Negative | Confidence: 88.71%
