In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# Load IMDb dataset (the code below reads the `review` and `sentiment` columns)
df = pd.read_csv("IMDB Dataset/IMDB Dataset.csv")
print(df.head())

# Convert labels to binary: positive -> 1, negative -> 0.
# Series.map yields NaN for every value that is missing from the mapping, so
# validate the result before overwriting the column; otherwise a stray label
# (different casing, typo, blank cell) only surfaces later as an unrelated
# "y contains NaN" failure when the classifier is fitted.
sentiment_codes = df['sentiment'].map({'positive': 1, 'negative': 0})
if sentiment_codes.isna().any():
    unexpected = df.loc[sentiment_codes.isna(), 'sentiment'].unique().tolist()
    raise ValueError(
        "Unexpected sentiment labels (expected 'positive' or 'negative'): "
        f"{unexpected[:5]}"
    )
df['sentiment'] = sentiment_codes

# Train-test split: 20% held out, fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=42
)

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [8]:
# Build TF-IDF features: English stop words removed, unigrams and bigrams,
# vocabulary capped at 10,000 features. The vectorizer is fitted on the
# training split only and then reused, unchanged, on the test split.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=10000,
    stop_words='english',
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Fit the classifier; fit() returns the estimator itself, so it can be chained
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Report held-out accuracy, then the per-class precision/recall/F1 table
print("Model accuracy on test set:", model.score(X_test_tfidf, y_test))
print(classification_report(y_test, model.predict(X_test_tfidf)))

Model accuracy on test set: 0.8938
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.88      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [9]:
# Test with custom input
def predict_sentiment(review_text):
    """Classify a single review as "positive" or "negative".

    The text is wrapped in a one-element list, transformed with the
    notebook-level ``vectorizer`` and scored with the notebook-level
    ``model``; both must already be defined when this function is called.

    Parameters
    ----------
    review_text
        The review to classify.

    Returns
    -------
    str
        "positive" when the predicted label equals 1, otherwise "negative".
    """
    features = vectorizer.transform([review_text])
    # predict() returns one label per input row; only one row was submitted
    predicted_label = model.predict(features)[0]
    if predicted_label == 1:
        return "positive"
    return "negative"

# Smoke-test the helper on an obviously negative review
test_review = "This movie was absolutely horrible with weak acting and a boring story!"
print("Sentiment:", predict_sentiment(review_text=test_review))

Sentiment: negative


In [10]:
import joblib

# Persist the fitted classifier and the fitted vectorizer as separate files
joblib.dump(value=model, filename="sentiment_model.pkl")
joblib.dump(value=vectorizer, filename="tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']