In [68]:
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

In [69]:
# Read the pre-cleaned review dataset from the working directory.
df = pd.read_csv(filepath_or_buffer='clean_data.csv')

# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,review_id,review,sentiment,cleaned_review,review_length
0,44805,One of the other reviewers has mentioned that ...,positive,one reviewer mentioned watching 1 oz episode y...,1761
1,44806,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...,998
2,44807,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...,926
3,44808,Basically there's a family where a little boy ...,negative,basically there family little boy jake think t...,748
4,44809,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...,1317


In [70]:
# Model input is the cleaned review text; the target is the sentiment label.
X, y = df['cleaned_review'], df['sentiment']

In [71]:
# Encode the string sentiment labels as integer class ids.
#
# The encoder is fitted on the raw `df['sentiment']` column rather than on `y`,
# so this cell is idempotent: re-running it always refits on the original
# label strings. Fitting on `y` would, on a second run, refit on the
# already-encoded integers and `encoder.classes_` would no longer hold the
# original labels needed by `encoder.inverse_transform`.
encoder = LabelEncoder()
y = encoder.fit_transform(df['sentiment'])

In [72]:
# Show the first five inputs next to their encoded labels.
(X[:5], y[:5])

(0    one reviewer mentioned watching 1 oz episode y...
 1    wonderful little production filming technique ...
 2    thought wonderful way spend time hot summer we...
 3    basically there family little boy jake think t...
 4    petter matteis love time money visually stunni...
 Name: cleaned_review, dtype: object,
 array([1, 1, 1, 0, 1]))

# Train Test Split

In [73]:
# Hold out 20% of the rows for testing. The split is stratified on the encoded
# labels and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [74]:
# Report how many samples ended up in each partition, one per line.
print(
    f"Training set size: {len(X_train)}",
    f"Test set size: {len(X_test)}",
    sep="\n",
)

Training set size: 40000
Test set size: 10000


In [75]:
# Two-step pipeline: TF-IDF features from the raw text, followed by a
# logistic-regression classifier allowed up to 1000 solver iterations.
model = make_pipeline(
    TfidfVectorizer(),
    LogisticRegression(max_iter=1000),
)

In [76]:
# Fit the vectorizer and the classifier on the training partition only.
model.fit(X=X_train, y=y_train)

In [77]:
# Predictions on the training data, used below for the training metrics.
y_train_pred = model.predict(X=X_train)

In [78]:
# Score the fitted pipeline against the labels it was trained on.
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision, train_recall, train_f1, _ = precision_recall_fscore_support(
    y_train,
    y_train_pred,
    average='binary',
)

# Emit all four metrics with a single print call, one metric per line.
print(
    f"\nTraining Accuracy: {train_accuracy:.4f}",
    f"Training Precision: {train_precision:.4f}",
    f"Training Recall: {train_recall:.4f}",
    f"Training F1-score: {train_f1:.4f}",
    sep="\n",
)


Training Accuracy: 0.9324
Training Precision: 0.9259
Training Recall: 0.9401
Training F1-score: 0.9329


In [79]:
# Predictions on the held-out partition, used below for the test metrics.
y_test_pred = model.predict(X=X_test)

In [80]:
# Score the fitted pipeline against the held-out labels.
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(
    y_test,
    y_test_pred,
    average='binary',
)

# Emit all four metrics with a single print call, one metric per line.
print(
    f"\nTest Accuracy: {test_accuracy:.4f}",
    f"Test Precision: {test_precision:.4f}",
    f"Test Recall: {test_recall:.4f}",
    f"Test F1-score: {test_f1:.4f}",
    sep="\n",
)


Test Accuracy: 0.8943
Test Precision: 0.8847
Test Recall: 0.9068
Test F1-score: 0.8956


In [83]:
# Persist the fitted pipeline (vectorizer + classifier) to disk so it can be
# reloaded later with `joblib.load`. `joblib` is imported in the notebook's
# top import cell alongside the other dependencies.
# NOTE(review): only `model` is written; the LabelEncoder that maps the integer
# predictions back to label strings is not part of this file.
joblib.dump(model, 'sentiment_model.joblib')

['sentiment_model.joblib']