In [1]:
import pandas as pd
import numpy as np

# Read the review dataset CSV (path is relative to the notebook's working
# directory) into a DataFrame used by the cells below.
df_cleaned = pd.read_csv(filepath_or_buffer='Dataset/IMDB_Dataset_Cleaned.csv')


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Features are the review strings; targets are the sentiment labels encoded
# as integers (positive -> 1, negative -> 0).
X = df_cleaned['review']
y = df_cleaned['sentiment'].map({'positive': 1, 'negative': 0})

# Series.map yields NaN for any value that is not a key of the mapping, and a
# missing review cannot be vectorized. Fail fast with a clear message instead
# of letting the problem surface later inside the split, the vectorizer, or
# the model.
if X.isna().any():
    raise ValueError(
        f"'review' column contains {int(X.isna().sum())} missing value(s)"
    )
if y.isna().any():
    unexpected_labels = df_cleaned.loc[y.isna(), 'sentiment'].unique().tolist()
    raise ValueError(
        f"'sentiment' column contains unexpected label(s): {unexpected_labels}"
    )

# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class balance, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the TF-IDF vocabulary on the training reviews only, then reuse that
# fitted vocabulary to transform the test reviews (no test data is seen
# during fitting). The vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
print("Training feature matrix shape:", X_train_vec.shape)
print("Testing feature matrix shape:", X_test_vec.shape)



Training feature matrix shape: (40000, 5000)
Testing feature matrix shape: (10000, 5000)


In [3]:
import joblib

# Serialize the fitted vectorizer to a file in the current working directory.
joblib.dump(
    value=vectorizer,
    filename='tfidf_vectorizer.joblib',
)
print("Vectorizer saved successfully!")


Vectorizer saved successfully!


In [4]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train an XGBoost classifier on the TF-IDF training matrix.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and only emits a "Parameters ... are not used" warning.
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train_vec, y_train)

# Evaluate on the held-out test matrix.
y_pred = model.predict(X_test_vec)
print(classification_report(y_test, y_pred))
# accuracy_score avoids comparing a NumPy array against a pandas Series by hand.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

           0       0.86      0.83      0.85      5000
           1       0.84      0.87      0.85      5000

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

Accuracy: 0.8512
Confusion Matrix:
[[4166  834]
 [ 654 4346]]


In [5]:
# Write the trained classifier to a .json file in the current working directory.
model.save_model(fname='review_sentiment_model.json')