In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Train a TF-IDF + logistic-regression sentiment classifier on the IMDB review
# CSV, print hold-out metrics, and persist the fitted model and vectorizer.

# Load the data (expects 'review' and 'sentiment' columns, which are read below)
data = pd.read_csv("IMDBDataset.csv")

# Pre-Processing: lower-case, then delete every character that is not an ASCII
# letter or whitespace. The pattern is a raw string so that `\s` is handed to
# the regex engine verbatim instead of being parsed as an (invalid) Python
# string escape, which triggers a warning on current interpreters.
# NOTE(review): characters are deleted rather than replaced by a space, so
# words separated only by punctuation or markup get glued together - confirm
# this is intended.
data['review'] = (
    data['review']
    .str.lower()
    .replace(r'[^a-zA-Z\s]', '', regex=True)
)

# Split the data: 80/20 hold-out with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['review'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Vectorization: the vocabulary and IDF weights are fitted on the training
# split only; the test split is transformed with that fitted state.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Model building and training
model = LogisticRegression()
model.fit(X_train_vec, y_train)

# Model evaluation on the held-out split
y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Serialize and save the trained model together with the fitted vectorizer;
# both are needed to score new text later.
model_filename = "sentiment_model.joblib"
vectorizer_filename = "tfidf_vectorizer.joblib"

joblib.dump(model, model_filename)
joblib.dump(vectorizer, vectorizer_filename)

print(f"Model and vectorizer saved as {model_filename} and {vectorizer_filename}")


Accuracy: 0.8987
Classification Report:
               precision    recall  f1-score   support

    negative       0.91      0.89      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

Model and vectorizer saved as sentiment_model.joblib and tfidf_vectorizer.joblib


In [6]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Train a TF-IDF + linear-kernel SVC sentiment classifier on the IMDB review
# CSV, print hold-out metrics, and persist the fitted model and vectorizer
# under file names distinct from the logistic-regression artifacts.

# Load the data (expects 'review' and 'sentiment' columns, which are read below)
data = pd.read_csv("IMDBDataset.csv")

# Preprocessing: delete every character that is not an ASCII letter or
# whitespace. The pattern is a raw string so that `\s` is handed to the regex
# engine verbatim instead of being parsed as an (invalid) Python string escape.
# No explicit lower-casing happens here; TfidfVectorizer lower-cases its input
# by default (lowercase=True), so tokens are still case-folded.
data['review'] = data['review'].replace(r'[^a-zA-Z\s]', '', regex=True)

# Split the data: 80/20 hold-out with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data['review'],
    data['sentiment'],
    test_size=0.2,
    random_state=42,
)

# Vectorization: the vocabulary and IDF weights are fitted on the training
# split only; the test split is transformed with that fitted state.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Model building and training
model = SVC(kernel='linear', C=1.0, random_state=42)  # Linear kernel for text data
model.fit(X_train_vec, y_train)

# Model evaluation on the held-out split
y_pred = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Serialize and save the trained model together with the fitted vectorizer;
# both are needed to score new text later.
model_filename = "sentiment_model2.joblib"
vectorizer_filename = "tfidf_vectorizer2.joblib"

joblib.dump(model, model_filename)
joblib.dump(vectorizer, vectorizer_filename)

print(f"Model and vectorizer saved as {model_filename} and {vectorizer_filename}")


Accuracy: 0.9068
Classification Report:
               precision    recall  f1-score   support

    negative       0.91      0.90      0.91      4961
    positive       0.90      0.92      0.91      5039

    accuracy                           0.91     10000
   macro avg       0.91      0.91      0.91     10000
weighted avg       0.91      0.91      0.91     10000

Model and vectorizer saved as sentiment_model2.joblib and tfidf_vectorizer2.joblib
