In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import joblib  # To save the trained model

# Load the IMDB dataset
data = pd.read_csv('imdb_top_1000(1).csv')

# Preprocess the text data
# For simplicity, we'll use TF-IDF Vectorizer for text vectorization
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(data['review'])
y = data['sentiment']  # Use the original sentiment labels ('positive' or 'negative')

# Map sentiment labels to integers
y = y.map({'positive': 1, 'negative': 0})

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the LinearSVC model
model = LinearSVC()

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Classification report (precision, recall, F1-score)
print(classification_report(y_test, y_pred))

# Optionally, save the trained model using joblib
joblib.dump(model, 'imdb_sentiment_model.pkl')


Accuracy: 0.8
              precision    recall  f1-score   support

           0       0.82      0.79      0.80       104
           1       0.78      0.81      0.80        96

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.80       200
weighted avg       0.80      0.80      0.80       200





['imdb_sentiment_model.pkl']