In [1]:
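# Install required libraries (scikit-learn, nltk and joblib are also imported by this notebook)
# pip install datasets transformers torch flask scikit-learn nltk joblib
# NLTK data used by preprocess_text: the 'stopwords' corpus and the Punkt tokenizer models
# (e.g. nltk.download('stopwords'); nltk.download('punkt') -- newer NLTK releases may need 'punkt_tab')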

# Import necessary libraries
import re
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the IMDb reviews through the `datasets` library
dataset = load_dataset("imdb")

# Pull the review texts and their labels out of the train and test splits
train_split, test_split = dataset['train'], dataset['test']
X_train, y_train = train_split['text'], train_split['label']
X_test, y_test = test_split['text'], test_split['label']

# Preprocess text function
# Shared resources for preprocess_text, filled in on first use. Reading the
# stopword corpus and constructing the stemmer do not depend on the review
# being processed, so they are done once rather than once per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the (stopword set, stemmer) pair, creating it on the first call."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = PorterStemmer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_text(text):
    """Clean a raw review and return it as a space-joined string of stems.

    Steps, in order:
      1. Replace HTML-like tags (``<...>``) with a space.
      2. Replace every non-word character with a space.
      3. Collapse runs of whitespace into a single space.
      4. Tokenize with ``word_tokenize``.
      5. Drop tokens whose lowercase form is an English stopword.
      6. Stem the remaining tokens with a Porter stemmer.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        A single string containing the surviving stemmed tokens separated by
        one space; empty if no token survives.
    """
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags (non-greedy, stays on one line)
    text = re.sub(r'\W', ' ', text)  # Punctuation -> space, so contractions are split apart
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces

    stop_words, stemmer = _get_preprocess_resources()

    # Filter on the lowercase form, then stem; filtering happens before
    # stemming so the comparison is against the unstemmed word.
    return ' '.join(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token.lower() not in stop_words
    )

# Clean every review with preprocess_text; the raw text lists are replaced
X_train = list(map(preprocess_text, X_train))
X_test = list(map(preprocess_text, X_test))

# TF-IDF features: the vocabulary (capped at 5000 terms) is learned from the
# training reviews only, and the test reviews are projected onto it
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fit an L2-regularised logistic regression on the TF-IDF training features
model_lr = LogisticRegression(C=1.0, penalty='l2', max_iter=1000)
model_lr.fit(X_train_tfidf, y_train)

# Score the held-out test reviews against their true labels
y_pred_lr = model_lr.predict(X_test_tfidf)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred_lr)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# One line per metric, header first
print(
    "Logistic Regression Performance:",
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    sep="\n",
)


Logistic Regression Performance:
Accuracy: 0.8776
Precision: 0.8758560280299411
Recall: 0.87992
F1-Score: 0.8778833107191316


In [9]:
import joblib

# Persist the fitted classifier together with the fitted TF-IDF vectorizer.
# Both are needed at inference time: new text has to go through
# preprocess_text and this same vectorizer before model_lr can score it.
joblib.dump(value=model_lr, filename='model_lr.pkl')
joblib.dump(value=vectorizer, filename='vectorizer.pkl')

['vectorizer.pkl']