In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import sqlite3
import logging
from datasets import load_dataset

In [4]:
dataset = load_dataset("imdb")
train_data = dataset["train"]
test_data = dataset["test"]

Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 310092.89 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 394121.49 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 687784.75 examples/s]


In [5]:
train_df = pd.DataFrame(
    {"review_text": train_data["text"], "sentiment": train_data["label"]})
test_df = pd.DataFrame(
    {"review_text": test_data["text"], "sentiment": test_data["label"]})

In [6]:
train_df["sentiment"] = train_df["sentiment"].map(
    {0: "negative", 1: "positive"})
test_df["sentiment"] = test_df["sentiment"].map({0: "negative", 1: "positive"})

In [7]:
conn = sqlite3.connect("imdb_reviews.db")
cursor = conn.cursor()

In [8]:
cursor.execute("""
CREATE TABLE IF NOT EXISTS imdb_reviews (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    review_text TEXT NOT NULL,
    sentiment TEXT NOT NULL
)
""")

<sqlite3.Cursor at 0x143cc3340>

In [9]:
train_df.to_sql("imdb_reviews", conn, if_exists="replace", index=False)

25000

In [10]:
import re

In [11]:
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    return text

In [12]:
train_df["cleaned_review"] = train_df["review_text"].apply(clean_text)
test_df["cleaned_review"] = test_df["review_text"].apply(clean_text)

In [13]:
print("Sentiment Distribution:")
print(train_df["sentiment"].value_counts())

Sentiment Distribution:
sentiment
negative    12500
positive    12500
Name: count, dtype: int64


In [14]:
print("\nAverage Review Length:")
print(train_df.groupby("sentiment")[
      "cleaned_review"].apply(lambda x: x.str.len().mean()))


Average Review Length:
sentiment
negative    1236.21808
positive    1283.05656
Name: cleaned_review, dtype: float64


In [15]:
tfidf = TfidfVectorizer(max_features=5000)
X_train = tfidf.fit_transform(train_df["cleaned_review"])
X_test = tfidf.transform(test_df["cleaned_review"])

In [16]:
y_train = train_df["sentiment"]
y_test = test_df["sentiment"]

In [18]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [19]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.88308
Classification Report:
               precision    recall  f1-score   support

    negative       0.89      0.88      0.88     12500
    positive       0.88      0.89      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [20]:
import joblib
joblib.dump(model, "sentiment_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']

In [21]:
conn.close()