In [1]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Read the tab-separated reviews file into a DataFrame
df = pd.read_csv(filepath_or_buffer="amazonreviews.tsv", sep="\t")

In [3]:
# Show the first five rows as a quick sanity check of the loaded columns
print(df.head(n=5))

  label                                             review
0   pos  Stuning even for the non-gamer: This sound tra...
1   pos  The best soundtrack ever to anything.: I'm rea...
2   pos  Amazing!: This soundtrack is my favorite music...
3   pos  Excellent Soundtrack: I truly like this soundt...
4   pos  Remember, Pull Your Jaw Off The Floor After He...


In [4]:
# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so wrapping
# it in print() only appends a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   10000 non-null  object
 1   review  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB
None


In [5]:
# Convert labels: 'pos' → 1, 'neg' → 0
# Series.map() turns every value that is missing from the dict into NaN. With only
# 'pos'/'neg' as keys, running this cell a second time (labels already 0/1) would
# silently wipe the whole column. The identity entries for 1 and 0 make the cell
# safe to re-run.
label_codes = {'pos': 1, 'neg': 0, 1: 1, 0: 0}
encoded_labels = df['label'].map(label_codes)

# Fail loudly on anything that is not a known label instead of passing NaN targets
# on to the train/test split and the models.
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected = df.loc[unmapped, 'label'].unique().tolist()
    raise ValueError(f"Unexpected label values: {unexpected}")

df['label'] = encoded_labels.astype('int64')

In [6]:
# Inputs are the raw review text; targets are the values of the 'label' column
X, y = df['review'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# TF-IDF featurizer: English stop words removed, vocabulary capped at 5000 terms
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Fit on the training reviews only, then transform them ...
X_train_vec = vectorizer.fit_transform(X_train)
# ... and apply the already-fitted vectorizer to the test reviews without refitting
X_test_vec = vectorizer.transform(X_test)

In [8]:
# Fit a logistic-regression classifier (library defaults) on the TF-IDF training matrix
log_model = LogisticRegression()
log_model.fit(X=X_train_vec, y=y_train)

In [9]:
# Classify the held-out reviews, then report overall accuracy and per-class metrics
y_pred_log = log_model.predict(X=X_test_vec)
print("Logistic Regression Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_log))
print("Classification Report:\n", classification_report(y_true=y_test, y_pred=y_pred_log))

Logistic Regression Accuracy: 0.8485
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85      1037
           1       0.84      0.84      0.84       963

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000



In [10]:
# Fit a linear-regression model on the same TF-IDF features, using the 0/1 labels
# as a numeric target
lin_model = LinearRegression()
lin_model.fit(X=X_train_vec, y=y_train)

In [11]:
# Continuous regression outputs for the test reviews
y_pred_lin = lin_model.predict(X=X_test_vec)
# Turn them into class labels: 1 when the output is at least 0.5, otherwise 0
y_pred_lin_class = (y_pred_lin >= 0.5).astype(int)

In [12]:
# Regression metrics use the raw continuous predictions; accuracy uses the
# thresholded 0/1 predictions
print("Linear Regression MSE:", mean_squared_error(y_true=y_test, y_pred=y_pred_lin))
print("Linear Regression R² Score:", r2_score(y_true=y_test, y_pred=y_pred_lin))
print("Converted Accuracy (Thresholded):", accuracy_score(y_true=y_test, y_pred=y_pred_lin_class))

Linear Regression MSE: 0.26445888360950814
Linear Regression R² Score: -0.05928569655661842
Converted Accuracy (Thresholded): 0.744


In [13]:
# Save model and vectorizer
# The fitted classifier and the fitted TF-IDF vectorizer are written to separate
# files. New text has to go through this same vectorizer before it can be passed
# to the classifier, so both files are needed together.
# joblib is imported with the other dependencies in the first cell rather than
# here, so all of the notebook's requirements are visible in one place.
joblib.dump(log_model, "logistic_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

['tfidf_vectorizer.pkl']