# Amazon Reviews — Pipeline Version (TF‑IDF + Logistic & Random Forest)
End‑to‑end scikit‑learn Pipelines for text classification using TF‑IDF. Includes Logistic Regression and Random Forest pipelines with shared preprocessing, evaluation, ROC curves, feature‑importance analysis, and save/load examples.

In [None]:
# === Imports ===
import pandas as pd
import numpy as np
import nltk, re, joblib
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report, roc_curve
import matplotlib.pyplot as plt

# Fetch the NLTK 'stopwords' corpus, which the text-cleaning cell reads via
# stopwords.words('english').
nltk.download('stopwords')


In [None]:
# === Load data (adjust path if needed) ===
csv_path = r'D:\NLP_sentiment\Reviews.csv'
df = pd.read_csv(csv_path)
print(df.shape)

# Binary target: a score above 3 is "positive", anything else is "negative".
# (A missing score compares False here; such rows are dropped just below.)
is_positive = df["Score"] > 3
df["Sentiment"] = is_positive.map({True: "positive", False: "negative"})

# Keep only the columns of interest, then discard rows that lack a summary
# or a score. The copy detaches the result from the original frame.
wanted_columns = ["Score", "Sentiment", "Summary", "Text"]
df = df[wanted_columns]
df = df.dropna(subset=["Summary", "Score"]).copy()
df.head(3)


In [None]:
# === Text cleaning ===
snow = SnowballStemmer('english')
# NOTE: the stopword list is loaded here, but clean_text does not filter with it.
stop = stopwords.words('english')

# Patterns are compiled once because clean_text runs on every document.
# Inside a character class '|' is a literal pipe, so the first pattern also
# turns '|' into a space (behaviour kept from the original expression).
_PUNCT_TO_SPACE_RE = re.compile(r'[?|!|.|,|)|(|\|/]')
# Characters deleted outright after stemming: ' " + | #
_PUNCT_TO_DROP_RE = re.compile(r"""['"+|#]""")


def clean_text(sentence):
    """Normalise one review string for TF-IDF vectorisation.

    Steps, in order:
      1. coerce the input to ``str`` and lower-case it;
      2. replace ``? ! . , ) ( | /`` with a space;
      3. split on whitespace and Snowball-stem every token;
      4. re-join with single spaces and delete any remaining
         ``' " + | #`` characters.

    Stopwords are not removed.

    Parameters
    ----------
    sentence : object
        Raw text; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned, stemmed text.
    """
    lowered = str(sentence).lower()
    spaced = _PUNCT_TO_SPACE_RE.sub(' ', lowered)
    stemmed = " ".join(snow.stem(token) for token in spaced.split())
    # The previous pattern literal here parsed as `str | str` and raised
    # TypeError on every call; a single valid character class replaces it.
    return _PUNCT_TO_DROP_RE.sub('', stemmed)


In [None]:
# === Train / Test split ===
# The pipelines receive the raw summaries; clean_text is applied by the vectorizer.
X, y = df["Summary"], df["Sentiment"]

# Hold out 20% of the rows for testing, stratified on the label so both
# splits keep the same class proportions. The seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
(X_train.shape, X_test.shape)


In [None]:
# === Build Pipelines ===
from sklearn.base import clone

# Shared TF-IDF configuration. This object is only an unfitted template:
# a Pipeline stores the estimator objects it is given without copying them,
# so handing the same instance to both pipelines would make them share one
# fitted vectorizer (fitting one pipeline would refit the other's step).
tfidf = TfidfVectorizer(
    preprocessor=clean_text,  # run same cleaning at train & inference
    ngram_range=(1, 2),
    max_features=50_000,
    min_df=2,
)

# 1) Logistic Regression pipeline (owns an independent copy of the vectorizer)
logreg_pipe = Pipeline([
    ("tfidf", clone(tfidf)),
    ("clf", LogisticRegression(
        solver="saga",
        class_weight="balanced",
        max_iter=2000,
        random_state=42,
    )),
])

# 2) Random Forest pipeline (owns an independent copy of the vectorizer)
# Note: RF on high-dimensional sparse TF-IDF can be heavy; we cap depth/features for speed.
rf_pipe = Pipeline([
    ("tfidf", clone(tfidf)),
    ("clf", RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        max_features="sqrt",
        class_weight="balanced_subsample",
        n_jobs=-1,
        random_state=42,
    )),
])


In [None]:
# === Train both models ===
# Each fit call runs the pipeline's TF-IDF step on the raw training summaries
# and then trains the pipeline's classifier on the resulting matrix.
logreg_pipe.fit(X_train, y_train)
rf_pipe.fit(X_train, y_train)


In [None]:
# === Evaluate ===
def evaluate_model(name, pipe, X_test, y_test):
    """Print test-set metrics for a fitted pipeline and return its ROC inputs.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    pipe : Pipeline
        Fitted pipeline whose final step is named ``'clf'`` and was trained
        with a ``'positive'`` class.
    X_test : sequence of str
        Raw texts to score.
    y_test : pandas.Series
        True labels; compared against the string ``"positive"``.

    Returns
    -------
    tuple
        ``(y_test_binary, y_prob)``: the 0/1 encoded true labels and the
        predicted probability of the ``'positive'`` class.
    """
    # predict_proba columns follow the classifier's classes_ order, so the
    # 'positive' column is looked up by label instead of assumed by position.
    class_labels = list(pipe.named_steps['clf'].classes_)
    positive_column = class_labels.index('positive')

    predicted_labels = pipe.predict(X_test)
    class_probabilities = pipe.predict_proba(X_test)
    y_prob = class_probabilities[:, positive_column]
    y_test_binary = (y_test == "positive").astype(int)

    accuracy = accuracy_score(y_test, predicted_labels)
    precision, recall, f_score, _ = precision_recall_fscore_support(
        y_test,
        predicted_labels,
        average="binary",
        pos_label="positive",
    )
    roc_auc = roc_auc_score(y_test_binary, y_prob)

    print(f"=== {name} ===")
    print(f"Accuracy: {accuracy:.3f} | Precision: {precision:.3f} | Recall: {recall:.3f} | F1: {f_score:.3f} | ROC-AUC: {roc_auc:.3f}")
    report = classification_report(y_test, predicted_labels)
    print("\nClassification report:\n", report)
    return y_test_binary, y_prob

# Evaluate both fitted pipelines on the held-out split; the returned binary
# labels and positive-class probabilities are reused for the ROC plot.
y_test_bin_lr, y_prob_lr = evaluate_model("Logistic Regression", logreg_pipe, X_test, y_test)
y_test_bin_rf, y_prob_rf = evaluate_model("Random Forest", rf_pipe, X_test, y_test)


In [None]:
# === Plot ROC curves side by side ===
# Curve coordinates and AUC values are computed first, then drawn on one Axes.
fpr1, tpr1, _ = roc_curve(y_test_bin_lr, y_prob_lr)
fpr2, tpr2, _ = roc_curve(y_test_bin_rf, y_prob_rf)
auc_lr = roc_auc_score(y_test_bin_lr, y_prob_lr)
auc_rf = roc_auc_score(y_test_bin_rf, y_prob_rf)

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(fpr1, tpr1, label=f"LogReg (AUC={auc_lr:.3f})")
ax.plot(fpr2, tpr2, label=f"RandomForest (AUC={auc_rf:.3f})")
# Diagonal reference line: performance of a random classifier.
ax.plot([0, 1], [0, 1], 'k--', label="Chance")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve — Pipelines")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Pull the fitted TF-IDF vocabulary out of each pipeline
lr_vectorizer = logreg_pipe.named_steps['tfidf']
rf_vectorizer = rf_pipe.named_steps['tfidf']
feat_names_lr = lr_vectorizer.get_feature_names_out()
feat_names_rf = rf_vectorizer.get_feature_names_out()

# Sanity check: the two pipelines are expected to expose the same vocabulary,
# so a single feature-name array can index both models' weights.
vocab_matches = np.array_equal(feat_names_lr, feat_names_rf)
assert vocab_matches, "TF-IDF vocab differs between pipelines."
feature_names = feat_names_lr
len(feature_names)

In [None]:
# --- Logistic Regression: top positive/negative coefficients ---
coefs = logreg_pipe.named_steps['clf'].coef_[0]

topN = 25  # how many to show
ascending_order = np.argsort(coefs)           # one sort, reused for both tails
top_pos_idx = ascending_order[-topN:][::-1]   # largest -> smallest
top_neg_idx = ascending_order[:topN]          # most negative -> less negative

# Tables pairing each selected n-gram with its coefficient
top_pos = pd.DataFrame({
    "feature": feature_names[top_pos_idx],
    "coef": coefs[top_pos_idx],
})
top_neg = pd.DataFrame({
    "feature": feature_names[top_neg_idx],
    "coef": coefs[top_neg_idx],
})

print("Top positive LR features (push to 'positive')")
display(top_pos)
print("\nTop negative LR features (push to 'negative')")
display(top_neg)

# Plot: one horizontal-bar panel per table, side by side. Rows are reversed
# so the first table row ends up at the top of its panel.
plt.figure(figsize=(8, 8))
panels = (
    (top_pos, "LR: Top Positive Coefficients"),
    (top_neg, "LR: Top Negative Coefficients"),
)
for panel_number, (table, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.barh(table["feature"][::-1], table["coef"][::-1])
    plt.title(panel_title)
    plt.tight_layout()
plt.show()

In [None]:
# --- Random Forest: top feature importances ---
rf_importances = rf_pipe.named_steps['clf'].feature_importances_
topN = 25
# Sort ascending, flip to descending, keep the first topN positions.
top_rf_idx = np.argsort(rf_importances)[::-1][:topN]

top_rf = pd.DataFrame({
    "feature": feature_names[top_rf_idx],
    "importance": rf_importances[top_rf_idx],
})

print("Top RF feature importances")
display(top_rf)

# Plot: rows are reversed so the most important feature is drawn at the top.
fig, ax = plt.subplots(figsize=(6, 8))
ax.barh(top_rf["feature"][::-1], top_rf["importance"][::-1])
ax.set_title("Random Forest: Top Feature Importances")
fig.tight_layout()
plt.show()

In [None]:
# === Save both pipelines ===
# NOTE: the TF-IDF step references clean_text as its preprocessor. Pickle
# stores plain functions by reference, so clean_text must be defined or
# importable under the same name wherever these files are loaded.
joblib.dump(logreg_pipe, 'logreg_text_pipeline.pkl')
joblib.dump(rf_pipe, 'rf_text_pipeline.pkl')

# === Load & test on a sample ===
loaded_lr = joblib.load('logreg_text_pipeline.pkl')
loaded_rf = joblib.load('rf_text_pipeline.pkl')

sample_text = "Very tasty and fresh"
# Run the same smoke test on each reloaded pipeline: predicted label plus the
# probability assigned to the 'positive' class.
for heading, model in (("LR Prediction:", loaded_lr), ("RF Prediction:", loaded_rf)):
    predicted_label = model.predict([sample_text])[0]
    sample_probabilities = model.predict_proba([sample_text])[0]
    positive_column = list(model.named_steps['clf'].classes_).index('positive')
    print(heading, predicted_label,
          " | Prob positive:", sample_probabilities[positive_column])
