In [None]:
# Preprocess SIA reviews with 70/10/20 split (train/val/test), combine title+text, clean, stopwords, TF-IDF
import pandas as pd
import numpy as np
import re
from pathlib import Path
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz
import pickle

# ---------- Load data ----------
# NOTE: absolute, machine-specific location of the raw review export.
in_path = Path(r"C:\Users\User\OneDrive\Y3S1\BT4222\project\singapore_airlines_reviews.csv")
df = pd.read_csv(in_path)
# Trim surrounding whitespace from every header so the columns can be
# addressed by their bare names below.
df.columns = [header.strip() for header in df.columns]

# ---------- Combine title + text ----------
# Missing titles/bodies are treated as empty strings; the final strip removes
# the separator space left dangling when either side is empty.
df["combined_review"] = (
    df["title"].fillna("") + " " + df["text"].fillna("")
).str.strip()

# ---------- Proxy sentiment from rating ----------
def map_rating_to_sentiment(x):
    """Map a star rating to a coarse sentiment label.

    Ratings of 2 or lower are "negative", ratings of 4 or higher are
    "positive", and everything in between is "neutral".

    Args:
        x: The rating; anything ``float()`` accepts (a number or a numeric
            string).

    Returns:
        "negative", "neutral" or "positive", or ``np.nan`` when the rating is
        missing (NaN/None) or cannot be parsed as a number.
    """
    try:
        r = float(x)
    except (TypeError, ValueError, OverflowError):
        return np.nan
    # NaN parses fine but fails both comparisons below, so without this guard
    # a missing rating would silently be labelled "neutral".
    if np.isnan(r):
        return np.nan
    if r <= 2:
        return "negative"
    if r >= 4:
        return "positive"
    return "neutral"

# Label every review with the sentiment implied by its star rating.
df["sentiment_proxy"] = df.loc[:, "rating"].apply(map_rating_to_sentiment)

# ---------- Cleaner ----------
# Text emoticons such as :) ;-( =D :P (optional "nose" dash).
emoji_pattern = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
# Anything that looks like an HTML/XML tag.
tag_pattern   = re.compile(r'<[^>]*>')
# Runs of non-word characters (punctuation, whitespace, ...).
nonword_pat   = re.compile(r'[\W]+')


def preprocessor(text: str) -> str:
    """Normalise one review into lower-cased, punctuation-free text.

    Tags are removed first, then every run of non-word characters is collapsed
    to a single space. Emoticons found in the tag-free text are appended to
    the end (with any "nose" dash removed) so they survive the punctuation
    stripping.

    Args:
        text: The raw review. Non-string input is converted with ``str()``;
            missing values (as judged by ``pd.isna``) become the empty string.

    Returns:
        The cleaned text, followed by the space-separated emoticons if any
        were present.
    """
    if isinstance(text, str):
        raw = text
    elif pd.isna(text):
        raw = ""
    else:
        raw = str(text)

    without_tags = tag_pattern.sub('', raw)
    # Collect emoticons before lower-casing/stripping destroys them.
    emoticons = [face.replace('-', '') for face in emoji_pattern.findall(without_tags)]
    cleaned = nonword_pat.sub(' ', without_tags.lower()).strip()

    if not emoticons:
        return cleaned
    return cleaned + ' ' + ' '.join(emoticons)

# ---------- 70/10/20 split using row indices ----------
# Row positions are split (not the frame itself) so the resulting assignment
# can be written back onto df afterwards.
idx_all = np.arange(len(df))
y = df["sentiment_proxy"].values
# Stratify on the label only when every row has one; otherwise fall back to a
# plain random split.
strat_all = None if pd.isna(y).any() else y

# First cut: hold out 20% of all rows as the test set, leaving 80% train+val.
idx_trval, idx_test, y_trval, y_test = train_test_split(
    idx_all,
    y,
    test_size=0.20,
    stratify=strat_all,
    random_state=42,
)

# Second cut: validation should be 10% of the *total*, i.e. 0.10 / 0.80 = 12.5%
# of the remaining train+val rows.
val_frac = 0.10 / 0.80
strat_trval = None if pd.isna(y_trval).any() else y_trval

idx_train, idx_val, y_train, y_val = train_test_split(
    idx_trval,
    y_trval,
    test_size=val_frac,
    stratify=strat_trval,
    random_state=42,
)

# ---------- Assign split labels back to df ----------
# idx_val / idx_test are row *positions* (they were drawn from
# np.arange(len(df))), so they are written back positionally with .iloc.
# Label-based .loc only agrees with positions while df keeps its default
# 0..n-1 index, and would mislabel rows (or raise) after any filtering.
df["split"] = "train"
split_col_pos = df.columns.get_loc("split")
df.iloc[idx_val, split_col_pos] = "val"
df.iloc[idx_test, split_col_pos] = "test"

# (Optional) Save the dataset with split labels for downstream use
# NOTE: absolute, machine-specific output folder; created if missing.
out_dir = Path(r"C:\Users\User\OneDrive\Y3S1\BT4222\project\sia_preproc_70_10_20")
out_dir.mkdir(exist_ok=True, parents=True)
df.to_csv(out_dir / "sia_reviews_with_split.csv", index=False)

# ---------- TF-IDF (fit on train only) ----------
# The idx_* arrays hold row positions, so rows are selected with .iloc rather
# than label-based .loc (the two only coincide under the default 0..n-1 index).
review_text = df["combined_review"]
X_train_text = review_text.iloc[idx_train].astype(str).values
X_val_text   = review_text.iloc[idx_val].astype(str).values
X_test_text  = review_text.iloc[idx_test].astype(str).values

# Unigrams + bigrams; terms must occur in at least 2 training documents and in
# at most 95% of them. Cleaning/lower-casing is delegated to `preprocessor`.
# NOTE(review): the default token_pattern keeps only tokens of two or more
# word characters, so the emoticons appended by `preprocessor` presumably never
# become features - confirm, and pass a custom token_pattern if they should.
vectorizer = TfidfVectorizer(
    preprocessor=preprocessor,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
)
# Vocabulary and IDF weights are learned from the training rows only; the
# validation and test rows are merely projected onto that vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_val_tfidf   = vectorizer.transform(X_val_text)
X_test_tfidf  = vectorizer.transform(X_test_text)

# ---------- Save artefacts ----------
# Sparse TF-IDF matrices, one .npz per split. save_npz is already imported at
# the top of the file, so it is not re-imported here.
save_npz(out_dir / "X_train_tfidf.npz", X_train_tfidf)
save_npz(out_dir / "X_val_tfidf.npz", X_val_tfidf)
save_npz(out_dir / "X_test_tfidf.npz", X_test_tfidf)

# Fitted vectorizer, so new text can be projected onto the same vocabulary.
# NOTE: pickle stores the custom `preprocessor` function by reference, so a
# function with that name must be defined/importable wherever this file is
# loaded. Only unpickle files from a trusted source.
with open(out_dir / "tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

# Human-readable per-split tables: row position in df, raw combined text, label.
pd.DataFrame({"index": idx_train, "text": X_train_text, "label": y_train}).to_csv(out_dir / "train_split.csv", index=False)
pd.DataFrame({"index": idx_val,   "text": X_val_text,   "label": y_val}).to_csv(out_dir / "val_split.csv", index=False)
pd.DataFrame({"index": idx_test,  "text": X_test_text,  "label": y_test}).to_csv(out_dir / "test_split.csv", index=False)

# Sanity-check report: split sizes, vocabulary size, class balance per split
# and the location of every file written above.
summary = {
    "n_total": int(len(df)),
    "n_train": int(len(idx_train)),
    "n_val":   int(len(idx_val)),
    "n_test":  int(len(idx_test)),
    "n_features": int(X_train_tfidf.shape[1]),
    "train_class_counts": Counter(y_train),
    "val_class_counts":   Counter(y_val),
    "test_class_counts":  Counter(y_test),
    "files": {
        "dataset_with_split": str(out_dir / "sia_reviews_with_split.csv"),
        "X_train_tfidf": str(out_dir / "X_train_tfidf.npz"),
        "X_val_tfidf":   str(out_dir / "X_val_tfidf.npz"),
        "X_test_tfidf":  str(out_dir / "X_test_tfidf.npz"),
        "tfidf_vectorizer.pkl": str(out_dir / "tfidf_vectorizer.pkl"),
        "train_split.csv": str(out_dir / "train_split.csv"),
        "val_split.csv":   str(out_dir / "val_split.csv"),
        "test_split.csv":  str(out_dir / "test_split.csv"),
    }
}
print(summary)


{'n_total': 10000, 'n_train': 7000, 'n_val': 1000, 'n_test': 2000, 'n_features': 42199, 'train_class_counts': Counter({'positive': 5174, 'negative': 1120, 'neutral': 706}), 'val_class_counts': Counter({'positive': 739, 'negative': 160, 'neutral': 101}), 'test_class_counts': Counter({'positive': 1478, 'negative': 320, 'neutral': 202}), 'files': {'dataset_with_split': 'C:\\Users\\User\\OneDrive\\Y3S1\\BT4222\\project\\sia_preproc_70_10_20\\sia_reviews_with_split.csv', 'X_train_tfidf': 'C:\\Users\\User\\OneDrive\\Y3S1\\BT4222\\project\\sia_preproc_70_10_20\\X_train_tfidf.npz', 'X_val_tfidf': 'C:\\Users\\User\\OneDrive\\Y3S1\\BT4222\\project\\sia_preproc_70_10_20\\X_val_tfidf.npz', 'X_test_tfidf': 'C:\\Users\\User\\OneDrive\\Y3S1\\BT4222\\project\\sia_preproc_70_10_20\\X_test_tfidf.npz', 'tfidf_vectorizer.pkl': 'C:\\Users\\User\\OneDrive\\Y3S1\\BT4222\\project\\sia_preproc_70_10_20\\tfidf_vectorizer.pkl', 'train_split.csv': 'C:\\Users\\User\\OneDrive\\Y3S1\\BT4222\\project\\sia_preproc_70_