In [1]:
#imports
from google.colab import drive
import pandas as pd
import numpy as np
import re
from pathlib import Path
from collections import Counter
import pickle
import csv

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz

In [None]:
#PROCESS SKYTRAX REVIEWS
df1 = pd.read_csv("data/skytrax_reviews.csv")
df2 = pd.read_csv("data/skytrax_reviews2.csv")
df3 = pd.read_csv("data/skytrax_reviews3.csv")
df4 = pd.read_csv("data/skytrax_reviews4.csv")
df_skytrax = pd.concat([df1, df2, df3, df4], ignore_index=True)

df_skytrax["text"] = df_skytrax["body"].apply(lambda x: re.sub(r"^.*\|\s*", "", str(x)).strip())
df_skytrax["title"] = df_skytrax["title"].apply(lambda x: re.sub(r'["“”]', '', str(x)).strip())
df_skytrax["combined_review"] = df_skytrax["text"]

#Normalize rating and create sentiment proxy
df_skytrax["rating_normalized"] = (df_skytrax["rating_value_10"] / 2).round().astype(int).clip(1, 5)

def label_sentiment(r):
    if r <= 2: return "negative"
    elif r == 3: return "neutral"
    else: return "positive"

df_skytrax["sentiment_proxy"] = df_skytrax["rating_normalized"].apply(label_sentiment)

#Clean up quotation marks
df_skytrax["combined_review"] = (
    df_skytrax["combined_review"]
    .astype(str)
    .str.strip()
    .str.replace(r'^"+|"+$', '', regex=True)
)

#Drop NaNs and duplicates
df_skytrax.dropna(subset=["combined_review", "sentiment_proxy"], inplace=True)
df_skytrax.drop_duplicates(subset=["combined_review"], inplace=True)

df_skytrax_final = df_skytrax[["combined_review", "sentiment_proxy"]].copy()

Mounted at /content/drive


In [None]:
#process SIA reviews from kaggle
df_sia = pd.read_csv("data/singapore_airlines_reviews.csv")
df_sia.columns = [c.strip() for c in df_sia.columns]
df_sia["combined_review"] = (df_sia["title"].fillna("") + " " + df_sia["text"].fillna("")).str.strip()
def map_rating_to_sentiment(x):
  try:
    r = float(x)
  except Exception:
    return np.nan
  if r <= 2: return "negative"
  if r >= 4: return "positive"
  return "neutral"

df_sia["sentiment_proxy"] = df_sia["rating"].apply(map_rating_to_sentiment)
#Drop NaNs and duplicates
df_sia.dropna(subset=["combined_review", "sentiment_proxy"], inplace=True)
df_sia.drop_duplicates(subset=["combined_review"], inplace=True)

df_sia_final = df_sia[["combined_review", "sentiment_proxy"]].copy()

In [4]:
# Concatenate Dataframes
df = pd.concat([df_skytrax_final, df_sia_final], ignore_index=True)

df.dropna(inplace=True)
df.drop_duplicates(subset=["combined_review"], inplace=True)
df.reset_index(drop=True, inplace=True)

print(f"All data combined and cleaned. Final DataFrame shape: {df.shape}")
print(df["sentiment_proxy"].value_counts())


All data combined and cleaned. Final DataFrame shape: (11665, 2)
sentiment_proxy
positive    8462
negative    2095
neutral     1108
Name: count, dtype: int64


In [5]:
#DATA SPLIT & PREPROCESSING
emoji_pattern = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')
tag_pattern   = re.compile(r'<[^>]*>')
nonword_pat   = re.compile(r'[\W]+')

def preprocessor(text: str) -> str:
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)
    text = tag_pattern.sub('', text)
    emojis = emoji_pattern.findall(text)
    text = nonword_pat.sub(' ', text.lower()).strip()
    if emojis:
        text = text + ' ' + ' '.join(e.replace('-', '') for e in emojis)
    return text

# 70/10/20 split
idx_all = np.arange(len(df))
y = df["sentiment_proxy"].values
# Check for NaNs in y before stratifying (though we already dropped them)
strat_all = y if pd.Series(y).notna().all() else None

if strat_all is None:
    print("Warning: NaNs found in target variable 'y'. Proceeding without stratification.")

# 80% train+val vs 20% test
idx_trval, idx_test, y_trval, y_test = train_test_split(
    idx_all, y,
    test_size=0.20, random_state=42, stratify=strat_all
)

# 10% val vs 70% train (from the 80% trval)
val_frac = 0.10 / 0.80  # 0.125
strat_trval = y_trval if pd.Series(y_trval).notna().all() else None

idx_train, idx_val, y_train, y_val = train_test_split(
    idx_trval, y_trval,
    test_size=val_frac, random_state=42, stratify=strat_trval
)

# Add split column back to the dataframe for reference
df["split"] = "train"
df.loc[idx_val, "split"] = "val"
df.loc[idx_test, "split"] = "test"

print("Data successfully split into 70% train, 10% val, 20% test.")

# Get text data for vectorization
X_train_text = df.loc[idx_train, "combined_review"].astype(str).values
X_val_text   = df.loc[idx_val, "combined_review"].astype(str).values
X_test_text  = df.loc[idx_test, "combined_review"].astype(str).values

# Initialize and fit TF-IDF Vectorizer
vectorizer = TfidfVectorizer(
    preprocessor=preprocessor,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95
)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_val_tfidf   = vectorizer.transform(X_val_text)
X_test_tfidf  = vectorizer.transform(X_test_text)

print(f"TF-IDF vectorization complete. Vocabulary size: {X_train_tfidf.shape[1]}")

Data successfully split into 70% train, 10% val, 20% test.
TF-IDF vectorization complete. Vocabulary size: 51228


In [None]:
#Save
out_dir = Path("data/combined_split")
out_dir.mkdir(exist_ok=True, parents=True)

# Save the dataset with split labels
df.to_csv(out_dir / "combined_reviews_with_split.csv", index=False)

# Save split dataframes
pd.DataFrame({"index": idx_train, "text": X_train_text, "label": y_train}).to_csv(out_dir / "train_split.csv", index=False)
pd.DataFrame({"index": idx_val, "text": X_val_text,   "label": y_val}).to_csv(out_dir / "val_split.csv", index=False)
pd.DataFrame({"index": idx_test, "text": X_test_text,  "label": y_test}).to_csv(out_dir / "test_split.csv", index=False)

# Save TF-IDF matrices and vectorizer
save_npz(out_dir / "X_train_tfidf.npz", X_train_tfidf)
save_npz(out_dir / "X_val_tfidf.npz", X_val_tfidf)
save_npz(out_dir / "X_test_tfidf.npz", X_test_tfidf)

with open(out_dir / "tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print(f"All artifacts saved to '{out_dir}' directory.")

In [6]:
#Summary
summary = {
    "n_total": int(len(df)),
    "n_train": int(len(idx_train)),
    "n_val":   int(len(idx_val)),
    "n_test":  int(len(idx_test)),
    "n_features": int(X_train_tfidf.shape[1]),
    "train_class_counts": Counter(y_train),
    "val_class_counts":   Counter(y_val),
    "test_class_counts":  Counter(y_test)
}
print(summary)

{'n_total': 11665, 'n_train': 8165, 'n_val': 1167, 'n_test': 2333, 'n_features': 51228, 'train_class_counts': Counter({'positive': 5923, 'negative': 1467, 'neutral': 775}), 'val_class_counts': Counter({'positive': 847, 'negative': 209, 'neutral': 111}), 'test_class_counts': Counter({'positive': 1692, 'negative': 419, 'neutral': 222})}
