In [1]:
# ------------------------------------------------------------------------------
# Imports & constants
# ------------------------------------------------------------------------------
from pathlib import Path
import json

import numpy as np
import pandas as pd
from scipy import sparse
import joblib

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import mutual_info_classif, chi2

# Single seed reused by every stochastic step in this notebook (train/test split,
# majority-class undersampling, row shuffling, mutual-information scoring).
RANDOM_STATE = 123


In [2]:
# ------------------------------------------------------------------------------
# Load preprocessed data (from Notebook 01)
# ------------------------------------------------------------------------------
data_path = Path("../data/clean_data/clean_preprocessed_reviews.csv")
df = pd.read_csv(data_path)

# Fail fast, with a message naming the file, if the columns used by the
# following cells are absent (instead of a bare KeyError further down).
required_columns = {"preprocessed_review", "encoded_review"}
missing_columns = required_columns - set(df.columns)
if missing_columns:
    raise KeyError(
        f"{data_path} is missing required column(s): {sorted(missing_columns)}"
    )

print(f"Loaded: {data_path.resolve()}")
print("Shape:", df.shape)
df.head(3)

Loaded: /Users/lucasvercauteren/Desktop/gehaalde vakken/Master eur/seminar/final paper/hotel_reviews_sent_python_notebook/data/clean_data/clean_preprocessed_reviews.csv
Shape: (200000, 3)


Unnamed: 0,Total_Review,encoded_review,preprocessed_review
0,Only a short stay Reception was extremely he...,1,short stay reception extremely helpful advice ...
1,It location far from historic center The park...,1,location far historic center parking free room...
2,Nothing Perfect Location and staff,1,perfect location staff


In [3]:
# ------------------------------------------------------------------------------
# Split into train/test (30% train, 70% test)
# ------------------------------------------------------------------------------
# Missing reviews (e.g. fields left empty in the CSV, which pandas reads back as
# NaN) must be replaced with "" BEFORE the cast: astype(str) on its own turns
# NaN into the literal text "nan", which would then be tokenized as a real word
# and would make any later fillna("") a no-op.
X = df["preprocessed_review"].fillna("").astype(str)
y = df["encoded_review"].astype(int)

# Stratify on the label so both partitions keep the original class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.30,          # 30% train, 70% test
    random_state=RANDOM_STATE,
    stratify=y
)

train_data = pd.DataFrame({"preprocessed_review": X_train, "encoded_review": y_train})
test_data  = pd.DataFrame({"preprocessed_review": X_test,  "encoded_review": y_test})

print("Train size:", train_data.shape, " Test size:", test_data.shape)

Train size: (60000, 2)  Test size: (140000, 2)


In [4]:
# ------------------------------------------------------------------------------
# Balance TRAIN only by undersampling majority class (keep TEST untouched)
# ------------------------------------------------------------------------------
# Label 1 is treated as the majority class and label 0 as the minority class.
majority = train_data.loc[train_data["encoded_review"].eq(1)]
minority = train_data.loc[train_data["encoded_review"].eq(0)]

# Draw, without replacement, exactly as many majority rows as there are
# minority rows.
majority_under = resample(
    majority,
    n_samples=len(minority),
    replace=False,
    random_state=RANDOM_STATE,
)

# Stack both classes, then shuffle the rows (frac=1 keeps every row).
train_data_balanced = pd.concat([majority_under, minority])
train_data_balanced = train_data_balanced.sample(frac=1, random_state=RANDOM_STATE)

# Text inputs for the vectorizer and the matching integer labels.
train_reviews_balanced = (
    train_data_balanced["preprocessed_review"].fillna("").astype(str)
)
test_reviews = test_data["preprocessed_review"].fillna("").astype(str)
y_train_balanced = train_data_balanced["encoded_review"].astype(int)

print("Balanced TRAIN class counts:\n", y_train_balanced.value_counts().sort_index())
print("TEST class counts:\n", y_test.value_counts().sort_index())

Balanced TRAIN class counts:
 encoded_review
0    10333
1    10333
Name: count, dtype: int64
TEST class counts:
 encoded_review
0     24112
1    115888
Name: count, dtype: int64


In [5]:
# ------------------------------------------------------------------------------
# Tokenization & vocabulary (unigrams, built-in English stopwords)
# ------------------------------------------------------------------------------
vectorizer = CountVectorizer(
    analyzer="word",
    ngram_range=(1, 1),     # single words only
    stop_words="english",   # drop scikit-learn's built-in English stopword list
    lowercase=False,        # text is passed through without lowercasing
)

# The vocabulary is learned from the balanced training reviews only; the test
# reviews are then projected onto that same vocabulary.
dtm_train_tf = vectorizer.fit_transform(train_reviews_balanced)
dtm_test_tf = vectorizer.transform(test_reviews)

# Term for each column of the document-term matrices, in column order.
feature_names = np.array(vectorizer.get_feature_names_out())

print("Vocab size:", len(feature_names))
print("DTM shapes (train, test):", dtm_train_tf.shape, dtm_test_tf.shape)

Vocab size: 16861
DTM shapes (train, test): (20666, 16861) (140000, 16861)


In [10]:
# ------------------------------------------------------------------------------
# Feature selection scores: Information Gain (mutual info) + Chi-Square
# ------------------------------------------------------------------------------
def _min_max_scale(scores, eps):
    """Min-max rescale `scores`; `eps` keeps the denominator non-zero."""
    lowest = scores.min()
    return (scores - lowest) / (scores.max() - lowest + eps)


y_train_balanced_arr = y_train_balanced.to_numpy()

# Information Gain: mutual information between each term count (treated as a
# discrete feature) and the class label.
ig_scores = mutual_info_classif(
    dtm_train_tf,
    y_train_balanced_arr,
    discrete_features=True,
    random_state=RANDOM_STATE,
)
ig_df = pd.DataFrame({"Feature": feature_names, "IG_Score": ig_scores})

# Chi-Square on term presence/absence (counts collapsed to 0/1); the p-values
# returned alongside the statistics are not used.
X_bin = (dtm_train_tf > 0).astype(int)
chi_vals, _chi_p_values = chi2(X_bin, y_train_balanced_arr)
chi_df = pd.DataFrame({"Feature": feature_names, "Chi_Square": chi_vals})

# One row per term carrying both raw scores.
combined_scores = pd.merge(ig_df, chi_df, on="Feature", how="inner")

# Put both scores on a comparable scale, then average them with equal weight.
eps = 1e-12
combined_scores["Norm_IG"] = _min_max_scale(combined_scores["IG_Score"], eps)
combined_scores["Norm_Chi_Square"] = _min_max_scale(combined_scores["Chi_Square"], eps)
combined_scores["Combined_Score"] = (
    (combined_scores["Norm_IG"] + combined_scores["Norm_Chi_Square"]) / 2.0
)

# Highest combined score first (most informative), with a fresh 0..n-1 index.
combined_scores = combined_scores.sort_values(
    "Combined_Score", ascending=False, ignore_index=True
)

combined_scores.head(50)

Unnamed: 0,Feature,IG_Score,Chi_Square,Norm_IG,Norm_Chi_Square,Combined_Score
0,great,0.02079,663.094244,1.0,1.0,1.0
1,excellent,0.016036,570.388753,0.771302,0.860193,0.815747
2,poor,0.013354,470.444344,0.642344,0.709468,0.675906
3,helpful,0.013327,466.228359,0.641018,0.70311,0.672064
4,room,0.018386,232.810808,0.884346,0.351098,0.617722
5,lovely,0.011423,416.376798,0.549456,0.62793,0.588693
6,friendly,0.011285,391.069348,0.542796,0.589764,0.56628
7,dirty,0.011046,373.363636,0.531293,0.563063,0.547178
8,rude,0.009961,341.913684,0.479108,0.515634,0.497371
9,small,0.009718,330.796113,0.467441,0.498867,0.483154


In [7]:
# ------------------------------------------------------------------------------
# Select and reduce to top 500 features (using Combined_Score)
# ------------------------------------------------------------------------------
optimal_features = 500

# 1) Rank features by Combined_Score and keep the best `optimal_features`.
#    Sorting again here keeps the cell independent of the table's current order.
ranked_features = (
    combined_scores.sort_values("Combined_Score", ascending=False)["Feature"]
    .head(optimal_features)
    .tolist()
)

# 2) Map feature names -> column indices in the fitted vectorizer
vocab = vectorizer.vocabulary_
missing = [f for f in ranked_features if f not in vocab]
if missing:
    print(f"Note: {len(missing)} selected features not in vocab (showing up to 10): {missing[:10]}")

# Keep only the features that resolve to a column, so that top_features[i]
# always names column i of the reduced matrices. Previously unresolved names
# stayed in top_features while being dropped from col_idx, which would leave
# the saved feature list misaligned with the matrix columns.
top_features = [f for f in ranked_features if f in vocab]
col_idx = np.array([vocab[f] for f in top_features], dtype=int)

# 3) Subset sparse matrices to only the selected features
dtm_train_opt = dtm_train_tf[:, col_idx]
dtm_test_opt  = dtm_test_tf[:,  col_idx]

# Guard the name/column alignment that downstream consumers rely on.
assert dtm_train_opt.shape[1] == dtm_test_opt.shape[1] == len(top_features)

print("Full DTM shapes   :", dtm_train_tf.shape, dtm_test_tf.shape)
print("Reduced DTM shapes:", dtm_train_opt.shape, dtm_test_opt.shape)

Full DTM shapes   : (20666, 16861) (140000, 16861)
Reduced DTM shapes: (20666, 500) (140000, 500)


In [8]:
# ------------------------------------------------------------------------------
# Save all data for modeling
# ------------------------------------------------------------------------------
FEAT = Path("../data/features")
FEAT.mkdir(parents=True, exist_ok=True)

# Feature-selection table
combined_scores.to_pickle(FEAT / "combined_scores.pkl")

# Sparse matrices: full-vocabulary versions first, then the reduced (top 500) ones
for npz_name, matrix in (
    ("dtm_train_tf.npz", dtm_train_tf),
    ("dtm_test_tf.npz", dtm_test_tf),
    ("dtm_train_opt.npz", dtm_train_opt),
    ("dtm_test_opt.npz", dtm_test_opt),
):
    sparse.save_npz(FEAT / npz_name, matrix)

# joblib artifacts: labels (as NumPy arrays), fitted vectorizer, feature names
for joblib_name, artifact in (
    ("y_train_balanced.joblib", y_train_balanced.to_numpy()),
    ("y_test.joblib", y_test.to_numpy()),
    ("vectorizer.joblib", vectorizer),
    ("feature_names.joblib", feature_names),
):
    joblib.dump(artifact, FEAT / joblib_name)

# Top-feature list for reference (JSON)
with open(FEAT / "top_features_500.json", "w") as json_file:
    json.dump(top_features, json_file)

print("All artifacts saved successfully to:", FEAT.resolve())
print("   - Full matrices :", dtm_train_tf.shape, dtm_test_tf.shape)
print("   - Reduced (500) :", dtm_train_opt.shape, dtm_test_opt.shape)

✅ All artifacts saved successfully to: /Users/lucasvercauteren/Desktop/gehaalde vakken/Master eur/seminar/final paper/hotel_reviews_sent_python_notebook/data/features
   - Full matrices : (20666, 16861) (140000, 16861)
   - Reduced (500) : (20666, 500) (140000, 500)
