In [1]:
#Imports
from fileinput import filename
import random
import math
import matplotlib
from scipy.io import loadmat
import csv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle

import sklearn.linear_model as ln
import sklearn.ensemble as es
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.glassbox import ExplainableBoostingRegressor
from interpret import show
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline


from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline


from cross_validate import cross_validate_model

In [26]:
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict, Any

def load_hotel_dataset_for_ebm(
    csv_path: str,
    selected_features: List[str]
) -> Tuple[pd.DataFrame, np.ndarray, List[str], List[str]]:
    """
    Load the hotel review CSV and build the feature frame / target array
    for an ExplainableBoostingRegressor (EBM), keeping only the requested
    features.

    Processing steps:
      1. Read the CSV and strip whitespace from the column headers.
      2. If present, split ``Review_Date`` into ``Review_Year`` /
         ``Review_Month`` / ``Review_Day`` and extract the leading integer
         from ``days_since_review``.
      3. Keep the requested features that are known to this loader and
         exist in the frame; warn about every requested name that is not
         used.
      4. Coerce numeric features and the target to numbers (unparseable
         values become NaN), cast categorical features to ``category``.
      5. Drop every row with a missing value in a kept feature or the target.

    Args:
        csv_path (str)          : Path to the CSV file.
        selected_features (list): Column names to INCLUDE in the feature set.
                                  Only names listed in the loader's numeric /
                                  categorical candidates can be used.

    Returns:
        X_df (pd.DataFrame)    : Independent copy holding the kept feature
                                 columns (numeric first, then categorical,
                                 in the loader's own candidate order).
        y (np.ndarray)         : ``Reviewer_Score`` as a float array, aligned
                                 row-for-row with ``X_df``.
        feature_names (list)   : Names of the columns in ``X_df``.
        feature_types (list)   : ``'continuous'`` or ``'nominal'`` for each
                                 entry of ``feature_names``.

    Raises:
        ValueError: If the CSV has no ``Reviewer_Score`` column.

    Warns:
        UserWarning: If any requested feature is unknown to the loader or
                     missing from the CSV and is therefore left out.
    """
    # Local import so the function does not rely on another cell's imports.
    import warnings

    df = pd.read_csv(csv_path)

    # Header cells may carry stray whitespace; normalise before any lookup.
    df.columns = [c.strip() for c in df.columns]

    # --- Target check ---
    if "Reviewer_Score" not in df.columns:
        raise ValueError("Expected 'Reviewer_Score' column as target.")

    # -------------------------
    # Feature Engineering/Parsing
    # -------------------------
    # 1. Parse Review_Date → year/month/day (only when the column exists, so
    #    a CSV without it still loads; the derived features are then simply
    #    unavailable and reported by the warning further down).
    if "Review_Date" in df.columns:
        review_date = pd.to_datetime(df["Review_Date"], errors="coerce")
        df["Review_Year"] = review_date.dt.year
        df["Review_Month"] = review_date.dt.month
        df["Review_Day"] = review_date.dt.day
        df = df.drop(columns=["Review_Date"])

    # 2. Parse "days_since_review": keep the first run of digits, as a number.
    if "days_since_review" in df.columns:
        df["days_since_review"] = pd.to_numeric(
            df["days_since_review"].astype(str).str.extract(r"(\d+)", expand=False),
            errors="coerce",
        )

    # -------------------------
    # Feature Definition (all features this loader knows how to prepare)
    # -------------------------
    POTENTIAL_CATEGORICAL_COLS = [
        "Hotel_Name",
        "Reviewer_Nationality",
    ]

    POTENTIAL_NUMERIC_COLS = [
        "Additional_Number_of_Scoring",
        "Average_Score",
        "Review_Total_Negative_Word_Counts",
        "Total_Number_of_Reviews",
        "Review_Total_Positive_Word_Counts",
        "Total_Number_of_Reviews_Reviewer_Has_Given",
        "days_since_review",
        "lat",
        "lng",
        "Review_Year",
        "Review_Month",
        "Review_Day",
    ]

    # Free-text columns; dropped early unless explicitly requested.
    TEXT_COLS_TO_DROP = [
        "Hotel_Address",
        "Positive_Review",
        "Negative_Review",
        "Tags",
    ]

    # -------------------------
    # Apply the selection filter
    # -------------------------
    requested = set(selected_features)

    categorical_cols = [
        c for c in POTENTIAL_CATEGORICAL_COLS
        if c in requested and c in df.columns
    ]
    numeric_cols = [
        c for c in POTENTIAL_NUMERIC_COLS
        if c in requested and c in df.columns
    ]

    feature_names = numeric_cols + categorical_cols
    feature_types = (
        ["continuous"] * len(numeric_cols) +
        ["nominal"] * len(categorical_cols)
    )

    # Surface requested names that did not make it into the feature set;
    # otherwise a typo would silently shrink the model's inputs.
    ignored = [c for c in dict.fromkeys(selected_features) if c not in feature_names]
    if ignored:
        warnings.warn(
            "Ignoring selected features that are unsupported by this loader "
            f"or missing from the CSV: {ignored}",
            stacklevel=2,
        )

    # Drop the large text columns that were not requested.
    df = df.drop(
        columns=[c for c in TEXT_COLS_TO_DROP if c not in requested],
        errors="ignore",
    )

    # -------------------------
    # Final Type Conversion and Cleanup
    # -------------------------
    # 1. Force numeric conversion for the numeric features.
    if numeric_cols:
        df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors="coerce")

    # 2. Coerce the target the same way, so a malformed score becomes NaN and
    #    is removed with its row instead of failing the float cast below.
    df["Reviewer_Score"] = pd.to_numeric(df["Reviewer_Score"], errors="coerce")

    # 3. Convert categorical columns to the 'category' dtype.
    for col in categorical_cols:
        df[col] = df[col].astype("category")

    # 4. Drop rows with any missing data in the kept features or the target.
    df_clean = df.dropna(subset=feature_names + ["Reviewer_Score"])

    # .copy() detaches the result from df_clean so callers can modify it
    # without pandas chained-assignment warnings.
    X_df = df_clean[feature_names].copy()
    y = df_clean["Reviewer_Score"].astype(float).to_numpy()

    return X_df, y, feature_names, feature_types

# ----------------------------------------------------
# Example Usage: How to call the new function
# ----------------------------------------------------

# Based on your EBM analysis, you decide these are your 'valuable' features:


In [32]:
    
from sklearn.model_selection import train_test_split

# Features handed to the loader; names it does not support are left out.
VALUABLE_COLS = [
    # hotel-level and review-level numeric signals
    "Average_Score",
    "Review_Total_Positive_Word_Counts",
    "Review_Total_Negative_Word_Counts",
    # reviewer attributes and review age
    "Reviewer_Nationality",
    "days_since_review",
    "Total_Number_of_Reviews",
    "Total_Number_of_Reviews_Reviewer_Has_Given",
    # calendar parts derived from the review date
    "Review_Year",
    "Review_Month",
    "Review_Day",
    # hotel location and identity
    "lat",
    "lng",
    "Hotel_Name"
]

# Build the feature frame and target from the raw CSV.
X_df, y, feature_names, feature_types = load_hotel_dataset_for_ebm(
    "datasets/Hotel_Reviews.csv",
    VALUABLE_COLS,
)

print("Dataset loaded successfully!")
print("X_df shape:", X_df.shape)
print("y shape:", y.shape)

# Separate the dataset into training and testing sets (80/20, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_df,
    y,
    test_size=0.2,
    random_state=42,
)


Dataset loaded successfully!
X_df shape: (512470, 13)
y shape: (512470,)


In [33]:






# param_grid_ebc = {
#     # "learning_rate": [0.01, 0.1, 0.2],
#     # "max_bins": [32, 64, 128],
#     # "interactions": [1, 2, 5, 10]
#     "max_bins" : [32, 64, 128],
#     "interactions": [1, 2, 5, 10],
#     "interactions": [0],
#     "learning_rate": [0.02],
#     "max_rounds": [50]
# }

# cv_results, best_params_ebc = cross_validate_model(
#     ExplainableBoostingRegressor(),
#     {"feat": feat, "gnd": gnd},
#     param_grid_ebc,
#     5,
# )

# cv_results.to_csv("ebm_cross_validation_results.csv", index=False)
# print("Saved CV results → ebm_cross_validation_results.csv")

# ---------------------------------------------------------
# 2. Train Final Model
# ---------------------------------------------------------

# Use the training frame's own column labels as the model's feature names.
feature_names = list(X_train.columns)

print("\n=== Training Final EBM Model ===")

# Hyperparameters are fixed here; the grid search above is commented out.
best_model = ExplainableBoostingRegressor(
    feature_names=feature_names,
    max_rounds=500,
    max_bins=512,
    interactions=66,
    learning_rate=0.01,
)
best_model.fit(X_train, y_train)

print("Trained final EBM model with best hyperparameters.")

# ---------------------------------------------------------
# 3. Show Global Explanation (feature importances)
# ---------------------------------------------------------

# Renders the fitted model's global explanation; nothing is written to disk.
show(best_model.explain_global())

# feature_importance_df.to_csv("ebm_feature_importances.csv", index=False)
# print("Saved feature importances → ebm_feature_importances.csv")

# ---------------------------------------------------------
# 4. Predict on Test Set + Error Analysis
# ---------------------------------------------------------

test_pred = best_model.predict(X_test)
y_true = y_test

# Signed residuals (truth minus prediction) and their magnitudes.
residuals = y_true - test_pred
abs_error = np.abs(residuals)

# Each statistic is computed once, directly in the print call. The earlier
# bare `np.mean(...)` / `np.std(...)` lines were discarded by the notebook
# (only a cell's last expression is displayed) and have been removed.
print("Residuals and Absolute Errors:")
print("Mean Residuals:", np.mean(residuals))
print("Std Residuals:", np.std(residuals))
print("Mean Absolute Errors:", np.mean(abs_error))
print("Std Absolute Errors:", np.std(abs_error))

# test_results_df = pd.DataFrame({
#     "True_Value": y_true,
#     "Predicted_Value": test_pred,
#     "Residual": residuals,
#     "Absolute_Error": abs_error,
# })

# test_results_df.to_csv("ebm_test_predictions_with_errors.csv", index=False)
# print("Saved test set predictions + errors → ebm_test_predictions_with_errors.csv")

# # ---------------------------------------------------------
# # 5. Compute & Export Summary Metrics
# # ---------------------------------------------------------

# mae = mean_absolute_error(y_true, test_pred)
# mse = mean_squared_error(y_true, test_pred)
# rmse = np.sqrt(mse)
# r2 = r2_score(y_true, test_pred)

# metrics_df = pd.DataFrame([{
#     "MAE": mae,
#     "MSE": mse,
#     "RMSE": rmse,
#     "R2": r2
# }])

# metrics_df.to_csv("ebm_test_metrics.csv", index=False)
# print("Saved summary test metrics → ebm_test_metrics.csv")

# # Also print them for quick reference
# print("\n=== Test Metrics ===")
# print(f"MAE : {mae:.4f}")
# print(f"RMSE: {rmse:.4f}")
# print(f"R²  : {r2:.4f}")


=== Training Final EBM Model ===
Trained final EBM model with best hyperparameters.


Residuals and Absolute Errors:
Mean Residuals: -0.0004443875378803506
Std Residuals: 1.2214984102554023
Mean Absolute Errors: 0.9073622272242836
Std Absolute Errors: 0.8177727999532304
