In [21]:
import pandas as pd
df_train=pd.read_csv('/Users/rishav/Downloads/IIT_GN/Train_Data/train_data_final.csv')
df_test=pd.read_csv('/Users/rishav/Downloads/final_test_dataset.csv')


In [22]:
df_test[['NCO_3D', 'NIC_5D']] = df_test[['NCO_3D', 'NIC_5D']].fillna(1000)
df_test.to_csv("test_data_final.csv", index=False)
print(df_test.isnull().sum())

HH_ID                                                                                       0
Sector                                                                                      0
State                                                                                       0
NSS-Region                                                                                  0
District                                                                                    0
Household Type                                                                              0
Religion of the head of the household                                                       0
Social Group of the head of the household                                                   0
HH Size (For FDQ)                                                                           0
NCO_3D                                                                                      0
NIC_5D                                                      

In [5]:
import pandas as pd

df = pd.read_csv('/Users/rishav/Downloads/test_data_final.csv')

# Find columns with missing values
null_columns = df.columns[df.isnull().sum() > 0]

# Print the count of missing values in each column with missing data
print(df.isnull().sum()[null_columns])
print(df.shape)


Series([], dtype: int64)
(225316, 50)


In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# --- Standardize numeric features ---
def standardize_features(file_path, target_column):
    df = pd.read_csv(file_path)
    non_numeric_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
    if target_column in non_numeric_cols:
        non_numeric_cols.remove(target_column)
    X_numeric = df.drop(columns=[target_column] + non_numeric_cols)
    y = df[target_column].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_numeric)
    return X_scaled, y, scaler, non_numeric_cols, X_numeric.columns.tolist()

# --- Elbow method to determine optimal clusters ---
def elbow_method_auto(X, max_clusters=10):
    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)
    deltas = np.diff(inertias)
    second_deltas = np.diff(deltas)
    elbow_point = np.argmin(second_deltas) + 2  # Adjust for index shift
    return elbow_point

# --- Evaluation metrics ---
def mean_percentage_error(y_true, y_pred):
    # Avoid division by zero
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def evaluate_predictions(y_true, y_pred, method_name=""):
    mpe = mean_percentage_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n--- {method_name} ---")
    print(f"Mean Percentage Error (MPE): {mpe:.2f}%")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R² Score: {r2:.4f}")

# --- Main Training Function with XGBoost ---
def cluster_and_predict_xgb(file_path, target_column='WeightedExpense', max_clusters=10):
    # Standardize
    X_scaled, y, scaler, non_numeric_cols, numeric_cols = standardize_features(file_path, target_column)

    # Determine optimal number of clusters
    optimal_k = elbow_method_auto(X_scaled, max_clusters=max_clusters)
    print(f"Optimal number of clusters: {optimal_k}")

    # KMeans clustering
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto')
    clusters = kmeans.fit_predict(X_scaled)

    # Cluster-wise XGBoost model training
    preds_model = np.zeros_like(y)
    cluster_models = {}

    # XGBoost parameters
    xgb_params = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'reg:squarederror',
        'random_state': 42,
        'verbosity': 0
    }

    for cluster_id in np.unique(clusters):
        cluster_idx = clusters == cluster_id
        X_cluster = X_scaled[cluster_idx]
        y_cluster = y[cluster_idx]

        # Skip empty clusters (shouldn't happen but just in case)
        if len(y_cluster) == 0:
            continue

        # Train XGBoost model for this cluster
        model = XGBRegressor(**xgb_params)
        model.fit(X_cluster, y_cluster)
        
        # Make predictions for this cluster
        preds_model[cluster_idx] = model.predict(X_cluster)

        # Save model per cluster
        cluster_models[cluster_id] = model

    # Evaluate training performance
    evaluate_predictions(y, preds_model, method_name="Cluster-wise XGBoost Prediction")

    # Return trained components for prediction
    return scaler, kmeans, cluster_models, numeric_cols, non_numeric_cols

def predict_on_test_data(test_csv_path, scaler, kmeans, cluster_models, numeric_cols, non_numeric_cols, target_column='WeightedExpense'):
    """
    Predict expenses on test data using trained XGBoost models.
    
    Parameters:
    -----------
    test_csv_path : str
        Path to the test CSV file
    scaler : StandardScaler
        Fitted scaler from training data
    kmeans : KMeans
        Trained KMeans model
    cluster_models : dict
        Dictionary of XGBoost models for each cluster
    numeric_cols : list
        List of numeric column names used in training
    non_numeric_cols : list
        List of non-numeric column names from training
    target_column : str
        Name of the target column
        
    Returns:
    --------
    pd.DataFrame
        Test data with predictions
    """
    # Load test data
    test_data = pd.read_csv(test_csv_path)
    
    # Drop 'TotalExpense' if present
    if 'TotalExpense' in test_data.columns:
        test_data = test_data.drop(columns=['TotalExpense'])
    
    # Check if all required numeric columns exist in test data
    missing_cols = [col for col in numeric_cols if col not in test_data.columns]
    if missing_cols:
        raise ValueError(f"Test data is missing columns that were used in training: {missing_cols}")
    
    # Create a DataFrame with only the numeric columns in the correct order
    X_numeric_test = pd.DataFrame(index=test_data.index)
    for col in numeric_cols:
        X_numeric_test[col] = test_data[col]
    
    # Handle missing values
    if X_numeric_test.isna().any().any():
        cols_with_nan = X_numeric_test.columns[X_numeric_test.isna().any()].tolist()
        print(f"Warning: NaN values found in columns: {cols_with_nan}")
        
        # Fill NaN values with appropriate defaults for each column
        for col in cols_with_nan:
            print(f"Filling NaN values in {col}")
            if col in ['NCO_3D', 'NIC_5D']:
                X_numeric_test[col] = X_numeric_test[col].fillna(1000)
            else:
                X_numeric_test[col] = X_numeric_test[col].fillna(X_numeric_test[col].median())
    
    # Double-check for any remaining NaN values
    if X_numeric_test.isna().any().any():
        raise ValueError("Failed to handle all NaN values in numeric features")
        
    # Standardize features using the same scaler from training
    X_scaled_test = scaler.transform(X_numeric_test)
    
    # Predict clusters
    test_clusters = kmeans.predict(X_scaled_test)
    
    # Predict using cluster-specific XGBoost models
    predictions = np.zeros(X_scaled_test.shape[0])
    
    for cluster_id in np.unique(test_clusters):
        cluster_idx = test_clusters == cluster_id
        X_cluster_test = X_scaled_test[cluster_idx]
        
        if cluster_id in cluster_models:
            model = cluster_models[cluster_id]
            predictions[cluster_idx] = model.predict(X_cluster_test)
        else:
            print(f"Warning: No model for cluster {cluster_id}, using average prediction")
            # Use average of all predictions for this cluster
            cluster_means = np.mean([m.predict(X_cluster_test).mean() 
                                     for m in cluster_models.values()])
            predictions[cluster_idx] = cluster_means
    
    # Add predictions to test DataFrame
    test_data['predicted_expense'] = predictions
    
    # Save predictions to CSV
    output_path = 'test_data_with_predictions_xgb.csv'
    test_data.to_csv(output_path, index=False)
    print(f"\nPredictions saved to {output_path}")
    
    # Display sample predictions
    print("\nSample Predictions:")
    print(test_data[['predicted_expense']].head())
    
    return test_data

def aggregate_household_predictions(test_results):
    """
    Aggregate predictions by household ID
    """
    # Step 1: Calculate the sum of predicted_expense for each household ID
    household_sums = test_results.groupby('HH_ID')['predicted_expense'].sum()

    # Step 2: Replace each row's predicted_expense with its household sum
    test_results['predicted_expense'] = test_results['HH_ID'].map(household_sums)

    # Step 3: Save the updated dataframe to CSV
    output_path = 'test_data_with_household_sums_xgb.csv'
    test_results.to_csv(output_path, index=False)
    print(f"Saved data with household sums to {output_path}")

    # Step 4: Verify the result with a sample household
    sample_household = test_results['HH_ID'].iloc[0]
    sample_rows = test_results[test_results['HH_ID'] == sample_household].head()
    print(f"\nSample rows from household {sample_household}:")
    print(sample_rows[['HH_ID', 'predicted_expense']])
    
    return test_results

# --- Main execution ---
# Training with XGBoost
scaler, kmeans, cluster_models, numeric_cols, non_numeric_cols = cluster_and_predict_xgb(
   '/Users/rishav/Downloads/IIT_GN/Train_Data/train_data_final.csv',
    target_column='WeightedExpense'
)

# Predict on Test CSV
test_results = predict_on_test_data(
    test_csv_path='/Users/rishav/Downloads/test_data_final.csv',
    scaler=scaler,
    kmeans=kmeans,
    cluster_models=cluster_models,
    numeric_cols=numeric_cols,
    non_numeric_cols=non_numeric_cols,
    target_column='WeightedExpense'
)

# Aggregate predictions by household
final_results = aggregate_household_predictions(test_results)

# Optional: Compare predictions with actual values if you have them
# Define the comparison function directly here instead of trying to import it
def compare_expenses(predictions_csv, actual_csv, 
                     pred_col='predicted_expense', actual_col='TotalExpense',
                     join_col='HH_ID'):
    """
    Compare predicted expenses with actual expenses and calculate metrics.
    """
    # Load the datasets
    pred_df = pd.read_csv(predictions_csv)
    actual_df = pd.read_csv(actual_csv)
    
    # Check if the datasets have the required columns
    required_cols = {
        'predictions': [pred_col, join_col],
        'actual': [actual_col, join_col]
    }
    
    for df_name, cols in required_cols.items():
        df = pred_df if df_name == 'predictions' else actual_df
        missing = [col for col in cols if col not in df.columns]
        if missing:
            raise ValueError(f"{df_name} dataset is missing columns: {missing}")
    
    # Merge the datasets
    pred_unique = pred_df.drop_duplicates(subset=[join_col])[[join_col, pred_col]]
    actual_unique = actual_df.drop_duplicates(subset=[join_col])[[join_col, actual_col]]
    merged_df = pd.merge(pred_unique, actual_unique, on=join_col, how='inner')
    
    # Print the number of households we can compare
    print(f"Number of households for comparison: {len(merged_df)}")
    
    # Extract the values for comparison
    y_pred = merged_df[pred_col].values
    y_true = merged_df[actual_col].values
    
    # Calculate metrics
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true - y_pred))
    
    # Calculate Mean Percentage Error (avoiding division by zero)
    non_zero_mask = y_true != 0
    if np.any(non_zero_mask):
        mpe = np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
    else:
        mpe = np.nan
    
    # Calculate Mean Absolute Percentage Error (MAPE)
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(1e-8, np.abs(y_true)))) * 100
    
    # Display metrics
    print("\n----- Evaluation Metrics -----")
    print(f"R² Score: {r2:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Percentage Error (MPE): {mpe:.2f}%")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
    
    # Return the metrics as a dictionary
    return {
        'r2': r2,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'mpe': mpe,
        'mape': mape
    }

# Check if test data has actual values
if 'TotalExpense' in pd.read_csv('/Users/rishav/Downloads/final_test_dataset.csv').columns:
    # Add the missing import
    from sklearn.metrics import mean_squared_error
    
    # Run comparison
    metrics = compare_expenses(
        predictions_csv='test_data_with_household_sums_xgb.csv',
        actual_csv='/Users/rishav/Downloads/final_test_dataset.csv',
        pred_col='predicted_expense',
        actual_col='TotalExpense',
        join_col='HH_ID'
    )
    
    print("\nFinal evaluation metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

Optimal number of clusters: 7

--- Cluster-wise XGBoost Prediction ---
Mean Percentage Error (MPE): 26.54%
Mean Absolute Error (MAE): 1189.40
R² Score: 0.6828

Predictions saved to test_data_with_predictions_xgb.csv

Sample Predictions:
   predicted_expense
0        6905.832031
1        4735.216309
2        4684.315430
3        3770.371582
4        9016.327148
Saved data with household sums to test_data_with_household_sums_xgb.csv

Sample rows from household HCES2022655561010131113011101202304:
                                 HH_ID  predicted_expense
0  HCES2022655561010131113011101202304       20095.735352
1  HCES2022655561010131113011101202304       20095.735352
2  HCES2022655561010131113011101202304       20095.735352
3  HCES2022655561010131113011101202304       20095.735352
Number of households for comparison: 52350

----- Evaluation Metrics -----
R² Score: 0.5590
Mean Squared Error (MSE): 82091652.47
Root Mean Squared Error (RMSE): 9060.44
Mean Absolute Error (MAE): 5252.16
Mean 

In [8]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# --- Standardize numeric features ---
def standardize_features(file_path, target_column):
    df = pd.read_csv(file_path)
    non_numeric_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
    if target_column in non_numeric_cols:
        non_numeric_cols.remove(target_column)
    X_numeric = df.drop(columns=[target_column] + non_numeric_cols)
    y = df[target_column].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_numeric)
    return X_scaled, y, scaler, non_numeric_cols, X_numeric.columns.tolist(), df

# --- Evaluation metrics ---
def mean_percentage_error(y_true, y_pred):
    # Avoid division by zero
    mask = y_true != 0
    return np.mean(np.abs((y_true[mask] - y_pred[mask]) / y_true[mask])) * 100

def evaluate_predictions(y_true, y_pred, method_name=""):
    mpe = mean_percentage_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n--- {method_name} ---")
    print(f"Mean Percentage Error (MPE): {mpe:.2f}%")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R² Score: {r2:.4f}")

# --- Direct XGBoost Training Function ---
def train_xgboost(file_path, target_column='WeightedExpense'):
    # Standardize
    X_scaled, y, scaler, non_numeric_cols, numeric_cols, df = standardize_features(file_path, target_column)
    
    print(f"Training XGBoost model directly on {len(y)} samples with {X_scaled.shape[1]} features")
    
    # XGBoost parameters
    xgb_params = {
        'n_estimators': 200,
        'learning_rate': 0.1,
        'max_depth': 7,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'reg:squarederror',
        'random_state': 42,
        'verbosity': 0,
        'n_jobs': -1  # Use all cores
    }
    
    # Train a single XGBoost model on all data
    model = XGBRegressor(**xgb_params)
    model.fit(X_scaled, y)
    
    # Make predictions on training data
    train_preds = model.predict(X_scaled)
    
    # Evaluate training performance
    evaluate_predictions(y, train_preds, method_name="XGBoost Prediction")
    
    # Get feature importance
    feature_importance = pd.DataFrame({
        'Feature': numeric_cols,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))
    
    # Return trained components for prediction
    return model, scaler, numeric_cols, non_numeric_cols

def predict_on_test_data(test_csv_path, model, scaler, numeric_cols, non_numeric_cols):
    """
    Predict expenses on test data using the trained XGBoost model.
    """
    # Load test data
    test_data = pd.read_csv(test_csv_path)
    
    # Drop 'TotalExpense' if present
    if 'TotalExpense' in test_data.columns:
        test_data = test_data.drop(columns=['TotalExpense'])
    
    # Check if all required numeric columns exist in test data
    missing_cols = [col for col in numeric_cols if col not in test_data.columns]
    if missing_cols:
        raise ValueError(f"Test data is missing columns that were used in training: {missing_cols}")
    
    # Create a DataFrame with only the numeric columns in the correct order
    X_numeric_test = pd.DataFrame(index=test_data.index)
    for col in numeric_cols:
        X_numeric_test[col] = test_data[col]
    
    # Handle missing values
    if X_numeric_test.isna().any().any():
        cols_with_nan = X_numeric_test.columns[X_numeric_test.isna().any()].tolist()
        print(f"Warning: NaN values found in columns: {cols_with_nan}")
        
        # Fill NaN values with appropriate defaults for each column
        for col in cols_with_nan:
            print(f"Filling NaN values in {col}")
            if col in ['NCO_3D', 'NIC_5D']:
                X_numeric_test[col] = X_numeric_test[col].fillna(1000)
            else:
                X_numeric_test[col] = X_numeric_test[col].fillna(X_numeric_test[col].median())
    
    # Double-check for any remaining NaN values
    if X_numeric_test.isna().any().any():
        raise ValueError("Failed to handle all NaN values in numeric features")
        
    # Standardize features using the same scaler from training
    X_scaled_test = scaler.transform(X_numeric_test)
    
    # Predict using the XGBoost model
    predictions = model.predict(X_scaled_test)
    
    # Add predictions to test DataFrame
    test_data['predicted_expense'] = predictions
    
    # Save predictions to CSV
    output_path = 'test_data_with_predictions_xgb_direct.csv'
    test_data.to_csv(output_path, index=False)
    print(f"\nPredictions saved to {output_path}")
    
    # Display sample predictions
    print("\nSample Predictions:")
    print(test_data[['predicted_expense']].head())
    
    return test_data

def aggregate_household_predictions(test_results):
    """
    Aggregate predictions by household ID
    """
    # Step 1: Calculate the sum of predicted_expense for each household ID
    household_sums = test_results.groupby('HH_ID')['predicted_expense'].sum()

    # Step 2: Replace each row's predicted_expense with its household sum
    test_results['predicted_expense'] = test_results['HH_ID'].map(household_sums)

    # Step 3: Save the updated dataframe to CSV
    output_path = 'test_data_with_household_sums_xgb_direct.csv'
    test_results.to_csv(output_path, index=False)
    print(f"Saved data with household sums to {output_path}")

    # Step 4: Verify the result with a sample household
    sample_household = test_results['HH_ID'].iloc[0]
    sample_rows = test_results[test_results['HH_ID'] == sample_household].head()
    print(f"\nSample rows from household {sample_household}:")
    print(sample_rows[['HH_ID', 'predicted_expense']])
    
    return test_results

# Function to compare predictions with actual values
def compare_expenses(predictions_csv, actual_csv, 
                     pred_col='predicted_expense', actual_col='TotalExpense',
                     join_col='HH_ID'):
    """
    Compare predicted expenses with actual expenses and calculate metrics.
    """
    # Load the datasets
    pred_df = pd.read_csv(predictions_csv)
    actual_df = pd.read_csv(actual_csv)
    
    # Check if the datasets have the required columns
    required_cols = {
        'predictions': [pred_col, join_col],
        'actual': [actual_col, join_col]
    }
    
    for df_name, cols in required_cols.items():
        df = pred_df if df_name == 'predictions' else actual_df
        missing = [col for col in cols if col not in df.columns]
        if missing:
            raise ValueError(f"{df_name} dataset is missing columns: {missing}")
    
    # Merge the datasets
    pred_unique = pred_df.drop_duplicates(subset=[join_col])[[join_col, pred_col]]
    actual_unique = actual_df.drop_duplicates(subset=[join_col])[[join_col, actual_col]]
    merged_df = pd.merge(pred_unique, actual_unique, on=join_col, how='inner')
    
    # Print the number of households we can compare
    print(f"Number of households for comparison: {len(merged_df)}")
    
    # Extract the values for comparison
    y_pred = merged_df[pred_col].values
    y_true = merged_df[actual_col].values
    
    # Calculate metrics
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = np.mean(np.abs(y_true - y_pred))
    
    # Calculate Mean Percentage Error (avoiding division by zero)
    non_zero_mask = y_true != 0
    if np.any(non_zero_mask):
        mpe = np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
    else:
        mpe = np.nan
    
    # Calculate Mean Absolute Percentage Error (MAPE)
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(1e-8, np.abs(y_true)))) * 100
    
    # Display metrics
    print("\n----- Evaluation Metrics -----")
    print(f"R² Score: {r2:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Percentage Error (MPE): {mpe:.2f}%")
    print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
    
    # Return the metrics as a dictionary
    return {
        'r2': r2,
        'mse': mse,
        'rmse': rmse,
        'mae': mae,
        'mpe': mpe,
        'mape': mape
    }

# --- Main execution ---
# Training with direct XGBoost (no clustering)
model, scaler, numeric_cols, non_numeric_cols = train_xgboost(
   '/Users/rishav/Downloads/modified_data.csv',
    target_column='WeightedExpense'
)

# Predict on Test CSV
test_results = predict_on_test_data(
    test_csv_path='/Users/rishav/Downloads/test_data_final.csv',
    model=model,
    scaler=scaler,
    numeric_cols=numeric_cols,
    non_numeric_cols=non_numeric_cols
)

# Aggregate predictions by household
final_results = aggregate_household_predictions(test_results)

# Check if test data has actual values
if 'TotalExpense' in pd.read_csv('/Users/rishav/Downloads/final_test_dataset.csv').columns:
    # Run comparison
    metrics = compare_expenses(
        predictions_csv='test_data_with_household_sums_xgb_direct.csv',
        actual_csv='/Users/rishav/Downloads/final_test_dataset.csv',
        pred_col='predicted_expense',
        actual_col='TotalExpense',
        join_col='HH_ID'
    )
    
    print("\nFinal evaluation metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")

Training XGBoost model directly on 901723 samples with 48 features

--- XGBoost Prediction ---
Mean Percentage Error (MPE): 26.10%
Mean Absolute Error (MAE): 1180.08
R² Score: 0.6777

Top 10 Most Important Features:
                                              Feature  Importance
46                         Is_HH_Have_Washing_machine    0.173264
38                               Is_HH_Have_Laptop_PC    0.106223
0                                      Person Srl No.    0.105258
45                            Is_HH_Have_Refrigerator    0.088647
15                                             Sector    0.043663
22                                  HH Size (For FDQ)    0.037203
25               Is_online_Clothing_Purchased_Last365    0.036808
42                       Is_HH_Have_Motorcar_jeep_van    0.031044
7   Whether used internet from any location during...    0.029159
1                             Relation to head (code)    0.025902


ValueError: Test data is missing columns that were used in training: ['Person Srl No.', 'Relation to head (code)', 'Gender', 'Age(in years)', 'Marital Status (code)', 'Highest educational level attained (code)', 'Total year of education completed', 'Whether used internet from any location during last 30 days', 'No. of days stayed away from home during last 30 days', 'No. of meals usually taken in a day', 'No. of meals taken during last 30 days from school, balwadi etc.', 'No. of meals taken during last 30 days from employer as perquisites or part of wage', 'No. of meals taken during last 30 days  others', 'No. of meals taken during last 30 days on payment', 'No. of meals taken during last 30 days at home']

In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# Standardize numeric features, exclude non-numeric (e.g., IDs)
def standardize_features(df, target_column):
    non_numeric_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
    if target_column in non_numeric_cols:
        non_numeric_cols.remove(target_column)

    X_numeric = df.drop(columns=[target_column] + non_numeric_cols)
    y = df[target_column].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_numeric)

    return X_scaled, y, scaler, non_numeric_cols, X_numeric.columns.tolist()

# Elbow method to find optimal clusters
def elbow_method_auto(X, max_clusters=10):
    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)
    # Find elbow: largest drop in inertia
    deltas = np.diff(inertias)
    second_deltas = np.diff(deltas)
    elbow_point = np.argmin(second_deltas) + 2  # +2 accounts for double diff index shift
    return elbow_point

# Mean Percentage Error
def mean_percentage_error(y_true, y_pred):
    # Avoid division by zero
    non_zero_mask = y_true != 0
    if np.any(non_zero_mask):
        return np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
    return np.nan

# Evaluate and print metrics
def evaluate_predictions(y_true, y_pred, method_name=""):
    mpe = mean_percentage_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n--- {method_name} ---")
    print(f"Mean Percentage Error (MPE): {mpe:.2f}%")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R² Score: {r2:.4f}")
    
    return {
        'mpe': mpe,
        'mae': mae,
        'r2': r2
    }

# Select model based on model_type
def get_model(model_type):
    if model_type == 'linear':
        return LinearRegression()
    elif model_type == 'random_forest':
        return RandomForestRegressor(random_state=42, n_estimators=100)
    elif model_type == 'xgboost':
        return XGBRegressor(random_state=42, n_estimators=100, verbosity=0)
    else:
        raise ValueError("Invalid model_type. Choose from 'linear', 'random_forest', or 'xgboost'.")

# Main function: cluster, predict, evaluate
def cluster_and_predict(train_df, target_column='WeightedExpense', max_clusters=10, model_type='random_forest'):
    """
    Implement clustering approach with selected model for each cluster
    """
    # Step 1: Standardize numeric features
    X_scaled, y, scaler, non_numeric_cols, numeric_cols = standardize_features(train_df, target_column)

    # Step 2: Find optimal clusters
    optimal_k = elbow_method_auto(X_scaled, max_clusters=max_clusters)
    print(f"Optimal number of clusters (Elbow method): {optimal_k}")

    # Step 3: KMeans clustering
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto')
    clusters = kmeans.fit_predict(X_scaled)

    # Add cluster labels to DataFrame for reference
    df_with_clusters = train_df.copy()
    df_with_clusters['Cluster'] = clusters

    # Step 4A: Cluster-wise model training
    preds_model = np.zeros_like(y)
    cluster_models = {}
    
    for cluster_id in np.unique(clusters):
        cluster_idx = clusters == cluster_id
        X_cluster = X_scaled[cluster_idx]
        y_cluster = y[cluster_idx]
        
        print(f"Training {model_type} model for cluster {cluster_id} with {len(y_cluster)} samples")
        
        # Skip empty clusters (shouldn't happen but just in case)
        if len(y_cluster) == 0:
            continue

        # Train model for this cluster
        model = get_model(model_type)
        model.fit(X_cluster, y_cluster)
        
        # Make predictions for this cluster
        preds_model[cluster_idx] = model.predict(X_cluster)

        # Save model per cluster
        cluster_models[cluster_id] = model
        
        # Print feature importance if available
        if hasattr(model, 'feature_importances_') and len(numeric_cols) > 0:
            feature_importance = pd.DataFrame({
                'Feature': numeric_cols,
                'Importance': model.feature_importances_
            }).sort_values('Importance', ascending=False)
            
            print(f"\nTop 5 features for cluster {cluster_id}:")
            print(feature_importance.head(5))

    # Evaluate cluster-wise model
    evaluate_predictions(y, preds_model, method_name=f"Cluster-wise {model_type.capitalize()} Prediction")

    # Step 4B: Cluster mean assignment (as a simple baseline)
    preds_mean = np.zeros_like(y)
    for cluster_id in np.unique(clusters):
        cluster_idx = clusters == cluster_id
        y_cluster = y[cluster_idx]
        cluster_mean = np.mean(y_cluster)
        preds_mean[cluster_idx] = cluster_mean

    # Evaluate cluster mean assignment
    evaluate_predictions(y, preds_mean, method_name="Cluster Mean Assignment")

    return preds_model, preds_mean, cluster_models, kmeans, scaler, numeric_cols, non_numeric_cols, clusters, optimal_k

# FIXED: Predict on test data using cluster-specific models
def predict_on_test_data(test_df, kmeans, cluster_models, scaler, numeric_cols, non_numeric_cols):
    """
    Predict expenses on test data using trained cluster-specific models
    """
    # Create a DataFrame with only the numeric columns in the correct order
    X_numeric_test = pd.DataFrame(index=test_df.index)
    
    # Track any columns missing from test data
    missing_columns = []
    
    for col in numeric_cols:
        if col in test_df.columns:
            # Check if this specific column has nulls before adding it
            if test_df[col].isnull().any():
                # Fill nulls in this column with its median before adding
                X_numeric_test[col] = test_df[col].fillna(test_df[col].median())
                print(f"Filled nulls in column '{col}' with its median")
            else:
                X_numeric_test[col] = test_df[col]
        else:
            missing_columns.append(col)
            print(f"Warning: Column '{col}' missing from test data. Using zeros.")
            X_numeric_test[col] = 0
    
    # Report on missing columns if any
    if missing_columns:
        print(f"\nTotal missing columns: {len(missing_columns)}")
        if len(missing_columns) <= 10:  # Only print if there aren't too many
            print(f"Missing columns: {', '.join(missing_columns)}")
        else:
            print(f"First 10 missing columns: {', '.join(missing_columns[:10])}...")
    else:
        print("All required numeric columns found in test data")
    
    # Standardize features using the same scaler from training
    X_scaled_test = scaler.transform(X_numeric_test)
    
    # Predict clusters for test data
    test_clusters = kmeans.predict(X_scaled_test)
    
    # Predict using cluster-specific models
    predictions = np.zeros(X_scaled_test.shape[0])
    
    for cluster_id in np.unique(test_clusters):
        cluster_idx = test_clusters == cluster_id
        X_cluster_test = X_scaled_test[cluster_idx]
        
        if cluster_id in cluster_models:
            model = cluster_models[cluster_id]
            predictions[cluster_idx] = model.predict(X_cluster_test)
            print(f"Made predictions for {np.sum(cluster_idx)} samples in cluster {cluster_id}")
        else:
            print(f"Warning: No model for cluster {cluster_id}, using average prediction")
            # Use average of all predictions for this cluster
            cluster_means = np.mean([m.predict(X_cluster_test).mean() 
                                     for m in cluster_models.values()])
            predictions[cluster_idx] = cluster_means
    
    # Add predictions to test DataFrame
    test_df_with_preds = test_df.copy()
    test_df_with_preds['predicted_expense'] = predictions
    test_df_with_preds['assigned_cluster'] = test_clusters
    
    # Display sample predictions
    print("\nSample Predictions:")
    print(test_df_with_preds[['predicted_expense', 'assigned_cluster']].head())
    
    return test_df_with_preds

# Aggregate by household ID
def aggregate_household_predictions(test_results):
    """
    Aggregate predictions by household ID
    """
    if 'HH_ID' not in test_results.columns:
        print("Warning: 'HH_ID' column not found. Cannot aggregate by household.")
        return test_results
    
    # Calculate the sum of predicted_expense for each household ID
    household_sums = test_results.groupby('HH_ID')['predicted_expense'].sum()
    
    # Replace each row's predicted_expense with its household sum
    test_results_aggregated = test_results.copy()
    test_results_aggregated['predicted_expense'] = test_results_aggregated['HH_ID'].map(household_sums)
    
    # Verify the result with a sample household
    sample_household = test_results_aggregated['HH_ID'].iloc[0]
    sample_rows = test_results_aggregated[test_results_aggregated['HH_ID'] == sample_household].head()
    print(f"\nSample rows from household {sample_household}:")
    print(sample_rows[['HH_ID', 'predicted_expense', 'assigned_cluster']])
    
    return test_results_aggregated

# Compare predictions with actual values
def compare_expenses(predictions_df, actual_df, 
                     pred_col='predicted_expense', actual_col='TotalExpense',
                     join_col='HH_ID'):
    """
    Compare predicted expenses with actual expenses
    """
    # Check if the datasets have the required columns
    required_cols = {
        'predictions': [pred_col, join_col],
        'actual': [actual_col, join_col]
    }
    
    for df_name, cols in required_cols.items():
        df = predictions_df if df_name == 'predictions' else actual_df
        missing = [col for col in cols if col not in df.columns]
        if missing:
            raise ValueError(f"{df_name} dataset is missing columns: {missing}")
    
    # If we have multiple rows per household, keep only one row per household in each dataset
    pred_unique = predictions_df.drop_duplicates(subset=[join_col])[[join_col, pred_col]]
    actual_unique = actual_df.drop_duplicates(subset=[join_col])[[join_col, actual_col]]
    
    # Merge the datasets
    merged_df = pd.merge(pred_unique, actual_unique, on=join_col, how='inner')
    
    # Print the number of households we can compare
    print(f"Number of households for comparison: {len(merged_df)}")
    
    # Extract the values for comparison
    y_pred = merged_df[pred_col].values
    y_true = merged_df[actual_col].values
    
    # Calculate and return metrics
    metrics = evaluate_predictions(y_true, y_pred, method_name="Final Evaluation")
    
    # Save the comparison to a new CSV for further analysis
    merged_df['error'] = y_pred - y_true
    merged_df['percentage_error'] = (merged_df['error'] / merged_df[actual_col]) * 100
    merged_df.to_csv('expense_comparison_results.csv', index=False)
    print("\nDetailed comparison saved to 'expense_comparison_results.csv'")
    
    return metrics, merged_df

# Main execution
if __name__ == "__main__":
    # 1. Load datasets with paths from the provided code
    print("Loading datasets...")
    train_path = '/Users/rishav/Downloads/IIT_GN/Train_Data/train_data_final.csv'  # Using the path from the provided code
    test_path = '/Users/rishav/Downloads/test_data_final.csv'  # Using the path from the provided code
    
    try:
        train_data = pd.read_csv(train_path)
        test_data = pd.read_csv(test_path)
        
        print(f"Training data shape: {train_data.shape}")
        print(f"Testing data shape: {test_data.shape}")
        
        # Print information about null values in test data
        null_counts = test_data.isnull().sum()
        print("\nNull value counts in test data:")
        print(null_counts[null_counts > 0])
        if null_counts.sum() == 0:
            print("No null values found in test data")
        
        # 2. Fixed parameters
        target_column = 'WeightedExpense'
        model_type = 'random_forest'  # Could be 'linear', 'random_forest', or 'xgboost'
        
        # 3. Train cluster-wise models
        print(f"\nTraining cluster-wise {model_type} models...")
        preds_model, preds_mean, cluster_models, kmeans, scaler, numeric_cols, non_numeric_cols, clusters, optimal_k = cluster_and_predict(
            train_data, 
            target_column=target_column,
            max_clusters=10,
            model_type=model_type
        )

        # 4. Make predictions on test data
        print("\nMaking predictions on test data...")
        test_with_predictions = predict_on_test_data(
            test_data,
            kmeans,
            cluster_models,
            scaler,
            numeric_cols,
            non_numeric_cols
        )

        # 5. Aggregate predictions by household ID
        print("\nAggregating predictions by household...")
        final_results = aggregate_household_predictions(test_with_predictions)

        # 6. Save the final results
        output_path = f'test_predictions_{model_type}.csv'
        final_results.to_csv(output_path, index=False)
        print(f"\nFinal results saved to {output_path}")

        # 7. Compare with actual values (since we're loading final_test_dataset.csv which should have actual values)
        if 'TotalExpense' in test_data.columns:
            print("\nActual values found in test data. Comparing predictions...")
            metrics, comparison_df = compare_expenses(
                final_results,
                test_data,
                pred_col='predicted_expense',
                actual_col='TotalExpense',
                join_col='HH_ID'
            )
        else:
            print("\nNo 'TotalExpense' column found in test data. Skipping comparison.")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

Loading datasets...
Training data shape: (901723, 50)
Testing data shape: (225316, 50)

Null value counts in test data:
Series([], dtype: int64)
No null values found in test data

Training cluster-wise random_forest models...
Optimal number of clusters (Elbow method): 7
Training random_forest model for cluster 0 with 170811 samples

Top 5 features for cluster 0:
                    Feature  Importance
0            Person Srl No.    0.132419
22        HH Size (For FDQ)    0.129969
1   Relation to head (code)    0.100957
24                   NIC_5D    0.061566
3             Age(in years)    0.053910
Training random_forest model for cluster 1 with 221499 samples

Top 5 features for cluster 1:
              Feature  Importance
18           District    0.107002
17         NSS-Region    0.092360
22  HH Size (For FDQ)    0.089114
24             NIC_5D    0.083437
3       Age(in years)    0.069685
Training random_forest model for cluster 2 with 119558 samples

Top 5 features for cluster 2:
   

In [14]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor

# Standardize numeric features, exclude non-numeric (e.g., IDs)
def standardize_features(df, target_column):
    non_numeric_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
    if target_column in non_numeric_cols:
        non_numeric_cols.remove(target_column)

    X_numeric = df.drop(columns=[target_column] + non_numeric_cols)
    y = df[target_column].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_numeric)

    return X_scaled, y, scaler, non_numeric_cols, X_numeric.columns.tolist()

# Elbow method to find optimal clusters
def elbow_method_auto(X, max_clusters=10):
    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)
    # Find elbow: largest drop in inertia
    deltas = np.diff(inertias)
    second_deltas = np.diff(deltas)
    elbow_point = np.argmin(second_deltas) + 2  # +2 accounts for double diff index shift
    return elbow_point

# Mean Percentage Error
def mean_percentage_error(y_true, y_pred):
    # Avoid division by zero
    non_zero_mask = y_true != 0
    if np.any(non_zero_mask):
        return np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
    return np.nan

# Evaluate and print metrics
def evaluate_predictions(y_true, y_pred, method_name=""):
    mpe = mean_percentage_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n--- {method_name} ---")
    print(f"Mean Percentage Error (MPE): {mpe:.2f}%")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R² Score: {r2:.4f}")
    
    return {
        'mpe': mpe,
        'mae': mae,
        'r2': r2
    }

# Select model based on model_type
def get_model(model_type):
    if model_type == 'linear':
        return LinearRegression()
    elif model_type == 'random_forest':
        return RandomForestRegressor(random_state=42, n_estimators=100)
    elif model_type == 'xgboost':
        return XGBRegressor(random_state=42, n_estimators=100, verbosity=0)
    else:
        raise ValueError("Invalid model_type. Choose from 'linear', 'random_forest', or 'xgboost'.")

# Main function: cluster, predict, evaluate
def cluster_and_predict(train_df, target_column='WeightedExpense', max_clusters=10, model_type='random_forest'):
    """
    Implement clustering approach with selected model for each cluster
    """
    # Step 1: Standardize numeric features
    X_scaled, y, scaler, non_numeric_cols, numeric_cols = standardize_features(train_df, target_column)

    # Step 2: Find optimal clusters
    optimal_k = elbow_method_auto(X_scaled, max_clusters=max_clusters)
    print(f"Optimal number of clusters (Elbow method): {optimal_k}")

    # Step 3: KMeans clustering
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto')
    clusters = kmeans.fit_predict(X_scaled)

    # Add cluster labels to DataFrame for reference
    df_with_clusters = train_df.copy()
    df_with_clusters['Cluster'] = clusters

    # Step 4A: Cluster-wise model training
    preds_model = np.zeros_like(y)
    cluster_models = {}
    
    for cluster_id in np.unique(clusters):
        cluster_idx = clusters == cluster_id
        X_cluster = X_scaled[cluster_idx]
        y_cluster = y[cluster_idx]
        
        print(f"Training {model_type} model for cluster {cluster_id} with {len(y_cluster)} samples")
        
        # Skip empty clusters (shouldn't happen but just in case)
        if len(y_cluster) == 0:
            continue

        # Train model for this cluster
        model = get_model(model_type)
        model.fit(X_cluster, y_cluster)
        
        # Make predictions for this cluster
        preds_model[cluster_idx] = model.predict(X_cluster)

        # Save model per cluster
        cluster_models[cluster_id] = model
        
        # Print feature importance if available
        if hasattr(model, 'feature_importances_') and len(numeric_cols) > 0:
            feature_importance = pd.DataFrame({
                'Feature': numeric_cols,
                'Importance': model.feature_importances_
            }).sort_values('Importance', ascending=False)
            
            print(f"\nTop 5 features for cluster {cluster_id}:")
            print(feature_importance.head(5))

    # Evaluate cluster-wise model
    evaluate_predictions(y, preds_model, method_name=f"Cluster-wise {model_type.capitalize()} Prediction")

    # Step 4B: Cluster mean assignment (as a simple baseline)
    preds_mean = np.zeros_like(y)
    for cluster_id in np.unique(clusters):
        cluster_idx = clusters == cluster_id
        y_cluster = y[cluster_idx]
        cluster_mean = np.mean(y_cluster)
        preds_mean[cluster_idx] = cluster_mean

    # Evaluate cluster mean assignment
    evaluate_predictions(y, preds_mean, method_name="Cluster Mean Assignment")

    return preds_model, preds_mean, cluster_models, kmeans, scaler, numeric_cols, non_numeric_cols, clusters, optimal_k

# FIXED: Predict on test data using cluster-specific models
def predict_on_test_data(test_df, kmeans, cluster_models, scaler, numeric_cols, non_numeric_cols):
    """
    Predict expenses on test data using trained cluster-specific models
    """
    # Create a DataFrame with only the numeric columns in the correct order
    X_numeric_test = pd.DataFrame(index=test_df.index)
    
    # Track any columns missing from test data
    missing_columns = []
    
    for col in numeric_cols:
        if col in test_df.columns:
            # Check if this specific column has nulls before adding it
            if test_df[col].isnull().any():
                # Fill nulls in this column with its median before adding
                X_numeric_test[col] = test_df[col].fillna(test_df[col].median())
                print(f"Filled nulls in column '{col}' with its median")
            else:
                X_numeric_test[col] = test_df[col]
        else:
            missing_columns.append(col)
            print(f"Warning: Column '{col}' missing from test data. Using zeros.")
            X_numeric_test[col] = 0
    
    # Report on missing columns if any
    if missing_columns:
        print(f"\nTotal missing columns: {len(missing_columns)}")
        if len(missing_columns) <= 10:  # Only print if there aren't too many
            print(f"Missing columns: {', '.join(missing_columns)}")
        else:
            print(f"First 10 missing columns: {', '.join(missing_columns[:10])}...")
    else:
        print("All required numeric columns found in test data")
    
    # Standardize features using the same scaler from training
    X_scaled_test = scaler.transform(X_numeric_test)
    
    # Predict clusters for test data
    test_clusters = kmeans.predict(X_scaled_test)
    
    # Predict using cluster-specific models
    predictions = np.zeros(X_scaled_test.shape[0])
    
    for cluster_id in np.unique(test_clusters):
        cluster_idx = test_clusters == cluster_id
        X_cluster_test = X_scaled_test[cluster_idx]
        
        if cluster_id in cluster_models:
            model = cluster_models[cluster_id]
            predictions[cluster_idx] = model.predict(X_cluster_test)
            print(f"Made predictions for {np.sum(cluster_idx)} samples in cluster {cluster_id}")
        else:
            print(f"Warning: No model for cluster {cluster_id}, using average prediction")
            # Use average of all predictions for this cluster
            cluster_means = np.mean([m.predict(X_cluster_test).mean() 
                                     for m in cluster_models.values()])
            predictions[cluster_idx] = cluster_means
    
    # Add predictions to test DataFrame
    test_df_with_preds = test_df.copy()
    test_df_with_preds['predicted_expense'] = predictions
    test_df_with_preds['assigned_cluster'] = test_clusters
    
    # Display sample predictions
    print("\nSample Predictions:")
    print(test_df_with_preds[['predicted_expense', 'assigned_cluster']].head())
    
    return test_df_with_preds

# Aggregate by household ID
def aggregate_household_predictions(test_results):
    """
    Aggregate predictions by household ID
    """
    if 'HH_ID' not in test_results.columns:
        print("Warning: 'HH_ID' column not found. Cannot aggregate by household.")
        return test_results
    
    # Calculate the sum of predicted_expense for each household ID
    household_sums = test_results.groupby('HH_ID')['predicted_expense'].sum()
    
    # Replace each row's predicted_expense with its household sum
    test_results_aggregated = test_results.copy()
    test_results_aggregated['predicted_expense'] = test_results_aggregated['HH_ID'].map(household_sums)
    
    # Verify the result with a sample household
    sample_household = test_results_aggregated['HH_ID'].iloc[0]
    sample_rows = test_results_aggregated[test_results_aggregated['HH_ID'] == sample_household].head()
    print(f"\nSample rows from household {sample_household}:")
    print(sample_rows[['HH_ID', 'predicted_expense', 'assigned_cluster']])
    
    return test_results_aggregated

# Compare predictions with actual values
def compare_expenses(predictions_df, actual_df, 
                     pred_col='predicted_expense', actual_col='TotalExpense',
                     join_col='HH_ID'):
    """
    Compare predicted expenses with actual expenses
    """
    # Check if the datasets have the required columns
    required_cols = {
        'predictions': [pred_col, join_col],
        'actual': [actual_col, join_col]
    }
    
    for df_name, cols in required_cols.items():
        df = predictions_df if df_name == 'predictions' else actual_df
        missing = [col for col in cols if col not in df.columns]
        if missing:
            raise ValueError(f"{df_name} dataset is missing columns: {missing}")
    
    # If we have multiple rows per household, keep only one row per household in each dataset
    pred_unique = predictions_df.drop_duplicates(subset=[join_col])[[join_col, pred_col]]
    actual_unique = actual_df.drop_duplicates(subset=[join_col])[[join_col, actual_col]]
    
    # Merge the datasets
    merged_df = pd.merge(pred_unique, actual_unique, on=join_col, how='inner')
    
    # Print the number of households we can compare
    print(f"Number of households for comparison: {len(merged_df)}")
    
    # Extract the values for comparison
    y_pred = merged_df[pred_col].values
    y_true = merged_df[actual_col].values
    
    # Calculate and return metrics
    metrics = evaluate_predictions(y_true, y_pred, method_name="Final Evaluation")
    
    # Save the comparison to a new CSV for further analysis
    merged_df['error'] = y_pred - y_true
    merged_df['percentage_error'] = (merged_df['error'] / merged_df[actual_col]) * 100
    merged_df.to_csv('expense_comparison_results.csv', index=False)
    print("\nDetailed comparison saved to 'expense_comparison_results.csv'")
    
    return metrics, merged_df

# Main execution
if __name__ == "__main__":
    # 1. Load datasets with paths from the provided code
    print("Loading datasets...")
    train_path = '/Users/rishav/Downloads/IIT_GN/Train_Data/train_data_final.csv'  # Using the path from the provided code
    test_path = '/Users/rishav/Downloads/IIT_GN/Train_Data/test_data_final.csv'  # Using the path from the provided code
    
    try:
        train_data = pd.read_csv(train_path)
        test_data = pd.read_csv(test_path)
        
        print(f"Training data shape: {train_data.shape}")
        print(f"Testing data shape: {test_data.shape}")
        
        # Print information about null values in test data
        null_counts = test_data.isnull().sum()
        print("\nNull value counts in test data:")
        print(null_counts[null_counts > 0])
        if null_counts.sum() == 0:
            print("No null values found in test data")
        
        # 2. Fixed parameters
        target_column = 'WeightedExpense'
        model_type = 'xgboost'  # Changed from 'random_forest' to 'xgboost'
        
        # 3. Train cluster-wise models
        print(f"\nTraining cluster-wise {model_type} models...")
        preds_model, preds_mean, cluster_models, kmeans, scaler, numeric_cols, non_numeric_cols, clusters, optimal_k = cluster_and_predict(
            train_data, 
            target_column=target_column,
            max_clusters=10,
            model_type=model_type
        )

        # 4. Make predictions on test data
        print("\nMaking predictions on test data...")
        test_with_predictions = predict_on_test_data(
            test_data,
            kmeans,
            cluster_models,
            scaler,
            numeric_cols,
            non_numeric_cols
        )

        # 5. Aggregate predictions by household ID
        print("\nAggregating predictions by household...")
        final_results = aggregate_household_predictions(test_with_predictions)

        # 6. Save the final results
        output_path = f'test_predictions_{model_type}.csv'
        final_results.to_csv(output_path, index=False)
        print(f"\nFinal results saved to {output_path}")

        # 7. Compare with actual values (since we're loading final_test_dataset.csv which should have actual values)
        if 'TotalExpense' in test_data.columns:
            print("\nActual values found in test data. Comparing predictions...")
            metrics, comparison_df = compare_expenses(
                final_results,
                test_data,
                pred_col='predicted_expense',
                actual_col='TotalExpense',
                join_col='HH_ID'
            )
        else:
            print("\nNo 'TotalExpense' column found in test data. Skipping comparison.")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")


Loading datasets...
Training data shape: (901723, 50)
Testing data shape: (225316, 50)

Null value counts in test data:
Series([], dtype: int64)
No null values found in test data

Training cluster-wise xgboost models...
Optimal number of clusters (Elbow method): 7
Training xgboost model for cluster 0 with 170811 samples

Top 5 features for cluster 0:
                       Feature  Importance
0               Person Srl No.    0.240340
22           HH Size (For FDQ)    0.080720
45     Is_HH_Have_Refrigerator    0.059460
1      Relation to head (code)    0.056594
46  Is_HH_Have_Washing_machine    0.048659
Training xgboost model for cluster 1 with 221499 samples

Top 5 features for cluster 1:
                          Feature  Importance
45        Is_HH_Have_Refrigerator    0.162994
22              HH Size (For FDQ)    0.052001
42   Is_HH_Have_Motorcar_jeep_van    0.051991
36          Is_HH_Have_Television    0.045130
41  Is_HH_Have_Motorcycle_scooter    0.045014
Training xgboost model fo

In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

# Standardize numeric features, exclude non-numeric (e.g., IDs)
def standardize_features(df, target_column):
    non_numeric_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
    if target_column in non_numeric_cols:
        non_numeric_cols.remove(target_column)

    X_numeric = df.drop(columns=[target_column] + non_numeric_cols)
    y = df[target_column].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_numeric)

    return X_scaled, y, scaler, non_numeric_cols, X_numeric.columns.tolist()

# Elbow method to find optimal clusters but with a manual override option
def elbow_method_auto(X, max_clusters=10, manual_override=None):
    if manual_override is not None:
        print(f"Using manual override for number of clusters: {manual_override}")
        return manual_override
        
    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)
    # Find elbow: largest drop in inertia
    deltas = np.diff(inertias)
    second_deltas = np.diff(deltas)
    elbow_point = np.argmin(second_deltas) + 2  # +2 accounts for double diff index shift
    return elbow_point

# Mean Percentage Error
def mean_percentage_error(y_true, y_pred):
    # Avoid division by zero
    non_zero_mask = y_true != 0
    if np.any(non_zero_mask):
        return np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
    return np.nan

# Evaluate and print metrics
def evaluate_predictions(y_true, y_pred, method_name=""):
    mpe = mean_percentage_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n--- {method_name} ---")
    print(f"Mean Percentage Error (MPE): {mpe:.2f}%")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R² Score: {r2:.4f}")
    
    return {
        'mpe': mpe,
        'mae': mae,
        'r2': r2
    }

# Select model based on model_type with regularization for XGBoost
def get_model(model_type):
    if model_type == 'linear':
        return LinearRegression()
    elif model_type == 'random_forest':
        return RandomForestRegressor(random_state=42, n_estimators=100)
    elif model_type == 'xgboost':
        # Added regularization parameters to reduce overfitting
        return XGBRegressor(
            random_state=42, 
            n_estimators=100, 
            max_depth=5,           # Reduced from default 6
            min_child_weight=3,    # Increased from default 1
            reg_alpha=1.0,         # L1 regularization
            reg_lambda=2.0,        # L2 regularization
            verbosity=0
        )
    else:
        raise ValueError("Invalid model_type. Choose from 'linear', 'random_forest', or 'xgboost'.")

# Feature selection to keep only top important features
def select_top_features(X, y, model_type='xgboost', top_n=15):
    model = get_model(model_type)
    model.fit(X, y)
    
    if not hasattr(model, 'feature_importances_'):
        print("Model doesn't have feature importances, skipping feature selection")
        return X, list(range(X.shape[1]))
    
    # Create a feature selector using SelectFromModel
    selector = SelectFromModel(model, threshold=-np.inf, max_features=top_n, prefit=True)
    X_selected = selector.transform(X)
    
    # Get indices of selected features
    selected_indices = np.where(selector.get_support())[0]
    
    print(f"Reduced features from {X.shape[1]} to {X_selected.shape[1]}")
    return X_selected, selected_indices

# Main function: cluster, predict, evaluate
def cluster_and_predict(train_df, target_column='WeightedExpense', max_clusters=10, model_type='random_forest', manual_clusters=5):
    """
    Implement clustering approach with selected model for each cluster
    """
    # Step 1: Standardize numeric features
    X_scaled, y, scaler, non_numeric_cols, numeric_cols = standardize_features(train_df, target_column)

    # Step 2: Find optimal clusters (with manual override)
    optimal_k = elbow_method_auto(X_scaled, max_clusters=max_clusters, manual_override=manual_clusters)
    print(f"Using {optimal_k} clusters")

    # Step 3: KMeans clustering
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto')
    clusters = kmeans.fit_predict(X_scaled)

    # Add cluster labels to DataFrame for reference
    df_with_clusters = train_df.copy()
    df_with_clusters['Cluster'] = clusters

    # Step 4A: Cluster-wise model training
    preds_model = np.zeros_like(y)
    cluster_models = {}
    feature_indices_per_cluster = {}
    
    for cluster_id in np.unique(clusters):
        cluster_idx = clusters == cluster_id
        X_cluster = X_scaled[cluster_idx]
        y_cluster = y[cluster_idx]
        
        print(f"Training {model_type} model for cluster {cluster_id} with {len(y_cluster)} samples")
        
        # Skip empty clusters (shouldn't happen but just in case)
        if len(y_cluster) == 0:
            continue

        # Split data for early stopping
        X_train, X_val, y_train, y_val = train_test_split(
            X_cluster, y_cluster, test_size=0.2, random_state=42
        )
        
        # Feature selection - limit to top 15 most important features
        X_train_selected, selected_indices = select_top_features(X_train, y_train, model_type, top_n=15)
        X_val_selected = X_val[:, selected_indices]
        
        # Store selected feature indices for prediction
        feature_indices_per_cluster[cluster_id] = selected_indices
        
        # Train model for this cluster
        model = get_model(model_type)
        
        # Use early stopping if XGBoost - simplified approach without early stopping
        if model_type == 'xgboost':
            # Simply fit without early stopping since it's causing compatibility issues
            model.fit(X_train_selected, y_train)
            print(f"XGBoost model fitted successfully for cluster")
        else:
            model.fit(X_train_selected, y_train)
        
        # Make predictions for this cluster using selected features
        X_cluster_selected = X_cluster[:, selected_indices]
        preds_model[cluster_idx] = model.predict(X_cluster_selected)

        # Save model per cluster
        cluster_models[cluster_id] = model
        
        # Print feature importance if available
        if hasattr(model, 'feature_importances_') and len(selected_indices) > 0:
            selected_feature_names = [numeric_cols[i] for i in selected_indices]
            feature_importance = pd.DataFrame({
                'Feature': selected_feature_names,
                'Importance': model.feature_importances_
            }).sort_values('Importance', ascending=False)
            
            print(f"\nTop 5 features for cluster {cluster_id}:")
            print(feature_importance.head(5))

    # Evaluate cluster-wise model
    evaluate_predictions(y, preds_model, method_name=f"Cluster-wise {model_type.capitalize()} Prediction")

    # Step 4B: Cluster mean assignment (as a simple baseline)
    preds_mean = np.zeros_like(y)
    for cluster_id in np.unique(clusters):
        cluster_idx = clusters == cluster_id
        y_cluster = y[cluster_idx]
        cluster_mean = np.mean(y_cluster)
        preds_mean[cluster_idx] = cluster_mean

    # Evaluate cluster mean assignment
    evaluate_predictions(y, preds_mean, method_name="Cluster Mean Assignment")

    return preds_model, preds_mean, cluster_models, kmeans, scaler, numeric_cols, non_numeric_cols, clusters, optimal_k, feature_indices_per_cluster

# FIXED: Predict on test data using cluster-specific models
def predict_on_test_data(test_df, kmeans, cluster_models, scaler, numeric_cols, non_numeric_cols, feature_indices_per_cluster):
    """
    Predict expenses on test data using trained cluster-specific models
    """
    # Create a DataFrame with only the numeric columns in the correct order
    X_numeric_test = pd.DataFrame(index=test_df.index)
    
    # Track any columns missing from test data
    missing_columns = []
    
    for col in numeric_cols:
        if col in test_df.columns:
            # Check if this specific column has nulls before adding it
            if test_df[col].isnull().any():
                # Fill nulls in this column with its median before adding
                X_numeric_test[col] = test_df[col].fillna(test_df[col].median())
                print(f"Filled nulls in column '{col}' with its median")
            else:
                X_numeric_test[col] = test_df[col]
        else:
            missing_columns.append(col)
            print(f"Warning: Column '{col}' missing from test data. Using zeros.")
            X_numeric_test[col] = 0
    
    # Report on missing columns if any
    if missing_columns:
        print(f"\nTotal missing columns: {len(missing_columns)}")
        if len(missing_columns) <= 10:  # Only print if there aren't too many
            print(f"Missing columns: {', '.join(missing_columns)}")
        else:
            print(f"First 10 missing columns: {', '.join(missing_columns[:10])}...")
    else:
        print("All required numeric columns found in test data")
    
    # Standardize features using the same scaler from training
    X_scaled_test = scaler.transform(X_numeric_test)
    
    # Predict clusters for test data
    test_clusters = kmeans.predict(X_scaled_test)
    
    # Predict using cluster-specific models
    predictions = np.zeros(X_scaled_test.shape[0])
    
    for cluster_id in np.unique(test_clusters):
        cluster_idx = test_clusters == cluster_id
        X_cluster_test = X_scaled_test[cluster_idx]
        
        if cluster_id in cluster_models:
            model = cluster_models[cluster_id]
            # Apply feature selection consistent with training
            selected_indices = feature_indices_per_cluster.get(cluster_id, None)
            if selected_indices is not None:
                X_cluster_test = X_cluster_test[:, selected_indices]
            
            predictions[cluster_idx] = model.predict(X_cluster_test)
            print(f"Made predictions for {np.sum(cluster_idx)} samples in cluster {cluster_id}")
        else:
            print(f"Warning: No model for cluster {cluster_id}, using average prediction")
            # Use average of all predictions for this cluster
            cluster_means = np.mean([m.predict(X_cluster_test[:, feature_indices_per_cluster.get(c_id, slice(None))]).mean() 
                                    for c_id, m in cluster_models.items()])
            predictions[cluster_idx] = cluster_means
    
    # Add predictions to test DataFrame
    test_df_with_preds = test_df.copy()
    test_df_with_preds['predicted_expense'] = predictions
    test_df_with_preds['assigned_cluster'] = test_clusters
    
    # Display sample predictions
    print("\nSample Predictions:")
    print(test_df_with_preds[['predicted_expense', 'assigned_cluster']].head())
    
    return test_df_with_preds

# Aggregate by household ID
def aggregate_household_predictions(test_results):
    """
    Aggregate predictions by household ID
    """
    if 'HH_ID' not in test_results.columns:
        print("Warning: 'HH_ID' column not found. Cannot aggregate by household.")
        return test_results
    
    # Calculate the sum of predicted_expense for each household ID
    household_sums = test_results.groupby('HH_ID')['predicted_expense'].sum()
    
    # Replace each row's predicted_expense with its household sum
    test_results_aggregated = test_results.copy()
    test_results_aggregated['predicted_expense'] = test_results_aggregated['HH_ID'].map(household_sums)
    
    # Verify the result with a sample household
    sample_household = test_results_aggregated['HH_ID'].iloc[0]
    sample_rows = test_results_aggregated[test_results_aggregated['HH_ID'] == sample_household].head()
    print(f"\nSample rows from household {sample_household}:")
    print(sample_rows[['HH_ID', 'predicted_expense', 'assigned_cluster']])
    
    return test_results_aggregated

# Compare predictions with actual values
def compare_expenses(predictions_df, actual_df, 
                     pred_col='predicted_expense', actual_col='TotalExpense',
                     join_col='HH_ID'):
    """
    Compare predicted expenses with actual expenses
    """
    # Check if the datasets have the required columns
    required_cols = {
        'predictions': [pred_col, join_col],
        'actual': [actual_col, join_col]
    }
    
    for df_name, cols in required_cols.items():
        df = predictions_df if df_name == 'predictions' else actual_df
        missing = [col for col in cols if col not in df.columns]
        if missing:
            raise ValueError(f"{df_name} dataset is missing columns: {missing}")
    
    # If we have multiple rows per household, keep only one row per household in each dataset
    pred_unique = predictions_df.drop_duplicates(subset=[join_col])[[join_col, pred_col]]
    actual_unique = actual_df.drop_duplicates(subset=[join_col])[[join_col, actual_col]]
    
    # Merge the datasets
    merged_df = pd.merge(pred_unique, actual_unique, on=join_col, how='inner')
    
    # Print the number of households we can compare
    print(f"Number of households for comparison: {len(merged_df)}")
    
    # Extract the values for comparison
    y_pred = merged_df[pred_col].values
    y_true = merged_df[actual_col].values
    
    # Calculate and return metrics
    metrics = evaluate_predictions(y_true, y_pred, method_name="Final Evaluation")
    
    # Save the comparison to a new CSV for further analysis
    merged_df['error'] = y_pred - y_true
    merged_df['percentage_error'] = (merged_df['error'] / merged_df[actual_col]) * 100
    merged_df.to_csv('expense_comparison_results.csv', index=False)
    print("\nDetailed comparison saved to 'expense_comparison_results.csv'")
    
    return metrics, merged_df

# Main execution
if __name__ == "__main__":
    # 1. Load datasets with paths from the provided code
    print("Loading datasets...")
    train_path = '/Users/rishav/Downloads/IIT_GN/Train_Data/train_data_final.csv'  # Using the path from the provided code
    test_path = '/Users/rishav/Downloads/IIT_GN/Train_Data/test_data_final.csv'  # Using the path from the provided code
    
    try:
        train_data = pd.read_csv(train_path)
        test_data = pd.read_csv(test_path)
        
        print(f"Training data shape: {train_data.shape}")
        print(f"Testing data shape: {test_data.shape}")
        
        # Print information about null values in test data
        null_counts = test_data.isnull().sum()
        print("\nNull value counts in test data:")
        print(null_counts[null_counts > 0])
        if null_counts.sum() == 0:
            print("No null values found in test data")
        
        # 2. Fixed parameters
        target_column = 'WeightedExpense'
        model_type = 'xgboost'  
        
        # 3. Train cluster-wise models with reduced number of clusters (5)
        print(f"\nTraining cluster-wise {model_type} models...")
        preds_model, preds_mean, cluster_models, kmeans, scaler, numeric_cols, non_numeric_cols, clusters, optimal_k, feature_indices_per_cluster = cluster_and_predict(
            train_data, 
            target_column=target_column,
            max_clusters=10,
            model_type=model_type,
            manual_clusters=5  # Manually use 5 clusters instead of elbow method
        )

        # 4. Make predictions on test data
        print("\nMaking predictions on test data...")
        test_with_predictions = predict_on_test_data(
            test_data,
            kmeans,
            cluster_models,
            scaler,
            numeric_cols,
            non_numeric_cols,
            feature_indices_per_cluster
        )

        # 5. Aggregate predictions by household ID
        print("\nAggregating predictions by household...")
        final_results = aggregate_household_predictions(test_with_predictions)

        # 6. Save the final results
        output_path = f'test_predictions_{model_type}_regularized.csv'
        final_results.to_csv(output_path, index=False)
        print(f"\nFinal results saved to {output_path}")

        # 7. Compare with actual values (since we're loading final_test_dataset.csv which should have actual values)
        if 'TotalExpense' in test_data.columns:
            print("\nActual values found in test data. Comparing predictions...")
            metrics, comparison_df = compare_expenses(
                final_results,
                test_data,
                pred_col='predicted_expense',
                actual_col='TotalExpense',
                join_col='HH_ID'
            )
        else:
            print("\nNo 'TotalExpense' column found in test data. Skipping comparison.")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

Loading datasets...
Training data shape: (901723, 50)
Testing data shape: (225316, 50)

Null value counts in test data:
Series([], dtype: int64)
No null values found in test data

Training cluster-wise xgboost models...
Using manual override for number of clusters: 5
Using 5 clusters
Training xgboost model for cluster 0 with 123169 samples
Reduced features from 48 to 15
XGBoost model fitted successfully for cluster

Top 5 features for cluster 0:
                         Feature  Importance
0                 Person Srl No.    0.405651
12  Is_HH_Have_Motorcar_jeep_van    0.124966
8              HH Size (For FDQ)    0.124617
10          Is_HH_Have_Laptop_PC    0.060803
13    Is_HH_Have_Washing_machine    0.041186
Training xgboost model for cluster 1 with 259209 samples
Reduced features from 48 to 15
XGBoost model fitted successfully for cluster

Top 5 features for cluster 1:
                                 Feature  Importance
13               Is_HH_Have_Refrigerator    0.319152
12       

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

# Standardize numeric features, exclude non-numeric (e.g., IDs)
def standardize_features(df, target_column):
    non_numeric_cols = df.select_dtypes(include=['object', 'string']).columns.tolist()
    if target_column in non_numeric_cols:
        non_numeric_cols.remove(target_column)

    X_numeric = df.drop(columns=[target_column] + non_numeric_cols)
    y = df[target_column].values
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_numeric)

    return X_scaled, y, scaler, non_numeric_cols, X_numeric.columns.tolist()

# Elbow method to find optimal clusters
def elbow_method_auto(X, max_clusters=10):
    inertias = []
    for k in range(1, max_clusters + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        kmeans.fit(X)
        inertias.append(kmeans.inertia_)
    # Find elbow: largest drop in inertia
    deltas = np.diff(inertias)
    second_deltas = np.diff(deltas)
    elbow_point = np.argmin(second_deltas) + 2  # +2 accounts for double diff index shift
    return elbow_point

# Mean Percentage Error
def mean_percentage_error(y_true, y_pred):
    # Avoid division by zero
    non_zero_mask = y_true != 0
    if np.any(non_zero_mask):
        return np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100
    return np.nan

# Evaluate and print metrics
def evaluate_predictions(y_true, y_pred, method_name=""):
    mpe = mean_percentage_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"\n--- {method_name} ---")
    print(f"Mean Percentage Error (MPE): {mpe:.2f}%")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"R² Score: {r2:.4f}")
    
    return {
        'mpe': mpe,
        'mae': mae,
        'r2': r2
    }

# Get base models for stacking
def get_base_models():
    models = {
        'random_forest': RandomForestRegressor(random_state=42, n_estimators=100),
        'xgboost': XGBRegressor(random_state=42, n_estimators=100, verbosity=0),
        'lightgbm': LGBMRegressor(random_state=42, n_estimators=100, verbose=-1)
    }
    return models

# Get meta model for stacking
def get_meta_model():
    return LinearRegression()

# Stacked model implementation for a specific cluster
def train_stacked_model(X, y, n_folds=5):
    # Setup K-fold cross-validation
    kfold = KFold(n_splits=n_folds, random_state=42, shuffle=True)
    
    # Get base models and meta model
    base_models = get_base_models()
    meta_model = get_meta_model()
    
    # Placeholder for meta features (predictions from base models)
    meta_features = np.zeros((X.shape[0], len(base_models)))
    
    # Train base models and generate meta features
    for i, (model_name, model) in enumerate(base_models.items()):
        print(f"Training base model: {model_name}")
        # Use cross-validation to create out-of-fold predictions
        oof_predictions = np.zeros(X.shape[0])
        
        for train_idx, val_idx in kfold.split(X):
            X_train, X_val = X[train_idx], X[val_idx]
            y_train, y_val = y[train_idx], y[val_idx]
            
            model.fit(X_train, y_train)
            oof_predictions[val_idx] = model.predict(X_val)
        
        # Store out-of-fold predictions as meta features
        meta_features[:, i] = oof_predictions
        
        # Retrain the model on the full data
        model.fit(X, y)
    
    # Train meta model on the meta features
    meta_model.fit(meta_features, y)
    
    # Create a stacked model dictionary containing all models
    stacked_model = {
        'base_models': base_models,
        'meta_model': meta_model
    }
    
    return stacked_model

# Main function: cluster, predict, evaluate using stacked model
def cluster_and_predict_stacked(train_df, target_column='WeightedExpense', max_clusters=10):
    """
    Implement clustering approach with stacked model for each cluster
    """
    # Step 1: Standardize numeric features
    X_scaled, y, scaler, non_numeric_cols, numeric_cols = standardize_features(train_df, target_column)

    # Step 2: Find optimal clusters
    optimal_k = elbow_method_auto(X_scaled, max_clusters=max_clusters)
    print(f"Optimal number of clusters (Elbow method): {optimal_k}")

    # Step 3: KMeans clustering
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init='auto')
    clusters = kmeans.fit_predict(X_scaled)

    # Add cluster labels to DataFrame for reference
    df_with_clusters = train_df.copy()
    df_with_clusters['Cluster'] = clusters

    # Step 4: Cluster-wise stacked model training
    preds_model = np.zeros_like(y)
    cluster_models = {}
    
    for cluster_id in np.unique(clusters):
        cluster_idx = clusters == cluster_id
        X_cluster = X_scaled[cluster_idx]
        y_cluster = y[cluster_idx]
        
        print(f"Training stacked model for cluster {cluster_id} with {len(y_cluster)} samples")
        
        # Skip empty clusters (shouldn't happen but just in case)
        if len(y_cluster) < n_folds:
            print(f"Skipping cluster {cluster_id} - not enough samples for {n_folds}-fold CV")
            continue

        # Train stacked model for this cluster
        stacked_model = train_stacked_model(X_cluster, y_cluster)
        
        # Make predictions for this cluster
        # First get base model predictions
        base_preds = np.zeros((X_cluster.shape[0], len(stacked_model['base_models'])))
        for i, (name, model) in enumerate(stacked_model['base_models'].items()):
            base_preds[:, i] = model.predict(X_cluster)
            
        # Then use meta model to make final predictions
        preds_model[cluster_idx] = stacked_model['meta_model'].predict(base_preds)

        # Save stacked model per cluster
        cluster_models[cluster_id] = stacked_model
        
        # Print feature importance for Random Forest and tree-based models
        for name, model in stacked_model['base_models'].items():
            if hasattr(model, 'feature_importances_') and len(numeric_cols) > 0:
                feature_importance = pd.DataFrame({
                    'Feature': numeric_cols,
                    'Importance': model.feature_importances_
                }).sort_values('Importance', ascending=False)
                
                print(f"\nTop 5 features for {name} in cluster {cluster_id}:")
                print(feature_importance.head(5))

    # Evaluate cluster-wise stacked model
    evaluate_predictions(y, preds_model, method_name="Cluster-wise Stacked Model Prediction")

    # Step 4B: Cluster mean assignment (as a simple baseline)
    preds_mean = np.zeros_like(y)
    for cluster_id in np.unique(clusters):
        cluster_idx = clusters == cluster_id
        y_cluster = y[cluster_idx]
        cluster_mean = np.mean(y_cluster)
        preds_mean[cluster_idx] = cluster_mean

    # Evaluate cluster mean assignment
    evaluate_predictions(y, preds_mean, method_name="Cluster Mean Assignment")

    return preds_model, preds_mean, cluster_models, kmeans, scaler, numeric_cols, non_numeric_cols, clusters, optimal_k

# Predict on test data using cluster-specific stacked models
def predict_on_test_data_stacked(test_df, kmeans, cluster_models, scaler, numeric_cols, non_numeric_cols):
    """
    Predict expenses on test data using trained cluster-specific stacked models
    """
    # Create a DataFrame with only the numeric columns in the correct order
    X_numeric_test = pd.DataFrame(index=test_df.index)
    
    # Track any columns missing from test data
    missing_columns = []
    
    for col in numeric_cols:
        if col in test_df.columns:
            # Check if this specific column has nulls before adding it
            if test_df[col].isnull().any():
                # Fill nulls in this column with its median before adding
                X_numeric_test[col] = test_df[col].fillna(test_df[col].median())
                print(f"Filled nulls in column '{col}' with its median")
            else:
                X_numeric_test[col] = test_df[col]
        else:
            missing_columns.append(col)
            print(f"Warning: Column '{col}' missing from test data. Using zeros.")
            X_numeric_test[col] = 0
    
    # Report on missing columns if any
    if missing_columns:
        print(f"\nTotal missing columns: {len(missing_columns)}")
        if len(missing_columns) <= 10:  # Only print if there aren't too many
            print(f"Missing columns: {', '.join(missing_columns)}")
        else:
            print(f"First 10 missing columns: {', '.join(missing_columns[:10])}...")
    else:
        print("All required numeric columns found in test data")
    
    # Standardize features using the same scaler from training
    X_scaled_test = scaler.transform(X_numeric_test)
    
    # Predict clusters for test data
    test_clusters = kmeans.predict(X_scaled_test)
    
    # Predict using cluster-specific stacked models
    predictions = np.zeros(X_scaled_test.shape[0])
    
    for cluster_id in np.unique(test_clusters):
        cluster_idx = test_clusters == cluster_id
        X_cluster_test = X_scaled_test[cluster_idx]
        
        if cluster_id in cluster_models:
            stacked_model = cluster_models[cluster_id]
            
            # Get predictions from base models
            base_preds = np.zeros((X_cluster_test.shape[0], len(stacked_model['base_models'])))
            for i, (name, model) in enumerate(stacked_model['base_models'].items()):
                base_preds[:, i] = model.predict(X_cluster_test)
            
            # Use meta model for final prediction
            cluster_preds = stacked_model['meta_model'].predict(base_preds)
            predictions[cluster_idx] = cluster_preds
            
            print(f"Made predictions for {np.sum(cluster_idx)} samples in cluster {cluster_id}")
        else:
            print(f"Warning: No model for cluster {cluster_id}, using average prediction")
            # Use average of all predictions for this cluster
            all_preds = []
            for c_id, stacked_model in cluster_models.items():
                # Get base model predictions
                base_preds = np.zeros((X_cluster_test.shape[0], len(stacked_model['base_models'])))
                for i, (name, model) in enumerate(stacked_model['base_models'].items()):
                    base_preds[:, i] = model.predict(X_cluster_test)
                
                # Get meta model predictions
                c_preds = stacked_model['meta_model'].predict(base_preds)
                all_preds.append(np.mean(c_preds))
            
            # Use average of all cluster predictions
            predictions[cluster_idx] = np.mean(all_preds)
    
    # Add predictions to test DataFrame
    test_df_with_preds = test_df.copy()
    test_df_with_preds['predicted_expense'] = predictions
    test_df_with_preds['assigned_cluster'] = test_clusters
    
    # Display sample predictions
    print("\nSample Predictions:")
    print(test_df_with_preds[['predicted_expense', 'assigned_cluster']].head())
    
    return test_df_with_preds

# Aggregate by household ID
def aggregate_household_predictions(test_results):
    """
    Aggregate predictions by household ID
    """
    if 'HH_ID' not in test_results.columns:
        print("Warning: 'HH_ID' column not found. Cannot aggregate by household.")
        return test_results
    
    # Calculate the sum of predicted_expense for each household ID
    household_sums = test_results.groupby('HH_ID')['predicted_expense'].sum()
    
    # Replace each row's predicted_expense with its household sum
    test_results_aggregated = test_results.copy()
    test_results_aggregated['predicted_expense'] = test_results_aggregated['HH_ID'].map(household_sums)
    
    # Verify the result with a sample household
    sample_household = test_results_aggregated['HH_ID'].iloc[0]
    sample_rows = test_results_aggregated[test_results_aggregated['HH_ID'] == sample_household].head()
    print(f"\nSample rows from household {sample_household}:")
    print(sample_rows[['HH_ID', 'predicted_expense', 'assigned_cluster']])
    
    return test_results_aggregated

# Compare predictions with actual values
def compare_expenses(predictions_df, actual_df, 
                     pred_col='predicted_expense', actual_col='TotalExpense',
                     join_col='HH_ID'):
    """
    Compare predicted expenses with actual expenses
    """
    # Check if the datasets have the required columns
    required_cols = {
        'predictions': [pred_col, join_col],
        'actual': [actual_col, join_col]
    }
    
    for df_name, cols in required_cols.items():
        df = predictions_df if df_name == 'predictions' else actual_df
        missing = [col for col in cols if col not in df.columns]
        if missing:
            raise ValueError(f"{df_name} dataset is missing columns: {missing}")
    
    # If we have multiple rows per household, keep only one row per household in each dataset
    pred_unique = predictions_df.drop_duplicates(subset=[join_col])[[join_col, pred_col]]
    actual_unique = actual_df.drop_duplicates(subset=[join_col])[[join_col, actual_col]]
    
    # Merge the datasets
    merged_df = pd.merge(pred_unique, actual_unique, on=join_col, how='inner')
    
    # Print the number of households we can compare
    print(f"Number of households for comparison: {len(merged_df)}")
    
    # Extract the values for comparison
    y_pred = merged_df[pred_col].values
    y_true = merged_df[actual_col].values
    
    # Calculate and return metrics
    metrics = evaluate_predictions(y_true, y_pred, method_name="Final Evaluation")
    
    # Save the comparison to a new CSV for further analysis
    merged_df['error'] = y_pred - y_true
    merged_df['percentage_error'] = (merged_df['error'] / merged_df[actual_col]) * 100
    merged_df.to_csv('expense_comparison_results.csv', index=False)
    print("\nDetailed comparison saved to 'expense_comparison_results.csv'")
    
    return metrics, merged_df

# Main execution
if __name__ == "__main__":
    # 1. Load datasets with paths from the provided code
    print("Loading datasets...")
    train_path = '/Users/rishav/Downloads/IIT_GN/Train_Data/train_data_final.csv'  # Using the path from the provided code
    test_path = '/Users/rishav/Downloads/IIT_GN/Train_Data/test_data_final.csv'  # Using the path from the provided code
    
    try:
        train_data = pd.read_csv(train_path)
        test_data = pd.read_csv(test_path)
        
        print(f"Training data shape: {train_data.shape}")
        print(f"Testing data shape: {test_data.shape}")
        
        # Print information about null values in test data
        null_counts = test_data.isnull().sum()
        print("\nNull value counts in test data:")
        print(null_counts[null_counts > 0])
        if null_counts.sum() == 0:
            print("No null values found in test data")
        
        # 2. Fixed parameters
        target_column = 'WeightedExpense'
        n_folds = 5  # Number of folds for cross-validation in stacking
        
        # 3. Train cluster-wise stacked models
        print(f"\nTraining cluster-wise stacked models...")
        preds_model, preds_mean, cluster_models, kmeans, scaler, numeric_cols, non_numeric_cols, clusters, optimal_k = cluster_and_predict_stacked(
            train_data, 
            target_column=target_column,
            max_clusters=10
        )

        # 4. Make predictions on test data using stacked models
        print("\nMaking predictions on test data...")
        test_with_predictions = predict_on_test_data_stacked(
            test_data,
            kmeans,
            cluster_models,
            scaler,
            numeric_cols,
            non_numeric_cols
        )

        # 5. Aggregate predictions by household ID
        print("\nAggregating predictions by household...")
        final_results = aggregate_household_predictions(test_with_predictions)

        # 6. Save the final results
        output_path = 'test_predictions_stacked_model.csv'
        final_results.to_csv(output_path, index=False)
        print(f"\nFinal results saved to {output_path}")

        # 7. Compare with actual values (since we're loading final_test_dataset.csv which should have actual values)
        if 'TotalExpense' in test_data.columns:
            print("\nActual values found in test data. Comparing predictions...")
            metrics, comparison_df = compare_expenses(
                final_results,
                test_data,
                pred_col='predicted_expense',
                actual_col='TotalExpense',
                join_col='HH_ID'
            )
        else:
            print("\nNo 'TotalExpense' column found in test data. Skipping comparison.")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

Loading datasets...
Training data shape: (901723, 50)
Testing data shape: (225316, 50)

Null value counts in test data:
Series([], dtype: int64)
No null values found in test data

Training cluster-wise stacked models...
Optimal number of clusters (Elbow method): 7
Training stacked model for cluster 0 with 170811 samples
Training base model: random_forest
Training base model: xgboost
Training base model: lightgbm





Top 5 features for random_forest in cluster 0:
                    Feature  Importance
0            Person Srl No.    0.132419
22        HH Size (For FDQ)    0.129969
1   Relation to head (code)    0.100957
24                   NIC_5D    0.061566
3             Age(in years)    0.053910

Top 5 features for xgboost in cluster 0:
                       Feature  Importance
0               Person Srl No.    0.240340
22           HH Size (For FDQ)    0.080720
45     Is_HH_Have_Refrigerator    0.059460
1      Relation to head (code)    0.056594
46  Is_HH_Have_Washing_machine    0.048659

Top 5 features for lightgbm in cluster 0:
                                              Feature  Importance
16                                              State         320
18                                           District         315
22                                  HH Size (For FDQ)         243
17                                         NSS-Region         241
13  No. of meals taken during last 30 d




Top 5 features for random_forest in cluster 1:
              Feature  Importance
18           District    0.107002
17         NSS-Region    0.092360
22  HH Size (For FDQ)    0.089114
24             NIC_5D    0.083437
3       Age(in years)    0.069685

Top 5 features for xgboost in cluster 1:
                          Feature  Importance
45        Is_HH_Have_Refrigerator    0.162994
22              HH Size (For FDQ)    0.052001
42   Is_HH_Have_Motorcar_jeep_van    0.051991
36          Is_HH_Have_Television    0.045130
41  Is_HH_Have_Motorcycle_scooter    0.045014

Top 5 features for lightgbm in cluster 1:
              Feature  Importance
18           District         458
16              State         395
17         NSS-Region         320
22  HH Size (For FDQ)         217
3       Age(in years)         139
Training stacked model for cluster 2 with 119558 samples
Training base model: random_forest
Training base model: xgboost
Training base model: lightgbm





Top 5 features for random_forest in cluster 2:
                    Feature  Importance
22        HH Size (For FDQ)    0.126328
3             Age(in years)    0.092126
1   Relation to head (code)    0.085538
0            Person Srl No.    0.083067
18                 District    0.074336

Top 5 features for xgboost in cluster 2:
                                              Feature  Importance
0                                      Person Srl No.    0.191149
22                                  HH Size (For FDQ)    0.065539
42                       Is_HH_Have_Motorcar_jeep_van    0.060616
46                         Is_HH_Have_Washing_machine    0.045552
11  No. of meals taken during last 30 days from em...    0.033243

Top 5 features for lightgbm in cluster 2:
              Feature  Importance
18           District         415
22  HH Size (For FDQ)         325
16              State         243
3       Age(in years)         201
24             NIC_5D         178
Training stacked model for 




Top 5 features for random_forest in cluster 3:
                    Feature  Importance
18                 District    0.107087
3             Age(in years)    0.104129
17               NSS-Region    0.089408
22        HH Size (For FDQ)    0.085690
1   Relation to head (code)    0.068747

Top 5 features for xgboost in cluster 3:
                                              Feature  Importance
0                                      Person Srl No.    0.184640
22                                  HH Size (For FDQ)    0.050548
7   Whether used internet from any location during...    0.048634
25               Is_online_Clothing_Purchased_Last365    0.042322
5           Highest educational level attained (code)    0.040728

Top 5 features for lightgbm in cluster 3:
              Feature  Importance
18           District         423
16              State         307
22  HH Size (For FDQ)         302
17         NSS-Region         269
23             NCO_3D         123
Training stacked model for 




Top 5 features for random_forest in cluster 4:
                    Feature  Importance
22        HH Size (For FDQ)    0.139393
1   Relation to head (code)    0.074046
0            Person Srl No.    0.070954
3             Age(in years)    0.070589
24                   NIC_5D    0.068641

Top 5 features for xgboost in cluster 4:
                                Feature  Importance
0                        Person Srl No.    0.166646
42         Is_HH_Have_Motorcar_jeep_van    0.078829
22                    HH Size (For FDQ)    0.071321
38                 Is_HH_Have_Laptop_PC    0.039813
47  Is_HH_Have_Airconditioner_aircooler    0.032921

Top 5 features for lightgbm in cluster 4:
              Feature  Importance
18           District         321
22  HH Size (For FDQ)         269
16              State         256
23             NCO_3D         217
24             NIC_5D         199
Training stacked model for cluster 5 with 14874 samples
Training base model: random_forest
Training base model:




Top 5 features for random_forest in cluster 5:
                         Feature  Importance
22             HH Size (For FDQ)    0.130342
42  Is_HH_Have_Motorcar_jeep_van    0.087196
23                        NCO_3D    0.071835
1        Relation to head (code)    0.064049
3                  Age(in years)    0.054536

Top 5 features for xgboost in cluster 5:
                                              Feature  Importance
46                         Is_HH_Have_Washing_machine    0.139834
42                       Is_HH_Have_Motorcar_jeep_van    0.119322
38                               Is_HH_Have_Laptop_PC    0.069219
22                                  HH Size (For FDQ)    0.055239
8   No. of days stayed away from home during last ...    0.053595

Top 5 features for lightgbm in cluster 5:
              Feature  Importance
18           District         291
23             NCO_3D         290
22  HH Size (For FDQ)         290
24             NIC_5D         219
16              State         2




Top 5 features for random_forest in cluster 6:
              Feature  Importance
22  HH Size (For FDQ)    0.090357
3       Age(in years)    0.085026
18           District    0.073536
24             NIC_5D    0.065570
17         NSS-Region    0.060186

Top 5 features for xgboost in cluster 6:
                       Feature  Importance
45     Is_HH_Have_Refrigerator    0.150747
0               Person Srl No.    0.100554
46  Is_HH_Have_Washing_machine    0.075944
22           HH Size (For FDQ)    0.051849
1      Relation to head (code)    0.044064

Top 5 features for lightgbm in cluster 6:
              Feature  Importance
18           District         397
17         NSS-Region         290
16              State         271
22  HH Size (For FDQ)         262
3       Age(in years)         156

--- Cluster-wise Stacked Model Prediction ---
Mean Percentage Error (MPE): 16.75%
Mean Absolute Error (MAE): 822.24
R² Score: 0.8276

--- Cluster Mean Assignment ---
Mean Percentage Error (MPE): 46.71



Made predictions for 42075 samples in cluster 0




Made predictions for 55738 samples in cluster 1




Made predictions for 29688 samples in cluster 2




Made predictions for 41565 samples in cluster 3




Made predictions for 21690 samples in cluster 4
Made predictions for 3707 samples in cluster 5




Made predictions for 30853 samples in cluster 6

Sample Predictions:
   predicted_expense  assigned_cluster
0        6582.274325                 3
1        3931.624001                 2
2        3846.261876                 2
3        3094.223577                 2
4        8172.224191                 4

Aggregating predictions by household...

Sample rows from household HCES2022655561010131113011101202304:
                                 HH_ID  predicted_expense  assigned_cluster
0  HCES2022655561010131113011101202304        17454.38378                 3
1  HCES2022655561010131113011101202304        17454.38378                 2
2  HCES2022655561010131113011101202304        17454.38378                 2
3  HCES2022655561010131113011101202304        17454.38378                 2

Final results saved to test_predictions_stacked_model.csv

Actual values found in test data. Comparing predictions...
Number of households for comparison: 52350

--- Final Evaluation ---
Mean Percentage Error (