In [21]:
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
warnings.filterwarnings('ignore')

In [13]:
# Read the processed train and test parquet files into DataFrames.
train_df, test_df = (
    pd.read_parquet('../data/processed/train.parquet'),
    pd.read_parquet('../data/processed/test.parquet'),
)


In [32]:
# create sklearn pipeline with pre-processing of numerical and categorical features
def pipeline(train_df, target_col="resale_price"):
    """Build an unfitted preprocessing + linear-regression pipeline.

    Feature columns are discovered from the dtypes of ``train_df`` after the
    target column has been removed:

    * every numeric column (any ``np.number`` dtype) is standardised;
    * every ``object`` or ``category`` column is one-hot encoded, with
      categories unseen at fit time ignored at transform time.

    Columns of any other dtype are not passed to the model (the
    ``ColumnTransformer`` keeps its default ``remainder='drop'``); they are
    printed so the omission is visible.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame containing the features and ``target_col``.
    target_col : str, default "resale_price"
        Name of the target column to exclude from the features.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'preprocessor'`` and ``'model'``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``train_df``.
    """
    # Select features from the frame *without* the target so the target can
    # never end up in either feature list, whatever its dtype.
    feature_df = train_df.drop(columns=[target_col])

    # np.number also matches int32/float32 columns, which the previous
    # ['int64', 'float64'] filter silently left out of the model.
    numeric_features = feature_df.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = feature_df.select_dtypes(include=['object', 'category']).columns.tolist()
    print(f"Numeric features: {numeric_features}")
    print(f"Categorical features: {categorical_features}")

    selected = set(numeric_features) | set(categorical_features)
    ignored_features = [col for col in feature_df.columns if col not in selected]
    if ignored_features:
        print(f"Ignored features (unsupported dtype): {ignored_features}")

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ]
    )

    # Named differently from the enclosing function to avoid shadowing it.
    model_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ])
    return model_pipeline

In [33]:
# train model
def train_model(pipeline, train_df):
    """Cross-validate ``pipeline`` on ``train_df`` and then fit it on every row.

    The ``resale_price`` column is used as the target and all remaining
    columns as features. A shuffled 5-fold split (seed 42) is scored with
    RMSE; the mean and standard deviation across folds are printed. The
    pipeline is then fitted in place on the full frame and returned.
    """
    print("Training model")
    features = train_df.drop(columns=['resale_price'])
    target = train_df['resale_price']

    fold_scores = cross_val_score(
        pipeline,
        features,
        target,
        cv=KFold(n_splits=5, shuffle=True, random_state=42),
        scoring='neg_root_mean_squared_error',
        verbose=1,
    )

    # The scorer returns negated RMSE, so flip the sign of the mean for display.
    mean_rmse = -fold_scores.mean()
    rmse_std = fold_scores.std()
    print(f"Cross-validated RMSE: {mean_rmse:.2f} ± {rmse_std:.2f}")

    pipeline.fit(features, target)

    return pipeline

In [34]:
# Build the pipeline from the training frame, cross-validate and fit it,
# then display the fitted estimator as the cell output.
trained_pipeline = train_model(pipeline(train_df), train_df)
trained_pipeline

Numeric features: ['flat_age_years', 'floor_area_sqm', 'days_from_earliest_data']
Categorical features: ['town', 'flat_type', 'flat_model_revised']
Training model


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.9s finished


Cross-validated RMSE: 73115.96 ± 305.89


0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [None]:
def load_data(train_path='../data/processed/train.parquet', test_path='../data/processed/test.parquet'):
    """Load the train and test datasets from parquet files.

    Parameters
    ----------
    train_path : str
        Location of the training parquet file.
    test_path : str
        Location of the test parquet file.

    Returns
    -------
    tuple
        ``(train_df, test_df)`` on success, or ``(None, None)`` when either
        file cannot be found. Callers are expected to check for ``None``.
    """
    # Only the reads are guarded, so an unrelated FileNotFoundError cannot be
    # mistaken for a missing dataset.
    try:
        train_df = pd.read_parquet(train_path)
        test_df = pd.read_parquet(test_path)
    except FileNotFoundError as e:
        print(f"Error loading data: {e}")
        # Report the paths that were actually tried; the defaults are not in
        # the current directory.
        print(f"Please ensure the parquet files exist at '{train_path}' and '{test_path}'")
        return None, None

    print(f"Train data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")
    return train_df, test_df

def validate_features(df, required_features):
    """Return True when every name in ``required_features`` is a column of ``df``.

    When columns are absent, the missing names and the columns that are
    available are printed and False is returned.
    """
    absent = set(required_features) - set(df.columns)
    if not absent:
        return True

    print(f"Missing features in dataset: {absent}")
    print(f"Available columns: {list(df.columns)}")
    return False

def preprocess_data(train_df, test_df, feature_cols, target_col):
    """Impute and dummy-encode the train and test features.

    Steps:

    1. Check that the requested columns exist in both frames.
    2. Split the features into numeric columns (``np.number``) and
       categorical columns (``object`` or ``category``). Columns of any
       other dtype are not carried into the output; they are reported.
    3. Impute missing values with imputers fitted on the training data only
       (median for numeric, most frequent value for categorical).
    4. One-hot encode the categorical columns with ``drop_first=True``. The
       dummy columns are derived from the train and test categories combined
       so that both outputs have identical columns.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Source frames. They are not modified.
    feature_cols : list of str
        Feature column names to use.
    target_col : str
        Target column name; required in ``train_df`` only.

    Returns
    -------
    tuple
        ``(X_train_processed, X_test_processed, y_train, categorical_columns,
        dummy_columns)``, or a tuple of five ``None`` values when a required
        column is missing.
    """
    failure = (None, None, None, None, None)

    # Validate features exist
    if not validate_features(train_df, feature_cols + [target_col]):
        return failure

    if not validate_features(test_df, feature_cols):
        return failure

    # Extract features and target from train data
    X_train = train_df[feature_cols].copy()
    y_train = train_df[target_col].copy()
    X_test = test_df[feature_cols].copy()

    print(f"Features used: {feature_cols}")
    print(f"Target variable: {target_col}")

    # Identify numeric and categorical columns. 'category' is included so
    # that pandas categoricals are encoded rather than silently left out.
    numeric_columns = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    print(f"Numeric features: {numeric_columns}")
    print(f"Categorical features: {categorical_columns}")

    # Anything left over would vanish from the output without a trace.
    handled = set(numeric_columns) | set(categorical_columns)
    ignored_columns = [col for col in X_train.columns if col not in handled]
    if ignored_columns:
        print(f"Warning: features with unsupported dtypes are not used: {ignored_columns}")

    # Handle missing values
    # Numeric columns - impute with median
    if numeric_columns:
        numeric_imputer = SimpleImputer(strategy='median')
        X_train[numeric_columns] = numeric_imputer.fit_transform(X_train[numeric_columns])
        X_test[numeric_columns] = numeric_imputer.transform(X_test[numeric_columns])

    # Categorical columns - impute with mode
    if categorical_columns:
        # Work on plain object columns so the imputed value never has to be
        # an already-declared category of the test column.
        as_object = {col: object for col in categorical_columns}
        X_train = X_train.astype(as_object)
        X_test = X_test.astype(as_object)

        categorical_imputer = SimpleImputer(strategy='most_frequent')
        X_train[categorical_columns] = categorical_imputer.fit_transform(X_train[categorical_columns])
        X_test[categorical_columns] = categorical_imputer.transform(X_test[categorical_columns])

    # Apply dummy encoding (one-hot encoding) to categorical variables
    if categorical_columns:
        # Combine train and test for consistent dummy encoding
        combined_categorical = pd.concat([
            X_train[categorical_columns],
            X_test[categorical_columns]
        ], axis=0, ignore_index=True)

        # Create dummy variables
        combined_dummies = pd.get_dummies(combined_categorical, drop_first=True, prefix=categorical_columns)

        # Split back into train and test, restoring each frame's own index
        # (set_axis returns a new object instead of mutating an iloc slice).
        n_train = len(X_train)
        train_dummies = combined_dummies.iloc[:n_train].set_axis(X_train.index, axis=0)
        test_dummies = combined_dummies.iloc[n_train:].set_axis(X_test.index, axis=0)

        # Combine numeric and dummy encoded categorical features
        X_train_processed = pd.concat([X_train[numeric_columns], train_dummies], axis=1)
        X_test_processed = pd.concat([X_test[numeric_columns], test_dummies], axis=1)

        dummy_columns = list(train_dummies.columns)

    else:
        X_train_processed = X_train[numeric_columns].copy()
        X_test_processed = X_test[numeric_columns].copy()
        dummy_columns = []

    print(f"Final feature count after dummy encoding: {X_train_processed.shape[1]}")
    print(f"Dummy encoded columns created: {len(dummy_columns)}")

    return X_train_processed, X_test_processed, y_train, categorical_columns, dummy_columns


In [4]:
def cross_validate_model(X_train, y_train, model_type='random_forest', cv_folds=5):
    """Estimate MAE, RMSE and R² for one model type with K-fold cross-validation.

    The scaler and the model are wrapped in a single pipeline, so the scaler
    is re-fitted on the training part of every fold and validation rows never
    influence the preprocessing. All three metrics come from one
    ``cross_validate`` run, so each fold's model is trained once.

    Parameters
    ----------
    X_train : array-like of shape (n_samples, n_features)
        Fully numeric training features.
    y_train : array-like of shape (n_samples,)
        Training target.
    model_type : {'random_forest', 'linear_regression'}
        Which regressor to evaluate.
    cv_folds : int, default 5
        Number of shuffled folds (seed 42).

    Returns
    -------
    cv_results : dict
        Mean and standard deviation per metric, under the keys
        ``cv_mae_*``, ``cv_rmse_*`` and ``cv_r2_*``.
    model : estimator
        The unfitted regressor that was evaluated.
    scaler : StandardScaler
        Scaler fitted on the whole of ``X_train``.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names.
    """
    # Initialize model
    if model_type == 'random_forest':
        model = RandomForestRegressor(
            n_estimators=100,
            max_depth=15,
            min_samples_split=5,
            min_samples_leaf=2,
            random_state=42
        )
    elif model_type == 'linear_regression':
        model = LinearRegression()
    else:
        # Previously this fell through and failed later with UnboundLocalError.
        raise ValueError(
            f"Unknown model_type {model_type!r}; "
            "expected 'random_forest' or 'linear_regression'"
        )

    scaler = StandardScaler()

    # cross_validate clones this pipeline for every fold, so `model` and
    # `scaler` themselves are left untouched by the evaluation.
    cv_estimator = Pipeline(steps=[('scaler', scaler), ('model', model)])

    # Set up cross-validation
    kf = KFold(n_splits=cv_folds, shuffle=True, random_state=42)

    # Perform cross-validation once for all three metrics
    fold_scores = cross_validate(
        cv_estimator,
        X_train,
        y_train,
        cv=kf,
        scoring={
            'mae': 'neg_mean_absolute_error',
            'rmse': 'neg_root_mean_squared_error',
            'r2': 'r2',
        },
    )
    # Error scorers are negated by scikit-learn; flip them back.
    cv_mae_scores = -fold_scores['test_mae']
    cv_rmse_scores = -fold_scores['test_rmse']
    cv_r2_scores = fold_scores['test_r2']

    # Calculate mean and std of CV scores
    cv_results = {
        'cv_mae_mean': cv_mae_scores.mean(),
        'cv_mae_std': cv_mae_scores.std(),
        'cv_rmse_mean': cv_rmse_scores.mean(),
        'cv_rmse_std': cv_rmse_scores.std(),
        'cv_r2_mean': cv_r2_scores.mean(),
        'cv_r2_std': cv_r2_scores.std()
    }

    # Callers have always received a scaler fitted on the full training set.
    scaler.fit(X_train)

    return cv_results, model, scaler

def _log_feature_importance(model, feature_names, model_label):
    """Log a top-20 feature-importance bar chart and the full table to MLflow."""
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    # Plot top 20 features (or all if less than 20)
    top_features = feature_importance.head(20)

    fig, ax = plt.subplots(figsize=(10, 12))
    try:
        sns.barplot(data=top_features, x='importance', y='feature', ax=ax)
        ax.set_title(f'Top {len(top_features)} Feature Importances - {model_label}')
        ax.set_xlabel('Importance')
        fig.tight_layout()
        fig.savefig('feature_importance.png', dpi=300, bbox_inches='tight')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)
    mlflow.log_artifact('feature_importance.png')

    # Log feature importance as text
    mlflow.log_text(feature_importance.to_string(index=False), "feature_importance.txt")


def _log_training_scatter(y_train, y_train_pred, model_label):
    """Log an actual-vs-predicted scatter plot for the training set to MLflow."""
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        ax.scatter(y_train, y_train_pred, alpha=0.6)
        lowest, highest = y_train.min(), y_train.max()
        # Diagonal reference line: points on it are predicted exactly.
        ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
        ax.set_xlabel('Actual Resale Price')
        ax.set_ylabel('Predicted Resale Price')
        ax.set_title(f'{model_label} - Training: Actual vs Predicted')
        fig.tight_layout()
        fig.savefig('train_predictions_plot.png', dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)
    mlflow.log_artifact('train_predictions_plot.png')


def _log_cv_results_plot(cv_results, model_label, cv_folds):
    """Log a three-panel bar chart of the CV MAE, RMSE and R² to MLflow."""
    panels = [('MAE', 'mae'), ('RMSE', 'rmse'), ('R²', 'r2')]

    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    try:
        for ax, (label, key) in zip(axes, panels):
            ax.bar(label, cv_results[f'cv_{key}_mean'],
                   yerr=cv_results[f'cv_{key}_std'], capsize=5, alpha=0.7)
            ax.set_title(f'{cv_folds}-Fold CV {label}')
            # The error metrics are labelled in dollars; R² is unitless.
            ax.set_ylabel(label if label == 'R²' else f'{label} ($)')

        fig.suptitle(f'{model_label} - Cross-Validation Results')
        fig.tight_layout()
        fig.savefig('cv_results_plot.png', dpi=300, bbox_inches='tight')
    finally:
        plt.close(fig)
    mlflow.log_artifact('cv_results_plot.png')


def train_final_model(X_train, X_test, y_train, model_type='random_forest', cv_folds=5):
    """Cross-validate, fit and log one model inside a new MLflow run.

    The function cross-validates ``model_type`` via ``cross_validate_model``,
    fits the returned scaler and model on all of ``X_train``, predicts
    ``X_test`` and logs parameters, metrics, the model, the scaler, plots and
    a text summary to MLflow. The plot PNG files are written to the current
    working directory before being logged.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Fully numeric feature frames with identical columns.
    y_train : pandas.Series
        Training target.
    model_type : str, default 'random_forest'
        Passed through to ``cross_validate_model``.
    cv_folds : int, default 5
        Number of cross-validation folds; also used in the logged parameter,
        plot titles and summary text.

    Returns
    -------
    tuple
        ``(model, scaler, y_test_pred, cv_results)``.
    """
    model_label = model_type.replace("_", " ").title()

    with mlflow.start_run():
        # Perform cross-validation first
        cv_results, model, scaler = cross_validate_model(X_train, y_train, model_type, cv_folds)

        # Log parameters
        mlflow.log_param("model_type", model_type)
        mlflow.log_param("n_features", X_train.shape[1])
        mlflow.log_param("n_train_samples", X_train.shape[0])
        mlflow.log_param("n_test_samples", X_test.shape[0])
        mlflow.log_param("cv_folds", cv_folds)

        # Log hyperparameters straight from the estimator so the logged
        # values cannot drift from what is actually trained.
        if model_type == 'random_forest':
            estimator_params = model.get_params()
            for name in ("n_estimators", "max_depth", "min_samples_split", "min_samples_leaf"):
                mlflow.log_param(name, estimator_params[name])

        # Log cross-validation results
        for metric, value in cv_results.items():
            mlflow.log_metric(metric, value)

        # Train final model on full training data
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model.fit(X_train_scaled, y_train)

        # Make predictions
        y_train_pred = model.predict(X_train_scaled)
        y_test_pred = model.predict(X_test_scaled)

        # Calculate training metrics (for comparison with CV)
        train_mae = mean_absolute_error(y_train, y_train_pred)
        train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
        train_r2 = r2_score(y_train, y_train_pred)

        # Log training metrics
        mlflow.log_metric("train_mae", train_mae)
        mlflow.log_metric("train_rmse", train_rmse)
        mlflow.log_metric("train_r2", train_r2)

        # Log model artifacts
        mlflow.sklearn.log_model(model, "model")
        mlflow.sklearn.log_model(scaler, "scaler")

        # Feature importances exist only for tree-based models
        if hasattr(model, 'feature_importances_'):
            _log_feature_importance(model, X_train.columns, model_label)

        _log_training_scatter(y_train, y_train_pred, model_label)
        _log_cv_results_plot(cv_results, model_label, cv_folds)

        # Log model summary
        mlflow.log_text(f"""
        Model Training Summary:
        =====================
        Model Type: {model_type}
        Training Samples: {X_train.shape[0]}
        Test Samples: {X_test.shape[0]}
        Features: {X_train.shape[1]}
        
        Cross-Validation Results ({cv_folds}-Fold):
        =================================
        CV MAE: ${cv_results['cv_mae_mean']:,.2f} ± ${cv_results['cv_mae_std']:,.2f}
        CV RMSE: ${cv_results['cv_rmse_mean']:,.2f} ± ${cv_results['cv_rmse_std']:,.2f}
        CV R²: {cv_results['cv_r2_mean']:.4f} ± {cv_results['cv_r2_std']:.4f}
        
        Training Set Performance:
        ========================
        Training MAE: ${train_mae:,.2f}
        Training RMSE: ${train_rmse:,.2f}
        Training R²: {train_r2:.4f}
        
        Note: Model selection based on cross-validation R² score.
        """, "model_summary.txt")

        print(f"\n{model_label} Results:")
        print(f"Cross-Validation R²: {cv_results['cv_r2_mean']:.4f} ± {cv_results['cv_r2_std']:.4f}")
        print(f"Cross-Validation MAE: ${cv_results['cv_mae_mean']:,.2f} ± ${cv_results['cv_mae_std']:,.2f}")
        print(f"Cross-Validation RMSE: ${cv_results['cv_rmse_mean']:,.2f} ± ${cv_results['cv_rmse_std']:,.2f}")
        print(f"Training R²: {train_r2:.4f}")

        return model, scaler, y_test_pred, cv_results


In [5]:
def main():
    """Run the resale-price workflow end to end.

    Loads the train/test data, prints a short overview, preprocesses the
    features, trains and cross-validates each candidate model (each in its
    own MLflow run), prints a comparison table, reports the model with the
    highest mean cross-validation R², and writes one
    ``<model>_predictions.csv`` file per model to the current directory.

    Returns ``None``; returns early if the data cannot be loaded, required
    columns are missing, or preprocessing fails.
    """
    # Set MLflow experiment
    mlflow.set_experiment("Singapore_HDB_Resale_Price_CV_Selection")

    print("Starting Singapore HDB Resale Price Prediction Pipeline...")
    print("Using 5-Fold Cross-Validation for Model Selection")
    print("=" * 65)

    # Define features and target
    feature_cols = ['town', 'flat_type', 'flat_model_revised', 'flat_age_years',
                   'floor_area_sqm', 'days_from_earliest_data']
    target_col = 'resale_price'

    # Load data
    print("1. Loading data...")
    train_df, test_df = load_data()

    if train_df is None or test_df is None:
        return

    # Check the columns before the overview indexes them; otherwise a missing
    # column surfaces as a bare KeyError instead of the validation message.
    if not (validate_features(train_df, feature_cols + [target_col])
            and validate_features(test_df, feature_cols)):
        print("Required columns are missing. Please check your data.")
        return

    # Basic data exploration
    print("\n2. Data Overview:")
    print(f"Train data shape: {train_df.shape}")
    print(f"Test data shape: {test_df.shape}")

    print(f"Resale price range: ${train_df[target_col].min():,.0f} - ${train_df[target_col].max():,.0f}")
    print(f"Average resale price: ${train_df[target_col].mean():,.0f}")
    print(f"Median resale price: ${train_df[target_col].median():,.0f}")

    print("\nMissing values in train data:")
    missing_train = train_df[feature_cols + [target_col]].isnull().sum()
    print(missing_train[missing_train > 0] if missing_train.sum() > 0 else "No missing values")

    print("\nMissing values in test data:")
    missing_test = test_df[feature_cols].isnull().sum()
    print(missing_test[missing_test > 0] if missing_test.sum() > 0 else "No missing values")

    # Data distribution for categorical features (all are validated above)
    print("\nCategorical feature distributions:")
    categorical_features = ['town', 'flat_type', 'flat_model_revised']
    for feature in categorical_features:
        print(f"\n{feature}: {train_df[feature].nunique()} unique values")
        print(train_df[feature].value_counts().head())

    # Preprocess data
    print("\n3. Preprocessing data with dummy encoding...")
    result = preprocess_data(train_df, test_df, feature_cols, target_col)

    if result[0] is None:
        print("Preprocessing failed. Please check your data.")
        return

    # Only the feature frames and the target are needed from here on.
    X_train, X_test, y_train, _, _ = result

    print(f"Preprocessed training data shape: {X_train.shape}")
    print(f"Preprocessed test data shape: {X_test.shape}")

    # Train multiple models with cross-validation
    print("\n4. Training models with 5-fold cross-validation...")
    models = ['random_forest', 'linear_regression']
    results = {}
    predictions = {}

    for model_type in models:
        print(f"\nTraining {model_type.replace('_', ' ').title()}...")
        _, _, test_pred, cv_metrics = train_final_model(X_train, X_test, y_train, model_type)
        results[model_type] = cv_metrics
        predictions[model_type] = test_pred

    # Compare cross-validation results
    print("\n" + "=" * 65)
    print("MODEL COMPARISON (Cross-Validation Performance)")
    print("=" * 65)

    comparison_data = {}
    for model_type, metrics in results.items():
        comparison_data[model_type] = {
            'CV_R²_Mean': metrics['cv_r2_mean'],
            'CV_R²_Std': metrics['cv_r2_std'],
            'CV_MAE_Mean': metrics['cv_mae_mean'],
            'CV_MAE_Std': metrics['cv_mae_std'],
            'CV_RMSE_Mean': metrics['cv_rmse_mean'],
            'CV_RMSE_Std': metrics['cv_rmse_std']
        }

    comparison_df = pd.DataFrame(comparison_data).T
    # Round for display only; selecting on rounded scores could create ties.
    print(comparison_df.round(4))

    # Find best model based on cross-validation R²
    best_model = comparison_df['CV_R²_Mean'].idxmax()
    best_r2 = comparison_df.loc[best_model, 'CV_R²_Mean']
    best_r2_std = comparison_df.loc[best_model, 'CV_R²_Std']

    print(f"\nBest performing model: {best_model.replace('_', ' ').title()}")
    print(f"Best CV R² score: {best_r2:.4f} ± {best_r2_std:.4f}")

    # Save predictions to CSV files
    print("\n5. Saving test predictions...")
    for model_name, pred in predictions.items():
        pred_df = test_df.copy()
        pred_df['predicted_resale_price'] = pred
        filename = f'{model_name}_predictions.csv'
        pred_df.to_csv(filename, index=False)
        print(f"Saved {model_name} predictions to {filename}")

    print("\nPipeline completed successfully!")
    print("Model selection was based on 5-fold cross-validation performance.")
    print("Test set was used only for final predictions, not for model selection.")
    print("You can view the experiment results in MLflow UI by running: mlflow ui")
    print("Open http://localhost:5000 in your browser to see tracked experiments.")
    # Each training run is opened with `with mlflow.start_run()`; this call
    # is kept as a final safeguard.
    mlflow.end_run()

In [None]:
# Output model
# NOTE(review): this cell looks like a leftover from a different notebook.
# `torch`, `os`, `MODEL_PATH` and `model` are not imported or defined in any
# visible cell, so on a fresh kernel it would fail with NameError. The
# estimator fitted earlier in this notebook is the scikit-learn
# `trained_pipeline`, and the file name refers to a "rainfall_lstm" model.
# TODO: confirm, then remove this cell or replace it with persistence for
# `trained_pipeline`.
torch.save(model.state_dict(), os.path.join(MODEL_PATH, "rainfall_lstm_baseline.pth"))
print("Model saved to", os.path.join(MODEL_PATH, "rainfall_lstm_baseline.pth"))