# Initialization

In [101]:
! pip install xgboost scipy lightgbm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from scipy.stats import skew
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor



In [102]:
# Load the data
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

train_data.isnull()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,False,False,False,False,False,False,True,False,False,False,...,False,True,True,True,False,False,False,False,False,False
1,False,False,False,False,False,False,True,False,False,False,...,False,True,True,True,False,False,False,False,False,False
2,False,False,False,False,False,False,True,False,False,False,...,False,True,True,True,False,False,False,False,False,False
3,False,False,False,False,False,False,True,False,False,False,...,False,True,True,True,False,False,False,False,False,False
4,False,False,False,False,False,False,True,False,False,False,...,False,True,True,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,False,False,False,False,False,False,True,False,False,False,...,False,True,True,True,False,False,False,False,False,False
1456,False,False,False,False,False,False,True,False,False,False,...,False,True,False,True,False,False,False,False,False,False
1457,False,False,False,False,False,False,True,False,False,False,...,False,True,False,False,False,False,False,False,False,False
1458,False,False,False,False,False,False,True,False,False,False,...,False,True,True,True,False,False,False,False,False,False


In [103]:
# Replace the NA in specific columns
train_data['Alley'].fillna('No Alley', inplace=True)
train_data['PoolQC'].fillna('No Pool', inplace=True)
train_data['Fence'].fillna('No Fence', inplace=True)

# Replace the NA with 0 in the rest of the columns
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

# Train test split
X_train, X_dev, y_train, y_dev = train_test_split(train_data.drop('SalePrice', axis=1), train_data['SalePrice'], test_size=0.2, random_state=42)
X_test = test_data

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Alley'].fillna('No Alley', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['PoolQC'].fillna('No Pool', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are se

# Feature engineering

In [104]:
# Split the data into numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

## Handle numerical features

In [105]:
# Total Square Footage: Combine basement, first floor, and second floor square footage
X_train['TotalSF'] = X_train['TotalBsmtSF'] + X_train['1stFlrSF'] + X_train['2ndFlrSF']
X_dev['TotalSF'] = X_dev['TotalBsmtSF'] + X_dev['1stFlrSF'] + X_dev['2ndFlrSF']
X_test['TotalSF'] = X_test['TotalBsmtSF'] + X_test['1stFlrSF'] + X_test['2ndFlrSF']

In [106]:
# House Age: Calculate the age of the house at the time of sale
X_train['HouseAge'] = X_train['YrSold'] - X_train['YearBuilt']
X_dev['HouseAge'] = X_dev['YrSold'] - X_dev['YearBuilt']
X_test['HouseAge'] = X_test['YrSold'] - X_test['YearBuilt']

In [107]:
# Remodel Age: Time since the last remodel
X_train['RemodelAge'] = X_train['YrSold'] - X_train['YearRemodAdd']
X_dev['RemodelAge'] = X_dev['YrSold'] - X_dev['YearRemodAdd']
X_test['RemodelAge'] = X_test['YrSold'] - X_test['YearRemodAdd']

In [108]:
# Total Bathrooms: Combine all bathroom-related features
X_train['TotalBath'] = X_train['FullBath'] + 0.5 * X_train['HalfBath'] + X_train['BsmtFullBath'] + 0.5 * X_train['BsmtHalfBath']
X_dev['TotalBath'] = X_dev['FullBath'] + 0.5 * X_dev['HalfBath'] + X_dev['BsmtFullBath'] + 0.5 * X_dev['BsmtHalfBath']
X_test['TotalBath'] = X_test['FullBath'] + 0.5 * X_test['HalfBath'] + X_test['BsmtFullBath'] + 0.5 * X_test['BsmtHalfBath']

## Handle ordinal features

In [109]:
# Define mappings for ordinal features
ordinal_mappings = {
    'Alley': {'No Alley': 0, 'Grvl': 1, 'Pave': 2},
    'LotShape': {'Reg': 0, 'IR1': 1, 'IR2': 2, 'IR3': 3},
    'LandSlope': {'Gtl': 0, 'Mod': 1, 'Sev': 2},
    'ExterQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'ExterCond': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtQual': {0: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtCond': {0: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'BsmtExposure': {0: 0, 'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4},
    'BsmtFinType1': {0: 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'BsmtFinType2': {0: 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6},
    'HeatingQC': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'KitchenQual': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'Functional': {'Sal': 0, 'Sev': 1, 'Maj2': 2, 'Maj1': 3, 'Mod': 4, 'Min2': 5, 'Min1': 6, 'Typ': 7},
    'FireplaceQu': {0: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageFinish': {0: 0, 'Unf': 1, 'RFn': 2, 'Fin': 3},
    'GarageQual': {0: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'GarageCond': {0: 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    'PavedDrive': {'N': 0, 'P': 1, 'Y': 2},
    'PoolQC': {'No Pool': 0, 'Fa': 1, 'TA': 2, 'Gd': 3, 'Ex': 4},
    'Fence': {'No Fence': 0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}
}

# Apply mappings to ordinal features
for col, mapping in ordinal_mappings.items():
    X_train[col] = X_train[col].map(mapping).fillna(0)
    X_dev[col] = X_dev[col].map(mapping).fillna(0)
    X_test[col] = X_test[col].map(mapping).fillna(0)

## Handle nominal features

In [110]:
nominal_cols = [
    'MSZoning', 'Street', 'LandContour', 'Utilities', 'LotConfig', 'Neighborhood', 
    'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 
    'Electrical', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition'
]

label_encoders = {}

for col in nominal_cols:
    le = LabelEncoder()
    # Convert all values to strings to ensure consistency
    X_train[col] = X_train[col].astype(str)
    X_dev[col] = X_dev[col].astype(str)
    X_test[col] = X_test[col].astype(str)
    # Fit on combined unique values
    all_values = pd.concat([X_train[col], X_dev[col], X_test[col]]).unique()
    le.fit(all_values)
    X_train[col] = le.transform(X_train[col])
    X_dev[col] = le.transform(X_dev[col])
    X_test[col] = le.transform(X_test[col])
    label_encoders[col] = le

In [111]:
# Handle skewness in numerical features
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
skewed_feats = X_train[numerical_cols].apply(lambda x: skew(x.dropna()))
skewed_cols = skewed_feats[skewed_feats > 0.75].index
for col in skewed_cols:
    X_train[col] = np.log1p(X_train[col])
    X_dev[col] = np.log1p(X_dev[col])
    X_test[col] = np.log1p(X_test[col])

# Update numerical columns after feature engineering
numerical_cols = X_train.columns

In [112]:
# Scale all features
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols)
    ])

X_train_processed = preprocessor.fit_transform(X_train)
X_dev_processed = preprocessor.transform(X_dev)
X_test_processed = preprocessor.transform(X_test)

# Implement Feature Selection with Random Forest

In [113]:
# Convert processed data to DataFrame for feature names
X_train_processed_df = pd.DataFrame(X_train_processed, columns=X_train.columns)
X_dev_processed_df = pd.DataFrame(X_dev_processed, columns=X_dev.columns)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=X_dev.columns)

# Log-transform target
y_train_log = np.log1p(y_train)
y_dev_log = np.log1p(y_dev)

# Train Random Forest to get feature importances
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_processed_df, y_train_log)

# Get feature importances
importances = pd.Series(rf.feature_importances_, index=X_train_processed_df.columns)
importances_sorted = importances.sort_values(ascending=False)

# Try different feature selection thresholds
thresholds = [0.5, 0.6, 0.7, 0.8]
results = {}

for threshold in thresholds:
    # Select features above threshold
    selected_features = importances_sorted[importances_sorted > importances_sorted.quantile(1-threshold)].index.tolist()
    
    # Train and evaluate Random Forest with selected features
    rf_selected = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_selected.fit(X_train_processed_df[selected_features], y_train_log)
    
    # Predict on dev set
    y_dev_pred_log_rf = rf_selected.predict(X_dev_processed_df[selected_features])
    rmse_log_rf = np.sqrt(mean_squared_error(y_dev_log, y_dev_pred_log_rf))
    
    # Store results
    results[threshold] = {
        'features': selected_features,
        'count': len(selected_features),
        'rmse': rmse_log_rf
    }

# Print results
print("Feature Selection Results:")
for threshold, result in results.items():
    print(f"\nThreshold {threshold}:")
    print(f"Number of features: {result['count']}")
    print(f"RMSE (log scale): {result['rmse']:.4f}")
    print("Selected features:")
    print(result['features'])

# Find the best threshold (lowest RMSE)
best_threshold = min(results, key=lambda x: results[x]['rmse'])
best_features = results[best_threshold]['features']

print("\n--- Best Feature Set ---")
print(f"Threshold: {best_threshold}")
print(f"Number of features: {len(best_features)}")
print("Features:", best_features)

Feature Selection Results:

Threshold 0.5:
Number of features: 42
RMSE (log scale): 0.1451
Selected features:
['TotalSF', 'OverallQual', 'GrLivArea', 'CentralAir', 'LotArea', 'GarageCars', 'TotalBath', 'GarageArea', 'YearBuilt', 'HouseAge', 'OverallCond', 'BsmtFinSF1', 'RemodelAge', 'GarageYrBlt', 'BsmtUnfSF', 'YearRemodAdd', 'FireplaceQu', 'BsmtQual', 'Neighborhood', '1stFlrSF', 'KitchenQual', '2ndFlrSF', 'TotalBsmtSF', 'GarageFinish', 'Id', 'BsmtFinType1', 'LotFrontage', 'MoSold', 'OpenPorchSF', 'GarageType', 'MSZoning', 'WoodDeckSF', 'GarageQual', 'Functional', 'LotShape', 'LandContour', 'GarageCond', 'SaleCondition', 'BsmtExposure', 'Exterior1st', 'MasVnrArea', 'MSSubClass']

Threshold 0.6:
Number of features: 50
RMSE (log scale): 0.1447
Selected features:
['TotalSF', 'OverallQual', 'GrLivArea', 'CentralAir', 'LotArea', 'GarageCars', 'TotalBath', 'GarageArea', 'YearBuilt', 'HouseAge', 'OverallCond', 'BsmtFinSF1', 'RemodelAge', 'GarageYrBlt', 'BsmtUnfSF', 'YearRemodAdd', 'FireplaceQ

# Model selection

In [114]:
# Update DataFrames with best features
X_train_processed_df = X_train_processed_df[best_features]
X_dev_processed_df = X_dev_processed_df[best_features]
X_test_processed_df = X_test_processed_df[best_features]

In [119]:
# Models to evaluate
models = [
    ('LR', LinearRegression()),
    ('KNN', KNeighborsRegressor()),
    ('CART', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor(random_state=len(best_features))),
    ('GBM', GradientBoostingRegressor(random_state=len(best_features))),
    ('XGBoost', XGBRegressor(objective='reg:squarederror', random_state=len(best_features))),
    ('LightGBM', LGBMRegressor(verbose=0, random_state=len(best_features)))
]

# Evaluate each model
results = []
for name, model in models:
    model.fit(X_train_processed_df, y_train_log)
    y_dev_pred_log = model.predict(X_dev_processed_df)
    rmse_log = np.sqrt(mean_squared_error(y_dev_log, y_dev_pred_log))
    results.append((name, rmse_log))
    print(f"{name} RMSE (log scale): {rmse_log}")

# Sort models by RMSE and select the best
results.sort(key=lambda x: x[1])
best_model_name, best_rmse = results[0]
print(f"\nBest model: {best_model_name} with RMSE (log scale): {best_rmse}")

LR RMSE (log scale): 0.1390830844719857
KNN RMSE (log scale): 0.17796004917584107
CART RMSE (log scale): 0.21755412384837475
RF RMSE (log scale): 0.14751044628093998
GBM RMSE (log scale): 0.13622716450506756
XGBoost RMSE (log scale): 0.1452718095246526
LightGBM RMSE (log scale): 0.13881345460184485

Best model: GBM with RMSE (log scale): 0.13622716450506756


In [121]:
param_grids = {
    'GBM': {
        # Number of boosting stages (estimators)
        'n_estimators': [50, 100, 200, 500, 1000],
    
        # Learning rate (step size shrinkage)
        'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.2],
        
        # Maximum depth of individual trees
        'max_depth': [3, 4, 5, 6, 7, 10],
        
        # Fraction of samples used for fitting individual trees
        'subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
        
        # Minimum number of samples required to be at a leaf node
        'min_samples_leaf': [1, 2, 4],
        
        # Minimum number of samples required to split an internal node
        'min_samples_split': [2, 5, 10],
        
        # Number of features to consider when looking for the best split
        'max_features': ['sqrt', 'log2', None],
        
        # Loss function to be optimized
        'loss': ['ls', 'lad', 'huber']
    }
}

# Tune the best model (or force GBM for this example)
best_model_name = 'GBM'  # Override to focus on GBM as per your request
best_model_class = dict(models)[best_model_name]
param_grid = param_grids[best_model_name]

if param_grid:
    random_search = RandomizedSearchCV(
        estimator=best_model_class,
        param_distributions=param_grid,
        n_iter=30,  # Increased for GBM
        scoring='neg_root_mean_squared_error',
        cv=5,
        verbose=1,
        random_state=42,
        n_jobs=-1
    )
    random_search.fit(X_dev_processed_df, y_dev_log)
    best_model = random_search.best_estimator_
    best_params = random_search.best_params_
    print(f"Best parameters for {best_model_name}: {best_params}")
else:
    best_model = best_model_class
    best_model.fit(X_dev_processed_df, y_dev_log)
    
# Evaluate tuned model on dev set
y_dev_pred_log_tuned = best_model.predict(X_dev_processed_df)
rmse_log_tuned = np.sqrt(mean_squared_error(y_dev_log, y_dev_pred_log_tuned))
print(f"Tuned {best_model_name} RMSE (log scale): {rmse_log_tuned}")

Fitting 5 folds for each of 30 candidates, totalling 150 fits


120 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
7 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/apple/miniforge3/envs/colx535/lib/python3.13/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/apple/miniforge3/envs/colx535/lib/python3.13/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "/Users/apple/miniforge3/envs/colx535/lib/python3.13/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
    

Best parameters for GBM: {'subsample': 0.8, 'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 3, 'loss': 'huber', 'learning_rate': 0.1}
Tuned GBM RMSE (log scale): 0.06194673793565137


In [122]:
# Train final model on full training data
best_model.fit(X_train_processed_df, y_train_log)

# Final evaluation on dev set
y_dev_pred_log_final = best_model.predict(X_dev_processed_df)
rmse_log_final = np.sqrt(mean_squared_error(y_dev_log, y_dev_pred_log_final))
y_dev_pred_final = np.expm1(y_dev_pred_log_final)
rmse_original_final = np.sqrt(mean_squared_error(y_dev, y_dev_pred_final))
print(f"Final {best_model_name} RMSE (log scale): {rmse_log_final}")
print(f"Final {best_model_name} RMSE (original scale): {rmse_original_final}")

# Predict on test set
y_test_pred_log = best_model.predict(X_test_processed_df)
y_test_pred = np.expm1(y_test_pred_log)

# Create submission
submission = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': y_test_pred})
submission.to_csv(f'submission_{best_model_name.lower()}.csv', index=False)

Final GBM RMSE (log scale): 0.13221814367468793
Final GBM RMSE (original scale): 25979.364201390086
