In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Core Data & Math Library

import numpy as np # For General Math Library
import pandas as pd # For Data Manipulation

%matplotlib inline

import matplotlib.pyplot as plt # For Plotting
import seaborn as sns # For Plotting

#Scikit-learn: Metrics

from sklearn.metrics import mean_absolute_error


In [5]:
#Scikit-learn: Model Selection & Validation

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error

## Reading Data

In [6]:
# Read Data train data

loc_train_data = "/content/drive/MyDrive/home-data-for-ml-course/train.csv"

train_df = pd.read_csv(loc_train_data)

# Read Data test data

loc_test_data = "/content/drive/MyDrive/home-data-for-ml-course/test.csv"

test_df = pd.read_csv(loc_test_data)

test_ids = test_df['Id']


In [7]:
#Visualize the data first 5 rows

train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
#train_df.info() Idx | Column_Name | Count | Dtype
print(train_df.shape) # Statistics of the data
print(test_df.shape)


(1460, 81)
(1459, 80)


In [9]:
set(train_df.columns) - set(test_df.columns)

{'SalePrice'}

## Choosing features based on mutual info regression against "SalePrice"

In [10]:
# Before we use mutual info regressor we need to encode categorical data with codes,
# Though theoretically mutual info regressor work with categorical/discrete data in algos we have to code them.

num_features = train_df.select_dtypes(include=['int64', 'float64']).drop(['Id'], axis=1).columns.drop(['SalePrice'])
cat_features = train_df.select_dtypes(include=['object']).columns

all_features = num_features.tolist() + cat_features.tolist()

#----------------------------------------------------------------

t_num_features = test_df.select_dtypes(include=['int64', 'float64']).drop(['Id'], axis=1).columns
t_cat_features = test_df.select_dtypes(include=['object']).columns

print(len(num_features))
print(len(cat_features))

print(len(t_num_features))
print(len(t_cat_features))

36
43
36
43


In [11]:
# To find out mutual info between saleprice and features of the df [ categorical ]

from sklearn.feature_selection import mutual_info_regression

# The above features contains names of the columns excluding "saleprice" and "id"

# Now we need to encode categorical data into codes
cat_encoded = train_df[cat_features].apply(lambda x: x.astype('category').cat.codes)

# Also fill NA values with mean
fillNA_num = train_df[num_features].fillna(train_df[num_features].mean())

concat_df = pd.concat([cat_encoded, fillNA_num], axis=1)

print(concat_df.shape)


(1460, 79)


In [12]:

# Compute mutual information between categorical feature and target
mi_scores = mutual_info_regression(concat_df, train_df['SalePrice'])

# Convert to series for easy viewing
mi_series = pd.Series( mi_scores, index= all_features ).sort_values(ascending=False)

print("Mutual Information Scores:\n", mi_series)

Mutual Information Scores:
 Condition2     0.560637
BsmtFinSF1     0.509709
BsmtCond       0.480966
ExterQual      0.365522
HouseStyle     0.364627
                 ...   
LotFrontage    0.000000
Alley          0.000000
PavedDrive     0.000000
MiscFeature    0.000000
SaleType       0.000000
Length: 79, dtype: float64


In [64]:
#selecting top categorical features with mutual relationships to sales price

top_features = mi_series[mi_series > 0.015].index.tolist()  # threshold can be tuned

print("Number of Selected Features:", len(top_features))
print("Selected High-Impact Categorical Features:", top_features)


Number of Selected Features: 61
Selected High-Impact Categorical Features: ['Condition2', 'BsmtFinSF1', 'BsmtCond', 'ExterQual', 'HouseStyle', 'GarageType', 'FireplaceQu', 'FullBath', 'KitchenAbvGr', '3SsnPorch', 'ExterCond', 'Functional', 'BsmtFinType2', 'LandSlope', 'MoSold', 'RoofStyle', 'MiscVal', 'Electrical', 'Foundation', 'PoolArea', 'Neighborhood', 'BedroomAbvGr', 'WoodDeckSF', 'BsmtFullBath', 'KitchenQual', 'Condition1', 'Exterior1st', 'GarageYrBlt', 'GarageQual', 'GrLivArea', 'MSSubClass', 'MasVnrType', 'BldgType', 'GarageFinish', 'RoofMatl', 'BsmtHalfBath', 'MSZoning', '1stFlrSF', 'Heating', 'HeatingQC', 'OverallQual', 'Utilities', 'Fireplaces', 'YrSold', 'LotConfig', 'OpenPorchSF', 'TotRmsAbvGrd', 'LotShape', 'GarageCars', 'EnclosedPorch', 'TotalBsmtSF', 'LotArea', 'BsmtFinSF2', 'Street', 'OverallCond', 'CentralAir', 'GarageCond', 'PoolQC', 'BsmtExposure', 'LowQualFinSF', 'HalfBath']


In [65]:

# creating new data frame with only top num & cat features with respect to relationship

train_mi = train_df[top_features]

print(train_mi.shape)

(1460, 61)


In [66]:
# test maker

test_mi = test_df[top_features]

print(test_mi.shape)

(1459, 61)


In [43]:
#train_mi.head(-1)   # Shows first 5 and last 5
#test_mi.head(-1)

In [44]:
set(train_mi.columns) - set(test_mi.columns)  # to check if any columns are missing

set()

In [67]:
# Separate num and cat features from the top feature list

top_num_features = train_mi.select_dtypes(include=['int64', 'float64']).columns
top_cat_features = train_mi.select_dtypes(include=['object']).columns

print(len(top_num_features))
print(len(top_cat_features))

28
33


In [68]:
# Separate num and cat features from the top feature list

top_num_features_test = test_mi.select_dtypes(include=['int64', 'float64']).columns
top_cat_features_test = test_mi.select_dtypes(include=['object']).columns

print(len(top_num_features_test))
print(len(top_cat_features_test))

28
33


## Defining Transformers For Numeric & Categorical Data

In [69]:
# -------------------------------
#  Define the transformers
# -------------------------------

numeric_transformer = Pipeline(steps=[
  ('imputer', SimpleImputer(strategy='mean')),   # Fill missing numeric values with mean
   ('scaler', StandardScaler())                   # Scale numerics i.e. standardize the data where mean tends to 0 and StandDevia = 1
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # Fill missing categorical values with the most frequent
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))   # Encode categoricals
])



In [70]:
# -------------------------------
#  Define the preprocessor
# -------------------------------

# Combine preprocessing steps

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, top_num_features),
        ('cat', categorical_transformer, top_cat_features)
    ])


In [71]:
# Apply fit and apply transformation cat & num

df_transf_cat = categorical_transformer.fit_transform(train_mi[top_cat_features])
df_transf_num = numeric_transformer.fit_transform(train_mi[top_num_features])


num_features_out = top_num_features
cat_features_out = categorical_transformer.named_steps['encoder'].get_feature_names_out(top_cat_features)

# Combine all features

all_features_out = np.concatenate((num_features_out, cat_features_out))

print("Length of Transformed DataFrame Features:", len(all_features_out))


Length of Transformed DataFrame Features: 221


In [72]:
# Apply transformation to cat & num for test

df_transf_cat_test = categorical_transformer.transform(test_mi[top_cat_features_test])
df_transf_num_test = numeric_transformer.transform(test_mi[top_num_features_test])


In [73]:
# Merge Num & Cat DataFrame After Transforming

X_num = pd.DataFrame(df_transf_num, columns=num_features_out)
X_cat = pd.DataFrame(df_transf_cat, columns=cat_features_out)

X_num_test = pd.DataFrame(df_transf_num_test, columns=num_features_out)
X_cat_test = pd.DataFrame(df_transf_cat_test, columns=cat_features_out)

Trasformed_df = pd.concat([X_num, X_cat], axis=1)
Trasformed_df_test = pd.concat([X_num_test, X_cat_test], axis=1)

print(Trasformed_df.shape)
print(Trasformed_df_test.shape)

(1460, 221)
(1459, 221)


In [74]:
# Convert the transformed array back to a DataFrame

train_df_tranformed = pd.DataFrame(
    Trasformed_df,  # your transformed NumPy array
    columns=all_features_out,  # assign names
)

print("\nTransformed DataFrame:\n", train_df_tranformed.shape)



Transformed DataFrame:
 (1460, 221)


In [75]:
# test maker

test_df_tranformed = pd.DataFrame(
    Trasformed_df_test,  # your transformed NumPy array
    columns=all_features_out,  # assign names
)

## Making an " X " , " y " Split

In [76]:

X = train_df_tranformed

y = train_df['SalePrice']


In [77]:
# test maker

X_test = test_df_tranformed


In [78]:
print(X.shape)
print(X_test.shape)

(1460, 221)
(1459, 221)


In [79]:
# Split into validation and training data

train_X_raw, val_X_raw, train_y, val_y = train_test_split(X, y, random_state=1 )

print("Creating train / test split...")

Creating train / test split...


## XGBoost Model

In [80]:
!pip install xgboost
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


# 2️⃣ Define XGBoost model
xgb_model = XGBRegressor(
    n_estimators=500,       # number of boosting rounds (trees)
    learning_rate=0.1,     # step size shrinkage
    max_depth=3,            # maximum depth of a tree
    subsample=1.0,          # fraction of samples per tree
    colsample_bytree=0.8,   # fraction of features per tree
    random_state=42,
    objective='reg:squarederror',
    n_jobs=-1
)

# 3️⃣ Fit the model
xgb_model.fit(train_X_raw, train_y)

# 4️⃣ Predict on validation data
y_pred = xgb_model.predict(val_X_raw)

# 5️⃣ Evaluate performance
mae = mean_absolute_error(val_y, y_pred)
print(f"Validation MAE: {mae:.4f}")

Validation MAE: 15937.3711


## Stacked XGboost

In [81]:
# --- Imports ---
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, StackingRegressor

# --- Base models ---
xgb = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

krr = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

# --- Meta-model (final estimator) ---
meta_model = RidgeCV(alphas=(0.1, 1.0, 10.0))

# --- Stacking Ensemble ---
ensemble = StackingRegressor(
    estimators=[
        ('xgb', xgb),
        ('krr', krr),
        ('rf', rf)
    ],
    final_estimator=meta_model,
    passthrough=False,   # set True to add original features to meta-model
    n_jobs=-1
)

# --- Train ensemble ---
ensemble.fit(train_X_raw, train_y)

# --- Evaluate ---
y_pred_stack = ensemble.predict(val_X_raw)

# 7️⃣ Evaluate performance
mae = mean_absolute_error(val_y, y_pred_stack)
print(f"Stacked Model Validation MAE: {mae:.4f}")

Stacked Model Validation MAE: 14323.5524


## Hyper Parameter Tunning with Grid SearchCV

In [83]:
param_grid = {
    'xgb__max_depth': [2, 3, 5],
    'xgb__learning_rate': [0.01, 0.02, 0.05],
    'krr__alpha': [0.1,0.3, 0.5],
}

grid = GridSearchCV(
    estimator=ensemble,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    n_jobs=-1
)
grid.fit(train_X_raw, train_y)

# --- Evaluate ---
y_pred_grid = grid.predict(val_X_raw)

# 7️⃣ Evaluate performance
mae = mean_absolute_error(val_y, y_pred_grid)
print(f"Tuned Model Validation MAE: {mae:.4f}")
print("Best params:", grid.best_params_)

Tuned Model Validation MAE: 15270.8677
Best params: {'krr__alpha': 0.1, 'xgb__learning_rate': 0.02, 'xgb__max_depth': 3}


## Test Predict And Submission File

In [84]:
# Predict With Stacked Model

ensemble_test_predict = ensemble.predict(X_test)

In [85]:
print(ensemble_test_predict.shape)

(1459,)


In [86]:
# --- Save Submission File ---
output = pd.DataFrame({
    'Id': test_ids, # Use the saved 'test_ids'
    'SalePrice': ensemble_test_predict
})
output.to_csv('submission.csv', index=False)

#