In [1]:
# Importing libraries
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from xgboost import XGBRegressor

In [3]:
# Read both CSV files, using the "Id" column as the row index of each frame.
train_data, test_data = (
    pd.read_csv(csv_name, index_col="Id")
    for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Detach the target: `pop` returns the "SalePrice" column and removes it from
# `train_data` in place, so this cell can only be run once per load.
y = train_data.pop(item="SalePrice")

In [5]:
# Preview the first five rows of the feature frame.
train_data.head(n=5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal


EDA

In [6]:
# Columns removed from the feature frames before modelling.
del_cols = [
    "MiscFeature",
    "Fence",
    "PoolQC",
    "FireplaceQu",
    "Alley",
    "Utilities",
]

In [7]:
# Remove the columns listed in `del_cols`; `drop` returns a new frame that is
# rebound to `train_data`.
train_data = train_data.drop(columns=del_cols)

In [9]:
# Split the feature columns into three groups that are later routed to
# different preprocessing pipelines:
#   * numeric     - numeric dtype with more than 8 distinct values
#   * binary      - exactly 2 distinct values (any dtype)
#   * categorical - text dtype, or 3..8 distinct values
# `nunique()` ignores missing values, so NaN does not count as a level.
numeric_cols = []
categorical_cols = []
binary_cols = []
unclassified_cols = []  # columns that match none of the rules above

for column in train_data.columns:
    series = train_data[column]
    unique_values = series.nunique()
    # Use dtype predicates instead of comparing against "int64"/"float64"/
    # "object" so other numeric widths and pandas' dedicated string dtype are
    # classified the same way as the defaults produced by `read_csv`.
    is_numeric = pd.api.types.is_numeric_dtype(series.dtype)
    is_text = series.dtype == "object" or pd.api.types.is_string_dtype(series.dtype)

    if is_numeric and unique_values > 8:
        numeric_cols.append(column)
    elif unique_values == 2:
        binary_cols.append(column)
    elif is_text or (2 < unique_values <= 8):
        categorical_cols.append(column)
    else:
        # Such a column is in none of the three lists; name it so the
        # omission is visible instead of printing an anonymous "error".
        unclassified_cols.append(column)
        print(f"error: column {column!r} (dtype={series.dtype}, "
              f"unique values={unique_values}) was not classified")

print("Numeric Columns:", numeric_cols)
print("Categorical Columns:", categorical_cols)
print("Binary Columns:", binary_cols)
print(len(numeric_cols)+len(categorical_cols)+len(binary_cols))

Numeric Columns: ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal', 'MoSold']
Categorical Columns: ['MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'Electrical', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'Functional', 'Fireplaces', 'GarageType', 'GarageFinish', 'GarageCars', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolArea', 'YrSold', 'Sale

In [10]:
# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode. Levels not seen during fit are encoded as all zeros.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: fill missing values from the 3 nearest neighbours.
num_pipeline = Pipeline(steps=[
    ('imputer', KNNImputer(n_neighbors=3)),
])

# Two-level columns: one-hot encode. `handle_unknown='ignore'` matches the
# categorical encoder so a level that only appears at predict time is encoded
# as all zeros instead of raising a ValueError.
byn_pipeline = Pipeline(steps=[
    ('binary_encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its pipeline. Columns not listed in any group
# are dropped (ColumnTransformer's default `remainder`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, numeric_cols),
        ('cate', cat_pipeline, categorical_cols),
        ('byn', byn_pipeline, binary_cols),
    ]
)

# Full model: preprocessing followed by a gradient-boosted tree regressor.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", XGBRegressor(learning_rate=0.1, max_depth=3, n_estimators=200)),
    ]
)

In [11]:
# Work on an independent copy of the feature frame.
X = train_data.copy(deep=True)

In [12]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [13]:
# Fit the preprocessing steps and the regressor on the training split.
clf.fit(X=X_train, y=y_train)

In [15]:
# Score the fitted pipeline on the hold-out split.
# The metric functions come from the notebook's import cell, so this cell
# no longer carries its own import statement.
y_pred = clf.predict(X_valid)

mse = mean_squared_error(y_valid, y_pred)
mae = mean_absolute_error(y_valid, y_pred)
r2 = r2_score(y_valid, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2):", r2)

Mean Squared Error (MSE): 963633551.3457295
Mean Absolute Error (MAE): 16468.434677333047
R-squared (R2): 0.8604612674227464


In [16]:
# Build a table of feature importances labelled with the encoded column names.
#
# The names are read from the transformers that were fitted inside `clf`
# rather than from a second, separate fit of the template pipelines, so they
# always describe the exact matrix the regressor was trained on.
fitted_preprocessor = clf.named_steps['preprocessor']
num_encoder = fitted_preprocessor.named_transformers_['num']
cat_encoder = fitted_preprocessor.named_transformers_['cate']
byn_encoder = fitted_preprocessor.named_transformers_['byn']

num_column_names = list(num_encoder.get_feature_names_out(input_features=numeric_cols))
cat_column_names = cat_encoder.get_feature_names_out(input_features=categorical_cols)
byn_column_names = byn_encoder.get_feature_names_out(input_features=binary_cols)

# Same order as the `transformers` list of the ColumnTransformer.
all_column_names = num_column_names + list(cat_column_names) + list(byn_column_names)

xgb_model = clf.named_steps['regressor']
feature_importance = xgb_model.feature_importances_
assert len(all_column_names) == len(feature_importance), (
    "feature names do not line up with the model's importance vector"
)

feature_importance_df = pd.DataFrame(data=feature_importance, index=all_column_names, columns=['importance'])
feature_importance_df = feature_importance_df.sort_values(by='importance', ascending=False)

# Lift pandas' display limits so the full table can be printed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [18]:
# Print the importance table as plain text.
print(feature_importance_df, sep=' ', end='\n')

                       importance
OverallQual              0.162840
GarageCars_3             0.161509
GarageCars_2             0.094554
BsmtQual_Ex              0.072445
GarageType_Detchd        0.038452
GrLivArea                0.038290
GarageFinish_Unf         0.034601
KitchenQual_TA           0.027823
Fireplaces_0             0.024993
TotalBsmtSF              0.024300
KitchenQual_Gd           0.023733
BsmtFinType1_GLQ         0.023103
HalfBath_1               0.016861
MSZoning_RM              0.016472
CentralAir_N             0.015417
BsmtFinSF1               0.014183
KitchenQual_Ex           0.011857
YearRemodAdd             0.011197
BedroomAbvGr_3           0.010826
BsmtQual_Gd              0.008283
MSZoning_RL              0.007613
Fireplaces_2             0.007159
BsmtExposure_Gd          0.006512
ExterQual_TA             0.005850
TotRmsAbvGrd             0.005475
1stFlrSF                 0.005305
SaleType_New             0.005161
RoofMatl_WdShngl         0.005057
SaleType_WD   

In [19]:
# Keep only the encoded features with a non-zero importance.
important_cols = feature_importance_df[feature_importance_df['importance'] > 0]
selected_feature_names = important_cols.index.tolist()

# Map each encoded feature name back to the column it came from.
# One-hot features are named "<column>_<level>", so the source column is
# resolved against the real column names (longest first) instead of cutting
# at the first underscore, which would truncate a column whose own name
# contains "_".
source_cols = sorted((str(c) for c in train_data.columns), key=len, reverse=True)
source_col_set = set(source_cols)

final_important_cols = []
for col in selected_feature_names:
    if col in source_col_set:
        # Passed-through column: the feature name is the column name.
        source_col = col
    else:
        source_col = next(
            (c for c in source_cols if col.startswith(c + "_")),
            col.split("_")[0],  # fallback: previous behaviour
        )
    # Preserve importance order while listing every source column once.
    if source_col not in final_important_cols:
        final_important_cols.append(source_col)
print(final_important_cols)

['OverallQual', 'GarageCars', 'BsmtQual', 'GarageType', 'GrLivArea', 'GarageFinish', 'KitchenQual', 'Fireplaces', 'TotalBsmtSF', 'BsmtFinType1', 'HalfBath', 'MSZoning', 'CentralAir', 'BsmtFinSF1', 'YearRemodAdd', 'BedroomAbvGr', 'BsmtExposure', 'ExterQual', 'TotRmsAbvGrd', '1stFlrSF', 'SaleType', 'RoofMatl', 'BldgType', 'LotArea', 'YearBuilt', 'MoSold', 'LotShape', '2ndFlrSF', 'OverallCond', 'Condition1', 'MasVnrType', 'Neighborhood', 'OpenPorchSF', 'GarageArea', 'Exterior1st', 'SaleCondition', 'MasVnrArea', 'LotFrontage', 'KitchenAbvGr', 'RoofStyle', 'GarageYrBlt', 'Condition2', 'FullBath', 'Functional', 'Exterior2nd', 'ScreenPorch', 'WoodDeckSF', 'BsmtFullBath', 'BsmtUnfSF', 'LotConfig', 'LowQualFinSF', 'PoolArea', 'Electrical', 'HouseStyle', 'BsmtFinSF2', 'LandContour', '3SsnPorch', 'Heating', 'GarageQual', 'PavedDrive', 'EnclosedPorch', 'YrSold', 'Foundation', 'BsmtFinType2', 'LandSlope', 'MSSubClass', 'ExterCond']


In [20]:
# Restore pandas' default row/column display limits.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.reset_option(option_name)

In [21]:
# Overwrite every column the model did not use with a constant 0, in both
# frames. The columns are kept (not dropped) so the frames still contain
# every name the preprocessing pipeline selects.
unimportant_cols = [name for name in train_data.columns if name not in final_important_cols]
for frame in (train_data, test_data):
    for name in unimportant_cols:
        frame[name] = 0
train_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,LotConfig,LandSlope,Neighborhood,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,0,Reg,Lvl,Inside,Gtl,CollgCr,...,61,0,0,0,0,0,2,2008,WD,Normal
2,20,RL,80.0,9600,0,Reg,Lvl,FR2,Gtl,Veenker,...,0,0,0,0,0,0,5,2007,WD,Normal
3,60,RL,68.0,11250,0,IR1,Lvl,Inside,Gtl,CollgCr,...,42,0,0,0,0,0,9,2008,WD,Normal
4,70,RL,60.0,9550,0,IR1,Lvl,Corner,Gtl,Crawfor,...,35,272,0,0,0,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,0,IR1,Lvl,FR2,Gtl,NoRidge,...,84,0,0,0,0,0,12,2008,WD,Normal


In [22]:
# Take a fresh copy of the feature frame for the second training run.
X = train_data.copy(deep=True)

In [23]:
# Same 80/20 split and seed as the first run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [24]:
# Refit the whole pipeline on the current training split.
clf.fit(X=X_train, y=y_train)

In [25]:
# NOTE: despite its name, `y_train_pred` holds predictions for the
# validation split (`X_valid`), not for the training rows.
y_train_pred = clf.predict(X_valid)

# Evaluate the three metrics in the same order as before.
mse, mae, r2 = (
    metric(y_valid, y_train_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

for label, value in (
    ("Mean Squared Error (MSE):", mse),
    ("Mean Absolute Error (MAE):", mae),
    ("R-squared (R2):", r2),
):
    print(label, value)

Mean Squared Error (MSE): 958329038.3481942
Mean Absolute Error (MAE): 16476.090231699487
R-squared (R2): 0.8612293861952632


In [26]:
# Build the test feature frame without the columns in `del_cols`; `drop`
# returns a new frame, so `test_data` itself is left untouched.
X_test = test_data.drop(columns=del_cols)
# Predict with the fitted pipeline.
submition = clf.predict(X_test)

In [27]:
# Write the predictions to disk as a two-column CSV (Id, SalePrice).
submission_columns = {
    'Id': X_test.index,
    'SalePrice': submition,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)