In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [47]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,142125


In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [49]:
null_ratio = df.isnull().mean()
cols_to_drop = null_ratio[null_ratio > 0.7].index
print("Deleted columns due to high null ratio")
print(list(cols_to_drop))
df = df.drop(columns=cols_to_drop)

Deleted columns due to high null ratio
['Alley', 'PoolQC', 'Fence', 'MiscFeature']


In [50]:
df['FireplaceQu'] = df['FireplaceQu'].fillna("No Fireplace")

In [51]:
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

In [52]:
garages = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
for col in garages:
    df[col] = df[col].fillna("No Garage")

In [53]:
df['MasVnrType'] = df['MasVnrType'].fillna("No Masonry Veneer")
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

In [54]:
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(df['GarageYrBlt'].median())

In [55]:
df['Electrical'] = df['Electrical'].fillna(df['Electrical'].mode()[0])

In [56]:
bsmt_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
for col in bsmt_cols:
    df[col] = df[col].fillna("No Basement")

In [58]:
y = df['SalePrice']
X = df.drop(['SalePrice', 'Id'], axis=1)

In [59]:
categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(exclude=['object']).columns

In [61]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [66]:
from catboost import CatBoostRegressor

y = df['SalePrice']
X = df.drop(['SalePrice', 'Id'], axis=1)

categorical_features = X.select_dtypes(include=['object']).columns.tolist()

model = CatBoostRegressor(
    iterations=2000,
    depth=6,
    learning_rate=0.05,
    loss_function='RMSE',
    verbose=False
)

model.fit(X, y, cat_features=categorical_features)


<catboost.core.CatBoostRegressor at 0x24b89df3380>

In [69]:
preds = model.predict(X)
rmse = np.sqrt(mean_squared_error(y, preds))
print("RMSE:", rmse)
r2 = r2_score(y, preds)
print("R^2:", r2)

RMSE: 7838.332664589847
R^2: 0.9902582022325117


In [71]:
feature_importance = model.get_feature_importance()
features = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importance
}).sort_values(by='Importance', ascending=False)

features.head(20)

Unnamed: 0,Feature,Importance
15,OverallQual,21.711273
44,GrLivArea,13.585604
32,BsmtFinSF1,4.429473
41,1stFlrSF,4.223974
59,GarageCars,3.89891
55,FireplaceQu,3.878301
28,BsmtQual,3.714188
36,TotalBsmtSF,3.659792
3,LotArea,3.195286
51,KitchenQual,2.621419


In [None]:
test = pd.read_csv("test.csv")

cols_to_drop = ['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence']
for col in cols_to_drop:
    if col in test.columns:
        test = test.drop(columns=col)

test['FireplaceQu'] = test['FireplaceQu'].fillna("No Fireplace")

garage_cols_obj = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
for col in garage_cols_obj:
    test[col] = test[col].fillna("No Garage")

test['GarageYrBlt'] = test['GarageYrBlt'].fillna(0)

bsmt_cols = ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2']
for col in bsmt_cols:
    test[col] = test[col].fillna("No Basement")

test['MasVnrArea'] = test['MasVnrArea'].fillna(0)
test['MasVnrType'] = test['MasVnrType'].fillna("None")

test['Electrical'] = test['Electrical'].fillna(test['Electrical'].mode()[0])

test['LotFrontage'] = test.groupby('Neighborhood')['LotFrontage'] \
                          .transform(lambda x: x.fillna(x.median()))
test['LotFrontage'] = test['LotFrontage'].fillna(test['LotFrontage'].median())  # Catch remaining

categorical_features = test.select_dtypes(include=['object']).columns.tolist()



In [76]:
for col in categorical_features:
    X[col] = X[col].fillna('Missing')
    X[col] = X[col].astype(str)

model.fit(X, y, cat_features=categorical_features)

<catboost.core.CatBoostRegressor at 0x24b89df3380>

In [86]:
for col in categorical_features:
    test[col] = test[col].fillna("Unknown")

In [87]:
test_preds = model.predict(test)

In [88]:
submission = pd.DataFrame({
    "Id": pd.read_csv("test.csv")["Id"],
    "SalePrice": test_preds
})

submission.to_csv("submission.csv", index=False)