In [37]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

import warnings
warnings.simplefilter('ignore')

In [52]:
train = pd.read_csv('train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [53]:
numerics = ['int64', 'float64']

train = train.select_dtypes(include=numerics)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 38 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   OverallQual    1460 non-null   int64  
 5   OverallCond    1460 non-null   int64  
 6   YearBuilt      1460 non-null   int64  
 7   YearRemodAdd   1460 non-null   int64  
 8   MasVnrArea     1452 non-null   float64
 9   BsmtFinSF1     1460 non-null   int64  
 10  BsmtFinSF2     1460 non-null   int64  
 11  BsmtUnfSF      1460 non-null   int64  
 12  TotalBsmtSF    1460 non-null   int64  
 13  1stFlrSF       1460 non-null   int64  
 14  2ndFlrSF       1460 non-null   int64  
 15  LowQualFinSF   1460 non-null   int64  
 16  GrLivArea      1460 non-null   int64  
 17  BsmtFullBath   1460 non-null   int64  
 18  BsmtHalf

In [54]:
def remove_outlier_IQR(df):
    Q1=df.quantile(0.05)
    Q3=df.quantile(0.95)
    IQR=Q3-Q1
    df_final=df[~((df<(Q1-1.5*IQR)) | (df>(Q3+1.5*IQR)))]
    return df_final

In [55]:
train = remove_outlier_IQR(train)
len(train)

1460

In [56]:
train.isna().sum()

Id                 0
MSSubClass         0
LotFrontage      261
LotArea           15
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea        11
BsmtFinSF1         1
BsmtFinSF2        10
BsmtUnfSF          0
TotalBsmtSF        1
1stFlrSF           1
2ndFlrSF           0
LowQualFinSF      26
GrLivArea          1
BsmtFullBath       1
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       1
KitchenAbvGr      68
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt       81
GarageCars         0
GarageArea         0
WoodDeckSF         1
OpenPorchSF        3
EnclosedPorch      1
3SsnPorch         24
ScreenPorch        3
PoolArea           7
MiscVal           52
MoSold             0
YrSold             0
SalePrice          2
dtype: int64

In [57]:
train.dropna(inplace=True)
len(train)

988

In [64]:
X = train.drop(columns=['Id','SalePrice'])
y = train['SalePrice']

finalize = {'models':[],
            'description':[],
            'cv_score':[],
            'cv_mean':[],
            'cv_std':[]}

finalize['models'] = [RandomForestRegressor(), LGBMRegressor(verbose=-1), CatBoostRegressor(verbose=False)]
finalize['description'] = ['default random forest', 'default lgbm', 'default catboost']

In [65]:
for i in range(len(finalize['models'])):
    model = finalize['models'][i]

    score = np.around(cross_val_score(model, X, y, cv=10, scoring='neg_root_mean_squared_error'),4)*-1
    mean = round(np.mean(score),4)
    std = round(np.std(score),4)
    finalize['cv_score'].append(score)
    finalize['cv_mean'].append(mean)
    finalize['cv_std'].append(std)

results = pd.DataFrame(finalize)
results

Unnamed: 0,models,description,cv_score,cv_mean,cv_std
0,RandomForestRegressor(),default random forest,"[23529.9847, 24967.1118, 21896.5625, 24429.368...",25348.9795,2894.4043
1,LGBMRegressor(verbose=-1),default lgbm,"[21533.5655, 26410.2824, 22803.3311, 23067.459...",25007.2108,3387.1617
2,<catboost.core.CatBoostRegressor object at 0x0...,default catboost,"[20360.4501, 21741.9752, 20432.9742, 21189.283...",22376.3258,3300.1902


In [67]:
final_model = CatBoostRegressor(verbose=False)
final_model.fit(X,y)

test = pd.read_csv('test.csv')
X_test = test[X.columns]
test_preds = final_model.predict(X_test)

output = pd.DataFrame({'Id': test.Id,
                       'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)