In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

## Data Preparation

### Data Loading

In [2]:
# Location of the Kaggle CSV files.
# NOTE(review): this is an absolute, machine-specific path; point `data_dir`
# at your own copy of the data before running.
data_dir = '/Users/vasevooo/projects/house_prices/housing_kaggle/data'
path_train = f'{data_dir}/train.csv'
path_test = f'{data_dir}/test.csv'

df_train = pd.read_csv(path_train)
df_test = pd.read_csv(path_test)

# Only the last expression of a cell is displayed, so the former mid-cell
# `df_train.head()` produced nothing and has been dropped.
df_train.shape, df_test.shape


((1460, 81), (1459, 80))

In [3]:
# Stack the train features (target column removed) on top of the test
# features so both parts receive identical preprocessing. `ignore_index=True`
# gives the combined frame a fresh 0..n-1 index without an in-place mutation.
df = pd.concat(
    [df_train.drop(columns='SalePrice'), df_test],
    axis=0,
    ignore_index=True,
)

# The models are trained on the natural log of the sale price.
y = np.log(df_train['SalePrice'])

# Row counts of the two parts, used later to split `df` back apart.
ind_train = len(df_train)
ind_test = len(df_test)

ind_train, ind_test


(1460, 1459)

In [4]:
# Preview the first rows of the combined train+test feature frame.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


### Feature Preparation

In [5]:
# Number of missing values per column; display only the columns that have any.
null_value_stats = df.isna().sum()
null_value_stats.loc[null_value_stats.ne(0)]

MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType      1766
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
dtype: int64

In [6]:
# Replace every missing value, in every column, with the sentinel -999.
df = df.fillna(-999)

In [7]:
# Re-count missing values after filling; an empty Series means none remain.
null_value_stats = df.isna().sum()
null_value_stats.loc[null_value_stats.ne(0)]

Series([], dtype: int64)

In [8]:
# Split the combined frame back into its train and test parts by position.
# The first `ind_train` rows are the training rows. The previous
# `df.loc[:ind_test]` was a label-inclusive slice bounded by the *test* row
# count, which only produced the right rows because the test set happens to
# have exactly one row fewer than the train set.
df_train = df.iloc[:ind_train].drop(columns='Id')
df_test = df.iloc[ind_train:]  # keeps 'Id', which the submission cells read

df_train.shape, df_test.shape

((1460, 79), (1459, 80))

In [9]:
print(df_train.dtypes)

# Positional indices of every column whose dtype is not float; these are the
# columns handed to CatBoost as categorical features.
# NOTE(review): this also selects int64 columns such as LotArea, not only the
# object columns -- confirm that treating them as categorical is intended.
non_float_mask = (df_train.dtypes != float).to_numpy()
categorical_features_indices = np.flatnonzero(non_float_mask)
categorical_features_indices

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 79, dtype: object


array([ 0,  1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 32, 34, 38, 39,
       40, 41, 42, 43, 44, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 59,
       62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78])

### Data Splitting

In [10]:
from sklearn.model_selection import train_test_split

# Full training matrix, and the test matrix without the identifier column.
X = df_train
X_test = df_test.drop(columns='Id')

# Hold out 25% of the training rows for validation (fixed seed).
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.75, random_state=42
)
tuple(part.shape for part in (X_train, X_validation, y_train, y_validation))


((1095, 79), (365, 79), (1095,), (365,))

## CatBoost Basics

In [11]:
from catboost import CatBoostRegressor, Pool, metrics, cv
from sklearn.metrics import mean_squared_error

### Model Training

In [12]:
# Baseline regressor: RMSE loss, fixed seed, no training log output.
model = CatBoostRegressor(
    logging_level='Silent',
    random_seed=42,
    loss_function='RMSE',
)

In [13]:
# Fit on the training split, tracking the metric on the validation split and
# drawing the interactive training plot.
validation_set = (X_validation, y_validation)
model.fit(
    X_train,
    y_train,
    eval_set=validation_set,
    cat_features=categorical_features_indices,
    plot=True,
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

### Model Cross-Validation

In [14]:
# Cross-validate the baseline configuration on the full training set,
# forcing the RMSE loss in the parameter set passed to `cv`.
cv_params = {**model.get_params(), 'loss_function': 'RMSE'}
cv_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(cv_pool, cv_params, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [15]:
# The 'test-RMSE-*' columns hold RMSE values (lower is better), so the line is
# labelled RMSE rather than "accuracy". The best step is located once and
# reused for the mean, the std and the step number.
cv_rmse_mean = np.asarray(cv_data['test-RMSE-mean'])
cv_rmse_std = np.asarray(cv_data['test-RMSE-std'])
best_step = int(cv_rmse_mean.argmin())
print('Best validation RMSE: {:.4f}±{:.3f} on step {}'.format(
    cv_rmse_mean[best_step],
    cv_rmse_std[best_step],
    best_step
))

Best validation accuracy score: 0.1716±0.014 on step 999


In [16]:
# The value is the minimum mean test RMSE across CV iterations, so label it
# RMSE (matching the later tuned-model report) instead of "accuracy".
print('Precise validation RMSE score: {}'.format(np.min(cv_data['test-RMSE-mean'])))

Precise validation accuracy score: 0.1715679101864691


### Model Applying

In [17]:
# Submission frame: test ids plus predictions mapped back from log-price to
# price with exp().
submission_baseline = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': np.exp(model.predict(X_test)),
})

In [18]:
# submission_baseline.to_csv('/Users/vasevooo/projects/house_prices/housing_kaggle/data/submission_baseline.csv', index=False)

In [19]:
# RMSE on log(SalePrice) for the train and validation splits. The root is
# taken explicitly with np.sqrt because the `squared` argument of
# mean_squared_error is deprecated in recent scikit-learn releases.
# The values are errors (lower is better), hence "RMSE" rather than "acc".
print("train RMSE:", np.sqrt(mean_squared_error(y_train, model.predict(X_train))))
print("valid RMSE:", np.sqrt(mean_squared_error(y_validation, model.predict(X_validation))))

train acc: 0.07741940218304316
valid acc: 0.13523559769658716


## CatBoost Features

In [20]:
# Shared CatBoost settings reused by the feature demonstrations below.
params = dict(
    iterations=500,
    learning_rate=0.1,
    eval_metric='RMSE',
    random_seed=42,
    logging_level='Silent',
    use_best_model=False,
)

# Train / validation pools carrying the categorical column indices.
train_pool = Pool(
    X_train, y_train, cat_features=categorical_features_indices
)
validate_pool = Pool(
    X_validation, y_validation, cat_features=categorical_features_indices
)

### Using the best model

In [21]:
# Train the same configuration twice: once with the shared `params`
# (use_best_model=False) and once with use_best_model=True, then compare
# their validation error.
model = CatBoostRegressor(**params)
model.fit(train_pool, eval_set=validate_pool)

best_model_params = params.copy()
best_model_params.update({
    'use_best_model': True
})
best_model = CatBoostRegressor(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);

# The reported number is an RMSE (lower is better), so it is labelled as such.
# np.sqrt(MSE) replaces `squared=False`, which recent scikit-learn releases
# have deprecated.
print('Simple model validation RMSE: {:.4}'.format(
    np.sqrt(mean_squared_error(y_validation, model.predict(X_validation)))
))
print('')

print('Best model validation RMSE: {:.4}'.format(
    np.sqrt(mean_squared_error(y_validation, best_model.predict(X_validation)))
))

Simple model validation accuracy: 0.1394

Best model validation accuracy: 0.1393


### Early Stopping

In [22]:
%%time
model = CatBoostRegressor(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: user 25.7 s, sys: 4.23 s, total: 29.9 s
Wall time: 7.83 s


<catboost.core.CatBoostRegressor at 0x14f5b4490>

In [23]:
%%time
earlystop_params = params.copy()
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostRegressor(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool);

CPU times: user 20.6 s, sys: 3.46 s, total: 24.1 s
Wall time: 5.61 s


In [24]:
# Compare tree count and validation RMSE of the full and early-stopped models.
# The metric is an RMSE (lower is better), so it is labelled as such, and the
# root is taken with np.sqrt because `squared=False` is deprecated in recent
# scikit-learn releases.
print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation RMSE: {:.4}'.format(
    np.sqrt(mean_squared_error(y_validation, model.predict(X_validation)))
))
print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation RMSE: {:.4}'.format(
    np.sqrt(mean_squared_error(y_validation, earlystop_model.predict(X_validation)))
))

Simple model tree count: 500
Simple model validation accuracy: 0.1394

Early-stopped model tree count: 409
Early-stopped model validation accuracy: 0.1397


### Using Baseline

In [25]:
# Short 10-iteration model whose raw predictions seed a second fit.
current_params = {**params, 'iterations': 10}
model = CatBoostRegressor(**current_params)
model.fit(X_train, y_train, cat_features=categorical_features_indices)

# A baseline must be taken from raw formula values.
baseline = model.predict(X_train, prediction_type='RawFormulaVal')

# Fit again, starting from the baseline predictions.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    baseline=baseline,
);

### Snapshot Support

In [26]:
# First run: 50 iterations at learning rate 0.5, verbose, saving a snapshot.
params_with_snapshot = {
    **params,
    'iterations': 50,
    'learning_rate': 0.5,
    'logging_level': 'Verbose',
}
model = CatBoostRegressor(**params_with_snapshot)
model = model.fit(train_pool, eval_set=validate_pool, save_snapshot=True)

# Second run: 100 iterations at learning rate 0.1, again with save_snapshot.
# NOTE(review): the recorded output shows the same bestTest/bestIteration for
# both runs -- presumably the second fit resumed from the saved snapshot;
# confirm this is the behaviour being demonstrated.
params_with_snapshot = {
    **params_with_snapshot,
    'iterations': 100,
    'learning_rate': 0.1,
}
model = CatBoostRegressor(**params_with_snapshot)
model = model.fit(train_pool, eval_set=validate_pool, save_snapshot=True)


bestTest = 0.1572779214
bestIteration = 34


bestTest = 0.1572779214
bestIteration = 34



### Feature Importances

In [27]:
# Fit a 500-iteration model and list the features by descending importance.
model = CatBoostRegressor(iterations=500, random_seed=42, logging_level='Silent')
model.fit(train_pool)

feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns

ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_features:
    print('{}: {}'.format(name, score))

OverallQual: 12.820283405473697
TotalBsmtSF: 5.790748460122556
Neighborhood: 5.719308611704258
ExterQual: 5.180685438748352
FireplaceQu: 5.172588857397844
TotRmsAbvGrd: 4.70432654082583
Fireplaces: 4.0504581533253905
GarageArea: 3.539415005810179
BsmtFinSF1: 3.4864897330861333
FullBath: 3.4668949913473828
KitchenQual: 3.268796225695479
BsmtQual: 3.251912162079809
GarageCars: 3.0078851767579957
MSSubClass: 2.388714011894593
GarageFinish: 1.9778037845974157
HeatingQC: 1.6995310086397997
HalfBath: 1.594240820953803
OverallCond: 1.471836402974526
GarageQual: 1.349847081946364
GarageCond: 1.32725390855051
LotShape: 1.1707377471372036
PavedDrive: 1.147164503192137
CentralAir: 1.0937176975373222
BedroomAbvGr: 1.0871058624769077
GarageType: 1.0743862041742855
PoolQC: 0.9417048264766691
YearRemodAdd: 0.8562472467397516
MSZoning: 0.8390541333926195
SaleCondition: 0.7940709894739365
HouseStyle: 0.7880550200006974
BldgType: 0.7580650814558276
GarageYrBlt: 0.7111929689294042
Condition2: 0.688204536

### Eval Metrics

In [28]:
# Fit a 500-iteration model, then evaluate RMSE on the validation pool with
# the interactive plot enabled.
model = CatBoostRegressor(iterations=500, random_seed=42, logging_level='Silent')
model.fit(train_pool)
eval_metrics = model.eval_metrics(validate_pool, 'RMSE', plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

### Learning Processes Comparison 

In [29]:
# Two 100-iteration models differing only in tree depth; each writes its
# training logs to its own directory so the curves can be compared later.
comparison_kwargs = dict(iterations=100, logging_level='Silent')

model1 = CatBoostRegressor(depth=1, train_dir='model_depth_1/', **comparison_kwargs)
model1.fit(train_pool, eval_set=validate_pool)

model2 = CatBoostRegressor(depth=5, train_dir='model_depth_5/', **comparison_kwargs)
model2.fit(train_pool, eval_set=validate_pool);

In [30]:
from catboost import MetricVisualizer
# Plot the learning curves stored in the two training directories side by side.
compared_train_dirs = ['model_depth_1', 'model_depth_5']
widget = MetricVisualizer(compared_train_dirs)
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

### Model Saving

In [31]:
# Train a tiny model, write it to disk, then load it into a fresh estimator.
model = CatBoostRegressor(iterations=10, random_seed=42, logging_level='Silent')
model.fit(train_pool)
model.save_model('catboost_model.dump')

model = CatBoostRegressor()
model.load_model('catboost_model.dump');

## Parameters Tuning

In [32]:
import hyperopt

def hyperopt_objective(params):
    """Return the best cross-validated RMSE for one hyper-parameter sample.

    Parameters
    ----------
    params : mapping
        Must provide 'l2_leaf_reg' (cast to int here) and 'learning_rate'.

    Returns
    -------
    The minimum of the 'test-RMSE-mean' column produced by `cv` on the full
    training data (`X`, `y`). Lower is better, which is what hyperopt
    minimises.
    """
    # The estimator is built only to obtain a parameter dict for `cv`.
    candidate = CatBoostRegressor(
        loss_function='RMSE',
        eval_metric='RMSE',
        iterations=500,
        random_seed=42,
        verbose=False,
        learning_rate=params['learning_rate'],
        l2_leaf_reg=int(params['l2_leaf_reg']),
    )

    full_pool = Pool(X, y, cat_features=categorical_features_indices)
    fold_scores = cv(
        full_pool,
        candidate.get_params(),
        logging_level='Silent',
    )

    return np.min(fold_scores['test-RMSE-mean'])

In [33]:
from numpy.random import RandomState

# Search space for the two tuned hyper-parameters.
params_space = dict(
    l2_leaf_reg=hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    learning_rate=hyperopt.hp.uniform('learning_rate', 0.01, 0.5),
)

# `trials` records every evaluated point.
trials = hyperopt.Trials()

# NOTE(review): no `rstate` is passed (it was left commented out as
# `rstate=123`), so the search itself is not seeded here.
best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
)

print(best)

 42%|████▏     | 21/50 [10:11<14:04, 29.13s/trial, best loss: 0.17283827567468735]


KeyboardInterrupt: 

In [34]:
# Rebuild the estimator from the best hyperopt sample and cross-validate it.
# Requires `best` from the search cell above.
tuned_kwargs = dict(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
)
model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    iterations=500,
    random_seed=42,
    verbose=False,
    **tuned_kwargs,
)
cv_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(cv_pool, model.get_params())

NameError: name 'best' is not defined

In [None]:
# Lowest mean test RMSE reached across the CV iterations.
best_cv_rmse = np.min(cv_data['test-RMSE-mean'])
print('Precise validation RMSE score: {}'.format(best_cv_rmse))

Precise validation RMSE score: 0.16386573950333302


## Grid Search

### Simple grid search

In [35]:
# Unfitted base estimator for the grid search: RMSE loss and metric, fixed
# seed, quiet logging.
model = CatBoostRegressor(
    loss_function='RMSE',
    eval_metric='RMSE',
    verbose=False,
    random_seed=42,
)

In [36]:
# Candidate values for each tuned hyper-parameter.
grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    depth=[1, 2, 3],
    l2_leaf_reg=[1, 3, 5],
    iterations=[50, 100, 500, 1000],
)

# Search over the grid on the training pool, without shuffling.
grid_search_results = model.grid_search(
    grid,
    train_pool,
    shuffle=False,
    verbose=3,
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


bestTest = 7.308193184
bestIteration = 49

0:	loss: 7.3081932	best: 7.3081932 (0)	total: 93.2ms	remaining: 9.97s

bestTest = 0.9999062959
bestIteration = 49


bestTest = 0.2181508862
bestIteration = 49


bestTest = 7.32156386
bestIteration = 49

3:	loss: 7.3215639	best: 0.2181509 (2)	total: 244ms	remaining: 6.34s

bestTest = 1.008326419
bestIteration = 49


bestTest = 0.2245610796
bestIteration = 49


bestTest = 7.334355487
bestIteration = 49

6:	loss: 7.3343555	best: 0.2181509 (2)	total: 379ms	remaining: 5.46s

bestTest = 1.016274016
bestIteration = 49


bestTest = 0.2358990745
bestIteration = 49


bestTest = 4.438651491
bestIteration = 99

9:	loss: 4.4386515	best: 0.2181509 (2)	total: 560ms	remaining: 5.48s

bestTest = 0.2176611756
bestIteration = 99


bestTest = 0.1572959333
bestIteration = 99


bestTest = 4.453932911
bestIteration = 99

12:	loss: 4.4539329	best: 0.1572959 (11)	total: 827ms	remaining: 6.04s

bestTest = 0.2238350822
bestIteration = 99


bestTest = 0.1627473452
bestI

In [None]:
# Parameter combination selected by the grid search.
grid_search_results['params']

{'depth': 3, 'l2_leaf_reg': 1, 'iterations': 1000, 'learning_rate': 0.1}

In [None]:
# Names of the per-iteration series stored with the grid-search CV results.
grid_search_results['cv_results'].keys()

dict_keys(['iterations', 'test-RMSE-mean', 'test-RMSE-std', 'train-RMSE-mean', 'train-RMSE-std'])

In [37]:
# Train a fresh estimator on the training pool with the grid-search winners.
best_grid_params = grid_search_results['params']
simple_gs_model = CatBoostRegressor(
    eval_metric='RMSE',
    random_seed=42,
    verbose=False,
    loss_function='RMSE',
    learning_rate=best_grid_params['learning_rate'],
    depth=best_grid_params['depth'],
    l2_leaf_reg=best_grid_params['l2_leaf_reg'],
    iterations=best_grid_params['iterations']
).fit(train_pool)

# Cross-validate with the tuned estimator's own parameters (previously
# `model.get_params()`), so the CV reflects the hyper-parameters chosen above
# regardless of whether `grid_search` updated `model` in place.
cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    simple_gs_model.get_params()
)

Training on fold [0/3]

bestTest = 0.1527830821
bestIteration = 935

Training on fold [1/3]

bestTest = 0.1488158056
bestIteration = 797

Training on fold [2/3]

bestTest = 0.1226435902
bestIteration = 817



In [38]:
# Validation RMSE of the grid-search model. np.sqrt(MSE) is used because the
# `squared` argument is deprecated in recent scikit-learn releases.
float(np.sqrt(mean_squared_error(y_validation, simple_gs_model.predict(X_validation))))

0.13457163823548607

In [42]:
# Submission frame for the tuned model: test ids plus predictions mapped back
# from log-price to price with exp().
submission_tuning = pd.DataFrame({
    'Id': df_test['Id'],
    'SalePrice': np.exp(simple_gs_model.predict(X_test)),
})

# Sanity check: mean predicted sale price.
submission_tuning['SalePrice'].mean()

178471.78100128766

In [212]:
# Write the tuned-model submission as CSV without the index column.
# NOTE(review): absolute, machine-specific output path.
submission_tuning.to_csv(
    path_or_buf='/Users/vasevooo/projects/house_prices/housing_kaggle/data/submission_tuning.csv',
    index=False,
)

## Stacking

In [233]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Preprocessors for the stacking pipeline: a scaler for the numeric columns
# and a one-hot encoder for the categorical columns.
ss, ohe = StandardScaler(), OneHotEncoder()

### Feature Preparation

In [251]:
# Object-typed columns, cast to str so every cell has the same type (the
# -999 fill value sits next to the original strings).
df_cat = df.select_dtypes(include='object').astype(str)

# One-hot encode and densify.
encoded_features = ohe.fit(df_cat).transform(df_cat).toarray()

# Column names of the form "<source column>_<category value>", in the
# encoder's output order.
feature_names = [
    f"{column}_{value}"
    for column, values in zip(df_cat.columns, ohe.categories_)
    for value in values
]

encoded_df = pd.DataFrame(encoded_features, columns=feature_names)

In [265]:
# Integer- and float-typed columns of the combined frame, minus the identifier.
df_num = df.select_dtypes(include=['int', 'float']).drop(columns='Id')

In [266]:
# Standardise the numeric columns, keeping their names.
# NOTE(review): the scaler is fitted on `df_num`, which comes from the
# combined train+test frame -- confirm that is intended.
scaled_values = ss.fit_transform(df_num)
scaled_df = pd.DataFrame(scaled_values, columns=df_num.columns)

In [267]:
# Shapes of the scaled numeric block and the one-hot encoded block.
scaled_df.shape, encoded_df.shape

((2919, 36), (2919, 274))

In [268]:
# Place the scaled numeric and one-hot encoded blocks side by side.
df_stacking = pd.concat(
    objs=[scaled_df, encoded_df],
    axis=1,
)

In [270]:
# Split the stacking features back into train and test parts by position.
# The first `ind_train` rows are the training rows. The previous
# `.loc[:ind_test]` was a label-inclusive slice bounded by the *test* row
# count, which only matched because the test set has exactly one row fewer
# than the train set.
df_train = df_stacking.iloc[:ind_train]
df_test = df_stacking.iloc[ind_train:]

df_train.shape, df_test.shape

((1460, 310), (1459, 310))

In [272]:
# Stacking feature matrices.
X = df_train
X_test = df_test

# Same 75/25 split and seed as used for the CatBoost section.
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.75, random_state=42
)
tuple(part.shape for part in (X_train, X_validation, y_train, y_validation))

((1095, 310), (365, 310), (1095,), (365,))

In [301]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import GridSearchCV

In [328]:
# Create a list of base models to use in the stacked ensemble

base_models = [
    ('knn', KNeighborsRegressor()),
    ('rf', RandomForestRegressor(n_estimators = 5, max_depth = 2)),
    ('lr', LinearRegression())
    
]

# Create a stacked ensemble model using the CatBoost model and the base models
stacked_model = StackingRegressor(
    estimators=base_models,
    final_estimator=simple_gs_model,
    cv=5
)

# Fit the stacked model to your data
stacked_model.fit(X_train, y_train)

# Use the fitted model to make predictions on new data
predictions = stacked_model.predict(X_test)

In [329]:
# RMSE of the stacked model on both splits. The root is taken with np.sqrt
# because the `squared` argument of mean_squared_error is deprecated in
# recent scikit-learn releases.
val_rmse = np.sqrt(mean_squared_error(y_validation, stacked_model.predict(X_validation)))
train_rmse = np.sqrt(mean_squared_error(y_train, stacked_model.predict(X_train)))

print(f'Validation RMSE: {val_rmse:.4f}')
print(f'Train RMSE: {train_rmse:.4f}')

Validation RMSE: 0.1441
Train RMSE: 0.1092


In [330]:
# Submission frame for the stacked model. The ids come from the test rows of
# the combined frame `df`; predictions are mapped back to price with exp().
submission_stacking = pd.DataFrame({
    'Id': df.loc[ind_train:, 'Id'],
    'SalePrice': np.exp(stacked_model.predict(X_test)),
})

In [331]:
# Write the stacked-model submission as CSV without the index column.
# NOTE(review): absolute, machine-specific output path.
submission_stacking.to_csv(
    path_or_buf='/Users/vasevooo/projects/house_prices/housing_kaggle/data/submission_stacking.csv',
    index=False,
)