In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_log_error
import re
from haversine import haversine, Unit
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression
import xgboost
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
import optuna

In [2]:
dfTrain = pd.read_csv("prepared_data/apartments_and_building_train.csv")
dfTest = pd.read_csv("prepared_data/apartments_and_building_test.csv")

In [3]:
def get_distance_center(lat1, lon1):
    dist = haversine((lat1, lon1), (55.751244, 37.618423), unit=Unit.METERS)
    return round(dist/1000, 3)

def get_shortest_distance(lat, lon, coordinates):
    shortest = np.inf
    for coord in coordinates:
        dist = haversine(coord, (lat, lon), unit=Unit.METERS)
        if dist < shortest:
            shortest = dist
    if shortest == np.inf:
        return 100
    return round(shortest/1000, 3)

In [4]:
dfTrain["distance_center"] = dfTrain.loc[:, "latitude":"longitude"].apply(lambda x: get_distance_center(x.latitude, x.longitude), axis=1)
dfTest["distance_center"] = dfTest.loc[:, "latitude":"longitude"].apply(lambda x: get_distance_center(x.latitude, x.longitude), axis=1)

In [5]:
def add_distance_to(target, df_train, df_test):
    targets = pd.read_csv(f"prepared_data/extra_features/{target}.csv")
    coordinates = targets.values.tolist()

    df_train[f"distance_{target}"] = df_train.loc[:, "latitude":"longitude"].apply(lambda x: get_shortest_distance(x.latitude, x.longitude, coordinates), axis=1)
    df_test[f"distance_{target}"] = df_test.loc[:, "latitude":"longitude"].apply(lambda x: get_shortest_distance(x.latitude, x.longitude, coordinates), axis=1)

""" positive """
add_distance_to("metro", dfTrain, dfTest)
add_distance_to("university", dfTrain, dfTest)
add_distance_to("shopping_center", dfTrain, dfTest)
add_distance_to("park", dfTrain, dfTest)
add_distance_to("golf_course", dfTrain, dfTest)
""" negative """
add_distance_to("airport", dfTrain, dfTest)
add_distance_to("prison", dfTrain, dfTest)

In [6]:
def process_data(dataframe, keep_col):
    remove_columns = ['street', 'address', 'building_id', 'id_y', 'id_x', 'price']
    
    for col in keep_col:
        remove_columns.remove(col)

    dataframe = dataframe.drop(remove_columns, axis=1)

    return dataframe

# Optimize parameters

## LGBMRegression

In [None]:
# normalize training data
train_data = dfTrain.copy()
train_data = process_data(train_data, keep_col=['price'])

# split training data into test and training split
X_train, X_test = train_test_split(train_data, test_size=0.10, random_state=42)

# remove apartments with very high price from training data
qhigh = X_train["price"].quantile(0.999)
X_train = X_train[(train_data["price"] < qhigh)]

# get price columns 
y_train = X_train.pop("price")
y_test = X_test.pop("price")

cols = X_train.columns

# log prices
y_train = np.log(y_train)

scaler = preprocessing.StandardScaler().fit(X_train)
imputor=IterativeImputer(
    estimator=BayesianRidge(),
    imputation_order='ascending', 
    max_iter=100,
    tol=1e-5)

X_train = imputor.fit_transform(X_train)
X_train = scaler.transform(X_train)

X_test = imputor.fit_transform(X_test)
X_test = scaler.transform(X_test)


"""
lgbm_regressor = lgb.LGBMRegressor(
    num_leaves=62,
    max_depth=80, 
    random_state=42,
    metric='rmse',
    n_jobs=5, 
    n_estimators=3821,
    colsample_bytree=0.5466374179397641,
    subsample=0.9925544863710687,
    learning_rate=0.041096962826700785
)
"""

def objective(trial):
    num_leaves = trial.suggest_int('num_leaves', 50, 74)
    max_depth = trial.suggest_int('max_depth', 60, 100)
    n_jobs = trial.suggest_int('n_jobs', 3, 7)
    n_estimators = trial.suggest_int('n_estimators', 3500, 4120)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.44, 0.64)
    subsample = trial.suggest_uniform('subsample', 0.9, 1)
    learning_rate = trial.suggest_uniform('learning_rate', 0.02, 0.06)

    clf = lgb.LGBMRegressor(
        num_leaves=num_leaves,
        max_depth=max_depth, 
        random_state=42,
        metric='rmse',
        n_jobs=n_jobs, 
        n_estimators=n_estimators,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        learning_rate=learning_rate
    )

    clf.fit(X_train, y_train)
    lgbm_prediction = clf.predict(X_test, num_iteration=clf.best_iteration_)
    lgbm_prediction = np.exp(lgbm_prediction)

    return np.sqrt(mean_squared_log_error(lgbm_prediction, y_test))

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)
print(study.best_trial)

# 'num_leaves': 57, 'max_depth': 86, 'n_jobs': 5, 'n_estimators': 3708, 'colsample_bytree': 0.4811674364805376, 'subsample': 0.9293927819656886, 'learning_rate': 0.05183967241031373
# 'num_leaves': 52, 'max_depth': 75, 'n_jobs': 4, 'n_estimators': 3640, 'colsample_bytree': 0.48432530282059805, 'subsample': 0.9272124012179532, 'learning_rate': 0.05084923664278231


## XGBoost

In [23]:
# normalize training data
train_data = dfTrain.copy()
train_data = process_data(train_data, keep_col=['price'])

# split training data into test and training split
X_train, X_test = train_test_split(train_data, test_size=0.10, random_state=42)

# remove apartments with very high price from training data
qhigh = X_train["price"].quantile(0.999)
X_train = X_train[(train_data["price"] < qhigh)]

# get price columns 
y_train = X_train.pop("price")
y_test = X_test.pop("price")

cols = X_train.columns

# log prices
y_train = np.log(y_train)

scaler = preprocessing.StandardScaler().fit(X_train)
imputor=IterativeImputer(
    estimator=BayesianRidge(),
    imputation_order='ascending', 
    max_iter=100,
    tol=1e-5)

X_train = imputor.fit_transform(X_train)
X_train = scaler.transform(X_train)

X_test = imputor.fit_transform(X_test)
X_test = scaler.transform(X_test)


"""
xgboost_regressor=xgboost.XGBRegressor(
    base_score=0.6809801659452004, booster='gbtree', colsample_bylevel=1,
    colsample_bynode=1, colsample_bytree=0.7880864100886343, enable_categorical=False,
    gamma=0, gpu_id=-1, importance_type=None,
    interaction_constraints='', learning_rate=0.04158422646007316, max_delta_step=0,
    max_depth=12, min_child_weight=7,
    monotone_constraints='()', n_estimators=1894, n_jobs=16,
    num_parallel_tree=1, predictor='auto', random_state=42, reg_alpha=0,
    reg_lambda=1, scale_pos_weight=1, subsample=0.8295598441506259, tree_method='exact',
    validate_parameters=1, verbosity=None, objective='reg:squarederror'
)
"""

def objective(trial):
    base_score = trial.suggest_uniform('base_score', 0.6, 0.76)
    max_depth = trial.suggest_int('max_depth', 10, 14)
    n_jobs = trial.suggest_int('n_jobs', 14, 18)
    min_child_weight = trial.suggest_int('min_child_weight', 6, 10)
    n_estimators = trial.suggest_int('n_estimators', 1594, 2194)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.7, 0.86)
    subsample = trial.suggest_uniform('subsample', 0.73, 0.9)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.06)

    clf=xgboost.XGBRegressor(base_score=base_score, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=colsample_bytree,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=learning_rate, max_delta_step=0,
             max_depth=max_depth, min_child_weight=min_child_weight,
             monotone_constraints='()', n_estimators=n_estimators, n_jobs=n_jobs,
             num_parallel_tree=1, predictor='auto', random_state=42, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=subsample, tree_method='exact',
             validate_parameters=1, verbosity=None, objective='reg:squarederror',
             enable_categorical=False)

    clf.fit(X_train, y_train, early_stopping_rounds=10, eval_set=[(X_train, y_train), (X_test, np.log(y_test))])
    xgboost_prediction = clf.predict(X_test)
    xgboost_prediction = np.exp(xgboost_prediction)

    return np.sqrt(mean_squared_log_error(xgboost_prediction, y_test))

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)
print(study.best_trial)

  X_train = X_train[(train_data["price"] < qhigh)]
[32m[I 2021-11-16 20:28:07,527][0m A new study created in memory with name: no-name-7d44a243-1a1b-46e7-a7f2-2396a452d068[0m


Parameters: { enable_categorical } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:14.93111	validation_1-rmse:14.93187
[1]	validation_0-rmse:14.19897	validation_1-rmse:14.20022
[2]	validation_0-rmse:13.50282	validation_1-rmse:13.50436
[3]	validation_0-rmse:12.84087	validation_1-rmse:12.84309
[4]	validation_0-rmse:12.21158	validation_1-rmse:12.21448
[5]	validation_0-rmse:11.61303	validation_1-rmse:11.61655
[6]	validation_0-rmse:11.04392	validation_1-rmse:11.04781
[7]	validation_0-rmse:10.50268	validation_1-rmse:10.50703
[8]	validation_0-rmse:9.98807	validation_1-rmse:9.99289
[9]	validation_0-rmse:9.49854	validation_1-rmse:9.50393
[10]	validation_0-rmse:9.03317	validation_1-rmse:9.03886
[11]	validation_0-rmse:8.59063	validation_1-rmse:8.59697
[12]	validation_

[32m[I 2021-11-16 20:28:13,588][0m Trial 0 finished with value: 0.12828821598611984 and parameters: {'base_score': 0.7492546989054832, 'max_depth': 12, 'n_jobs': 16, 'min_child_weight': 10, 'n_estimators': 2152, 'colsample_bytree': 0.7901313901911611, 'subsample': 0.7739265386526697, 'learning_rate': 0.0490824366951667}. Best is trial 0 with value: 0.12828821598611984.[0m


Parameters: { enable_categorical } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:15.45938	validation_1-rmse:15.45965
[1]	validation_0-rmse:15.21231	validation_1-rmse:15.21274
[2]	validation_0-rmse:14.96920	validation_1-rmse:14.96987
[3]	validation_0-rmse:14.73000	validation_1-rmse:14.73077
[4]	validation_0-rmse:14.49469	validation_1-rmse:14.49555
[5]	validation_0-rmse:14.26311	validation_1-rmse:14.26405
[6]	validation_0-rmse:14.03520	validation_1-rmse:14.03628
[7]	validation_0-rmse:13.81096	validation_1-rmse:13.81219
[8]	validation_0-rmse:13.59033	validation_1-rmse:13.59181
[9]	validation_0-rmse:13.37318	validation_1-rmse:13.37484
[10]	validation_0-rmse:13.15952	validation_1-rmse:13.16141
[11]	validation_0-rmse:12.94928	validation_1-rmse:12.95149
[12]	val

[32m[I 2021-11-16 20:28:31,793][0m Trial 1 finished with value: 0.12755621159709343 and parameters: {'base_score': 0.7396168964688108, 'max_depth': 10, 'n_jobs': 14, 'min_child_weight': 10, 'n_estimators': 1656, 'colsample_bytree': 0.7361508057895989, 'subsample': 0.7652792105520043, 'learning_rate': 0.015997140327359122}. Best is trial 1 with value: 0.12755621159709343.[0m


Parameters: { enable_categorical } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:15.37627	validation_1-rmse:15.37662
[1]	validation_0-rmse:15.05884	validation_1-rmse:15.05945
[2]	validation_0-rmse:14.74797	validation_1-rmse:14.74872
[3]	validation_0-rmse:14.44352	validation_1-rmse:14.44453
[4]	validation_0-rmse:14.14541	validation_1-rmse:14.14665
[5]	validation_0-rmse:13.85342	validation_1-rmse:13.85496
[6]	validation_0-rmse:13.56750	validation_1-rmse:13.56915
[7]	validation_0-rmse:13.28746	validation_1-rmse:13.28937
[8]	validation_0-rmse:13.01322	validation_1-rmse:13.01545
[9]	validation_0-rmse:12.74460	validation_1-rmse:12.74715
[10]	validation_0-rmse:12.48156	validation_1-rmse:12.48427
[11]	validation_0-rmse:12.22400	validation_1-rmse:12.22703
[12]	val

[32m[I 2021-11-16 20:28:54,362][0m Trial 2 finished with value: 0.12469757974374289 and parameters: {'base_score': 0.7496883842846649, 'max_depth': 11, 'n_jobs': 14, 'min_child_weight': 7, 'n_estimators': 1830, 'colsample_bytree': 0.7735299481694955, 'subsample': 0.8988956362147709, 'learning_rate': 0.020664667417701982}. Best is trial 2 with value: 0.12469757974374289.[0m


Parameters: { enable_categorical } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:14.99716	validation_1-rmse:14.99793
[1]	validation_0-rmse:14.27347	validation_1-rmse:14.27472
[2]	validation_0-rmse:13.58484	validation_1-rmse:13.58637
[3]	validation_0-rmse:12.92952	validation_1-rmse:12.93170
[4]	validation_0-rmse:12.30602	validation_1-rmse:12.30884
[5]	validation_0-rmse:11.71245	validation_1-rmse:11.71587
[6]	validation_0-rmse:11.14758	validation_1-rmse:11.15169
[7]	validation_0-rmse:10.60994	validation_1-rmse:10.61462
[8]	validation_0-rmse:10.09833	validation_1-rmse:10.10329
[9]	validation_0-rmse:9.61131	validation_1-rmse:9.61662
[10]	validation_0-rmse:9.14791	validation_1-rmse:9.15358
[11]	validation_0-rmse:8.70688	validation_1-rmse:8.71308
[12]	validatio

[32m[I 2021-11-16 20:28:59,767][0m Trial 3 finished with value: 0.12811178183419022 and parameters: {'base_score': 0.6926593277831476, 'max_depth': 10, 'n_jobs': 16, 'min_child_weight': 6, 'n_estimators': 1610, 'colsample_bytree': 0.8533848706626727, 'subsample': 0.7896354366663468, 'learning_rate': 0.048299277077769175}. Best is trial 2 with value: 0.12469757974374289.[0m


Parameters: { enable_categorical } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:15.56144	validation_1-rmse:15.56167
[1]	validation_0-rmse:15.36271	validation_1-rmse:15.36307
[2]	validation_0-rmse:15.16655	validation_1-rmse:15.16700
[3]	validation_0-rmse:14.97289	validation_1-rmse:14.97353
[4]	validation_0-rmse:14.78177	validation_1-rmse:14.78248
[5]	validation_0-rmse:14.59304	validation_1-rmse:14.59396
[6]	validation_0-rmse:14.40675	validation_1-rmse:14.40773
[7]	validation_0-rmse:14.22281	validation_1-rmse:14.22390
[8]	validation_0-rmse:14.04123	validation_1-rmse:14.04246
[9]	validation_0-rmse:13.86195	validation_1-rmse:13.86333
[10]	validation_0-rmse:13.68498	validation_1-rmse:13.68658
[11]	validation_0-rmse:13.51027	validation_1-rmse:13.51213
[12]	val

[32m[I 2021-11-16 20:29:22,412][0m Trial 4 finished with value: 0.12706833509295926 and parameters: {'base_score': 0.6872462907368927, 'max_depth': 12, 'n_jobs': 17, 'min_child_weight': 10, 'n_estimators': 1671, 'colsample_bytree': 0.7924416179939264, 'subsample': 0.7926979707946822, 'learning_rate': 0.012781848538679948}. Best is trial 2 with value: 0.12469757974374289.[0m


Parameters: { enable_categorical } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:15.40113	validation_1-rmse:15.40152
[1]	validation_0-rmse:15.03363	validation_1-rmse:15.03433
[2]	validation_0-rmse:14.67492	validation_1-rmse:14.67577
[3]	validation_0-rmse:14.32478	validation_1-rmse:14.32597
[4]	validation_0-rmse:13.98304	validation_1-rmse:13.98450
[5]	validation_0-rmse:13.64944	validation_1-rmse:13.65107
[6]	validation_0-rmse:13.32383	validation_1-rmse:13.32563
[7]	validation_0-rmse:13.00597	validation_1-rmse:13.00812
[8]	validation_0-rmse:12.69570	validation_1-rmse:12.69803
[9]	validation_0-rmse:12.39281	validation_1-rmse:12.39544
[10]	validation_0-rmse:12.09716	validation_1-rmse:12.10007
[11]	validation_0-rmse:11.80864	validation_1-rmse:11.81191
[12]	val

[32m[I 2021-11-16 20:29:35,725][0m Trial 5 finished with value: 0.12680478388152497 and parameters: {'base_score': 0.672359129062591, 'max_depth': 10, 'n_jobs': 18, 'min_child_weight': 8, 'n_estimators': 2095, 'colsample_bytree': 0.8295242535341589, 'subsample': 0.8999013157014284, 'learning_rate': 0.023885181767593806}. Best is trial 2 with value: 0.12469757974374289.[0m


Parameters: { enable_categorical } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:14.94450	validation_1-rmse:14.94531
[1]	validation_0-rmse:14.16511	validation_1-rmse:14.16658
[2]	validation_0-rmse:13.42657	validation_1-rmse:13.42866
[3]	validation_0-rmse:12.72660	validation_1-rmse:12.72949
[4]	validation_0-rmse:12.06316	validation_1-rmse:12.06653
[5]	validation_0-rmse:11.43431	validation_1-rmse:11.43818
[6]	validation_0-rmse:10.83835	validation_1-rmse:10.84281
[7]	validation_0-rmse:10.27339	validation_1-rmse:10.27815
[8]	validation_0-rmse:9.73794	validation_1-rmse:9.74312
[9]	validation_0-rmse:9.23034	validation_1-rmse:9.23584
[10]	validation_0-rmse:8.74933	validation_1-rmse:8.75496
[11]	validation_0-rmse:8.29348	validation_1-rmse:8.29961
[12]	validation_

[32m[I 2021-11-16 20:29:44,388][0m Trial 6 finished with value: 0.12603958017797454 and parameters: {'base_score': 0.68336312228434, 'max_depth': 12, 'n_jobs': 15, 'min_child_weight': 10, 'n_estimators': 2049, 'colsample_bytree': 0.8184980660006107, 'subsample': 0.8696813967664683, 'learning_rate': 0.05220186724592196}. Best is trial 2 with value: 0.12469757974374289.[0m


Parameters: { enable_categorical } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:15.21842	validation_1-rmse:15.21898
[1]	validation_0-rmse:14.68602	validation_1-rmse:14.68694
[2]	validation_0-rmse:14.17242	validation_1-rmse:14.17358
[3]	validation_0-rmse:13.67674	validation_1-rmse:13.67818
[4]	validation_0-rmse:13.19851	validation_1-rmse:13.20033
[5]	validation_0-rmse:12.73697	validation_1-rmse:12.73929
[6]	validation_0-rmse:12.29162	validation_1-rmse:12.29448
[7]	validation_0-rmse:11.86179	validation_1-rmse:11.86501
[8]	validation_0-rmse:11.44706	validation_1-rmse:11.45057
[9]	validation_0-rmse:11.04679	validation_1-rmse:11.05047
[10]	validation_0-rmse:10.66056	validation_1-rmse:10.66443
[11]	validation_0-rmse:10.28784	validation_1-rmse:10.29220
[12]	val

[32m[I 2021-11-16 20:29:53,903][0m Trial 7 finished with value: 0.1261530462900999 and parameters: {'base_score': 0.6799534947242954, 'max_depth': 13, 'n_jobs': 17, 'min_child_weight': 8, 'n_estimators': 2184, 'colsample_bytree': 0.7658590358391841, 'subsample': 0.8332887517591898, 'learning_rate': 0.03501417921533656}. Best is trial 2 with value: 0.12469757974374289.[0m


Parameters: { enable_categorical } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:15.53219	validation_1-rmse:15.53243
[1]	validation_0-rmse:15.32179	validation_1-rmse:15.32214
[2]	validation_0-rmse:15.11427	validation_1-rmse:15.11472
[3]	validation_0-rmse:14.90955	validation_1-rmse:14.91010
[4]	validation_0-rmse:14.70764	validation_1-rmse:14.70834
[5]	validation_0-rmse:14.50844	validation_1-rmse:14.50940
[6]	validation_0-rmse:14.31196	validation_1-rmse:14.31305
[7]	validation_0-rmse:14.11812	validation_1-rmse:14.11946
[8]	validation_0-rmse:13.92692	validation_1-rmse:13.92833
[9]	validation_0-rmse:13.73830	validation_1-rmse:13.73990
[10]	validation_0-rmse:13.55223	validation_1-rmse:13.55394
[11]	validation_0-rmse:13.36871	validation_1-rmse:13.37070
[12]	val

[32m[I 2021-11-16 20:30:18,478][0m Trial 8 finished with value: 0.1250769096623659 and parameters: {'base_score': 0.7045233696479124, 'max_depth': 13, 'n_jobs': 17, 'min_child_weight': 9, 'n_estimators': 1595, 'colsample_bytree': 0.7973951885352326, 'subsample': 0.8789177618063352, 'learning_rate': 0.013558394689787581}. Best is trial 2 with value: 0.12469757974374289.[0m


Parameters: { enable_categorical } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	validation_0-rmse:15.16318	validation_1-rmse:15.16379
[1]	validation_0-rmse:14.58575	validation_1-rmse:14.58673
[2]	validation_0-rmse:14.03036	validation_1-rmse:14.03161
[3]	validation_0-rmse:13.49617	validation_1-rmse:13.49797
[4]	validation_0-rmse:12.98249	validation_1-rmse:12.98481
[5]	validation_0-rmse:12.48830	validation_1-rmse:12.49105
[6]	validation_0-rmse:12.01293	validation_1-rmse:12.01612
[7]	validation_0-rmse:11.55565	validation_1-rmse:11.55931
[8]	validation_0-rmse:11.11582	validation_1-rmse:11.11970
[9]	validation_0-rmse:10.69267	validation_1-rmse:10.69692
[10]	validation_0-rmse:10.28574	validation_1-rmse:10.29021
[11]	validation_0-rmse:9.89433	validation_1-rmse:9.89936
[12]	valid

[32m[I 2021-11-16 20:30:30,633][0m Trial 9 finished with value: 0.1261527817536986 and parameters: {'base_score': 0.6866286065017388, 'max_depth': 10, 'n_jobs': 17, 'min_child_weight': 9, 'n_estimators': 1941, 'colsample_bytree': 0.8073613974252026, 'subsample': 0.7739952989335789, 'learning_rate': 0.03811785096815648}. Best is trial 2 with value: 0.12469757974374289.[0m


FrozenTrial(number=2, values=[0.12469757974374289], datetime_start=datetime.datetime(2021, 11, 16, 20, 28, 31, 793079), datetime_complete=datetime.datetime(2021, 11, 16, 20, 28, 54, 362527), params={'base_score': 0.7496883842846649, 'max_depth': 11, 'n_jobs': 14, 'min_child_weight': 7, 'n_estimators': 1830, 'colsample_bytree': 0.7735299481694955, 'subsample': 0.8988956362147709, 'learning_rate': 0.020664667417701982}, distributions={'base_score': UniformDistribution(high=0.76, low=0.6), 'max_depth': IntUniformDistribution(high=14, low=10, step=1), 'n_jobs': IntUniformDistribution(high=18, low=14, step=1), 'min_child_weight': IntUniformDistribution(high=10, low=6, step=1), 'n_estimators': IntUniformDistribution(high=2194, low=1594, step=1), 'colsample_bytree': UniformDistribution(high=0.86, low=0.7), 'subsample': UniformDistribution(high=0.9, low=0.73), 'learning_rate': UniformDistribution(high=0.06, low=0.01)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=2, state=T

# Catboost

In [8]:
# normalize training data
train_data = dfTrain.copy()
train_data = process_data(train_data, keep_col=['price'])

# split training data into test and training split
X_train, X_test = train_test_split(train_data, test_size=0.10, random_state=42)

# remove apartments with very high price from training data
qhigh = X_train["price"].quantile(0.999)
X_train = X_train[(train_data["price"] < qhigh)]

# get price columns 
y_train = X_train.pop("price")
y_test = X_test.pop("price")

cols = X_train.columns

# log prices
y_train = np.log(y_train)

scaler = preprocessing.StandardScaler().fit(X_train)
imputor=IterativeImputer(
    estimator=BayesianRidge(),
    imputation_order='ascending', 
    max_iter=100,
    tol=1e-5)

X_train = imputor.fit_transform(X_train)
X_train = scaler.transform(X_train)

X_test = imputor.fit_transform(X_test)
X_test = scaler.transform(X_test)


"""
catboost_regressor = CatBoostRegressor(
    n_estimators=2180,
    learning_rate=0.04364756663567214,
    thread_count=-1,
    depth=8,
    silent=True,
    random_state=42,
    bagging_temperature=0.16948019209038917
)
"""
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 1500, 2800)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.1)
    depth = trial.suggest_int('depth', 5, 12)
    bagging_temperature = trial.suggest_uniform('bagging_temperature', 0.05, 0.3)

    clf=CatBoostRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        thread_count=-1,
        depth=depth,
        silent=True,
        random_state=42,
        bagging_temperature=bagging_temperature
    )

    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0, early_stopping_rounds=10)
    cat_boost_prediction = clf.predict(X_test)
    cat_boost_prediction = np.exp(cat_boost_prediction)

    return np.sqrt(mean_squared_log_error(cat_boost_prediction, y_test))

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)
print(study.best_trial)

  X_train = X_train[(train_data["price"] < qhigh)]
[32m[I 2021-11-16 21:33:16,513][0m A new study created in memory with name: no-name-56c2c238-da96-476e-9b4a-2e9b9932036b[0m
[32m[I 2021-11-16 21:35:19,810][0m Trial 0 finished with value: 0.14827788748084952 and parameters: {'n_estimators': 2572, 'learning_rate': 0.02187446040965243, 'depth': 12, 'bagging_temperature': 0.14190010939091607}. Best is trial 0 with value: 0.14827788748084952.[0m
[32m[I 2021-11-16 21:35:21,622][0m Trial 1 finished with value: 0.15907758154063162 and parameters: {'n_estimators': 2717, 'learning_rate': 0.0417690769577881, 'depth': 6, 'bagging_temperature': 0.13560244833904733}. Best is trial 0 with value: 0.14827788748084952.[0m
[32m[I 2021-11-16 21:35:28,196][0m Trial 2 finished with value: 0.1540579717224881 and parameters: {'n_estimators': 1841, 'learning_rate': 0.021467649745829806, 'depth': 8, 'bagging_temperature': 0.08177135246802035}. Best is trial 0 with value: 0.14827788748084952.[0m
[3

FrozenTrial(number=4, values=[0.14457284224062036], datetime_start=datetime.datetime(2021, 11, 16, 21, 35, 33, 956276), datetime_complete=datetime.datetime(2021, 11, 16, 21, 35, 46, 595139), params={'n_estimators': 2043, 'learning_rate': 0.09604047649624867, 'depth': 10, 'bagging_temperature': 0.16597749203341156}, distributions={'n_estimators': IntUniformDistribution(high=2800, low=1500, step=1), 'learning_rate': UniformDistribution(high=0.1, low=0.01), 'depth': IntUniformDistribution(high=12, low=5, step=1), 'bagging_temperature': UniformDistribution(high=0.3, low=0.05)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=4, state=TrialState.COMPLETE, value=None)


# Random Forest

In [8]:
# normalize training data
train_data = dfTrain.copy()
train_data = process_data(train_data, keep_col=['price'])

# split training data into test and training split
X_train, X_test = train_test_split(train_data, test_size=0.10, random_state=42)

# remove apartments with very high price from training data
qhigh = X_train["price"].quantile(0.999)
X_train = X_train[(train_data["price"] < qhigh)]

# get price columns 
y_train = X_train.pop("price")
y_test = X_test.pop("price")

cols = X_train.columns

# log prices
y_train = np.log(y_train)

scaler = preprocessing.StandardScaler().fit(X_train)
imputor=IterativeImputer(
    estimator=BayesianRidge(),
    imputation_order='ascending', 
    max_iter=100,
    tol=1e-5)

X_train = imputor.fit_transform(X_train)
X_train = scaler.transform(X_train)

X_test = imputor.fit_transform(X_test)
X_test = scaler.transform(X_test)


"""
random_forest_regressor = RandomForestRegressor(
    n_estimators=100,
    criterion='mse',
    max_depth=None,
#     min_samples_split=2,
#     min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    bootstrap=True,
    oob_score=False,
    n_jobs=None,
    random_state=42,
    verbose=0,
    warm_start=False,
    ccp_alpha=0.0,
    max_samples=None
)
"""
# 0.146
# Trial 1 finished with value: 0.14516572314604645 and parameters: {'n_estimators': 361, 'max_leaf_nodes': 7728}
# Trial 1 finished with value: 0.14511747770969016 and parameters: {'n_estimators': 333, 'max_leaf_nodes': 9732}

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 300, 400)
    #max_depth = trial.suggest_int("max_depth", 18, 32, log=True)
    #min_samples_split = trial.suggest_int('min_samples_split', 60, 120)
    #min_samples_leaf = trial.suggest_int('min_samples_leaf', 25, 65)
    max_leaf_nodes = trial.suggest_int('max_leaf_nodes', 7000, 10000)
    #max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt','log2'])

    clf=RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=None,
        #min_samples_split=min_samples_split,
        #min_samples_leaf=min_samples_leaf,
        min_weight_fraction_leaf=0.0,
        max_features='auto',
        max_leaf_nodes=max_leaf_nodes,
        min_impurity_decrease=0.0,
        bootstrap=True,
        oob_score=False,
        n_jobs=None,
        random_state=42,
        verbose=0,
        warm_start=False,
        ccp_alpha=0.0,
        max_samples=None
    )

    clf.fit(X_train, y_train)
    random_forest_prediction = clf.predict(X_test)
    random_forest_prediction = np.exp(random_forest_prediction)

    return np.sqrt(mean_squared_log_error(random_forest_prediction, y_test))

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)
print(study.best_trial)

  X_train = X_train[(train_data["price"] < qhigh)]
[32m[I 2021-11-16 22:25:26,241][0m A new study created in memory with name: no-name-55e5ec17-45db-411b-aebf-0780560e05d1[0m
[32m[I 2021-11-16 22:26:42,568][0m Trial 0 finished with value: 0.14516143294393405 and parameters: {'n_estimators': 367, 'max_leaf_nodes': 8188}. Best is trial 0 with value: 0.14516143294393405.[0m
[32m[I 2021-11-16 22:27:52,626][0m Trial 1 finished with value: 0.14511747770969016 and parameters: {'n_estimators': 333, 'max_leaf_nodes': 9732}. Best is trial 1 with value: 0.14511747770969016.[0m
[32m[I 2021-11-16 22:29:09,844][0m Trial 2 finished with value: 0.14519095854407266 and parameters: {'n_estimators': 368, 'max_leaf_nodes': 8365}. Best is trial 1 with value: 0.14511747770969016.[0m
[32m[I 2021-11-16 22:30:33,078][0m Trial 3 finished with value: 0.14524646820814577 and parameters: {'n_estimators': 388, 'max_leaf_nodes': 8235}. Best is trial 1 with value: 0.14511747770969016.[0m
[32m[I 2021-1

FrozenTrial(number=1, values=[0.14511747770969016], datetime_start=datetime.datetime(2021, 11, 16, 22, 26, 42, 569000), datetime_complete=datetime.datetime(2021, 11, 16, 22, 27, 52, 626704), params={'n_estimators': 333, 'max_leaf_nodes': 9732}, distributions={'n_estimators': IntUniformDistribution(high=400, low=300, step=1), 'max_leaf_nodes': IntUniformDistribution(high=10000, low=7000, step=1)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=1, state=TrialState.COMPLETE, value=None)


# Ada boosting

In [12]:
# normalize training data
train_data = dfTrain.copy()
train_data = process_data(train_data, keep_col=['price'])

# split training data into test and training split
X_train, X_test = train_test_split(train_data, test_size=0.10, random_state=42)

# remove apartments with very high price from training data
qhigh = X_train["price"].quantile(0.999)
X_train = X_train[(train_data["price"] < qhigh)]

# get price columns 
y_train = X_train.pop("price")
y_test = X_test.pop("price")

cols = X_train.columns

# log prices
y_train = np.log(y_train)

scaler = preprocessing.StandardScaler().fit(X_train)
imputor=IterativeImputer(
    estimator=BayesianRidge(),
    imputation_order='ascending', 
    max_iter=100,
    tol=1e-5)

X_train = imputor.fit_transform(X_train)
X_train = scaler.transform(X_train)

X_test = imputor.fit_transform(X_test)
X_test = scaler.transform(X_test)


"""
ada_boost_regressor = AdaBoostRegressor(
    n_estimators=1500,
    learning_rate=0.05,
    loss='square',
    random_state=42
)
"""
# 0.304
# 0.3039 {'n_estimators': 713, 'learning_rate': 0.17754141972985152} 
# 0.296 {'n_estimators': 949, 'learning_rate': 0.02833005273221735}
# 0.2938 {'n_estimators': 650, 'learning_rate': 0.05553081486523326}

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 600, 1100)
    learning_rate = trial.suggest_uniform('learning_rate', 0.01, 0.06)

    clf=AdaBoostRegressor(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        loss='square',
        random_state=42
    )

    clf.fit(X_train, y_train)
    ada_boost_prediction = clf.predict(X_test)
    ada_boost_prediction = np.exp(ada_boost_prediction)

    return np.sqrt(mean_squared_log_error(ada_boost_prediction, y_test))

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)
print(study.best_trial)

  X_train = X_train[(train_data["price"] < qhigh)]
[32m[I 2021-11-16 22:54:52,172][0m A new study created in memory with name: no-name-417f2d19-15eb-4452-b495-e594a25c4c85[0m
[32m[I 2021-11-16 22:55:25,741][0m Trial 0 finished with value: 0.300660822366838 and parameters: {'n_estimators': 620, 'learning_rate': 0.03286704071648859}. Best is trial 0 with value: 0.300660822366838.[0m
[32m[I 2021-11-16 22:56:08,341][0m Trial 1 finished with value: 0.3013101446407485 and parameters: {'n_estimators': 754, 'learning_rate': 0.027310725327456868}. Best is trial 0 with value: 0.300660822366838.[0m
[32m[I 2021-11-16 22:56:38,773][0m Trial 2 finished with value: 0.29386621821893455 and parameters: {'n_estimators': 650, 'learning_rate': 0.05553081486523326}. Best is trial 2 with value: 0.29386621821893455.[0m
[32m[I 2021-11-16 22:57:20,230][0m Trial 3 finished with value: 0.31431176076257117 and parameters: {'n_estimators': 613, 'learning_rate': 0.011304113145412905}. Best is trial 2 

FrozenTrial(number=2, values=[0.29386621821893455], datetime_start=datetime.datetime(2021, 11, 16, 22, 56, 8, 342857), datetime_complete=datetime.datetime(2021, 11, 16, 22, 56, 38, 773483), params={'n_estimators': 650, 'learning_rate': 0.05553081486523326}, distributions={'n_estimators': IntUniformDistribution(high=1100, low=600, step=1), 'learning_rate': UniformDistribution(high=0.06, low=0.01)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=2, state=TrialState.COMPLETE, value=None)
