# Presets

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
)
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn_genetic.space import Continuous, Categorical, Integer
from sklearn_genetic import GASearchCV
from sklearn.model_selection import cross_val_predict
#import pymc as pm
#import pymc_bart as pmb

In [2]:
df = pd.read_csv('C:/Projects/WarsawFlatPrices/data/clean/dataset_final.csv')

# Convert characters to 0-1

In [3]:
df.drop(columns=['id'], inplace=True)
dtypes_str = ['object', 'category']
vars_subset = df.columns.tolist()
vars_str = df.loc[:, vars_subset].select_dtypes(include=dtypes_str).columns
vars_to_drop = []
for i in vars_str:
    vars_to_drop.append(i+'_'+df[i].value_counts().index[0])
df = pd.get_dummies(df, columns=vars_str, prefix=vars_str)
df = df.drop(columns=vars_to_drop)

In [4]:
df.columns.tolist()

['price_per_m',
 'is_kitchen_separate',
 'is_closed_territory',
 'is_domophone',
 'is_security',
 'is_furniture',
 'is_air_cond',
 'is_balcony',
 'is_basement',
 'is_garage',
 'is_elevator',
 'is_phone',
 'is_security_windows',
 'is_terrace',
 'is_utility_room',
 'is_alarm',
 'is_garden',
 'is_remote_service',
 'is_first_time',
 'x',
 'y',
 'dist_airport',
 'dist_attraction',
 'dist_bank',
 'dist_bar',
 'dist_beauty_shop',
 'dist_bike_parking',
 'dist_bike_rent',
 'dist_bus_station',
 'dist_car_service',
 'dist_college_university',
 'dist_construction',
 'dist_cultural',
 'dist_dormitory',
 'dist_fast_food',
 'dist_food_shop',
 'dist_healthcare_institution',
 'dist_jeweller',
 'dist_office',
 'dist_park',
 'dist_pharmacy',
 'dist_prison',
 'dist_public_institution',
 'dist_public_service',
 'dist_renthouse',
 'dist_restaurant',
 'dist_school_kindergarden',
 'dist_service',
 'dist_shop',
 'dist_shopping_mall',
 'dist_sport_object',
 'dist_subway_entrance',
 'dist_temple_catholic',
 'dis

# Grids

In [5]:
model_grid_ga_rf = {
    'n_estimators': Integer(100, 1000),
    'max_depth': Integer(10, 80),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(2, 10),
    'max_features': Continuous(0.02, 0.4, distribution='uniform')
}

In [6]:
model_grid_ga_et = {
    'n_estimators': Integer(100, 1000),
    'max_depth': Integer(10, 80),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(2, 10),
    'max_features': Continuous(0.02, 0.4, distribution='uniform')
}

In [7]:
model_grid_ga_ad = {
    'n_estimators': Integer(500, 1000),
    'learning_rate': Continuous(0.01, 0.60, distribution='uniform')
}

In [8]:
model_grid_ga_gb = {
    'learning_rate': Continuous(0.01, 0.60, distribution='uniform'),
    'n_estimators': Integer(50, 1000)
}

In [9]:
model_grid_ga_xgb = {
    'n_estimators': Integer(50, 1000),
    'learning_rate': Continuous(0.01, 0.60, distribution='uniform')
}

In [10]:
model_grid_ga_lgbm = {
    'n_estimators': Integer(50, 1000),
    'learning_rate': Continuous(0.01, 0.60, distribution='uniform')
}

In [11]:
model_grid_ga_cat = {
    'iterations': Integer(50, 1000),
    'learning_rate': Continuous(0.01, 0.60, distribution='uniform')
}

In [12]:
model_grid_ga_mlp = {
    'hidden_layer_sizes': Integer(100, 1000),
    'max_iter': Integer(200, 1000)
}

# Model selection

In [13]:
X = df.copy()
y = X['price_per_m']
X.drop(columns=['price_per_m'], inplace=True)

In [14]:
model = CatBoostRegressor(train_dir='C:/Projects/WarsawFlatPrices/catboost/')
#MLPRegressor(learning_rate='adaptive')
#LGBMRegressor()
#GradientBoostingRegressor()
#AdaBoostRegressor()
#ExtraTreesRegressor(bootstrap=True)
#RandomForestRegressor()
model_grid = model_grid_ga_cat

In [15]:
a

NameError: name 'a' is not defined

In [None]:
 model_grid_search_cv = GASearchCV(
    estimator=model,
    cv=10,
    scoring='neg_root_mean_squared_error',
    population_size=20,
    generations=1,
    tournament_size=10,
    elitism=True,
    crossover_probability=0.8,
    mutation_probability=0.1,
    param_grid=model_grid,
    #criteria='max',
    algorithm='eaMuPlusLambda',
    n_jobs=-1,
    verbose=True,
    keep_top_k=3
    ).fit(X, y)

In [None]:
print("Accuracy:", model_grid_search_cv.best_score_, "\n")
print("Best params", model_grid_search_cv.best_params_, "\n")

# Best grids

In [51]:
best_rf = RandomForestRegressor(
    n_estimators=720,
    max_depth=40,
    min_samples_split=4,
    min_samples_leaf=2,
    max_features=0.3492
)

best_ef = ExtraTreesRegressor(
    n_estimators=299,
    max_depth=50,
    min_samples_split=8,
    min_samples_leaf=2,
    max_features=0.3549,
    bootstrap=True
)
best_ad = AdaBoostRegressor(
    n_estimators=650,
    learning_rate=0.07430
)

best_gd = GradientBoostingRegressor(
    learning_rate=0.10016,
    n_estimators=948
)

best_xgb = XGBRegressor(
    n_estimators=592,
    learning_rate=0.12283
)

best_lgbm = LGBMRegressor(
    n_estimators=286,
    learning_rate=0.07589
)
best_cat = CatBoostRegressor(
    iterations=978,
    learning_rate=0.12395,
    train_dir='C:/Projects/WarsawFlatPrices/catboost/'
)
best_mlp = MLPRegressor(
    hidden_layer_sizes=938,
    max_iter=703,
    learning_rate='adaptive'
)

# Variable importance

In [17]:
m_rf = best_rf.fit(X, y)
m_ef = best_ef.fit(X, y)
m_ad = best_ad.fit(X, y)
m_gd = best_gd.fit(X, y)
m_xgb = best_xgb.fit(X, y)
m_lgbm = best_lgbm.fit(X, y)
m_cat = best_cat.fit(X, y)

0:	learn: 0.3217569	total: 142ms	remaining: 2m 18s
1:	learn: 0.3041208	total: 147ms	remaining: 1m 11s
2:	learn: 0.2896476	total: 153ms	remaining: 49.6s
3:	learn: 0.2770820	total: 158ms	remaining: 38.4s
4:	learn: 0.2660689	total: 163ms	remaining: 31.8s
5:	learn: 0.2580065	total: 169ms	remaining: 27.4s
6:	learn: 0.2501666	total: 176ms	remaining: 24.4s
7:	learn: 0.2431645	total: 182ms	remaining: 22.1s
8:	learn: 0.2374188	total: 188ms	remaining: 20.2s
9:	learn: 0.2324358	total: 195ms	remaining: 18.9s
10:	learn: 0.2282211	total: 202ms	remaining: 17.7s
11:	learn: 0.2238840	total: 208ms	remaining: 16.8s
12:	learn: 0.2199477	total: 214ms	remaining: 15.9s
13:	learn: 0.2172580	total: 221ms	remaining: 15.2s
14:	learn: 0.2146110	total: 227ms	remaining: 14.5s
15:	learn: 0.2119216	total: 233ms	remaining: 14s
16:	learn: 0.2094355	total: 239ms	remaining: 13.5s
17:	learn: 0.2076372	total: 245ms	remaining: 13.1s
18:	learn: 0.2056387	total: 251ms	remaining: 12.6s
19:	learn: 0.2038253	total: 256ms	remaini

161:	learn: 0.1416670	total: 1.05s	remaining: 5.27s
162:	learn: 0.1414637	total: 1.05s	remaining: 5.26s
163:	learn: 0.1412362	total: 1.06s	remaining: 5.25s
164:	learn: 0.1409346	total: 1.06s	remaining: 5.24s
165:	learn: 0.1406103	total: 1.07s	remaining: 5.23s
166:	learn: 0.1403606	total: 1.07s	remaining: 5.22s
167:	learn: 0.1400365	total: 1.08s	remaining: 5.21s
168:	learn: 0.1397414	total: 1.09s	remaining: 5.2s
169:	learn: 0.1394851	total: 1.09s	remaining: 5.19s
170:	learn: 0.1392586	total: 1.1s	remaining: 5.18s
171:	learn: 0.1391080	total: 1.1s	remaining: 5.17s
172:	learn: 0.1389443	total: 1.11s	remaining: 5.16s
173:	learn: 0.1387344	total: 1.11s	remaining: 5.14s
174:	learn: 0.1387278	total: 1.12s	remaining: 5.13s
175:	learn: 0.1384698	total: 1.12s	remaining: 5.12s
176:	learn: 0.1384637	total: 1.13s	remaining: 5.11s
177:	learn: 0.1382201	total: 1.13s	remaining: 5.1s
178:	learn: 0.1380675	total: 1.14s	remaining: 5.09s
179:	learn: 0.1378368	total: 1.14s	remaining: 5.08s
180:	learn: 0.13

326:	learn: 0.1160001	total: 1.95s	remaining: 3.89s
327:	learn: 0.1159090	total: 1.96s	remaining: 3.88s
328:	learn: 0.1157617	total: 1.97s	remaining: 3.88s
329:	learn: 0.1156415	total: 1.97s	remaining: 3.87s
330:	learn: 0.1155111	total: 1.98s	remaining: 3.87s
331:	learn: 0.1153851	total: 1.98s	remaining: 3.86s
332:	learn: 0.1153446	total: 1.99s	remaining: 3.85s
333:	learn: 0.1152008	total: 1.99s	remaining: 3.85s
334:	learn: 0.1150338	total: 2s	remaining: 3.84s
335:	learn: 0.1148897	total: 2.01s	remaining: 3.83s
336:	learn: 0.1147688	total: 2.01s	remaining: 3.83s
337:	learn: 0.1146495	total: 2.02s	remaining: 3.82s
338:	learn: 0.1145366	total: 2.02s	remaining: 3.81s
339:	learn: 0.1144522	total: 2.03s	remaining: 3.81s
340:	learn: 0.1144494	total: 2.03s	remaining: 3.8s
341:	learn: 0.1143644	total: 2.04s	remaining: 3.79s
342:	learn: 0.1142752	total: 2.04s	remaining: 3.78s
343:	learn: 0.1141851	total: 2.05s	remaining: 3.78s
344:	learn: 0.1140670	total: 2.05s	remaining: 3.77s
345:	learn: 0.11

490:	learn: 0.0998380	total: 2.85s	remaining: 2.83s
491:	learn: 0.0998358	total: 2.86s	remaining: 2.83s
492:	learn: 0.0997445	total: 2.87s	remaining: 2.82s
493:	learn: 0.0997430	total: 2.87s	remaining: 2.81s
494:	learn: 0.0997131	total: 2.88s	remaining: 2.81s
495:	learn: 0.0996330	total: 2.88s	remaining: 2.8s
496:	learn: 0.0995570	total: 2.89s	remaining: 2.79s
497:	learn: 0.0995057	total: 2.89s	remaining: 2.79s
498:	learn: 0.0994416	total: 2.9s	remaining: 2.78s
499:	learn: 0.0993767	total: 2.9s	remaining: 2.77s
500:	learn: 0.0993040	total: 2.91s	remaining: 2.77s
501:	learn: 0.0992560	total: 2.91s	remaining: 2.76s
502:	learn: 0.0991782	total: 2.92s	remaining: 2.76s
503:	learn: 0.0990584	total: 2.92s	remaining: 2.75s
504:	learn: 0.0989966	total: 2.93s	remaining: 2.75s
505:	learn: 0.0989275	total: 2.94s	remaining: 2.74s
506:	learn: 0.0988327	total: 2.94s	remaining: 2.73s
507:	learn: 0.0987832	total: 2.95s	remaining: 2.73s
508:	learn: 0.0986787	total: 2.95s	remaining: 2.72s
509:	learn: 0.0

656:	learn: 0.0884088	total: 3.76s	remaining: 1.84s
657:	learn: 0.0883397	total: 3.76s	remaining: 1.83s
658:	learn: 0.0882591	total: 3.77s	remaining: 1.82s
659:	learn: 0.0882102	total: 3.77s	remaining: 1.82s
660:	learn: 0.0881374	total: 3.78s	remaining: 1.81s
661:	learn: 0.0880599	total: 3.79s	remaining: 1.81s
662:	learn: 0.0880268	total: 3.79s	remaining: 1.8s
663:	learn: 0.0879865	total: 3.8s	remaining: 1.79s
664:	learn: 0.0879536	total: 3.8s	remaining: 1.79s
665:	learn: 0.0878652	total: 3.81s	remaining: 1.78s
666:	learn: 0.0878334	total: 3.81s	remaining: 1.78s
667:	learn: 0.0877547	total: 3.82s	remaining: 1.77s
668:	learn: 0.0876638	total: 3.82s	remaining: 1.77s
669:	learn: 0.0876127	total: 3.83s	remaining: 1.76s
670:	learn: 0.0875384	total: 3.83s	remaining: 1.75s
671:	learn: 0.0874852	total: 3.84s	remaining: 1.75s
672:	learn: 0.0874003	total: 3.85s	remaining: 1.74s
673:	learn: 0.0873430	total: 3.85s	remaining: 1.74s
674:	learn: 0.0872338	total: 3.86s	remaining: 1.73s
675:	learn: 0.0

818:	learn: 0.0789709	total: 4.66s	remaining: 905ms
819:	learn: 0.0789412	total: 4.67s	remaining: 900ms
820:	learn: 0.0788671	total: 4.67s	remaining: 894ms
821:	learn: 0.0788161	total: 4.68s	remaining: 888ms
822:	learn: 0.0787643	total: 4.68s	remaining: 882ms
823:	learn: 0.0787152	total: 4.69s	remaining: 877ms
824:	learn: 0.0786660	total: 4.7s	remaining: 871ms
825:	learn: 0.0786310	total: 4.7s	remaining: 865ms
826:	learn: 0.0785546	total: 4.71s	remaining: 859ms
827:	learn: 0.0785275	total: 4.71s	remaining: 854ms
828:	learn: 0.0784863	total: 4.72s	remaining: 848ms
829:	learn: 0.0784396	total: 4.73s	remaining: 843ms
830:	learn: 0.0783615	total: 4.73s	remaining: 837ms
831:	learn: 0.0783085	total: 4.74s	remaining: 831ms
832:	learn: 0.0782502	total: 4.74s	remaining: 826ms
833:	learn: 0.0781990	total: 4.75s	remaining: 820ms
834:	learn: 0.0781517	total: 4.75s	remaining: 814ms
835:	learn: 0.0781212	total: 4.76s	remaining: 808ms
836:	learn: 0.0780813	total: 4.76s	remaining: 803ms
837:	learn: 0.

In [64]:
feature_names = X.columns
importances = m_lgbm.feature_importances_
importances_df = pd.DataFrame({'feature_names': feature_names, 'importances':importances})
importances_df['importances'] = importances_df['importances']/(286*4)
importances_df.loc[importances_df['importances']>0.1].sort_values(#0.01
    by=['importances'], ascending=False
)

Unnamed: 0,feature_names,importances
48,dist_shopping_mall,0.156469
40,dist_prison,0.156469
75,dist_river,0.146853
59,dist_road_residential,0.143357
33,dist_fast_food,0.13986
28,dist_car_service,0.138986
61,dist_road_service,0.137238
25,dist_bike_parking,0.121503
32,dist_dormitory,0.121503
76,dist_cbd,0.120629


# Predicting using CV

In [None]:
y_pred_rf = cross_val_predict(best_rf, X, y, cv=10)

In [None]:
y_pred_ef = cross_val_predict(best_ef, X, y, cv=10)

In [None]:
y_pred_ad = cross_val_predict(best_ad, X, y, cv=10)

In [None]:
y_pred_gd = cross_val_predict(best_gd, X, y, cv=10)

In [None]:
y_pred_xgb = cross_val_predict(best_xgb, X, y, cv=10)

In [None]:
y_pred_lgbm = cross_val_predict(best_lgbm, X, y, cv=10)

In [None]:
y_pred_cat = cross_val_predict(best_cat, X, y, cv=10)

In [None]:
y_pred_mlp = cross_val_predict(best_mlp, X, y, cv=10)

In [None]:
df_res = {
    'pred_rf': y_pred_rf,
    'pred_ef': y_pred_ef,
    'pred_ad': y_pred_ad,
    'pred_gd': y_pred_gd,
    'pred_xgb': y_pred_xgb,
    'pred_lgbm': y_pred_lgbm,
    'pred_cat': y_pred_cat,
    'pred_mlp': y_pred_mlp,
}

In [None]:
df_res = pd.DataFrame(df_res)

In [None]:
df_res.to_csv('C:/Projects/WarsawFlatPrices/data/clean/ml_pred.csv')