# Presets

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (
    RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor, GradientBoostingRegressor
)
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn_genetic.space import Continuous, Categorical, Integer
from sklearn_genetic import GASearchCV
from sklearn.model_selection import cross_val_predict
#import pymc as pm
#import pymc_bart as pmb

In [2]:
# Load the cleaned dataset that every later cell derives its data from.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df = pd.read_csv(filepath_or_buffer='C:/Projects/WarsawFlatPrices/data/clean/dataset_final.csv')

# Convert categorical (string) columns to 0/1 dummy variables

In [3]:
# One-hot encode every object/category column, dropping the dummy of the most
# frequent level of each column so that level acts as the reference category.

# Drop the row identifier by reassignment rather than in place.
df = df.drop(columns=['id'])

dtypes_str = ['object', 'category']
vars_subset = df.columns.tolist()
vars_str = df[vars_subset].select_dtypes(include=dtypes_str).columns

# `value_counts()` sorts by frequency (descending), so index[0] is the most
# frequent level. The f-string mirrors how `get_dummies` names its columns
# (`<prefix>_<level>`) and also works when a level is not a `str`, where plain
# `+` concatenation would raise TypeError.
vars_to_drop = [
    f'{col}_{df[col].value_counts().index[0]}'
    for col in vars_str
]

df = pd.get_dummies(df, columns=vars_str, prefix=vars_str)
df = df.drop(columns=vars_to_drop)

In [4]:
# Persist the encoded frame into the wappapp data directory.
# The row index is written as well (pandas default, spelled out here).
df.to_csv(path_or_buf='C:/Projects/wappapp/data/df_lgbm_to_train.csv', index=True)

# Grids

In [5]:
# Genetic-search space for the random-forest regressor (suffix `rf`).
model_grid_ga_rf = dict(
    n_estimators=Integer(100, 1000),
    max_depth=Integer(10, 80),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(2, 10),
    # A float `max_features` is a fraction of the available features.
    max_features=Continuous(0.02, 0.4, distribution='uniform'),
)

In [6]:
# Genetic-search space for the extra-trees regressor (suffix `et`);
# same bounds as the random-forest space.
model_grid_ga_et = dict(
    n_estimators=Integer(100, 1000),
    max_depth=Integer(10, 80),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(2, 10),
    # A float `max_features` is a fraction of the available features.
    max_features=Continuous(0.02, 0.4, distribution='uniform'),
)

In [7]:
# Genetic-search space for the AdaBoost regressor (suffix `ad`).
model_grid_ga_ad = dict(
    n_estimators=Integer(500, 1000),
    learning_rate=Continuous(0.01, 0.60, distribution='uniform'),
)

In [8]:
# Genetic-search space for the gradient-boosting regressor (suffix `gb`).
model_grid_ga_gb = dict(
    learning_rate=Continuous(0.01, 0.60, distribution='uniform'),
    n_estimators=Integer(50, 1000),
)

In [9]:
# Genetic-search space for the XGBoost regressor (suffix `xgb`).
model_grid_ga_xgb = dict(
    n_estimators=Integer(50, 1000),
    learning_rate=Continuous(0.01, 0.60, distribution='uniform'),
)

In [10]:
# Genetic-search space for the LightGBM regressor (suffix `lgbm`).
model_grid_ga_lgbm = dict(
    n_estimators=Integer(50, 1000),
    learning_rate=Continuous(0.01, 0.60, distribution='uniform'),
)

In [11]:
# Genetic-search space for the CatBoost regressor (suffix `cat`);
# CatBoost calls its number of boosting rounds `iterations`.
model_grid_ga_cat = dict(
    iterations=Integer(50, 1000),
    learning_rate=Continuous(0.01, 0.60, distribution='uniform'),
)

In [12]:
# Genetic-search space for the MLP regressor (suffix `mlp`).
# `hidden_layer_sizes` is sampled as a single integer, i.e. one hidden layer.
model_grid_ga_mlp = dict(
    hidden_layer_sizes=Integer(100, 1000),
    max_iter=Integer(200, 1000),
)

# Model selection

In [14]:
# Write the encoded frame a second time, under the name `flats.csv`.
# The row index is written as well (pandas default, spelled out here).
df.to_csv(path_or_buf='C:/Projects/wappapp/data/flats.csv', index=True)

In [None]:
# Split a copy of the encoded frame into features (X) and target (y);
# `df` itself is left untouched.
X = df.copy()
y = X.pop('price_per_m')  # removes the target column from X and returns it

In [None]:
# Choose the search space and the estimator it belongs to; the two must match.
model_grid = model_grid_ga_cat
model = CatBoostRegressor(train_dir='C:/Projects/WarsawFlatPrices/catboost/')
# Alternative estimators kept for reference (switch `model_grid` together with `model`):
#   MLPRegressor(learning_rate='adaptive')
#   LGBMRegressor()
#   GradientBoostingRegressor()
#   AdaBoostRegressor()
#   ExtraTreesRegressor(bootstrap=True)
#   RandomForestRegressor()

In [None]:
# NOTE(review): `a` is not defined anywhere in the visible notebook, so evaluating
# it raises NameError. Presumably a deliberate stop so that "Run All" halts before
# the expensive search in the next cell — confirm, then remove it or replace it
# with an explicit guard.
a

In [None]:
 model_grid_search_cv = GASearchCV(
    estimator=model,
    cv=10,
    scoring='neg_root_mean_squared_error',
    population_size=20,
    generations=1,
    tournament_size=10,
    elitism=True,
    crossover_probability=0.8,
    mutation_probability=0.1,
    param_grid=model_grid,
    #criteria='max',
    algorithm='eaMuPlusLambda',
    n_jobs=-1,
    verbose=True,
    keep_top_k=3
    ).fit(X, y)

In [None]:
# The search is scored with 'neg_root_mean_squared_error', so `best_score_` is a
# negated RMSE (closer to 0 is better), not an accuracy.
print("Best CV score (neg. RMSE):", model_grid_search_cv.best_score_, "\n")
print("Best params", model_grid_search_cv.best_params_, "\n")

# Best hyperparameters

In [None]:
# One estimator per model family, configured with fixed hyperparameter values.
# NOTE(review): no estimator sets a random seed, so refits are presumably not
# exactly reproducible — confirm whether that matters for the saved predictions.

# --- Bagged tree ensembles ---
best_rf = RandomForestRegressor(
    max_features=0.3492,
    min_samples_leaf=2,
    min_samples_split=4,
    max_depth=40,
    n_estimators=720,
)

best_ef = ExtraTreesRegressor(
    bootstrap=True,
    max_features=0.3549,
    min_samples_leaf=2,
    min_samples_split=8,
    max_depth=50,
    n_estimators=299,
)

# --- Boosted ensembles ---
best_ad = AdaBoostRegressor(learning_rate=0.07430, n_estimators=650)

best_gd = GradientBoostingRegressor(n_estimators=948, learning_rate=0.10016)

best_xgb = XGBRegressor(learning_rate=0.12283, n_estimators=592)

best_lgbm = LGBMRegressor(learning_rate=0.07589, n_estimators=286)

best_cat = CatBoostRegressor(
    train_dir='C:/Projects/WarsawFlatPrices/catboost/',
    learning_rate=0.12395,
    iterations=978,
)

# --- Neural network ---
# NOTE(review): scikit-learn documents `learning_rate` as only used with
# solver='sgd'; the solver is left at its default here — confirm the intent.
best_mlp = MLPRegressor(
    learning_rate='adaptive',
    max_iter=703,
    hidden_layer_sizes=938,
)

# Testing model

In [None]:
# Fit the LightGBM estimator on the full feature matrix and target.
m_lgbm = best_lgbm.fit(X=X, y=y)

In [None]:
# Build the feature row(s) for a hand-picked test listing.
# NOTE(review): this path is relative to the working directory, unlike the
# absolute paths used elsewhere in the notebook — confirm where it is run from.
XX = pd.read_csv('data/clean/dataset_testing.csv')

# Dummy columns are filled in by hand so that the frame carries every column the
# training matrix `X` has; each constant is broadcast to all rows.
XX = XX.assign(**{
    # number of rooms
    'rooms_num_1': 0,
    'rooms_num_3': 1,
    'rooms_num_4': 0,
    'rooms_num_4+': 0,
    # district (all zero here)
    'region_Bemowo': 0,
    'region_Białołęka': 0,
    'region_Bielany': 0,
    'region_Mokotów': 0,
    'region_Ochota': 0,
    'region_Praga-Południe': 0,
    'region_Praga-Północ': 0,
    'region_Rembertów': 0,
    'region_Targówek': 0,
    'region_Ursus': 0,
    'region_Ursynów': 0,
    'region_Wawer': 0,
    'region_Wesoła': 0,
    'region_Wilanów': 0,
    'region_Włochy': 0,
    'region_Śródmieście': 0,
    'region_Żoliborz': 0,
    # building type
    'building_type_apartment': 1,
    'building_type_block': 0,
    # window type
    'windows_type_aluminium': 0,
    'windows_type_plastic': 1,
    'windows_type_wooden': 0,
    # floor of the flat (floor_no_3 / floor_no_5 carried alternative values 1 / 0)
    'floor_no_0': 0,
    'floor_no_2': 0,
    'floor_no_3': 0,
    'floor_no_4': 0,
    'floor_no_5': 1,
    'floor_no_6': 0,
    'floor_no_7': 0,
    'floor_no_8': 0,
    'floor_no_9': 0,
    'floor_no_9+': 0,
    'floor_no_unknown': 0,
    # number of floors in the building
    'building_floors_num_1': 0,
    'building_floors_num_2': 0,
    'building_floors_num_3': 0,
    'building_floors_num_5': 0,
    'building_floors_num_6': 0,
    'building_floors_num_7': 0,
    'building_floors_num_8': 1,
    'building_floors_num_9': 0,
    'building_floors_num_9+': 0,
    'building_floors_num_unknown': 0,
})

# Remove the target and identifier, then put the columns in the training order.
XX = XX.drop(columns=['price_per_m', 'id'])
XX = XX[X.columns]

In [None]:
#XX['is_market_primary']

In [None]:
#https://www.money.pl/gospodarka/ceny-mieszkan-wystrzelily-najnowsze-dane-6936938860427904a.html#:~:text=Cena%20za%20metr%20kwadratowy%20w%20tym%20mie%C5%9Bcie%20skoczy%C5%82a%20do%2014%20805%20z%C5%82.&text=Najwy%C5%BCsze%20stawki%20nadal%20charakteryzuj%C4%85%20Warszaw%C4%99,sierpniu%20ju%C5%BC%2015%20394%20z%C5%82.

In [None]:
#https://www.otodom.pl/pl/oferta/mieszkanie-m3-ul-jana-kazimierza-wola-bezposrednio-ID4mNn4.html?_ga=2.107833158.598607718.1694011103-283185793.1694011103&_gac=1.115792244.1694011104.Cj0KCQjwxuCnBhDLARIsAB-cq1oUWMcMSGJigbd2BeDhmpMB2aOCeXP6b0PXPHbrM47l4yGEusEoXboaAge-EALw_wcB&_gl=1*1mnei2q*_ga*MjgzMTg1NzkzLjE2OTQwMTExMDM.*_ga_6PZTQNYS5C*MTY5NDAxMTEwNC4xLjAuMTY5NDAxMTEwNC4wLjAuMA..

In [None]:
# Exponentiate the model output, multiply by 66 and round to a whole number;
# show the value for the first row.
# NOTE(review): presumably the target is log(price per m²) and 66 is the flat's
# area in m² — confirm.
np.round(66 * np.exp(m_lgbm.predict(XX)), decimals=0)[0]

In [None]:
# Same prediction as in the previous cell, additionally scaled by 1.052 and
# compared with the listing's asking price.
# NOTE(review): the meaning of the 1.052 factor is not stated in the notebook — confirm.
pred_t = 1.052 * np.round(66 * np.exp(m_lgbm.predict(XX)), decimals=0)[0]
print('expected price:', pred_t)
print('real price:', 1180000)
print('difference:', pred_t - 1180000)
# 15-16k more

In [None]:
#1154524

# Variable importance

In [None]:
# Fit every tuned estimator (except the MLP) on the full data, in the same order
# as before; each `fit` call returns the estimator it was called on.
m_rf, m_ef, m_ad, m_gd, m_xgb, m_lgbm, m_cat = [
    estimator.fit(X, y)
    for estimator in (best_rf, best_ef, best_ad, best_gd, best_xgb, best_lgbm, best_cat)
]

In [None]:
# Feature importances of the fitted LightGBM model, rescaled and filtered.
feature_names = X.columns
importances = m_lgbm.feature_importances_
importances_df = pd.DataFrame({'feature_names': feature_names, 'importances': importances})

# Rescale by (number of boosting rounds * 4). The round count is read from the
# model instead of repeating the literal 286, so it cannot drift from the
# configuration of `best_lgbm`.
# NOTE(review): the factor 4 is not explained in the notebook — confirm its meaning.
importances_df['importances'] = importances_df['importances'] / (m_lgbm.n_estimators * 4)

# Show only features above the 0.1 threshold (0.01 was the alternative), largest first.
importances_df.loc[importances_df['importances'] > 0.1].sort_values(
    by=['importances'], ascending=False
)

# Predicting using CV

In [None]:
# Out-of-fold predictions of the random forest (10-fold CV).
y_pred_rf = cross_val_predict(estimator=best_rf, X=X, y=y, cv=10)

In [None]:
# Out-of-fold predictions of the extra-trees model (10-fold CV).
y_pred_ef = cross_val_predict(estimator=best_ef, X=X, y=y, cv=10)

In [None]:
# Out-of-fold predictions of the AdaBoost model (10-fold CV).
y_pred_ad = cross_val_predict(estimator=best_ad, X=X, y=y, cv=10)

In [None]:
# Out-of-fold predictions of the gradient-boosting model (10-fold CV).
y_pred_gd = cross_val_predict(estimator=best_gd, X=X, y=y, cv=10)

In [None]:
# Out-of-fold predictions of the XGBoost model (10-fold CV).
y_pred_xgb = cross_val_predict(estimator=best_xgb, X=X, y=y, cv=10)

In [None]:
# Out-of-fold predictions of the LightGBM model (10-fold CV).
y_pred_lgbm = cross_val_predict(estimator=best_lgbm, X=X, y=y, cv=10)

In [None]:
# Out-of-fold predictions of the CatBoost model (10-fold CV).
y_pred_cat = cross_val_predict(estimator=best_cat, X=X, y=y, cv=10)

In [None]:
# Out-of-fold predictions of the MLP model (10-fold CV).
y_pred_mlp = cross_val_predict(estimator=best_mlp, X=X, y=y, cv=10)

In [None]:
# Collect the cross-validated predictions of all models, keyed by output column name.
df_res = dict(
    pred_rf=y_pred_rf,
    pred_ef=y_pred_ef,
    pred_ad=y_pred_ad,
    pred_gd=y_pred_gd,
    pred_xgb=y_pred_xgb,
    pred_lgbm=y_pred_lgbm,
    pred_cat=y_pred_cat,
    pred_mlp=y_pred_mlp,
)

In [None]:
# Turn the mapping of prediction arrays into a frame, one column per model.
# The name `df_res` is rebound from the mapping to the resulting DataFrame.
df_res = pd.DataFrame(data=df_res)

In [None]:
# Save the per-model predictions; the row index is written as well
# (pandas default, spelled out here).
df_res.to_csv(path_or_buf='C:/Projects/WarsawFlatPrices/data/clean/ml_pred.csv', index=True)