In [1]:
import pandas as pd
import numpy as np
import _pickle

In [2]:
with open('../regular_data_ready_for_model', 'rb') as f:
    df = _pickle.load(f)
df.head()

Unnamed: 0,rent,latitude,longitude,oda_sayisi,salon_sayisi,brut_m2,net_m2,bina_yasi,kat_sayisi,esyali,...,binned_bulundugu_kat_cat_1,binned_bulundugu_kat_cat_10,binned_bulundugu_kat_cat_11,binned_bulundugu_kat_cat_2,binned_bulundugu_kat_cat_3,binned_bulundugu_kat_cat_4,binned_bulundugu_kat_cat_6,binned_bulundugu_kat_cat_7,binned_bulundugu_kat_cat_8,binned_bulundugu_kat_cat_9
8262,30000,41.043127,28.969064,2,1,100,75,1,8.0,1,...,0,0,1,0,0,0,0,0,0,0
8260,8000,40.979325,28.729905,3,1,145,130,30,5.0,0,...,0,0,1,0,0,0,0,0,0,0
7889,10000,41.057928,28.974291,3,1,110,100,15,5.0,0,...,0,0,1,0,0,0,0,0,0,0
7919,65000,41.152676,28.924586,3,1,165,121,0,3.0,0,...,0,0,0,0,0,1,0,0,0,0
7923,55000,41.207677,29.020296,6,2,450,430,16,4.0,0,...,0,1,0,0,0,0,0,0,0,0


In [3]:
categorical_cols = [col for col in df.columns if df[col].dtype == 'uint8']
df[categorical_cols] = df[categorical_cols].astype('int')

df['oda_sayisi'] = pd.to_numeric(df['oda_sayisi'], errors='coerce')
df['salon_sayisi'] = pd.to_numeric(df['salon_sayisi'], errors='coerce')

In [4]:
df.fillna(-10000, inplace=True)

In [5]:
from sklearn.model_selection import train_test_split, cross_val_score

In [6]:
X, y = df.drop('rent', axis=1), df['rent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)

# hyperparameter optimization

In [11]:
from sklearn.ensemble import RandomForestRegressor
import optuna

In [9]:
def objective(trial: optuna.Trial, X_train, y_train) -> float:
    
    params = {
    'bootstrap': True,
    'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
    'max_depth': trial.suggest_int('max_depth', 5, 50),
    'max_features': trial.suggest_int('max_features', 1, 10),
    'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
    'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
    }
    
    model = RandomForestRegressor(**params)
    
    return -cross_val_score(model, X_train, y_train, scoring='neg_mean_absolute_error', cv=5, n_jobs=-1).mean()


In [12]:
study = optuna.create_study(direction='minimize',sampler=optuna.samplers.TPESampler())
study.optimize(lambda trial : objective(trial, X_train, y_train), n_trials=100)

[32m[I 2023-02-19 17:04:35,242][0m A new study created in memory with name: no-name-2fad29ef-af94-4d0b-9a89-4650e08beeb3[0m
[32m[I 2023-02-19 17:04:41,243][0m Trial 0 finished with value: 6144.776942859321 and parameters: {'n_estimators': 462, 'max_depth': 29, 'max_features': 3, 'min_samples_leaf': 1, 'min_samples_split': 8}. Best is trial 0 with value: 6144.776942859321.[0m
[32m[I 2023-02-19 17:04:44,387][0m Trial 1 finished with value: 5947.147876523057 and parameters: {'n_estimators': 323, 'max_depth': 16, 'max_features': 7, 'min_samples_leaf': 7, 'min_samples_split': 2}. Best is trial 1 with value: 5947.147876523057.[0m
[32m[I 2023-02-19 17:04:46,125][0m Trial 2 finished with value: 5789.47467947129 and parameters: {'n_estimators': 104, 'max_depth': 47, 'max_features': 9, 'min_samples_leaf': 8, 'min_samples_split': 13}. Best is trial 2 with value: 5789.47467947129.[0m
[32m[I 2023-02-19 17:04:46,960][0m Trial 3 finished with value: 6677.951895280939 and parameters: {'n

In [13]:
trials_df = study.trials_dataframe().sort_values('value', ascending=True)
trials_df.iloc[:15]

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_max_depth,params_max_features,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
55,55,5108.647209,2023-02-19 17:09:59.327300,2023-02-19 17:10:10.231696,0 days 00:00:10.904396,34,10,1,2,610,COMPLETE
51,51,5109.606018,2023-02-19 17:09:27.417550,2023-02-19 17:09:37.807466,0 days 00:00:10.389916,37,10,1,2,569,COMPLETE
72,72,5109.671318,2023-02-19 17:11:47.984612,2023-02-19 17:11:56.096357,0 days 00:00:08.111745,34,10,1,2,451,COMPLETE
99,99,5113.261676,2023-02-19 17:15:44.221300,2023-02-19 17:15:56.929819,0 days 00:00:12.708519,37,10,1,2,626,COMPLETE
52,52,5113.600135,2023-02-19 17:09:37.809466,2023-02-19 17:09:45.398058,0 days 00:00:07.588592,37,10,1,2,433,COMPLETE
48,48,5117.072301,2023-02-19 17:09:07.429194,2023-02-19 17:09:16.873158,0 days 00:00:09.443964,30,10,1,2,457,COMPLETE
93,93,5117.312106,2023-02-19 17:14:48.666570,2023-02-19 17:14:59.333237,0 days 00:00:10.666667,34,10,1,2,541,COMPLETE
86,86,5117.3455,2023-02-19 17:13:51.505200,2023-02-19 17:14:03.973399,0 days 00:00:12.468199,33,10,1,2,626,COMPLETE
91,91,5118.482395,2023-02-19 17:14:27.846002,2023-02-19 17:14:39.274720,0 days 00:00:11.428718,32,10,1,2,583,COMPLETE
73,73,5118.497662,2023-02-19 17:11:56.099358,2023-02-19 17:12:05.466176,0 days 00:00:09.366818,34,10,1,2,518,COMPLETE


In [14]:
study.best_params

{'n_estimators': 610,
 'max_depth': 34,
 'max_features': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [16]:
params = study.best_params

model = RandomForestRegressor(**params)
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=34, max_features=10, n_estimators=610)

In [17]:
from sklearn.metrics import mean_absolute_error

In [18]:
preds = model.predict(X_test)
mean_absolute_error(y_test, preds)

4800.4081588166955

In [19]:
# with open('rf_trained', 'wb') as f:
#     _pickle.dump(model, f)