In [1]:
import pandas as pd
import numpy as np
import _pickle

import matplotlib.pyplot as plt

from sklearn.svm import SVR

from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import mean_absolute_error

from sklearn.impute import SimpleImputer

In [2]:
with open('../regular_data_ready_for_model', 'rb') as f:
    df = _pickle.load(f)
df.head()

Unnamed: 0,rent,latitude,longitude,oda_sayisi,salon_sayisi,brut_m2,net_m2,bina_yasi,kat_sayisi,esyali,...,binned_bulundugu_kat_cat_1,binned_bulundugu_kat_cat_10,binned_bulundugu_kat_cat_11,binned_bulundugu_kat_cat_2,binned_bulundugu_kat_cat_3,binned_bulundugu_kat_cat_4,binned_bulundugu_kat_cat_6,binned_bulundugu_kat_cat_7,binned_bulundugu_kat_cat_8,binned_bulundugu_kat_cat_9
8262,30000,41.043127,28.969064,2,1,100,75,1,8.0,1,...,0,0,1,0,0,0,0,0,0,0
8260,8000,40.979325,28.729905,3,1,145,130,30,5.0,0,...,0,0,1,0,0,0,0,0,0,0
7889,10000,41.057928,28.974291,3,1,110,100,15,5.0,0,...,0,0,1,0,0,0,0,0,0,0
7919,65000,41.152676,28.924586,3,1,165,121,0,3.0,0,...,0,0,0,0,0,1,0,0,0,0
7923,55000,41.207677,29.020296,6,2,450,430,16,4.0,0,...,0,1,0,0,0,0,0,0,0,0


In [None]:
###################

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10255 entries, 8262 to 10947
Data columns (total 55 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   rent                         10255 non-null  int32  
 1   latitude                     9191 non-null   float64
 2   longitude                    9191 non-null   float64
 3   oda_sayisi                   10255 non-null  object 
 4   salon_sayisi                 10255 non-null  object 
 5   brut_m2                      10255 non-null  int64  
 6   net_m2                       10255 non-null  int64  
 7   bina_yasi                    10255 non-null  int32  
 8   kat_sayisi                   10243 non-null  float64
 9   esyali                       10255 non-null  int32  
 10  banyo_sayisi                 10247 non-null  float64
 11  site_icerisinde_binary       10255 non-null  bool   
 12  dogu                         10255 non-null  bool   
 13  bati         

there is no missing value for the categorical variables

In [4]:
X, y = df.drop('rent', axis=1), df['rent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=10)

impute and scale within cross validation

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

def impute_and_scale(X_train, X_val, y_train, y_val, imputation_strategy='median'):
    numerical_cols = ['latitude', 'longitude', 'oda_sayisi', 'salon_sayisi', 'brut_m2',
               'net_m2', 'bina_yasi', 'kat_sayisi', 'banyo_sayisi',
               'site_icerisinde_binary', 'cephe_sayisi', 'bulundugu_kat_num']
    
    # fit the imputer on the train set and impute it
    imputer = SimpleImputer(missing_values=np.nan, strategy=imputation_strategy)
    train_imputed = pd.DataFrame(imputer.fit_transform(pd.concat([y_train, X_train], axis=1)), 
                                 columns=[y_train.name]+list(X_train.columns))
    
    # resplit the train set
    X_train_imputed, y_train = train_imputed.drop('rent', axis=1), train_imputed['rent']
    
    # fit the scaler on the train set and scale it
    scaler = RobustScaler()
    X_train_num_scaled = pd.DataFrame(scaler.fit_transform(X_train_imputed[numerical_cols]), columns=numerical_cols)
    
    X_train_scaled = X_train_imputed.copy()
    X_train_scaled[numerical_cols] = X_train_num_scaled
    
    # impute the val set
    val_imputed = pd.DataFrame(imputer.transform(pd.concat([y_val, X_val], axis=1)), 
                             columns=[y_val.name]+list(X_val.columns))
    
    # resplit the val set
    X_val_imputed, y_val = val_imputed.drop('rent', axis=1), val_imputed['rent']
    
    # scale the val set
    X_val_num_scaled = pd.DataFrame(scaler.transform(X_val_imputed[numerical_cols]), columns=numerical_cols)
    
    X_val_scaled = X_val_imputed.copy()
    X_val_scaled[numerical_cols] = X_val_num_scaled
    
    return X_train_scaled, X_val_scaled

In [6]:
def cross_validation(model, X, y, seed, n_splits=10, imputation_strategy='median'):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    cv_scores = []

    for train_idx, val_idx in kf.split(X):
        x_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
        x_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
        
        x_tr, x_val = impute_and_scale(x_tr, x_val, y_tr, y_val, imputation_strategy=imputation_strategy)
        
        model.fit(X=x_tr, y=np.power(y_tr, 1/3))
        pred = np.power(model.predict(x_val), 3)
        cv_scores.append(mean_absolute_error(y_val, pred))
        
    return np.mean(cv_scores)

# hyperparameter optimization

In [9]:
import optuna

In [12]:
def objective(trial, X_train, y_train, seed, n_splits):
    C = trial.suggest_loguniform('C', 0.01, 10)
    kernel = trial.suggest_categorical('kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    degree = 0
    gamma = 'auto'
    epsilon = trial.suggest_loguniform('epsilon', 0.01, 1.0)

    if kernel == 'poly':
        degree = trial.suggest_int('degree', 2, 5)
    elif kernel in ['rbf', 'sigmoid']:
        gamma = trial.suggest_loguniform('gamma', 0.001, 1.0)
    
    params = {'C':C, 'kernel':kernel, 'degree':degree, 
              'gamma':gamma, 'epsilon':epsilon}

    imputation = trial.suggest_categorical('imputation_strategy', ['mean', 'median'])
    
    # train and evaluate the model using the hyperparameters
    #model = SVR(C=C, kernel=kernel, degree=degree, gamma=gamma, epsilon=epsilon)
    model = SVR(**params)
    return cross_validation(model, X, y, seed=seed, n_splits=n_splits, imputation_strategy=imputation)

In [13]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

study = optuna.create_study(direction='minimize',sampler=optuna.samplers.TPESampler(30))
study.optimize(lambda trial : objective(trial, X_train=X_train, y_train=y_train, seed=55, n_splits=5), n_trials=50)

[32m[I 2023-02-19 17:12:12,624][0m A new study created in memory with name: no-name-2787b8bd-381a-4152-b5c3-595811413a42[0m
[32m[I 2023-02-19 17:12:45,398][0m Trial 0 finished with value: 8934.026796666873 and parameters: {'C': 0.176629782647449, 'kernel': 'sigmoid', 'epsilon': 0.38665156637375564, 'gamma': 0.007671817645803569, 'imputation_strategy': 'median'}. Best is trial 0 with value: 8934.026796666873.[0m
[32m[I 2023-02-19 17:13:27,777][0m Trial 1 finished with value: 6186.089758852566 and parameters: {'C': 0.6392261956576255, 'kernel': 'rbf', 'epsilon': 0.03441440768737148, 'gamma': 0.015320935809131336, 'imputation_strategy': 'median'}. Best is trial 1 with value: 6186.089758852566.[0m
[32m[I 2023-02-19 17:14:00,174][0m Trial 2 finished with value: 15776.89120097537 and parameters: {'C': 0.2165734703535194, 'kernel': 'sigmoid', 'epsilon': 0.13239636267930094, 'gamma': 0.017221410595336082, 'imputation_strategy': 'median'}. Best is trial 1 with value: 6186.08975885256

In [14]:
study.best_params

{'C': 8.600180329778576,
 'kernel': 'rbf',
 'epsilon': 0.4676682188494261,
 'gamma': 0.1232350551967559,
 'imputation_strategy': 'median'}

In [15]:
trials_df = study.trials_dataframe().sort_values('value', ascending=True)
trials_df.iloc[:15]

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_degree,params_epsilon,params_gamma,params_imputation_strategy,params_kernel,state
14,14,5239.483033,2023-02-19 17:19:19.611794,2023-02-19 17:19:50.623966,0 days 00:00:31.012172,8.60018,,0.467668,0.123235,median,rbf,COMPLETE
10,10,5246.500911,2023-02-19 17:17:38.273946,2023-02-19 17:18:10.215896,0 days 00:00:31.941950,9.326243,,0.514623,0.130573,median,rbf,COMPLETE
32,32,5250.533386,2023-02-19 17:27:21.568564,2023-02-19 17:27:49.696690,0 days 00:00:28.128126,9.816391,,0.636184,0.086207,median,rbf,COMPLETE
42,42,5251.879716,2023-02-19 17:32:02.558151,2023-02-19 17:32:29.202240,0 days 00:00:26.644089,4.720869,,0.502516,0.131957,median,rbf,COMPLETE
41,41,5260.463157,2023-02-19 17:31:39.136919,2023-02-19 17:32:02.557150,0 days 00:00:23.420231,4.85071,,0.754604,0.107023,median,rbf,COMPLETE
30,30,5261.82554,2023-02-19 17:26:16.701854,2023-02-19 17:26:50.400985,0 days 00:00:33.699131,9.518667,,0.405332,0.147883,median,rbf,COMPLETE
11,11,5262.403052,2023-02-19 17:18:10.216863,2023-02-19 17:18:32.244634,0 days 00:00:22.027771,5.425254,,0.938651,0.102707,median,rbf,COMPLETE
48,48,5262.452918,2023-02-19 17:35:07.556409,2023-02-19 17:35:31.902921,0 days 00:00:24.346512,7.155133,,0.830617,0.085446,median,rbf,COMPLETE
31,31,5264.315894,2023-02-19 17:26:50.400985,2023-02-19 17:27:21.567564,0 days 00:00:31.166579,7.151167,,0.393141,0.16024,median,rbf,COMPLETE
40,40,5268.036044,2023-02-19 17:31:04.656857,2023-02-19 17:31:39.135953,0 days 00:00:34.479096,9.9204,,0.095937,0.075902,median,rbf,COMPLETE


In [18]:
X_train_preprocessed, X_test_preprocessed = impute_and_scale(X_train, X_test, y_train, y_test, imputation_strategy='median')

In [23]:
model_params={'C': 8.600180329778576,
             'kernel': 'rbf',
             'epsilon': 0.4676682188494261,
             'gamma': 0.1232350551967559}

model = SVR(**model_params)
model.fit(X_train_preprocessed, np.power(y_train, 1/3))

SVR(C=8.600180329778576, epsilon=0.4676682188494261, gamma=0.1232350551967559)

In [24]:
preds = np.power(model.predict(X_test_preprocessed), 3)
mean_absolute_error(y_test, preds)   

5056.099147406984

In [25]:
# with open('svr_trained', 'wb') as f:
#     _pickle.dump(model, f)