In [1]:
import pandas as pd
import numpy as np
import _pickle

import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import mean_absolute_error

In [2]:
import warnings
# Silence FutureWarning and UserWarning for every cell that runs after this one.
# NOTE(review): ignoring the whole UserWarning category can also hide genuine
# library warnings; consider narrowing the filter with `module=` or `message=`.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Train Test Split and Leakage Test

In [3]:
# Random seed passed to train_test_split when the hold-out test set is created.
SEED = 10

In [4]:
# Load the model-ready dataset from a pickle file (presumably written by an
# earlier preprocessing step — confirm where it comes from).
# NOTE(review): `_pickle` is CPython's private C implementation; the public
# `pickle` module exposes the same `load`. Unpickling can execute arbitrary
# code, so only load files from a trusted source.
with open('../regular_data_ready_for_model', 'rb') as f:
    df = _pickle.load(f)
# Preview the first rows.
df.head()

Unnamed: 0,rent,latitude,longitude,oda_sayisi,salon_sayisi,brut_m2,net_m2,bina_yasi,kat_sayisi,esyali,...,binned_bulundugu_kat_cat_1,binned_bulundugu_kat_cat_10,binned_bulundugu_kat_cat_11,binned_bulundugu_kat_cat_2,binned_bulundugu_kat_cat_3,binned_bulundugu_kat_cat_4,binned_bulundugu_kat_cat_6,binned_bulundugu_kat_cat_7,binned_bulundugu_kat_cat_8,binned_bulundugu_kat_cat_9
8262,30000,41.043127,28.969064,2,1,100,75,1,8.0,1,...,0,0,1,0,0,0,0,0,0,0
8260,8000,40.979325,28.729905,3,1,145,130,30,5.0,0,...,0,0,1,0,0,0,0,0,0,0
7889,10000,41.057928,28.974291,3,1,110,100,15,5.0,0,...,0,0,1,0,0,0,0,0,0,0
7919,65000,41.152676,28.924586,3,1,165,121,0,3.0,0,...,0,0,0,0,0,1,0,0,0,0
7923,55000,41.207677,29.020296,6,2,450,430,16,4.0,0,...,0,1,0,0,0,0,0,0,0,0


In [8]:
# Cast every uint8 column to the default integer dtype (judging by the preview
# of `df`, these are the 0/1 indicator columns). This modifies `df` in place.
categorical_cols = [col for col in df.columns if df[col].dtype == 'uint8']
df[categorical_cols] = df[categorical_cols].astype('int')

# Force the two count columns to numeric; values that cannot be parsed become NaN.
df['oda_sayisi'] = pd.to_numeric(df['oda_sayisi'], errors='coerce')
df['salon_sayisi'] = pd.to_numeric(df['salon_sayisi'], errors='coerce')

In [5]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score, precision_score

def test_leakage(X_train, X_test, y_train, y_test):
    """Check whether a classifier can tell training rows from test rows.

    Each split is joined with its target, tagged with a ``split`` flag
    (0 = train, 1 = test) and stacked into one frame. An ``XGBClassifier`` with
    default settings is then fitted to predict that flag on 80% of the stacked
    rows (stratified, ``random_state=22``) and scored on the remaining 20%.
    Note that the target column is part of the classifier's features.

    Prints precision, recall and F1 of the flag prediction for the fitting part
    and for the held-out part; returns nothing.
    """
    # Tag each split with its origin and stack them into one frame.
    tagged_parts = [
        pd.concat([features, target], axis=1).assign(split=flag)
        for flag, (features, target) in enumerate([(X_train, y_train), (X_test, y_test)])
    ]
    stacked = pd.concat(tagged_parts)

    origin = stacked['split']
    descriptors = stacked.drop('split', axis=1)

    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        descriptors, origin, test_size=0.2, random_state=22, stratify=origin
    )

    classifier = XGBClassifier()
    classifier.fit(fit_X, fit_y)
    fit_pred = classifier.predict(fit_X)
    holdout_pred = classifier.predict(holdout_X)

    # Same three report lines, in the same order: precision, recall, F1.
    reports = (
        ('train-test precisions: ', precision_score),
        ('train-test recalls: ', recall_score),
        ('train-test f1 scores: ', f1_score),
    )
    for label, metric in reports:
        print(label, metric(fit_y, fit_pred), metric(holdout_y, holdout_pred))

In [9]:
# Missing values are deliberately left in the data here (no dropna / fillna):
# `level0` handles them per model (sentinel fill for the random forest,
# imputation for the SVR, raw NaNs for the boosted models).
X, y = df.drop('rent', axis=1), df['rent']

# Hold out 10% of the rows as the final test set; seeded with SEED.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=SEED)

In [10]:
# Check whether a classifier can separate the training rows from the test rows.
test_leakage(X_train, X_test, y_train, y_test)

train-test precisions:  1.0 0.0
train-test recalls:  0.47990255785627284 0.0
train-test f1 scores:  0.648559670781893 0.0


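The classifier cannot tell held-out test rows from training rows (precision, recall and F1 are all 0 on the leakage hold-out), so the data seem to be split properly.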

# Model Stacking

base level

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

def impute_and_scale(X_train, X_val, y_train, y_val, imputation_strategy='median',
                     numerical_cols=None):
    """Impute missing values and robust-scale the numeric features.

    Both transformers are fitted on ``X_train`` only and then applied to
    ``X_val``, so no statistics from the validation data reach the training data.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Feature frames with identical columns.
    y_train, y_val :
        Unused. Kept only so existing positional calls keep working; the
        targets are never needed to impute or scale the features.
    imputation_strategy : str, default 'median'
        Passed to ``SimpleImputer(strategy=...)``; applied to every column.
    numerical_cols : sequence of str, optional
        Columns to pass through ``RobustScaler``. Defaults to the notebook's
        fixed list of numeric features. All other columns are imputed only.

    Returns
    -------
    (X_train_scaled, X_val_scaled) : tuple of pandas.DataFrame
        Float frames with the same columns and the same row index as the inputs.
    """
    if numerical_cols is None:
        numerical_cols = ['latitude', 'longitude', 'oda_sayisi', 'salon_sayisi', 'brut_m2',
                          'net_m2', 'bina_yasi', 'kat_sayisi', 'banyo_sayisi',
                          'site_icerisinde_binary', 'cephe_sayisi', 'bulundugu_kat_num']
    numerical_cols = list(numerical_cols)

    imputer = SimpleImputer(missing_values=np.nan, strategy=imputation_strategy)
    scaler = RobustScaler()

    # Fit on the training features only. SimpleImputer works column by column,
    # so leaving the target out does not change any imputed feature value.
    # Keeping the original index lets callers align the result with their targets.
    X_train_scaled = pd.DataFrame(imputer.fit_transform(X_train),
                                  columns=X_train.columns, index=X_train.index)
    X_train_scaled[numerical_cols] = scaler.fit_transform(X_train_scaled[numerical_cols])

    # Apply the already-fitted transformers to the validation features.
    X_val_scaled = pd.DataFrame(imputer.transform(X_val),
                                columns=X_val.columns, index=X_val.index)
    X_val_scaled[numerical_cols] = scaler.transform(X_val_scaled[numerical_cols])

    return X_train_scaled, X_val_scaled

In [24]:
# Sentinel used to replace NaNs for the random forest (it gets no imputation step).
_RF_MISSING_FILL = -10000


def _fit_predict(model, model_name, X_fit, y_fit, X_pred, y_pred):
    """Fit `model` on (X_fit, y_fit) with the preprocessing `model_name` needs and predict X_pred."""
    if model_name == 'svr':
        # The SVR needs complete, scaled inputs. It is trained on the cube root
        # of the target, so predictions are cubed back to the original scale.
        X_fit, X_pred = impute_and_scale(X_fit, X_pred, y_fit, y_pred, imputation_strategy='median')
        model.fit(X=X_fit, y=np.power(y_fit, 1/3))
        return np.power(model.predict(X_pred), 3)
    if model_name == 'rf':
        model.fit(X=X_fit.fillna(_RF_MISSING_FILL), y=y_fit)
        return model.predict(X_pred.fillna(_RF_MISSING_FILL))
    # Every other model receives the raw features, NaNs included.
    model.fit(X=X_fit, y=y_fit)
    return model.predict(X_pred)


def level0(model, X_train, y_train, X_test, y_test, seed, model_name):
    """Produce out-of-fold train predictions and test predictions for one base model.

    Runs a shuffled 5-fold split (seeded with `seed`) over the training data:
    for each fold the model is fitted on the other four folds and predicts the
    held-out fold, so every training row gets a prediction from a model that did
    not see it. The model is then refitted on the full training set to predict
    the test set. The same preprocessing is used in both phases: 'svr' gets
    imputation, scaling and a cube-root target; 'rf' gets a sentinel NaN fill;
    anything else gets the data unchanged.

    Parameters
    ----------
    model : estimator with `fit` / `predict`; it is refitted in place and is
        left fitted on the full training set when the function returns.
    X_train, X_test : pandas.DataFrame
    y_train, y_test : pandas.Series
    seed : int, random_state of the KFold splitter.
    model_name : str, selects the preprocessing branch ('svr', 'rf', or other).

    Returns
    -------
    (pred_train, pred_test) : out-of-fold predictions aligned positionally with
        `X_train`, and predictions for `X_test`.

    Prints the mean fold MAE ('val_score') and the test MAE ('test_score').
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    pred_train = np.zeros(len(y_train))
    cv_scores = []

    for train_idx, val_idx in kf.split(X_train):
        x_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
        x_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        pred = _fit_predict(model, model_name, x_tr, y_tr, x_val, y_val)

        pred_train[val_idx] = pred
        cv_scores.append(mean_absolute_error(y_val, pred))

    # Final refit on all training rows to score and predict the test set.
    pred_test = _fit_predict(model, model_name, X_train, y_train, X_test, y_test)

    test_score = mean_absolute_error(y_test, pred_test)
    print('val_score: ', np.array(cv_scores).mean())
    print('test_score: ', test_score)
    return pred_train, pred_test

In [21]:
def create_base_models(seed=1):
    """Create fresh, unfitted instances of the four base (level-0) regressors.

    The hyper-parameters are fixed values (presumably from an earlier tuning
    run — confirm against that work).

    Parameters
    ----------
    seed : int, default 1
        Random seed given to XGBoost, LightGBM and the random forest via their
        `random_state` parameter. The default keeps the value previously
        hard-coded for the two boosted models; the random forest used to be
        unseeded, which made its scores differ between runs.

    Returns
    -------
    dict mapping 'xgb', 'lgb', 'rf' and 'svr' to the corresponding estimator.
    """
    params_xgb = {'objective': 'reg:squarederror',
                  'max_depth': 12,
                  'reg_alpha': 4,
                  'reg_lambda': 4,
                  'min_child_weight': 5,
                  'gamma': 2,
                  'learning_rate': 0.011938538816069405,
                  'colsample_bytree': 0.88,
                  'n_estimators': 1897,
                  'missing': np.nan,
                  'random_state': seed}
    xgb = XGBRegressor(**params_xgb)

    # No 'missing' entry here: it is an XGBoost option, not a LightGBM one
    # (LightGBM only reports it as an unknown parameter and handles NaN natively).
    params_lgb = {'objective': 'regression',
                  'lambda_l1': 1.5358410525682363e-06,
                  'lambda_l2': 5.885480224603298e-08,
                  'num_leaves': 132,
                  'learning_rate': 0.040233200712880696,
                  'feature_fraction': 0.7109327980307654,
                  'bagging_fraction': 0.9847849913642786,
                  'bagging_freq': 7,
                  'max_depth': 29,
                  'n_estimators': 456,
                  'random_state': seed,
                  'verbose': -1}
    lgb = LGBMRegressor(**params_lgb)

    params_rf = {'n_estimators': 610,
                 'max_depth': 34,
                 'max_features': 10,
                 'min_samples_leaf': 1,
                 'min_samples_split': 2,
                 'random_state': seed}
    rf = RandomForestRegressor(**params_rf)

    params_svr = {'C': 8.600180329778576,
                  'kernel': 'rbf',
                  'epsilon': 0.4676682188494261,
                  'gamma': 0.1232350551967559}
    svr = SVR(**params_svr)

    return {'xgb': xgb,
            'lgb': lgb,
            'rf': rf,
            'svr': svr}
            

In [22]:
def get_level0(X_train, y_train, X_test, y_test, seed):
    """Collect every base model's level-0 predictions into two frames.

    For each model from `create_base_models()` the model name is printed and
    `level0` is run on copies of the inputs. Its train and test predictions
    are stored in a `pred_<model_name>` column.

    Returns
    -------
    (train_frame, test_frame) : frames that start with the target column and
        carry one prediction column per base model.
    """
    meta_train = pd.DataFrame(y_train)
    meta_test = pd.DataFrame(y_test)

    for name, estimator in create_base_models().items():
        print(name)
        column = f'pred_{name}'
        # Copies keep whatever level0 does away from the caller's frames.
        train_pred, test_pred = level0(
            estimator,
            X_train.copy(), y_train.copy(),
            X_test.copy(), y_test.copy(),
            seed, name,
        )
        meta_train[column] = train_pred
        meta_test[column] = test_pred

    return meta_train, meta_test

In [25]:
# Build the level-0 prediction frames for the train and test sets.
# NOTE(review): the fold seed (444) is a separate literal from SEED used for the split.
df_tr, df_te = get_level0(X_train, y_train, X_test, y_test, seed=444)

xgb
val_score:  4414.562505875412
test_score:  4175.0078186867995
lgb
val_score:  4421.846544198888
test_score:  4081.037699691662
rf
val_score:  5109.364162493604
test_score:  4810.79476111908
svr
val_score:  5254.726643306187
test_score:  5056.099147406984


In [26]:
# Training rows: the target plus one prediction column per base model.
df_tr

Unnamed: 0,rent,pred_xgb,pred_lgb,pred_rf,pred_svr
7513,12000,14125.302734,13146.362555,14975.940984,9771.448797
3723,8000,8063.174316,7183.714963,9583.573770,11430.384520
7236,23000,15549.841797,14999.109498,15985.868852,15284.750550
4100,10000,9669.904297,8987.249663,10122.295082,9926.998159
8757,8000,10177.689453,10107.146496,10149.359016,9494.740717
...,...,...,...,...,...
8114,35000,21564.814453,17278.046801,24372.180328,20958.363017
7622,23000,28196.785156,31182.440309,32593.003279,30435.849264
579,11000,30470.763672,32138.868447,27737.996721,21133.203266
3790,16500,25654.335938,30286.931505,28729.998361,26272.453529


In [27]:
# Test rows: the target plus one prediction column per base model.
df_te

Unnamed: 0,rent,pred_xgb,pred_lgb,pred_rf,pred_svr
6705,60000,40651.281250,41258.489226,30950.163934,34047.895759
1484,17000,16433.822266,18664.523729,16164.475410,12943.628794
6478,10000,7175.735352,8338.538740,11112.483607,8212.397873
8181,27500,25812.732422,29449.902241,18520.905464,13706.739108
5854,54950,54145.304688,56158.065561,54702.829508,60809.574233
...,...,...,...,...,...
4279,19000,14306.596680,14093.414677,17009.098361,12920.338688
7213,15000,12726.775391,9907.502940,12248.934426,19514.197524
756,17500,19035.000000,18983.424191,22367.195082,21800.209844
1866,27500,31843.058594,32723.065105,27018.196721,23328.574722


In [28]:
# Pairwise correlation between the base models' training-set predictions.
df_tr.drop('rent', axis=1).corr()

Unnamed: 0,pred_xgb,pred_lgb,pred_rf,pred_svr
pred_xgb,1.0,0.988337,0.967215,0.930216
pred_lgb,0.988337,1.0,0.96486,0.930105
pred_rf,0.967215,0.96486,1.0,0.940435
pred_svr,0.930216,0.930105,0.940435,1.0


final level

In [29]:
def get_final_level(model, df_tr, df_te):
    """Fit a meta-model on the base-model predictions and report its MAE.

    Parameters
    ----------
    model : estimator with `fit` / `predict`; `fit` must return the fitted estimator.
    df_tr, df_te : pandas.DataFrame
        Frames holding the target in a 'rent' column and the base-model
        predictions in every other column.

    Returns
    -------
    The fitted meta-model (whatever `model.fit` returns).

    Prints the MAE on `df_tr` (the rows the meta-model was fitted on, so this
    is an in-sample score for the meta-model) and on `df_te`.
    """
    X_tr, y_tr = df_tr.drop('rent', axis=1), df_tr['rent']
    X_te, y_te = df_te.drop('rent', axis=1), df_te['rent']

    final_model = model.fit(X_tr, y_tr)

    preds_tr_final = final_model.predict(X_tr)
    preds_te_final = final_model.predict(X_te)

    # Score against the targets taken from the frames passed in, not against
    # notebook-level globals that may be missing or belong to other data.
    print('train set mae: ', mean_absolute_error(y_tr, preds_tr_final))
    print('test set mae: ', mean_absolute_error(y_te, preds_te_final))
    return final_model

In [30]:
# Meta-model candidate: plain linear regression on the base-model predictions.
lr = LinearRegression()
stacked_model = get_final_level(lr, df_tr, df_te)

train set mae:  4348.251256967843
test set mae:  4082.072637657351


In [31]:
# Meta-model candidate: XGBoost with default settings. Overwrites `stacked_model`.
xgb = XGBRegressor()
stacked_model = get_final_level(xgb, df_tr, df_te)

train set mae:  2838.5029036306664
test set mae:  4296.758141352187


In [32]:
# Meta-model candidate: LightGBM with default settings. Overwrites `stacked_model`.
lgb = LGBMRegressor()
stacked_model = get_final_level(lgb, df_tr, df_te)

train set mae:  3856.086224308715
test set mae:  4166.631143717436


In [33]:
# Meta-model candidate: random forest with default settings. Overwrites `stacked_model`.
# NOTE(review): no random_state is passed, so these scores can change between runs.
rf = RandomForestRegressor()
stacked_model = get_final_level(rf, df_tr, df_te)

train set mae:  1717.2693825364145
test set mae:  4263.22197205978


In [34]:
from sklearn.ensemble import GradientBoostingRegressor

# Meta-model candidate: scikit-learn gradient boosting with default settings.
# NOTE(review): when the cells run top to bottom this is the model left in
# `stacked_model` — confirm that this is the one intended for later use.
gb = GradientBoostingRegressor()
stacked_model = get_final_level(gb, df_tr, df_te)

train set mae:  4144.661420991451
test set mae:  4120.542363985961


create the final model

In [None]:
class EnsembleModel:
    """Placeholder for the final model; nothing is implemented yet."""