In [1]:
import pandas as pd
import numpy as np
import _pickle

import matplotlib.pyplot as plt
import seaborn as sns

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import mean_absolute_error

In [29]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# Train Test Split and Leakage Test

In [3]:
# Single random seed, passed to the train/test split and to the K-fold shuffling of the base level.
SEED = 1

In [4]:
# Load the table stored in the local pickle file 'model_ready_data'.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
with open('model_ready_data', 'rb') as f:
    df = _pickle.load(f)
# Preview the first rows.
df.head()

Unnamed: 0,rent,latitude,longitude,oda_sayisi,salon_sayisi,brut_m2,net_m2,bina_yasi,kat_sayisi,esyali,...,binned_bulundugu_kat_cat_1,binned_bulundugu_kat_cat_10,binned_bulundugu_kat_cat_11,binned_bulundugu_kat_cat_2,binned_bulundugu_kat_cat_3,binned_bulundugu_kat_cat_4,binned_bulundugu_kat_cat_6,binned_bulundugu_kat_cat_7,binned_bulundugu_kat_cat_8,binned_bulundugu_kat_cat_9
8262,30000,41.043127,28.969064,2,1,100,75,1,8.0,True,...,0,0,1,0,0,0,0,0,0,0
8260,8000,40.979325,28.729905,3,1,145,130,30,5.0,False,...,0,0,1,0,0,0,0,0,0,0
7889,10000,41.057928,28.974291,3,1,110,100,15,5.0,False,...,0,0,1,0,0,0,0,0,0,0
7919,65000,41.152676,28.924586,3,1,165,121,0,3.0,False,...,0,0,0,0,0,1,0,0,0,0
7923,55000,41.207677,29.020296,6,2,450,430,16,4.0,False,...,0,1,0,0,0,0,0,0,0,0


In [5]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, recall_score, precision_score

def test_leakage(X_train, X_test, y_train, y_test):
    """Check whether a classifier can tell training rows from test rows.

    Each row (features plus target) is tagged with the partition it came from
    (0 = train, 1 = test). The tagged rows are pooled, re-split 80/20 with
    stratification on the tag, and a default XGBClassifier is fitted to predict
    the tag. Precision, recall and F1 on both parts of that re-split are printed;
    nothing is returned.
    """
    tagged_train = pd.concat([X_train, y_train], axis=1).assign(split=0)
    tagged_test = pd.concat([X_test, y_test], axis=1).assign(split=1)
    pooled = pd.concat([tagged_train, tagged_test])

    origin = pooled['split']
    features = pooled.drop('split', axis=1)

    feats_fit, feats_holdout, origin_fit, origin_holdout = train_test_split(
        features, origin, test_size=0.2, random_state=22, stratify=origin
    )

    classifier = XGBClassifier().fit(feats_fit, origin_fit)
    guess_fit = classifier.predict(feats_fit)
    guess_holdout = classifier.predict(feats_holdout)

    # Same three report lines, in the same order: precision, recall, F1.
    report = (
        ('train-test precisions: ', precision_score),
        ('train-test recalls: ', recall_score),
        ('train-test f1 scores: ', f1_score),
    )
    for label, metric in report:
        print(label, metric(origin_fit, guess_fit), metric(origin_holdout, guess_holdout))

In [6]:
# df.dropna(inplace=True)
# df.fillna(-10000, inplace=True)
# The two cleaning steps above are left disabled: rows with missing values are kept
# here and handled per model inside level0.

# Separate the target column from the features.
X = df.drop(columns='rent')
y = df['rent']

# Hold out 10% of the rows as the test set; SEED keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=SEED
)

In [7]:
# Check whether a classifier can distinguish the training rows from the test rows.
test_leakage(X_train, X_test, y_train, y_test)

train-test precisions:  1.0 0.1
train-test recalls:  0.4591961023142509 0.004878048780487805
train-test f1 scores:  0.6293823038397329 0.009302325581395349


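The data seem to be split properly: the classifier cannot tell training rows from test rows on held-out data (test F1 ≈ 0.01), so the two sets appear to come from the same distribution.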

# Model Stacking

base level

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler

def impute_and_scale(X_train, X_val, y_train, y_val, imputation_strategy='median'):
    """Impute missing feature values and robust-scale the numerical columns.

    Both the imputer and the scaler are fitted on ``X_train`` only and then
    applied unchanged to ``X_val``, so no statistics from the validation rows
    are used.

    Parameters
    ----------
    X_train, X_val : pandas.DataFrame
        Feature frames; both must contain every column in ``numerical_cols``.
    y_train, y_val : pandas.Series
        Not used. Accepted only so existing callers keep working. Earlier
        versions concatenated the target into the imputer input and dropped it
        again by the hard-coded label ``'rent'``, which broke for any other
        target name.
    imputation_strategy : str, default 'median'
        Forwarded to ``SimpleImputer(strategy=...)``.

    Returns
    -------
    (X_train_scaled, X_val_scaled) : tuple of pandas.DataFrame
        Float-valued frames with the same columns as the inputs and a fresh
        0..n-1 index (the original row labels are not preserved).
    """
    numerical_cols = ['latitude', 'longitude', 'oda_sayisi', 'salon_sayisi', 'brut_m2',
               'net_m2', 'bina_yasi', 'kat_sayisi', 'banyo_sayisi',
               'site_icerisinde_binary', 'cephe_sayisi', 'bulundugu_kat_num']

    # Fit the imputer on the training features only. SimpleImputer computes its
    # statistics per column, so leaving the target out does not change the
    # values imputed into the feature columns.
    imputer = SimpleImputer(missing_values=np.nan, strategy=imputation_strategy)
    X_train_scaled = pd.DataFrame(np.asarray(imputer.fit_transform(X_train)),
                                  columns=list(X_train.columns))
    X_val_scaled = pd.DataFrame(np.asarray(imputer.transform(X_val)),
                                columns=list(X_val.columns))

    # Fit the scaler on the imputed training rows, then reuse it for validation.
    scaler = RobustScaler()
    X_train_scaled[numerical_cols] = np.asarray(scaler.fit_transform(X_train_scaled[numerical_cols]))
    X_val_scaled[numerical_cols] = np.asarray(scaler.transform(X_val_scaled[numerical_cols]))

    return X_train_scaled, X_val_scaled

In [20]:
# Value used to fill missing entries for the random forest, which gets no imputer.
_RF_MISSING_SENTINEL = -10000


def _fit_predict(model, model_name, X_fit, y_fit, X_pred, y_pred):
    """Fit ``model`` with the preprocessing selected by ``model_name`` and predict ``X_pred``."""
    if model_name == 'svr':
        # SVR is trained on imputed/scaled features and on the cube root of the
        # target; predictions are cubed to return to the original scale.
        X_fit, X_pred = impute_and_scale(X_fit, X_pred, y_fit, y_pred, imputation_strategy='mean')
        model.fit(X=X_fit, y=np.power(y_fit, 1/3))
        return np.power(model.predict(X_pred), 3)
    if model_name == 'rf':
        model.fit(X=X_fit.fillna(_RF_MISSING_SENTINEL), y=y_fit)
        return model.predict(X_pred.fillna(_RF_MISSING_SENTINEL))
    # Every other model receives the features as they are, missing values included.
    model.fit(X=X_fit, y=y_fit)
    return model.predict(X_pred)


def level0(model, X_train, y_train, X_test, y_test, seed, model_name):
    """Produce base-level predictions of one model for stacking.

    Runs a shuffled 10-fold cross-validation over the training rows to collect
    out-of-fold predictions, then refits the model on all training rows and
    predicts the test rows. The mean fold MAE and the test MAE are printed.

    Parameters
    ----------
    model : estimator with ``fit`` / ``predict``; it is refitted in place.
    X_train, y_train : training features and target.
    X_test, y_test : held-out features and target.
    seed : random state for the fold shuffling.
    model_name : str
        ``'svr'`` and ``'rf'`` select model-specific preprocessing; any other
        value fits on the raw features.

    Returns
    -------
    (pred_train, pred_test) : out-of-fold predictions aligned with the row
        order of ``X_train``, and predictions for ``X_test``.
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    pred_train = np.zeros(len(y_train))
    cv_scores = []

    for train_idx, val_idx in kf.split(X_train):
        x_tr, y_tr = X_train.iloc[train_idx], y_train.iloc[train_idx]
        x_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

        pred = _fit_predict(model, model_name, x_tr, y_tr, x_val, y_val)

        # Each training row is predicted by a model that never saw it.
        pred_train[val_idx] = pred
        cv_scores.append(mean_absolute_error(y_val, pred))

    # Final refit on the full training set for the test-set predictions.
    pred_test = _fit_predict(model, model_name, X_train, y_train, X_test, y_test)

    test_score = mean_absolute_error(y_test, pred_test)
    print('val_score: ', np.array(cv_scores).mean())
    print('test_score: ', test_score)
    return pred_train, pred_test

In [30]:
def create_base_models(seed=None):
    """Build the four unfitted base regressors used at level 0.

    Parameters
    ----------
    seed : int or None, default None
        Forwarded as ``random_state`` to the XGBoost, LightGBM and random
        forest models so their fits can be made repeatable. ``None`` keeps each
        library's default behaviour, which matches calling this function with
        no arguments. SVR takes no random state.

    Returns
    -------
    dict
        Maps ``'xgb'``, ``'lgb'``, ``'rf'`` and ``'svr'`` (in that order) to
        the corresponding estimator.
    """
    # NOTE(review): the hyperparameters below are hard-coded; presumably they
    # come from an earlier tuning run -- confirm their origin.
    params_xgb = {'n_estimators': 800, 'max_depth': 8, 'reg_alpha': 0, 'reg_lambda': 4, 'min_child_weight': 3,
        'gamma': 3, 'learning_rate': 0.03176084772867135, 'colsample_bytree': 0.6}
    xgb = XGBRegressor(random_state=seed, **params_xgb)

    params_lgb = {'lambda_l1': 2.7474326940885665e-07, 'lambda_l2': 1.9883851537044354e-05, 'num_leaves': 159,
        'learning_rate': 0.18683725035485513, 'feature_fraction': 0.6093847617143736,
        'bagging_fraction': 0.9313731040704265, 'bagging_freq': 1, 'max_depth': 7}
    lgb = LGBMRegressor(random_state=seed, **params_lgb)

    params_rf = {'n_estimators': 382,
                 'max_depth': 18,
                 'max_features': 10,
                 'min_samples_leaf': 1,
                 'min_samples_split': 2}
    # Without a random_state the forest differs from run to run.
    rf = RandomForestRegressor(random_state=seed, **params_rf)

    params_svr = {'C': 6.2409942877045435, 'kernel': 'rbf', 'epsilon': 0.2969722590661538,
                  'gamma': 0.09611835767838475}
    svr = SVR(**params_svr)

    return {'xgb': xgb,
            'lgb': lgb,
            'rf': rf,
            'svr': svr}
            

In [18]:
def get_level0(X_train, y_train, X_test, y_test, seed):
    """Collect the level-0 predictions of every base model.

    Returns two frames (train, test). Each starts with the target column and
    gains one ``pred_<model_name>`` column per base model. The name of each
    model is printed before it is run.
    """
    stacked_train = pd.DataFrame(y_train)
    stacked_test = pd.DataFrame(y_test)

    for name, estimator in create_base_models().items():
        print(name)
        # Hand copies to level0 so the caller's frames cannot be altered by it.
        oof_pred, holdout_pred = level0(
            estimator, X_train.copy(), y_train.copy(), X_test.copy(), y_test.copy(), seed, name
        )
        column = f'pred_{name}'
        stacked_train[column] = oof_pred
        stacked_test[column] = holdout_pred

    return stacked_train, stacked_test

In [31]:
# Run every base model: cross-validated predictions for the training rows, refit predictions for the test rows.
df_tr, df_te = get_level0(X_train, y_train, X_test, y_test, seed=SEED)

xgb
val_score:  4329.557126341368
test_score:  4166.323440878944
lgb
val_score:  4600.8641897660855
test_score:  4488.484917290654
rf
val_score:  5068.479707651482
test_score:  4919.029183071714
svr
val_score:  5201.340597750439
test_score:  4927.77084170546


In [32]:
# Training rows: the target plus one prediction column per base model.
df_tr

Unnamed: 0,rent,pred_xgb,pred_lgb,pred_rf,pred_svr
1660,12500,12865.932617,14068.061639,13126.087631,14739.486385
2294,55000,47891.378906,48504.959415,36850.082585,26478.352914
5643,15000,10714.422852,9730.704380,9652.100070,8792.941096
6069,10000,13335.797852,12686.754081,12912.541943,13466.946105
5771,5000,1558.701416,643.368395,6534.334849,2981.152240
...,...,...,...,...,...
2091,12000,11928.513672,14964.373422,15253.914028,13601.073238
7411,8500,12290.735352,12206.648353,13251.104458,15325.167886
3350,12000,15375.823242,14529.602678,15073.422057,14266.165050
4322,12500,14073.116211,13381.093234,13396.543687,16777.539592


In [33]:
# Test rows: the target plus one prediction column per base model.
df_te

Unnamed: 0,rent,pred_xgb,pred_lgb,pred_rf,pred_svr
4482,6000,12739.530273,12360.297455,12649.923098,12751.695511
5974,15000,14910.393555,12605.958407,14156.667668,10087.079435
5894,20000,19370.843750,18830.305358,26200.401226,24632.719624
8056,19000,22386.189453,22953.650095,19904.604403,12100.622398
6661,12500,16500.287109,19777.554203,17736.949737,17488.706937
...,...,...,...,...,...
2333,7000,8779.819336,7923.661782,8163.848630,7573.308099
7665,13000,21654.023438,20624.917756,29994.400484,12363.254028
3751,23000,26061.740234,24487.602511,19568.303111,14530.416161
4425,6000,7736.008301,8116.547514,8858.345007,5873.455435


In [34]:
# Pairwise correlation between the base models' predictions on the training rows (target excluded).
df_tr.drop('rent', axis=1).corr()

Unnamed: 0,pred_xgb,pred_lgb,pred_rf,pred_svr
pred_xgb,1.0,0.983773,0.968408,0.93591
pred_lgb,0.983773,1.0,0.962677,0.932147
pred_rf,0.968408,0.962677,1.0,0.945922
pred_svr,0.93591,0.932147,0.945922,1.0


final level

In [36]:
def get_final_level(model, df_tr, df_te):
    """Fit the final (meta) model on the base-level predictions and report its MAE.

    Parameters
    ----------
    model : estimator with ``fit`` / ``predict``; fitted in place.
    df_tr, df_te : pandas.DataFrame
        Frames holding the target column ``'rent'`` plus the prediction columns
        used as features for the meta model.

    Returns
    -------
    The object returned by ``model.fit``.

    Prints the mean absolute error on ``df_tr`` and on ``df_te``.
    """
    X_tr, y_tr = df_tr.drop('rent', axis=1), df_tr['rent']
    X_te, y_te = df_te.drop('rent', axis=1), df_te['rent']

    final_model = model.fit(X_tr, y_tr)

    preds_tr_final = final_model.predict(X_tr)
    preds_te_final = final_model.predict(X_te)

    # Score against the targets taken from the frames passed in, not against
    # the notebook-level y_train / y_test, so the function does not depend on
    # hidden global state and stays correct for any pair of frames.
    print('train set mae: ', mean_absolute_error(y_tr, preds_tr_final))
    print('test set mae: ', mean_absolute_error(y_te, preds_te_final))
    return final_model

In [37]:
# Meta-learner: linear regression on the base-model predictions.
lr = LinearRegression()
stacked_model = get_final_level(lr, df_tr, df_te)

train set mae:  4319.178079265291
test set mae:  4155.884392485603


In [38]:
# Meta-learner: XGBoost regressor with default hyperparameters.
xgb = XGBRegressor()
stacked_model = get_final_level(xgb, df_tr, df_te)

train set mae:  2849.853132900379
test set mae:  4301.398263555753


In [39]:
# Meta-learner: LightGBM regressor with default hyperparameters.
lgb = LGBMRegressor()
stacked_model = get_final_level(lgb, df_tr, df_te)

train set mae:  3828.8445095056395
test set mae:  4256.479339326906


In [40]:
# Meta-learner: random forest regressor with default hyperparameters.
rf = RandomForestRegressor()
stacked_model = get_final_level(rf, df_tr, df_te)

train set mae:  1682.045659334706
test set mae:  4463.362439896036


In [41]:
from sklearn.ensemble import GradientBoostingRegressor

# Meta-learner: gradient boosting regressor with default hyperparameters.
gb = GradientBoostingRegressor()
stacked_model = get_final_level(gb, df_tr, df_te)

train set mae:  4113.06367681811
test set mae:  4173.886620369122
