In [190]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [220]:
df = pd.read_csv('processed_3_kayseri_houses_data.csv')
df = df.drop(['Şehir'], axis=1)
df = df.drop(['Türü'], axis=1)
df = df.drop(['Binanın Kat Sayısı'], axis=1)

df.head()

Unnamed: 0,Binanın Yaşı,Kullanım Durumu,Net Metrekare,Oda Sayısı,Bulunduğu Kat,Isıtma Tipi,House Price,İlçe,Mahalle
0,0,0,150,9,1,5,3200000,4,47
1,0,0,125,7,8,5,4500000,3,36
2,8,0,100,4,8,6,920000,4,17
3,8,2,155,8,2,6,2600000,4,135
4,4,0,135,7,2,7,2300000,4,127


In [221]:
X = df.drop(['House Price'], axis=1)
y = df['House Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [222]:
model_params = {
    'n_estimators': [100, 200, 1000, 2000],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.01, 0.02, 0.05, 0.09],
    'colsample_bytree': [0,4, 0.5, 0.6],
    'gamma': [0, 0.03, 0.1],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2, 3, 4.5],
    'subsample': [0.6, 0.8, 1]
}

In [223]:
xgb0 = XGBRegressor()

In [224]:
grid_search = GridSearchCV(xgb0,
                           model_params,
                           cv=10,
                           n_jobs=-1,
                           verbose=2)

In [225]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 43200 candidates, totalling 432000 fits


KeyboardInterrupt: 

In [199]:
grid_search.best_params_

{'colsample_bytree': 0.5,
 'learning_rate': 0.01,
 'max_depth': 2,
 'n_estimators': 200}

In [226]:
xgb1 = XGBRegressor(colsample_bytree= 0.5, learning_rate= 0.01, max_depth= 2, n_estimators= 200)

In [227]:
model_xgb = xgb1.fit(X_train, y_train)

In [228]:
model_xgb.predict(X_test)[15:25]  # predicted values

array([7823326.5, 1727863.6, 1646781.8, 3710818.2, 1685239.8, 1290372.9,
       3551608. , 1598743.6, 3574875.5, 3845871. ], dtype=float32)

In [229]:
y_test[15:25]  #actual values

438     16900000
1091     2585000
1299     2250000
34       2850000
54       1499000
491      2100000
408      3350000
838      1849000
461      2350000
746      2850000
Name: House Price, dtype: int64

we compared the predicted and actual values in the two codes above.

In [217]:
y_pred = model_xgb.predict(X_test)[15:25]  # predicted values

In [230]:
model_xgb.score(X_test, y_test)

0.6559387210482596

In [219]:
model_xgb.score(X_train, y_train)

0.9955259615869179

In [208]:
np.sqrt(-1* cross_val_score(model_xgb, X_test, y_test, cv=10, scoring='neg_mean_squared_error')).mean()

1657013.5446479323

In [210]:
importance = pd.DataFrame({'Importance': model_xgb.feature_importances_},
                          index=X_train.columns)
importance

Unnamed: 0,Importance
Binanın Yaşı,0.049477
Kullanım Durumu,0.0
Net Metrekare,0.230334
Oda Sayısı,0.332961
Bulunduğu Kat,0.120555
Isıtma Tipi,0.090156
İlçe,0.089705
Mahalle,0.086812
