In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [56]:
df = pd.read_csv('processed_3_kayseri_houses_data.csv')
df = df.drop(['Şehir'], axis=1)
df = df.drop(['Türü'], axis=1)

df.head()

Unnamed: 0,Binanın Yaşı,Binanın Kat Sayısı,Kullanım Durumu,Net Metrekare,Oda Sayısı,Bulunduğu Kat,Isıtma Tipi,House Price,İlçe,Mahalle
0,0,7,0,150,9,1,5,3200000,4,47
1,0,2,0,125,7,8,5,4500000,3,36
2,8,9,0,100,4,8,6,920000,4,17
3,8,11,2,155,8,2,6,2600000,4,135
4,4,13,0,135,7,2,7,2300000,4,127


In [119]:
X = df.drop(['House Price'], axis=1)
y = df['House Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [209]:
model_params = {
    'n_estimators': [100, 200, 1000, 2000],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.01, 0.02, 0.05, 0.09],
    'colsample_bytree': [0,4, 0.5, 0.6],
    'gamma': [0, 0.03, 0.1],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2, 3, 4.5],
    'subsample': [0.6, 0.8, 1]
}

In [210]:
xgb0 = XGBRegressor()

In [212]:
grid_search = GridSearchCV(xgb0,
                           model_params,
                           cv=10,
                           n_jobs=-1,
                           verbose=2)

In [213]:
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 135 candidates, totalling 675 fits


In [214]:
grid_search.best_params_

{'gamma': 0, 'reg_alpha': 0.5, 'reg_lambda': 4.5, 'subsample': 0.6}

In [221]:
xgb1 = XGBRegressor(learning_rate= 0.01, n_jobs=4, n_estimators= 200)

In [222]:
model_xgb = xgb1.fit(X_train, y_train)

In [223]:
model_xgb.predict(X_test)[1:6]  # predicted values

array([1176301.4, 1067477. , 2662788.5, 7618061.5, 1495444.4],
      dtype=float32)

In [224]:
y_test[1:6]  #actual values

376    1575000
9      1300000
308    2350000
299    9050000
483    2080000
Name: House Price, dtype: int64

we compared the predicted and actual values in the two codes above.

In [225]:
model_xgb.score(X_test, y_test)

0.7615185548623613

In [226]:
model_xgb.score(X_train, y_train)

0.8586775631843329

In [227]:
y_pred = model_xgb.predict(X_test)[15:25]  # predicted values

In [107]:
np.sqrt(-1* cross_val_score(model_xgb, X_test, y_test, cv=10, scoring='neg_mean_squared_error')).mean()

1694349.6778234735

In [228]:
importance = pd.DataFrame({'Importance': model_xgb.feature_importances_},
                          index=X_train.columns)
importance

Unnamed: 0,Importance
Binanın Yaşı,0.360769
Binanın Kat Sayısı,0.005277
Kullanım Durumu,0.009845
Net Metrekare,0.091362
Oda Sayısı,0.051768
Bulunduğu Kat,0.01856
Isıtma Tipi,0.014096
İlçe,0.006947
Mahalle,0.441377


In [244]:
new_data = pd.DataFrame({'Binanın Yaşı': [5],
                         'Binanın Kat Sayısı': [14],
                         'Kullanım Durumu': [2],
                         'Net Metrekare': [150],
                         'Oda Sayısı': [7],
                         'Bulunduğu Kat': [1],
                         'Isıtma Tipi': [6],
                         'İlçe': [4],
                         'Mahalle': [11],
                        })
new_data

Unnamed: 0,Binanın Yaşı,Binanın Kat Sayısı,Kullanım Durumu,Net Metrekare,Oda Sayısı,Bulunduğu Kat,Isıtma Tipi,İlçe,Mahalle
0,5,14,2,150,7,1,6,4,11


In [245]:
y_pred = model_xgb.predict(new_data)  # predicted values
int(y_pred)

1639087