In [27]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [28]:
df = pd.read_csv('processed_3_kayseri_houses_data.csv')
df = df.drop(['Şehir'], axis=1)
df = df.drop(['Türü'], axis=1)

df.head()

Unnamed: 0,Binanın Yaşı,Binanın Kat Sayısı,Kullanım Durumu,Net Metrekare,Oda Sayısı,Bulunduğu Kat,Isıtma Tipi,House Price,İlçe,Mahalle
0,0,7,0,150,9,1,5,3200000,4,47
1,0,2,0,125,7,8,5,4500000,3,36
2,8,9,0,100,4,8,6,920000,4,17
3,8,11,2,155,8,2,6,2600000,4,135
4,4,13,0,135,7,2,7,2300000,4,127


In [29]:
X = df.drop(['House Price'], axis=1)
y = df['House Price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [23]:
model_params = {
    'n_estimators': [100, 200, 1000, 2000],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.01, 0.02, 0.05, 0.09],
    'colsample_bytree': [0,4, 0.5, 0.6],
    'gamma': [0, 0.03, 0.1],
    'reg_alpha': [0, 0.1, 0.5],
    'reg_lambda': [1, 1.5, 2, 3, 4.5],
    'subsample': [0.6, 0.8, 1]
}

In [24]:
xgb0 = XGBRegressor()

In [25]:
grid_search = GridSearchCV(xgb0,
                           model_params,
                           cv=10,
                           n_jobs=-1,
                           verbose=2)

In [26]:
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 43200 candidates, totalling 432000 fits


KeyboardInterrupt: 

In [None]:
grid_search.best_params_

In [30]:
xgb1 = XGBRegressor(learning_rate= 0.01, n_jobs=4, n_estimators= 200)

In [31]:
model_xgb = xgb1.fit(X_train, y_train)

In [32]:
model_xgb.predict(X_test)[1:6]  # predicted values

array([1176301.4, 1067477. , 2662788.5, 7618061.5, 1495444.4],
      dtype=float32)

In [33]:
y_test[1:6]  #actual values

376    1575000
9      1300000
308    2350000
299    9050000
483    2080000
Name: House Price, dtype: int64

we compared the predicted and actual values in the two codes above.

In [34]:
model_xgb.score(X_test, y_test)

0.7615185548623613

In [35]:
model_xgb.score(X_train, y_train)

0.8586775631843329

In [36]:
y_pred = model_xgb.predict(X_test)[15:25]  # predicted values

Unnamed: 0,Binanın Yaşı,Binanın Kat Sayısı,Kullanım Durumu,Net Metrekare,Oda Sayısı,Bulunduğu Kat,Isıtma Tipi,İlçe,Mahalle
1226,0,5,0,49,29,1,5,1,41
376,0,4,0,120,7,1,6,3,137
9,3,9,2,120,7,1,7,4,7
308,5,11,2,180,11,1,7,3,107
299,0,2,0,165,16,7,5,4,19
...,...,...,...,...,...,...,...,...,...
1381,8,14,0,150,7,2,6,7,86
1354,0,2,0,260,20,8,10,7,108
1409,4,4,1,115,7,8,5,1,63
1276,6,14,0,115,7,1,6,7,86


In [None]:
np.sqrt(-1* cross_val_score(model_xgb, X_test, y_test, cv=10, scoring='neg_mean_squared_error')).mean()

In [None]:
importance = pd.DataFrame({'Importance': model_xgb.feature_importances_},
                          index=X_train.columns)
importance

In [47]:
new_data = pd.DataFrame({'Binanın Yaşı': [2],
                         'Binanın Kat Sayısı': [10],
                         'Kullanım Durumu': [2],
                         'Net Metrekare': [150],
                         'Oda Sayısı': [1],
                         'Bulunduğu Kat': [1],
                         'Isıtma Tipi': [0],
                         'İlçe': [4],
                         'Mahalle': [11],
                        })
new_data

Unnamed: 0,Binanın Yaşı,Binanın Kat Sayısı,Kullanım Durumu,Net Metrekare,Oda Sayısı,Bulunduğu Kat,Isıtma Tipi,İlçe,Mahalle
0,2,10,2,150,1,1,0,4,11


In [48]:
y_pred = model_xgb.predict(new_data)  # predicted values
int(y_pred)

941947

In [40]:
from joblib import dump
dump(model_xgb, 'house_price_prediction_xgb_model.joblib')

['house_price_prediction_xgb_model.joblib']