In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sbn
from davood_ml_functions import *
from scipy import stats as st
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score , make_scorer
from sklearn.utils.class_weight import compute_class_weight
from skopt import BayesSearchCV
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score 
from xgboost import XGBRegressor

In [None]:
# Read the CSV at `path_clean`, using its first column as the row index.
# NOTE(review): this is a machine-specific absolute path; consider a configurable data dir.
path_clean = r"C:\Users\moein\clean_da.csv"
df = pd.read_csv(path_clean, index_col=0)

# Last expression of the cell: show the column labels of the loaded frame.
df.columns

In [None]:
# Frequency of each city_slug value (value_counts leaves missing values out by default).
df.loc[:, "city_slug"].value_counts()

In [None]:
# Count rows with a missing city_slug.
# The check must run on the column itself: an equality comparison such as
# `df["city_slug"] == "tehran"` yields plain True/False (a missing value simply
# compares as False), so calling .isna() on that result always counts 0.
df["city_slug"].isna().sum()

In [None]:
# Build the modelling table: reload the CSV, derive `in_tehran`, keep the chosen
# columns, log-transform the target `Y`, drop incomplete rows and one-hot encode.
# NOTE(review): machine-specific absolute path; consider a configurable data dir.
path_clean = r"C:\Users\moein\clean_da.csv"
df = pd.read_csv(path_clean , index_col = 0)

# True only where city_slug is exactly "tehran".
# NOTE(review): a missing city_slug compares unequal, so it becomes False here, and the
# dropna() below cannot catch it because city_slug is not among the selected columns.
df["in_tehran"] = (df["city_slug"] == "tehran")

selected_columns = ["building_size" , "rooms_count" , "construction_year" ,
                    "has_warehouse" , "has_parking",
                    'lux_count',
                    "cat2_slug" ,  "floor" ,
                    'location_latitude','location_longitude',
                    # "property_type" , "is_rebuilt",
                    "for_sale",
                    "in_tehran",
                    "Y"]

# One chain instead of repeated `df = df.loc[...]` followed by `df["Y"] = ...`:
# .assign() returns a new frame, so the log transform never writes into a slice of
# another DataFrame (the pattern that triggers SettingWithCopyWarning).
df = (
    df.dropna(subset = ["Y"])                        # rows without a target are unusable
    .loc[: , selected_columns]                       # keep only the modelling columns
    .loc[lambda frame: frame["Y"] > 0]               # log needs a strictly positive target
    .assign(Y = lambda frame: np.log(frame["Y"]))    # alternative tried: np.cbrt(frame["Y"])
    .dropna()                                        # drop rows with any missing feature
)

# Optional down-sampling for faster experiments (currently disabled).
m = 100_000
# df = df.sample(n = m , random_state = 42)

# Features: every column except the target, with non-numeric columns one-hot encoded.
X = pd.get_dummies(df.drop("Y" , axis = 1))
# Target: natural log of the original Y.
Y = df["Y"]

In [None]:
# # maximum features
# path_clean = r"C:\Users\Davood\Desktop\Bootcamp\DivarEstate\clean_data.csv"
# df = pd.read_csv(path_clean , index_col = 0)
# to_drop = ["cat3_slug" , "city_slug" , "neighborhood_slug"]
# get_table_null_dtype(df.drop(to_drop , axis = 1))
# df = df.dropna(subset = ["Y"])
# df = df.loc[df["Y"] > 0]
# df["Y"] = np.log(df["Y"])

In [None]:
# Bayesian hyper-parameter search for an XGBRegressor (XGBRegressor and BayesSearchCV
# are imported in the notebook's import cell).
#
# skopt infers the dimension type from the bounds: two ints give an Integer dimension,
# float bounds give a Real one. `gamma` is therefore written with float bounds so the
# search is continuous instead of being restricted to the integers 0, 1 and 2.
search_spaces = {
    'n_estimators': (100 , 300),                       # integer range
    'max_depth': (3 , 7),                              # integer range
    'learning_rate': (0.01 , 0.2 , 'log-uniform'),
    'subsample': (0.7 , 1.0),
    'colsample_bytree': (0.7 , 1.0),
    'gamma': (0.0 , 2.0),                              # continuous
    'reg_alpha': (1e-3 , 1.0 , 'log-uniform'),
    'reg_lambda': (1e-3 , 1.0 , 'log-uniform')
}

bayes = BayesSearchCV(
    estimator = XGBRegressor(random_state = 42),
    search_spaces = search_spaces,
    n_iter = 30,            # number of parameter settings sampled
    cv = 5 ,                # 5-fold cross-validation per setting
    scoring = 'r2',
    n_jobs = -1,
    random_state = 42
)

# NOTE(review): the search is fit on all of X, Y. Any hold-out set carved from the same
# data afterwards has already influenced the chosen parameters, so scores on it will
# be optimistic. Consider splitting first and tuning on the training part only.
bayes.fit(X , Y)

best_model = bayes.best_estimator_
best_params = bayes.best_params_
best_score = bayes.best_score_

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Pick the tuned hyper-parameters out of `best_params` by name and refit a fresh model.
# NOTE(review): if `best_params` was tuned on data that includes these test rows, the
# scores below are optimistic. Confirm against the search cell.
tuned_names = (
    "n_estimators",
    "learning_rate",
    "max_depth",
    "subsample",
    "colsample_bytree",
    "gamma",
    "reg_alpha",
    "reg_lambda",
)
tuned_kwargs = {name: best_params[name] for name in tuned_names}

xgb = XGBRegressor(random_state=42, n_jobs=-1, **tuned_kwargs)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# Metrics are computed on whatever scale Y has (the preparation cell stores np.log of it).
R2 = r2_score(y_test, y_pred)
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)

print(f"R2 = {round(R2*100 , 2)}%")
print("MSE =", round(MSE, 1))
print("MAE =", round(MAE, 1))