In [10]:
import os
import numpy as np
import pandas as pd
# !pip install optuna 
import optuna
# import matplotlib.pyplot as plt
# import seaborn as sns
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.exceptions import NotFittedError

import typing
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.exceptions import NotFittedError

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
TARGET = 'per_square_meter_price'
# признаки (или набор признаков), для которых применяем smoothed target encoding
CATEGORICAL_STE_FEATURES = ['region', 'city', 'realty_type']

# признаки, для которых применяем one hot encoding
CATEGORICAL_ONE_FEATURES = []

# численные признаки
NUM_FEATURES = ['lat', 'lng', 'osm_amenity_points_in_0.001',
       'osm_amenity_points_in_0.005', 'osm_amenity_points_in_0.0075',
       'osm_amenity_points_in_0.01', 'osm_building_points_in_0.001',
       'osm_building_points_in_0.005', 'osm_building_points_in_0.0075',
       'osm_building_points_in_0.01', 'osm_catering_points_in_0.001',
       'osm_catering_points_in_0.005', 'osm_catering_points_in_0.0075',
       'osm_catering_points_in_0.01', 'osm_city_closest_dist',
      'osm_city_nearest_population',
       'osm_crossing_closest_dist', 'osm_crossing_points_in_0.001',
       'osm_crossing_points_in_0.005', 'osm_crossing_points_in_0.0075',
       'osm_crossing_points_in_0.01', 'osm_culture_points_in_0.001',
       'osm_culture_points_in_0.005', 'osm_culture_points_in_0.0075',
       'osm_culture_points_in_0.01', 'osm_finance_points_in_0.001',
       'osm_finance_points_in_0.005', 'osm_finance_points_in_0.0075',
       'osm_finance_points_in_0.01', 'osm_healthcare_points_in_0.005',
       'osm_healthcare_points_in_0.0075', 'osm_healthcare_points_in_0.01',
       'osm_historic_points_in_0.005', 'osm_historic_points_in_0.0075',
       'osm_historic_points_in_0.01', 'osm_hotels_points_in_0.005',
       'osm_hotels_points_in_0.0075', 'osm_hotels_points_in_0.01',
       'osm_leisure_points_in_0.005', 'osm_leisure_points_in_0.0075',
       'osm_leisure_points_in_0.01', 'osm_offices_points_in_0.001',
       'osm_offices_points_in_0.005', 'osm_offices_points_in_0.0075',
       'osm_offices_points_in_0.01', 'osm_shops_points_in_0.001',
       'osm_shops_points_in_0.005', 'osm_shops_points_in_0.0075',
       'osm_shops_points_in_0.01', 'osm_subway_closest_dist',
       'osm_train_stop_closest_dist', 'osm_train_stop_points_in_0.005',
       'osm_train_stop_points_in_0.0075', 'osm_train_stop_points_in_0.01',
       'osm_transport_stop_closest_dist', 'osm_transport_stop_points_in_0.005',
       'osm_transport_stop_points_in_0.0075',
       'osm_transport_stop_points_in_0.01',
       'reform_count_of_houses_1000', 'reform_count_of_houses_500',
       'reform_house_population_1000', 'reform_house_population_500',
       'reform_mean_floor_count_1000', 'reform_mean_floor_count_500',
       'reform_mean_year_building_1000', 'reform_mean_year_building_500','total_square']

MODEL_PARAMS = dict(
            n_estimators=2000,
            learning_rate=0.01,
            reg_alpha=1,
            num_leaves=40,
            min_child_samples=5,
            importance_type="gain",
            n_jobs=1,
            random_state=563,
        )

In [5]:
data = pd.read_csv('../data/data_processed.csv')

In [6]:
def get_Xy(data):
  X = data.loc[:, data.columns != TARGET]
  for i in NUM_FEATURES:
    X[i] = pd.to_numeric(X[i])
  y = data[TARGET]
#   X = X.values
#   y = y.values
  return X, y

In [7]:
X, y = get_Xy(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[i] = pd.to_numeric(X[i])


In [11]:
def objective(trial,data=X,target=y):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.15,random_state=42)
    param = {
        'tree_method':'gpu_hist',  # this parameter means using the GPU when training our model to speedup the training process
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 10.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
        'n_estimators': 10,
        'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17,20]),
        'random_state': trial.suggest_categorical('random_state', [24, 48,2020]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = XGBRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)],early_stopping_rounds=100,verbose=False)
    
    preds = model.predict(test_x)
    
    rmse = mean_squared_error(test_y, preds,squared=False)
    return rmse

In [12]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-09-25 21:42:12,243][0m A new study created in memory with name: no-name-45d3dbe0-5550-4fde-a747-5ead4a1738b9[0m
[32m[I 2021-09-25 21:42:14,200][0m Trial 0 finished with value: 196107.47824140955 and parameters: {'lambda': 0.07750275374208812, 'alpha': 7.301929238937282, 'colsample_bytree': 0.3, 'subsample': 1.0, 'learning_rate': 0.01, 'max_depth': 9, 'random_state': 24, 'min_child_weight': 137}. Best is trial 0 with value: 196107.47824140955.[0m
[32m[I 2021-09-25 21:42:16,451][0m Trial 1 finished with value: 197699.8756190848 and parameters: {'lambda': 6.97487618987105, 'alpha': 0.008775258164182297, 'colsample_bytree': 0.4, 'subsample': 0.8, 'learning_rate': 0.009, 'max_depth': 15, 'random_state': 24, 'min_child_weight': 285}. Best is trial 0 with value: 196107.47824140955.[0m
[32m[I 2021-09-25 21:42:18,488][0m Trial 2 finished with value: 187625.27127507693 and parameters: {'lambda': 1.369384052522493, 'alpha': 0.045155889465294585, 'colsample_bytree': 0.3, 'su

[32m[I 2021-09-25 21:43:38,690][0m Trial 24 finished with value: 189325.28800004246 and parameters: {'lambda': 2.6898145135058114, 'alpha': 0.4389085114997859, 'colsample_bytree': 1.0, 'subsample': 0.7, 'learning_rate': 0.014, 'max_depth': 9, 'random_state': 2020, 'min_child_weight': 2}. Best is trial 21 with value: 180887.08239791202.[0m
[32m[I 2021-09-25 21:43:42,136][0m Trial 25 finished with value: 181028.85477723938 and parameters: {'lambda': 0.7895410888260439, 'alpha': 0.11148749594284992, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.02, 'max_depth': 13, 'random_state': 24, 'min_child_weight': 35}. Best is trial 21 with value: 180887.08239791202.[0m
[32m[I 2021-09-25 21:43:45,243][0m Trial 26 finished with value: 181395.05924596929 and parameters: {'lambda': 0.6068387444989574, 'alpha': 0.1183162369602138, 'colsample_bytree': 0.7, 'subsample': 1.0, 'learning_rate': 0.02, 'max_depth': 13, 'random_state': 24, 'min_child_weight': 77}. Best is trial 21 with 

[32m[I 2021-09-25 21:45:18,187][0m Trial 48 finished with value: 182570.8028825252 and parameters: {'lambda': 0.2528353559688884, 'alpha': 0.024316319783542158, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.018, 'max_depth': 20, 'random_state': 48, 'min_child_weight': 15}. Best is trial 47 with value: 180024.40671205125.[0m
[32m[I 2021-09-25 21:45:39,463][0m Trial 49 finished with value: 189726.1491801209 and parameters: {'lambda': 0.05038088377375617, 'alpha': 0.006267892593795406, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.012, 'max_depth': 20, 'random_state': 48, 'min_child_weight': 2}. Best is trial 47 with value: 180024.40671205125.[0m


Number of finished trials: 50
Best trial: {'lambda': 0.2528174422390116, 'alpha': 0.018414991637997532, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 20, 'random_state': 48, 'min_child_weight': 17}


In [27]:
Best_trial= {"n_estimators": 500,'lambda': 0.2528174422390116, 'alpha': 0.018414991637997532, 'colsample_bytree': 0.6, 'subsample': 0.6, 'learning_rate': 0.02, 'max_depth': 20, 'random_state': 48, 'min_child_weight': 17}

In [28]:
clf_C = XGBRegressor(**Best_trial)
clf_C

XGBRegressor(alpha=0.018414991637997532, base_score=None, booster=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.6, gamma=None, gpu_id=None,
             importance_type='gain', interaction_constraints=None,
             lambda=0.2528174422390116, learning_rate=0.02, max_delta_step=None,
             max_depth=20, min_child_weight=17, missing=nan,
             monotone_constraints=None, n_estimators=500, n_jobs=None,
             num_parallel_tree=None, random_state=48, reg_alpha=None,
             reg_lambda=None, scale_pos_weight=None, subsample=0.6,
             tree_method=None, validate_parameters=None, verbosity=None)

In [29]:
clf_C.fit(X, y, early_stopping_rounds=15, eval_set=[[X, y]])

[0]	validation_0-rmse:200975.95312
[1]	validation_0-rmse:197682.06250
[2]	validation_0-rmse:194460.32812
[3]	validation_0-rmse:191313.59375
[4]	validation_0-rmse:188215.65625
[5]	validation_0-rmse:185273.59375
[6]	validation_0-rmse:182417.53125
[7]	validation_0-rmse:179512.46875
[8]	validation_0-rmse:176670.01562
[9]	validation_0-rmse:173898.45312
[10]	validation_0-rmse:171245.82812
[11]	validation_0-rmse:168601.68750
[12]	validation_0-rmse:165975.18750
[13]	validation_0-rmse:163430.46875
[14]	validation_0-rmse:161033.48438
[15]	validation_0-rmse:158600.51562
[16]	validation_0-rmse:156225.93750
[17]	validation_0-rmse:153878.96875
[18]	validation_0-rmse:151613.28125
[19]	validation_0-rmse:149409.81250
[20]	validation_0-rmse:147302.90625
[21]	validation_0-rmse:145158.93750
[22]	validation_0-rmse:143058.01562
[23]	validation_0-rmse:141083.10938
[24]	validation_0-rmse:139071.68750
[25]	validation_0-rmse:137110.96875
[26]	validation_0-rmse:135267.54688
[27]	validation_0-rmse:133471.23438
[2

[230]	validation_0-rmse:54805.44531
[231]	validation_0-rmse:54777.28125
[232]	validation_0-rmse:54722.24219
[233]	validation_0-rmse:54681.84766
[234]	validation_0-rmse:54659.51953
[235]	validation_0-rmse:54624.56641
[236]	validation_0-rmse:54602.12500
[237]	validation_0-rmse:54575.67969
[238]	validation_0-rmse:54550.78906
[239]	validation_0-rmse:54504.66016
[240]	validation_0-rmse:54477.71094
[241]	validation_0-rmse:54462.94141
[242]	validation_0-rmse:54420.81250
[243]	validation_0-rmse:54366.48438
[244]	validation_0-rmse:54324.25000
[245]	validation_0-rmse:54289.35938
[246]	validation_0-rmse:54261.19922
[247]	validation_0-rmse:54232.78125
[248]	validation_0-rmse:54198.43750
[249]	validation_0-rmse:54150.21094
[250]	validation_0-rmse:54115.56641
[251]	validation_0-rmse:54053.24609
[252]	validation_0-rmse:54032.85156
[253]	validation_0-rmse:53987.83594
[254]	validation_0-rmse:53951.29297
[255]	validation_0-rmse:53937.13672
[256]	validation_0-rmse:53903.97266
[257]	validation_0-rmse:5388

[458]	validation_0-rmse:50582.07812
[459]	validation_0-rmse:50573.70703
[460]	validation_0-rmse:50567.81641
[461]	validation_0-rmse:50542.83203
[462]	validation_0-rmse:50520.67188
[463]	validation_0-rmse:50511.44141
[464]	validation_0-rmse:50495.26953
[465]	validation_0-rmse:50486.06641
[466]	validation_0-rmse:50471.59766
[467]	validation_0-rmse:50450.60938
[468]	validation_0-rmse:50427.50391
[469]	validation_0-rmse:50415.92969
[470]	validation_0-rmse:50399.72656
[471]	validation_0-rmse:50393.85938
[472]	validation_0-rmse:50387.46875
[473]	validation_0-rmse:50382.30859
[474]	validation_0-rmse:50375.55469
[475]	validation_0-rmse:50365.72656
[476]	validation_0-rmse:50348.68359
[477]	validation_0-rmse:50341.54688
[478]	validation_0-rmse:50339.36328
[479]	validation_0-rmse:50329.16797
[480]	validation_0-rmse:50324.95312
[481]	validation_0-rmse:50317.03906
[482]	validation_0-rmse:50309.40625
[483]	validation_0-rmse:50307.02734
[484]	validation_0-rmse:50297.42969
[485]	validation_0-rmse:5028

XGBRegressor(alpha=0.018414991637997532, base_score=0.5, booster='gbtree',
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.6,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', lambda=0.2528174422390116,
             learning_rate=0.02, max_delta_step=0, max_depth=20,
             min_child_weight=17, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=6, num_parallel_tree=1, random_state=48,
             reg_alpha=0.018414991, reg_lambda=0.252817452, scale_pos_weight=1,
             subsample=0.6, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [30]:
import joblib
file_name = "../data/xgb_reg.pkl"

# save
joblib.dump(clf_C, file_name) 

#load saved model
# clf_C = joblib.load(file_name)

['../data/xgb_reg.pkl']

In [31]:
test = pd.read_csv('../data/test_processed.csv', low_memory=False)

In [32]:
pred = clf_C.predict(test)

In [33]:
test_orig = pd.read_csv('../data/test.csv', low_memory=False)

In [34]:
list_ = []
for idx, row in test_orig.iterrows():
    list_.append([test_orig.loc[idx, 'id'], pred[idx]])

In [35]:
test_submission = pd.DataFrame(list_, columns =['id', 'per_square_meter_price'])
test_submission.head()

Unnamed: 0,id,per_square_meter_price
0,COL_289284,57236.730469
1,COL_289305,105866.59375
2,COL_289318,66132.296875
3,COL_289354,121184.882812
4,COL_289399,122397.007812


In [36]:
test_submission.to_csv("../data/test_submission.csv", index=False)