In [2]:
import numpy as np
import pandas as pd

In [3]:
train_full = pd.read_csv("train.csv")

In [7]:
train_full.shape

(30471, 292)

In [6]:
train_cleaned = train_full.drop(['id','timestamp'], axis=1)

In [8]:
train_cleaned.shape

(30471, 290)

In [9]:
y = train_cleaned.price_doc

In [11]:
y.shape

(30471,)

In [290]:
X_train_cleaned = train_cleaned.drop(['price_doc'], axis=1)

In [291]:
X_train_cleaned.shape

(30471, 289)

In [13]:
categorical_cols = [cname for cname in X_train_cleaned.columns if
                    X_train_cleaned[cname].nunique() < 10 and 
                    X_train_cleaned[cname].dtype == "object"]

In [14]:
numerical_cols = [cname for cname in X_train_cleaned.columns if 
                X_train_cleaned[cname].dtype in ['int64', 'float64']]

In [15]:
categorical_cols

['product_type',
 'culture_objects_top_25',
 'thermal_power_plant_raion',
 'incineration_raion',
 'oil_chemistry_raion',
 'radiation_raion',
 'railroad_terminal_raion',
 'big_market_raion',
 'nuclear_reactor_raion',
 'detention_facility_raion',
 'water_1line',
 'big_road1_1line',
 'railroad_1line',
 'ecology']

In [16]:
len(categorical_cols)

14

In [17]:
numerical_cols

['full_sq',
 'life_sq',
 'floor',
 'max_floor',
 'material',
 'build_year',
 'num_room',
 'kitch_sq',
 'state',
 'area_m',
 'raion_popul',
 'green_zone_part',
 'indust_part',
 'children_preschool',
 'preschool_quota',
 'preschool_education_centers_raion',
 'children_school',
 'school_quota',
 'school_education_centers_raion',
 'school_education_centers_top_20_raion',
 'hospital_beds_raion',
 'healthcare_centers_raion',
 'university_top_20_raion',
 'sport_objects_raion',
 'additional_education_raion',
 'culture_objects_top_25_raion',
 'shopping_centers_raion',
 'office_raion',
 'full_all',
 'male_f',
 'female_f',
 'young_all',
 'young_male',
 'young_female',
 'work_all',
 'work_male',
 'work_female',
 'ekder_all',
 'ekder_male',
 'ekder_female',
 '0_6_all',
 '0_6_male',
 '0_6_female',
 '7_14_all',
 '7_14_male',
 '7_14_female',
 '0_17_all',
 '0_17_male',
 '0_17_female',
 '16_29_all',
 '16_29_male',
 '16_29_female',
 '0_13_all',
 '0_13_male',
 '0_13_female',
 'raion_build_count_with_mater

In [18]:
len(numerical_cols)

274

In [19]:
sub_area_cols = ['sub_area']

In [20]:
my_cols = categorical_cols + numerical_cols + sub_area_cols

In [21]:
X_input = X_train_cleaned[my_cols].copy()

In [22]:
X_input.head()

Unnamed: 0,product_type,culture_objects_top_25,thermal_power_plant_raion,incineration_raion,oil_chemistry_raion,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,sub_area
0,Investment,no,no,no,no,no,no,no,no,no,...,9,4,0,13,22,1,0,52,4,Bibirevo
1,Investment,yes,no,no,no,no,no,no,no,no,...,15,3,0,15,29,1,10,66,14,Nagatinskij Zaton
2,Investment,no,no,no,no,yes,no,no,no,no,...,10,3,0,11,27,0,4,67,10,Tekstil'shhiki
3,Investment,no,no,no,no,no,no,no,no,no,...,11,2,1,4,4,0,0,26,3,Mitino
4,Investment,no,no,no,no,yes,yes,no,no,no,...,319,108,17,135,236,2,91,195,14,Basmannoe


In [23]:
X_input.shape

(30471, 289)

## Making pipelines

In [70]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

In [71]:
numerical_transformer = SimpleImputer(strategy='median')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
sub_area_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('label',OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        ('sub_area', sub_area_transformer, sub_area_cols)
    ])

## Define different models

In [72]:
from sklearn.tree import DecisionTreeRegressor
model1 = DecisionTreeRegressor(random_state=1)

In [73]:
from sklearn.ensemble import RandomForestRegressor
model2 = RandomForestRegressor(random_state=1)

In [205]:
from xgboost import XGBRegressor
model3 = XGBRegressor(random_state=1,early_stopping_rounds=5)

## Create pipelines

In [75]:
my_pipeline1 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model1),
])

In [76]:
my_pipeline2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model2),
])

In [206]:
my_pipeline3 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model3),
])

## Define X (which feature to use) and y

In [89]:
top20_features = ['full_sq',
 'sub_area',
 '0_13_female',
 '7_14_female',
 'work_female',
 '0_13_all',
 '0_17_male',
 'young_female',
 '0_17_female',
 'raion_popul',
 '7_14_male',
 '0_13_male',
 'work_male',
 'work_all',
 'young_all',
 '0_17_all',
 'children_school',
 '0_6_female',
 'ekder_female',
 '0_6_male']

In [41]:
X_input.life_sq

0        27.0
1        19.0
2        29.0
3        50.0
4        77.0
         ... 
30466    27.0
30467    59.0
30468     NaN
30469    32.0
30470    28.0
Name: life_sq, Length: 30471, dtype: float64

In [42]:
X_input[numerical_cols]

Unnamed: 0,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,state,area_m,...,cafe_count_5000_price_1500,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000
0,43,27.0,4.0,,,,,,,6.407578e+06,...,40,9,4,0,13,22,1,0,52,4
1,34,19.0,3.0,,,,,,,9.589337e+06,...,36,15,3,0,15,29,1,10,66,14
2,43,29.0,2.0,,,,,,,4.808270e+06,...,25,10,3,0,11,27,0,4,67,10
3,89,50.0,9.0,,,,,,,1.258354e+07,...,15,11,2,1,4,4,0,0,26,3
4,77,77.0,4.0,,,,,,,8.398461e+06,...,552,319,108,17,135,236,2,91,195,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30466,44,27.0,7.0,9.0,1.0,1975.0,2.0,6.0,3.0,1.005305e+07,...,47,15,5,0,15,26,1,2,84,6
30467,86,59.0,3.0,9.0,2.0,1935.0,4.0,10.0,3.0,7.307411e+06,...,511,313,128,24,98,182,1,82,171,15
30468,45,,10.0,20.0,1.0,,1.0,1.0,1.0,2.553630e+07,...,5,1,1,0,2,12,0,1,11,1
30469,64,32.0,5.0,15.0,1.0,2003.0,2.0,11.0,2.0,6.050065e+06,...,58,22,1,1,6,31,1,4,65,7


## CV (optinal if with Gridsearch) Test with using ALL features

In [78]:
from sklearn.model_selection import cross_val_score


In [169]:
from sklearn.model_selection import GridSearchCV

In [192]:
param_grid_singletree = {'model__max_leaf_nodes': [50,100,200,500,1000]}

In [220]:
param_grid_forest = {'model__n_estimators': [50,100,200,500,1000]}

In [207]:
param_grid_xgboost = {'model__n_estimators': [50,100,200,500,1000],
                     'model__learning_rate': [0.03, 0.05, 0.07, 0.1]
                     }

## Single Tree

In [188]:
grid_search_singletree = GridSearchCV(my_pipeline1, param_grid_singletree, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [189]:
grid_search_singletree.fit(X_input,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ........model__max_leaf_nodes=500;, score=-0.407 total time=   2.9s
[CV 2/5] END ........model__max_leaf_nodes=500;, score=-0.221 total time=   2.8s
[CV 3/5] END ........model__max_leaf_nodes=500;, score=-0.283 total time=   2.6s
[CV 4/5] END ........model__max_leaf_nodes=500;, score=-0.263 total time=   2.7s
[CV 5/5] END ........model__max_leaf_nodes=500;, score=-0.225 total time=   2.8s
[CV 1/5] END .......model__max_leaf_nodes=1000;, score=-0.431 total time=   3.0s
[CV 2/5] END .......model__max_leaf_nodes=1000;, score=-0.266 total time=   3.1s
[CV 3/5] END .......model__max_leaf_nodes=1000;, score=-0.330 total time=   3.0s
[CV 4/5] END .......model__max_leaf_nodes=1000;, score=-0.316 total time=   3.0s
[CV 5/5] END .......model__max_leaf_nodes=1000;, score=-0.280 total time=   3.1s
[CV 1/5] END .......model__max_leaf_nodes=2000;, score=-0.487 total time=   3.5s
[CV 2/5] END .......model__max_leaf_nodes=2000;, 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['full_sq',
                                                                          'life_sq',
                                                                          'floor',
                                                                          'max_floor',
                                                                          'material',
                                                                          'build_year',
                                                                          'num_room',
                                                                          'kitch_sq',
                                                    

In [190]:
grid_search_singletree.best_params_

{'model__max_leaf_nodes': 500}

In [191]:
grid_search_singletree.best_score_

-0.27990865753610483

In [193]:
grid_search_singletree1 = GridSearchCV(my_pipeline1, param_grid_singletree1, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [194]:
grid_search_singletree1.fit(X_input,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END .........model__max_leaf_nodes=50;, score=-0.396 total time=   2.1s
[CV 2/5] END .........model__max_leaf_nodes=50;, score=-0.211 total time=   2.1s
[CV 3/5] END .........model__max_leaf_nodes=50;, score=-0.247 total time=   2.3s
[CV 4/5] END .........model__max_leaf_nodes=50;, score=-0.235 total time=   2.1s
[CV 5/5] END .........model__max_leaf_nodes=50;, score=-0.206 total time=   2.3s
[CV 1/5] END ........model__max_leaf_nodes=100;, score=-0.392 total time=   2.3s
[CV 2/5] END ........model__max_leaf_nodes=100;, score=-0.202 total time=   2.2s
[CV 3/5] END ........model__max_leaf_nodes=100;, score=-0.243 total time=   2.9s
[CV 4/5] END ........model__max_leaf_nodes=100;, score=-0.226 total time=   2.7s
[CV 5/5] END ........model__max_leaf_nodes=100;, score=-0.198 total time=   2.8s
[CV 1/5] END ........model__max_leaf_nodes=200;, score=-0.395 total time=   2.9s
[CV 2/5] END ........model__max_leaf_nodes=200;, 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['full_sq',
                                                                          'life_sq',
                                                                          'floor',
                                                                          'max_floor',
                                                                          'material',
                                                                          'build_year',
                                                                          'num_room',
                                                                          'kitch_sq',
                                                    

In [195]:
grid_search_singletree1.best_params_

{'model__max_leaf_nodes': 100}

In [196]:
grid_search_singletree1.best_score_

-0.2523952264512714

## XGBOOST

In [208]:
grid_search_xgboost = GridSearchCV(my_pipeline3, param_grid_xgboost, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [209]:
grid_search_xgboost.fit(X_input,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.323 total time=   3.8s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.206 total time=   4.0s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alar

Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/5] END model__learning_rate=0.03, model__n_estimators=500;, score=-0.359 total time=  32.8s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5] END model__learning_rate=0.03, model__n_estimators=500;, score=-0.184 total time=  32.7s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings bu

[CV 1/5] END model__learning_rate=0.05, model__n_estimators=100;, score=-0.363 total time=   8.2s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5] END model__learning_rate=0.05, model__n_estimators=100;, score=-0.186 total time=   8.2s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 3/5] END model__learning_rate=0.05, model__n_estimators=100;, score=-0.226 total time=   7.4s
Parameters: { "early_stopping_rounds" } might not 

Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5] END model__learning_rate=0.05, model__n_estimators=1000;, score=-0.188 total time= 1.1min
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 3/5] END model__learning_rate=0.05, model__n_estimators=1000;, score=-0.228 total time= 1.1min
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings 

[CV 2/5] END model__learning_rate=0.07, model__n_estimators=200;, score=-0.184 total time=  13.3s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 3/5] END model__learning_rate=0.07, model__n_estimators=200;, score=-0.226 total time=  13.2s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 4/5] END model__learning_rate=0.07, model__n_estimators=200;, score=-0.201 total time=  13.7s
Parameters: { "early_stopping_rounds" } might not 

Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV 5/5] END model__learning_rate=0.07, model__n_estimators=1000;, score=nan total time= 1.1min
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/5] END model__learning_rate=0.1, model__n_estimators=50;, score=-0.363 total time=   4.2s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5] END model__learning_rate=0.1, model__n_estimators=50;, score=-0.187 total time=   4.1s
Parameters: { "early_stopping_rounds" } might not be use

Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/5] END model__learning_rate=0.1, model__n_estimators=500;, score=-0.359 total time=  31.7s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV 2/5] END model__learning_rate=0.1, model__n_estimators=500;, score=nan total time=  32.3s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 3/5] END model__learning_rate=0.1, model__n_estimators=500;, score=-0.230 total time=  31.8s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 4/5] END model__learning_rate=0.1, model__n_estimators=500;, score=-0.203 total time=  33.4s
Parameters: { "early_stopping_rounds" } might not be use

Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV 2/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=nan total time= 1.1min
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV 3/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=nan total time= 1.1min
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV 4/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=nan total time= 1.1min
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 5/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=-0.183 total time= 1.1min


 -0.23018529 -0.22827482 -0.22744477 -0.23009292 -0.23029246 -0.22949877
 -0.22791407 -0.22937756         nan -0.23037987 -0.22820583 -0.22768867
         nan         nan]


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['full_sq',
                                                                          'life_sq',
                                                                          'floor',
                                                                          'max_floor',
                                                                          'material',
                                                                          'build_year',
                                                                          'num_room',
                                                                          'kitch_sq',
                                                    

In [210]:
grid_search_xgboost.best_params_

{'model__learning_rate': 0.03, 'model__n_estimators': 500}

In [211]:
grid_search_xgboost.best_score_

-0.22735653350547108

In [79]:
scores1 = -1 * cross_val_score(my_pipeline1, X_input, y, cv=5, scoring='neg_mean_squared_log_error')

In [80]:
scores1

array([0.54551017, 0.39776924, 0.4633882 , 0.43634591, 0.3883457 ])

In [82]:
scores1.mean()

0.44627184477970977

In [86]:
scores2 = -1 * cross_val_score(my_pipeline2, X_input, y, cv=5, scoring='neg_mean_squared_log_error')

In [87]:
scores2

array([0.36738411, 0.18748647, 0.22842122, 0.20547225, 0.17551595])

In [88]:
scores2.mean()

0.23285599846584373

In [None]:
scores3 = -1 * cross_val_score(my_pipeline3, X_input, y, cv=5, scoring='neg_mean_squared_log_error')

In [None]:
scores3

In [None]:
scores3.mean()

## CV (optinal if with Gridsearch) Test with using TOP 20 features

In [212]:
grid_search_singletree_top = GridSearchCV(my_pipeline1_top, param_grid_singletree1, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [213]:
grid_search_singletree_top.fit(X_top20,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END .........model__max_leaf_nodes=50;, score=-0.405 total time=   0.0s
[CV 2/5] END .........model__max_leaf_nodes=50;, score=-0.213 total time=   0.0s
[CV 3/5] END .........model__max_leaf_nodes=50;, score=-0.257 total time=   0.0s
[CV 4/5] END .........model__max_leaf_nodes=50;, score=-0.241 total time=   0.0s
[CV 5/5] END .........model__max_leaf_nodes=50;, score=-0.220 total time=   0.0s
[CV 1/5] END ........model__max_leaf_nodes=100;, score=-0.399 total time=   0.0s
[CV 2/5] END ........model__max_leaf_nodes=100;, score=-0.209 total time=   0.0s
[CV 3/5] END ........model__max_leaf_nodes=100;, score=-0.249 total time=   0.0s
[CV 4/5] END ........model__max_leaf_nodes=100;, score=-0.238 total time=   0.0s
[CV 5/5] END ........model__max_leaf_nodes=100;, score=-0.213 total time=   0.0s
[CV 1/5] END ........model__max_leaf_nodes=200;, score=-0.397 total time=   0.0s
[CV 2/5] END ........model__max_leaf_nodes=200;, 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['full_sq',
                                                                          '0_13_female',
                                                                          '7_14_female',
                                                                          'work_female',
                                                                          '0_13_all',
                                                                          '0_17_male',
                                                                          'young_female',
                                                                          '0_17_female',
                                  

In [214]:
grid_search_singletree_top.best_params_

{'model__max_leaf_nodes': 200}

In [215]:
grid_search_singletree_top.best_score_

-0.2595431817723153

## Forest

In [221]:
grid_search_forest_top = GridSearchCV(my_pipeline2_top, param_grid_forest, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [222]:
grid_search_forest_top.fit(X_top20,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ...........model__n_estimators=50;, score=-0.410 total time=   3.6s
[CV 2/5] END ...........model__n_estimators=50;, score=-0.216 total time=   3.8s
[CV 3/5] END ...........model__n_estimators=50;, score=-0.257 total time=   3.9s
[CV 4/5] END ...........model__n_estimators=50;, score=-0.237 total time=   4.3s
[CV 5/5] END ...........model__n_estimators=50;, score=-0.209 total time=   3.8s
[CV 1/5] END ..........model__n_estimators=100;, score=-0.409 total time=   8.0s
[CV 2/5] END ..........model__n_estimators=100;, score=-0.215 total time=   7.9s
[CV 3/5] END ..........model__n_estimators=100;, score=-0.256 total time=   7.7s
[CV 4/5] END ..........model__n_estimators=100;, score=-0.237 total time=   7.3s
[CV 5/5] END ..........model__n_estimators=100;, score=-0.209 total time=   7.7s
[CV 1/5] END ..........model__n_estimators=200;, score=-0.408 total time=  15.5s
[CV 2/5] END ..........model__n_estimators=200;, 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['full_sq',
                                                                          '0_13_female',
                                                                          '7_14_female',
                                                                          'work_female',
                                                                          '0_13_all',
                                                                          '0_17_male',
                                                                          'young_female',
                                                                          '0_17_female',
                                  

In [223]:
grid_search_forest_top.best_params_

{'model__n_estimators': 500}

In [224]:
grid_search_forest_top.best_score_

-0.2643936040745176

## XGBOOST

In [225]:
grid_search_xgboost_top = GridSearchCV(my_pipeline3_top, param_grid_xgboost, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [226]:
grid_search_xgboost_top.fit(X_top20,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.328 total time=   0.3s
[CV 2/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.210 total time=   0.3s
[CV 3/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.266 total time=   0.3s
[CV 4/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.267 total time=   0.3s
[CV 5/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.276 total time=   0.3s
[CV 1/5] END model__learning_rate=0.03, model__n_estimators=100;, score=-0.367 total time=   0.6s
[CV 2/5] END model__learning_rate=0.03, model__n_estimators=100;, score=-0.192 total time=   0.6s
[CV 3/5] END model__learning_rate=0.03, model__n_estimators=100;, score=-0.232 total time=   0.6s
[CV 4/5] END model__learning_rate=0.03, model__n_estimators=100;, score=-0.218 total time=   0.6s
[CV 5/5] END model__learning_rate=0.03, model__n_estimators=1

[CV 5/5] END model__learning_rate=0.1, model__n_estimators=100;, score=-0.187 total time=   0.6s
[CV 1/5] END model__learning_rate=0.1, model__n_estimators=200;, score=-0.387 total time=   1.2s
[CV 2/5] END model__learning_rate=0.1, model__n_estimators=200;, score=-0.199 total time=   1.2s
[CV 3/5] END model__learning_rate=0.1, model__n_estimators=200;, score=-0.233 total time=   1.2s
[CV 4/5] END model__learning_rate=0.1, model__n_estimators=200;, score=-0.214 total time=   1.2s
[CV 5/5] END model__learning_rate=0.1, model__n_estimators=200;, score=-0.189 total time=   1.1s
[CV 1/5] END model__learning_rate=0.1, model__n_estimators=500;, score=-0.391 total time=   2.8s
[CV 2/5] END model__learning_rate=0.1, model__n_estimators=500;, score=-0.202 total time=   3.0s
[CV 3/5] END model__learning_rate=0.1, model__n_estimators=500;, score=-0.237 total time=   2.8s
[CV 4/5] END model__learning_rate=0.1, model__n_estimators=500;, score=-0.219 total time=   2.8s
[CV 5/5] END model__learning_r

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['full_sq',
                                                                          '0_13_female',
                                                                          '7_14_female',
                                                                          'work_female',
                                                                          '0_13_all',
                                                                          '0_17_male',
                                                                          'young_female',
                                                                          '0_17_female',
                                  

In [227]:
grid_search_xgboost_top.best_params_

{'model__learning_rate': 0.05, 'model__n_estimators': 50}

In [228]:
grid_search_xgboost_top.best_score_

-0.24127230112717807

## -----------------------

In [92]:
X_top20 = X_input[top20_features]

In [95]:
X_top20.dtypes

full_sq             int64
sub_area           object
0_13_female         int64
7_14_female         int64
work_female         int64
0_13_all            int64
0_17_male           int64
young_female        int64
0_17_female         int64
raion_popul         int64
7_14_male           int64
0_13_male           int64
work_male           int64
work_all            int64
young_all           int64
0_17_all            int64
children_school     int64
0_6_female          int64
ekder_female        int64
0_6_male            int64
dtype: object

In [98]:
X_top20.columns

Index(['full_sq', 'sub_area', '0_13_female', '7_14_female', 'work_female',
       '0_13_all', '0_17_male', 'young_female', '0_17_female', 'raion_popul',
       '7_14_male', '0_13_male', 'work_male', 'work_all', 'young_all',
       '0_17_all', 'children_school', '0_6_female', 'ekder_female',
       '0_6_male'],
      dtype='object')

In [99]:
numerical_cols_top = ['full_sq', '0_13_female', '7_14_female', 'work_female',
       '0_13_all', '0_17_male', 'young_female', '0_17_female', 'raion_popul',
       '7_14_male', '0_13_male', 'work_male', 'work_all', 'young_all',
       '0_17_all', 'children_school', '0_6_female', 'ekder_female',
       '0_6_male']

In [100]:
categorical_cols_top = ['sub_area']

In [101]:
preprocessor_top = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols_top),
        ('sub_area', sub_area_transformer, categorical_cols_top)
    ])

In [102]:
my_pipeline1_top = Pipeline(steps=[
    ('preprocessor', preprocessor_top),
    ('model', model1),
])

In [103]:
my_pipeline2_top = Pipeline(steps=[
    ('preprocessor', preprocessor_top),
    ('model', model2),
])

In [104]:
my_pipeline3_top = Pipeline(steps=[
    ('preprocessor', preprocessor_top),
    ('model', model3),
])

In [105]:
scores1_top = -1 * cross_val_score(my_pipeline1_top, X_top20, y, cv=5, scoring='neg_mean_squared_log_error')

In [106]:
scores1_top

array([0.45992925, 0.26243839, 0.32562733, 0.29701708, 0.28431644])

In [107]:
scores1_top.mean()

0.32586569728342923

In [109]:
scores2_top = -1 * cross_val_score(my_pipeline2_top, X_top20, y, cv=5, scoring='neg_mean_squared_log_error')

In [110]:
scores2_top

array([0.40865489, 0.21502985, 0.25586191, 0.23659428, 0.20882305])

In [111]:
scores2_top.mean()

0.2649927968627222

In [112]:
scores3_top = -1 * cross_val_score(my_pipeline3_top, X_top20, y, cv=5, scoring='neg_mean_squared_log_error')

In [113]:
scores3_top

array([0.39041806, 0.20032383, 0.23472604, 0.21634221, 0.19235673])

In [114]:
scores3_top.mean()

0.24683337327024088

## CV (optinal if with Gridsearch) Test with using Random 20 features

In [229]:
grid_search_singletree_random = GridSearchCV(my_pipeline1_random, param_grid_singletree1, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [230]:
grid_search_singletree_random.fit(X_random20,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END .........model__max_leaf_nodes=50;, score=-0.433 total time=   0.1s
[CV 2/5] END .........model__max_leaf_nodes=50;, score=-0.257 total time=   0.0s
[CV 3/5] END .........model__max_leaf_nodes=50;, score=-0.269 total time=   0.0s
[CV 4/5] END .........model__max_leaf_nodes=50;, score=-0.269 total time=   0.0s
[CV 5/5] END .........model__max_leaf_nodes=50;, score=-0.240 total time=   0.0s
[CV 1/5] END ........model__max_leaf_nodes=100;, score=-0.424 total time=   0.0s
[CV 2/5] END ........model__max_leaf_nodes=100;, score=-0.249 total time=   0.0s
[CV 3/5] END ........model__max_leaf_nodes=100;, score=-0.264 total time=   0.0s
[CV 4/5] END ........model__max_leaf_nodes=100;, score=-0.265 total time=   0.0s
[CV 5/5] END ........model__max_leaf_nodes=100;, score=-0.236 total time=   0.0s
[CV 1/5] END ........model__max_leaf_nodes=200;, score=-0.426 total time=   0.0s
[CV 2/5] END ........model__max_leaf_nodes=200;, 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['life_sq',
                                                                          'children_school',
                                                                          'young_all',
                                                                          '0_17_all',
                                                                          'build_count_panel',
                                                                          'kindergarten_km',
                                                                          'public_transport_station_min_walk',
                                                                          'ID_bus_termin

In [231]:
grid_search_singletree_random.best_params_

{'model__max_leaf_nodes': 200}

In [232]:
grid_search_singletree_random.best_score_

-0.28235788141400453

## Forest

In [233]:
grid_search_forest_random = GridSearchCV(my_pipeline2_random, param_grid_forest, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [234]:
grid_search_forest_random.fit(X_random20,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ...........model__n_estimators=50;, score=-0.404 total time=   6.8s
[CV 2/5] END ...........model__n_estimators=50;, score=-0.233 total time=   7.5s
[CV 3/5] END ...........model__n_estimators=50;, score=-0.252 total time=   7.3s
[CV 4/5] END ...........model__n_estimators=50;, score=-0.244 total time=   9.0s
[CV 5/5] END ...........model__n_estimators=50;, score=-0.210 total time=   8.6s
[CV 1/5] END ..........model__n_estimators=100;, score=-0.403 total time=  14.2s
[CV 2/5] END ..........model__n_estimators=100;, score=-0.231 total time=  14.2s
[CV 3/5] END ..........model__n_estimators=100;, score=-0.251 total time=  13.7s
[CV 4/5] END ..........model__n_estimators=100;, score=-0.244 total time=  15.8s
[CV 5/5] END ..........model__n_estimators=100;, score=-0.209 total time=  15.8s
[CV 1/5] END ..........model__n_estimators=200;, score=-0.402 total time=  27.1s
[CV 2/5] END ..........model__n_estimators=200;, 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['life_sq',
                                                                          'children_school',
                                                                          'young_all',
                                                                          '0_17_all',
                                                                          'build_count_panel',
                                                                          'kindergarten_km',
                                                                          'public_transport_station_min_walk',
                                                                          'ID_bus_termin

In [235]:
grid_search_forest_random.best_params_

{'model__n_estimators': 1000}

In [236]:
grid_search_forest_random.best_score_

-0.26635835485187015

## XGBOOST

In [237]:
grid_search_xgboost_random = GridSearchCV(my_pipeline3_random, param_grid_xgboost, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [238]:
grid_search_xgboost_random.fit(X_random20,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.344 total time=   0.5s
[CV 2/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.229 total time=   0.5s
[CV 3/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.275 total time=   0.5s
[CV 4/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.283 total time=   0.4s
[CV 5/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.289 total time=   0.5s
[CV 1/5] END model__learning_rate=0.03, model__n_estimators=100;, score=-0.388 total time=   0.9s
[CV 2/5] END model__learning_rate=0.03, model__n_estimators=100;, score=-0.224 total time=   0.9s
[CV 3/5] END model__learning_rate=0.03, model__n_estimators=100;, score=-0.244 total time=   0.9s
[CV 4/5] END model__learning_rate=0.03, model__n_estimators=100;, score=-0.239 total time=   0.9s
[CV 5/5] END model__learning_rate=0.03, model__n_estimators=1

Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV 5/5] END model__learning_rate=0.05, model__n_estimators=1000;, score=nan total time=   9.2s
[CV 1/5] END model__learning_rate=0.07, model__n_estimators=50;, score=-0.397 total time=   0.5s
[CV 2/5] END model__learning_rate=0.07, model__n_estimators=50;, score=-0.229 total time=   0.5s
[CV 3/5] END model__learning_rate=0.07, model__n_estimators=50;, score=-0.246 total time=   0.5s
[CV 4/5] END model__learning_rate=0.07, model__n_estimators=50;, score=-0.239 total time=   0.5s
[CV 5/5] END model__learning_rate=0.07, model__n_estimators=50;, score=-0.212 total time=   0.6s
[CV 1/5] END model__learning_rate=0.07, model__n_estimators=100;, score=-0.408 total time=   0.9s
[CV 2/5] END model__learning_rate=0.07, model__n_estimators=100;, score=-0.232 total time=   1.0s
[CV 3/5] END model__learning_rate=0.07, model__n_estimators=100;, score=-0.246 total time=   1.0s
[CV 4/5] END model__learning_rate=0.07, model__n_estimators=100;, score=-0.237 total time=   1.0s
[CV 5/5] END model__learnin

Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV 5/5] END model__learning_rate=0.07, model__n_estimators=500;, score=nan total time=   4.5s
[CV 1/5] END model__learning_rate=0.07, model__n_estimators=1000;, score=-0.408 total time=   9.0s
[CV 2/5] END model__learning_rate=0.07, model__n_estimators=1000;, score=-0.230 total time=   9.2s
[CV 3/5] END model__learning_rate=0.07, model__n_estimators=1000;, score=-0.254 total time=   8.9s
[CV 4/5] END model__learning_rate=0.07, model__n_estimators=1000;, score=-0.238 total time=   8.9s


Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV 5/5] END model__learning_rate=0.07, model__n_estimators=1000;, score=nan total time=   9.2s
[CV 1/5] END model__learning_rate=0.1, model__n_estimators=50;, score=-0.408 total time=   0.5s
[CV 2/5] END model__learning_rate=0.1, model__n_estimators=50;, score=-0.232 total time=   0.5s
[CV 3/5] END model__learning_rate=0.1, model__n_estimators=50;, score=-0.247 total time=   0.5s
[CV 4/5] END model__learning_rate=0.1, model__n_estimators=50;, score=-0.237 total time=   0.5s
[CV 5/5] END model__learning_rate=0.1, model__n_estimators=50;, score=-0.208 total time=   0.5s
[CV 1/5] END model__learning_rate=0.1, model__n_estimators=100;, score=-0.408 total time=   1.0s
[CV 2/5] END model__learning_rate=0.1, model__n_estimators=100;, score=-0.231 total time=   0.9s
[CV 3/5] END model__learning_rate=0.1, model__n_estimators=100;, score=-0.246 total time=   0.9s
[CV 4/5] END model__learning_rate=0.1, model__n_estimators=100;, score=-0.236 total time=   1.0s
[CV 5/5] END model__learning_rate=0.

Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



[CV 2/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=nan total time=   9.3s
[CV 3/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=-0.264 total time=   9.1s
[CV 4/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=-0.243 total time=   8.8s


Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

 -0.2665837  -0.26523684 -0.26378492         nan -

[CV 5/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=nan total time=   9.1s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['life_sq',
                                                                          'children_school',
                                                                          'young_all',
                                                                          '0_17_all',
                                                                          'build_count_panel',
                                                                          'kindergarten_km',
                                                                          'public_transport_station_min_walk',
                                                                          'ID_bus_termin

In [239]:
grid_search_xgboost_random.best_params_

{'model__learning_rate': 0.05, 'model__n_estimators': 50}

In [240]:
grid_search_xgboost_random.best_score_

-0.26074559872284053

In [118]:
X_input.columns

Index(['product_type', 'culture_objects_top_25', 'thermal_power_plant_raion',
       'incineration_raion', 'oil_chemistry_raion', 'radiation_raion',
       'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion',
       'detention_facility_raion',
       ...
       'cafe_count_5000_price_2500', 'cafe_count_5000_price_4000',
       'cafe_count_5000_price_high', 'big_church_count_5000',
       'church_count_5000', 'mosque_count_5000', 'leisure_count_5000',
       'sport_count_5000', 'market_count_5000', 'sub_area'],
      dtype='object', length=289)

In [141]:
random_20_features = X_input.columns[::15]

In [142]:
random_20_features

Index(['product_type', 'life_sq', 'children_school', 'young_all', '0_17_all',
       'build_count_panel', 'kindergarten_km',
       'public_transport_station_min_walk', 'ID_bus_terminal',
       'detention_facility_km', 'green_part_500', 'cafe_count_500_price_4000',
       'cafe_sum_1000_min_price_avg', 'market_count_1000',
       'cafe_count_1500_price_2500', 'cafe_count_2000', 'sport_count_2000',
       'cafe_count_3000_price_1500', 'trc_sqm_5000', 'leisure_count_5000'],
      dtype='object')

In [143]:
len(random_20_features)

20

In [144]:
X_random20 = X_input[random_20_features]

In [145]:
X_random20.dtypes

product_type                          object
life_sq                              float64
children_school                        int64
young_all                              int64
0_17_all                               int64
build_count_panel                    float64
kindergarten_km                      float64
public_transport_station_min_walk    float64
ID_bus_terminal                        int64
detention_facility_km                float64
green_part_500                       float64
cafe_count_500_price_4000              int64
cafe_sum_1000_min_price_avg          float64
market_count_1000                      int64
cafe_count_1500_price_2500             int64
cafe_count_2000                        int64
sport_count_2000                       int64
cafe_count_3000_price_1500             int64
trc_sqm_5000                           int64
leisure_count_5000                     int64
dtype: object

In [146]:
numerical_cols_random = ['life_sq', 'children_school', 'young_all', '0_17_all',
       'build_count_panel', 'kindergarten_km',
       'public_transport_station_min_walk', 'ID_bus_terminal',
       'detention_facility_km', 'green_part_500', 'cafe_count_500_price_4000',
       'cafe_sum_1000_min_price_avg', 'market_count_1000',
       'cafe_count_1500_price_2500', 'cafe_count_2000', 'sport_count_2000',
       'cafe_count_3000_price_1500', 'trc_sqm_5000', 'leisure_count_5000']

In [147]:
categorical_cols_random = ['product_type']

In [148]:
preprocessor_random = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols_random),
        ('cat', categorical_transformer, categorical_cols_random)
    ])

In [149]:
my_pipeline1_random = Pipeline(steps=[
    ('preprocessor', preprocessor_random),
    ('model', model1),
])

In [150]:
my_pipeline2_random = Pipeline(steps=[
    ('preprocessor', preprocessor_random),
    ('model', model2),
])

In [151]:
my_pipeline3_random = Pipeline(steps=[
    ('preprocessor', preprocessor_random),
    ('model', model3),
])

In [152]:
scores1_random = -1 * cross_val_score(my_pipeline1_random, X_random20, y, cv=5, scoring='neg_mean_squared_log_error')

In [153]:
scores1_random

array([0.57375869, 0.38981891, 0.48663176, 0.46624965, 0.44081028])

In [154]:
scores1_random.mean()

0.4714538578207462

In [155]:
scores2_random = -1 * cross_val_score(my_pipeline2_random, X_random20, y, cv=5, scoring='neg_mean_squared_log_error')

In [156]:
scores2_random

array([0.40296849, 0.23139571, 0.25084355, 0.24357507, 0.20904144])

In [157]:
scores2_random.mean()

0.2675648533596991

In [160]:
scores3_random = -1 * cross_val_score(my_pipeline3_random, X_random20, y, cv=5, scoring='neg_mean_squared_log_error')

Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 87, in __call__
    score = scorer._score(cached_call, estimator,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.



In [161]:
scores3_random

array([0.40614753, 0.22892315, 0.2505899 , 0.24015476,        nan])

In [162]:
scores3_random.mean()

nan

In [163]:
scores3_random_skewed = scores3_random[:-1]

In [164]:
scores3_random_skewed

array([0.40614753, 0.22892315, 0.2505899 , 0.24015476])

In [165]:
scores3_random_skewed.mean()

0.28145383527338974

In [166]:
X_random20

Unnamed: 0,product_type,life_sq,children_school,young_all,0_17_all,build_count_panel,kindergarten_km,public_transport_station_min_walk,ID_bus_terminal,detention_facility_km,green_part_500,cafe_count_500_price_4000,cafe_sum_1000_min_price_avg,market_count_1000,cafe_count_1500_price_2500,cafe_count_2000,sport_count_2000,cafe_count_3000_price_1500,trc_sqm_5000,leisure_count_5000
0,Investment,27.0,10309,21154,23603,184.0,0.145700,3.299822,1,4.248036,0.00,0,527.78,1,2,36,10,16,4036616,0
1,Investment,19.0,7759,15727,17700,90.0,0.147754,0.783160,2,12.649879,25.14,1,615.38,0,2,21,11,4,2034942,10
2,Investment,29.0,6207,13028,14884,60.0,0.049102,3.945073,3,7.682303,1.67,0,642.86,3,0,24,8,9,1572990,4
3,Investment,50.0,13670,28563,32063,201.0,0.179441,1.579164,1,8.789894,17.36,0,658.33,1,1,25,13,10,942180,0
4,Investment,77.0,6748,13368,15237,35.0,0.247901,0.857764,4,3.779781,3.56,0,763.45,0,30,483,21,262,3503058,91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30466,Investment,27.0,10311,21400,23849,222.0,0.132645,1.119101,4,4.589571,3.38,0,666.67,1,2,43,15,15,2548292,2
30467,Investment,59.0,6398,11272,12508,41.0,0.276256,1.088544,5,0.416625,5.64,0,699.35,0,38,444,36,230,4345915,82
30468,OwnerOccupier,,264,574,646,,0.897889,7.560163,8,25.084813,3.33,0,1000.00,0,1,7,3,2,201300,1
30469,Investment,32.0,4635,9414,10896,134.0,0.203020,3.138330,8,24.788893,14.85,0,753.85,0,5,67,18,26,1464521,4


In [168]:
X_random20.tail(10)

Unnamed: 0,product_type,life_sq,children_school,young_all,0_17_all,build_count_panel,kindergarten_km,public_transport_station_min_walk,ID_bus_terminal,detention_facility_km,green_part_500,cafe_count_500_price_4000,cafe_sum_1000_min_price_avg,market_count_1000,cafe_count_1500_price_2500,cafe_count_2000,sport_count_2000,cafe_count_3000_price_1500,trc_sqm_5000,leisure_count_5000
30461,Investment,19.0,4635,9414,10896,134.0,0.417062,0.949552,8,14.544518,8.58,0,872.22,0,6,84,22,54,1145385,6
30462,OwnerOccupier,30.0,3889,8294,9249,61.0,0.726512,3.178926,5,4.113151,3.47,0,766.67,0,1,45,16,29,2276854,13
30463,Investment,29.0,11319,22961,26095,235.0,0.22572,2.27497,1,11.415882,2.63,0,400.0,1,1,19,5,9,867953,1
30464,OwnerOccupier,51.0,3594,7706,8741,60.0,0.414927,1.037921,4,5.15282,12.38,0,716.67,0,3,32,16,19,1313276,4
30465,OwnerOccupier,,6398,11272,12508,41.0,1.048962,3.912418,13,3.939382,12.24,4,914.53,1,87,1058,42,446,3346565,105
30466,Investment,27.0,10311,21400,23849,222.0,0.132645,1.119101,4,4.589571,3.38,0,666.67,1,2,43,15,15,2548292,2
30467,Investment,59.0,6398,11272,12508,41.0,0.276256,1.088544,5,0.416625,5.64,0,699.35,0,38,444,36,230,4345915,82
30468,OwnerOccupier,,264,574,646,,0.897889,7.560163,8,25.084813,3.33,0,1000.0,0,1,7,3,2,201300,1
30469,Investment,32.0,4635,9414,10896,134.0,0.20302,3.13833,8,24.788893,14.85,0,753.85,0,5,67,18,26,1464521,4
30470,Investment,28.0,6533,13523,14994,85.0,0.093619,3.001814,3,8.868202,0.0,0,581.82,0,1,26,12,6,646575,9


## TOP20 + NEWLY CREATED

In [293]:
X_train_cleaned.shape

(30471, 289)

In [294]:
X_train_cleaned['young_male_ratio'] = X_train_cleaned.young_male/X_train_cleaned.young_all
X_train_cleaned['young_female_ratio'] = X_train_cleaned.young_female/X_train_cleaned.young_all
X_train_cleaned['work_male_ratio'] = X_train_cleaned.work_male/X_train_cleaned.work_all
X_train_cleaned['work_female_ratio'] = X_train_cleaned.work_female/X_train_cleaned.work_all
X_train_cleaned['ekder_male_ratio'] = X_train_cleaned.ekder_male/X_train_cleaned.ekder_all
X_train_cleaned['ekder_female_ratio'] = X_train_cleaned.ekder_female/X_train_cleaned.ekder_all

In [295]:
X_train_cleaned['0_6_male_ratio'] = X_train_cleaned['0_6_male']/X_train_cleaned['0_6_all']
X_train_cleaned['0_6_female_ratio'] = X_train_cleaned['0_6_female']/X_train_cleaned['0_6_all']
X_train_cleaned['7_14_male_ratio'] = X_train_cleaned['7_14_male']/X_train_cleaned['7_14_all']
X_train_cleaned['7_14_female_ratio'] = X_train_cleaned['7_14_female']/X_train_cleaned['7_14_all']
X_train_cleaned['0_17_male_ratio'] = X_train_cleaned['0_17_male']/X_train_cleaned['0_17_all']
X_train_cleaned['0_17_female_ratio'] = X_train_cleaned['0_17_female']/X_train_cleaned['0_17_all']
X_train_cleaned['16_29_male_ratio'] = X_train_cleaned['16_29_male']/X_train_cleaned['16_29_all']
X_train_cleaned['16_29_female_ratio'] = X_train_cleaned['16_29_female']/X_train_cleaned['16_29_all']
X_train_cleaned['0_13_male_ratio'] = X_train_cleaned['0_13_male']/X_train_cleaned['0_13_all']
X_train_cleaned['0_13_female_ratio'] = X_train_cleaned['0_13_female']/X_train_cleaned['0_13_all']

In [296]:
#X_train_cleaned['rel_floor'] = (X_train_cleaned.floor/X_train_cleaned.max_floor).astype(float)
#X_train_cleaned['rel_kitchen_sq'] = (X_train_cleaned.kitch_sq/X_train_cleaned.full_sq).astype(float)

In [297]:
numerical_cols_top = ['full_sq', '0_13_female', '7_14_female', 'work_female',
       '0_13_all', '0_17_male', 'young_female', '0_17_female', 'raion_popul',
       '7_14_male', '0_13_male', 'work_male', 'work_all', 'young_all',
       '0_17_all', 'children_school', '0_6_female', 'ekder_female',
       '0_6_male']

In [298]:
handcrafted_cols = ['young_male_ratio','young_female_ratio','work_male_ratio','work_female_ratio','ekder_male_ratio','ekder_female_ratio'
                   , '0_6_male_ratio','0_6_female_ratio','7_14_male_ratio','7_14_female_ratio','0_17_male_ratio','0_17_female_ratio',
                   '16_29_male_ratio','16_29_female_ratio','0_13_male_ratio','0_13_female_ratio']

In [299]:
numerical_cols_new = numerical_cols_top + handcrafted_cols

In [300]:
numerical_cols_new

['full_sq',
 '0_13_female',
 '7_14_female',
 'work_female',
 '0_13_all',
 '0_17_male',
 'young_female',
 '0_17_female',
 'raion_popul',
 '7_14_male',
 '0_13_male',
 'work_male',
 'work_all',
 'young_all',
 '0_17_all',
 'children_school',
 '0_6_female',
 'ekder_female',
 '0_6_male',
 'young_male_ratio',
 'young_female_ratio',
 'work_male_ratio',
 'work_female_ratio',
 'ekder_male_ratio',
 'ekder_female_ratio',
 '0_6_male_ratio',
 '0_6_female_ratio',
 '7_14_male_ratio',
 '7_14_female_ratio',
 '0_17_male_ratio',
 '0_17_female_ratio',
 '16_29_male_ratio',
 '16_29_female_ratio',
 '0_13_male_ratio',
 '0_13_female_ratio']

In [301]:
categorical_cols_new = ['sub_area']

In [302]:
preprocessor_new = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols_new),
        ('sub_area', sub_area_transformer, categorical_cols_new)
    ])

In [303]:
my_pipeline1_new = Pipeline(steps=[
    ('preprocessor', preprocessor_new),
    ('model', model1),
])

In [304]:
my_pipeline2_new = Pipeline(steps=[
    ('preprocessor', preprocessor_new),
    ('model', model2),
])

In [305]:
my_pipeline3_new = Pipeline(steps=[
    ('preprocessor', preprocessor_new),
    ('model', model3),
])

## Single Tree GRIDSEARCH

In [306]:
new_total_cols = numerical_cols_new + categorical_cols_new

In [307]:
X_new = X_train_cleaned[new_total_cols]

In [308]:
X_new.describe()

Unnamed: 0,full_sq,0_13_female,7_14_female,work_female,0_13_all,0_17_male,young_female,0_17_female,raion_popul,7_14_male,...,0_6_male_ratio,0_6_female_ratio,7_14_male_ratio,7_14_female_ratio,0_17_male_ratio,0_17_female_ratio,16_29_male_ratio,16_29_female_ratio,0_13_male_ratio,0_13_female_ratio
count,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,...,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0,30471.0
mean,54.214269,4803.833153,2610.946507,26414.414821,9841.097535,6423.198517,5455.013948,6117.569525,84056.425552,2743.470349,...,0.512006,0.488335,0.513246,0.487045,0.513054,0.487108,0.502394,0.497677,0.512083,0.487928
std,38.031487,3536.907486,1941.380464,18643.132758,7290.007118,4769.593328,4020.546283,4491.734947,57871.285899,2054.52152,...,0.012428,0.012258,0.016302,0.016262,0.013144,0.013068,0.034702,0.034618,0.013473,0.01347
min,0.0,156.0,82.0,771.0,322.0,214.0,177.0,198.0,2546.0,87.0,...,0.459178,0.459625,0.420637,0.442599,0.452859,0.455447,0.412014,0.387667,0.444224,0.458632
25%,38.0,1512.0,743.0,6661.0,3112.0,1973.0,1677.0,1858.0,21819.0,821.0,...,0.505275,0.481637,0.508735,0.47745,0.510767,0.48084,0.4814,0.476543,0.510106,0.48315
50%,49.0,4667.0,2535.0,26092.0,9633.0,6085.0,5333.0,6185.0,83502.0,2693.0,...,0.51459,0.485714,0.515847,0.484897,0.515534,0.484466,0.494143,0.505857,0.515487,0.484513
75%,63.0,6699.0,3534.0,37942.0,13121.0,8599.0,7617.0,8549.0,122862.0,3585.0,...,0.519534,0.494725,0.52255,0.491265,0.520124,0.489233,0.523457,0.5186,0.51685,0.489894
max,5326.0,17461.0,9322.0,81668.0,36035.0,23233.0,19715.0,21937.0,247469.0,9761.0,...,0.540375,0.540822,0.557401,0.579363,0.544553,0.547141,0.61243,0.587986,0.541368,0.555776


In [309]:
grid_search_singletree_new = GridSearchCV(my_pipeline1_new, param_grid_singletree1, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [310]:
grid_search_singletree_new.fit(X_new,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END .........model__max_leaf_nodes=50;, score=-0.403 total time=   0.1s
[CV 2/5] END .........model__max_leaf_nodes=50;, score=-0.216 total time=   0.1s
[CV 3/5] END .........model__max_leaf_nodes=50;, score=-0.254 total time=   0.1s
[CV 4/5] END .........model__max_leaf_nodes=50;, score=-0.236 total time=   0.1s
[CV 5/5] END .........model__max_leaf_nodes=50;, score=-0.216 total time=   0.1s
[CV 1/5] END ........model__max_leaf_nodes=100;, score=-0.402 total time=   0.1s
[CV 2/5] END ........model__max_leaf_nodes=100;, score=-0.210 total time=   0.1s
[CV 3/5] END ........model__max_leaf_nodes=100;, score=-0.247 total time=   0.1s
[CV 4/5] END ........model__max_leaf_nodes=100;, score=-0.234 total time=   0.1s
[CV 5/5] END ........model__max_leaf_nodes=100;, score=-0.206 total time=   0.1s
[CV 1/5] END ........model__max_leaf_nodes=200;, score=-0.397 total time=   0.1s
[CV 2/5] END ........model__max_leaf_nodes=200;, 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['full_sq',
                                                                          '0_13_female',
                                                                          '7_14_female',
                                                                          'work_female',
                                                                          '0_13_all',
                                                                          '0_17_male',
                                                                          'young_female',
                                                                          '0_17_female',
                                  

In [311]:
grid_search_singletree_new.best_params_

{'model__max_leaf_nodes': 200}

In [312]:
grid_search_singletree_new.best_score_

-0.25846761542309815

## FOREST

In [313]:
grid_search_forest_new = GridSearchCV(my_pipeline2_new, param_grid_forest, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [314]:
grid_search_forest_new.fit(X_new,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5] END ...........model__n_estimators=50;, score=-0.408 total time=   6.6s
[CV 2/5] END ...........model__n_estimators=50;, score=-0.215 total time=   6.3s
[CV 3/5] END ...........model__n_estimators=50;, score=-0.257 total time=   6.5s
[CV 4/5] END ...........model__n_estimators=50;, score=-0.237 total time=   6.5s
[CV 5/5] END ...........model__n_estimators=50;, score=-0.210 total time=   6.5s
[CV 1/5] END ..........model__n_estimators=100;, score=-0.408 total time=  12.6s
[CV 2/5] END ..........model__n_estimators=100;, score=-0.214 total time=  12.7s
[CV 3/5] END ..........model__n_estimators=100;, score=-0.255 total time=  12.8s
[CV 4/5] END ..........model__n_estimators=100;, score=-0.237 total time=  13.3s
[CV 5/5] END ..........model__n_estimators=100;, score=-0.208 total time=  13.1s
[CV 1/5] END ..........model__n_estimators=200;, score=-0.407 total time=  25.8s
[CV 2/5] END ..........model__n_estimators=200;, 

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['full_sq',
                                                                          '0_13_female',
                                                                          '7_14_female',
                                                                          'work_female',
                                                                          '0_13_all',
                                                                          '0_17_male',
                                                                          'young_female',
                                                                          '0_17_female',
                                  

In [315]:
grid_search_forest_new.best_params_

{'model__n_estimators': 500}

In [316]:
grid_search_forest_new.best_score_

-0.26399157234763904

## XGBOOST

In [317]:
grid_search_xgboost_new = GridSearchCV(my_pipeline3_new, param_grid_xgboost, cv=5, scoring='neg_mean_squared_log_error', verbose=3)

In [318]:
grid_search_xgboost_new.fit(X_new,y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.327 total time=   0.4s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5] END model__learning_rate=0.03, model__n_estimators=50;, score=-0.206 total time=   0.5s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alar

[CV 1/5] END model__learning_rate=0.03, model__n_estimators=500;, score=-0.386 total time=   3.7s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5] END model__learning_rate=0.03, model__n_estimators=500;, score=-0.198 total time=   3.5s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 3/5] END model__learning_rate=0.03, model__n_estimators=500;, score=-0.233 total time=   3.7s
Parameters: { "early_stopping_rounds" } might not 

[CV 2/5] END model__learning_rate=0.05, model__n_estimators=100;, score=-0.198 total time=   1.0s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 3/5] END model__learning_rate=0.05, model__n_estimators=100;, score=-0.233 total time=   0.9s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 4/5] END model__learning_rate=0.05, model__n_estimators=100;, score=-0.213 total time=   1.1s
Parameters: { "early_stopping_rounds" } might not 

[CV 3/5] END model__learning_rate=0.05, model__n_estimators=1000;, score=-0.237 total time=   8.3s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 4/5] END model__learning_rate=0.05, model__n_estimators=1000;, score=-0.219 total time=   8.4s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 5/5] END model__learning_rate=0.05, model__n_estimators=1000;, score=-0.193 total time=   7.5s
Parameters: { "early_stopping_rounds" } might n

[CV 4/5] END model__learning_rate=0.07, model__n_estimators=200;, score=-0.213 total time=   1.6s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 5/5] END model__learning_rate=0.07, model__n_estimators=200;, score=-0.187 total time=   1.6s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/5] END model__learning_rate=0.07, model__n_estimators=500;, score=-0.394 total time=   3.8s
Parameters: { "early_stopping_rounds" } might not 

[CV 5/5] END model__learning_rate=0.1, model__n_estimators=50;, score=-0.189 total time=   0.5s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/5] END model__learning_rate=0.1, model__n_estimators=100;, score=-0.386 total time=   0.9s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5] END model__learning_rate=0.1, model__n_estimators=100;, score=-0.198 total time=   0.8s
Parameters: { "early_stopping_rounds" } might not be u

[CV 1/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=-0.400 total time=   8.5s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=-0.210 total time=   8.9s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 3/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=-0.249 total time=   8.6s
Parameters: { "early_stopping_rounds" } might not 

Traceback (most recent call last):
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
    scores = scorer(estimator, X_test, y_test)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 199, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_scorer.py", line 242, in _score
    return self._sign * self._score_func(y_true, y_pred,
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "C:\Users\zhang\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 413, in mean_squared_log_error
    raise ValueError("Mean Squared Logarithmic Error cannot be used when "
ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.

 -0.24306812 -0.24334969 -0.24501076 -0.24919648 -

[CV 5/5] END model__learning_rate=0.1, model__n_estimators=1000;, score=nan total time=   7.9s
Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         SimpleImputer(strategy='median'),
                                                                         ['full_sq',
                                                                          '0_13_female',
                                                                          '7_14_female',
                                                                          'work_female',
                                                                          '0_13_all',
                                                                          '0_17_male',
                                                                          'young_female',
                                                                          '0_17_female',
                                  

In [319]:
grid_search_xgboost_new.best_params_

{'model__learning_rate': 0.05, 'model__n_estimators': 50}

In [320]:
grid_search_xgboost_new.best_score_

-0.23892618283781347