In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostRegressor

In [2]:
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
df = pd.read_csv("C:/Users\monster\Desktop/NonLinearRegressionModels\RandomForest\Hitters.csv")
df = df.dropna()
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(["Salary", "League", "Division", "NewLeague"], axis = 1).astype("float64")
X = pd.concat([X_, dms[["League_N", "Division_W", "NewLeague_N"]]], axis = 1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42)

In [4]:
catb_model = CatBoostRegressor().fit(X_train, y_train)

Learning rate set to 0.029229
0:	learn: 438.1974206	total: 53.2ms	remaining: 53.2s
1:	learn: 432.4168868	total: 54.6ms	remaining: 27.2s
2:	learn: 426.3836690	total: 56.2ms	remaining: 18.7s
3:	learn: 420.2261014	total: 57.6ms	remaining: 14.3s
4:	learn: 414.9976675	total: 58.8ms	remaining: 11.7s
5:	learn: 409.6125323	total: 60.1ms	remaining: 9.95s
6:	learn: 403.9277911	total: 61.3ms	remaining: 8.7s
7:	learn: 398.4395285	total: 62.6ms	remaining: 7.76s
8:	learn: 392.4517081	total: 63.9ms	remaining: 7.04s
9:	learn: 387.4871123	total: 65.2ms	remaining: 6.45s
10:	learn: 382.6230510	total: 66.5ms	remaining: 5.98s
11:	learn: 378.1012454	total: 68.4ms	remaining: 5.63s
12:	learn: 372.6002306	total: 70.2ms	remaining: 5.33s
13:	learn: 368.4682192	total: 72.4ms	remaining: 5.1s
14:	learn: 364.0565766	total: 73.7ms	remaining: 4.84s
15:	learn: 359.5683249	total: 75.1ms	remaining: 4.62s
16:	learn: 355.1782794	total: 76.8ms	remaining: 4.44s
17:	learn: 350.4689946	total: 78.2ms	remaining: 4.26s
18:	learn:

In [6]:
catb_params = { "iterations": [200, 500, 1000], "learning_rate": [0.1, 0.01], "depth": [3, 6, 8]}

In [7]:
catb_cv_model = GridSearchCV(catb_model, catb_params, cv = 10, n_jobs = -1, verbose = 2 ).fit(X_train, y_train)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  1.6min


0:	learn: 425.7900818	total: 1.05ms	remaining: 209ms
1:	learn: 404.8723520	total: 1.68ms	remaining: 166ms
2:	learn: 387.4057666	total: 2.23ms	remaining: 147ms
3:	learn: 372.2801584	total: 2.79ms	remaining: 137ms
4:	learn: 358.9204229	total: 3.37ms	remaining: 132ms
5:	learn: 347.0083933	total: 3.93ms	remaining: 127ms
6:	learn: 336.0130818	total: 4.45ms	remaining: 123ms
7:	learn: 324.3923300	total: 4.98ms	remaining: 120ms
8:	learn: 314.8690957	total: 5.54ms	remaining: 118ms
9:	learn: 308.5075563	total: 6.08ms	remaining: 115ms
10:	learn: 298.8587285	total: 6.73ms	remaining: 116ms
11:	learn: 294.7655438	total: 7.29ms	remaining: 114ms
12:	learn: 288.0697862	total: 7.86ms	remaining: 113ms
13:	learn: 282.6697154	total: 8.51ms	remaining: 113ms
14:	learn: 277.6121667	total: 9.06ms	remaining: 112ms
15:	learn: 273.4383979	total: 9.75ms	remaining: 112ms
16:	learn: 269.1556201	total: 10.3ms	remaining: 111ms
17:	learn: 264.8098704	total: 10.9ms	remaining: 110ms
18:	learn: 261.6700768	total: 11.4ms	r

[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  3.3min finished


In [8]:
catb_cv_model.best_params_

{'depth': 3, 'iterations': 200, 'learning_rate': 0.1}

In [21]:
catb_tuned = CatBoostRegressor(depth = 3, iterations = 400, learning_rate = 0.1).fit(X_train, y_train)

0:	learn: 425.7900818	total: 868us	remaining: 347ms
1:	learn: 404.8723520	total: 1.73ms	remaining: 344ms
2:	learn: 387.4057666	total: 2.6ms	remaining: 345ms
3:	learn: 372.2801584	total: 3.46ms	remaining: 343ms
4:	learn: 358.9204229	total: 4.08ms	remaining: 322ms
5:	learn: 347.0083933	total: 4.83ms	remaining: 317ms
6:	learn: 336.0130818	total: 5.68ms	remaining: 319ms
7:	learn: 324.3923300	total: 6.59ms	remaining: 323ms
8:	learn: 314.8690957	total: 7.42ms	remaining: 322ms
9:	learn: 308.5075563	total: 8.51ms	remaining: 332ms
10:	learn: 298.8587285	total: 9.15ms	remaining: 324ms
11:	learn: 294.7655438	total: 9.93ms	remaining: 321ms
12:	learn: 288.0697862	total: 10.7ms	remaining: 320ms
13:	learn: 282.6697154	total: 11.5ms	remaining: 317ms
14:	learn: 277.6121667	total: 12.3ms	remaining: 315ms
15:	learn: 273.4383979	total: 13ms	remaining: 313ms
16:	learn: 269.1556201	total: 13.7ms	remaining: 309ms
17:	learn: 264.8098704	total: 14.4ms	remaining: 305ms
18:	learn: 261.6700768	total: 15ms	remaini

In [22]:
y_pred = catb_tuned.predict(X_test)

In [23]:
np.sqrt(mean_squared_error(y_test, y_pred))

334.918434796382