In [14]:
import zipfile
import numpy as np
import pandas as pd 

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
import catboost as cb
import optuna

In [17]:
with zipfile.ZipFile("tabular-playground-series-feb-2021.zip") as z:
    with z.open("train.csv") as f:
        train = pd.read_csv(f)
    with z.open("test.csv") as f:
        test = pd.read_csv(f)
    with z.open("sample_submission.csv") as f:
        submission = pd.read_csv(f, index_col='id')

In [18]:
data = pd.concat([train, test])

In [19]:
scaler = StandardScaler()
data.iloc[:,11:-1]= scaler.fit_transform(data.iloc[:,11:-1])

In [20]:
data = pd.get_dummies(data)
train = data[:300000]
test = data[300000:]
del data

In [21]:
X=train.drop(['id', 'target'],axis=1)
y=train.target
test = test.drop(['id', 'target'],axis=1)

In [22]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=23)

In [23]:
categ_features = X.iloc[:,14:].columns

In [27]:
def objective(trial):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)
    params = {
        'cat_features': categ_features,
        'eval_metric': 'RMSE',
        'loss_function' :'RMSE',
        'random_state': 23,
        'use_best_model':True,
        'task_type': 'CPU',
        'iterations': trial.suggest_int('iterations', 100, 10000),
        'depth': trial.suggest_int('depth', 2, 16),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1.0),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 100),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 1e-2, 10.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    
    model = cb.CatBoostRegressor(**params)
    
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0, early_stopping_rounds=200)
    predictions = model.predict(X_valid)
    score = mean_squared_error(y_valid, predictions)
    return score

In [28]:
%%time
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=40)

[32m[I 2021-03-05 16:13:54,115][0m A new study created in memory with name: no-name-3d6a2ac7-b70c-4e1d-ba0d-4c9e7b79ae07[0m
[32m[I 2021-03-05 16:19:20,959][0m Trial 0 finished with value: 0.7118742210171839 and parameters: {'iterations': 6812, 'depth': 2, 'learning_rate': 0.06379209223785176, 'l2_leaf_reg': 5.60140306468175e-07, 'bagging_temperature': 5.692545094176539, 'min_child_samples': 70}. Best is trial 0 with value: 0.7118742210171839.[0m
[32m[I 2021-03-05 16:22:12,747][0m Trial 1 finished with value: 0.7173182588741599 and parameters: {'iterations': 6770, 'depth': 8, 'learning_rate': 0.18895330237152505, 'l2_leaf_reg': 0.0007600652404267158, 'bagging_temperature': 0.013410483561309187, 'min_child_samples': 75}. Best is trial 0 with value: 0.7118742210171839.[0m
[32m[I 2021-03-05 16:25:09,218][0m Trial 2 finished with value: 0.7165453942003709 and parameters: {'iterations': 6282, 'depth': 8, 'learning_rate': 0.09632929062950474, 'l2_leaf_reg': 0.005584101482260027, 'b

KeyboardInterrupt: 

In [13]:
trial = study.best_trial
print(f" best RMSE: {trial.value}")
print("best params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

 best RMSE: 0.7113293879285506
best params: 
    iterations: 8315
    depth: 3
    learning_rate: 0.5857687310544913
    l2_leaf_reg: 2.364246562962328e-06
    bagging_temperature: 0.35012527515317604
    min_child_samples: 90


In [14]:
n_folds = 10
train_oof = np.zeros((300000,))
test_preds = 0

skf = KFold(n_splits=n_folds, random_state=23, shuffle=True)

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    
    print(f'FOLD    {fold+1}')
    
    X_train, X_valid = pd.DataFrame(X.iloc[train_index]), pd.DataFrame(X.iloc[test_index])
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
       
    cat_params = { 
        'iterations': 7827,
        'depth': 6, 
        'learning_rate': 0.06115296942277834, 
        'l2_leaf_reg': 0.00018537808841101856, 
        'bagging_temperature': 7.2769130916283125, 
        'min_child_samples': 95
    }
    
    model = cb.CatBoostRegressor(
        **cat_params
    )
    model.fit(
        X_train, 
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=500,
        early_stopping_rounds=200,
    )
    preds = model.predict(X_valid)
    test_preds += model.predict(test) / n_folds
    train_oof[test_index] = preds
    print("")
    

print(f": RMSE = {mean_squared_error(y, train_oof)}")

submission['target'] = test_preds
submission.to_csv('catboost_optuna_scaled.csv')

FOLD    1
0:	learn: 0.8854431	test: 0.8856635	best: 0.8856635 (0)	total: 61ms	remaining: 7m 57s
500:	learn: 0.8380577	test: 0.8426577	best: 0.8426577 (500)	total: 25.5s	remaining: 6m 13s
1000:	learn: 0.8282445	test: 0.8412879	best: 0.8412879 (1000)	total: 50.9s	remaining: 5m 46s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8411194697
bestIteration = 1135

Shrink model to first 1136 iterations.

FOLD    2
0:	learn: 0.8850368	test: 0.8874778	best: 0.8874778 (0)	total: 50ms	remaining: 6m 30s
500:	learn: 0.8376263	test: 0.8461418	best: 0.8461418 (500)	total: 24.9s	remaining: 6m 4s
1000:	learn: 0.8276183	test: 0.8450314	best: 0.8449502 (926)	total: 50.4s	remaining: 5m 43s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8448790229
bestIteration = 1220

Shrink model to first 1221 iterations.

FOLD    3
0:	learn: 0.8854179	test: 0.8857940	best: 0.8857940 (0)	total: 48.9ms	remaining: 6m 22s
500:	learn: 0.8376292	test: 0.8461892	best: 0.8461892 (500)	