In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
import catboost as cb
import optuna
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv
/kaggle/input/tabular-playground-series-feb-2021/train.csv
/kaggle/input/tabular-playground-series-feb-2021/test.csv


In [2]:
train = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/train.csv')
test = pd.read_csv('/kaggle/input/tabular-playground-series-feb-2021/test.csv')
submission = pd.read_csv( '/kaggle/input/tabular-playground-series-feb-2021/sample_submission.csv', index_col='id')

In [3]:
data = pd.concat([train, test])

In [4]:
# scaler = StandardScaler()
# data.iloc[:,11:-1]= scaler.fit_transform(data.iloc[:,11:-1])

In [5]:

data = pd.get_dummies(data)
train = data[:300000]
test = data[300000:]
del data

In [6]:
X=train.drop(['id', 'target'],axis=1)
y=train.target
test = test.drop(['id', 'target'],axis=1)

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=23)

In [8]:
categ_features = X.iloc[:,14:].columns

In [11]:
def objective(trial):
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)
    params = {
        'cat_features': categ_features,
        'eval_metric': 'RMSE',
        'loss_function' :'RMSE',
        'random_state': 23,
        'use_best_model':True,
        'task_type': 'GPU',
        'iterations': trial.suggest_int('iterations', 100, 10000),
        'depth': trial.suggest_int('depth', 2, 16),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1.0),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 100),
#         'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 1e-2, 10.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
    }
    
    model = cb.CatBoostRegressor(**params)
    
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=0, early_stopping_rounds=200)
    predictions = model.predict(X_valid)
    score = mean_squared_error(y_valid, predictions)
    return score

In [12]:
%%time
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=40)

[32m[I 2021-02-26 18:27:54,268][0m A new study created in memory with name: no-name-79f6d648-a3ed-482a-85ea-6ea6cf2c66ff[0m
[32m[I 2021-02-26 18:30:11,420][0m Trial 0 finished with value: 0.7808049826884593 and parameters: {'iterations': 2049, 'depth': 2, 'learning_rate': 0.0008205761099115174, 'l2_leaf_reg': 6.233469768268151, 'bagging_temperature': 0.3984055179876004, 'min_child_samples': 15}. Best is trial 0 with value: 0.7808049826884593.[0m
[32m[I 2021-02-26 18:31:52,661][0m Trial 1 finished with value: 0.7139542928559558 and parameters: {'iterations': 5218, 'depth': 5, 'learning_rate': 0.027374458133222687, 'l2_leaf_reg': 5.7578023556884115e-05, 'bagging_temperature': 0.6180657263771288, 'min_child_samples': 99}. Best is trial 1 with value: 0.7139542928559558.[0m
[32m[I 2021-02-26 18:33:04,586][0m Trial 2 finished with value: 0.7142668339229633 and parameters: {'iterations': 4894, 'depth': 5, 'learning_rate': 0.2693565442338347, 'l2_leaf_reg': 5.764374178897066e-07, 'b

[32m[I 2021-02-26 19:09:58,214][0m Trial 25 finished with value: 0.7123603591841516 and parameters: {'iterations': 8579, 'depth': 3, 'learning_rate': 0.2532937545618855, 'l2_leaf_reg': 1.0901864470245403e-06, 'bagging_temperature': 0.14406678953758079, 'min_child_samples': 89}. Best is trial 17 with value: 0.7114697776224905.[0m
[32m[I 2021-02-26 19:11:12,580][0m Trial 26 finished with value: 0.7192320936243212 and parameters: {'iterations': 4149, 'depth': 3, 'learning_rate': 0.2863563521682685, 'l2_leaf_reg': 7.365708091608343e-07, 'bagging_temperature': 0.1995005775979305, 'min_child_samples': 92}. Best is trial 17 with value: 0.7114697776224905.[0m
[32m[I 2021-02-26 19:12:25,814][0m Trial 27 finished with value: 0.7113293879285506 and parameters: {'iterations': 8315, 'depth': 3, 'learning_rate': 0.5857687310544913, 'l2_leaf_reg': 2.364246562962328e-06, 'bagging_temperature': 0.35012527515317604, 'min_child_samples': 90}. Best is trial 27 with value: 0.7113293879285506.[0m


CPU times: user 1h 3min 33s, sys: 6min 17s, total: 1h 9min 51s
Wall time: 1h 1min 53s


In [13]:
trial = study.best_trial
print(f" best RMSE: {trial.value}")
print("best params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

 best RMSE: 0.7113293879285506
best params: 
    iterations: 8315
    depth: 3
    learning_rate: 0.5857687310544913
    l2_leaf_reg: 2.364246562962328e-06
    bagging_temperature: 0.35012527515317604
    min_child_samples: 90


In [None]:
# %%time
# params1 = { 
#     'iterations': 7827,
#     'depth': 6, 
#     'learning_rate': 0.06115296942277834, 
#     'l2_leaf_reg': 0.00018537808841101856, 
#     'bagging_temperature': 7.2769130916283125, 
#     'min_child_samples': 95
# }


# model_cb = cb.CatBoostRegressor(**params1, cat_features=categ_features ,
#                                 loss_function='RMSE', eval_metric='RMSE')

# model_cb.fit(X, y, cat_features=categ_features)
# submission['target'] = model_cb.predict(test)
# submission.to_csv('catboost4.csv')

In [None]:
# %%time
# params2 = {
#     'iterations': 6961, 
#     'depth': 13, 
#     'learning_rate': 0.1420713844582006, 
#     'l2_leaf_reg': 0.0019005856806734837, 
#     'bagging_temperature': 5.060337163809838, 
#     'min_child_samples': 49
# }


# model_cb = cb.CatBoostRegressor(**params2, cat_features=categ_features ,
#                                 loss_function='RMSE', eval_metric='RMSE')

# model_cb.fit(X, y, cat_features=categ_features)
# submission['target'] = model_cb.predict(test)
# submission.to_csv('catboost3.csv')

In [14]:
n_folds = 10
train_oof = np.zeros((300000,))
test_preds = 0

skf = KFold(n_splits=n_folds, random_state=23, shuffle=True)

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    
    print(f'FOLD    {fold+1}')
    
    X_train, X_valid = pd.DataFrame(X.iloc[train_index]), pd.DataFrame(X.iloc[test_index])
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
       
    cat_params = { 
        'iterations': 7827,
        'depth': 6, 
        'learning_rate': 0.06115296942277834, 
        'l2_leaf_reg': 0.00018537808841101856, 
        'bagging_temperature': 7.2769130916283125, 
        'min_child_samples': 95
    }
    
    model = cb.CatBoostRegressor(
        **cat_params
    )
    model.fit(
        X_train, 
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=500,
        early_stopping_rounds=200,
    )
    preds = model.predict(X_valid)
    test_preds += model.predict(test) / n_folds
    train_oof[test_index] = preds
    print("")
    

print(f": RMSE = {mean_squared_error(y, train_oof)}")

submission['target'] = test_preds
submission.to_csv('catboost_optuna_scaled.csv')

FOLD    1
0:	learn: 0.8854431	test: 0.8856635	best: 0.8856635 (0)	total: 61ms	remaining: 7m 57s
500:	learn: 0.8380577	test: 0.8426577	best: 0.8426577 (500)	total: 25.5s	remaining: 6m 13s
1000:	learn: 0.8282445	test: 0.8412879	best: 0.8412879 (1000)	total: 50.9s	remaining: 5m 46s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8411194697
bestIteration = 1135

Shrink model to first 1136 iterations.

FOLD    2
0:	learn: 0.8850368	test: 0.8874778	best: 0.8874778 (0)	total: 50ms	remaining: 6m 30s
500:	learn: 0.8376263	test: 0.8461418	best: 0.8461418 (500)	total: 24.9s	remaining: 6m 4s
1000:	learn: 0.8276183	test: 0.8450314	best: 0.8449502 (926)	total: 50.4s	remaining: 5m 43s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8448790229
bestIteration = 1220

Shrink model to first 1221 iterations.

FOLD    3
0:	learn: 0.8854179	test: 0.8857940	best: 0.8857940 (0)	total: 48.9ms	remaining: 6m 22s
500:	learn: 0.8376292	test: 0.8461892	best: 0.8461892 (500)	