In [1]:
import sys
import os
import sklearn
import re

import numpy as np
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import pandas_profiling as pdpf

import seaborn as sns 

import tsfresh
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh.feature_extraction import ComprehensiveFCParameters
from tsfresh.feature_extraction.settings import from_columns
from tsfresh.utilities.dataframe_functions import impute_dataframe_range

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

mpl.rc('axes', labelsize=10)
mpl.rc('xtick', labelsize=8)
mpl.rc('ytick', labelsize=8)

In [2]:
FILE_PATH = "/kaggle/input/wids-23"
def load_data(filename, path = FILE_PATH):
    return pd.read_csv(os.path.join(path, filename))
train_data = load_data(filename = "train_data.csv")

In [3]:
def reduce_mem_usage(dataframe, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_memory = dataframe.memory_usage().sum() / 1024**2
    for col in dataframe.columns:
        col_type = dataframe[col].dtypes
        if col_type in numerics:
            c_min = dataframe[col].min()
            c_max = dataframe[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    dataframe[col] = dataframe[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    dataframe[col] = dataframe[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    dataframe[col] = dataframe[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    dataframe[col] = dataframe[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    dataframe[col] = dataframe[col].astype(np.float32)
                else:
                    dataframe[col] = dataframe[col].astype(np.float64)
    end_memory = dataframe.memory_usage().sum() / 1024**2
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_memory, 100 * (start_memory - end_memory) / start_memory)) if verbose else print('Reduced to {:5.2f}'.format(end_memory))
    return dataframe

In [4]:
train_data = reduce_mem_usage(train_data)

Mem. usage decreased to 352.24 Mb (50.1% reduction)


In [5]:
# fc_params = ['contest-pevpr-sfc-gauss-14d@pevpr__minimum', 
#              'contest-slp-14d@slp__standard_deviation', 
#              'contest-wind-h100-14d@wind-hgt-100__maximum', 
#              'contest-wind-h850-14d@wind-hgt-850__absolute_maximum',
#              'contest-wind-uwnd-9@5-14d@wind-uwnd-9@5__sum_values',
#              'contest-precip-14d@precip__minimum', 'nasa0__absolute_maximum',
#              'contest-wind-uwnd-@50-14d@wind-uwnd-@50__variance',
#              'contest-rhum-sig995-14d@rhum__variance', 
#              'contest-wind-h850-14d@wind-hgt-850__standard_deviation',
#              'contest-pevpr-sfc-gauss-14d@pevpr__absolute_maximum', 
#              'contest-wind-vwnd-@50-14d@wind-vwnd-@50__absolute_maximum', 
#              'nmme-prate-56w@gfdl__minimum', 'contest-rhum-sig995-14d@rhum__maximum', 
#              'contest-slp-14d@slp__median', 'contest-wind-vwnd-9@5-14d@wind-vwnd-9@5__mean', 
#              'nmme-prate-56w@ccsm3__minimum', 'nmme-prate-34w@ccsm3__minimum', 'nmme0-prate-34w@ccsm30__minimum',
#              'elevation@elevation__absolute_maximum', 
#              'contest-wind-vwnd-9@5-14d@wind-vwnd-9@5__maximum', 
#              'contest-wind-uwnd-9@5-14d@wind-uwnd-9@5__root_mean_square', 
#              'nmme0-prate-56w@cancm30__minimum', 
#              'contest-precip-14d@precip__standard_deviation', 
#              'ccsm30__variance', 'nmme-prate-34w@ccsm3__variance', 
#              'nmme0-prate-34w@cfsv@0__minimum', 'nmme0-prate-34w@cancm40__minimum', 
#              'contest-wind-uwnd-@50-14d@wind-uwnd-@50__root_mean_square', 
#              'contest-wind-vwnd-9@5-14d@wind-vwnd-9@5__minimum', 
#              'nmme-prate-56w@gfdl__median', 'nmme0-prate-34w@ccsm40__minimum', 
#              'nmme0-prate-34w@gfdl0__standard_deviation', 'nmme-prate-34w@nasa__median', 
#              'nmme-prate-34w@gfdlflora__standard_deviation']

In [6]:
col_na = [
    'nmme0-tmp2m-34w__ccsm30', 
    'nmme-tmp2m-56w__ccsm3', 
    'nmme-prate-34w__ccsm3', 
    'nmme0-prate-56w__ccsm30', 
    'nmme0-prate-34w__ccsm30', 
    'nmme-prate-56w__ccsm3', 
    'nmme-tmp2m-34w__ccsm3']

g_means =  ['nmme0-tmp2m-34w__nmme0mean', 
 'nmme-tmp2m-56w__nmmemean', 
 'nmme-prate-34w__nmmemean', 
 'nmme0-prate-56w__nmme0mean', 
 'nmme0-prate-34w__nmme0mean', 
 'nmme-prate-56w__nmmemean', 
 'nmme-tmp2m-34w__nmmemean']


g_1 = ['nmme0-tmp2m-34w__cancm30',
'nmme0-tmp2m-34w__cancm40',
'nmme0-tmp2m-34w__ccsm40',
'nmme0-tmp2m-34w__cfsv20',
'nmme0-tmp2m-34w__gfdlflora0',
'nmme0-tmp2m-34w__gfdlflorb0',
'nmme0-tmp2m-34w__gfdl0',
'nmme0-tmp2m-34w__nasa0']

g_2 = ['nmme-tmp2m-56w__cancm3',
'nmme-tmp2m-56w__cancm4',
'nmme-tmp2m-56w__ccsm4',
'nmme-tmp2m-56w__cfsv2',
'nmme-tmp2m-56w__gfdl',
'nmme-tmp2m-56w__gfdlflora',
'nmme-tmp2m-56w__gfdlflorb',
'nmme-tmp2m-56w__nasa']

g_3 = ['nmme-prate-34w__cancm3',
'nmme-prate-34w__cancm4',
'nmme-prate-34w__ccsm4',
'nmme-prate-34w__cfsv2',
'nmme-prate-34w__gfdl',
'nmme-prate-34w__gfdlflora',
'nmme-prate-34w__gfdlflorb',
'nmme-prate-34w__nasa']

g_4 = [ 'nmme0-prate-56w__cancm30',
'nmme0-prate-56w__cancm40',
'nmme0-prate-56w__ccsm40',
'nmme0-prate-56w__cfsv20',
'nmme0-prate-56w__gfdlflora0',
'nmme0-prate-56w__gfdlflorb0',
'nmme0-prate-56w__gfdl0',
'nmme0-prate-56w__nasa0']

g_5 = ['nmme0-prate-34w__cancm30',
'nmme0-prate-34w__cancm40',
'nmme0-prate-34w__ccsm40',
'nmme0-prate-34w__cfsv20',
'nmme0-prate-34w__gfdlflora0',
'nmme0-prate-34w__gfdlflorb0',
'nmme0-prate-34w__gfdl0',
'nmme0-prate-34w__nasa0']

g_6 = ['nmme-prate-56w__cancm3',
'nmme-prate-56w__cancm4',
'nmme-prate-56w__ccsm4',
'nmme-prate-56w__cfsv2',
'nmme-prate-56w__gfdl',
'nmme-prate-56w__gfdlflora',
'nmme-prate-56w__gfdlflorb',
'nmme-prate-56w__nasa']

g_7 = ['nmme-tmp2m-34w__cancm3',
'nmme-tmp2m-34w__cancm4',
'nmme-tmp2m-34w__ccsm4',
'nmme-tmp2m-34w__cfsv2',
'nmme-tmp2m-34w__gfdl',
'nmme-tmp2m-34w__gfdlflora',
'nmme-tmp2m-34w__gfdlflorb',
'nmme-tmp2m-34w__nasa']


In [7]:
nmme_tm2m_34w_features = train_data.columns[train_data.columns.str.startswith('nmme0-tmp2m-34w')].tolist()
nmme_tm2m_56w_features = train_data.columns[train_data.columns.str.startswith('nmme-tmp2m-56w')].tolist()
c14 = [c for c in train_data.columns if "14" in c][:14]
good_feats = nmme_tm2m_34w_features+nmme_tm2m_56w_features+c14

In [8]:
target = 'contest-tmp2m-14d__tmp2m'

def scaling(train, test):
    std_scaler = StandardScaler()
    not_to_touch = ['index',
                    'id',
                    'startdate',
                    'climateregions__climateregion',
                    'lat',
                    'lon',
                    'contest-tmp2m-14d__tmp2m']
    num_data = [f for f in train.select_dtypes(include= [np.number]).columns if f not in not_to_touch]
    train[num_data] = std_scaler.fit_transform(train[num_data])
    test[num_data] = std_scaler.transform(test[num_data])
    return train, test

def create_id(train, test):
    df = pd.concat([train, test], axis= 0)
    df['id'] = df.groupby(['lat', 'lon'], axis= 0).ngroup()
    df = df.drop(['lat', 'lon'], axis= 1)
    train = df.iloc[:len(train)]
    test = df.iloc[len(train):].drop(target, axis= 1)
    return train, test


def fill_na(data):
    df = data
    missing_columns = df.columns[df.isnull().any()].to_list()
    for col in missing_columns:
        df[col].ffill()
    return df

def fill_na_ver(data):
    df = data
    gs = [g_1, g_2, g_3, g_4, g_5, g_6, g_7]
    zip_cols = zip(col_na, gs, g_means)
    for c, g, m in zip_cols:
        df[c] = (df[m]*9) - df[g].sum(1)
        df = df.ffill()
    return df

def high_corr(data, threshold= 0.8):
    not_to_touch = ['index',
                    'id',
                    'startdate',
                    'climateregions__climateregion',
                    'lat',
                    'lon',
                    'contest-tmp2m-14d__tmp2m']
    df = data
    corr_matrix = df.corr().abs()
    mask = np.triu(np.ones_like(corr_matrix, dtype= bool))
    reduced_matrix = corr_matrix.mask(mask)
    feats_to_drop = [c for c in reduced_matrix.columns if any(reduced_matrix[c] > threshold)]
    feats_to_drop = [f for f in feats_to_drop if f not in not_to_touch]
    return feats_to_drop


def cat_encoding(train, test):
    cat_encoder = LabelEncoder()
    train['climateregions__climateregion'] = cat_encoder.fit_transform(train['climateregions__climateregion'])
    test['climateregions__climateregion'] = cat_encoder.transform(test['climateregions__climateregion'])
    return train, test

def process_date(train, test):
    df = pd.concat([train, test], axis= 0)
    df['day'] = pd.DatetimeIndex(df['startdate']).day
    df['month'] = pd.DatetimeIndex(df['startdate']).month
    df['quarter'] = pd.DatetimeIndex(df['startdate']).quarter
    df = df.drop('startdate', axis= 1)
    train = df.iloc[:len(train)]
    test = df.iloc[len(train):].drop(target, axis= 1)
    return train, test
    
def preprocess(train, test):
    train, test = create_id(train, test)
    train, test = process_date(train, test)
    train = fill_na_ver(train)
    feats_to_drop = high_corr(train)
    train, test = scaling(train, test)
    train, test = cat_encoding(train, test)
    y_train = train[target]
    x_train = train.drop(target, axis= 1)
    return x_train, y_train, test, feats_to_drop

In [9]:
test_data = load_data("test_data.csv")
test_data = reduce_mem_usage(test_data)

Mem. usage decreased to 29.48 Mb (49.7% reduction)


In [10]:
x_train, y_train, test, feats_to_drop = preprocess(train_data, test_data)

In [11]:
features = [c for c in x_train.columns if c not in feats_to_drop]

In [12]:
c14 = [c for c in x_train.columns if "14" in c][:14]+nmme_tm2m_34w_features+nmme_tm2m_56w_features

In [13]:
import optuna
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import Pool, cv, CatBoostRegressor
from xgboost import XGBRegressor
from sklearn import linear_model
import warnings
warnings.filterwarnings('ignore')

### Lasso

In [14]:
rr = []
rr1 = []
rr2 = []

for j in range(x_train["id"].nunique()):

    y1 = y_train[x_train["id"]==j] 
    x1 = x_train.loc[x_train["id"]==j][c14]
    test1 = test.loc[test["id"]==j][c14]
    
    p = -28
    
    x1_conc = pd.concat([x1, test1])
    x1_conc[nmme_tm2m_34w_features] = x1_conc[nmme_tm2m_34w_features].shift(-14).ffill()
    x1_conc[nmme_tm2m_56w_features] = x1_conc[nmme_tm2m_56w_features].shift(p).ffill(limit=20)
    x2 = x1_conc.iloc[:-61,:]
    test2 = x1_conc.iloc[-61:p+20,:]
    

    
    index = x_train.loc[x_train["id"]==j]["index"]
    index1 = test.loc[test["id"]==j]["index"]
    index2 = index1[:p+20]

    
    clf = linear_model.Lasso(alpha=0.019, max_iter=10000)
    
    clf.fit(x1, y1)
    train_res= clf.predict(x1)
    test_res = clf.predict(test1)
    
    clf.fit(x2, y1)
    train_res_1= clf.predict(x2)
    test_res_1 = clf.predict(test2)
    
    
    


    df_test   = pd.DataFrame(data=test_res,  columns = ["contest-tmp2m-14d__tmp2m"], index=index1)
    df_test_1   = pd.DataFrame(data=test_res_1,  columns = ["contest-tmp2m-14d__tmp2m"], index=index2)
    
    y_copy = y_train.copy()
    y_copy[x_train["id"]==j] = y_copy[x_train["id"]==j] - train_res
    
    rr.append(df_test)
    rr1.append(df_test_1)

In [15]:
sub = pd.concat(rr).reset_index()
sub1 = pd.concat(rr1).reset_index()
sub.loc[sub["index"].isin(sub1["index"].values),'contest-tmp2m-14d__tmp2m'] = sub1['contest-tmp2m-14d__tmp2m'].values

In [16]:
def split_train_eval(x, y, test_ratio):
    test_set_size = int(len(x) * (1 - test_ratio))
    return x.iloc[:test_set_size], y.iloc[:test_set_size], x.iloc[test_set_size:], y.iloc[test_set_size:]

In [17]:
x_train_final = x_train[features]
x_train_set, y_train_set, x_eval_set, y_eval_set = split_train_eval(x_train_final, y_train, 0.3)

### LGBM with optuna

In [18]:
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'objective': 'regression',
#               'max_depth': -1,
        'learning_rate': 0.1,
        "boosting": "gbdt",
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0),
        "bagging_freq": 5,
        "bagging_fraction": trial.suggest_float('bagging_fraction', 0.1, 1.0),
        "feature_fraction": trial.suggest_float('feature_fraction', 0.4, 1.0),
        "metric": 'rmse',
        "verbosity": 0,
        "num_boost_round": 1000,
        "early_stopping_rounds": 10
    }
    d_train = lgb.Dataset(x_train_set, label= y_train_set)
    d_valid = lgb.Dataset(x_eval_set, label= y_eval_set)
    watchlist = [d_valid]
    pruning_callback = optuna.integration.LightGBMPruningCallback(trial, 'rmse')
    model = lgb.train(params,
                      train_set=d_train,
                      valid_sets=watchlist,
                      callbacks=[pruning_callback])

    y_pred = model.predict(x_eval_set, num_iteration=model.best_iteration)
    score = mean_squared_error(y_eval_set, y_pred, squared= False)
    return score

In [19]:
study_lgbm = optuna.create_study(direction= 'minimize', study_name= "lgbm opt")
study_lgbm.optimize(objective, n_trials= 50)
print('Best value: ', study_lgbm.best_value)
print('Best params: ', study_lgbm.best_params)

[32m[I 2023-03-19 11:12:25,232][0m A new study created in memory with name: lgbm opt[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:12:53,170][0m Trial 0 finished with value: 2.204701262654878 and parameters: {'num_leaves': 131, 'lambda_l1': 6.275322266205531, 'lambda_l2': 1.3606079157210431, 'bagging_fraction': 0.973099331546361, 'feature_fraction': 0.6416473130261418}. Best is trial 0 with value: 2.204701262654878.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:13:26,691][0m Trial 1 finished with value: 2.1414760122763172 and parameters: {'num_leaves': 193, 'lambda_l1': 5.529143567670685, 'lambda_l2': 3.5353425438207093, 'bagging_fraction': 0.7938408257184608, 'feature_fraction': 0.7262754473464605}. Best is trial 1 with value: 2.1414760122763172.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:13:43,709][0m Trial 2 finished with value: 2.2595557821931296 and parameters: {'num_leaves': 14, 'lambda_l1': 6.467186903626303, 'lambda_l2': 7.036081005853026, 'bagging_fraction': 0.38491146774529195, 'feature_fraction': 0.8945809321896199}. Best is trial 1 with value: 2.1414760122763172.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:14:04,268][0m Trial 3 finished with value: 2.1824893469857702 and parameters: {'num_leaves': 38, 'lambda_l1': 8.77288526442981, 'lambda_l2': 4.001823155910374, 'bagging_fraction': 0.3907425917069304, 'feature_fraction': 0.8792669827479871}. Best is trial 1 with value: 2.1414760122763172.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:14:21,683][0m Trial 4 finished with value: 2.2052633017006436 and parameters: {'num_leaves': 141, 'lambda_l1': 7.386126996016397, 'lambda_l2': 1.0820818468976428, 'bagging_fraction': 0.1668139711522159, 'feature_fraction': 0.7496315254539025}. Best is trial 1 with value: 2.1414760122763172.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:14:28,469][0m Trial 5 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:14:34,822][0m Trial 6 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:14:41,237][0m Trial 7 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:14:48,289][0m Trial 8 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:14:54,486][0m Trial 9 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:15:01,211][0m Trial 10 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:15:08,356][0m Trial 11 pruned. Trial was pruned at iteration 3.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:15:14,914][0m Trial 12 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:15:56,415][0m Trial 13 finished with value: 2.2123806618252186 and parameters: {'num_leaves': 244, 'lambda_l1': 3.9034441823973522, 'lambda_l2': 2.761083452619702, 'bagging_fraction': 0.9515919959073742, 'feature_fraction': 0.9998427767750403}. Best is trial 1 with value: 2.1414760122763172.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:16:02,823][0m Trial 14 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:16:08,985][0m Trial 15 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:16:15,580][0m Trial 16 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:16:22,102][0m Trial 17 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:16:28,779][0m Trial 18 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:16:45,471][0m Trial 19 finished with value: 2.169045895847922 and parameters: {'num_leaves': 108, 'lambda_l1': 2.8627155043650427, 'lambda_l2': 2.6856237963789638, 'bagging_fraction': 0.28425653534324513, 'feature_fraction': 0.6998732909283708}. Best is trial 1 with value: 2.1414760122763172.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:16:51,870][0m Trial 20 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:17:01,628][0m Trial 21 pruned. Trial was pruned at iteration 15.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:17:07,827][0m Trial 22 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:17:14,302][0m Trial 23 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:17:21,485][0m Trial 24 pruned. Trial was pruned at iteration 2.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:17:37,006][0m Trial 25 pruned. Trial was pruned at iteration 51.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:17:43,591][0m Trial 26 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:17:49,735][0m Trial 27 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:17:56,073][0m Trial 28 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:18:03,006][0m Trial 29 pruned. Trial was pruned at iteration 1.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:18:09,460][0m Trial 30 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:18:15,761][0m Trial 31 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:18:22,179][0m Trial 32 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:18:30,186][0m Trial 33 pruned. Trial was pruned at iteration 3.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:18:47,056][0m Trial 34 pruned. Trial was pruned at iteration 20.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:18:56,206][0m Trial 35 pruned. Trial was pruned at iteration 10.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:19:03,404][0m Trial 36 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:19:09,977][0m Trial 37 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:19:16,427][0m Trial 38 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:19:22,963][0m Trial 39 pruned. Trial was pruned at iteration 1.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:19:29,945][0m Trial 40 pruned. Trial was pruned at iteration 2.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:19:37,302][0m Trial 41 pruned. Trial was pruned at iteration 2.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:19:43,493][0m Trial 42 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:19:49,849][0m Trial 43 pruned. Trial was pruned at iteration 1.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:19:58,169][0m Trial 44 pruned. Trial was pruned at iteration 11.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:20:05,174][0m Trial 45 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:20:11,683][0m Trial 46 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:20:18,365][0m Trial 47 pruned. Trial was pruned at iteration 0.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:20:24,847][0m Trial 48 pruned. Trial was pruned at iteration 1.[0m


You can set `force_col_wise=true` to remove the overhead.


[32m[I 2023-03-19 11:20:31,186][0m Trial 49 pruned. Trial was pruned at iteration 0.[0m


Best value:  2.1414760122763172
Best params:  {'num_leaves': 193, 'lambda_l1': 5.529143567670685, 'lambda_l2': 3.5353425438207093, 'bagging_fraction': 0.7938408257184608, 'feature_fraction': 0.7262754473464605}


In [20]:
lgbm_params = study_lgbm.best_params
lgbm_model = lgb.LGBMRegressor(**lgbm_params)
lgbm_model.fit(x_train_set, y_train_set)



LGBMRegressor(bagging_fraction=0.7938408257184608,
              feature_fraction=0.7262754473464605, lambda_l1=5.529143567670685,
              lambda_l2=3.5353425438207093, num_leaves=193)

In [21]:
test_pred = lgbm_model.predict(test[features])
test_pred_lgbm = test_pred
res = {
    'contest-tmp2m-14d__tmp2m': test_pred,
    'index': test_data["index"]
}
res = pd.DataFrame(res)
res.head()

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,27.131038,375734
1,27.141886,375735
2,27.179241,375736
3,27.506303,375737
4,27.842126,375738


In [22]:
res.to_csv("/kaggle/working/lgbm.csv", index= False)

### Trying Catboost

In [23]:
def objective(trial):
    
    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 250, 1000, step=250),
        'max_bin': trial.suggest_int('max_bin', 100, 300),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 32, 512),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 0.0001, 1.0, log = True),
        "early_stopping_rounds" : 10,
        'random_state': 42,
        'verbose': 0,
        "eval_metric" : 'RMSE',
        "task_type" : "GPU",
        "devices" : '0'
    }

    model = CatBoostRegressor(**params)

    model.fit(
        x_train_set, y_train_set,
        eval_set=(x_eval_set, y_eval_set),
        cat_features=['id'],
        verbose=False,
        plot=True
    )

    y_pred = model.predict(x_eval_set)
    score = mean_squared_error(y_eval_set, y_pred, squared= False)
    return score

In [24]:
study_cat = optuna.create_study(direction= 'minimize', study_name= "cat opt")
study_cat.optimize(objective, n_trials= 50)
print('Best value: ', study_cat.best_value)
print('Best params: ', study_cat.best_params)
cat_params = study_cat.best_params

[32m[I 2023-03-19 11:21:21,720][0m A new study created in memory with name: cat opt[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:23:07,329][0m Trial 0 finished with value: 2.03321014142542 and parameters: {'max_depth': 10, 'learning_rate': 0.0588344504406283, 'n_estimators': 500, 'max_bin': 241, 'min_data_in_leaf': 410, 'l2_leaf_reg': 0.0006407520226101355}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:23:20,166][0m Trial 1 finished with value: 2.4008815908467813 and parameters: {'max_depth': 7, 'learning_rate': 0.03841381973664089, 'n_estimators': 250, 'max_bin': 271, 'min_data_in_leaf': 343, 'l2_leaf_reg': 0.0013758653726468298}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:23:33,401][0m Trial 2 finished with value: 2.26660162644268 and parameters: {'max_depth': 9, 'learning_rate': 0.053743350555450826, 'n_estimators': 1000, 'max_bin': 128, 'min_data_in_leaf': 179, 'l2_leaf_reg': 0.00047013990423399227}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:23:42,670][0m Trial 3 finished with value: 2.551767068245394 and parameters: {'max_depth': 6, 'learning_rate': 0.02764990195229563, 'n_estimators': 500, 'max_bin': 137, 'min_data_in_leaf': 445, 'l2_leaf_reg': 0.019149562340906142}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:23:49,073][0m Trial 4 finished with value: 2.5570467954563068 and parameters: {'max_depth': 6, 'learning_rate': 0.08296265593136963, 'n_estimators': 500, 'max_bin': 200, 'min_data_in_leaf': 32, 'l2_leaf_reg': 0.2525536745392111}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:23:58,292][0m Trial 5 finished with value: 2.3934011077121187 and parameters: {'max_depth': 10, 'learning_rate': 0.07628275164669635, 'n_estimators': 250, 'max_bin': 201, 'min_data_in_leaf': 209, 'l2_leaf_reg': 0.09493304432128197}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:24:05,330][0m Trial 6 finished with value: 2.4569719603957743 and parameters: {'max_depth': 8, 'learning_rate': 0.06813432652137738, 'n_estimators': 1000, 'max_bin': 119, 'min_data_in_leaf': 421, 'l2_leaf_reg': 0.4886431425341614}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:24:15,670][0m Trial 7 finished with value: 2.182168101272079 and parameters: {'max_depth': 7, 'learning_rate': 0.0689508301637237, 'n_estimators': 250, 'max_bin': 104, 'min_data_in_leaf': 477, 'l2_leaf_reg': 0.0013943939934902261}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:24:29,832][0m Trial 8 finished with value: 2.162641603496983 and parameters: {'max_depth': 9, 'learning_rate': 0.09219870939888744, 'n_estimators': 750, 'max_bin': 141, 'min_data_in_leaf': 176, 'l2_leaf_reg': 0.00414612030300784}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:24:51,224][0m Trial 9 finished with value: 2.1448310636766865 and parameters: {'max_depth': 9, 'learning_rate': 0.03677985954118945, 'n_estimators': 750, 'max_bin': 155, 'min_data_in_leaf': 74, 'l2_leaf_reg': 0.0010908812083020329}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:25:28,108][0m Trial 10 finished with value: 2.4412049873044506 and parameters: {'max_depth': 10, 'learning_rate': 0.011032132921836163, 'n_estimators': 500, 'max_bin': 278, 'min_data_in_leaf': 344, 'l2_leaf_reg': 0.00010124311243226911}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:25:38,274][0m Trial 11 finished with value: 2.3396360296090006 and parameters: {'max_depth': 9, 'learning_rate': 0.050500736955589876, 'n_estimators': 750, 'max_bin': 216, 'min_data_in_leaf': 38, 'l2_leaf_reg': 0.00023025596863699072}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:26:08,854][0m Trial 12 finished with value: 2.1390086661548104 and parameters: {'max_depth': 10, 'learning_rate': 0.04160459014518465, 'n_estimators': 750, 'max_bin': 240, 'min_data_in_leaf': 279, 'l2_leaf_reg': 0.0007198205551859416}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:26:26,799][0m Trial 13 finished with value: 2.343138907615169 and parameters: {'max_depth': 10, 'learning_rate': 0.06065583310900064, 'n_estimators': 750, 'max_bin': 246, 'min_data_in_leaf': 333, 'l2_leaf_reg': 0.005730402451447741}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:26:42,126][0m Trial 14 finished with value: 2.334952880013646 and parameters: {'max_depth': 10, 'learning_rate': 0.04979249445500933, 'n_estimators': 500, 'max_bin': 300, 'min_data_in_leaf': 268, 'l2_leaf_reg': 0.0003188950470161856}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:26:49,555][0m Trial 15 finished with value: 2.32548343221831 and parameters: {'max_depth': 8, 'learning_rate': 0.06175955757676334, 'n_estimators': 1000, 'max_bin': 243, 'min_data_in_leaf': 399, 'l2_leaf_reg': 0.00010461676364360349}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:27:02,238][0m Trial 16 finished with value: 2.2593424414112038 and parameters: {'max_depth': 10, 'learning_rate': 0.09773597947671575, 'n_estimators': 500, 'max_bin': 229, 'min_data_in_leaf': 262, 'l2_leaf_reg': 0.016249847498552305}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:27:13,104][0m Trial 17 finished with value: 2.418527644743426 and parameters: {'max_depth': 9, 'learning_rate': 0.04279476797949967, 'n_estimators': 750, 'max_bin': 175, 'min_data_in_leaf': 313, 'l2_leaf_reg': 0.0026767735115901694}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:27:30,576][0m Trial 18 finished with value: 2.301417624301948 and parameters: {'max_depth': 8, 'learning_rate': 0.032738028385653116, 'n_estimators': 750, 'max_bin': 261, 'min_data_in_leaf': 490, 'l2_leaf_reg': 0.0007985699653286892}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:27:56,440][0m Trial 19 finished with value: 2.0665015183459907 and parameters: {'max_depth': 10, 'learning_rate': 0.04334074949798684, 'n_estimators': 500, 'max_bin': 178, 'min_data_in_leaf': 388, 'l2_leaf_reg': 0.00044601805480711676}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:28:07,406][0m Trial 20 finished with value: 2.2443008367734727 and parameters: {'max_depth': 8, 'learning_rate': 0.05845121951480289, 'n_estimators': 250, 'max_bin': 172, 'min_data_in_leaf': 386, 'l2_leaf_reg': 0.0003166790550486694}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:28:22,796][0m Trial 21 finished with value: 2.346346311068724 and parameters: {'max_depth': 10, 'learning_rate': 0.04559193470471801, 'n_estimators': 500, 'max_bin': 227, 'min_data_in_leaf': 508, 'l2_leaf_reg': 0.0007311794304908304}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:28:38,516][0m Trial 22 finished with value: 2.331332162812852 and parameters: {'max_depth': 10, 'learning_rate': 0.045688723210314225, 'n_estimators': 500, 'max_bin': 214, 'min_data_in_leaf': 235, 'l2_leaf_reg': 0.0020520750025389755}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:28:52,741][0m Trial 23 finished with value: 2.4168562079409335 and parameters: {'max_depth': 9, 'learning_rate': 0.027505202594334137, 'n_estimators': 500, 'max_bin': 181, 'min_data_in_leaf': 374, 'l2_leaf_reg': 0.0005544689574589215}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:29:06,659][0m Trial 24 finished with value: 2.253766182362808 and parameters: {'max_depth': 10, 'learning_rate': 0.05565565864574772, 'n_estimators': 750, 'max_bin': 245, 'min_data_in_leaf': 299, 'l2_leaf_reg': 0.00021166519955913263}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:29:22,625][0m Trial 25 finished with value: 2.2233487169159662 and parameters: {'max_depth': 9, 'learning_rate': 0.04258768213350562, 'n_estimators': 500, 'max_bin': 190, 'min_data_in_leaf': 447, 'l2_leaf_reg': 0.002103351221627626}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:29:40,974][0m Trial 26 finished with value: 2.1570535949393457 and parameters: {'max_depth': 10, 'learning_rate': 0.05195311894263449, 'n_estimators': 250, 'max_bin': 160, 'min_data_in_leaf': 136, 'l2_leaf_reg': 0.0005328693298380526}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:30:02,966][0m Trial 27 finished with value: 2.162758765486645 and parameters: {'max_depth': 10, 'learning_rate': 0.0637469092628765, 'n_estimators': 750, 'max_bin': 292, 'min_data_in_leaf': 363, 'l2_leaf_reg': 0.005436539112647473}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:30:25,214][0m Trial 28 finished with value: 2.0981124280099968 and parameters: {'max_depth': 9, 'learning_rate': 0.0568032476543464, 'n_estimators': 1000, 'max_bin': 259, 'min_data_in_leaf': 419, 'l2_leaf_reg': 0.0001802447110845612}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:30:39,720][0m Trial 29 finished with value: 2.230797666703672 and parameters: {'max_depth': 9, 'learning_rate': 0.07070130883250106, 'n_estimators': 1000, 'max_bin': 265, 'min_data_in_leaf': 434, 'l2_leaf_reg': 0.00018669515348680816}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:30:51,691][0m Trial 30 finished with value: 2.201955187531196 and parameters: {'max_depth': 7, 'learning_rate': 0.0578569410763448, 'n_estimators': 250, 'max_bin': 278, 'min_data_in_leaf': 410, 'l2_leaf_reg': 0.0012364639467761633}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:31:18,540][0m Trial 31 finished with value: 2.126303217038431 and parameters: {'max_depth': 10, 'learning_rate': 0.03742436770837955, 'n_estimators': 1000, 'max_bin': 255, 'min_data_in_leaf': 470, 'l2_leaf_reg': 0.00035081557907430607}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:31:30,929][0m Trial 32 finished with value: 2.221255260078112 and parameters: {'max_depth': 9, 'learning_rate': 0.051591021004700484, 'n_estimators': 1000, 'max_bin': 262, 'min_data_in_leaf': 467, 'l2_leaf_reg': 0.00031226114309592734}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:31:56,901][0m Trial 33 finished with value: 2.169518081744101 and parameters: {'max_depth': 10, 'learning_rate': 0.03612771968638664, 'n_estimators': 1000, 'max_bin': 229, 'min_data_in_leaf': 457, 'l2_leaf_reg': 0.0001485380302248112}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:32:11,263][0m Trial 34 finished with value: 2.1822470372979805 and parameters: {'max_depth': 9, 'learning_rate': 0.05614724112769148, 'n_estimators': 1000, 'max_bin': 258, 'min_data_in_leaf': 427, 'l2_leaf_reg': 0.00031621622324312757}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:32:41,000][0m Trial 35 finished with value: 2.166485271357023 and parameters: {'max_depth': 10, 'learning_rate': 0.04797877676415549, 'n_estimators': 1000, 'max_bin': 279, 'min_data_in_leaf': 506, 'l2_leaf_reg': 0.00014398697746534117}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:32:54,167][0m Trial 36 finished with value: 2.2608054093490866 and parameters: {'max_depth': 9, 'learning_rate': 0.06365235761041131, 'n_estimators': 1000, 'max_bin': 206, 'min_data_in_leaf': 351, 'l2_leaf_reg': 0.00045660882010717857}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:33:10,188][0m Trial 37 finished with value: 2.049772533069367 and parameters: {'max_depth': 8, 'learning_rate': 0.07602792696179833, 'n_estimators': 500, 'max_bin': 251, 'min_data_in_leaf': 396, 'l2_leaf_reg': 0.000431052029270141}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:33:20,343][0m Trial 38 finished with value: 2.246499767501136 and parameters: {'max_depth': 8, 'learning_rate': 0.07659523782730548, 'n_estimators': 500, 'max_bin': 187, 'min_data_in_leaf': 391, 'l2_leaf_reg': 0.0010040729048518661}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:33:26,249][0m Trial 39 finished with value: 2.4439580541565324 and parameters: {'max_depth': 6, 'learning_rate': 0.08165192844641318, 'n_estimators': 500, 'max_bin': 220, 'min_data_in_leaf': 411, 'l2_leaf_reg': 0.0015531906458626564}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:33:36,355][0m Trial 40 finished with value: 2.308052172262478 and parameters: {'max_depth': 8, 'learning_rate': 0.06763571245648041, 'n_estimators': 500, 'max_bin': 195, 'min_data_in_leaf': 338, 'l2_leaf_reg': 0.0004773389440207989}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:33:47,392][0m Trial 41 finished with value: 2.252040301917462 and parameters: {'max_depth': 7, 'learning_rate': 0.05448143335421646, 'n_estimators': 500, 'max_bin': 252, 'min_data_in_leaf': 476, 'l2_leaf_reg': 0.00022860050790185488}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:34:00,044][0m Trial 42 finished with value: 2.2364217186103232 and parameters: {'max_depth': 8, 'learning_rate': 0.07202815116237207, 'n_estimators': 500, 'max_bin': 271, 'min_data_in_leaf': 440, 'l2_leaf_reg': 0.0004093255845476174}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:34:15,414][0m Trial 43 finished with value: 2.2091250734467645 and parameters: {'max_depth': 9, 'learning_rate': 0.06534041481231559, 'n_estimators': 250, 'max_bin': 234, 'min_data_in_leaf': 369, 'l2_leaf_reg': 0.0008263742248439767}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:34:46,635][0m Trial 44 finished with value: 2.055809920394916 and parameters: {'max_depth': 10, 'learning_rate': 0.06042314091163139, 'n_estimators': 1000, 'max_bin': 286, 'min_data_in_leaf': 419, 'l2_leaf_reg': 0.00010064543448397826}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:35:03,148][0m Trial 45 finished with value: 2.299226280420962 and parameters: {'max_depth': 10, 'learning_rate': 0.05949791758720054, 'n_estimators': 500, 'max_bin': 285, 'min_data_in_leaf': 323, 'l2_leaf_reg': 0.00011672600991367963}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:35:13,917][0m Trial 46 finished with value: 2.287319983686082 and parameters: {'max_depth': 9, 'learning_rate': 0.07398713769988671, 'n_estimators': 750, 'max_bin': 298, 'min_data_in_leaf': 419, 'l2_leaf_reg': 0.00016914343445929974}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:35:23,395][0m Trial 47 finished with value: 2.316662479811962 and parameters: {'max_depth': 7, 'learning_rate': 0.06716252914105482, 'n_estimators': 500, 'max_bin': 270, 'min_data_in_leaf': 385, 'l2_leaf_reg': 0.00024405081943664034}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:35:44,156][0m Trial 48 finished with value: 2.18062481277554 and parameters: {'max_depth': 10, 'learning_rate': 0.06333600125657081, 'n_estimators': 750, 'max_bin': 142, 'min_data_in_leaf': 294, 'l2_leaf_reg': 0.00011886770099299058}. Best is trial 0 with value: 2.03321014142542.[0m


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

[32m[I 2023-03-19 11:35:55,131][0m Trial 49 finished with value: 2.211235387899279 and parameters: {'max_depth': 9, 'learning_rate': 0.0531110940868037, 'n_estimators': 250, 'max_bin': 203, 'min_data_in_leaf': 357, 'l2_leaf_reg': 0.00017258504305519746}. Best is trial 0 with value: 2.03321014142542.[0m


Best value:  2.03321014142542
Best params:  {'max_depth': 10, 'learning_rate': 0.0588344504406283, 'n_estimators': 500, 'max_bin': 241, 'min_data_in_leaf': 410, 'l2_leaf_reg': 0.0006407520226101355}


In [25]:
cat_params = study_cat.best_params
cat_model = CatBoostRegressor(**cat_params)
cat_model.fit(x_train_set, y_train_set, cat_features= ['id'])

0:	learn: 9.0560276	total: 1.36s	remaining: 11m 19s
1:	learn: 8.5867204	total: 2.71s	remaining: 11m 14s
2:	learn: 8.1370758	total: 4.06s	remaining: 11m 13s
3:	learn: 7.7264539	total: 5.72s	remaining: 11m 49s
4:	learn: 7.3340998	total: 7.09s	remaining: 11m 42s
5:	learn: 6.9660709	total: 8.4s	remaining: 11m 31s
6:	learn: 6.6173530	total: 9.77s	remaining: 11m 28s
7:	learn: 6.2865473	total: 11.1s	remaining: 11m 25s
8:	learn: 5.9781862	total: 12.5s	remaining: 11m 21s
9:	learn: 5.6882360	total: 13.8s	remaining: 11m 18s
10:	learn: 5.4167397	total: 15.2s	remaining: 11m 16s
11:	learn: 5.1616498	total: 16.9s	remaining: 11m 25s
12:	learn: 4.9192708	total: 18.2s	remaining: 11m 21s
13:	learn: 4.6924735	total: 19.5s	remaining: 11m 17s
14:	learn: 4.4761897	total: 20.9s	remaining: 11m 15s
15:	learn: 4.2752268	total: 22.2s	remaining: 11m 11s
16:	learn: 4.0858003	total: 23.5s	remaining: 11m 7s
17:	learn: 3.9058054	total: 24.8s	remaining: 11m 3s
18:	learn: 3.7365366	total: 26.1s	remaining: 10m 59s
19:	le

<catboost.core.CatBoostRegressor at 0x7fec2b6a4250>

In [26]:
test_pred = cat_model.predict(test[features])
test_pred_cat = test_pred
res = {
    'contest-tmp2m-14d__tmp2m': test_pred,
    'index': test_data["index"]
}
res = pd.DataFrame(res)
res.head()

Unnamed: 0,contest-tmp2m-14d__tmp2m,index
0,22.904485,375734
1,22.525968,375735
2,22.675056,375736
3,22.295956,375737
4,22.48881,375738


In [27]:
res.to_csv("/kaggle/working/cat.csv", index= False)

### XGBoost with optuna

In [28]:
# def objective(trial):
#     params = {
#             'objective': "reg:squarederror",
#             'max_depth': trial.suggest_int('max_depth', 1, 9),
#             'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
#             'n_estimators': trial.suggest_int('n_estimators', 50, 500),
#             'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#             'gamma': trial.suggest_float('gamma', 1e-8, 1.0),
#             'subsample': trial.suggest_float('subsample', 0.01, 1.0),
#             'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
#             'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0),
#             'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0),
#             'eval_metric': 'mlogloss',
#             'use_label_encoder': False
#         }
#     eval_set = [(x_eval_set, y_eval_set)]
#     xg_model = XGBRegressor(**params)
#     xg_model.fit(x_train_set, y_train_set, eval_set= eval_set, early_stopping_rounds= 10)
#     y_pred = xg_model.predict(x_eval_set)
#     score = mean_squared_error(y_eval_set, y_pred, squared= False)
#     return score

In [29]:
# study_xg = optuna.create_study(direction= 'minimize', study_name= "xg opt")
# study_xg.optimize(objective, n_trials= 50)
# print('Best value: ', study_xg.best_value)
# print('Best params: ', study_xg.best_params)

In [30]:
# xg_params = study_xg.best_params
# xg_model = XGBRegressor(**xg_params)
# # xg_model = XGBRegressor(objective="reg:squarederror", eval_metric='rmse', 
# #                              random_state=42)
# xg_model.fit(x_train_set, y_train_set)
# y_pred = xg_model.predict(x_eval_set)
# score = mean_squared_error(y_eval_set, y_pred, squared= False)

In [31]:
# test_pred = xg_model.predict(test[features])
# test_pred_xg = test_pred
# res = {
#     'contest-tmp2m-14d__tmp2m': test_pred,
#     'index': test_data["index"]
# }
# res = pd.DataFrame(res)
# res.head()

In [32]:
# res.to_csv("/kaggle/working/xg.csv", index= False)

### Ensemble

In [33]:
def ensembling(lgb_ratio, cat_ratio):
    ensemble_preds = test_pred_lgbm * lgb_ratio + test_pred_cat * (1 - lgb_ratio)
    return ensemble_preds

In [34]:
def export_ensemble(ratio_list, prev):
    for i in ratio_list:
        test_pred = ensembling(i[0], i[1])
        pr = prev
        pr["contest-tmp2m-14d__tmp2m"] = pr["contest-tmp2m-14d__tmp2m"] + test_pred
        res = pr[["contest-tmp2m-14d__tmp2m", "index"]]
        res.to_csv('submission.csv', index = False)

In [35]:
ratios = [
    [0.4, 0.6],
]
export_ensemble(ratios, sub)