## EDA

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sys
sys.path.append('/content/drive/Shareddrives/Kaggle/Kaggle/Scripts/')
import my_utils

In [None]:
import pandas as pd
import numpy as np
import warnings
from my_utils import remove_miss_columns
warnings.filterwarnings('ignore')
from sklearn.preprocessing import FunctionTransformer
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 14})
import seaborn as sns

In [None]:
# Load the competition training set from the shared Drive folder.
data = pd.read_csv('/content/drive/Shareddrives/Kaggle/Kaggle/Data/train_data.csv')
# Load the matching test set.
# NOTE(review): `data2` is the test frame; a name such as `test_raw` would make later cells easier to follow.
data2 = pd.read_csv('/content/drive/Shareddrives/Kaggle/Kaggle/Data/test_data.csv')
# Preview the first rows of the test set.
data2.head()

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,nmme0-tmp2m-34w__cancm40,nmme0-tmp2m-34w__ccsm30,nmme0-tmp2m-34w__ccsm40,nmme0-tmp2m-34w__cfsv20,...,wind-vwnd-925-2010-11,wind-vwnd-925-2010-12,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20
0,375734,0.0,0.833333,11/1/22,339.88,30.88,30.92,29.17,31.02,29.47,...,-19.28,-39.77,-29.25,40.88,-8.31,14.91,-24.62,31.05,-23.69,6.27
1,375735,0.0,0.833333,11/2/22,334.63,30.88,30.92,29.17,31.02,29.47,...,-19.58,-43.14,-28.62,45.37,-5.42,16.97,-23.94,28.84,-20.61,14.16
2,375736,0.0,0.833333,11/3/22,337.83,30.88,30.92,29.17,31.02,29.47,...,-13.73,-44.22,-27.67,49.76,-1.31,21.44,-19.06,26.85,-16.78,13.42
3,375737,0.0,0.833333,11/4/22,345.81,30.88,30.92,29.17,31.02,29.47,...,-7.97,-49.47,-19.32,52.62,-0.44,21.65,-23.12,23.7,-18.62,10.69
4,375738,0.0,0.833333,11/5/22,357.39,30.88,30.92,29.17,31.02,29.47,...,-0.8,-56.07,-9.89,51.23,-7.57,19.86,-30.56,20.66,-25.08,19.64


### Other cleaning

In [None]:
data.shape

(375734, 246)

In [None]:
data2.shape

(31354, 245)

In [None]:
# prepare the dataset
# NOTE(review): presumably drops columns with too many missing values; the helper lives in
# my_utils, so confirm its rule there. The recorded output shows the shape unchanged.
train = my_utils.remove_miss_columns(data)
train.shape

(375734, 246)

In [None]:
# NOTE(review): presumably drops rows lying more than `num_std` standard deviations from the
# mean; confirm in my_utils. Only the training frame is filtered, and the recorded shapes show
# the row count falling from 375734 to 337256.
train = my_utils.remove_outliers(train, num_std=5)
train.shape

(337256, 246)

In [None]:
# NOTE(review): presumably drops features whose variance is below `threshold`; confirm in
# my_utils. The recorded output shows no column was removed at this threshold.
train = my_utils.remove_features_near_zero_variance(train, threshold=1e-4)
train.shape

(337256, 246)

In [None]:
target = 'contest-tmp2m-14d__tmp2m'
# Set the response aside so the correlation filter below only sees predictor columns.
response = train[target]
train = train.drop(columns=[target])
# NOTE(review): presumably keeps one feature out of each highly correlated group; the exact
# rule lives in my_utils.
train = my_utils.remove_highly_correlated_features(train, threshold=0.9)
features = train.columns
# A notebook only renders the final expression of a cell, so the shape check goes last
# (in the middle of the cell it was evaluated and silently discarded).
train.shape

In [None]:
# Keep only the columns that survived the training-set filtering, in the same order.
# `train` has no target column at this point, so this selects predictors only.
test = data2[train.columns]

In [None]:
# Put the response column back now that feature selection is finished.
train[target] = response
train.shape

(337256, 184)

In [None]:
test.shape

(31354, 183)

### NA handling

In [None]:
# Percentage of missing values per column, ten worst columns first.
missing_pct = (train.isnull().sum() * 100 / len(train)).round(2)
missing_pct.sort_values(ascending=False).head(10)

index                   0.0
icec-2010-10            0.0
wind-hgt-500-2010-10    0.0
icec-2010-2             0.0
icec-2010-3             0.0
icec-2010-4             0.0
icec-2010-5             0.0
icec-2010-6             0.0
icec-2010-7             0.0
icec-2010-8             0.0
dtype: float64

### location rounding issue

In [None]:
# Stack train and test so each distinct (lat, lon) pair receives one id shared by both sets.
# The frame is called `combined` rather than `all` so the builtin all() is not shadowed.
combined = pd.concat([train, test], axis=0)

# Create new feature: integer id per distinct (lat, lon) pair
combined['loc_group'] = combined.groupby(['lat', 'lon']).ngroup()
display(combined)

print(f'{combined.loc_group.nunique()} unique locations')

# Split back up by position. The row count is captured once, and .copy() yields independent
# frames so later column assignments on train1/test1 do not target a slice of `combined`.
n_train = len(train)
train1 = combined.iloc[:n_train].copy()
test1 = combined.iloc[n_train:].copy()

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,contest-wind-h10-14d__wind-hgt-10,contest-rhum-sig995-14d__rhum,nmme-prate-34w__cancm3,nmme-prate-34w__cancm4,...,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20,contest-tmp2m-14d__tmp2m,loc_group
0,0,0.0,0.833333,9/1/14,237.00,29.02,31246.63,81.72,25.33,17.55,...,8.32,9.56,-2.03,48.13,28.09,-13.50,11.90,4.58,28.744480,0
1,1,0.0,0.833333,9/2/14,228.90,29.02,31244.78,82.56,25.33,17.55,...,8.77,21.17,4.44,48.60,27.41,-23.77,15.44,3.42,28.370585,0
2,2,0.0,0.833333,9/3/14,220.69,29.02,31239.27,83.29,25.33,17.55,...,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82,28.133059,0
3,3,0.0,0.833333,9/4/14,225.28,29.02,31232.86,83.26,25.33,17.55,...,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74,28.256798,0
4,4,0.0,0.833333,9/5/14,237.24,29.02,31226.16,82.50,25.33,17.55,...,7.47,38.62,-5.21,54.73,-2.58,-42.30,21.91,10.95,28.372353,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31349,407083,1.0,0.866667,12/27/22,62.72,4.60,30269.05,84.04,15.75,15.00,...,32.39,38.82,7.42,11.75,-23.62,-0.24,-5.94,51.23,,574
31350,407084,1.0,0.866667,12/28/22,73.41,4.60,30264.55,82.11,15.20,14.27,...,26.23,37.64,13.01,17.84,-22.05,-3.03,1.31,51.45,,574
31351,407085,1.0,0.866667,12/29/22,70.00,4.60,30274.65,82.06,14.66,13.54,...,21.06,36.53,14.15,23.12,-25.60,-5.88,9.32,45.32,,574
31352,407086,1.0,0.866667,12/30/22,79.81,4.60,30296.92,79.89,14.11,12.80,...,20.42,36.05,6.38,29.00,-27.06,-1.42,16.06,31.88,,574


575 unique locations


In [None]:
# Compare which location ids occur in each split. The id collections are built once up front
# instead of calling .unique() again for every element that is tested.
train_locs = train1.loc_group.unique()
test_locs = test1.loc_group.unique()
train_loc_set = set(train_locs)
test_loc_set = set(test_locs)

print('Locations in train that are not in test')
print([c for c in train_locs if c not in test_loc_set])

print('Locations in test that are not in train')
print([c for c in test_locs if c not in train_loc_set])

Locations in train that are not in test
[33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 76, 101, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 181, 210, 239, 269, 300, 330, 360, 389, 418, 447, 475, 504, 533, 559]
Locations in test that are not in train
[52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 75, 100, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 180, 209, 238, 268, 299, 329, 359, 388, 417, 446, 474, 503, 532, 558]


In [None]:
# Number of decimals kept for lat/lon. Before rounding, train and test disagreed on the
# coordinates of many locations; rounding to 14 decimals makes them match.
scale = 14

# Capture the split point before train1 is rebound further down.
n_train = len(train1)

# Round on fresh frames (assign) instead of writing into train1/test1 through .loc, which
# would target slices of the previously concatenated frame.
combined = pd.concat(
    [
        train1.assign(lat=train1.lat.round(scale), lon=train1.lon.round(scale)),
        test1.assign(lat=test1.lat.round(scale), lon=test1.lon.round(scale)),
    ],
    axis=0,
)

# Recreate the location id from the rounded coordinates
combined['loc_group'] = combined.groupby(['lat', 'lon']).ngroup()
display(combined)

print(f'{combined.loc_group.nunique()} unique locations')

# Split back up
train1 = combined.iloc[:n_train].copy()
test1 = combined.iloc[n_train:].copy()

# Build the id collections once rather than per element tested.
train_locs = train1.loc_group.unique()
test_locs = test1.loc_group.unique()
train_loc_set = set(train_locs)
test_loc_set = set(test_locs)

print('Locations in train that are not in test')
print([c for c in train_locs if c not in test_loc_set])

print('Locations in test that are not in train')
print([c for c in test_locs if c not in train_loc_set])

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,contest-wind-h10-14d__wind-hgt-10,contest-rhum-sig995-14d__rhum,nmme-prate-34w__cancm3,nmme-prate-34w__cancm4,...,wind-vwnd-925-2010-13,wind-vwnd-925-2010-14,wind-vwnd-925-2010-15,wind-vwnd-925-2010-16,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20,contest-tmp2m-14d__tmp2m,loc_group
0,0,0.0,0.833333,9/1/14,237.00,29.02,31246.63,81.72,25.33,17.55,...,8.32,9.56,-2.03,48.13,28.09,-13.50,11.90,4.58,28.744480,0
1,1,0.0,0.833333,9/2/14,228.90,29.02,31244.78,82.56,25.33,17.55,...,8.77,21.17,4.44,48.60,27.41,-23.77,15.44,3.42,28.370585,0
2,2,0.0,0.833333,9/3/14,220.69,29.02,31239.27,83.29,25.33,17.55,...,6.99,32.16,5.01,48.53,19.21,-33.16,15.11,4.82,28.133059,0
3,3,0.0,0.833333,9/4/14,225.28,29.02,31232.86,83.26,25.33,17.55,...,6.17,39.66,-1.41,50.59,8.29,-37.22,18.24,9.74,28.256798,0
4,4,0.0,0.833333,9/5/14,237.24,29.02,31226.16,82.50,25.33,17.55,...,7.47,38.62,-5.21,54.73,-2.58,-42.30,21.91,10.95,28.372353,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31349,407083,1.0,0.866667,12/27/22,62.72,4.60,30269.05,84.04,15.75,15.00,...,32.39,38.82,7.42,11.75,-23.62,-0.24,-5.94,51.23,,513
31350,407084,1.0,0.866667,12/28/22,73.41,4.60,30264.55,82.11,15.20,14.27,...,26.23,37.64,13.01,17.84,-22.05,-3.03,1.31,51.45,,513
31351,407085,1.0,0.866667,12/29/22,70.00,4.60,30274.65,82.06,14.66,13.54,...,21.06,36.53,14.15,23.12,-25.60,-5.88,9.32,45.32,,513
31352,407086,1.0,0.866667,12/30/22,79.81,4.60,30296.92,79.89,14.11,12.80,...,20.42,36.05,6.38,29.00,-27.06,-1.42,16.06,31.88,,513


514 unique locations
Locations in train that are not in test
[]
Locations in test that are not in train
[]


### time engineering

In [None]:
# The raw dates are month/day/two-digit-year strings (e.g. '9/1/14', '12/27/22'). An explicit
# format avoids per-value format guessing and fails loudly on an unexpected layout.
# Item assignment is used instead of attribute assignment so the column is always replaced.
train1['startdate'] = pd.to_datetime(train1['startdate'], format='%m/%d/%y')
test1['startdate'] = pd.to_datetime(test1['startdate'], format='%m/%d/%y')

In [None]:
def create_time_features(df):
    """Return a copy of ``df`` with calendar features derived from ``startdate``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 column named ``startdate``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the extra columns ``quarter``, ``month``,
        ``week`` (ISO week number) and ``dayofyear``. The input is not modified.
    """
    df = df.copy()
    dates = df['startdate'].dt
    # The year is intentionally not extracted (that line was commented out originally).
    df['quarter'] = dates.quarter
    df['month'] = dates.month
    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0;
    # isocalendar().week is the replacement. It returns the nullable UInt32 dtype, so cast
    # back to a plain NumPy dtype (float when NaT produced missing weeks, int otherwise).
    iso_week = dates.isocalendar().week
    df['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    df['dayofyear'] = dates.day_of_year
    return df

train_df = create_time_features(train1)
test_df = create_time_features(test1)
train_df.head()

Unnamed: 0,index,lat,lon,startdate,contest-pevpr-sfc-gauss-14d__pevpr,nmme0-tmp2m-34w__cancm30,contest-wind-h10-14d__wind-hgt-10,contest-rhum-sig995-14d__rhum,nmme-prate-34w__cancm3,nmme-prate-34w__cancm4,...,wind-vwnd-925-2010-17,wind-vwnd-925-2010-18,wind-vwnd-925-2010-19,wind-vwnd-925-2010-20,contest-tmp2m-14d__tmp2m,loc_group,quarter,month,week,dayofyear
0,0,0.0,0.833333,2014-09-01,237.0,29.02,31246.63,81.72,25.33,17.55,...,28.09,-13.5,11.9,4.58,28.74448,0,3,9,36,244
1,1,0.0,0.833333,2014-09-02,228.9,29.02,31244.78,82.56,25.33,17.55,...,27.41,-23.77,15.44,3.42,28.370585,0,3,9,36,245
2,2,0.0,0.833333,2014-09-03,220.69,29.02,31239.27,83.29,25.33,17.55,...,19.21,-33.16,15.11,4.82,28.133059,0,3,9,36,246
3,3,0.0,0.833333,2014-09-04,225.28,29.02,31232.86,83.26,25.33,17.55,...,8.29,-37.22,18.24,9.74,28.256798,0,3,9,36,247
4,4,0.0,0.833333,2014-09-05,237.24,29.02,31226.16,82.5,25.33,17.55,...,-2.58,-42.3,21.91,10.95,28.372353,0,3,9,36,248


In [None]:
# Copied from https://colab.research.google.com/drive/10r73mOp1R7cORfeuP97V65a-rgwGyfWr?usp=sharing#scrollTo=c9ZkVb2aU-S7

def add_season(df):
    """Add an integer ``season`` column to ``df`` in place, derived from ``month``.

    Months 12, 1, 2 map to 0; 3-5 to 1; 6-8 to 2; 9-11 to 3. A month outside
    1-12 raises ``KeyError``. Nothing is returned.
    """
    months_by_season = {
        0: (12, 1, 2),
        1: (3, 4, 5),
        2: (6, 7, 8),
        3: (9, 10, 11),
    }
    # Invert the grouping into a month -> season lookup.
    season_lookup = {
        month: season
        for season, months in months_by_season.items()
        for month in months
    }

    def season_of(month):
        return season_lookup[month]

    df['season'] = df['month'].apply(season_of)

add_season(train_df)
add_season(test_df)

In [None]:
# Copied from https://colab.research.google.com/drive/10r73mOp1R7cORfeuP97V65a-rgwGyfWr?usp=sharing#scrollTo=c9ZkVb2aU-S7

def sin_transformer(period):
    """Return a FunctionTransformer that maps x to sin(x / period * 2 * pi)."""
    def to_sine(values):
        return np.sin(values / period * 2 * np.pi)

    return FunctionTransformer(to_sine)


def cos_transformer(period):
    """Return a FunctionTransformer that maps x to cos(x / period * 2 * pi)."""
    def to_cosine(values):
        return np.cos(values / period * 2 * np.pi)

    return FunctionTransformer(to_cosine)

def encode_cyclical(df):
    """Add sine/cosine encodings of the calendar columns to ``df`` in place.

    Reads ``dayofyear``, ``week``, ``month``, ``season`` and ``quarter`` and
    writes one ``*_sin`` and one ``*_cos`` column for each. Nothing is returned.
    """
    # (source column, period, sine output column, cosine output column),
    # listed in the order the output columns are appended.
    # NOTE(review): dayofyear can reach 366 and ISO week 53, slightly beyond the
    # 365 / 52 periods used here - confirm this approximation is intended.
    encodings = (
        ('dayofyear', 365, 'day_of_year_sin', 'day_of_year_cos'),
        ('week', 52, 'week_sin', 'week_cos'),
        ('month', 12, 'month_sin', 'month_cos'),
        ('season', 4, 'season_sin', 'season_cos'),
        ('quarter', 4, 'quarter_sin', 'quarter_cos'),
    )
    for source_col, period, sin_col, cos_col in encodings:
        df[sin_col] = sin_transformer(period).fit_transform(df[source_col])
        df[cos_col] = cos_transformer(period).fit_transform(df[source_col])

encode_cyclical(train_df)
encode_cyclical(test_df)

In [None]:
train_df['quarter_sin'].unique()

array([-1.0000000e+00, -2.4492936e-16,  1.0000000e+00,  1.2246468e-16])

## Exporting

### Encoding and Export Cleaned Data

In [None]:
train_df['climateregions__climateregion'].unique()

array(['BSh', 'Cfa', 'BSk', 'BWk', 'BWh', 'Csa', 'Csb', 'Cfb', 'Dfb',
       'Dsc', 'Dfc', 'Dfa', 'Dsb', 'Dwa', 'Dwb'], dtype=object)

In [None]:
# One-hot encode the climate region. Train defines the column set; the test dummies are
# reindexed onto it so both frames always expose the same columns in the same order.
# A region missing from test becomes an all-zero column, and a region that appears only
# in test is dropped (its rows are then all zeros across the region columns).
one_hot = pd.get_dummies(train_df['climateregions__climateregion'])
one_hot2 = pd.get_dummies(test_df['climateregions__climateregion']).reindex(
    columns=one_hot.columns, fill_value=0
)

In [None]:
one_hot2

Unnamed: 0,BSh,BSk,BWh,BWk,Cfa,Cfb,Csa,Csb,Dfa,Dfb,Dfc,Dsb,Dsc,Dwa,Dwb
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31349,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
31350,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
31351,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
31352,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [None]:
# Raw calendar / location helper columns that are replaced by their encoded versions.
helper_cols = [
    'startdate', 'loc_group', 'quarter', 'month', 'week', 'dayofyear',
    'season', 'climateregions__climateregion',
]
final_train = train_df.drop(columns=helper_cols)
# Append the climate-region dummy columns (aligned on the row index).
features = one_hot.columns
final_train[features] = one_hot[features]

In [None]:
# Raw calendar / location helper columns that are replaced by their encoded versions.
helper_cols = [
    'startdate', 'loc_group', 'quarter', 'month', 'week', 'dayofyear',
    'season', 'climateregions__climateregion',
]
final_test = test_df.drop(columns=helper_cols)
# Append the climate-region dummy columns (aligned on the row index).
features = one_hot2.columns
final_test[features] = one_hot2[features]

In [None]:
# Training frame without three `*prate*__ccsm3*` columns.
# NOTE(review): despite its name, `test1` now holds TRAINING data (it is exported as
# train1.csv below) and replaces the earlier test1 frame - consider renaming.
ccsm3_prate_cols = ['nmme0-prate-34w__ccsm30', 'nmme0-prate-56w__ccsm30', 'nmme-prate-34w__ccsm3']
test1 = final_train.drop(columns=ccsm3_prate_cols)

In [None]:
path2 = '/content/drive/Shareddrives/Kaggle/Kaggle/Data/ctrain.csv'
# Pass the path to pandas instead of a manually opened text handle: pandas then opens the
# file itself with the newline handling its CSV writer expects. 'utf-8-sig' still writes
# the BOM, and the row index is still written as the first column.
final_train.to_csv(path2, encoding='utf-8-sig')

KeyboardInterrupt: ignored

In [None]:
path = '/content/drive/Shareddrives/Kaggle/Kaggle/Data/train1.csv'
# Pass the path to pandas instead of a manually opened text handle: pandas then opens the
# file itself with the newline handling its CSV writer expects. 'utf-8-sig' still writes
# the BOM, and the row index is still written as the first column.
test1.to_csv(path, encoding='utf-8-sig')

In [None]:
path3 = '/content/drive/Shareddrives/Kaggle/Kaggle/Data/ctest.csv'
# Pass the path to pandas instead of a manually opened text handle: pandas then opens the
# file itself with the newline handling its CSV writer expects. 'utf-8-sig' still writes
# the BOM, and the row index is still written as the first column.
final_test.to_csv(path3, encoding='utf-8-sig')

### Export Subset Data Based On Climate Regions

In [None]:
train_df['climateregions__climateregion'].unique()

array(['BSh', 'Cfa', 'BSk', 'BWk', 'BWh', 'Csa', 'Csb', 'Cfb', 'Dfb',
       'Dsc', 'Dfc', 'Dfa', 'Dsb', 'Dwa', 'Dwb'], dtype=object)

In [None]:
len(train_df['climateregions__climateregion'].unique())

15

In [None]:
# Print the number of training rows per climate region, in order of first appearance.
region_labels = train_df['climateregions__climateregion']
for cregion in region_labels.unique():
    length = len(train_df[region_labels == cregion])
    print(cregion, length)

BSh 4802
Cfa 46956
BSk 130958
BWk 13030
BWh 8918
Csa 7966
Csb 29602
Cfb 2924
Dfb 48488
Dsc 1954
Dfc 11513
Dfa 21236
Dsb 6851
Dwa 1372
Dwb 686


In [None]:
# Same helper-column removal as for the one-hot export, but the climate-region column is
# kept because the per-region export filters on it.
time_helper_cols = ['startdate', 'loc_group', 'quarter', 'month', 'week', 'dayofyear', 'season']
train_df2 = train_df.drop(columns=time_helper_cols)
test_df2 = test_df.drop(columns=time_helper_cols)

In [None]:
import os.path
from os import path

# Write one train.csv / test.csv pair per climate region into Data/<region>/.
data_root = '/content/drive/Shareddrives/Kaggle/Kaggle/Data/'
region_col = 'climateregions__climateregion'

for cregion in train_df2[region_col].unique():
    # A dedicated name is used for the directory so the `path` import above is not clobbered.
    region_dir = os.path.join(data_root, cregion)
    # exist_ok=True keeps the cell re-runnable; os.mkdir raised FileExistsError on a second run.
    os.makedirs(region_dir, exist_ok=True)
    region_path_train = os.path.join(region_dir, 'train.csv')
    region_path_test = os.path.join(region_dir, 'test.csv')

    # Rows of this region only; the region column is constant within a file, so drop it.
    subtrain = train_df2.loc[train_df2[region_col] == cregion].drop(columns=[region_col])
    subtest = test_df2.loc[test_df2[region_col] == cregion].drop(columns=[region_col])

    # Let pandas open the files itself (BOM via utf-8-sig, no index column).
    subtrain.to_csv(region_path_train, index=False, encoding='utf-8-sig')
    subtest.to_csv(region_path_test, index=False, encoding='utf-8-sig')


### LightGBM cross validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
import lightgbm as lgbm
from tqdm import tqdm

# 5-fold cross-validation of a LightGBM regressor on final_train. Each fold's model is
# scored on its held-out rows and also used to predict final_test, so `test_preds` ends up
# with one prediction array per fold.
# NOTE(review): the folds are shuffled at random although rows are daily series per
# location - confirm that a random split is the intended validation scheme.
k = 5
kfold = KFold(k, shuffle=True, random_state=42)
val_scores = []
test_preds = []

# Model inputs are derived here instead of reusing the global `features`: the export cells
# rebind that name to the one-hot climate-region columns only, which would silently train
# the model on just those dummies.
# NOTE(review): `index` is excluded because it looks like a row identifier - confirm.
id_cols = ['index']
feature_cols = [c for c in final_train.columns if c != target and c not in id_cols]

# Give the test frame exactly the training column layout; a column missing from test
# (e.g. an unseen dummy) is filled with 0.
X_test = final_test.reindex(columns=feature_cols, fill_value=0)

# Hyper-parameters are identical for every fold, so they are defined once.
params = {
    'learning_rate': 0.02,
    'lambda_l1': 1.945,
    'num_leaves': 87,
    'feature_fraction': 0.79,
    'bagging_fraction': 0.93,
    'bagging_freq': 4,
    'min_data_in_leaf': 103,
    'max_depth': 17,
    'num_iterations': 5000,
}

for i, (train_idxs, val_idxs) in tqdm(enumerate(kfold.split(final_train)), total=k):

    fold_train = final_train.iloc[train_idxs]
    fold_val = final_train.iloc[val_idxs]
    X_train, y_train = fold_train[feature_cols], fold_train[target]
    X_val, y_val = fold_val[feature_cols], fold_val[target]

    # LightGBM 4.0 removed the early_stopping_rounds / verbose arguments of fit() in favour
    # of callbacks. Use the callbacks when this LightGBM version provides them, otherwise
    # fall back to the legacy keyword arguments.
    if hasattr(lgbm, 'log_evaluation'):
        fit_kwargs = {
            'callbacks': [
                lgbm.early_stopping(stopping_rounds=100),
                lgbm.log_evaluation(period=500),
            ]
        }
    else:
        fit_kwargs = {'early_stopping_rounds': 100, 'verbose': 500}

    model = lgbm.LGBMRegressor(**params)
    model.fit(X=X_train, y=y_train, eval_set=[(X_val, y_val)], **fit_kwargs)

    preds = model.predict(X_val)
    # Root of the MSE; the `squared=False` shortcut is gone from recent scikit-learn.
    rmse = np.sqrt(mse(y_val, preds))
    val_scores.append(rmse)
    print(f'=== Fold {i} RMSE {rmse} ====')

    # Predict the real test set (previously this re-predicted the validation fold).
    test_preds.append(model.predict(X_test))

print(f'=== Average RMSE of {k} Folds: {np.mean(val_scores)} ====')

0it [00:00, ?it/s]

Training until validation scores don't improve for 100 rounds.
[500]	valid_0's l2: 0.603344
[1000]	valid_0's l2: 0.328388
[1500]	valid_0's l2: 0.232018
[2000]	valid_0's l2: 0.181182
[2500]	valid_0's l2: 0.149465
[3000]	valid_0's l2: 0.127644
[3500]	valid_0's l2: 0.111852
[4000]	valid_0's l2: 0.0999349
[4500]	valid_0's l2: 0.0910619
[5000]	valid_0's l2: 0.0835539
Did not meet early stopping. Best iteration is:
[5000]	valid_0's l2: 0.0835539
=== Fold 0 RMSE 0.28905699229668863 ====


1it [23:13, 1393.31s/it]

Training until validation scores don't improve for 100 rounds.
[500]	valid_0's l2: 0.613532
[1000]	valid_0's l2: 0.336144
[1500]	valid_0's l2: 0.238272
[2000]	valid_0's l2: 0.186425
[2500]	valid_0's l2: 0.152156
[3000]	valid_0's l2: 0.130113
[3500]	valid_0's l2: 0.114159
[4000]	valid_0's l2: 0.101829
[4500]	valid_0's l2: 0.092235
[5000]	valid_0's l2: 0.0845459
Did not meet early stopping. Best iteration is:
[5000]	valid_0's l2: 0.0845459
=== Fold 1 RMSE 0.29076785097063923 ====


2it [46:30, 1395.59s/it]

Training until validation scores don't improve for 100 rounds.
[500]	valid_0's l2: 0.627312
[1000]	valid_0's l2: 0.338837
[1500]	valid_0's l2: 0.238233
[2000]	valid_0's l2: 0.185677
[2500]	valid_0's l2: 0.152953
[3000]	valid_0's l2: 0.130828
[3500]	valid_0's l2: 0.114723
[4000]	valid_0's l2: 0.102476
[4500]	valid_0's l2: 0.0926826
[5000]	valid_0's l2: 0.0851135
Did not meet early stopping. Best iteration is:
[5000]	valid_0's l2: 0.0851135
=== Fold 2 RMSE 0.2917421491210307 ====


3it [1:09:20, 1383.69s/it]

Training until validation scores don't improve for 100 rounds.
[500]	valid_0's l2: 0.622913
[1000]	valid_0's l2: 0.333782
[1500]	valid_0's l2: 0.237192
[2000]	valid_0's l2: 0.185364
[2500]	valid_0's l2: 0.152128
[3000]	valid_0's l2: 0.130132
[3500]	valid_0's l2: 0.114069
[4000]	valid_0's l2: 0.102126
[4500]	valid_0's l2: 0.0928804
[5000]	valid_0's l2: 0.0854259
Did not meet early stopping. Best iteration is:
[5000]	valid_0's l2: 0.0854259
=== Fold 3 RMSE 0.2922771249596201 ====


4it [1:32:08, 1377.66s/it]

Training until validation scores don't improve for 100 rounds.
[500]	valid_0's l2: 0.617075
[1000]	valid_0's l2: 0.338035
[1500]	valid_0's l2: 0.240411
[2000]	valid_0's l2: 0.187812
[2500]	valid_0's l2: 0.155226
[3000]	valid_0's l2: 0.132158
[3500]	valid_0's l2: 0.116079
[4000]	valid_0's l2: 0.103777
[4500]	valid_0's l2: 0.0939991
[5000]	valid_0's l2: 0.086092
Did not meet early stopping. Best iteration is:
[5000]	valid_0's l2: 0.086092
=== Fold 4 RMSE 0.2934143349257904 ====


5it [1:55:03, 1380.66s/it]

=== Average RMSE of 5 Folds: 0.29145169045475383 ====



