In [1]:
import numpy as np 
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
warnings.filterwarnings('ignore')
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split
from sklearn import metrics
from tqdm import tqdm
from scipy import signal
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

In [2]:
def read_data():
    print('Reading training, testing and submission data...')
    
    train = pd.read_csv('../input/train.csv')
    train_clean = np.load("../input/train_detrend.npz", allow_pickle=True)
    assert np.all(train['open_channels'] == train_clean['train_opench'])
    train['signal_clean'] = train_clean['train_signal']
    
    test = pd.read_csv('../input/test.csv')
    test_clean = np.load("../input/test_detrend.npz", allow_pickle=True)
    test['signal_clean'] = test_clean['test_signal']
    
    submission = pd.read_csv('../input/sample_submission.csv', dtype={'time':str})
    print('Train set has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
    print('Test set has {} rows and {} columns'.format(test.shape[0], test.shape[1]))
    return train, test, submission

def get_batch(train, test):
    # concatenate data
    batch = 50
    total_batches = 14
    train['set'] = 'train'
    test['set'] = 'test'
    data = pd.concat([train, test])
    
    for i in range(int(total_batches)):
        data.loc[(data['time'] > i * batch) & (data['time'] <= (i + 1) * batch), 'batch'] = i + 1
        
    train = data[data['set'] == 'train']
    test = data[data['set'] == 'test']
    train.drop(['set'], inplace = True, axis = 1)
    test.drop(['set'], inplace = True, axis = 1)
    del data
    return train, test

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        if col!='open_channels':
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def calc_gradients(s, n_grads = 4):
    '''
    Calculate gradients for a pandas series. Returns the same number of samples
    '''
    grads = pd.DataFrame()
    
    g = s.values
    for i in range(n_grads):
        g = np.gradient(g)
        grads['grad_' + str(i+1)] = g
        
    return grads

def calc_low_pass(s, n_filts=10):
    '''
    Applies low pass filters to the signal. Left delayed and no delayed
    '''
    wns = np.logspace(-2, -0.3, n_filts)
    
    low_pass = pd.DataFrame()
    x = s.values
    for wn in wns:
        b, a = signal.butter(1, Wn=wn, btype='low')
        zi = signal.lfilter_zi(b, a)
        low_pass['lowpass_lf_' + str('%.4f' %wn)] = signal.lfilter(b, a, x, zi=zi*x[0])[0]
        low_pass['lowpass_ff_' + str('%.4f' %wn)] = signal.filtfilt(b, a, x)
        
    return low_pass

def calc_high_pass(s, n_filts=10):
    '''
    Applies high pass filters to the signal. Left delayed and no delayed
    '''
    wns = np.logspace(-2, -0.1, n_filts)
    
    high_pass = pd.DataFrame()
    x = s.values
    for wn in wns:
        b, a = signal.butter(1, Wn=wn, btype='high')
        zi = signal.lfilter_zi(b, a)
        high_pass['highpass_lf_' + str('%.4f' %wn)] = signal.lfilter(b, a, x, zi=zi*x[0])[0]
        high_pass['highpass_ff_' + str('%.4f' %wn)] = signal.filtfilt(b, a, x)
        
    return high_pass

def calc_ewm(s, windows=[10, 50, 100, 500, 1000]):
    '''
    Calculates exponential weighted functions
    '''
    ewm = pd.DataFrame()
    for w in windows:
        ewm['ewm_mean_' + str(w)] = s.ewm(span=w, min_periods=1).mean()
        ewm['ewm_std_' + str(w)] = s.ewm(span=w, min_periods=1).std()
        
    # add zeros when na values (std)
    ewm = ewm.fillna(value=0)
        
    return ewm


def add_features(s):
    '''
    All calculations together
    '''
    
    gradients = calc_gradients(s)
    low_pass = calc_low_pass(s)
    high_pass = calc_high_pass(s)
    ewm = calc_ewm(s)
    
    return pd.concat([s, gradients, low_pass, high_pass, ewm], axis=1)


def divide_and_add_features(s, signal_size=500000):
    '''
    Divide the signal in bags of "signal_size".
    Normalize the data dividing it by 15.0
    '''
    # normalize
    s = s / 15.0
    
    ls = []
    # this is just to divide the data up into batches (bags) to keep track of progress
    # output is still the same size as input
    for i in tqdm(range(int(s.shape[0]/signal_size))):
        sig = s[i*signal_size:(i+1)*signal_size].copy().reset_index(drop=True)
        sig_featured = add_features(sig)
        ls.append(sig_featured)
    
    return pd.concat(ls, axis=0)

def rolling_features(train, test):
    
    pre_train = train.copy()
    pre_test = test.copy()
    
        
    for df in [pre_train, pre_test]:
        
        df['lag_t1'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(1))
        df['lag_t2'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(2))
        df['lag_t3'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(3))
        
        df['lead_t1'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(-1))
        df['lead_t2'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(-2))
        df['lead_t3'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(-3))
                
        for window in [1000, 5000, 10000, 20000, 40000, 80000]:
            
            # roll backwards
            df['signalmean_t' + str(window)] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(1).rolling(window).mean())
            df['signalstd_t' + str(window)] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(1).rolling(window).std())
            df['signalvar_t' + str(window)] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(1).rolling(window).var())
            df['signalmin_t' + str(window)] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(1).rolling(window).min())
            df['signalmax_t' + str(window)] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(1).rolling(window).max())
            min_max = (df['signal_clean'] - df['signalmin_t' + str(window)]) / (df['signalmax_t' + str(window)] - df['signalmin_t' + str(window)])
            df['norm_t' + str(window)] = min_max * (np.floor(df['signalmax_t' + str(window)]) - np.ceil(df['signalmin_t' + str(window)]))
            
            # roll forward
            df['signalmean_t' + str(window) + '_lead'] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(- window - 1).rolling(window).mean())
            df['signalstd_t' + str(window) + '_lead'] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(- window - 1).rolling(window).std())
            df['signalvar_t' + str(window) + '_lead'] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(- window - 1).rolling(window).var())
            df['signalmin_t' + str(window) + '_lead'] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(- window - 1).rolling(window).min())
            df['signalmax_t' + str(window) + '_lead'] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(- window - 1).rolling(window).max())   
            min_max = (df['signal_clean'] - df['signalmin_t' + str(window) + '_lead']) / (df['signalmax_t' + str(window) + '_lead'] - df['signalmin_t' + str(window) + '_lead'])
            df['norm_t' + str(window) + '_lead'] = min_max * (np.floor(df['signalmax_t' + str(window) + '_lead']) - np.ceil(df['signalmin_t' + str(window) + '_lead']))
            
    del train, test, min_max
    
    return pre_train, pre_test

def static_batch_features(df, n):
    
    df = df.copy()
    df.drop('batch', inplace = True, axis = 1)
    df = df.sort_values(by=['time']).reset_index(drop=True)
    df.index = ((df.time * 10000) - 1).values
    df['batch_' + str(n)] = df.index // n
    df['batch_index_' + str(n)] = df.index  - (df['batch_' + str(n)] * n)
    df['batch_slices_' + str(n)] = df['batch_index_' + str(n)]  // (n / 10)
    df['batch_slices2_' + str(n)] = df.apply(lambda r: '_'.join([str(r['batch_' + str(n)]).zfill(3), str(r['batch_slices_' + str(n)]).zfill(3)]), axis=1)

    for c in ['batch_' + str(n), 'batch_slices2_' + str(n)]:
        d = {}
        # -----------------------------------------------
        d['mean' + c] = df.groupby([c])['signal_clean'].mean()
        d['median' + c] = df.groupby([c])['signal_clean'].median()
        d['max' + c] = df.groupby([c])['signal_clean'].max()
        d['min' + c] = df.groupby([c])['signal_clean'].min()
        d['std' + c] = df.groupby([c])['signal_clean'].std()
        d['p10' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.percentile(x, 10))
        d['p25' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.percentile(x, 25))
        d['p75' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.percentile(x, 75))
        d['p90' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.percentile(x, 90))
        d['skew' + c] = df.groupby([c])['signal_clean'].apply(lambda x: pd.Series(x).skew())
        d['kurtosis' + c] = df.groupby([c])['signal_clean'].apply(lambda x: pd.Series(x).kurtosis())
        min_max = (d['mean' + c] - d['min' + c]) / (d['max' + c] - d['min' + c])
        d['norm' + c] = min_max * (np.floor(d['max' + c]) - np.ceil(d['min' + c]))
        d['mean_abs_chg' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.mean(np.abs(np.diff(x))))
        d['abs_max' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.max(np.abs(x)))
        d['abs_min' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.min(np.abs(x)))
        d['range' + c] = d['max' + c] - d['min' + c]
        d['maxtomin' + c] = d['max' + c] / d['min' + c]
        d['abs_avg' + c] = (d['abs_min' + c] + d['abs_max' + c]) / 2
        # -----------------------------------------------
        for v in d:
            df[v] = df[c].map(d[v].to_dict())

    for c in [c1 for c1 in df.columns if c1 not in ['time', 'signal', 'signal_clean', 'open_channels', 'batch', 'batch_' + str(n), 
                                                    'batch_index_' + str(n), 'batch_slices_' + str(n), 
                                                    'batch_slices2_' + str(n)]]:
        df[c + '_msignal'] = df[c] - df['signal_clean']
        
    df.reset_index(drop = True, inplace = True)
        
    return df

In [3]:
train, test, submission = read_data()

Reading training, testing and submission data...
Train set has 5000000 rows and 4 columns
Test set has 2000000 rows and 3 columns


In [4]:
pre_train4 = divide_and_add_features(train['signal_clean'])
pre_test4 = divide_and_add_features(test['signal_clean'])
pre_train4.drop(['signal_clean'], inplace=True, axis=1)
pre_test4.drop(['signal_clean'], inplace=True, axis = 1)
pre_train4.reset_index(inplace=True, drop=True)
pre_test4.reset_index(inplace=True, drop=True)
pre_train4 = reduce_mem_usage(pre_train4)
pre_test4 = reduce_mem_usage(pre_test4)

100%|██████████| 10/10 [00:03<00:00,  2.50it/s]
100%|██████████| 4/4 [00:01<00:00,  2.72it/s]


Mem. usage decreased to 514.98 Mb (75.0% reduction)
Mem. usage decreased to 205.99 Mb (75.0% reduction)


In [5]:
train, test = get_batch(train, test)
pre_train1, pre_test1 = rolling_features(train, test)
pre_train1 = reduce_mem_usage(pre_train1)
pre_test1 = reduce_mem_usage(pre_test1)
pre_train2 = static_batch_features(train, 25000)
pre_train2 = reduce_mem_usage(pre_train2)
pre_test2 = static_batch_features(test, 25000)
pre_test2 = reduce_mem_usage(pre_test2)

Mem. usage decreased to 858.31 Mb (73.2% reduction)
Mem. usage decreased to 343.32 Mb (73.2% reduction)
Mem. usage decreased to 820.16 Mb (73.1% reduction)
Mem. usage decreased to 328.06 Mb (73.1% reduction)


In [6]:
feat2 = [col for col in pre_train2.columns if col not in ['open_channels', 'signal', 'time', 'batch_25000', 'batch_index_25000', 'batch_slices_25000', 'batch_slices2_25000']]
pre_train = pd.concat([pre_train1, pre_train2[feat2], pre_train4], axis = 1)
pre_test = pd.concat([pre_test1, pre_test2[feat2], pre_test4], axis = 1)
del pre_train1, pre_train2, pre_train4, pre_test1, pre_test2, pre_test4

In [7]:
features = [col for col in pre_train.columns if col not in ['open_channels', 'time', 'batch']]

In [8]:
# params = {
#     'boosting_type': 'gbdt',
#     'metric': 'rmse',
#     'objective': 'regression',
#     'n_jobs': 6,
#     'seed': 236,
#     'num_leaves': 280,
#     'learning_rate': 0.026623466966581126,
#     'max_depth': 80,
#     'lambda_l1': 2.959759088169741,
#     'lambda_l2': 1.331172832164913,
#     'bagging_fraction': 0.9655406551472153,
#     'bagging_freq': 9,
#     'colsample_bytree': 0.6867118652742716
# }

params = {
    "boosting": "gbdt",
    "metric": 'rmse',
    'objective': 'huber',
    'random_state': 236,
    'num_leaves': 280,
    'learning_rate': 0.026623466966581126,
    'max_depth': 80,
    'reg_alpha': 2.959759088169741, # L1
    'reg_lambda': 1.331172832164913, # L2
    "bagging_fraction": 0.9655406551472153,
    "bagging_freq": 9,
    'colsample_bytree': 0.6867118652742716
}

In [9]:
pre_train.head()

Unnamed: 0,time,signal,open_channels,signal_clean,batch,lag_t1,lag_t2,lag_t3,lead_t1,lead_t2,lead_t3,signalmean_t1000,signalstd_t1000,signalvar_t1000,signalmin_t1000,signalmax_t1000,norm_t1000,signalmean_t1000_lead,signalstd_t1000_lead,signalvar_t1000_lead,signalmin_t1000_lead,signalmax_t1000_lead,norm_t1000_lead,signalmean_t5000,signalstd_t5000,signalvar_t5000,signalmin_t5000,signalmax_t5000,norm_t5000,signalmean_t5000_lead,signalstd_t5000_lead,signalvar_t5000_lead,signalmin_t5000_lead,signalmax_t5000_lead,norm_t5000_lead,signalmean_t10000,signalstd_t10000,signalvar_t10000,signalmin_t10000,signalmax_t10000,norm_t10000,signalmean_t10000_lead,signalstd_t10000_lead,signalvar_t10000_lead,signalmin_t10000_lead,signalmax_t10000_lead,norm_t10000_lead,signalmean_t20000,signalstd_t20000,signalvar_t20000,signalmin_t20000,signalmax_t20000,norm_t20000,signalmean_t20000_lead,signalstd_t20000_lead,signalvar_t20000_lead,signalmin_t20000_lead,signalmax_t20000_lead,norm_t20000_lead,signalmean_t40000,signalstd_t40000,signalvar_t40000,signalmin_t40000,signalmax_t40000,norm_t40000,signalmean_t40000_lead,signalstd_t40000_lead,signalvar_t40000_lead,signalmin_t40000_lead,signalmax_t40000_lead,norm_t40000_lead,signalmean_t80000,signalstd_t80000,signalvar_t80000,signalmin_t80000,signalmax_t80000,norm_t80000,signalmean_t80000_lead,signalstd_t80000_lead,signalvar_t80000_lead,signalmin_t80000_lead,signalmax_t80000_lead,norm_t80000_lead,signal_clean.1,meanbatch_25000,medianbatch_25000,maxbatch_25000,minbatch_25000,stdbatch_25000,p10batch_25000,p25batch_25000,p75batch_25000,p90batch_25000,skewbatch_25000,kurtosisbatch_25000,normbatch_25000,mean_abs_chgbatch_25000,abs_maxbatch_25000,abs_minbatch_25000,rangebatch_25000,maxtominbatch_25000,abs_avgbatch_25000,meanbatch_slices2_25000,medianbatch_slices2_25000,maxbatch_slices2_25000,minbatch_slices2_25000,stdbatch_slices2_25000,p10batch_slices2_25000,p25batch_slices2_25000,p75batch_slices2_25000,p90batch_slices2_25000,skewbatch_slices2_25000,kurtosisbatch_slices2_25000,normbatch_slices2_25000,mean_abs_chgbatch_slices2_25000,abs_maxbatch_slices2_25000,abs_minbatch_slices2_25000,rangebatch_slices2_25000,maxtominbatch_slices2_25000,abs_avgbatch_slices2_25000,meanbatch_25000_msignal,medianbatch_25000_msignal,maxbatch_25000_msignal,minbatch_25000_msignal,stdbatch_25000_msignal,p10batch_25000_msignal,p25batch_25000_msignal,p75batch_25000_msignal,p90batch_25000_msignal,skewbatch_25000_msignal,kurtosisbatch_25000_msignal,normbatch_25000_msignal,mean_abs_chgbatch_25000_msignal,abs_maxbatch_25000_msignal,abs_minbatch_25000_msignal,rangebatch_25000_msignal,maxtominbatch_25000_msignal,abs_avgbatch_25000_msignal,meanbatch_slices2_25000_msignal,medianbatch_slices2_25000_msignal,maxbatch_slices2_25000_msignal,minbatch_slices2_25000_msignal,stdbatch_slices2_25000_msignal,p10batch_slices2_25000_msignal,p25batch_slices2_25000_msignal,p75batch_slices2_25000_msignal,p90batch_slices2_25000_msignal,skewbatch_slices2_25000_msignal,kurtosisbatch_slices2_25000_msignal,normbatch_slices2_25000_msignal,mean_abs_chgbatch_slices2_25000_msignal,abs_maxbatch_slices2_25000_msignal,abs_minbatch_slices2_25000_msignal,rangebatch_slices2_25000_msignal,maxtominbatch_slices2_25000_msignal,abs_avgbatch_slices2_25000_msignal,grad_1,grad_2,grad_3,grad_4,lowpass_lf_0.0100,lowpass_ff_0.0100,lowpass_lf_0.0154,lowpass_ff_0.0154,lowpass_lf_0.0239,lowpass_ff_0.0239,lowpass_lf_0.0369,lowpass_ff_0.0369,lowpass_lf_0.0570,lowpass_ff_0.0570,lowpass_lf_0.0880,lowpass_ff_0.0880,lowpass_lf_0.1359,lowpass_ff_0.1359,lowpass_lf_0.2100,lowpass_ff_0.2100,lowpass_lf_0.3244,lowpass_ff_0.3244,lowpass_lf_0.5012,lowpass_ff_0.5012,highpass_lf_0.0100,highpass_ff_0.0100,highpass_lf_0.0163,highpass_ff_0.0163,highpass_lf_0.0264,highpass_ff_0.0264,highpass_lf_0.0430,highpass_ff_0.0430,highpass_lf_0.0699,highpass_ff_0.0699,highpass_lf_0.1136,highpass_ff_0.1136,highpass_lf_0.1848,highpass_ff_0.1848,highpass_lf_0.3005,highpass_ff_0.3005,highpass_lf_0.4885,highpass_ff_0.4885,highpass_lf_0.7943,highpass_ff_0.7943,ewm_mean_10,ewm_std_10,ewm_mean_50,ewm_std_50,ewm_mean_100,ewm_std_100,ewm_mean_500,ewm_std_500,ewm_mean_1000,ewm_std_1000
0,0.0001,-2.759766,0.0,-2.759766,1.0,,,,-2.855469,-2.408203,-3.140625,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.759766,-2.683594,-2.691406,-0.765137,-3.654297,0.272949,-3.005859,-2.859375,-2.527344,-2.375,0.963379,4.140625,0.672363,0.27417,3.654297,0.765137,2.890625,0.209351,2.210938,-2.693359,-2.693359,-1.90332,-3.466797,0.244385,-3.009766,-2.861328,-2.533203,-2.373047,0.02803,-0.052124,0.494385,0.269043,3.466797,1.90332,1.564453,0.548828,2.685547,0.076233,0.067688,1.995117,-0.89502,3.033203,-0.245239,-0.098999,0.232788,0.385742,3.722656,6.902344,3.431641,3.035156,6.414062,3.525391,5.648438,2.96875,4.96875,0.066223,0.065979,0.856934,-0.707031,3.003906,-0.25,-0.100952,0.22644,0.38623,2.787109,2.707031,3.253906,3.029297,6.226562,4.664062,4.324219,3.308594,5.445312,-0.006378,0.018127,-0.019684,0.001474,-0.18396,-0.186157,-0.18396,-0.186401,-0.18396,-0.186157,-0.18396,-0.185425,-0.18396,-0.184814,-0.18396,-0.184204,-0.18396,-0.18396,-0.18396,-0.18396,-0.18396,-0.18396,-0.18396,-0.18396,0.0,0.002104,-0.0,0.002359,0.0,0.001995,0.0,0.001215,0.0,0.000477,0.0,8e-05,0.0,-1.1e-05,0.0,-2e-06,0.0,-0.0,0.0,-2.5e-05,-0.18396,0.0,-0.18396,0.0,-0.18396,0.0,-0.18396,0.0,-0.18396,0.0
1,0.0002,-2.855469,0.0,-2.855469,1.0,-2.759766,,,-2.408203,-3.140625,-3.152344,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.855469,-2.683594,-2.691406,-0.765137,-3.654297,0.272949,-3.005859,-2.859375,-2.527344,-2.375,0.963379,4.140625,0.672363,0.27417,3.654297,0.765137,2.890625,0.209351,2.210938,-2.693359,-2.693359,-1.90332,-3.466797,0.244385,-3.009766,-2.861328,-2.533203,-2.373047,0.02803,-0.052124,0.494385,0.269043,3.466797,1.90332,1.564453,0.548828,2.685547,0.171997,0.163452,2.089844,-0.799316,3.128906,-0.149536,-0.0033,0.328613,0.481445,3.818359,6.996094,3.527344,3.130859,6.511719,3.621094,5.746094,3.064453,5.066406,0.161987,0.161743,0.952637,-0.611328,3.099609,-0.154297,-0.005226,0.322266,0.481934,2.882812,2.802734,3.349609,3.125,6.324219,4.757812,4.417969,3.404297,5.539062,0.011757,-0.001555,-0.018219,0.013496,-0.184082,-0.186035,-0.184204,-0.186401,-0.184204,-0.186157,-0.184326,-0.185669,-0.18457,-0.185059,-0.184814,-0.184692,-0.185181,-0.18457,-0.185669,-0.184082,-0.186279,-0.182861,-0.187256,-0.181274,-0.006283,-0.004295,-0.006222,-0.004002,-0.006126,-0.004303,-0.005978,-0.00499,-0.005745,-0.005569,-0.005405,-0.005798,-0.004913,-0.006065,-0.004223,-0.007271,-0.003248,-0.009033,-0.0016,-0.005028,-0.1875,0.004513,-0.187256,0.004513,-0.187256,0.004513,-0.187256,0.004513,-0.187134,0.004513
2,0.0003,-2.408203,0.0,-2.408203,1.0,-2.855469,-2.759766,,-3.140625,-3.152344,-2.642578,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.408203,-2.683594,-2.691406,-0.765137,-3.654297,0.272949,-3.005859,-2.859375,-2.527344,-2.375,0.963379,4.140625,0.672363,0.27417,3.654297,0.765137,2.890625,0.209351,2.210938,-2.693359,-2.693359,-1.90332,-3.466797,0.244385,-3.009766,-2.861328,-2.533203,-2.373047,0.02803,-0.052124,0.494385,0.269043,3.466797,1.90332,1.564453,0.548828,2.685547,-0.276367,-0.284912,1.642578,-1.248047,2.679688,-0.597656,-0.45166,-0.119812,0.033112,3.371094,6.546875,3.080078,2.681641,6.0625,3.171875,5.296875,2.617188,4.617188,-0.286377,-0.286621,0.504395,-1.05957,2.652344,-0.602539,-0.453613,-0.126099,0.0336,2.435547,2.355469,2.902344,2.675781,5.875,4.308594,3.970703,2.957031,5.09375,-0.009491,-0.018295,0.007305,0.018677,-0.183838,-0.186035,-0.183716,-0.186401,-0.183594,-0.186279,-0.18335,-0.185791,-0.182983,-0.185425,-0.182495,-0.185303,-0.181641,-0.185669,-0.18042,-0.185791,-0.178467,-0.184326,-0.175415,-0.180176,0.023331,0.025574,0.023224,0.025925,0.023056,0.025696,0.022781,0.025146,0.022308,0.024826,0.021561,0.02504,0.020355,0.025314,0.018417,0.024261,0.015152,0.02002,0.008293,0.008034,-0.176636,0.016617,-0.177979,0.015915,-0.178101,0.015823,-0.178223,0.015762,-0.178223,0.015747
3,0.0004,-3.140625,0.0,-3.140625,1.0,-2.408203,-2.855469,-2.759766,-3.152344,-2.642578,-2.699219,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-3.140625,-2.683594,-2.691406,-0.765137,-3.654297,0.272949,-3.005859,-2.859375,-2.527344,-2.375,0.963379,4.140625,0.672363,0.27417,3.654297,0.765137,2.890625,0.209351,2.210938,-2.693359,-2.693359,-1.90332,-3.466797,0.244385,-3.009766,-2.861328,-2.533203,-2.373047,0.02803,-0.052124,0.494385,0.269043,3.466797,1.90332,1.564453,0.548828,2.685547,0.456543,0.447998,2.375,-0.514648,3.414062,0.135132,0.281494,0.613281,0.766113,4.105469,7.28125,3.8125,3.414062,6.796875,3.90625,6.03125,3.349609,5.351562,0.446533,0.446289,1.237305,-0.326416,3.384766,0.130371,0.279541,0.606934,0.766602,3.167969,3.087891,3.634766,3.410156,6.605469,5.042969,4.703125,3.689453,5.824219,-0.024841,0.013054,0.019135,-0.008789,-0.183838,-0.186035,-0.183838,-0.186401,-0.183716,-0.186401,-0.183594,-0.186035,-0.18335,-0.185913,-0.183105,-0.186401,-0.182861,-0.187866,-0.182739,-0.190308,-0.183105,-0.193481,-0.184937,-0.197388,-0.025497,-0.0233,-0.025574,-0.022903,-0.025696,-0.02301,-0.025879,-0.023361,-0.026138,-0.023285,-0.026428,-0.022171,-0.026627,-0.019836,-0.026382,-0.01651,-0.024597,-0.012253,-0.016388,-0.005165,-0.187378,0.022034,-0.186279,0.020538,-0.186157,0.020355,-0.186035,0.020203,-0.186035,0.020187
4,0.0005,-3.152344,0.0,-3.152344,1.0,-3.140625,-2.408203,-2.855469,-2.642578,-2.699219,-2.59375,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-3.152344,-2.683594,-2.691406,-0.765137,-3.654297,0.272949,-3.005859,-2.859375,-2.527344,-2.375,0.963379,4.140625,0.672363,0.27417,3.654297,0.765137,2.890625,0.209351,2.210938,-2.693359,-2.693359,-1.90332,-3.466797,0.244385,-3.009766,-2.861328,-2.533203,-2.373047,0.02803,-0.052124,0.494385,0.269043,3.466797,1.90332,1.564453,0.548828,2.685547,0.46875,0.460205,2.386719,-0.502441,3.425781,0.147339,0.293457,0.625488,0.77832,4.117188,7.292969,3.824219,3.425781,6.808594,3.917969,6.042969,3.361328,5.363281,0.45874,0.458496,1.25,-0.314453,3.396484,0.142456,0.291504,0.619141,0.778809,3.179688,3.099609,3.646484,3.421875,6.621094,5.054688,4.714844,3.701172,5.835938,0.016617,0.019974,-0.010277,-0.016312,-0.184692,-0.186035,-0.185059,-0.186401,-0.185547,-0.186523,-0.186401,-0.186279,-0.187744,-0.186157,-0.189575,-0.186646,-0.192383,-0.188232,-0.196533,-0.191162,-0.202271,-0.195435,-0.209839,-0.201538,-0.025497,-0.024124,-0.025085,-0.023682,-0.024429,-0.023743,-0.023361,-0.024033,-0.021667,-0.023895,-0.019028,-0.022675,-0.014999,-0.020004,-0.009087,-0.015556,-0.000854,-0.009071,0.007965,-0.001169,-0.19397,0.021454,-0.191406,0.020752,-0.191162,0.020645,-0.190918,0.020538,-0.190918,0.020538


In [10]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
target = 'open_channels'

In [11]:
for fold, (trn_ndcs, vld_ndcs) in enumerate(kf.split(pre_train, pre_train[target])):
    x_trn, x_vld = pre_train[features].iloc[trn_ndcs], pre_train[features].iloc[vld_ndcs]
    y_trn, y_vld = pre_train[target][trn_ndcs], pre_train[target][vld_ndcs]
    #trn_set = lgb.Dataset(x_trn, y_trn)
    #vld_set = lgb.Dataset(x_vld, y_vld)
    break

In [12]:
# model = lgb.train(params, trn_set, num_boost_round=10000, early_stopping_rounds=100, valid_sets=[vld_set], verbose_eval=50)
model = lgb.LGBMRegressor(**params, n_estimators=10000, n_jobs=12)
model.fit(X=x_trn, y=y_trn, eval_set=[(x_vld, y_vld)], eval_metric='rmse', verbose=50, early_stopping_rounds=100)

Training until validation scores don't improve for 100 rounds
[50]	valid_0's rmse: 1.7831
[100]	valid_0's rmse: 1.13112
[150]	valid_0's rmse: 0.692472
[200]	valid_0's rmse: 0.377599
[250]	valid_0's rmse: 0.205497
[300]	valid_0's rmse: 0.159849
[350]	valid_0's rmse: 0.155435
[400]	valid_0's rmse: 0.155023
[450]	valid_0's rmse: 0.154923
[500]	valid_0's rmse: 0.154859
[550]	valid_0's rmse: 0.154815
[600]	valid_0's rmse: 0.154777
[650]	valid_0's rmse: 0.154733
[700]	valid_0's rmse: 0.154707
[750]	valid_0's rmse: 0.154669
[800]	valid_0's rmse: 0.154642
[850]	valid_0's rmse: 0.154607
[900]	valid_0's rmse: 0.15458
[950]	valid_0's rmse: 0.154569
[1000]	valid_0's rmse: 0.154553
[1050]	valid_0's rmse: 0.154533
[1100]	valid_0's rmse: 0.154519
[1150]	valid_0's rmse: 0.154485
[1200]	valid_0's rmse: 0.15447
[1250]	valid_0's rmse: 0.154459
[1300]	valid_0's rmse: 0.15445
[1350]	valid_0's rmse: 0.154436
[1400]	valid_0's rmse: 0.154421
[1450]	valid_0's rmse: 0.154407
[1500]	valid_0's rmse: 0.154396
[155

LGBMRegressor(bagging_fraction=0.9655406551472153, bagging_freq=9,
              boosting='gbdt', boosting_type='gbdt', class_weight=None,
              colsample_bytree=0.6867118652742716, importance_type='split',
              learning_rate=0.026623466966581126, max_depth=80, metric='rmse',
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10000, n_jobs=12, num_leaves=280, objective='huber',
              random_state=236, reg_alpha=2.959759088169741,
              reg_lambda=1.331172832164913, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

In [14]:
vld_pred = model.predict(x_vld, num_iteration=model.best_iteration_)
vld_pred = np.round(np.clip(vld_pred, 0, 10)).astype(int)

In [25]:
f1 = metrics.f1_score(y_vld.astype(int), vld_pred, average = 'macro')
print(f1)

0.9384238806036712
