In [1]:
import numpy as np 
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import gc
warnings.filterwarnings('ignore')
import lightgbm as lgb
from sklearn.model_selection import GroupKFold, StratifiedKFold, train_test_split
from sklearn import metrics
from tqdm import tqdm
from scipy import signal
from pykalman import KalmanFilter
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_rows', 10000)

In [2]:
def read_data():
    print('Reading training, testing and submission data...')
    
    train = pd.read_csv('../input/train.csv')
    train_clean = np.load("../input/train_detrend.npz", allow_pickle=True)
    assert np.all(train['open_channels'] == train_clean['train_opench'])
    train['signal_clean'] = train_clean['train_signal']
    
    test = pd.read_csv('../input/test.csv')
    test_clean = np.load("../input/test_detrend.npz", allow_pickle=True)
    test['signal_clean'] = test_clean['test_signal']
    
    submission = pd.read_csv('../input/sample_submission.csv', dtype={'time':str})
    print('Train set has {} rows and {} columns'.format(train.shape[0], train.shape[1]))
    print('Test set has {} rows and {} columns'.format(test.shape[0], test.shape[1]))
    return train, test, submission

def get_batch(train, test):
    # concatenate data
    batch = 50
    total_batches = 14
    train['set'] = 'train'
    test['set'] = 'test'
    data = pd.concat([train, test])
    
    for i in range(int(total_batches)):
        data.loc[(data['time'] > i * batch) & (data['time'] <= (i + 1) * batch), 'batch'] = i + 1
        
    train = data[data['set'] == 'train']
    test = data[data['set'] == 'test']
    train.drop(['set'], inplace = True, axis = 1)
    test.drop(['set'], inplace = True, axis = 1)
    del data
    return train, test

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        if col!='open_channels':
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def calc_gradients(s, n_grads = 4):
    '''
    Calculate gradients for a pandas series. Returns the same number of samples
    '''
    grads = pd.DataFrame()
    
    g = s.values
    for i in range(n_grads):
        g = np.gradient(g)
        grads['grad_' + str(i+1)] = g
        
    return grads

def calc_low_pass(s, n_filts=10):
    '''
    Applies low pass filters to the signal. Left delayed and no delayed
    '''
    wns = np.logspace(-2, -0.3, n_filts)
    
    low_pass = pd.DataFrame()
    x = s.values
    for wn in wns:
        b, a = signal.butter(1, Wn=wn, btype='low')
        zi = signal.lfilter_zi(b, a)
        low_pass['lowpass_lf_' + str('%.4f' %wn)] = signal.lfilter(b, a, x, zi=zi*x[0])[0]
        low_pass['lowpass_ff_' + str('%.4f' %wn)] = signal.filtfilt(b, a, x)
        
    return low_pass

def calc_high_pass(s, n_filts=10):
    '''
    Applies high pass filters to the signal. Left delayed and no delayed
    '''
    wns = np.logspace(-2, -0.1, n_filts)
    
    high_pass = pd.DataFrame()
    x = s.values
    for wn in wns:
        b, a = signal.butter(1, Wn=wn, btype='high')
        zi = signal.lfilter_zi(b, a)
        high_pass['highpass_lf_' + str('%.4f' %wn)] = signal.lfilter(b, a, x, zi=zi*x[0])[0]
        high_pass['highpass_ff_' + str('%.4f' %wn)] = signal.filtfilt(b, a, x)
        
    return high_pass

def calc_ewm(s, windows=[10, 50, 100, 500, 1000]):
    '''
    Calculates exponential weighted functions
    '''
    ewm = pd.DataFrame()
    for w in windows:
        ewm['ewm_mean_' + str(w)] = s.ewm(span=w, min_periods=1).mean()
        ewm['ewm_std_' + str(w)] = s.ewm(span=w, min_periods=1).std()
        
    # add zeros when na values (std)
    ewm = ewm.fillna(value=0)
        
    return ewm


def add_features(s):
    '''
    All calculations together
    '''
    
    gradients = calc_gradients(s)
    low_pass = calc_low_pass(s)
    high_pass = calc_high_pass(s)
    ewm = calc_ewm(s)
    
    return pd.concat([s, gradients, low_pass, high_pass, ewm], axis=1)


def divide_and_add_features(s, signal_size=500000):
    '''
    Divide the signal in bags of "signal_size".
    Normalize the data dividing it by 15.0
    '''
    # normalize
    s = s / 15.0
    
    ls = []
    # this is just to divide the data up into batches (bags) to keep track of progress
    # output is still the same size as input
    for i in tqdm(range(int(s.shape[0]/signal_size))):
        sig = s[i*signal_size:(i+1)*signal_size].copy().reset_index(drop=True)
        sig_featured = add_features(sig)
        ls.append(sig_featured)
    
    return pd.concat(ls, axis=0)

def rolling_features(train, test):
    
    pre_train = train.copy()
    pre_test = test.copy()
    
        
    for df in [pre_train, pre_test]:
        
        df['lag_t1'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(1))
        df['lag_t2'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(2))
        df['lag_t3'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(3))
        
        df['lead_t1'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(-1))
        df['lead_t2'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(-2))
        df['lead_t3'] = df.groupby('batch')['signal_clean'].transform(lambda x: x.shift(-3))
                
        for window in [1000, 5000, 10000, 20000, 40000, 80000]:
            
            # roll backwards
            df['signalmean_t' + str(window)] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(1).rolling(window).mean())
            df['signalstd_t' + str(window)] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(1).rolling(window).std())
            df['signalvar_t' + str(window)] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(1).rolling(window).var())
            df['signalmin_t' + str(window)] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(1).rolling(window).min())
            df['signalmax_t' + str(window)] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(1).rolling(window).max())
            min_max = (df['signal_clean'] - df['signalmin_t' + str(window)]) / (df['signalmax_t' + str(window)] - df['signalmin_t' + str(window)])
            df['norm_t' + str(window)] = min_max * (np.floor(df['signalmax_t' + str(window)]) - np.ceil(df['signalmin_t' + str(window)]))
            
            # roll forward
            df['signalmean_t' + str(window) + '_lead'] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(- window - 1).rolling(window).mean())
            df['signalstd_t' + str(window) + '_lead'] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(- window - 1).rolling(window).std())
            df['signalvar_t' + str(window) + '_lead'] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(- window - 1).rolling(window).var())
            df['signalmin_t' + str(window) + '_lead'] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(- window - 1).rolling(window).min())
            df['signalmax_t' + str(window) + '_lead'] = df.groupby(['batch'])['signal_clean'].transform(lambda x: x.shift(- window - 1).rolling(window).max())   
            min_max = (df['signal_clean'] - df['signalmin_t' + str(window) + '_lead']) / (df['signalmax_t' + str(window) + '_lead'] - df['signalmin_t' + str(window) + '_lead'])
            df['norm_t' + str(window) + '_lead'] = min_max * (np.floor(df['signalmax_t' + str(window) + '_lead']) - np.ceil(df['signalmin_t' + str(window) + '_lead']))
            
    del train, test, min_max
    
    return pre_train, pre_test

def static_batch_features(df, n):
    
    df = df.copy()
    df.drop('batch', inplace = True, axis = 1)
    df = df.sort_values(by=['time']).reset_index(drop=True)
    df.index = ((df.time * 10000) - 1).values
    df['batch_' + str(n)] = df.index // n
    df['batch_index_' + str(n)] = df.index  - (df['batch_' + str(n)] * n)
    df['batch_slices_' + str(n)] = df['batch_index_' + str(n)]  // (n / 10)
    df['batch_slices2_' + str(n)] = df.apply(lambda r: '_'.join([str(r['batch_' + str(n)]).zfill(3), str(r['batch_slices_' + str(n)]).zfill(3)]), axis=1)

    for c in ['batch_' + str(n), 'batch_slices2_' + str(n)]:
        d = {}
        # -----------------------------------------------
        d['mean' + c] = df.groupby([c])['signal_clean'].mean()
        d['median' + c] = df.groupby([c])['signal_clean'].median()
        d['max' + c] = df.groupby([c])['signal_clean'].max()
        d['min' + c] = df.groupby([c])['signal_clean'].min()
        d['std' + c] = df.groupby([c])['signal_clean'].std()
        d['p10' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.percentile(x, 10))
        d['p25' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.percentile(x, 25))
        d['p75' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.percentile(x, 75))
        d['p90' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.percentile(x, 90))
        d['skew' + c] = df.groupby([c])['signal_clean'].apply(lambda x: pd.Series(x).skew())
        d['kurtosis' + c] = df.groupby([c])['signal_clean'].apply(lambda x: pd.Series(x).kurtosis())
        min_max = (d['mean' + c] - d['min' + c]) / (d['max' + c] - d['min' + c])
        d['norm' + c] = min_max * (np.floor(d['max' + c]) - np.ceil(d['min' + c]))
        d['mean_abs_chg' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.mean(np.abs(np.diff(x))))
        d['abs_max' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.max(np.abs(x)))
        d['abs_min' + c] = df.groupby([c])['signal_clean'].apply(lambda x: np.min(np.abs(x)))
        d['range' + c] = d['max' + c] - d['min' + c]
        d['maxtomin' + c] = d['max' + c] / d['min' + c]
        d['abs_avg' + c] = (d['abs_min' + c] + d['abs_max' + c]) / 2
        # -----------------------------------------------
        for v in d:
            df[v] = df[c].map(d[v].to_dict())

    for c in [c1 for c1 in df.columns if c1 not in ['time', 'signal', 'signal_clean', 'open_channels', 'batch', 'batch_' + str(n), 
                                                    'batch_index_' + str(n), 'batch_slices_' + str(n), 
                                                    'batch_slices2_' + str(n)]]:
        df[c + '_msignal'] = df[c] - df['signal_clean']
        
    df.reset_index(drop = True, inplace = True)
        
    return df

def Kalman1D(observations,damping=1):
    # To return the smoothed time series data
    observation_covariance = damping
    initial_value_guess = observations[0]
    transition_matrix = 1
    transition_covariance = 0.1
    initial_value_guess
    kf = KalmanFilter(
            initial_state_mean=initial_value_guess,
            initial_state_covariance=observation_covariance,
            observation_covariance=observation_covariance,
            transition_covariance=transition_covariance,
            transition_matrices=transition_matrix
        )
    pred_state, state_cov = kf.smooth(observations)
    return pred_state

In [3]:
train, test, submission = read_data()

Reading training, testing and submission data...
Train set has 5000000 rows and 4 columns
Test set has 2000000 rows and 3 columns


In [4]:
observation_covariance = .0015
train['signal_clean'] = Kalman1D(train.signal_clean.values, observation_covariance)
test['signal_clean'] = Kalman1D(test.signal_clean.values, observation_covariance)

In [5]:
pre_train4 = divide_and_add_features(train['signal_clean'])
pre_test4 = divide_and_add_features(test['signal_clean'])
pre_train4.drop(['signal_clean'], inplace=True, axis=1)
pre_test4.drop(['signal_clean'], inplace=True, axis = 1)
pre_train4.reset_index(inplace=True, drop=True)
pre_test4.reset_index(inplace=True, drop=True)
pre_train4 = reduce_mem_usage(pre_train4)
pre_test4 = reduce_mem_usage(pre_test4)

100%|██████████| 10/10 [00:03<00:00,  2.51it/s]
100%|██████████| 4/4 [00:01<00:00,  2.63it/s]


Mem. usage decreased to 514.98 Mb (75.0% reduction)
Mem. usage decreased to 205.99 Mb (75.0% reduction)


In [6]:
train, test = get_batch(train, test)
pre_train1, pre_test1 = rolling_features(train, test)
pre_train1 = reduce_mem_usage(pre_train1)
pre_test1 = reduce_mem_usage(pre_test1)
pre_train2 = static_batch_features(train, 25000)
pre_train2 = reduce_mem_usage(pre_train2)
pre_test2 = static_batch_features(test, 25000)
pre_test2 = reduce_mem_usage(pre_test2)

Mem. usage decreased to 858.31 Mb (73.2% reduction)
Mem. usage decreased to 343.32 Mb (73.2% reduction)
Mem. usage decreased to 820.16 Mb (73.1% reduction)
Mem. usage decreased to 328.06 Mb (73.1% reduction)


In [7]:
feat2 = [col for col in pre_train2.columns if col not in ['open_channels', 'signal', 'time', 'batch_25000', 'batch_index_25000', 'batch_slices_25000', 'batch_slices2_25000']]
pre_train = pd.concat([pre_train1, pre_train2[feat2], pre_train4], axis = 1)
pre_test = pd.concat([pre_test1, pre_test2[feat2], pre_test4], axis = 1)
del pre_train1, pre_train2, pre_train4, pre_test1, pre_test2, pre_test4

In [8]:
features = [col for col in pre_train.columns if col not in ['open_channels', 'time', 'batch']]

In [9]:
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'n_jobs': 6,
    'seed': 236,
    'num_leaves': 280,
    'learning_rate': 0.026623466966581126,
    'max_depth': 80,
    'lambda_l1': 2.959759088169741,
    'lambda_l2': 1.331172832164913,
    'bagging_fraction': 0.9655406551472153,
    'bagging_freq': 9,
    'colsample_bytree': 0.6867118652742716
}

In [10]:
pre_train.head()

Unnamed: 0,time,signal,open_channels,signal_clean,batch,lag_t1,lag_t2,lag_t3,lead_t1,lead_t2,lead_t3,signalmean_t1000,signalstd_t1000,signalvar_t1000,signalmin_t1000,signalmax_t1000,norm_t1000,signalmean_t1000_lead,signalstd_t1000_lead,signalvar_t1000_lead,signalmin_t1000_lead,signalmax_t1000_lead,norm_t1000_lead,signalmean_t5000,signalstd_t5000,signalvar_t5000,signalmin_t5000,signalmax_t5000,norm_t5000,signalmean_t5000_lead,signalstd_t5000_lead,signalvar_t5000_lead,signalmin_t5000_lead,signalmax_t5000_lead,norm_t5000_lead,signalmean_t10000,signalstd_t10000,signalvar_t10000,signalmin_t10000,signalmax_t10000,norm_t10000,signalmean_t10000_lead,signalstd_t10000_lead,signalvar_t10000_lead,signalmin_t10000_lead,signalmax_t10000_lead,norm_t10000_lead,signalmean_t20000,signalstd_t20000,signalvar_t20000,signalmin_t20000,signalmax_t20000,norm_t20000,signalmean_t20000_lead,signalstd_t20000_lead,signalvar_t20000_lead,signalmin_t20000_lead,signalmax_t20000_lead,norm_t20000_lead,signalmean_t40000,signalstd_t40000,signalvar_t40000,signalmin_t40000,signalmax_t40000,norm_t40000,signalmean_t40000_lead,signalstd_t40000_lead,signalvar_t40000_lead,signalmin_t40000_lead,signalmax_t40000_lead,norm_t40000_lead,signalmean_t80000,signalstd_t80000,signalvar_t80000,signalmin_t80000,signalmax_t80000,norm_t80000,signalmean_t80000_lead,signalstd_t80000_lead,signalvar_t80000_lead,signalmin_t80000_lead,signalmax_t80000_lead,norm_t80000_lead,signal_clean.1,meanbatch_25000,medianbatch_25000,maxbatch_25000,minbatch_25000,stdbatch_25000,p10batch_25000,p25batch_25000,p75batch_25000,p90batch_25000,skewbatch_25000,kurtosisbatch_25000,normbatch_25000,mean_abs_chgbatch_25000,abs_maxbatch_25000,abs_minbatch_25000,rangebatch_25000,maxtominbatch_25000,abs_avgbatch_25000,meanbatch_slices2_25000,medianbatch_slices2_25000,maxbatch_slices2_25000,minbatch_slices2_25000,stdbatch_slices2_25000,p10batch_slices2_25000,p25batch_slices2_25000,p75batch_slices2_25000,p90batch_slices2_25000,skewbatch_slices2_25000,kurtosisbatch_slices2_25000,normbatch_slices2_25000,mean_abs_chgbatch_slices2_25000,abs_maxbatch_slices2_25000,abs_minbatch_slices2_25000,rangebatch_slices2_25000,maxtominbatch_slices2_25000,abs_avgbatch_slices2_25000,meanbatch_25000_msignal,medianbatch_25000_msignal,maxbatch_25000_msignal,minbatch_25000_msignal,stdbatch_25000_msignal,p10batch_25000_msignal,p25batch_25000_msignal,p75batch_25000_msignal,p90batch_25000_msignal,skewbatch_25000_msignal,kurtosisbatch_25000_msignal,normbatch_25000_msignal,mean_abs_chgbatch_25000_msignal,abs_maxbatch_25000_msignal,abs_minbatch_25000_msignal,rangebatch_25000_msignal,maxtominbatch_25000_msignal,abs_avgbatch_25000_msignal,meanbatch_slices2_25000_msignal,medianbatch_slices2_25000_msignal,maxbatch_slices2_25000_msignal,minbatch_slices2_25000_msignal,stdbatch_slices2_25000_msignal,p10batch_slices2_25000_msignal,p25batch_slices2_25000_msignal,p75batch_slices2_25000_msignal,p90batch_slices2_25000_msignal,skewbatch_slices2_25000_msignal,kurtosisbatch_slices2_25000_msignal,normbatch_slices2_25000_msignal,mean_abs_chgbatch_slices2_25000_msignal,abs_maxbatch_slices2_25000_msignal,abs_minbatch_slices2_25000_msignal,rangebatch_slices2_25000_msignal,maxtominbatch_slices2_25000_msignal,abs_avgbatch_slices2_25000_msignal,grad_1,grad_2,grad_3,grad_4,lowpass_lf_0.0100,lowpass_ff_0.0100,lowpass_lf_0.0154,lowpass_ff_0.0154,lowpass_lf_0.0239,lowpass_ff_0.0239,lowpass_lf_0.0369,lowpass_ff_0.0369,lowpass_lf_0.0570,lowpass_ff_0.0570,lowpass_lf_0.0880,lowpass_ff_0.0880,lowpass_lf_0.1359,lowpass_ff_0.1359,lowpass_lf_0.2100,lowpass_ff_0.2100,lowpass_lf_0.3244,lowpass_ff_0.3244,lowpass_lf_0.5012,lowpass_ff_0.5012,highpass_lf_0.0100,highpass_ff_0.0100,highpass_lf_0.0163,highpass_ff_0.0163,highpass_lf_0.0264,highpass_ff_0.0264,highpass_lf_0.0430,highpass_ff_0.0430,highpass_lf_0.0699,highpass_ff_0.0699,highpass_lf_0.1136,highpass_ff_0.1136,highpass_lf_0.1848,highpass_ff_0.1848,highpass_lf_0.3005,highpass_ff_0.3005,highpass_lf_0.4885,highpass_ff_0.4885,highpass_lf_0.7943,highpass_ff_0.7943,ewm_mean_10,ewm_std_10,ewm_mean_50,ewm_std_50,ewm_mean_100,ewm_std_100,ewm_mean_500,ewm_std_500,ewm_mean_1000,ewm_std_1000
0,0.0001,-2.759766,0.0,-2.759766,1.0,,,,-2.847656,-2.423828,-3.130859,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.759766,-2.683594,-2.693359,-0.783691,-3.625,0.266602,-2.996094,-2.855469,-2.53125,-2.382812,1.015625,4.449219,0.663086,0.262451,3.625,0.783691,2.841797,0.216064,2.205078,-2.693359,-2.693359,-1.919922,-3.447266,0.237793,-3.0,-2.857422,-2.539062,-2.384766,0.023041,-0.050446,0.493408,0.257568,3.447266,1.919922,1.526367,0.557129,2.683594,0.076904,0.06781,1.977539,-0.865234,3.027344,-0.235718,-0.094238,0.228516,0.377197,3.775391,7.210938,3.423828,3.023438,6.386719,3.544922,5.601562,2.976562,4.964844,0.066895,0.067139,0.840332,-0.686523,2.998047,-0.240112,-0.095947,0.222534,0.376221,2.783203,2.710938,3.253906,3.017578,6.207031,4.679688,4.289062,3.318359,5.445312,-0.005825,0.017029,-0.018829,0.001498,-0.184082,-0.186157,-0.184082,-0.186401,-0.184082,-0.186157,-0.184082,-0.185547,-0.184082,-0.184814,-0.184082,-0.184326,-0.184082,-0.184082,-0.184082,-0.184082,-0.184082,-0.184082,-0.184082,-0.184082,0.0,0.002165,-0.0,0.002415,0.0,0.002041,0.0,0.001249,0.0,0.000498,0.0,8.9e-05,0.0,-9e-06,0.0,-2e-06,0.0,-0.0,0.0,-2.4e-05,-0.184082,0.0,-0.184082,0.0,-0.184082,0.0,-0.184082,0.0,-0.184082,0.0
1,0.0002,-2.855469,0.0,-2.847656,1.0,-2.759766,,,-2.423828,-3.130859,-3.144531,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.847656,-2.683594,-2.693359,-0.783691,-3.625,0.266602,-2.996094,-2.855469,-2.53125,-2.382812,1.015625,4.449219,0.663086,0.262451,3.625,0.783691,2.841797,0.216064,2.205078,-2.693359,-2.693359,-1.919922,-3.447266,0.237793,-3.0,-2.857422,-2.539062,-2.384766,0.023041,-0.050446,0.493408,0.257568,3.447266,1.919922,1.526367,0.557129,2.683594,0.164307,0.155151,2.064453,-0.777832,3.115234,-0.148315,-0.006844,0.315918,0.4646,3.863281,7.296875,3.511719,3.111328,6.472656,3.630859,5.691406,3.064453,5.050781,0.154297,0.154541,0.927734,-0.599121,3.085938,-0.152832,-0.008598,0.309814,0.463623,2.871094,2.796875,3.341797,3.105469,6.296875,4.769531,4.375,3.404297,5.53125,0.011208,-0.001787,-0.017319,0.013031,-0.184082,-0.186157,-0.184204,-0.186523,-0.184204,-0.186279,-0.184326,-0.185669,-0.18457,-0.185059,-0.184814,-0.184692,-0.185059,-0.18457,-0.185547,-0.184204,-0.186157,-0.182861,-0.187012,-0.181396,-0.005733,-0.003681,-0.00568,-0.003395,-0.005592,-0.00371,-0.005455,-0.00441,-0.005245,-0.005005,-0.004936,-0.005241,-0.004486,-0.005508,-0.003857,-0.006699,-0.002966,-0.008476,-0.001461,-0.004765,-0.187256,0.00412,-0.187012,0.00412,-0.187012,0.00412,-0.187012,0.00412,-0.187012,0.00412
2,0.0003,-2.408203,0.0,-2.423828,1.0,-2.847656,-2.759766,,-3.130859,-3.144531,-2.650391,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-2.423828,-2.683594,-2.693359,-0.783691,-3.625,0.266602,-2.996094,-2.855469,-2.53125,-2.382812,1.015625,4.449219,0.663086,0.262451,3.625,0.783691,2.841797,0.216064,2.205078,-2.693359,-2.693359,-1.919922,-3.447266,0.237793,-3.0,-2.857422,-2.539062,-2.384766,0.023041,-0.050446,0.493408,0.257568,3.447266,1.919922,1.526367,0.557129,2.683594,-0.259521,-0.268555,1.640625,-1.201172,2.691406,-0.572266,-0.43042,-0.107849,0.040894,3.439453,6.875,3.087891,2.6875,6.050781,3.207031,5.265625,2.640625,4.628906,-0.269531,-0.269287,0.503906,-1.022461,2.662109,-0.57666,-0.432373,-0.113831,0.039917,2.447266,2.373047,2.917969,2.681641,5.871094,4.34375,3.951172,2.982422,5.109375,-0.009399,-0.017609,0.007244,0.017929,-0.183838,-0.186157,-0.183838,-0.186523,-0.183594,-0.186401,-0.183472,-0.185913,-0.183105,-0.185425,-0.182495,-0.185425,-0.181763,-0.185669,-0.180542,-0.185791,-0.178711,-0.184448,-0.175781,-0.18042,0.022247,0.024551,0.022141,0.024872,0.021973,0.024643,0.021698,0.024078,0.02124,0.023743,0.020508,0.023941,0.019333,0.024246,0.017456,0.023239,0.014328,0.019135,0.007812,0.00766,-0.177002,0.015732,-0.178223,0.015083,-0.178345,0.014999,-0.178467,0.014931,-0.178467,0.014923
3,0.0004,-3.140625,0.0,-3.130859,1.0,-2.423828,-2.847656,-2.759766,-3.144531,-2.650391,-2.697266,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-3.130859,-2.683594,-2.693359,-0.783691,-3.625,0.266602,-2.996094,-2.855469,-2.53125,-2.382812,1.015625,4.449219,0.663086,0.262451,3.625,0.783691,2.841797,0.216064,2.205078,-2.693359,-2.693359,-1.919922,-3.447266,0.237793,-3.0,-2.857422,-2.539062,-2.384766,0.023041,-0.050446,0.493408,0.257568,3.447266,1.919922,1.526367,0.557129,2.683594,0.446289,0.437256,2.345703,-0.49585,3.396484,0.133667,0.275146,0.597656,0.746582,4.144531,7.578125,3.792969,3.392578,6.757812,3.914062,5.972656,3.345703,5.335938,0.436279,0.436523,1.209961,-0.316895,3.367188,0.129272,0.273438,0.591797,0.745605,3.152344,3.080078,3.623047,3.386719,6.578125,5.050781,4.65625,3.6875,5.8125,-0.024017,0.012703,0.018539,-0.008575,-0.18396,-0.186157,-0.183838,-0.186523,-0.183716,-0.186523,-0.183594,-0.186157,-0.183472,-0.185913,-0.183228,-0.186401,-0.182983,-0.187866,-0.182861,-0.190308,-0.18335,-0.193359,-0.185181,-0.197144,-0.02475,-0.022507,-0.024826,-0.022125,-0.024948,-0.022247,-0.025116,-0.022629,-0.02536,-0.022568,-0.02562,-0.021484,-0.025787,-0.01918,-0.025497,-0.015915,-0.023682,-0.011742,-0.015701,-0.004917,-0.187378,0.021194,-0.186279,0.019745,-0.186157,0.019562,-0.186035,0.019424,-0.186035,0.019394
4,0.0005,-3.152344,0.0,-3.144531,1.0,-3.130859,-2.423828,-2.847656,-2.650391,-2.697266,-2.595703,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-3.144531,-2.683594,-2.693359,-0.783691,-3.625,0.266602,-2.996094,-2.855469,-2.53125,-2.382812,1.015625,4.449219,0.663086,0.262451,3.625,0.783691,2.841797,0.216064,2.205078,-2.693359,-2.693359,-1.919922,-3.447266,0.237793,-3.0,-2.857422,-2.539062,-2.384766,0.023041,-0.050446,0.493408,0.257568,3.447266,1.919922,1.526367,0.557129,2.683594,0.461182,0.451904,2.361328,-0.480957,3.412109,0.148438,0.290039,0.612793,0.76123,4.160156,7.59375,3.808594,3.408203,6.769531,3.927734,5.988281,3.361328,5.347656,0.451172,0.451416,1.224609,-0.302002,3.382812,0.144043,0.28833,0.606445,0.760254,3.167969,3.09375,3.638672,3.402344,6.59375,5.066406,4.671875,3.701172,5.828125,0.016006,0.01947,-0.009903,-0.015884,-0.184692,-0.186157,-0.185059,-0.186523,-0.185547,-0.186523,-0.186401,-0.186279,-0.187622,-0.186157,-0.189575,-0.186646,-0.192261,-0.188232,-0.196289,-0.19104,-0.201904,-0.195312,-0.209229,-0.201172,-0.024963,-0.023514,-0.024551,-0.023087,-0.023911,-0.023163,-0.022858,-0.023483,-0.02121,-0.023376,-0.018631,-0.022186,-0.014687,-0.019547,-0.008919,-0.01519,-0.00093,-0.00885,0.007572,-0.001156,-0.193848,0.020752,-0.191284,0.02005,-0.19104,0.019943,-0.190796,0.019852,-0.190796,0.019836


In [11]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
target = 'open_channels'

In [12]:
for fold, (trn_ndcs, vld_ndcs) in enumerate(kf.split(pre_train, pre_train[target])):
    x_trn, x_vld = pre_train[features].iloc[trn_ndcs], pre_train[features].iloc[vld_ndcs]
    y_trn, y_vld = pre_train[target][trn_ndcs], pre_train[target][vld_ndcs]
    trn_set = lgb.Dataset(x_trn, y_trn)
    vld_set = lgb.Dataset(x_vld, y_vld)
    break

In [13]:
model = lgb.train(params, trn_set, num_boost_round=10000, early_stopping_rounds=100, valid_sets=[vld_set], verbose_eval=100)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.237965
[200]	valid_0's rmse: 0.155734
[300]	valid_0's rmse: 0.1549
[400]	valid_0's rmse: 0.154751
[500]	valid_0's rmse: 0.154648
[600]	valid_0's rmse: 0.154568
[700]	valid_0's rmse: 0.154504
[800]	valid_0's rmse: 0.154467
[900]	valid_0's rmse: 0.154425
[1000]	valid_0's rmse: 0.154396
[1100]	valid_0's rmse: 0.154366
[1200]	valid_0's rmse: 0.154335
[1300]	valid_0's rmse: 0.154302
[1400]	valid_0's rmse: 0.154274
[1500]	valid_0's rmse: 0.154262
[1600]	valid_0's rmse: 0.154255
[1700]	valid_0's rmse: 0.154241
[1800]	valid_0's rmse: 0.154232
[1900]	valid_0's rmse: 0.154198
[2000]	valid_0's rmse: 0.154196
Early stopping, best iteration is:
[1941]	valid_0's rmse: 0.154196


In [15]:
vld_pred = model.predict(x_vld)
vld_pred = np.round(np.clip(vld_pred, 0, 10)).astype(int)

In [16]:
f1 = metrics.f1_score(y_vld.astype(int), vld_pred, average = 'macro')
print(f1)

0.9382256376476771
