# Environment
- GPU : NVIDIA GeForce GTX 1060
- CPU : Intel CORE i5 8th Gen

## Library version check

In [None]:
import sys
import tqdm as tq
import lightgbm as lgb
import matplotlib
import seaborn as sns
import sklearn as skl
import pandas as pd
import numpy as np
# Report the interpreter and library versions this notebook was run with.
print("-------------------------- Python & library version --------------------------")
print(f"Python version: {sys.version}")
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")
print(f"matplotlib version: {matplotlib.__version__}")
print(f"tqdm version: {tq.__version__}")
print(f"lightgbm version: {lgb.__version__}")
print(f"seaborn version: {sns.__version__}")
print(f"scikit-learn version: {skl.__version__}")
print("------------------------------------------------------------------------------")

## Import

In [None]:
import random
import glob
import re
import os
from datetime import datetime, timedelta
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tqdm import tqdm
from lightgbm import LGBMRegressor
from sklearn.cluster import KMeans
import warnings

# Display up to 30 columns of a DataFrame and silence every Python warning.
pd.options.display.max_columns = 30
warnings.filterwarnings('ignore')

## Load Data

In [None]:
# glob yields files in arbitrary, filesystem-dependent order; sorting keeps the
# row order of the training frame (and therefore the seeded split) reproducible.
train_paths = sorted(glob.glob('./train/*.csv'))
# Test files are listed, in submission order, in the 'data_path' column of test.csv.
test_paths = pd.read_csv('./test.csv')['data_path'].values

## Data Restructuring

In [None]:
# Build the training frame from one CSV per recording. The file name carries
# the target and the driver: "<label + 2-char suffix>_<driver...>.csv".
raw_columns = ['Signal A', 'Signal B', 'Signal C',
               'Sensor A', 'Sensor B', 'Sensor C', 'Sensor D']

frames = []
for path in tqdm(train_paths):
    # glob returns OS-native separators ('\\' on Windows, '/' elsewhere).
    # Normalise them before taking the file name so that the label and the
    # driver are parsed from the same string on every platform.
    file_stem = path.replace('\\', '/').split('/')[-1].split('.')[0]
    name_parts = file_stem.split('_')
    driver = name_parts[1][0]
    label = float(name_parts[0][:-2])  # drop the trailing 2-character suffix

    data = pd.read_csv(path)

    # First-order difference of every raw channel (first row has no predecessor -> 0).
    for i, column in enumerate(raw_columns, start=1):
        data[f'diff_{i}'] = (data[column] - data[column].shift(1)).fillna(0)

    # Sum of the differences over a 5-row window; the incomplete leading
    # windows are back-filled from the first complete one.
    for i in range(1, len(raw_columns) + 1):
        data[f'rolling_diff_{i}'] = data[f'diff_{i}'].rolling(5).sum().bfill()

    # Standard deviation of the differences over the same 5-row window.
    for i in range(1, len(raw_columns) + 1):
        data[f'rolling_diff_{i}_std'] = data[f'diff_{i}'].rolling(5).std().bfill()

    data['driver'] = 0 if driver == 'A' else 1
    data['label'] = label
    frames.append(data)

# Concatenate once: growing a DataFrame inside the loop re-copies all
# previously loaded rows on every iteration.
train = pd.concat(frames, axis=0)

# Positional rename: assumes every CSV holds exactly the time column followed by
# the seven raw channels, in this order.
train.columns = ['time','signal_A','signal_B','signal_C','sensor_A','sensor_B','sensor_C','sensor_D',
                 'diff_1','diff_2','diff_3','diff_4','diff_5','diff_6','diff_7',
                 'rolling_diff_1','rolling_diff_2','rolling_diff_3','rolling_diff_4','rolling_diff_5','rolling_diff_6','rolling_diff_7',
                 'rolling_diff_1_std','rolling_diff_2_std','rolling_diff_3_std','rolling_diff_4_std','rolling_diff_5_std','rolling_diff_6_std','rolling_diff_7_std',
                 'driver','label']
train = train.reset_index(drop=True)
train

In [None]:
# Load every test recording and derive the per-file difference / rolling
# features; each recording stays in its own DataFrame inside `test`.
test = []
channel_names = ['Signal A', 'Signal B', 'Signal C',
                 'Sensor A', 'Sensor B', 'Sensor C', 'Sensor D']
for path in tqdm(test_paths):
    file_name = path.split('/')[-1]
    driver = str(file_name.split('.')[0].split('_')[1][0])
    data = pd.read_csv(path)

    # Row-to-row change of each channel; the first row gets 0.
    for idx, channel in enumerate(channel_names, start=1):
        previous = data[channel].shift(1)
        data[f'diff_{idx}'] = (data[channel] - previous).fillna(0)

    # 5-row rolling sum of each difference, leading gaps back-filled.
    for idx in range(1, 8):
        window = data[f'diff_{idx}'].rolling(5)
        data[f'rolling_diff_{idx}'] = window.sum().bfill()

    # 5-row rolling standard deviation of each difference, leading gaps back-filled.
    for idx in range(1, 8):
        window = data[f'diff_{idx}'].rolling(5)
        data[f'rolling_diff_{idx}_std'] = window.std().bfill()

    # Driver 'A' is encoded as 0, any other driver as 1.
    data['driver'] = 1 if driver != 'A' else 0

    test.append(data)

## Feature Engineering

In [None]:
def diff(df, column1, column2):
    """Store ``df[column1] - df[column2]`` in ``df`` and return that column.

    The result is written in place under the name ``"<column1>-<column2>"``.

    Args:
        df: DataFrame holding both source columns; modified in place.
        column1: Name of the column to subtract from.
        column2: Name of the column that is subtracted.

    Returns:
        The stored difference column, ``df["<column1>-<column2>"]``.
    """
    name = column1 + '-' + column2
    # Plain assignment creates the column; no zero placeholder is needed first.
    df[name] = df[column1] - df[column2]
    return df[name]

def sum_2(df, column1, column2):
    """Store ``df[column1] + df[column2]`` in ``df`` and return that column.

    The result is written in place under the name ``"<column1>+<column2>"``.

    Args:
        df: DataFrame holding both source columns; modified in place.
        column1: Name of the first column to add.
        column2: Name of the second column to add.

    Returns:
        The stored sum column, ``df["<column1>+<column2>"]``.
    """
    name = column1 + '+' + column2
    # Plain assignment creates the column; no zero placeholder is needed first.
    df[name] = df[column1] + df[column2]
    return df[name]

def sum_3(df, column1, column2, column3):
    """Store the sum of three columns in ``df`` and return that column.

    The result is written in place under the name
    ``"<column1>+<column2>+<column3>"``.

    Args:
        df: DataFrame holding all three source columns; modified in place.
        column1: Name of the first column to add.
        column2: Name of the second column to add.
        column3: Name of the third column to add.

    Returns:
        The stored sum column, ``df["<column1>+<column2>+<column3>"]``.
    """
    name = column1 + '+' + column2 + '+' + column3
    # Plain assignment creates the column; no zero placeholder is needed first.
    df[name] = df[column1] + df[column2] + df[column3]
    return df[name]

def sum_4(df, column1, column2, column3, column4):
    """Store the sum of four columns in ``df`` and return that column.

    The result is written in place under the name
    ``"<column1>+<column2>+<column3>+<column4>"``.

    Args:
        df: DataFrame holding all four source columns; modified in place.
        column1: Name of the first column to add.
        column2: Name of the second column to add.
        column3: Name of the third column to add.
        column4: Name of the fourth column to add.

    Returns:
        The stored sum column,
        ``df["<column1>+<column2>+<column3>+<column4>"]``.
    """
    name = column1 + '+' + column2 + '+' + column3 + '+' + column4
    # Plain assignment creates the column; no zero placeholder is needed first.
    df[name] = df[column1] + df[column2] + df[column3] + df[column4]
    return df[name]

In [None]:
# Split the time column into its quotient and remainder by 60.
train['time_to_minute'] = train['time'] // 60
train['time_to_second'] = train['time'] % 60

# (short feature name, helper, source columns), applied strictly in this order.
# Each helper also writes its own long-named column (e.g. 'signal_A-signal_B')
# into the frame before the short-named copy is stored, so every combination
# ends up in the frame twice.
train_feature_specs = [
    ('sigA-sigB', diff, ('signal_A', 'signal_B')),
    ('sigB-sigC', diff, ('signal_B', 'signal_C')),
    ('sigC-sigA', diff, ('signal_C', 'signal_A')),
    ('sigA+sigB', sum_2, ('signal_A', 'signal_B')),
    ('sigB+sigC', sum_2, ('signal_B', 'signal_C')),
    ('sigC+sigA', sum_2, ('signal_C', 'signal_A')),
    ('sigA+sigB+sigC', sum_3, ('signal_A', 'signal_B', 'signal_C')),
    ('senA-senB', diff, ('sensor_A', 'sensor_B')),
    ('senA-senC', diff, ('sensor_A', 'sensor_C')),
    ('senA-senD', diff, ('sensor_A', 'sensor_D')),
    ('senB-senC', diff, ('sensor_B', 'sensor_C')),
    ('senB-senD', diff, ('sensor_B', 'sensor_D')),
    ('senC-senD', diff, ('sensor_C', 'sensor_D')),
    ('senA+senB', sum_2, ('sensor_A', 'sensor_B')),
    ('senA+senC', sum_2, ('sensor_A', 'sensor_C')),
    ('senA+senD', sum_2, ('sensor_A', 'sensor_D')),
    ('senB+senC', sum_2, ('sensor_B', 'sensor_C')),
    ('senB+senD', sum_2, ('sensor_B', 'sensor_D')),
    ('senC+senD', sum_2, ('sensor_C', 'sensor_D')),
    ('senA+senB+senC', sum_3, ('sensor_A', 'sensor_B', 'sensor_C')),
    ('senA+senB+senD', sum_3, ('sensor_A', 'sensor_B', 'sensor_D')),
    ('senB+senC+senD', sum_3, ('sensor_B', 'sensor_C', 'sensor_D')),
    ('senA+senB+senC+senD', sum_4, ('sensor_A', 'sensor_B', 'sensor_C', 'sensor_D')),
]
for short_name, combine, source_columns in train_feature_specs:
    train[short_name] = combine(train, *source_columns)

# The raw time column is replaced by the two derived columns above.
train = train.drop('time', axis=1)

In [None]:
# Column names applied positionally to every test frame: the time column, the
# seven raw channels, their differences, rolling sums, rolling stds, and driver.
test_column_names = (
    ['time', 'signal_A', 'signal_B', 'signal_C', 'sensor_A', 'sensor_B', 'sensor_C', 'sensor_D']
    + [f'diff_{i}' for i in range(1, 8)]
    + [f'rolling_diff_{i}' for i in range(1, 8)]
    + [f'rolling_diff_{i}_std' for i in range(1, 8)]
    + ['driver']
)

# (short feature name, helper, source columns), applied strictly in this order.
# Each helper also writes its own long-named column into the frame before the
# short-named copy is stored.
test_feature_specs = [
    ('sigA-sigB', diff, ('signal_A', 'signal_B')),
    ('sigB-sigC', diff, ('signal_B', 'signal_C')),
    ('sigC-sigA', diff, ('signal_C', 'signal_A')),
    ('sigA+sigB', sum_2, ('signal_A', 'signal_B')),
    ('sigB+sigC', sum_2, ('signal_B', 'signal_C')),
    ('sigC+sigA', sum_2, ('signal_C', 'signal_A')),
    ('sigA+sigB+sigC', sum_3, ('signal_A', 'signal_B', 'signal_C')),
    ('senA-senB', diff, ('sensor_A', 'sensor_B')),
    ('senA-senC', diff, ('sensor_A', 'sensor_C')),
    ('senA-senD', diff, ('sensor_A', 'sensor_D')),
    ('senB-senC', diff, ('sensor_B', 'sensor_C')),
    ('senB-senD', diff, ('sensor_B', 'sensor_D')),
    ('senC-senD', diff, ('sensor_C', 'sensor_D')),
    ('senA+senB', sum_2, ('sensor_A', 'sensor_B')),
    ('senA+senC', sum_2, ('sensor_A', 'sensor_C')),
    ('senA+senD', sum_2, ('sensor_A', 'sensor_D')),
    ('senB+senC', sum_2, ('sensor_B', 'sensor_C')),
    ('senB+senD', sum_2, ('sensor_B', 'sensor_D')),
    ('senC+senD', sum_2, ('sensor_C', 'sensor_D')),
    ('senA+senB+senC', sum_3, ('sensor_A', 'sensor_B', 'sensor_C')),
    ('senA+senB+senD', sum_3, ('sensor_A', 'sensor_B', 'sensor_D')),
    ('senB+senC+senD', sum_3, ('sensor_B', 'sensor_C', 'sensor_D')),
    ('senA+senB+senC+senD', sum_4, ('sensor_A', 'sensor_B', 'sensor_C', 'sensor_D')),
]

test_new = []
for data in tqdm(test):
    data.columns = test_column_names

    # Quotient and remainder of the time column by 60.
    data['time_to_minute'] = data['time'] // 60
    data['time_to_second'] = data['time'] % 60

    for short_name, combine, source_columns in test_feature_specs:
        data[short_name] = combine(data, *source_columns)

    data = data.drop('time', axis=1)
    test_new.append(data)

## Modeling - LGBM

In [None]:
# Features are every training column except the target; the target is taken
# as a plain NumPy array.
y = train['label'].to_numpy()
X = train.drop('label', axis=1)
# Shallow copy: a new list that still references the same per-file DataFrames.
X_test = list(test_new)

In [None]:
# Turn off warnings
pd.set_option('mode.chained_assignment', None)
warnings.filterwarnings(action='ignore')

models = []      # fitted regressors, in order: tweedie, poisson
mae_scores = []  # validation MAE of each fitted regressor, same order

# Hold out 2% of the rows for validation, stratified on the label value.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size = 0.02, stratify = y, random_state=42)

# The two regressors differ only in their objective.
for objective in ('tweedie', 'poisson'):
    model = LGBMRegressor(boosting_type='gbdt',
                          objective=objective,
                          n_estimators=1500,
                          max_depth=10,
                          learning_rate=0.3,
                          colsample_bytree=0.9,
                          subsample=1.0,
                          min_child_weight=150,
                          num_leaves=16,
                          reg_alpha=20,
                          n_jobs=-1,
                          random_state=42)

    # Early stopping and periodic logging are passed as callbacks: the
    # `early_stopping_rounds` / `verbose` keyword arguments of fit() were
    # removed in LightGBM 4.0, while the callbacks work on 3.3+ and 4.x alike.
    model.fit(X_train, Y_train,
              eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
              callbacks=[lgb.early_stopping(stopping_rounds=50),
                         lgb.log_evaluation(period=100)])

    pred = model.predict(X_valid)
    score = mean_absolute_error(Y_valid, pred)
    print(f"{objective.capitalize()} Validation MAE score: {score}")
    models.append(model)
    mae_scores.append(score)

In [None]:
# Feature importance of the first model (Tweedie objective), highest first.
predictors = X.columns
tmp = (
    pd.DataFrame({'Feature': predictors, 'Feature importance': models[0].feature_importances_})
    .sort_values(by='Feature importance', ascending=False)
)
plt.figure(figsize=(7, 4))
plt.title('Features importance', fontsize=7)
s = sns.barplot(data=tmp, x='Feature', y='Feature importance')
# Rotate the feature names so they stay readable along the x-axis.
s.set_xticklabels(s.get_xticklabels(), rotation=90, size=10)
plt.show()

In [None]:
# Feature importance of the second model (Poisson objective), highest first.
predictors = X.columns
tmp = (
    pd.DataFrame({'Feature': predictors, 'Feature importance': models[1].feature_importances_})
    .sort_values(by='Feature importance', ascending=False)
)
plt.figure(figsize=(7, 4))
plt.title('Features importance', fontsize=7)
s = sns.barplot(data=tmp, x='Feature', y='Feature importance')
# Rotate the feature names so they stay readable along the x-axis.
s.set_xticklabels(s.get_xticklabels(), rotation=90, size=10)
plt.show()

## Test Inference

In [None]:
# One prediction per test file: average each model's row-level predictions over
# the file, then blend the two models with equal weight.
preds_final = []
# The loop variable is deliberately not called `test`: that name holds the list
# of raw test frames, and rebinding it here would break re-running the
# feature-engineering cell.
for test_frame in tqdm(X_test):
    # Positional rename so the frame carries the exact training feature names.
    test_frame.columns = X.columns
    pred = models[0].predict(test_frame).mean()*0.5 + models[1].predict(test_frame).mean()*0.5
    preds_final.append(pred)

## Submission

In [None]:
submit = pd.read_csv('./sample_submission.csv')

# Post-process the results: round every prediction to a whole number.
preds_final_new = np.round(preds_final, 0).astype(int)
submit['weight'] = preds_final_new

# Predictions above 400 are additionally rounded to the nearest hundred.
above_400 = submit['weight'] > 400
submit.loc[above_400, 'weight'] = submit.loc[above_400, 'weight'].apply(lambda w: np.round(w, -2))

submit.to_csv('./LGBM_58.csv', index=False)