# Environment
- CPU : AMD Ryzen 9 6900HX with Radeon Graphics

## Import

In [1]:
import random
import glob
import os
import re
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

In [2]:
import sys
import tqdm as tq
import catboost as cat
import matplotlib
import seaborn as sns
import sklearn as skl
import pandas as pd
import numpy as np
# Report the interpreter and library versions used for this run.
_BANNER_HEADER = "-------------------------- Python & library version --------------------------"
_BANNER_FOOTER = "------------------------------------------------------------------------------"
_VERSION_REPORT = (
    ("Python version: {}", sys.version),
    ("pandas version: {}", pd.__version__),
    ("numpy version: {}", np.__version__),
    ("matplotlib version: {}", matplotlib.__version__),
    ("tqdm version: {}", tq.__version__),
    ("catboost version: {}", cat.__version__),
    ("seaborn version: {}", sns.__version__),
    ("scikit-learn version: {}", skl.__version__),
)

print(_BANNER_HEADER)
for _template, _version in _VERSION_REPORT:
    print(_template.format(_version))
print(_BANNER_FOOTER)

-------------------------- Python & library version --------------------------
Python version: 3.9.13 (main, Aug 25 2022, 23:51:50) [MSC v.1916 64 bit (AMD64)]
pandas version: 2.0.3
numpy version: 1.21.5
matplotlib version: 3.5.2
tqdm version: 4.64.1
catboost version: 1.1.1
seaborn version: 0.11.2
scikit-learn version: 1.0.2
------------------------------------------------------------------------------


In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [4]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic
    behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all CUDA devices, not only the current one; this is a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN auto-tune and pick different kernels between runs,
    # which works against the deterministic flag above, so it must be disabled.
    torch.backends.cudnn.benchmark = False

seed_everything(42) # Seed 고정

In [5]:
import warnings
# Blanket filter: ignore every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Also ignore warnings whose message starts with "DataFrame is highly fragmented".
# NOTE(review): the blanket filter above already covers this case.
warnings.filterwarnings("ignore", message="DataFrame is highly fragmented")

## Hyperparameter Setting

### 데이터 불러오기

In [9]:
# Discover the training CSVs. glob returns files in a filesystem-dependent order,
# so sort them: the later concatenation order and the seeded random split both
# depend on it.
train_paths = sorted(glob.glob('./train/*.csv'))
# Test file locations are listed in the `data_path` column of test.csv.
test_paths = pd.read_csv('./test.csv')['data_path'].values

### 데이터 전처리

In [10]:
# Running minimum / maximum of each raw column across all training files.
extremes = {
    channel: {'min': float('inf'), 'max': float('-inf')}
    for channel in (
        'Time[s]',
        'Signal A', 'Signal B', 'Signal C',
        'Sensor A', 'Sensor B', 'Sensor C', 'Sensor D',
    )
}

# Visit every training file once and widen the recorded range where needed.
for path in train_paths:
    data = pd.read_csv(path)

    for column, bounds in extremes.items():
        series = data[column]
        bounds['min'] = min(bounds['min'], series.min())
        bounds['max'] = max(bounds['max'], series.max())

In [8]:
# Module-level list; make_train_data appends every processed training frame to it.
datas = []

In [10]:
def make_train_data(train_paths):
    """Load every training CSV, add engineered features and attach its label.

    The driver flag and the label are parsed from the file name (the part of
    the last path component before the first '.'), split on '_':
    the label is the first token without its last two characters, converted to
    float; the driver is the first character of the second token ('A' -> 0,
    anything else -> 1). Both '/' and '\\' are accepted as path separators.

    Engineered columns, in order: ``driver``; pairwise sums ``A``..``I``;
    pairwise absolute differences ``A_1``..``I_1``; first differences
    ``diff_1``..``diff_7``; rolling sums of those differences over 5 / 20 / 50
    rows (suffix '' / 'B' / 'C'); rolling means of each raw channel over
    5, 3, 7, 10, 20, 50 and 100 rows; and finally ``label``.

    Each processed frame is appended to the module-level ``datas`` list,
    which later cells read directly.

    Args:
        train_paths: Iterable of paths to training CSV files containing the
            columns ``Signal A``-``Signal C`` and ``Sensor A``-``Sensor D``.

    Returns:
        tuple: ``(datas, data)`` - the module-level list of processed frames
        and the last frame processed (``None`` if ``train_paths`` is empty).
    """
    channels = ['Signal A', 'Signal B', 'Signal C',
                'Sensor A', 'Sensor B', 'Sensor C', 'Sensor D']
    # (feature name, left channel, right channel) for the sum / abs-difference features.
    pair_features = (
        ('A', 'Signal A', 'Signal B'),
        ('B', 'Signal A', 'Signal C'),
        ('C', 'Signal B', 'Signal C'),
        ('D', 'Sensor A', 'Sensor B'),
        ('E', 'Sensor A', 'Sensor C'),
        ('F', 'Sensor A', 'Sensor D'),
        ('G', 'Sensor B', 'Sensor C'),
        ('H', 'Sensor B', 'Sensor D'),
        ('I', 'Sensor C', 'Sensor D'),
    )
    # (column-name suffix, window length) for the rolling sums of the differences.
    diff_windows = (('', 5), ('B', 20), ('C', 50))
    # Window lengths for the rolling means, in the column order the model expects.
    mean_windows = (5, 3, 7, 10, 20, 50, 100)

    data = None
    for path in tqdm(train_paths):
        # Normalise separators so the file name is parsed the same way on every OS.
        stem = path.replace('\\', '/').split('/')[-1].split('.')[0]
        tokens = stem.split('_')
        driver = str(tokens[1][0])
        label = float(tokens[0][:-2])

        data = pd.read_csv(path)

        feats = {'driver': 0 if driver == 'A' else 1}
        for name, left, right in pair_features:
            feats[name] = data[left] + data[right]
        for name, left, right in pair_features:
            feats[name + '_1'] = np.abs(data[left] - data[right])
        for idx, column in enumerate(channels, start=1):
            feats['diff_{}'.format(idx)] = (data[column] - data[column].shift(1)).fillna(0)
        for suffix, window in diff_windows:
            for idx in range(1, len(channels) + 1):
                feats['rolling_diff_{}{}'.format(idx, suffix)] = (
                    feats['diff_{}'.format(idx)].rolling(window).sum().bfill()
                )
        for column in channels:
            for window in mean_windows:
                feats['rolling_{}_{}'.format(column, window)] = (
                    data[column].rolling(window).mean().bfill()
                )

        # Attach all engineered columns with one concat; inserting them one by one
        # is what triggers pandas' "DataFrame is highly fragmented" warning.
        data = pd.concat([data, pd.DataFrame(feats, index=data.index)], axis=1)

        # label = label / 902.  (label normalisation, disabled)
        data['label'] = label
        datas.append(data)
    return datas, data

In [11]:
# NOTE(review): despite its name, `train_labels` receives the second return value of
# make_train_data, which is the last processed frame rather than a label array.
train_window_data, train_labels = make_train_data(train_paths)

  0%|          | 0/16 [00:00<?, ?it/s]

In [12]:
# Module-level list of processed test frames; reset whenever this cell is re-run.
tests = []
def make_predict_data(test_paths):
    """Load every test CSV and add the same engineered features as training.

    Original author's note: this function was written assuming it is known
    that most test samples consist of 500 time steps (optimised for an
    inference window size of 500).

    The driver flag is parsed from the file name (the part of the last path
    component before the first '.'): it is the first character of the second
    '_'-separated token ('A' -> 0, anything else -> 1). Both '/' and '\\' are
    accepted as path separators.

    Engineered columns, in order: ``driver``; pairwise sums ``A``..``I``;
    pairwise absolute differences ``A_1``..``I_1``; first differences
    ``diff_1``..``diff_7``; rolling sums of those differences over 5 / 20 / 50
    rows (suffix '' / 'B' / 'C'); rolling means of each raw channel over
    5, 3, 7, 10, 20, 50 and 100 rows.

    Each processed frame is appended to the module-level ``tests`` list.

    Args:
        test_paths: Iterable of paths to test CSV files containing the
            columns ``Signal A``-``Signal C`` and ``Sensor A``-``Sensor D``.

    Returns:
        list: the module-level ``tests`` list holding the processed frames.
    """
    channels = ['Signal A', 'Signal B', 'Signal C',
                'Sensor A', 'Sensor B', 'Sensor C', 'Sensor D']
    # (feature name, left channel, right channel) for the sum / abs-difference features.
    pair_features = (
        ('A', 'Signal A', 'Signal B'),
        ('B', 'Signal A', 'Signal C'),
        ('C', 'Signal B', 'Signal C'),
        ('D', 'Sensor A', 'Sensor B'),
        ('E', 'Sensor A', 'Sensor C'),
        ('F', 'Sensor A', 'Sensor D'),
        ('G', 'Sensor B', 'Sensor C'),
        ('H', 'Sensor B', 'Sensor D'),
        ('I', 'Sensor C', 'Sensor D'),
    )
    # (column-name suffix, window length) for the rolling sums of the differences.
    diff_windows = (('', 5), ('B', 20), ('C', 50))
    # Window lengths for the rolling means, in the column order the model expects.
    mean_windows = (5, 3, 7, 10, 20, 50, 100)

    for path in tqdm(test_paths):
        # Normalise separators so the file name is parsed the same way on every OS.
        stem = path.replace('\\', '/').split('/')[-1].split('.')[0]
        driver = str(stem.split('_')[1][0])

        data = pd.read_csv(path)

        feats = {'driver': 0 if driver == 'A' else 1}
        for name, left, right in pair_features:
            feats[name] = data[left] + data[right]
        for name, left, right in pair_features:
            feats[name + '_1'] = np.abs(data[left] - data[right])
        for idx, column in enumerate(channels, start=1):
            feats['diff_{}'.format(idx)] = (data[column] - data[column].shift(1)).fillna(0)
        for suffix, window in diff_windows:
            for idx in range(1, len(channels) + 1):
                feats['rolling_diff_{}{}'.format(idx, suffix)] = (
                    feats['diff_{}'.format(idx)].rolling(window).sum().bfill()
                )
        for column in channels:
            for window in mean_windows:
                feats['rolling_{}_{}'.format(column, window)] = (
                    data[column].rolling(window).mean().bfill()
                )

        # Attach all engineered columns with one concat; inserting them one by one
        # is what triggers pandas' "DataFrame is highly fragmented" warning.
        data = pd.concat([data, pd.DataFrame(feats, index=data.index)], axis=1)

        tests.append(data)
    # Return the collected frames so the caller's assignment is not None.
    return tests

test_window_data = make_predict_data(test_paths)

  0%|          | 0/4048 [00:00<?, ?it/s]

In [13]:
# Display the first processed test frame as a sanity check.
tests[0]

Unnamed: 0,Time[s],Signal A,Signal B,Signal C,Sensor A,Sensor B,Sensor C,Sensor D,driver,A,...,rolling_Sensor C_20,rolling_Sensor C_50,rolling_Sensor C_100,rolling_Sensor D_5,rolling_Sensor D_3,rolling_Sensor D_7,rolling_Sensor D_10,rolling_Sensor D_20,rolling_Sensor D_50,rolling_Sensor D_100
0,66.60,330.275608,329.843770,316.071968,642.526231,322.752290,517.600667,800.830710,1,660.119378,...,502.378405,446.213083,405.252331,799.411471,800.270229,797.979132,794.961560,773.306578,670.642527,641.356178
1,66.61,330.275608,329.953464,316.071968,641.724363,320.124408,516.997709,800.700080,1,660.229072,...,502.378405,446.213083,405.252331,799.411471,800.270229,797.979132,794.961560,773.306578,670.642527,641.356178
2,66.62,330.289650,330.054150,316.071968,644.527953,319.735722,517.944497,799.279896,1,660.343800,...,502.378405,446.213083,405.252331,799.411471,800.270229,797.979132,794.961560,773.306578,670.642527,641.356178
3,66.63,330.296104,330.168447,316.071968,645.168381,323.620266,514.881934,799.437182,1,660.464551,...,502.378405,446.213083,405.252331,799.411471,799.805719,797.979132,794.961560,773.306578,670.642527,641.356178
4,66.64,330.296104,330.280189,316.071968,646.044131,322.744516,514.881934,796.809486,1,660.576293,...,502.378405,446.213083,405.252331,799.411471,798.508855,797.979132,794.961560,773.306578,670.642527,641.356178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,71.55,320.905565,316.071968,329.845195,595.542540,411.795108,354.209073,436.397324,1,636.977533,...,355.009286,360.453832,374.520388,437.690482,437.127598,438.005468,438.358802,437.161503,450.922496,478.302721
496,71.56,320.888498,316.071968,329.845195,593.506277,414.645874,356.074786,435.651041,1,636.960466,...,354.987000,360.095872,374.040534,437.113728,436.371202,437.586357,438.067175,437.026946,450.139124,477.398645
497,71.57,320.846629,316.071968,329.885346,594.515078,415.334919,355.099270,434.862886,1,636.918597,...,354.888778,359.749145,373.556785,436.379344,435.637084,436.995191,437.640248,436.956436,449.336343,476.476350
498,71.58,320.827011,316.071968,329.906682,595.949953,417.845948,355.816696,434.145461,1,636.898979,...,354.826427,359.416766,373.098144,435.624391,434.886463,436.368141,437.069766,436.939784,448.504561,475.554054


In [14]:
# Initialise `df` as an empty DataFrame.
df = pd.DataFrame()

In [15]:
# Stack every processed training frame with a single concat. Growing `df` inside
# a loop re-copies all accumulated rows on each iteration and duplicates the data
# if the cell is executed twice.
df = pd.concat(datas, axis=0) if datas else pd.DataFrame()

In [16]:
# Rename the raw time column from 'Time[s]' to 'time' (the test frames get the same rename before prediction).
df = df.rename(columns={'Time[s]': 'time'})


In [17]:
# Separate the integer regression target from the feature matrix.
y_train = df['label'].astype(int)
x_train = df.drop('label', axis=1)

In [20]:
iterations = 50000  # maximum number of boosting iterations given to CatBoostRegressor
patience = 200  # early-stopping rounds given to model.fit
is_holdout = False  # NOTE(review): not referenced anywhere else in the visible notebook


In [21]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
import lightgbm as lgb
from tqdm import tqdm
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split  # 수정된 부분

# Turn off pandas' chained-assignment warning.
pd.set_option('mode.chained_assignment', None)

from catboost import CatBoostRegressor, Pool

# CatBoost regressor that optimises MAE and also reports it as a custom metric.
catboost_params = dict(
    iterations=iterations,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    custom_metric=['MAE'],
    loss_function='MAE',
)
model = CatBoostRegressor(**catboost_params)

# Hold out 10% of the rows for validation.
# NOTE(review): the split is row-wise, so rows of one recording can land on both
# sides of the split - confirm this is intended.
X_train, X_valid, Y_train, Y_valid = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    random_state=42,
)

train_pool, validation_pool = (
    Pool(X_train, label=Y_train),
    Pool(X_valid, label=Y_valid),
)

# Stop after `patience` rounds without improvement on the validation pool;
# log progress every 100 iterations.
model.fit(
    train_pool,
    eval_set=validation_pool,
    early_stopping_rounds=patience,
    verbose=100,
)

0:	learn: 262.6374496	test: 262.7748793	best: 262.7748793 (0)	total: 217ms	remaining: 3h 47s
100:	learn: 120.5559049	test: 119.9679415	best: 119.9679415 (100)	total: 4.43s	remaining: 36m 29s
200:	learn: 107.0709503	test: 106.7740958	best: 106.7740958 (200)	total: 8.37s	remaining: 34m 33s
300:	learn: 99.3679014	test: 99.2467724	best: 99.2467724 (300)	total: 12.4s	remaining: 34m 6s
400:	learn: 90.8670001	test: 90.9035982	best: 90.9035982 (400)	total: 16.5s	remaining: 33m 56s
500:	learn: 86.8087320	test: 86.9311341	best: 86.9311341 (500)	total: 20.5s	remaining: 33m 44s
600:	learn: 82.0834412	test: 82.4706086	best: 82.4706086 (600)	total: 24.5s	remaining: 33m 36s
700:	learn: 78.1790099	test: 78.7380005	best: 78.7380005 (700)	total: 28.5s	remaining: 33m 25s
800:	learn: 74.1812786	test: 74.8946139	best: 74.8946139 (800)	total: 32.5s	remaining: 33m 18s
900:	learn: 71.1825157	test: 72.0708143	best: 72.0708143 (900)	total: 36.7s	remaining: 33m 18s
1000:	learn: 65.9531259	test: 66.9366043	best: 

8500:	learn: 13.6933728	test: 17.1492504	best: 17.1492504 (8500)	total: 6m 6s	remaining: 29m 49s
8600:	learn: 13.5608486	test: 17.0253384	best: 17.0253384 (8600)	total: 6m 11s	remaining: 29m 46s
8700:	learn: 13.4340417	test: 16.9097411	best: 16.9097411 (8700)	total: 6m 15s	remaining: 29m 43s
8800:	learn: 13.3184308	test: 16.8064473	best: 16.8064473 (8800)	total: 6m 20s	remaining: 29m 39s
8900:	learn: 13.1847585	test: 16.6797958	best: 16.6797958 (8900)	total: 6m 24s	remaining: 29m 35s
9000:	learn: 13.0739883	test: 16.5748350	best: 16.5748350 (9000)	total: 6m 28s	remaining: 29m 31s
9100:	learn: 12.9627470	test: 16.4783379	best: 16.4783379 (9100)	total: 6m 33s	remaining: 29m 27s
9200:	learn: 12.8484890	test: 16.3810506	best: 16.3810506 (9200)	total: 6m 37s	remaining: 29m 23s
9300:	learn: 12.7204247	test: 16.2618257	best: 16.2618257 (9300)	total: 6m 42s	remaining: 29m 20s
9400:	learn: 12.6157797	test: 16.1699553	best: 16.1699553 (9400)	total: 6m 46s	remaining: 29m 15s
9500:	learn: 12.53048

16800:	learn: 8.9838186	test: 13.3230944	best: 13.3229560 (16797)	total: 11m 52s	remaining: 23m 27s
16900:	learn: 8.9535912	test: 13.2999715	best: 13.2999715 (16900)	total: 11m 56s	remaining: 23m 23s
17000:	learn: 8.9298018	test: 13.2837905	best: 13.2837905 (17000)	total: 12m	remaining: 23m 18s
17100:	learn: 8.9060582	test: 13.2665297	best: 13.2664136 (17098)	total: 12m 4s	remaining: 23m 13s
17200:	learn: 8.8819043	test: 13.2506317	best: 13.2506317 (17200)	total: 12m 8s	remaining: 23m 8s
17300:	learn: 8.8568647	test: 13.2335935	best: 13.2335783 (17299)	total: 12m 12s	remaining: 23m 4s
17400:	learn: 8.8364473	test: 13.2225437	best: 13.2225414 (17399)	total: 12m 16s	remaining: 22m 59s
17500:	learn: 8.8128128	test: 13.2074892	best: 13.2074892 (17500)	total: 12m 20s	remaining: 22m 54s
17600:	learn: 8.7922113	test: 13.1945768	best: 13.1945768 (17600)	total: 12m 24s	remaining: 22m 50s
17700:	learn: 8.7681718	test: 13.1790662	best: 13.1790662 (17700)	total: 12m 28s	remaining: 22m 45s
17800:	l

25100:	learn: 7.6158636	test: 12.4549960	best: 12.4548582 (25098)	total: 17m 24s	remaining: 17m 16s
25200:	learn: 7.6066203	test: 12.4495029	best: 12.4495029 (25200)	total: 17m 28s	remaining: 17m 11s
25300:	learn: 7.5956501	test: 12.4440736	best: 12.4440606 (25298)	total: 17m 32s	remaining: 17m 7s
25400:	learn: 7.5841264	test: 12.4365586	best: 12.4365586 (25400)	total: 17m 36s	remaining: 17m 3s
25500:	learn: 7.5733749	test: 12.4304538	best: 12.4304538 (25500)	total: 17m 40s	remaining: 16m 58s
25600:	learn: 7.5613132	test: 12.4222380	best: 12.4222380 (25600)	total: 17m 44s	remaining: 16m 54s
25700:	learn: 7.5522169	test: 12.4177364	best: 12.4175665 (25697)	total: 17m 48s	remaining: 16m 50s
25800:	learn: 7.5417647	test: 12.4120704	best: 12.4120704 (25800)	total: 17m 52s	remaining: 16m 45s
25900:	learn: 7.5278589	test: 12.4026758	best: 12.4026758 (25900)	total: 17m 56s	remaining: 16m 41s
26000:	learn: 7.5174992	test: 12.3959734	best: 12.3959310 (25998)	total: 18m	remaining: 16m 37s
26100:

33400:	learn: 6.8979486	test: 12.0529622	best: 12.0529622 (33400)	total: 22m 55s	remaining: 11m 23s
33500:	learn: 6.8911855	test: 12.0490765	best: 12.0490727 (33499)	total: 22m 59s	remaining: 11m 19s
33600:	learn: 6.8838035	test: 12.0442440	best: 12.0442440 (33600)	total: 23m 3s	remaining: 11m 15s
33700:	learn: 6.8775189	test: 12.0405747	best: 12.0405747 (33700)	total: 23m 7s	remaining: 11m 11s
33800:	learn: 6.8709598	test: 12.0373193	best: 12.0371459 (33792)	total: 23m 11s	remaining: 11m 6s
33900:	learn: 6.8632146	test: 12.0348774	best: 12.0348513 (33888)	total: 23m 15s	remaining: 11m 2s
34000:	learn: 6.8567710	test: 12.0317125	best: 12.0317125 (34000)	total: 23m 19s	remaining: 10m 58s
34100:	learn: 6.8508050	test: 12.0286259	best: 12.0286259 (34100)	total: 23m 23s	remaining: 10m 54s
34200:	learn: 6.8443611	test: 12.0253195	best: 12.0253195 (34200)	total: 23m 27s	remaining: 10m 50s
34300:	learn: 6.8386768	test: 12.0222073	best: 12.0218946 (34289)	total: 23m 31s	remaining: 10m 45s
3440

41700:	learn: 6.4347055	test: 11.8088818	best: 11.8088818 (41700)	total: 28m 27s	remaining: 5m 39s
41800:	learn: 6.4305364	test: 11.8068475	best: 11.8068475 (41800)	total: 28m 31s	remaining: 5m 35s
41900:	learn: 6.4257977	test: 11.8041708	best: 11.8041667 (41899)	total: 28m 35s	remaining: 5m 31s
42000:	learn: 6.4211994	test: 11.8011549	best: 11.8011365 (41999)	total: 28m 39s	remaining: 5m 27s
42100:	learn: 6.4161636	test: 11.7986770	best: 11.7986458 (42094)	total: 28m 43s	remaining: 5m 23s
42200:	learn: 6.4115316	test: 11.7965384	best: 11.7963608 (42176)	total: 28m 47s	remaining: 5m 19s
42300:	learn: 6.4057645	test: 11.7930025	best: 11.7930025 (42300)	total: 28m 51s	remaining: 5m 15s
42400:	learn: 6.4012727	test: 11.7908338	best: 11.7908338 (42400)	total: 28m 55s	remaining: 5m 11s
42500:	learn: 6.3962436	test: 11.7880325	best: 11.7880325 (42500)	total: 28m 59s	remaining: 5m 6s
42600:	learn: 6.3908668	test: 11.7851169	best: 11.7849658 (42591)	total: 29m 3s	remaining: 5m 2s
42700:	learn:

<catboost.core.CatBoostRegressor at 0x2034adfd3a0>

In [22]:
# Row-level predictions for every test file, after applying the same
# 'Time[s]' -> 'time' rename that was applied to the training table.
preds = [
    model.predict(frame.rename(columns={'Time[s]': 'time'}))
    for frame in tqdm(tests)
]

100%|██████████████████████████████████████████████████████████████████████████████| 4048/4048 [01:32<00:00, 43.92it/s]


In [23]:
# Aggregate the row-level predictions of each test file with a trimmed mean:
# drop the lowest and highest 10% of the values and average the rest.
value = []
for pred in preds:
    ordered = np.sort(np.asarray(pred))
    remove_count = len(ordered) // 10
    # Use an explicit end index: with fewer than 10 predictions remove_count is 0,
    # and a slice written as [0:-0] would be empty and turn the mean into NaN.
    kept = ordered[remove_count:len(ordered) - remove_count]
    value.append(np.mean(kept))

In [24]:
# Post-process the results: round to the nearest integer.
value = np.round(value, 0).astype(int)

In [25]:
# Sanity check: there should be one aggregated value per entry of `preds`.
value.shape

(4048,)

## Submission

In [26]:
# Load the sample submission file to use as the output template.
submit = pd.read_csv('./sample_submission.csv')

In [27]:
# Write the aggregated predictions into the 'weight' column.
submit['weight'] = value

In [28]:
# Mirror negative predictions to positive values, then cap everything at 902.
# The absolute value is taken before the cap so that a prediction below -902
# cannot end up above the cap.
submit['weight'] = submit['weight'].abs().clip(upper=902)


In [29]:
# Summary statistics of the final predictions.
submit['weight'].describe()

count    4048.000000
mean      444.099308
std       178.624310
min        38.000000
25%       331.750000
50%       408.000000
75%       600.000000
max       902.000000
Name: weight, dtype: float64

In [30]:
# Save the submission file without the index column.
submit.to_csv('./상준_CAT_15호.csv', index=False)

In [12]:
# NOTE(review): repeats the sample-submission load from earlier; its execution count
# is out of sequence, so this looks like a leftover cell. Running it replaces the
# filled-in `submit` frame with the unfilled template.
submit = pd.read_csv('./sample_submission.csv')