<a href="https://colab.research.google.com/github/timeseriesAI/tsai/blob/master/tutorial_nbs/10_Time_Series_Classification_and_Regression_with_MiniRocket.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [108]:
#Base & visualization
import pandas as pd
import random
import os
import numpy as np
import warnings

#sklearn module & utils
from sklearn.model_selection import StratifiedKFold , KFold, train_test_split, cross_val_score, cross_validate
warnings.filterwarnings('ignore') 

# hyperparameter
import optuna
from optuna.samplers import TPESampler

#Scaling
from sklearn.preprocessing import StandardScaler


#Modeling
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.impute import KNNImputer
from pathlib import Path

import torch
from pytorch_tabnet.tab_model import TabNetRegressor, TabModel, TabNetClassifier
from pytorch_tabnet.augmentations import ClassificationSMOTE
DATA_PATH = Path('dataset')

In [109]:
def seed_everything(seed):
    """Seed every random number generator this notebook imports.

    Parameters
    ----------
    seed : int
        Value used for Python's `random`, the `PYTHONHASHSEED` environment
        variable, NumPy's legacy global generator and PyTorch.
    """
    random.seed(seed)
    # Only affects hash randomisation of Python processes started after this
    # point; the running interpreter has already fixed its hash seed.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    # torch is imported at the top of the notebook, so it is seeded as well.
    torch.manual_seed(seed)


SEED = 25
seed_everything(SEED)

In [111]:
# Drop every column that has fewer than two distinct non-null values in
# `train` (constant or completely empty columns).
# The list of columns is computed first so the frame is never modified while
# it is being iterated, and `errors="ignore"` keeps the cell from failing on a
# column that exists only in `train`.
# NOTE(review): in the saved cell order this runs before `train`/`test` are
# read from CSV, so it relies on variables left over in the kernel.
n_unique = train.nunique()
constant_cols = n_unique[n_unique < 2].index.tolist()
train.drop(columns=constant_cols, inplace=True)
test.drop(columns=constant_cols, inplace=True, errors="ignore")

In [112]:
# Load the raw training and test tables.
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# `dup` is True for every column to KEEP: a column is discarded when its
# values repeat an earlier column of `train`. The same mask is then applied
# to `test`.
dup = ~train.transpose().duplicated()
train = train.loc[:, dup]
test = test.loc[:, dup]

scaler = StandardScaler()
imputer = KNNImputer()


def _standardize_then_impute(frame, scaler, imputer):
    """Standardize, then KNN-impute, the non-object columns of `frame` in place.

    Both transformers are re-fitted on `frame` itself. Returns the list of
    column names that were processed.
    """
    numeric_cols = frame.select_dtypes(exclude=['object']).columns.to_list()
    frame[numeric_cols] = scaler.fit_transform(frame[numeric_cols])
    frame[numeric_cols] = imputer.fit_transform(frame[numeric_cols])
    return numeric_cols


# NOTE(review): `T` and `A` are not created in any visible cell; presumably
# they are subsets of `train` -- confirm where they come from.
num_features = _standardize_then_impute(T, scaler, imputer)
num_features = _standardize_then_impute(A, scaler, imputer)

In [113]:
# Quick sanity check: (rows, columns) of each subset after preprocessing.
print(f"{T.shape} {A.shape}")

(349, 442) (249, 1058)


In [114]:
# Combine the two subsets into one table ordered by PRODUCT_ID, then remove
# the identifier/metadata columns so only model inputs remain.
new_train = (
    pd.merge(T, A, how='outer')
    .sort_values("PRODUCT_ID")
    .reset_index(drop=True)
    .drop(columns=["TIMESTAMP", "PRODUCT_ID", "LINE", "PRODUCT_CODE"])
)

# Labels come from the raw file. They are put in the same PRODUCT_ID order as
# `new_train`; taking them in file order would only line up with the features
# if the CSV happened to be sorted already.
tmp = pd.read_csv('dataset/train.csv')
y = tmp.sort_values("PRODUCT_ID")['Y_Class'].reset_index(drop=True)

assert len(new_train) == len(y), "features and labels must have one row per product"

In [89]:
# Replace every remaining missing value with 0.
new_train = new_train.fillna(0)

In [91]:
# Reload the raw tables.
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Identifier/metadata columns are removed from both tables.
meta_cols = ["PRODUCT_ID", "TIMESTAMP", 'LINE', 'PRODUCT_CODE']
train.drop(columns=meta_cols, inplace=True)
test.drop(columns=meta_cols, inplace=True)
y = train['Y_Class']

# The non-object columns of `test` define the feature list. Scaling
# statistics are learned on `train` only and reused for `test`.
# (Alternatives the author left disabled: a per-column median fill before
# scaling, and QuantileTransformer in place of StandardScaler.)
num_features = test.select_dtypes(exclude=['object']).columns.to_list()
scaler = StandardScaler().fit(train[num_features])
train[num_features] = scaler.transform(train[num_features])
test[num_features] = scaler.transform(test[num_features])

X = train.drop(columns=['Y_Class', 'Y_Quality'])
X_test = test

# Feature selection from a precomputed correlation table. The last row is
# discarded first (the original comment says it is the Y_Quality entry), then
# only features with |correlation| >= 0.1 are kept.
corr = pd.read_csv('correlation/correlation.csv').iloc[:-1, :]
important = corr.loc[corr['correlation'].abs() >= 0.1, 'feature'].tolist()
X = X[important]
X_test = X_test[important]

# `dup` is True for columns to KEEP (values not repeating an earlier column
# of X); the same mask is applied to the test features.
dup = ~X.transpose().duplicated()
X = X.loc[:, dup]
X_test = X_test.loc[:, dup]

# KNN imputation fitted on the training features and reused for the test
# features. From here on X and X_test are NumPy arrays, not DataFrames.
imputer = KNNImputer().fit(X)
X = imputer.transform(X)
X_test = imputer.transform(X_test)

In [103]:
def evaluate_macroF1_lgb(truth, predictions):
    """Custom LightGBM evaluation metric: macro-averaged F1 for multiclass.

    Parameters
    ----------
    truth : array-like of shape (n_samples,)
        True class labels. Assumes labels are encoded as 0..n_classes-1 so
        that an argmax index is directly comparable to a label.
    predictions : array-like
        Per-class scores, in either of two layouts:
        * 1-D of length n_samples * n_classes, grouped class by class
          (the layout the original reshape relied on), or
        * 2-D of shape (n_samples, n_classes).

    Returns
    -------
    tuple
        ('macroF1', score, True); the final True marks the metric as
        higher-is-better.
    """
    truth = np.asarray(truth)
    predictions = np.asarray(predictions)

    if predictions.ndim == 2:
        # One row per sample: the best class is the argmax along the columns.
        pred_labels = predictions.argmax(axis=1)
    else:
        # Derive the class count from the array sizes rather than from the
        # labels present in `truth`: a validation fold may miss a class, in
        # which case counting unique labels would give the wrong shape.
        n_classes = predictions.size // truth.size
        pred_labels = predictions.reshape(n_classes, -1).argmax(axis=0)

    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True)

In [115]:
from optuna.pruners import SuccessiveHalvingPruner

def objective(trial):
    """Optuna objective: mean macro-F1 of LightGBM over 5 stratified folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies `num_leaves`, `learning_rate`, `class_weight` and
        `min_child_samples`.

    Returns
    -------
    float
        Mean macro-F1 over the validation folds of `new_train` / `y`.

    Raises
    ------
    optuna.TrialPruned
        When the study's pruner decides, from the running mean reported after
        a fold, that the trial should stop early.
    """
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 8, 50, step=1, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        "metric": "multiclass",
        # Single-choice categorical: keeps `class_weight` inside trial.params.
        'class_weight': trial.suggest_categorical('class_weight', ['balanced']),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30, step=1, log=False),
        'random_state': SEED
    }

    # Convert once instead of on every fold.
    features = new_train.values
    labels = y.values

    n_splits = 5
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    scores = []
    for fold, (train_index, valid_index) in enumerate(kf.split(new_train, y)):
        X_train, X_valid = features[train_index], features[valid_index]
        y_train, y_valid = labels[train_index], labels[valid_index]
        # NOTE(review): `subsample=0.7` is passed without `subsample_freq`;
        # verify against the LightGBM docs whether row subsampling is actually
        # active with the default frequency.
        model = LGBMClassifier(n_estimators=2000, boost_from_average=False, early_stopping_rounds=50, verbose=-1, colsample_bytree=0.7, subsample=0.7, **params)
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric=evaluate_macroF1_lgb)
        pred = model.predict(X_valid)
        f1 = f1_score(y_valid, pred, average='macro')
        scores.append(f1)

        # Report the running mean so a pruner attached to the study has
        # something to act on; without this no trial can ever be pruned.
        trial.report(float(np.mean(scores)), step=fold)
        # After the last fold the trial is already fully evaluated, so
        # pruning there would only throw away a finished result.
        if fold < n_splits - 1 and trial.should_prune():
            raise optuna.TrialPruned()

    return np.mean(scores)

# Seeded TPE sampler plus successive-halving pruner; the study maximises the
# value returned by `objective`.
sampler = TPESampler(seed=SEED)
pruner = SuccessiveHalvingPruner()
study = optuna.create_study(direction='maximize', sampler=sampler, pruner=pruner)
study.optimize(objective, n_trials=10000)

[32m[I 2023-02-22 13:50:08,256][0m A new study created in memory with name: no-name-6585d6f0-6ed1-4763-96d2-d0ddbbde0c5e[0m
[32m[I 2023-02-22 13:50:14,123][0m Trial 0 finished with value: 0.5772307350648576 and parameters: {'num_leaves': 39, 'learning_rate': 0.014606758782620575, 'class_weight': 'balanced', 'min_child_samples': 12}. Best is trial 0 with value: 0.5772307350648576.[0m
[32m[I 2023-02-22 13:50:17,780][0m Trial 1 finished with value: 0.5692617020270537 and parameters: {'num_leaves': 11, 'learning_rate': 0.00664049196977075, 'class_weight': 'balanced', 'min_child_samples': 8}. Best is trial 0 with value: 0.5772307350648576.[0m
[32m[I 2023-02-22 13:50:22,165][0m Trial 2 finished with value: 0.5689833763088578 and parameters: {'num_leaves': 28, 'learning_rate': 0.007502778397963552, 'class_weight': 'balanced', 'min_child_samples': 19}. Best is trial 0 with value: 0.5772307350648576.[0m
[32m[I 2023-02-22 13:50:26,651][0m Trial 3 finished with value: 0.608806353949

KeyboardInterrupt: 

In [None]:
# Show the best trial found by the study. The study object created in this
# notebook is named `study`; the name `optim` is never defined.
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Fit one LightGBM model per fold with the best hyperparameters and keep every
# fitted model for the ensemble.
# NOTE(review): `train_x` / `train_y` are not defined in any visible cell --
# confirm which feature table and labels they refer to.
# NOTE(review): the tuning objective used n_estimators=2000 with early
# stopping, whereas these models use LightGBM's default number of trees.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
params = trial.params
scores = []
models = []
# Split into a training part and a validation part once per fold. The
# validation variables are named *_valid so the loop does not overwrite the
# global `X_test` that holds the real test features.
for train_index, valid_index in kf.split(train_x, train_y):
    X_train, X_valid = train_x.iloc[train_index], train_x.iloc[valid_index]
    y_train, y_valid = train_y[train_index], train_y[valid_index]
    model = LGBMClassifier(device="gpu", verbose=-1, random_state=SEED, **params)
    model.fit(X_train, y_train)
    models.append(model)
    y_pred = model.predict(X_valid)  # predicted labels
    scores.append(f1_score(y_valid, y_pred, average="macro"))

# The recorded values are macro-F1 scores, so the labels say so
# (the original text called them accuracy).
print("각 분할의 macro-F1 기록 :", scores)
print("평균 macro-F1 :", np.mean(scores))

In [None]:
from mlxtend.classifier import EnsembleVoteClassifier
# Soft-voting ensemble over the already fitted fold models, all weighted
# equally. The weight list follows the number of models instead of being
# hard-coded to the fold count.
ensemble = EnsembleVoteClassifier(clfs=models, weights=[1] * len(models), voting='soft', fit_base_estimators=False)
# With fit_base_estimators=False no feature matrix is passed; this call
# presumably only registers the class labels 0, 1 and 2 -- confirm against the
# mlxtend documentation for the installed version.
ensemble.fit(None, np.array([0, 1, 2]))

In [None]:
# Prepare the submission file: load the sample submission to use as a template.
submit = pd.read_csv('sample_submission.csv')

In [None]:
# Predict the test set with the ensemble and store the labels in the
# submission frame's Y_Class column.
# NOTE(review): `test_x` is not defined in any visible cell -- confirm which
# test feature table it refers to and that its columns match the training data.
pred = ensemble.predict(test_x)
submit['Y_Class'] = pred

In [None]:
# Write the submission to disk without the DataFrame index column.
submit.to_csv("submission.csv", index=False)