<a href="https://colab.research.google.com/github/timeseriesAI/tsai/blob/master/tutorial_nbs/10_Time_Series_Classification_and_Regression_with_MiniRocket.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

###  Prepare data

In [5]:
import sklearn
import pandas as pd
import random
import os
import numpy as np
import cv2
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# hyperparameter
import optuna
from optuna.samplers import TPESampler
warnings.filterwarnings("ignore")

#Modeling
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, VotingClassifier,RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import Pool,CatBoostClassifier, CatBoostRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, QuantileTransformer, RobustScaler

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold
from fancyimpute import IterativeImputer, KNN
from lightgbm import LGBMRegressor
from optuna.pruners import SuccessiveHalvingPruner
from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler

In [None]:
# Load the reduced train/test tables.
train = pd.read_csv('dataset/train_reduce.csv')
test = pd.read_csv('dataset/test_reduce.csv')

# Identifier and categorical columns are removed before any modelling.
train = train.drop(columns=['TIMESTAMP', 'PRODUCT_ID', 'LINE', 'PRODUCT_CODE'])
test = test.drop(columns=['TIMESTAMP', 'PRODUCT_ID', 'LINE', 'PRODUCT_CODE'])

cat_features = ['LINE', 'PRODUCT_CODE']
num_features = [i for i in test.columns if i not in cat_features]

# Two targets are kept side by side: the continuous quality value and the class label.
y = train['Y_Quality']
y_class = train["Y_Class"]

# Fill missing values with the TRAIN median in both tables so that train and
# test go through the same preprocessing (previously only train was imputed).
for col in num_features:
    col_median = train[col].median()
    train[col] = train[col].fillna(col_median)
    test[col] = test[col].fillna(col_median)

# The scaler is fitted on train only and then applied to test.
scaler = RobustScaler()
#scaler = QuantileTransformer()
train[num_features] = scaler.fit_transform(train[num_features])
test[num_features] = scaler.transform(test[num_features])

use_cat = True
if use_cat:
    X = train.drop(columns=['Y_Class','Y_Quality'])
    X_test = test
else:
    X = train[num_features]
    X_test = test[num_features]

# Keep only the features whose absolute correlation reaches the threshold.
corr = pd.read_csv('correlation/correlation_spearman.csv')
# drop the last row (Y_Quality)
corr = corr.iloc[:-1,:]
important = list(corr[abs(corr['correlation'])>=0.01]['feature'])
X = X[important]
X_test = X_test[important]
X


In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 1-nearest-neighbour classifier that maps a value of `y` to the
# matching entry of `y_class`; both are passed as single-column arrays.
quality_column = y.to_numpy().reshape(-1, 1)
class_column = y_class.to_numpy().reshape(-1, 1)
knnclassifier = KNeighborsClassifier(n_neighbors=1)
knnclassifier.fit(quality_column, class_column)

### Optuna를 통한 Hyperparameter Search

In [None]:
def cb_optimization(trial):
    """Optuna objective: mean cross-validated RMSE of an ``LGBMRegressor``.

    Reads the notebook-level ``X`` (features), ``y`` (regression target) and
    ``y_class`` (used only to stratify the folds).

    Args:
        trial: the Optuna trial used to sample the hyperparameters.

    Returns:
        The mean over the 5 folds of the validation RMSE (lower is better).

    Note:
        No intermediate value is reported to the trial, so a pruner attached
        to the study has nothing to act on.
    """
    # Sample the hyperparameters once per trial; they are identical for every
    # fold, so there is no reason to request them again inside the loop.
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 100, 200, step=5),
        'max_depth': trial.suggest_int('max_depth', 10, 30, step=1, log=False),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 3000, step=1, log=True),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 70, step=1, log=False),
        # suggest_float replaces the deprecated suggest_uniform (same range).
        'subsample': trial.suggest_float('subsample', 0.2, 0.6),
        'random_state': 45,
    }

    score = []
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # one train/validation split per fold, stratified on the class label
    for train_index, valid_index in kf.split(X, y_class):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.values[train_index], y.values[valid_index]

        model = LGBMRegressor(verbose=-1, **params)
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  )
        lgbm_output = model.predict(X_valid)
        # square root of the MSE -> RMSE for this fold
        score.append(mean_squared_error(y_valid, lgbm_output) ** 0.5)
    return np.mean(score)

In [None]:
# Seeded TPE sampler so that the search is reproducible.
sampler = TPESampler(seed=42)
optim = optuna.create_study(
    study_name="LGBM_parameter_opt",
    direction="minimize",
    sampler=sampler,
    pruner=SuccessiveHalvingPruner()
)
optim.optimize(cb_optimization, n_trials=1000)  # use 10000~99999 trials for the real training run
# The objective returns a plain (un-normalised) RMSE, so label it as such.
print("best rmse:", optim.best_value)

In [None]:
def evaluate_macroF1_lgb(truth, predictions):
    """Custom LightGBM evaluation metric: macro-averaged F1.

    Args:
        truth: the true class labels.
        predictions: the per-class scores, either as a 2-D array of shape
            (n_samples, n_classes) or as a flat array grouped by class first
            (the layout discussed in
            https://github.com/Microsoft/LightGBM/issues/1483).

    Returns:
        A ``(name, value, is_higher_better)`` tuple: ``('macroF1', f1, True)``.
    """
    predictions = np.asarray(predictions)
    if predictions.ndim == 2:
        # One row per sample: the predicted class is the arg-max of each row.
        # Reshaping this layout as if it were class-major would scramble it.
        pred_labels = predictions.argmax(axis=1)
    else:
        # Flat, class-major layout: fold it back to (n_classes, n_samples).
        pred_labels = predictions.reshape(len(np.unique(truth)), -1).argmax(axis=0)
    f1 = f1_score(truth, pred_labels, average='macro')
    return ('macroF1', f1, True)

In [None]:
import time
import joblib

# The split must be stratified on, and the classifiers fitted to, the discrete
# class label. `y` is rebound by different cells of this notebook (Y_Quality in
# the first data cell, Y_Class in a later one), so `y_class` is used explicitly.
skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)  # 5 folds in total
n = 0  # number of the current fold

fold_target_pred = []  # per-fold test-set probabilities, averaged over the three models

fold_score = []        # per-fold macro F1 of the averaged validation prediction

# create the directory the fitted models are saved into
model_dir = './model'
os.makedirs(model_dir, exist_ok=True)

# LightGBM values from an earlier search; kept for reference only, they are
# not passed to the model built below.
lgbmparams = {'num_leaves': 17,
              'max_depth': 11,
              'learning_rate': 0.07028290319049474,
              'n_estimators': 78,
              'class_weight': 'balanced',
              'min_child_samples': 12,
              'subsample': 0.831632859850219,
              'colsample_bytree': 0.9362544923583181,
              'reg_alpha': 0.01941513921336218,
              'reg_lambda': 0.0021722692515700652}

for train_index, valid_index in skf.split(X, y_class):  # stratified k-fold on the class label
    n += 1
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y_class.iloc[train_index], y_class.iloc[valid_index]

    val_pred_name = []  # names of the models, in prediction order
    val_pred = []       # validation-set probabilities per model
    arg_max_pred = []   # arg-max column index of the validation probabilities per model
    target_pred = []    # test-set probabilities per model

    ### Create Model ###
    # CatBoost model
    # result of an earlier search, kept for reference only:
    # {'iterations': 144, 'learning_rate': 0.001081938118227713, 'depth': 9, 'od_wait': 186}
    start_time_cat = time.time()
    model_cat = CatBoostClassifier(verbose = 0,
                            learning_rate = 0.02,
                            eval_metric="TotalF1:average=Macro",
                            random_seed = 42,
                            iterations = 5000,
                            #ignored_features = [8, 9, 31, 32, 33, 34, 45, 50, 51, 53, 54, 55],
                            od_wait = 200,
                            use_best_model=True,
                            depth = 9)

    model_cat.fit(X_train, y_train,
                  eval_set=(X_valid, y_valid))
    end_time_cat = time.time()

    # save the CatBoost model of this fold
    cat_path = './model/cat_{}'.format(n)
    model_cat.save_model(cat_path)

    # to reload: model_cat.load_model(cat_path)
    output = model_cat.predict_proba(X_valid)
    val_pred_name.append("CatBoostClassifier")
    val_pred.append(output)
    arg_max_pred.append(np.argmax(output, axis=1))

    target_pred.append(model_cat.predict_proba(X_test))

    ### LGBM model
    start_time_lgb = time.time()
    model_lgbm = LGBMClassifier(n_estimators = 2000,
                                learning_rate = 0.01,
                                max_depth = 16,
                                min_child_samples = 56,
                                subsample = 0.4,
                                num_leaves = 160,
                                random_state = 42,
                                n_jobs = 8,
                                verbose=-1,
                                )

    fit_params = dict(
        eval_set=[(X_valid, y_valid)],
        eval_metric = evaluate_macroF1_lgb,
        )

    model_lgbm.fit(X_train, y_train, **fit_params)
    end_time_lgb = time.time()

    output = model_lgbm.predict_proba(X_valid)
    val_pred_name.append("LGBMClassifier")
    val_pred.append(output)
    arg_max_pred.append(np.argmax(output, axis=1))

    target_pred.append(model_lgbm.predict_proba(X_test))

    # save the LightGBM model of this fold (reload with joblib.load)
    lgbm_path = './model/lgbm_{}.pkl'.format(n)
    joblib.dump(model_lgbm, lgbm_path)

    ### XGB model
    start_time_xgb = time.time()
    # NOTE(review): evaluate_macroF1_lgb returns a LightGBM-style tuple and no
    # eval_set is passed to fit() below -- confirm whether XGBoost ever calls it.
    model_xgb = XGBClassifier(n_estimators = 3000,
                              random_state = 42,
                              eval_metric = evaluate_macroF1_lgb,
                              learning_rate=0.006,
                              subsample=0.75,
                              colsample_bytree = 0.86,
                              max_depth=9,
                              tree_method='gpu_hist',
                              gpu_id = 0)

    model_xgb.fit(X_train, y_train, verbose=0)
    end_time_xgb = time.time()

    output = model_xgb.predict_proba(X_valid)
    val_pred_name.append("XGBClassifier")
    val_pred.append(output)
    arg_max_pred.append(np.argmax(output, axis=1))
    target_pred.append(model_xgb.predict_proba(X_test))

    # save the XGBoost model of this fold
    xgb_path = './model/xgb_{}.pkl'.format(n)
    joblib.dump(model_xgb, xgb_path)

    ### average validation pred ###
    # soft vote: mean probability over the models, then arg-max per row
    preds = np.mean(val_pred, axis=0)
    preds = np.argmax(preds, axis=1)

    ### average target pred ###
    # np.mean builds a new array, so the per-model arrays are not modified
    target_preds = np.mean(target_pred, axis=0)
    fold_target_pred.append(target_preds)  # final test-set prediction of this fold

    print("========== fold %d ==========" %(n))
    for model_name, model_labels in zip(val_pred_name, arg_max_pred):
        print("%s model F1 : %0.4f" %(model_name, f1_score(y_valid, model_labels, average="macro")))

    print('CAT 코드 실행 시간: %10ds' % (end_time_cat - start_time_cat))
    print('LGB 코드 실행 시간: %10ds' % (end_time_lgb - start_time_lgb))
    print('XGB 코드 실행 시간: %10ds' % (end_time_xgb - start_time_xgb))
    fold_f1 = f1_score(y_valid, preds, average='macro')
    print("average model F1 : %0.4f" %(fold_f1))
    fold_score.append(fold_f1)

# mean of the per-fold macro F1 scores
total_score = sum(fold_score) / len(fold_score)

print("==============================")
print("Model Sum Average F1 %0.4f" %(total_score))

In [None]:
# Average the test-set probabilities over however many folds were collected
# (no hard-coded fold count), then take the most probable column per row.
final_pred = np.mean(fold_target_pred, axis=0)
pred = np.argmax(final_pred, axis=1)

# Fill the prediction into the submission template and write it out.
submit = pd.read_csv("baseline_submission.csv")
submit['Y_Class'] = pred
submit.to_csv("submission.csv", index=False)
pred

In [None]:
## custom error function
def custom_error(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``."""
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

In [8]:
from sklearn.impute import KNNImputer
import pandas as pd

# Load the full train/test tables.
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Identifier and categorical columns are not used by the models below.
unused_columns = ["PRODUCT_ID", "TIMESTAMP", 'LINE', 'PRODUCT_CODE']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)
y = train['Y_Class']

# Every non-object column of the test table counts as a numeric feature.
num_features = test.select_dtypes(exclude=['object']).columns.tolist()

# (median imputation is intentionally not applied here; missing values are
# handled by the KNN imputer at the end of this cell)

# Standardise with statistics learnt from train only.
scaler = StandardScaler()
#scaler = QuantileTransformer()
scaler.fit(train[num_features])
train[num_features] = scaler.transform(train[num_features])
test[num_features] = scaler.transform(test[num_features])

X = train.drop(columns=['Y_Class', 'Y_Quality'])
X_test = test

# (the correlation-based feature filter is disabled for this variant)

# Keep only the first occurrence of columns whose values are identical in train.
dup = ~X.T.duplicated()
X = X.loc[:, dup]
X_test = X_test.loc[:, dup]

# Column labels are remembered before the imputer replaces X and X_test.
X_columns = X.columns

# Fill the remaining missing values; the imputer is fitted on train only.
imputer = KNNImputer()
X = imputer.fit_transform(X)
X_test = imputer.transform(X_test)

In [13]:
accuracy_history = []
kf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 42)

scores = []  # best validation macro F1 of each fold
models = []  # fitted model of each fold

# extra CatBoost keyword arguments (none at the moment)
params = {

}

# one train/validation split per fold
idx = 0
for train_index, valid_index in kf.split(X, y):
    idx += 1
    print(f"\n=========== step {idx} ===========\n")
    # Fold-local names are used for the validation part so that the real test
    # matrix `X_test` is not overwritten by this loop.
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

    model = CatBoostClassifier(task_type="GPU",
                               eval_metric="TotalF1:average=Macro",
                               classes_count=3,
                               use_best_model=True,
                               random_state=42,
                               loss_function='MultiClass',
                               **params)
    model.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            verbose = 0
        )
    models.append(model)
    scores.append(model.get_best_score()["validation"]["TotalF1:average=Macro"])

# The printed labels say "accuracy", but the values are the best validation
# macro F1 collected above.
print("각 분할의 정확도 기록 :", scores)
print("평균 정확도 :", np.mean(scores))












각 분할의 정확도 기록 : [0.7663720308608225, 0.6241908575241909, 0.6288019603184586, 0.646969696969697, 0.679968140183194]
평균 정확도 : 0.6692605371712725


In [None]:
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm

kf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 24)

# Only the first of the five stratified splits is used for feature selection.
train_index, valid_index = next(iter(kf.split(X, y)))
idx = 1
train_X, test_X = X.iloc[train_index], X.iloc[valid_index]
train_y, test_y = y.iloc[train_index], y.iloc[valid_index]

#train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)
feature_names = ['F{}'.format(i) for i in range(train_X.shape[1])]

# Settings of the model that drives the recursive selection.
selector_settings = dict(
    task_type="GPU",
    eval_metric="TotalF1:average=Macro",
    loss_function='MultiClass',
    verbose=0,
    learning_rate=0.02,
    random_seed=42,
    iterations=500,
    od_wait=200,
    use_best_model=True,
    depth=9,
)
model = CatBoostClassifier(**selector_settings)

# Recursively drop features by SHAP value until 150 remain, in 3 steps, and
# train a final model on the selected set.
summary = model.select_features(
    train_X,
    train_y,
    eval_set=(test_X, test_y),
    features_for_select=X.columns,
    num_features_to_select=150,
    steps=3,
    algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
    shap_calc_type=EShapCalcType.Regular,
    train_final_model=True,
    logging_level='Silent',
    plot=True
)

In [None]:
from catboost import CatBoostRegressor, Pool, EShapCalcType, EFeaturesSelectionAlgorithm
from collections import defaultdict
kf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 42)
idx = 0        # number of folds processed
not_imp = []   # features with zero importance in every fold
count = defaultdict(int)  # feature name -> number of folds with zero importance
for train_index, valid_index in kf.split(X, y):
    idx += 1
    train_X, valid_X = X.iloc[train_index], X.iloc[valid_index]
    train_y, valid_y = y.iloc[train_index], y.iloc[valid_index]

    model = CatBoostClassifier(task_type="GPU",
                                eval_metric="TotalF1:average=Macro",
                                loss_function='MultiClass',
                                verbose = 0,
                                learning_rate = 0.02,
                                random_seed = 42,
                                iterations = 500,
                                od_wait = 200,
                                use_best_model=True,
                                depth = 9)
    # Fit on the training part of the fold only. Fitting on the full X, y
    # would put the validation rows into the training data and make every
    # fold train the same model.
    model.fit(train_X, train_y, eval_set=[(valid_X, valid_y)], verbose = 0)
    important = pd.DataFrame({'feature_importance': model.get_feature_importance(Pool(X)),
                'feature_names': X.columns}).sort_values(by=['feature_importance'],
                                                            ascending=False)
    for col in important[important["feature_importance"] == 0]['feature_names']:
        count[col] += 1

# A feature is reported only when its importance was zero in every fold that
# ran; `idx` holds that number, so no fold count is hard-coded here.
not_imp.extend(name for name, hits in count.items() if hits == idx)

In [None]:
import json

# Store the feature names collected in `not_imp` as tab-indented JSON.
selected_feature_names = {"features": not_imp}
with open("correlation/drop_features.json", 'w') as fp:
    fp.write(json.dumps(selected_feature_names, indent='\t'))


In [None]:
# Fixed hyperparameters for the final model.
params = {
    'iterations': 207,
    'learning_rate': 0.09355513642675106,
    'depth': 10,
    'l2_leaf_reg': 0.00018514951960546424,
    'bootstrap_type': 'Bayesian',
    'random_strength': 1.023676953660676,
    'bagging_temperature': 0.06544495923088894,
    'od_type': 'IncToDec',
    'od_wait': 49
}

# `cat_features` names LINE and PRODUCT_CODE, which the visible preprocessing
# cells drop from X. Pass only the names X really has (none when X is a plain
# array) so the model does not reference missing columns.
present_cat_features = [c for c in cat_features if c in getattr(X, 'columns', ())]

model = CatBoostClassifier(task_type="GPU",
                            eval_metric="TotalF1:average=Macro",
                            thread_count=4,
                            classes_count=3,
                            random_state=23,
                            loss_function='MultiClass',
                            cat_features=present_cat_features or None,
                            **params)

# Train on the full training set, logging every 50 iterations.
model.fit(X, y, verbose = 50)

In [None]:
# `cat_features` may name columns that were dropped from X during
# preprocessing; hand the Pool only the names X really contains.
pool_cat_features = [c for c in cat_features if c in getattr(X, 'columns', ())]

# Feature importances of `model`, most important first, written to CSV.
important = pd.DataFrame({'feature_importance': model.get_feature_importance(Pool(X, cat_features=pool_cat_features or None)),
              'feature_names': X.columns}).sort_values(by=['feature_importance'],
                                                           ascending=False)
important.to_csv("correlation/important.csv")

In [None]:
from mlxtend.classifier import EnsembleVoteClassifier

# Soft-voting ensemble over the already fitted fold models. One equal weight
# per model: the weight list is sized from `models` instead of a fixed 10, so
# it always matches the number of classifiers.
ensemble = EnsembleVoteClassifier(clfs=models, weights=[1]*len(models),voting='soft', fit_base_estimators=False)
# The base models are not refitted (fit_base_estimators=False); fit() is only
# given the three class labels.
ensemble.fit(None,np.array([0,1,2]))

In [None]:
#test predict
# `test_x` is not defined anywhere in the visible notebook. Rebuild the test
# feature matrix from the preprocessing objects (scaled `test`, the `dup`
# column mask and the fitted `imputer`) instead of relying on a name that a
# training loop may have rebound.
test_features = imputer.transform(test.loc[:, dup])
pred = ensemble.predict(test_features)
pred

In [None]:
# Prepare the submission file: load the template and set its Y_Class column
# to the predicted labels.
submit = pd.read_csv('sample_submission.csv').assign(Y_Class=pred)

In [None]:
# Write the submission table to 'submission.csv' without the index column.
submit.to_csv('submission.csv',index=False)