# Abstract
#### Setting
        1. 1차 시도: 통계적 기법으로 큐톤 신호를 라벨링하여 이상탐지
        2. 채널별로 데이터 저장
        3. 데이터 라벨링 후 LightGBM 모델링

#### Method
        1. 시스템화를 위해 모듈화를 진행
        2. mlflow_logging 함수를 이용하여 mlflow에 로깅

#### Result

In [4]:
import sys
import os

In [5]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score


sys.path.append('../Common/Library/')
from mlflow_logging import mlflow_logging, binary_classification_metric

import joblib

In [6]:
# Load the labelled records of a single channel (GOLF&PBA) from its per-channel CSV file.
df_origin = pd.read_csv('../Common/data/Type_A_csv/GOLF&PBA.csv')

In [7]:
# Preview the first rows to check the raw columns before preprocessing.
df_origin.head()

Unnamed: 0,시간,요일,채널명,label
0,00:00:42,화,GOLF&PBA,1
1,00:02:59,수,GOLF&PBA,1
2,00:05:06,수,GOLF&PBA,1
3,00:12:55,화,GOLF&PBA,1
4,00:13:25,수,GOLF&PBA,1


## Function

## 전처리

In [8]:
# Break each '시간' string on ':' into three separate columns: 시 / 분 / 초.
time_parts = pd.DataFrame(
    df_origin['시간'].str.split(':').tolist(),
    columns=['시', '분', '초'],
)
# Carry over every column after the first one unchanged
# (positional slice: the first column of the frame is left out).
remaining_columns = df_origin.iloc[:, 1:]
# Place the time parts in front of the remaining columns, side by side.
df = pd.concat([time_parts, remaining_columns], axis=1)

In [9]:
# The weekday column is turned into a pandas categorical.
df['요일'] = df['요일'].astype('category')
# The split time parts are still strings at this point; cast each one to int.
for time_col in ('시', '분', '초'):
    df[time_col] = df[time_col].astype('int')

## Data Segmentation

In [10]:
# Target is the 'label' column; features are everything except the target
# and the channel name.
y = df['label']
X = df.drop(columns=['label', '채널명'])

# 80/20 shuffled split with a fixed seed, stratified on the target so both
# parts keep the same label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2022,
    shuffle=True,
    stratify=y,
)

In [11]:
# Display the training features for a visual sanity check.
X_train

Unnamed: 0,시,분,초,요일
248,8,40,10,수
361,14,3,16,토
233,8,18,18,수
271,9,39,53,금
101,4,1,11,월
...,...,...,...,...
4,0,13,25,수
345,13,25,10,화
50,2,24,53,화
230,8,15,24,목


## Modeling

In [12]:
# Train a LightGBM classifier with default hyper-parameters and a fixed seed.
# fit() returns the fitted estimator itself, so the call can be chained.
model = LGBMClassifier(random_state=2022).fit(X_train, y_train)

# Score of the second class column of predict_proba, plus the hard class labels.
prediction_proba = model.predict_proba(X_test)[:, 1]
prediction = model.predict(X_test)

# ROC AUC is computed from the probabilities; the other metrics from the hard labels.
auc = roc_auc_score(y_test, prediction_proba)
recall = recall_score(y_test, prediction)
precision = precision_score(y_test, prediction)
accuracy = accuracy_score(y_test, prediction)

In [13]:
# Print one "name: value" line per metric, in a fixed order.
for name, metric in {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'auc': auc,
}.items():
    print(f"{name}: {metric}")

accuracy: 0.967391304347826
precision: 0.9866666666666667
recall: 0.9736842105263158
auc: 0.966282894736842


## MLflow에 로깅

In [14]:
# Metric dictionary produced by the shared project helper.
metric_dict = binary_classification_metric(y_test, prediction, prediction_proba)

# Number the feature columns starting at 1 so the model inputs can be recorded as tags.
# Build this from whichever X was actually fed to the model.
tag_dict = {str(position): column for position, column in enumerate(X.columns, start=1)}
reduced_features = tag_dict

# Show everything that would be handed to the MLflow logging helper.
for label, value in (
    ("model", model),
    ("model_params_dict", model.get_params()),  # parameters held by the estimator
    ("metric_dict", metric_dict),
):
    print(f"{label}: {value}\n")
print(f"tags_dict: {reduced_features}")

model: LGBMClassifier(random_state=2022)

model_params_dict: {'boosting_type': 'gbdt', 'class_weight': None, 'colsample_bytree': 1.0, 'importance_type': 'split', 'learning_rate': 0.1, 'max_depth': -1, 'min_child_samples': 20, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 100, 'n_jobs': -1, 'num_leaves': 31, 'objective': None, 'random_state': 2022, 'reg_alpha': 0.0, 'reg_lambda': 0.0, 'silent': 'warn', 'subsample': 1.0, 'subsample_for_bin': 200000, 'subsample_freq': 0}

metric_dict: {'Accuracy': 0.967391304347826, 'Precision': 0.9866666666666667, 'Recall': 0.9736842105263158, 'F1_Score': 0.9801324503311258, 'AUC': 0.955592105263158}

tags_dict: {'1': '시', '2': '분', '3': '초', '4': '요일'}


In [15]:
# mlflow_logging(experiment='Quetone', run_name=df['채널명'][0],                         
#                model=model, model_name='LGBM', 
#                model_params_dict=model.get_params(),
#                metric_dict=metric_dict,
#                tag = X
#                #img_dict={'시각화 자료':fig}
#               )

## 모듈화

In [None]:
def _load_channel_frame(csv_path):
    """Read one channel CSV and return the preprocessed frame.

    The '시간' strings are split on ':' into 시 / 분 / 초 columns, which are cast
    to int. Every column after the first one of the CSV is carried over
    unchanged, and '요일' is cast to a pandas categorical.

    Args:
        csv_path: Path of the CSV file to load.

    Returns:
        pandas.DataFrame with the 시, 분, 초 columns followed by the carried-over columns.
    """
    raw = pd.read_csv(csv_path)
    time_parts = pd.DataFrame(
        raw['시간'].str.split(':').tolist(),
        columns=['시', '분', '초'],
    )
    frame = pd.concat([time_parts, raw.iloc[:, 1:]], axis=1)

    frame['요일'] = frame['요일'].astype('category')
    for time_col in ('시', '분', '초'):
        frame[time_col] = frame[time_col].astype('int')
    return frame


path = '../Common/data/Type_A_csv/'
model_dir = '../Model/'

# os.listdir returns entries in arbitrary order; sort so channels are always
# processed (and logged) in the same order.
file_list = sorted(os.listdir(path))
file_list_py = [file for file in file_list if file.endswith('.csv')]  # keep only *.csv files

# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs(model_dir, exist_ok=True)

for file_name in file_list_py:
    # Load and preprocess this channel's data.
    df = _load_channel_frame(os.path.join(path, file_name))

    # Each file holds one channel; its name is used as the run name and model file name.
    channel_name = df['채널명'].iloc[0]
    print(channel_name)

    # Train / test split (80/20, fixed seed, stratified on the label).
    X, y = df.drop(['label', '채널명'], axis=1), df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=2022, shuffle=True, stratify=y
    )

    # Modeling.
    model = LGBMClassifier(random_state=2022)
    model.fit(X_train, y_train)

    # Inference: hard labels and the score of the second class column.
    prediction = model.predict(X_test)
    prediction_proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, prediction)
    precision = precision_score(y_test, prediction)
    recall = recall_score(y_test, prediction)
    try:
        auc = roc_auc_score(y_test, prediction_proba)
    except ValueError:
        # roc_auc_score rejects a y_test that contains a single class. Reset the
        # value so the previous channel's AUC is never carried over by accident.
        auc = None
        print('실측치가 1만 존재합니다.')

    # Metrics to log, computed by the shared project helper.
    metric_dict = binary_classification_metric(y_test, prediction, prediction_proba)
    reduced_features = tag_dict = {f"{idx+1}": columns for idx, columns in enumerate(X.columns)}

    # Log the run to MLflow.
    # NOTE(review): `tag` receives the whole feature DataFrame although a
    # numbered column dict (tag_dict) is built just above; confirm which of the
    # two mlflow_logging expects.
    mlflow_logging(experiment='Quetone', run_name=channel_name,
                   model=model, model_name='LGBM',
                   model_params_dict=model.get_params(),
                   metric_dict=metric_dict,
                   tag=X
                   #img_dict={'시각화 자료':fig}
                   )

    # Persist the fitted model, one file per channel.
    joblib.dump(model, os.path.join(model_dir, f"{channel_name}_model.pkl"))
#df = df.reset_index(drop = True)