In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import warnings
import seaborn as sns

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

# Load the data set from a CSV file located in the current working directory.
card_df = pd.read_csv('./creditcard.csv')
# Bare last expression: the notebook displays the frame as this cell's output.
card_df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [2]:
# Print the column names, non-null counts and dtypes of the loaded frame.
card_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [3]:
# Helper that scales one selected column and removes the raw 'Time' column.
def preprocess(df=None, scaling_column=None, scaler=None):
    """Return a copy of ``df`` without 'Time', optionally scaling one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied and never modified. A frame must be
        supplied even though the parameter has a ``None`` default.
    scaling_column : str, optional
        Name of the column to scale.
    scaler : object, optional
        Object exposing ``fit_transform`` that accepts an ``(n, 1)`` array.
        Scaling only happens when both ``scaling_column`` and ``scaler``
        are given.

    Returns
    -------
    pandas.DataFrame
        The copied frame. When scaling is requested, the scaled values are
        stored in a new first column named ``'<scaling_column>_scaled'`` and
        the raw column is removed. A 'Time' column is removed if present.

    Notes
    -----
    The scaler is fitted on every row of ``df``. If the result is split into
    train and test sets afterwards, the scaler has already seen the test rows.
    """
    df_copy = df.copy()
    if scaling_column is not None and scaler is not None:
        raw_values = df_copy[scaling_column].values.reshape(-1, 1)
        # Flatten the (n, 1) scaler output so it is inserted as a plain 1-D column.
        scaled_values = np.asarray(scaler.fit_transform(raw_values)).ravel()
        df_copy.insert(0, f'{scaling_column}_scaled', scaled_values)
        df_copy = df_copy.drop(columns=[scaling_column])
    # errors='ignore': 'Time' may already be gone (it was the scaled column) or
    # may never have existed; the previous unconditional drop raised KeyError.
    return df_copy.drop(columns=['Time'], errors='ignore')

In [4]:
from sklearn.model_selection import train_test_split

def split_train_and_test(df, scaling_column=None, scaler=None):
    """Split ``df`` into stratified train/test sets, optionally scaling one column.

    The last column of the preprocessed frame is used as the label and all
    other columns as features. 30% of the rows go to the test set, with
    ``random_state=0`` and stratification on the label.

    Scaling is done after the split: the scaler is fitted on the training
    rows only and then applied to the test rows, so no test-set statistics
    leak into the features. The scaled values replace the raw column and are
    placed first, under the name ``'<scaling_column>_scaled'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; the label must be its last column.
    scaling_column : str, optional
        Feature column to scale.
    scaler : object, optional
        Object exposing ``fit_transform`` and ``transform``. Scaling only
        happens when both ``scaling_column`` and ``scaler`` are given.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]``.
    """
    # Call preprocess without a scaler: only its non-scaling clean-up is wanted
    # here, because the scaler must not be fitted before the split.
    df = preprocess(df)
    X_df = df.iloc[:, :-1]
    y_df = df.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X_df, y_df, test_size=0.3, random_state=0, stratify=y_df
    )

    if scaling_column is not None and scaler is not None:
        scaled_name = f'{scaling_column}_scaled'
        # Fit on the training rows only; the test rows are merely transformed.
        train_scaled = scaler.fit_transform(X_train[scaling_column].values.reshape(-1, 1))
        test_scaled = scaler.transform(X_test[scaling_column].values.reshape(-1, 1))
        # drop() returns new frames, so the inserts below never touch X_df.
        X_train = X_train.drop(columns=[scaling_column])
        X_test = X_test.drop(columns=[scaling_column])
        X_train.insert(0, scaled_name, np.asarray(train_scaled).ravel())
        X_test.insert(0, scaled_name, np.asarray(test_scaled).ravel())

    return [X_train, X_test, y_train, y_test]

In [5]:
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.metrics import fbeta_score, confusion_matrix
from mlflow import log_metric

def get_clf_eval(y_test, pred, pred_proba):
    """Compute binary-classification metrics, log them to MLflow and print them.

    Parameters
    ----------
    y_test : array-like
        True labels.
    pred : array-like
        Predicted labels, used for the confusion matrix, precision, recall
        and the F2 score.
    pred_proba : array-like
        Scores passed to ``roc_auc_score``.

    Returns
    -------
    dict
        The logged values, keyed by the metric names used for MLflow:
        ``'ROC AUC score'``, ``'precision'``, ``'recall'`` and ``'f2_score'``.
    """
    confusion = confusion_matrix(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f2 = fbeta_score(y_test, pred, beta=2)
    roc_auc = roc_auc_score(y_test, pred_proba)

    # Metric names are kept exactly as before so existing runs stay comparable.
    metrics = {
        'ROC AUC score': roc_auc,
        'precision': precision,
        'recall': recall,
        'f2_score': f2,
    }
    for metric_name, metric_value in metrics.items():
        log_metric(metric_name, metric_value)

    print('confusion matrix')
    print(confusion)
    print(f'AUC: {roc_auc:.4f}, F2 score: {f2:.4f}, 정밀도: {precision:.4f}, 재현율: {recall:.4f}')

    return metrics

In [6]:
from datetime import datetime

def eval_model(model, X_train, X_test, y_train, y_test, tag='default', experiment_id=None):
    """Fit ``model`` inside a new MLflow run and evaluate it on the test set.

    The run is named ``'<model class name>_<tag>_<HH:MM:SS>'``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        fitted in place.
    X_train, X_test, y_train, y_test
        Training and test features/labels.
    tag : str, optional
        Label embedded in the run name. ``None`` is treated like the default
        ``'default'``.
    experiment_id : optional
        Forwarded to ``mlflow.start_run``.

    Returns
    -------
    Whatever ``get_clf_eval`` returns for the test-set predictions.
    """
    # Callers may forward an explicit None; without this the run name would
    # contain the literal text 'None' instead of the default label.
    if tag is None:
        tag = 'default'

    algorithm_name = model.__class__.__name__
    started_at = datetime.now().strftime("%H:%M:%S")
    run_name = f'{algorithm_name}_{tag}_{started_at}'

    with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        # Column 1 of predict_proba is used as the score for ROC AUC.
        pred_proba = model.predict_proba(X_test)[:, 1]
        return get_clf_eval(y_test, pred, pred_proba)

In [7]:
def get_experiment_id(experiment_name):
    """Return the id of the MLflow experiment ``experiment_name``, creating it if needed.

    The lookup-then-create sequence is not atomic: when several processes
    resolve the same new name at once, only one ``create_experiment`` call
    succeeds. A failed creation is therefore followed by a second lookup, and
    the error is re-raised only if the experiment still cannot be found.

    Parameters
    ----------
    experiment_name : str
        Name of the experiment.

    Returns
    -------
    The experiment id, as returned by MLflow.

    Raises
    ------
    mlflow.exceptions.MlflowException
        If creation fails and the experiment does not exist afterwards.
    """
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is not None:
        return experiment.experiment_id

    try:
        return mlflow.create_experiment(experiment_name)
    except mlflow.exceptions.MlflowException:
        # Possibly lost a creation race with another process: look it up again.
        experiment = mlflow.get_experiment_by_name(experiment_name)
        if experiment is None:
            raise
        return experiment.experiment_id

In [9]:
def train_model(df, model, experiment_name, column=None, scaler=None):
    """Split ``df``, then fit and evaluate ``model`` in an MLflow run.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data, forwarded to ``split_train_and_test``.
    model : estimator
        Model handed to ``eval_model``.
    experiment_name : str
        MLflow experiment under which the run is recorded.
    column : str, optional
        Column to scale; also passed to ``eval_model`` as the run tag.
    scaler : object, optional
        Scaler forwarded to ``split_train_and_test``.

    Returns
    -------
    The value returned by ``eval_model``. It is handed back rather than
    discarded so that callers collecting results (e.g. through
    ``joblib.Parallel``) receive the evaluation outcome.
    """
    experiment_id = get_experiment_id(experiment_name)
    X_train, X_test, y_train, y_test = split_train_and_test(df, column, scaler)
    return eval_model(model, X_train, X_test, y_train, y_test, column, experiment_id)

In [None]:
# Baseline: LightGBM trained on the data with no column scaled.
from lightgbm import LGBMClassifier

lgbm_clf = LGBMClassifier(
    n_estimators=500,
    num_leaves=64,
    n_jobs=-1,
    boost_from_average=False,
)

# No column/scaler is passed, so train_model uses its defaults (None) for both.
train_model(df=card_df, model=lgbm_clf, experiment_name='baseline')

In [None]:
# Check how LogisticRegression and the LightGBM classifier perform when each
# column is scaled in turn with StandardScaler.

from joblib import Parallel, delayed
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

experiment_name = "StandardScaler experiment"
scaler = StandardScaler()
# Candidate columns: everything except 'Time' and the 'Class' label, plus a
# trailing None entry for a run in which no column is scaled.
columns = card_df.columns.tolist()
columns.remove('Time')
columns.remove('Class')
columns.append(None)
lr_clf = LogisticRegression(max_iter=1000)
lgbm_clf = LGBMClassifier(n_estimators=500, num_leaves=64, n_jobs=-1, boost_from_average=False)
models = [lr_clf, lgbm_clf]

# Resolve (and on first use create) the experiment once, before fanning out.
# Otherwise every parallel task tries to create the same experiment at the
# same time and all but one of the create calls can fail.
get_experiment_id(experiment_name)

# One task per (column, model) pair.
# NOTE(review): the outer Parallel and LGBMClassifier both request n_jobs=-1;
# confirm this does not oversubscribe the CPU on the target machine.
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(train_model)(card_df, model, experiment_name, column, scaler)
    for column in columns for model in models
)