# Module import

In [1]:
import numpy as np 
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.covariance import EllipticEnvelope
from tqdm.notebook import tqdm

import lightgbm 
from lightgbm import LGBMClassifier
import optuna
from optuna.samplers import TPESampler

import torch

import warnings
warnings.filterwarnings(action='ignore')



In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time
import collections
from collections import Counter

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, RobustScaler, StandardScaler

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedShuffleSplit, StratifiedKFold, cross_val_score

import xgboost as xgb
import optuna
import umap

# Data Load

In [2]:
# Read the train / validation / test CSV files in one pass.
train, valid, test = map(
    pd.read_csv,
    ('./dacon/train.csv', './dacon/val.csv', './dacon/test.csv'),
)

In [3]:
# Feature frames: the raw frames without the 'ID' column.
trainset = train.drop(columns=['ID'])
testset = test.drop(columns=['ID'])

In [4]:
# Share of validation rows whose 'Class' value is 1
# (sum of the 'Class' column divided by the row count).
class_total = valid['Class'].values.sum()
fraud_ratio = class_total / len(valid)
print(fraud_ratio)

0.0010540369615627855


In [5]:
# Fit the outlier detector on the train features, using the validation
# fraud ratio as the contamination level.
model = EllipticEnvelope(
    contamination=fraud_ratio,
    support_fraction=0.994,
    random_state=42,
)
model.fit(trainset)

EllipticEnvelope(contamination=0.0010540369615627855, random_state=42,
                 support_fraction=0.994)

# Ensemble을 위한 test prediction value 1 획득

In [6]:
def get_pred_label(model, x, k):
    """Label the ``k`` lowest-scoring samples of ``x`` as 1.

    Args:
        model: Object exposing ``score_samples(x)``.
        x: Samples to score; ``len(x)`` sets the length of the label tensor.
        k: Number of samples to label 1.

    Returns:
        Tuple ``(pred, prob)``: ``pred`` is a ``torch.long`` tensor of 0/1
        labels, ``prob`` is the ``torch.float`` tensor of raw scores.
    """
    scores = torch.tensor(model.score_samples(x), dtype=torch.float)
    # largest=False -> indices of the k smallest scores.
    _, lowest_idx = scores.topk(k, largest=False)

    labels = torch.zeros(len(x), dtype=torch.long)
    labels[lowest_idx] = 1
    return labels, scores

In [7]:
# Label the 313 lowest-scoring test rows as 1.
# NOTE(review): 313 is hard-coded; presumably the expected number of fraud
# rows in the test set -- confirm.
test_pred, _ = get_pred_label(model=model, x=testset, k=313)

In [8]:
# Independent NumPy copy of the label tensor, kept for the ensemble step.
envelope_pred = test_pred.numpy().copy()

# 분류 모델링을 위한 trainset label 임의 획득

In [9]:
# Labels are obtained on the assumption that the train set has the same fraud
# ratio as the validation set, i.e. roughly 118-120 fraud labels.
# The train set is given these pseudo-labels so that the result of a
# supervised model can be included in the ensemble.
# Admittedly, training on labels that are partly wrong is not logically
# sound, but the model is trained on them anyway in order to focus on
# catching as many fraudulent transactions as possible.

train_pred, _ = get_pred_label(model=model, x=trainset, k=118)
Y = train_pred.numpy().copy()

# 모델 최적화 (optuna module 사용)

In [None]:
def RF_objective(trial: optuna.trial.Trial):
    """Optuna objective: mean cross-validated F1 of an XGBoost classifier.

    Despite the ``RF_`` prefix, the estimator built here is
    ``xgb.XGBClassifier``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean F1 score over 3 stratified shuffle splits (30% held out each).

    NOTE(review): reads the module-level names ``X``, ``y`` and
    ``scale_pos_weight``, none of which are assigned in the visible part of
    this notebook (the final model is fitted on ``trainset`` / ``Y``) --
    confirm they are defined before the study is run.
    """
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': 'gbtree',
        'n_estimators': trial.suggest_int("n_estimators", 100, 1000, step=10),
        "max_depth": trial.suggest_int("max_depth", 4, 10),

        # When using a GPU:
        'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor', 'gpu_id': 0,
        # When using the CPU instead:
        # "tree_method": 'exact', 'gpu_id': -1,

        "gamma": trial.suggest_float("gamma", 0.1, 1.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        # suggest_float(step=...) / suggest_float(log=True) replace the
        # deprecated suggest_discrete_uniform / suggest_loguniform calls and
        # sample from the same distributions.
        'subsample': trial.suggest_float('subsample', 0.6, 1.0, step=0.05),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-2, 1, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-2, 1, log=True),
        'scale_pos_weight': scale_pos_weight,
        # Not tuned:
        # 'early_stopping_rounds': trial.suggest_int('early_stopping_rounds', 5, 20, step=5)
        # "learning_rate": trial.suggest_float('learning_rate', 0.0001, 0.99),
        # 'num_parallel_tree': trial.suggest_int("num_parallel_tree", 1, 500)  # adding it slows things down
    }

    clf = xgb.XGBClassifier(**params, random_state=42, use_label_encoder=False, n_jobs=-1)

    # Fixed random_state: every trial is scored on the same three splits.
    cv = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=42)
    scores = cross_val_score(clf, X, y, cv=cv, scoring="f1", n_jobs=-1)

    return scores.mean()

In [37]:
# optuna.logging.set_verbosity(optuna.logging.WARNING)

In [None]:
# Run the Optuna search (1000 trials) that maximizes RF_objective.
# NOTE(review): a MedianPruner is attached, but RF_objective as written does
# not report intermediate values, so pruning presumably never triggers --
# confirm.
RF_study = optuna.create_study(
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
    direction='maximize',
)
RF_study.optimize(RF_objective, n_trials=1000)

In [47]:
# Display the hyperparameters of the best trial found by the study.
RF_study.best_params

{'n_estimators': 460,
 'max_depth': 4,
 'gamma': 0.7155935590497422,
 'min_child_weight': 1,
 'subsample': 0.95,
 'colsample_bytree': 0.8516143607917493,
 'colsample_bylevel': 0.8499089569307289,
 'colsample_bynode': 0.7783533814866374,
 'reg_lambda': 0.20646353054278152,
 'reg_alpha': 0.031101827218005206}

In [48]:
# Create the final classifier from the tuned hyperparameters. The fixed
# settings below are passed explicitly because RF_study.best_params only
# contains the values that were sampled by the trial.
# NOTE(review): `scale_pos_weight` is not assigned in the visible part of
# this notebook -- confirm it is defined before this cell runs.
rfc = xgb.XGBClassifier(**RF_study.best_params,
                        objective='binary:logistic',
                        eval_metric='auc',
                        booster='gbtree',
                        scale_pos_weight=scale_pos_weight,  # comma was missing here (SyntaxError)
                        random_state=42,
                        use_label_encoder=False, n_jobs=-1)

In [49]:
# Train the tuned classifier on the train features and the pseudo-labels Y.
rfc.fit(X=trainset, y=Y)

In [50]:
# Class predictions of the supervised model for the test features.
y_pred = rfc.predict(X=testset)

# Ensemble을 위한 test prediction value 2 획득

In [14]:
# The ensemble uses an "A or B is true" rule so that any example predicted as
# fraud by at least one of the two predictors is decided to be fraud.
# Reason: fraud detection was judged to call for conservative decisions.

sub = pd.read_csv('./dacon/sample_submission.csv')
sub['Class'] = np.bitwise_or(envelope_pred, y_pred)  # Ensemble
sub.to_csv('./dacon/AorBans_result.csv', index=False)