<a href="https://colab.research.google.com/github/ttogle918/AI_practice/blob/main/%EC%8B%A0%EC%9A%A9%EC%B9%B4%EB%93%9C_%EC%82%AC%EA%B8%B0_%EA%B1%B0%EB%9E%98_%ED%83%90%EC%A7%80_AI_%EA%B2%BD%EC%A7%84%EB%8C%80%ED%9A%8C/02_dbscan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [None]:
!pip install wandb
!wandb login

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA, LatentDirichletAllocation, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

import requests
import os 
import zipfile
from zipfile import ZipFile

## Data Load

분석하기 위한 데이터를 항상 압축된 형태로 보관하기 위해 **압축을 풀지 않고 zip 파일 내의 파일들을 읽어 데이터로 로딩**

압축풀기 : https://gmnam.tistory.com/256

In [None]:
# Path of the zip archive that holds the competition CSV files.
# NOTE(review): this rebinds the name `zipfile`, shadowing the `zipfile` module
# imported in the import cell; only the separately imported `ZipFile` class stays usable.
zipfile = '/content/drive/MyDrive/Colab Notebooks/dataset/dacon/open.zip'

In [None]:
from io import BytesIO

# Read every CSV member straight out of the archive (no extraction to disk),
# binding the four known files to their DataFrame names and recording every
# CSV member name that was seen.
data_filename = []
with ZipFile(zipfile, 'r') as zipObj:
    for fileName in zipObj.namelist():
        # Non-CSV members are ignored entirely.
        if not fileName.endswith('csv'):
            continue

        print(fileName)
        zipRead = zipObj.read(fileName)

        if fileName == 'train.csv':
            train_df = pd.read_csv(BytesIO(zipRead))
        elif fileName == 'val.csv':
            val_df = pd.read_csv(BytesIO(zipRead))
        elif fileName == 'test.csv':
            test_df = pd.read_csv(BytesIO(zipRead))
        elif fileName == 'sample_submission.csv':
            submission_df = pd.read_csv(BytesIO(zipRead))

        data_filename.append(fileName)

sample_submission.csv
test.csv
train.csv
val.csv


In [None]:
# Preview the first three rows of the training frame.
train_df.head(3)

In [None]:
# Print the distinct values of the validation `Class` column, then preview
# the first three validation rows.
print(val_df.Class.unique())
val_df.head(3)

In [None]:
# Preview the first three rows of the test frame.
test_df.head(3)

## Train/Validation Feature 분포 확인

In [None]:
# 50-bin histogram of every training column except `ID`.
train_df.drop(columns=['ID']).hist(bins = 50, figsize = (20,20))
plt.show()

In [None]:
# 50-bin histogram of every validation column except `ID` and the `Class` label.
val_df.drop(columns=['ID', 'Class']).hist(bins = 50, figsize = (20,20))
plt.show()

## Validation set 사기 거래 비율

Validation set의 사기 거래 비율이 다른 데이터집합에서도 비슷하게 발생할 것이라고 가정

In [None]:
# Look the counts up by label (0 = normal, 1 = fraud) instead of unpacking
# value_counts() positionally: positional unpacking depends on the
# frequency ordering and raises ValueError when only one class is present.
class_counts = val_df['Class'].value_counts()
val_normal = class_counts.get(0, 0)
val_fraud = class_counts.get(1, 0)
# NOTE(review): this is the fraud-to-normal ratio, not fraud / total rows;
# the two are close only while fraud is rare — confirm which one is intended.
val_contamination = val_fraud / val_normal
print(f'Validation contamination : [{val_contamination}]')

Validation contamination : [0.0010551491277433877]


In [None]:
# The train dataset has no labels.
# Build the feature frames: drop `ID` everywhere, and also the `Class` label
# from the validation frame.
train_x = train_df.drop(columns=['ID']) # Input Data
val_x = val_df.drop(columns=['ID', 'Class'])
test_x = test_df.drop(columns=['ID'])

## 차원 축소

### pca

In [None]:
def pca(data, length=1) :
  """Project `data` with PCA, draw a cumulative scree plot and return the projection.

  Args:
    data: feature DataFrame; its column count is read through `data.columns`.
    length: number of components to keep. The default value 1 acts as a
      sentinel: it keeps `len(data.columns) - 1` components and also prints
      the per-component explained variance ratio.

  Returns:
    The transformed data as returned by `PCA.fit_transform`.
  """
  if length == 1 :
    model = PCA(n_components=len(data.columns)-1)
  else :
    model = PCA(n_components=length)

  # Fit exactly once, for both branches, before any fitted attribute is read.
  # Without this the non-default branch reads explained_variance_ratio_ from
  # an unfitted estimator and raises AttributeError.
  transformed = model.fit_transform(data)
  if length == 1 :
    print(model.explained_variance_ratio_)

  plt.title('Scree Plot')
  plt.xlabel('Number of components')
  plt.ylabel('explained_variance_')
  # Cumulative share of variance explained by the first k components.
  scree = np.cumsum(model.explained_variance_ratio_)
  # Use 1-based x positions so the axis really is the number of components.
  plt.plot(np.arange(1, len(scree) + 1), scree, 'o-')
  return transformed

In [None]:
# Run PCA on the validation features with the default `length`; the returned
# projection is displayed as the cell output.
pca(val_x)

### LDA

In [None]:
!pip install pyLDAvis

In [None]:
import pyLDAvis.gensim_models

# NOTE(review): `ldamodel`, `corpus` and `dictionary` are not defined anywhere
# in the visible notebook, so this cell raises NameError on a fresh kernel.
# `pyLDAvis.gensim_models` presumably targets gensim topic models rather than
# the LinearDiscriminantAnalysis used in the next cell — confirm whether this
# cell is still needed.
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(ldamodel, corpus, dictionary)
pyLDAvis.display(vis)

In [None]:
# Supervised projection: class labels are required to fit it.
def lda(data_x, data_y, test_x) :
  """Fit a 1-component LinearDiscriminantAnalysis on (data_x, data_y) and project both inputs.

  Prints the (input shape, projected shape) pair for `data_x` and then for
  `test_x`, and returns `(projected data_x, projected test_x)`.
  """
  # Two or more components are not possible here: with two classes in data_y
  # only (2 - 1) or fewer components are allowed.
  projector = LinearDiscriminantAnalysis(n_components=1).fit(data_x, data_y)
  transformed = projector.transform(data_x)
  transformed_test_x = projector.transform(test_x)
  for before, after in ((data_x, transformed), (test_x, transformed_test_x)) :
    print(before.shape, after.shape)
  return transformed, transformed_test_x

### t-SNE

In [None]:
def t_sne(data_x, n_components=1, random_state=None) :
  """Embed `data_x` with t-SNE and return the embedding.

  Args:
    data_x: feature matrix to embed.
    n_components: dimensionality of the embedding (default 1).
    random_state: forwarded to TSNE; pass an int for a repeatable embedding.

  Returns:
    The embedded data as returned by `TSNE.fit_transform`.
  """
  embedder = TSNE(n_components=n_components, random_state=random_state)
  # TSNE exposes no separate `transform` method, so the embedding has to be
  # taken from fit_transform; calling fit() then transform() raises AttributeError.
  transformed = embedder.fit_transform(data_x)
  print(data_x.shape, transformed.shape)
  return transformed

### svd

In [None]:
def svd(data) :
  """Fit a TruncatedSVD on `data`, print its variance statistics and plot them.

  The model keeps one fewer component than `data` has columns. Prints the
  explained variance ratios, their sum and the singular values, then plots
  the explained variance per component. Returns nothing.
  """
  decomposer = TruncatedSVD(n_components=len(data.columns)-1).fit(data)
  ratios = decomposer.explained_variance_ratio_

  print(f'explained_variance_ratio_ : {ratios}')
  print(sum(ratios))
  print(f'singular_values_ : {decomposer.singular_values_}')

  plt.title('Scree Plot')
  plt.xlabel('Number of components')
  plt.ylabel('explained_variance_')
  plt.plot(decomposer.explained_variance_, 'o-')

# Model Definition & Fit

In [None]:
# The train dataset has no labels, so the labelled validation set is used here.
# `val_x` is rebuilt with the same expression as in the earlier feature cell.
val_x = val_df.drop(columns=['ID', 'Class']) # Input Data
val_y = val_df['Class']

In [None]:
# Best validation accuracy seen so far; passed into and reassigned from
# get_dbscan_valid in the cells below.
acc = 0.0

In [None]:
def get_dbscan_valid(eps, min_samples, data=val_x, data_y = val_y, acc=acc) :
  """Run DBSCAN on `data` and score it as a fraud detector against `data_y`.

  A point DBSCAN marks as noise (-1) counts as correct when its label is 1;
  any clustered point counts as correct when its label is 0.

  Args:
    eps: DBSCAN neighbourhood radius.
    min_samples: DBSCAN core-point threshold.
    data: feature matrix to cluster.
    data_y: true labels, one per row of `data`, in the same row order.
    acc: best accuracy seen so far.

  Returns:
    The larger of `acc` and this run's accuracy. When this run is strictly
    better, `eps`, `min_samples` and the new accuracy are printed.

  Note:
    The default values are evaluated once, when this `def` executes, so the
    default `acc` is whatever the global held at that moment — pass `acc=`
    explicitly to carry the running best forward.
  """
  y_pred = DBSCAN(eps=eps, min_samples = min_samples, metric='euclidean').fit_predict(data)

  # Compare by position, not by index label: `data_y[i]` on a Series is a
  # label lookup and breaks (or misaligns) when the index is not 0..n-1.
  labels = np.asarray(data_y)
  correct = np.where(y_pred == -1, labels == 1, labels == 0)
  accuracy = int(correct.sum()) / len(y_pred)

  if accuracy > acc :
    acc = accuracy
    print(eps, min_samples, acc)
  return acc

def get_dbscan_percent(eps, min_samples, data=train_x) :
  """Cluster `data` with DBSCAN, print the fraction of noise points and return the labels.

  Noise points are those DBSCAN labels -1. The return value is the raw label
  array from `fit_predict`.
  """
  clusterer = DBSCAN(eps=eps, min_samples = min_samples, metric='euclidean')
  y_pred = clusterer.fit_predict(data)
  anomaly = sum(1 for label in y_pred if label == -1)
  print(anomaly / len(y_pred))
  return y_pred

## 차원 축소 적용 후 DBSCAN

In [None]:
# Fit LDA on the labelled validation set and project both the validation
# and the test features with it.
transformed_lda_x, transformed_test_x = lda(val_x, val_y, test_x)

(28462, 30) (28462, 1)
(142503, 30) (142503, 1)


In [None]:
# Try two eps values at min_samples=2 on the LDA projection, carrying the
# best accuracy forward through `acc`.
acc = get_dbscan_valid(0.5, 2, transformed_lda_x, acc=acc)
acc = get_dbscan_valid(1.5, 2, transformed_lda_x, acc=acc)

(28462, 30) (28462, 1)
0.5 2 0.9992270395615206


In [None]:
# Sweep further (eps, min_samples) pairs, carrying the best accuracy forward.
# The first call must assign its result: written as `acc, get_dbscan_valid(...)`
# it only builds a throwaway tuple, and `acc` is not updated.
acc = get_dbscan_valid(0.2, 2, transformed_lda_x, acc=acc)
acc = get_dbscan_valid(0.2, 12, transformed_lda_x, acc=acc)
acc = get_dbscan_valid(1.0, 12, transformed_lda_x, acc=acc)
acc = get_dbscan_valid(1.0, 8, transformed_lda_x, acc=acc)

0.2 2 0.9994027123884478
0.2 12 0.9994027123884478
1.0 12 0.9996486543461457


In [None]:
# Continue the sweep around eps 0.5-0.8 with min_samples between 8 and 12.
acc = get_dbscan_valid(0.5, 12, transformed_lda_x, acc=acc)
acc = get_dbscan_valid(0.8, 12, transformed_lda_x, acc=acc)
acc = get_dbscan_valid(0.8, 8, transformed_lda_x, acc=acc)
acc = get_dbscan_valid(0.8, 10, transformed_lda_x, acc=acc)

In [None]:
# One more candidate: eps=0.5 with min_samples=6.
acc = get_dbscan_valid(0.5, 6, transformed_lda_x, acc=acc)


In [None]:
# Drop the validation projection; re-running the sweep cells above after this
# point requires re-running the LDA cell first.
del transformed_lda_x

In [None]:
# Cluster the LDA-projected test features with eps=0.2, min_samples=2; the
# helper prints the fraction of noise points and returns the raw DBSCAN labels.
y_pred = get_dbscan_percent(0.2, 2, transformed_test_x)

# Evaluation : Validation set

In [None]:
def get_pred_label(model_pred):
    """Convert outlier-detector output into binary fraud labels.

    -1 (outlier / noise) becomes 1 (fraud); every other value becomes
    0 (normal). That covers IsolationForest-style output (1 = normal,
    -1 = outlier) as well as DBSCAN labels, where clustered points carry
    arbitrary non-negative cluster ids and only noise is -1.

    Args:
        model_pred: array-like of detector outputs (list or ndarray).

    Returns:
        An integer ndarray of the same shape containing only 0 and 1.
    """
    # Coerce first: comparing a plain list with `==` yields a single bool,
    # which would silently skip the element-wise mapping.
    model_pred = np.asarray(model_pred)
    return np.where(model_pred == -1, 1, 0)

In [None]:
# Score the detector on the labelled validation set with macro F1 and a
# per-class report.
# NOTE(review): `model` is not defined anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel — confirm which estimator is
# meant to be evaluated here.
val_x = val_df.drop(columns=['ID', 'Class']) # Input Data
val_y = val_df['Class'] # Label

val_pred = model.fit_predict(val_x) # model prediction
val_pred = get_pred_label(val_pred)
val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))

## Inference : Test set

In [None]:
# Test features: every column except `ID`.
test_x = test_df.drop(columns=['ID'])

In [None]:
# Predict on the test features and convert the raw output to 0/1 labels.
# NOTE(review): `model` is not defined anywhere in the visible notebook;
# confirm the intended estimator and that it provides a `predict` method.
test_pred = model.predict(test_x) # model prediction
test_pred = get_pred_label(test_pred)

# Submission

In [None]:
# Write the predictions into the submission frame and show how many rows
# fall into each class.
submission_df['Class'] = test_pred
submission_df['Class'].value_counts()

0    142305
1       198
Name: Class, dtype: int64

In [None]:
# Save the submission as CSV in the working directory, without the index column.
submission_df.to_csv('./submit.csv', index=False)