Imports

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import sys, os
import seaborn as sns
from scipy import stats
from tqdm import tqdm
import random
import time

import tensorflow as tf
from tensorflow.keras import Model, layers, losses

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import( confusion_matrix,accuracy_score,precision_score,f1_score
                            ,recall_score,precision_recall_curve,
                            average_precision_score,roc_curve)

from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest, VotingClassifier
from sklearn.cluster import KMeans
from sklearn.svm import OneClassSVM
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDOneClassSVM
from sklearn.pipeline import make_pipeline


# NOTE(review): no category/module filter is given, so this hides every warning
# raised for the rest of the session.
warnings.filterwarnings(action='ignore')

# Module-level alias for the Keras mean-squared-error function.
mse = tf.keras.losses.mean_squared_error

random seed 고정

In [47]:
# One seed shared by every random number generator used in this notebook.
random_seed = 42

# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here presumably only matters for child processes - confirm if hash order matters.
os.environ["PYTHONHASHSEED"] = str(random_seed)

# Seed Python's `random`, NumPy's legacy global RNG and TensorFlow.
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(random_seed)

csv read

In [48]:
# Raw splits exactly as stored on disk.
train = pd.read_csv('data/train.csv')
val = pd.read_csv('data/val.csv')
test = pd.read_csv('data/test.csv')

# Outlier fraction passed to the detectors as `contamination` / `nu`.
val_contamination = 0.001055

# Model inputs: drop the row identifier (and, for validation, the label).
train_in = train.drop(columns=['ID'])
val_in = val.drop(columns=['ID', 'Class'])
test_in = test.drop(columns=['ID'])

# Validation labels, kept aside for scoring.
val_Y = val['Class']

# 연습 1

In [49]:
# Scratch cell for eyeballing the inputs; the .info() calls are disabled.
#train.info()
#val.info()
#test.info()
#val_in.info()

# Only the last expression of a cell is rendered, so the first head() result
# is computed and then discarded.
val_in.head()
train_in.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30
0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,4.983721,-0.994972
1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,1.418291,-0.994972
2,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,-0.256131,-0.99496
3,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,1.249376,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,0.262698,-0.994901
4,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,-0.41043,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,0.9949,-0.994901


Autoencoder

In [50]:
class Autoencoder(Model):
    """Fully connected autoencoder used to score how well a row is reconstructed.

    The encoder stacks Dense layers of 64, 32 and ``encoding_dim`` units; the
    decoder mirrors it with 32 and 64 units followed by a Dense layer of
    ``input_shape`` units with no activation. Every other layer uses a sigmoid.
    """
    def __init__(self, encoding_dim,input_shape):
        """Build the encoder and decoder stacks.

        Args:
            encoding_dim: number of units in the bottleneck (latent) layer.
            input_shape: number of units in the final decoder layer, i.e. the
                number of features in each reconstructed row.
        """
        super(Autoencoder, self).__init__()
        self.latent_dim = encoding_dim
        self.encoder = tf.keras.Sequential([
            layers.Dense(64, activation='sigmoid'),
            layers.Dense(32, activation='sigmoid'),
            layers.Dense(self.latent_dim, activation='sigmoid'),
        ])
        self.decoder = tf.keras.Sequential([
            layers.Dense(32, activation='sigmoid'),
            layers.Dense(64, activation='sigmoid'),
            layers.Dense(input_shape),
        ])
        # Per-sample reconstruction error function used by reconstruction_loss().
        self.mse = tf.keras.losses.mean_squared_error

    def call(self, x):
        """Forward pass: encode ``x`` and decode it back to the input space."""
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

    def reconstruction_loss(self,x):
        """Return the mean squared error between ``x`` and its reconstruction.

        The error is computed with ``self.mse`` so the method does not depend
        on a module-level alias being defined.
        """
        # Invoke the model itself rather than call() so Keras' own call
        # machinery runs before the forward pass.
        out = self(x)
        return self.mse(x, out)

    def encode(self, x):
        """Return the latent representation produced by the encoder."""
        return self.encoder(x)

# Feature matrix for the autoencoder: every training column except the row ID.
train_ae = train.drop(columns=['ID']).to_numpy()

# 16-unit bottleneck; the output width equals the number of feature columns.
AE = Autoencoder(encoding_dim=16, input_shape=train_ae.shape[1])
AE.compile(optimizer='SGD', loss='mse')

# The network learns to reproduce its own input (input and target are the same array).
AE.fit(
    train_ae,
    train_ae,
    batch_size=1024,
    epochs=200,
    verbose=0,
)

<keras.callbacks.History at 0x1d707d44ee0>

preprocessing

In [51]:
def update_dis(df):
    """Add the autoencoder reconstruction error of every row as column 'AE'.

    Only columns whose name starts with 'V' are fed to the module-level model
    ``AE``. The frame is modified in place and also returned for chaining.

    Args:
        df: DataFrame containing the 'V*' feature columns.

    Returns:
        The same DataFrame object, now carrying the 'AE' column.
    """
    # str() keeps the filter working when a column label is not a string.
    feature_cols = [col for col in df.columns if str(col).startswith('V')]
    x = df[feature_cols].to_numpy()
    dis = AE.reconstruction_loss(x)
    # The loss comes back from the Keras MSE function; convert it to a plain
    # NumPy array so the new column gets a numeric dtype.
    df['AE'] = np.asarray(dis)
    return df
      
# Learn the scaling statistics from the training inputs only ...
ss = StandardScaler()
ss.fit(train_in)

# ... then apply that same transform to all three splits.
train_in, val_in, test_in = (
    ss.transform(split) for split in (train_in, val_in, test_in)
)

평가 지표

In [52]:
def get_clf_eval(y_test , pred):
    """Print the confusion matrix and summary scores for a set of predictions.

    Precision, recall and F1 are macro-averaged. Nothing is returned.

    Args:
        y_test: ground-truth labels.
        pred: predicted labels, aligned with ``y_test``.
    """
    macro = {'average': 'macro'}
    matrix = confusion_matrix(y_test, pred)
    scores = (
        accuracy_score(y_test, pred),
        precision_score(y_test, pred, **macro),
        recall_score(y_test, pred, **macro),
        f1_score(y_test, pred, **macro),
    )
    print('오차 행렬')
    print(matrix)
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1:{3:.4f}'.format(*scores))

models
참고 링크 (scikit-learn 공식 문서):
https://scikit-learn.org/stable/auto_examples/miscellaneous/plot_anomaly_comparison.html#sphx-glr-auto-examples-miscellaneous-plot-anomaly-comparison-py

In [53]:
# Candidate detectors. Those that take an expected outlier fraction receive
# `val_contamination` (as `contamination` or `nu`).
kmeans = KMeans(n_clusters=2, random_state=42)
EE = EllipticEnvelope(contamination=val_contamination, random_state=42)
iF = IsolationForest(
    n_estimators=125,
    max_samples=len(train_in),
    contamination=val_contamination,
    random_state=42,
    verbose=0,
)
lof = LocalOutlierFactor(novelty=True, contamination=val_contamination)
svm = OneClassSVM(nu=val_contamination, kernel="rbf")
svm_sgd = make_pipeline(
    Nystroem(gamma=0.1, random_state=42, n_components=42),
    SGDOneClassSVM(
        nu=val_contamination,
        shuffle=True,
        fit_intercept=True,
        random_state=42,
        tol=1e-6,
    ),
)

# Fit each detector on the scaled training inputs and report the wall-clock cost.
models = [iF, lof, kmeans, svm, EE, svm_sgd]
divider = '-'*50
for detector in models:
    print(detector)
    started = time.time()
    detector.fit(train_in)
    print('Execute time : {:.4f}'.format(time.time() - started))
    print(divider)

IsolationForest(contamination=0.001055, max_samples=113842, n_estimators=125,
                random_state=42)
Execute time : 12.3511
--------------------------------------------------
LocalOutlierFactor(contamination=0.001055, novelty=True)
Execute time : 336.2634
--------------------------------------------------
KMeans(n_clusters=2, random_state=42)
Execute time : 3.7474
--------------------------------------------------
OneClassSVM(nu=0.001055)
Execute time : 27.7271
--------------------------------------------------
EllipticEnvelope(contamination=0.001055, random_state=42)
Execute time : 25.3529
--------------------------------------------------
Pipeline(steps=[('nystroem',
                 Nystroem(gamma=0.1, n_components=42, random_state=42)),
                ('sgdoneclasssvm',
                 SGDOneClassSVM(nu=0.001055, random_state=42, tol=1e-06))])
Execute time : 0.9543
--------------------------------------------------


앙상블(vote)

In [54]:
class ensemble():
    """Majority-vote ensemble over already-fitted anomaly detectors.

    Each wrapped model only needs a ``predict(x)`` method. Raw predictions may
    follow any label convention (for example -1/+1 or cluster ids); they are
    normalised to 1 = anomaly, 0 = normal before the vote is taken.
    """

    def __init__(self, models, prior_distribution=None):
        """Store the models taking part in the vote.

        Args:
            models: iterable of fitted estimators exposing ``predict(x)``.
            prior_distribution: accepted for backward compatibility; not used.
        """
        super(ensemble, self).__init__()
        # Own copy, so drop_model() never alters the caller's list.
        self.models = list(models)
        self.vote_weights = np.zeros(len(self.models))

    def get_pred_label(self, model_pred, unique=None):
        """Map raw model output to 1 (anomaly) / 0 (normal).

        The most frequent raw label is treated as normal and every other label
        as an anomaly. When counts tie, the larger raw label is the normal one.
        A model that emits a single label therefore yields all zeros.

        Args:
            model_pred: array-like of raw predictions.
            unique: optional ``{label: count}`` mapping for ``model_pred``;
                computed here when omitted or empty.

        Returns:
            Integer ndarray with the same shape as ``model_pred``.
        """
        model_pred = np.asarray(model_pred)
        if model_pred.size == 0:
            return np.zeros(model_pred.shape, dtype=int)
        if not unique:
            labels, counts = np.unique(model_pred, return_counts=True)
            unique = dict(zip(labels, counts))
        # Stable ascending sort by count: the last entry is the majority label.
        ranked = sorted(unique.items(), key=lambda item: item[1])
        normal_label = ranked[-1][0]
        return np.where(model_pred == normal_label, 0, 1)

    def predict(self, x):
        """Return one normalised 0/1 prediction array per model.

        The list is also stored on ``self.outs``.
        """
        self.outs = []
        for model in self.models:
            raw = np.asarray(model.predict(x))
            labels, counts = np.unique(raw, return_counts=True)
            self.outs.append(self.get_pred_label(raw, dict(zip(labels, counts))))
        return self.outs

    def mode(self, row):
        """Return the most frequent value in ``row`` (smallest value on ties).

        Accepts a pandas Series or any array-like. Raises ValueError when the
        input is empty.
        """
        data = row.to_numpy() if hasattr(row, 'to_numpy') else np.asarray(row)
        values, counts = np.unique(data, return_counts=True)
        # np.unique sorts the values, and argmax picks the first maximum,
        # so a tie resolves to the smallest value.
        return values[np.argmax(counts)]

    def _model_name(self, model):
        """Return a column label for ``model``; pipelines use their final step."""
        steps = getattr(model, 'steps', None)
        if steps:
            model = steps[-1][1]
        return type(model).__name__

    def get_df(self,x):
        """Return a DataFrame with one 0/1 column per model plus a 'vote' column.

        Columns are named after the model class. Repeated classes get a numeric
        suffix (``Name_2``, ``Name_3``, ...) so no model overwrites another.
        'vote' is 1 only when strictly more than half of the models flag the
        row, so a tie resolves to 0.
        """
        columns = {}
        for model, labels in zip(self.models, self.predict(x)):
            base = self._model_name(model)
            name, suffix = base, 2
            while name in columns:
                name = '{}_{}'.format(base, suffix)
                suffix += 1
            columns[name] = labels
        df = pd.DataFrame(columns)
        votes = df.to_numpy()
        # Vectorised strict-majority vote over the 0/1 columns.
        df['vote'] = (2 * votes.sum(axis=1) > votes.shape[1]).astype(int)
        return df

    def drop_model(self,idx):
        """Remove the model at position ``idx`` and return it.

        The matching entry of ``vote_weights`` is removed as well.
        """
        removed = self.models.pop(idx)
        self.vote_weights = np.delete(self.vote_weights, idx)
        return removed

In [55]:
# Wrap the fitted detectors in the voting ensemble.
ens = ensemble(models)
# One prediction column per model plus the 'vote' column, for the validation inputs.
out = ens.get_df(val_in)

AttributeError: 'NoneType' object has no attribute 'split'

모델별 평가

In [None]:
# Score every column of `out` against the validation labels.
for name, labels in out.items():
    print(name)
    get_clf_eval(val_Y, labels.to_numpy())
    print('-'*50)

Submit

In [None]:
# Ensemble predictions for the test inputs; 'vote' holds the final decision.
df = ens.get_df(test_in)

# Fill the sample-submission template with the ensemble decision.
submit = pd.read_csv('data/sample_submission.csv')
# Fail early with a readable message if the template and predictions disagree.
assert len(submit) == len(df), 'submission template and predictions differ in length'
submit['Class'] = df['vote'].to_numpy()
submit.to_csv('./submit.csv', index=False)