In [1]:
from google.colab import drive
# drive.mount('/content/drive/MyDrive/IITP/sohyun/creditcard_prediction/data')
drive.mount('/content/drive')

%cd drive/MyDrive/IITP/sohyun/creditcard_prediction/data

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/IITP/sohyun/creditcard_prediction/data


In [None]:
!pip install wandb -qqq
import wandb
wandb.login()

[K     |████████████████████████████████| 1.8 MB 7.4 MB/s 
[K     |████████████████████████████████| 181 kB 56.2 MB/s 
[K     |████████████████████████████████| 146 kB 68.0 MB/s 
[K     |████████████████████████████████| 63 kB 1.9 MB/s 
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

# seed 고정
import random
random.seed(1004)

#-------------------#
#---# Data Load #---#
#-------------------#
train_df = pd.read_csv('./train.csv') # Train
val_df = pd.read_csv('./val.csv') # Validation
test_df = pd.read_csv('./test.csv') # Test

# validation set 사기 거래 비율
val_normal, val_fraud = val_df['Class'].value_counts()
val_contamination = val_fraud / val_normal

train_x = train_df.drop(columns=['ID']) # Input Data # Train dataset은 Label이 존재하지 않음
val_x = val_df.drop(columns=['ID', 'Class']) # Input Data
val_y = val_df['Class'] # Label
test_x = test_df.drop(columns=['ID'])
submit = pd.read_csv('./sample_submission.csv')

### train + validation
x_t = train_df.drop(columns=['ID']) # Input Data
x_v = val_df.drop(columns=['ID']) # Input Data
y_v = val_df['Class'] # Label
x_t['Class'] = 0
tv = pd.concat([x_t, x_v]) # train + validation dataset (train label은 0으로 우선 넣음)
tv = tv.reset_index(drop = True)
y_tv = tv['Class']
x_tv = tv.drop(columns=['Class'])

#-------------------#
#---# Normalize #---#
#-------------------#
# case 1 - standardscaler
scaler_n = StandardScaler()
scaler_n.fit(train_x)

train_x_scaleN = pd.DataFrame(scaler_n.transform(train_x), columns = train_x.columns) # 확인 : train_x_scaleN.mean(), train_x_scaleN.var()
val_x_scaleN = pd.DataFrame(scaler_n.transform(val_x), columns = val_x.columns)
test_x_scaleN = pd.DataFrame(scaler_n.transform(test_x), columns = test_x.columns)

# case 2 - minmax scaler
scaler_m = MinMaxScaler()
scaler_m.fit(train_x)

train_x_scaleM = pd.DataFrame(scaler_m.transform(train_x), columns = train_x.columns)
val_x_scaleM = pd.DataFrame(scaler_m.transform(val_x), columns = val_x.columns)
test_x_scaleM = pd.DataFrame(scaler_m.transform(test_x), columns = test_x.columns)

# train + validation - case 1
scaler_n_all = StandardScaler()
scaler_n_all.fit(x_tv)
x_scaleN = pd.DataFrame(scaler_n_all.transform(x_tv), columns = x_tv.columns) # scaler_n_all.transform(x_tv) : 결과 ndarray

# train + validation - case 2
scaler_m_all = MinMaxScaler()
scaler_m_all.fit(x_tv)
x_scaleM = pd.DataFrame(scaler_m_all.transform(x_tv), columns = x_tv.columns)


#-----------------------------#
#---# Dimension Reduction #---#
#-----------------------------#
n_pca = 5
# pca = PCA() # n_componenets : 투영할 차원의 수
# pca.fit(train_x_scaleN)
# train_x_pca_scaleN = pd.DataFrame(pca.transform(train_x_scaleN), columns = train_x.columns)
# val_x_pca_scaleN = pd.DataFrame(pca.transform(val_x_scaleN), columns = val_x.columns)
# test_x_pca_scaleN = pd.DataFrame(pca.transform(test_x_scaleN), columns = test_x.columns)

# pca2 = PCA()
# pca2.fit(train_x_scaleM)
# train_x_pca_scaleM = pd.DataFrame(pca2.transform(train_x_scaleM), columns = train_x.columns)
# val_x_pca_scaleM = pd.DataFrame(pca2.transform(val_x_scaleM), columns = val_x.columns)
# test_x_pca_scaleM = pd.DataFrame(pca2.transform(test_x_scaleM), columns = test_x.columns)

# train + validation
pca3 = PCA(n_pca)
pca3.fit(x_scaleN)
x_pca_scaleN = pd.DataFrame(pca3.transform(x_scaleN))
test_x_pca_scaleN_all = pd.DataFrame(pca3.transform(test_x_scaleN)) ###

pca4 = PCA(n_pca)
pca4.fit(x_scaleM)
x_pca_scaleM = pd.DataFrame(pca4.transform(x_scaleM))
test_x_pca_scaleM_all = pd.DataFrame(pca4.transform(test_x_scaleM))

from sklearn.mixture import GaussianMixture

N_mixture = 8
gmm = GaussianMixture(n_components=N_mixture, random_state = 1004, covariance_type = 'full').fit(x_pca_scaleN)
gmm_labels = gmm.predict(x_pca_scaleN)

print(f"0의 개수 : {list(gmm_labels).count(0)}, 1의 개수 : {list(gmm_labels).count(1)}")

# means : 평균 / cov : 공분산 / std : 표준편차
means = gmm.means_
cov = gmm.covariances_
# std = [np.sqrt(np.trace(cov[i]/N_mixture)) for i in range(0, N_mixture)]
std = []
for i in range(len(cov)) :
  each_g = []
  for j in range(n_pca) :
    each_g.append(np.sqrt(cov[i][j, j]))
  std.append(each_g)

anomal_idx = list(y_tv == 1)
nomal_idx_ = list(y_tv == 0)

anomal_x = x_pca_scaleN.loc[anomal_idx, :]
anomal_x = anomal_x.reset_index(drop=True)
# anomal_x = anomal_x.drop(columns=['index'])

probability = gmm.predict_proba(anomal_x) # Evaluate the components’ density for each sample.
print(pd.DataFrame(probability).idxmax(axis = 1).value_counts())
print(pd.DataFrame(probability).idxmax(axis = 1).value_counts().idxmax()) 
outlier_dist = list(pd.DataFrame(probability).idxmax(axis = 1).value_counts().nlargest(1).index) # 가장 많은 이상값을 가진 분포
print(f"outlier_dist:{outlier_dist}")
## test
# p_test = gmm.predict_proba(test_x_pca_scaleN_all)
# p_test_df = pd.DataFrame(p_test).idxmax(axis = 1).to_frame()
# out_dist = list(pd.DataFrame(probability).idxmax(axis = 1).value_counts().nlargest(3).index)

p_train = gmm.predict_proba(x_pca_scaleN)
p_train_df = pd.DataFrame(p_train).idxmax(axis = 1).to_frame()

only_normal = p_train_df[~p_train_df.loc[:,0].isin(outlier_dist)] 
# p_df['Class'] = np.where(p_df[[0]] == 6, 1, 0)

0의 개수 : 36951, 1의 개수 : 31175
2    15
4     7
7     3
3     3
0     1
1     1
dtype: int64
2
outlier_dist:[2]


In [3]:
# p_df.loc[:,0].isin(out_dist)
only_normal.loc[:,0] = 0 
only_normal_idx = only_normal.index.tolist()
train_df = x_tv.loc[only_normal_idx,:]

In [4]:
import random
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

EPOCHS = 500
LR = 1e-2
BS = 15000 # 16384
SEED = 1004

# wandb.init(project="") # wandb init

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED) # Seed 고정

# train_df = pd.read_csv('./train.csv')
# train_df = train_df.drop(columns=['ID'])
val_df = pd.read_csv('./val.csv')
val_df = val_df.drop(columns=['ID'])

In [29]:
class MyDataset(Dataset):
    def __init__(self, df, eval_mode):
        self.df = df
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = self.df['Class'].values
            self.df = self.df.drop(columns=['Class']).values
        else:
            self.df = self.df.values
        
    def __getitem__(self, index):
        if self.eval_mode:
            self.x = self.df[index]
            self.y = self.labels[index]
            return torch.Tensor(self.x), self.y
        else:
            self.x = self.df[index]
            return torch.Tensor(self.x)
        
    def __len__(self):
        return len(self.df)

train_dataset = MyDataset(df=train_df, eval_mode=False)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True)

val_dataset = MyDataset(df = val_df, eval_mode=True)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False)


class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.Encoder = nn.Sequential(
            nn.Linear(30,64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(64,128),
            nn.BatchNorm1d(128),
            nn.LeakyReLU(),
            # nn.Linear(30,20),
            # nn.BatchNorm1d(20),
            # nn.ReLU(),

            # nn.Linear(20,10),
            # nn.BatchNorm1d(10),
            # nn.ReLU(),

            # nn.Linear(10,5),
            # nn.BatchNorm1d(5),
            # nn.ReLU()

        #     nn.Linear(64,128),
        #     nn.BatchNorm1d(128),
        #     nn.LeakyReLU(),

        )
        self.Decoder = nn.Sequential(
            nn.Linear(128,64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(64,30),
            # nn.Linear(5,10),
            # nn.BatchNorm1d(10),
            # nn.ReLU(),
            # nn.Linear(10,20),
            # nn.BatchNorm1d(20),
            # nn.ReLU(),
            # nn.Linear(20,30)

            # nn.Linear(128,64),
            # nn.BatchNorm1d(64),
            # nn.LeakyReLU(),
            # nn.Linear(64,30),
        )
        
    def forward(self, x, device):
        x_enc = self.Encoder(x)
        x_dec = self.Decoder(x_enc)
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        diff = cos(x, x_dec).cpu().tolist()
        diff = torch.tensor(diff, device = device) # .reshape(-1, 1)
        print(x.shape)
        print(diff.shape)
        concat_vector = torch.stack([x.reshape(-1,1).squeeze(), diff], dim = 0)
        return x, concat_vector

class Trainer():
    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function
        self.criterion = nn.L1Loss().to(self.device)
        # self.criterion = nn.MSELoss().to(self.device)
    
    def fit(self):
        self.model.to(self.device)
        best_score = 0
        for epoch in range(EPOCHS):
            self.model.train()
            train_loss = []
            for x in iter(self.train_loader):
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x, self.device)
                loss = self.criterion(x, _x)

                loss.backward()
                self.optimizer.step()
                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)

            # wandb.log({"f1-score":score, "train_loss":loss})

            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                torch.save(model.module.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
    
    def validation(self, eval_model, thr):
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in iter(self.val_loader):
                x = x.float().to(self.device)

                _x = self.model(x, self.device)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff)<thr,1,0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

model = nn.DataParallel(AutoEncoder())
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()

model = AutoEncoder()
model.load_state_dict(torch.load('./best_model.pth'))
model = nn.DataParallel(model)
model.eval()

test_df = pd.read_csv('./test.csv')
test_df = test_df.drop(columns=['ID'])

test_dataset = MyDataset(test_df, False)
test_loader = DataLoader(test_dataset, batch_size=BS, shuffle=False, num_workers=6)

def prediction(model, thr, test_loader, device):
    model.to(device)
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    pred = []
    with torch.no_grad():
        for x in iter(test_loader):
            x = x.float().to(device)
            
            _x = model(x)
            
            diff = cos(x, _x).cpu().tolist()
            batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
            pred += batch_pred
    return pred

preds = prediction(model, 0.95, test_loader, device)

torch.Size([15000, 30])
torch.Size([15000])


RuntimeError: ignored

In [None]:
submit = pd.read_csv('./sample_submission22.csv')
submit['Class'] = preds
submit.to_csv('./submit_autoencoder.csv', index=False)