In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

#-------------------#
#---# Data Load #---#
#-------------------#
train_df = pd.read_csv('drive/MyDrive/IITP/sohyun/creditcard_prediction/data/train.csv') # Train
val_df = pd.read_csv('drive/MyDrive/IITP/sohyun/creditcard_prediction/data/val.csv') # Validation
test_df = pd.read_csv('drive/MyDrive/IITP/sohyun/creditcard_prediction/data/test.csv') # Test

# validation set 사기 거래 비율
val_normal, val_fraud = val_df['Class'].value_counts()
val_contamination = val_fraud / val_normal

train_x = train_df.drop(columns=['ID']) # Input Data # Train dataset은 Label이 존재하지 않음
val_x = val_df.drop(columns=['ID', 'Class']) # Input Data
val_y = val_df['Class'] # Label
test_x = test_df.drop(columns=['ID'])
submit = pd.read_csv('drive/MyDrive/IITP/sohyun/creditcard_prediction/data/sample_submission.csv')

### train + validation
x_t = train_df.drop(columns=['ID']) # Input Data
x_v = val_df.drop(columns=['ID']) # Input Data
y_v = val_df['Class'] # Label
x_t['Class'] = 0
tv = pd.concat([x_t, x_v]) # train + validation dataset (train label은 0으로 우선 넣음)
y_tv = tv['Class']
x_tv = tv.drop(columns=['Class'])

#-------------------#
#---# Normalize #---#
#-------------------#
# case 1 - standardscaler
scaler_n = StandardScaler()
scaler_n.fit(train_x)

train_x_scaleN = pd.DataFrame(scaler_n.transform(train_x), columns = train_x.columns) # 확인 : train_x_scaleN.mean(), train_x_scaleN.var()
val_x_scaleN = pd.DataFrame(scaler_n.transform(val_x), columns = val_x.columns)
test_x_scaleN = pd.DataFrame(scaler_n.transform(test_x), columns = test_x.columns)

# case 2 - minmax scaler
scaler_m = MinMaxScaler()
scaler_m.fit(train_x)

train_x_scaleM = pd.DataFrame(scaler_m.transform(train_x), columns = train_x.columns)
val_x_scaleM = pd.DataFrame(scaler_m.transform(val_x), columns = val_x.columns)
test_x_scaleM = pd.DataFrame(scaler_m.transform(test_x), columns = test_x.columns)

# train + validation - case 1
scaler_n_all = StandardScaler()
scaler_n_all.fit(x_tv)
x_scaleN = pd.DataFrame(scaler_n_all.transform(x_tv), columns = x_tv.columns) # scaler_n_all.transform(x_tv) : 결과 ndarray

# train + validation - case 2
scaler_m_all = MinMaxScaler()
scaler_m_all.fit(x_tv)
x_scaleM = pd.DataFrame(scaler_m_all.transform(x_tv), columns = x_tv.columns)


#-----------------------------#
#---# Dimension Reduction #---#
#-----------------------------#
n_pca = 5
# pca = PCA() # n_componenets : 투영할 차원의 수
# pca.fit(train_x_scaleN)
# train_x_pca_scaleN = pd.DataFrame(pca.transform(train_x_scaleN), columns = train_x.columns)
# val_x_pca_scaleN = pd.DataFrame(pca.transform(val_x_scaleN), columns = val_x.columns)
# test_x_pca_scaleN = pd.DataFrame(pca.transform(test_x_scaleN), columns = test_x.columns)

# pca2 = PCA()
# pca2.fit(train_x_scaleM)
# train_x_pca_scaleM = pd.DataFrame(pca2.transform(train_x_scaleM), columns = train_x.columns)
# val_x_pca_scaleM = pd.DataFrame(pca2.transform(val_x_scaleM), columns = val_x.columns)
# test_x_pca_scaleM = pd.DataFrame(pca2.transform(test_x_scaleM), columns = test_x.columns)

# train + validation
pca3 = PCA(n_pca)
pca3.fit(x_scaleN)
x_pca_scaleN = pd.DataFrame(pca3.transform(x_scaleN))
test_x_pca_scaleN_all = pd.DataFrame(pca3.transform(test_x_scaleN)) ###

pca4 = PCA(n_pca)
pca4.fit(x_scaleM)
x_pca_scaleM = pd.DataFrame(pca4.transform(x_scaleM))
test_x_pca_scaleM_all = pd.DataFrame(pca4.transform(test_x_scaleM))

In [None]:
from sklearn.mixture import GaussianMixture

N_mixture = 8
gmm = GaussianMixture(n_components=N_mixture, random_state = 1004, covariance_type = 'full').fit(x_pca_scaleN)
gmm_labels = gmm.predict(x_pca_scaleN)

print(f"0의 개수 : {list(gmm_labels).count(0)}, 1의 개수 : {list(gmm_labels).count(1)}")

# means : 평균 / cov : 공분산 / std : 표준편차
means = gmm.means_
cov = gmm.covariances_
# std = [np.sqrt(np.trace(cov[i]/N_mixture)) for i in range(0, N_mixture)]
std = []
for i in range(len(cov)) :
  each_g = []
  for j in range(n_pca) :
    each_g.append(np.sqrt(cov[i][j, j]))
  std.append(each_g)

anomal_idx = list(y_tv == 1)
anomal_x = x_pca_scaleN.loc[anomal_idx, :]
anomal_x = anomal_x.reset_index(drop=True)
# anomal_x = anomal_x.drop(columns=['index'])

probability = gmm.predict_proba(anomal_x) # Evaluate the components’ density for each sample.
print(pd.DataFrame(probability).idxmax(axis = 1).value_counts())
print(pd.DataFrame(probability).idxmax(axis = 1).value_counts().idxmax()) 
outlier_dist = pd.DataFrame(probability).idxmax(axis = 1).value_counts().idxmax() # 가장 많은 이상값을 가진 분포

0의 개수 : 6326, 1의 개수 : 31052
6    14
3     6
2     4
5     3
7     2
0     1
dtype: int64
6


In [None]:
p = gmm.predict_proba(test_x_pca_scaleN_all)
p_df = pd.DataFrame(p).idxmax(axis = 1).to_frame()
# print(pd.DataFrame(p).idxmax(axis = 1).value_counts())

In [None]:
p_df['Class'] = np.where(p_df[[0]] == 6, 1, 0)

In [None]:
p_df

Unnamed: 0,0,Class
0,3,0
1,1,0
2,7,0
3,1,0
4,1,0
...,...,...
142498,2,0
142499,6,1
142500,4,0
142501,2,0


In [None]:
submit = pd.read_csv('drive/MyDrive/IITP/sohyun/creditcard_prediction/data/sample_submission.csv')
submit['Class'] = p_df['Class']
submit.to_csv('./submit.csv', index=False)