In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

# Data Load

In [2]:
train_df = pd.read_csv('drive/MyDrive/IITP/sohyun/creditcard_prediction/data/train.csv') # Train
val_df = pd.read_csv('drive/MyDrive/IITP/sohyun/creditcard_prediction/data/val.csv') # Validation
test_df = pd.read_csv('drive/MyDrive/IITP/sohyun/creditcard_prediction/data/test.csv') # Test

In [None]:
# 데이터 분포 확인
# train_df.drop(columns=['ID']).hist(bins = 50, figsize = (20,20))
# val_df.drop(columns=['ID', 'Class']).hist(bins = 50, figsize = (20,20))
# plt.show()

In [3]:
# validation set 사기 거래 비율
val_normal, val_fraud = val_df['Class'].value_counts()
val_contamination = val_fraud / val_normal
print(f'Validation contamination : [{val_contamination}]')

Validation contamination : [0.0010551491277433877]


In [4]:
# Train dataset은 Label이 존재하지 않음
train_x = train_df.drop(columns=['ID']) # Input Data
val_x = val_df.drop(columns=['ID', 'Class']) # Input Data
val_y = val_df['Class'] # Label
test_x = test_df.drop(columns=['ID'])
submit = pd.read_csv('drive/MyDrive/IITP/sohyun/creditcard_prediction/data/sample_submission.csv')

Normalize

In [5]:
# case 1 - standardscaler
from sklearn.preprocessing import StandardScaler
scaler_n = StandardScaler()
scaler_n.fit(train_x)
scaled = scaler_n.transform(train_x) # 결과 ndarray
train_x_scaleN = pd.DataFrame(scaled, columns = train_x.columns) # 확인 : train_x_scaleN.mean(), train_x_scaleN.var()

scaled = scaler_n.transform(val_x)
val_x_scaleN = pd.DataFrame(scaled, columns = val_x.columns)

scaled = scaler_n.transform(test_x)
test_x_scaleN = pd.DataFrame(scaled, columns = test_x.columns)

In [6]:
# case 2 - minmax scaler
from sklearn.preprocessing import MinMaxScaler
scaler_m = MinMaxScaler()
scaler_m.fit(train_x)

train_x_scaleM = pd.DataFrame(scaler_m.transform(train_x), columns = train_x.columns)
val_x_scaleM = pd.DataFrame(scaler_m.transform(val_x), columns = val_x.columns)
test_x_scaleM = pd.DataFrame(scaler_m.transform(test_x), columns = test_x.columns)

In [7]:
### 합친 버전 ### train + validation

x_t = train_df.drop(columns=['ID']) # Input Data
x_v = val_df.drop(columns=['ID']) # Input Data
y_v = val_df['Class'] # Label
x_t['Class'] = 0
tv = pd.concat([x_t, x_v]) # train + validation dataset (train label은 0으로 우선 넣음)
y_tv = tv['Class']
x_tv = tv.drop(columns=['Class'])

# case 1 - standardscaler
from sklearn.preprocessing import StandardScaler
scaler_n = StandardScaler()
scaler_n.fit(x_tv)
scaled = scaler_n.transform(x_tv) # 결과 ndarray
x_scaleN = pd.DataFrame(scaled, columns = x_tv.columns) 

# case 2 - minmax scaler
from sklearn.preprocessing import MinMaxScaler
scaler_m = MinMaxScaler()
scaler_m.fit(x_tv)

x_scaleM = pd.DataFrame(scaler_m.transform(x_tv), columns = x_tv.columns)

from sklearn.decomposition import PCA
pca = PCA(3) # n_componenets : 투영할 차원의 수
pca.fit(x_scaleN)
x_pca_scaleN = pd.DataFrame(pca.transform(x_scaleN))

pca2 = PCA(3) # n_componenets : 투영할 차원의 수
pca2.fit(x_scaleM)
x_pca_scaleM = pd.DataFrame(pca.transform(x_scaleM))

차원 축소

In [8]:
from sklearn.decomposition import PCA
pca = PCA() # n_componenets : 투영할 차원의 수
pca.fit(train_x_scaleN)
train_x_pca_scaleN = pd.DataFrame(pca.transform(train_x_scaleN), columns = train_x.columns)
val_x_pca_scaleN = pd.DataFrame(pca.transform(val_x_scaleN), columns = val_x.columns)
test_x_pca_scaleN = pd.DataFrame(pca.transform(test_x_scaleN), columns = test_x.columns)

pca2 = PCA()
pca2.fit(train_x_scaleM)
train_x_pca_scaleM = pd.DataFrame(pca2.transform(train_x_scaleM), columns = train_x.columns)
val_x_pca_scaleM = pd.DataFrame(pca2.transform(val_x_scaleM), columns = val_x.columns)
test_x_pca_scaleM = pd.DataFrame(pca2.transform(test_x_scaleM), columns = test_x.columns)

# from sklearn.manifold import TSNE
# tsne = TSNE(n_components = 2).fit_transform(train_x_scaleN)
# train_x_tsne_scaleN = pd.

In [None]:
# scaler 저장
# train_x_scaleM.to_csv("train_minmax.tsv", index = False, sep='\t')
# scaler 저장
# train_x_scaleN.to_csv("train_normalization.csv", index = False)

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
x_embedded = TSNE(n_components=2, perplexity=30, learning_rate=10, n_iter=1000).fit_transform(val_x_scaleN)
plt.figure(figsize = (12, 8))
scatter = plt.scatter(x_embedded[:,0], x_embedded[:,1], s=10, c=val_y, cmap='rainbow')
plt.savefig(f"./normalization_standardscaler")

x_embedded = TSNE(n_components=2, perplexity=30, learning_rate=10, n_iter=1000).fit_transform(val_x_scaleM)
plt.figure(figsize = (12, 8))
scatter = plt.scatter(x_embedded[:,0], x_embedded[:,1], s=10, c=val_y, cmap='rainbow')
plt.savefig(f"./normalization_minmaxscaler")

x_embedded = TSNE(n_components=2, perplexity=30, learning_rate=10, n_iter=1000).fit_transform(val_x_pca_scaleN)
plt.figure(figsize = (12, 8))
scatter = plt.scatter(x_embedded[:,0], x_embedded[:,1], s=10, c=val_y, cmap='rainbow')
plt.savefig(f"./normalization_standardscaler_pca")

x_embedded = TSNE(n_components=2, perplexity=30, learning_rate=10, n_iter=1000).fit_transform(val_x_pca_scaleM)
plt.figure(figsize = (12, 8))
scatter = plt.scatter(x_embedded[:,0], x_embedded[:,1], s=10, c=val_y, cmap='rainbow')
plt.savefig(f"./normalization_minmaxscaler_pca")

In [None]:
# 3차원으로 tSNE 그려보려던 시도
# from sklearn.manifold import TSNE
# import matplotlib.pyplot as plt
# x_embedded = TSNE(n_components=3, perplexity=30, learning_rate=10, n_iter=1000).fit_transform(val_x_scaleN)

# Model

In [None]:
def print_score(val_y, val_pred) :
    val_score = f1_score(val_y, val_pred, average='macro')
    print(f'Validation F1 Score : [{val_score}]')
    print(classification_report(val_y, val_pred))

In [None]:
from sklearn.cluster import KMeans
def training_km (model, train_x, val_x, test_x, *args) : ## K-Means model
  model.fit(train_x)

  val_pred = model.fit_predict(val_x)
  print_score(val_y, val_pred)

  test_pred = model.fit_predict(test_x) # model prediction
  submit['Class'] = test_pred
  submit.to_csv(f'./{args[0]}.csv', index=False)
      
  print(model.labels_)
  print(f"0의 개수 : {list(model.labels_).count(0)}, 1의 개수 : {list(model.labels_).count(1)}")

In [None]:
k_means = KMeans(n_clusters = 2)
training_km(k_means, train_x_scaleN, val_x_scaleN, test_x_scaleN,'submit_KM1')

k_means2 = KMeans(n_clusters = 2)
training_km(k_means2, train_x_scaleM, val_x_scaleM, test_x_scaleM,'submit_KM2')

k_means3 = KMeans(n_clusters = 2)
training_km(k_means3, train_x_pca_scaleN, val_x_pca_scaleN, test_x_pca_scaleN,'submit_KM1_pca')

k_means4 = KMeans(n_clusters = 2)
training_km(k_means4, train_x_pca_scaleM, val_x_pca_scaleM, test_x_pca_scaleM,'submit_KM2_pca')

# k_means4 = KMeans(n_clusters = 2)
# training_km(k_means4, train_x_tsne_scaleM, val_x_tsne_scaleM, test_x_tsne_scaleM,'submit_KM2_tsne')

# k_means4 = KMeans(n_clusters = 2)
# training_km(k_means4, train_x_tsne_scaleM, val_x_tsne_scaleM, test_x_tsne_scaleM,'submit_KM2_tsne')

Validation F1 Score : [0.31434006938161824]
              precision    recall  f1-score   support

           0       1.00      0.46      0.63     28432
           1       0.00      0.50      0.00        30

    accuracy                           0.46     28462
   macro avg       0.50      0.48      0.31     28462
weighted avg       1.00      0.46      0.63     28462

[1 1 1 ... 0 0 0]
0의 개수 : 65258, 1의 개수 : 77245
Validation F1 Score : [0.3490369576387909]
              precision    recall  f1-score   support

           0       1.00      0.53      0.70     28432
           1       0.00      0.33      0.00        30

    accuracy                           0.53     28462
   macro avg       0.50      0.43      0.35     28462
weighted avg       1.00      0.53      0.70     28462

[1 1 1 ... 0 0 0]
0의 개수 : 52665, 1의 개수 : 61177
Validation F1 Score : [0.3529836935032736]
              precision    recall  f1-score   support

           0       1.00      0.54      0.70     28432
           1 

K-Means

DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
def training_ds (model, train_x, val_x, test_x, *args) : ## K-Means model
  model.fit(train_x)

  val_pred = model.fit_predict(val_x)
  val_pred = [0if i != -1 else 1 for i in val_pred]
  print_score(val_y, val_pred)

  test_pred = model.fit_predict(test_x) # model prediction
  test_pred = [0if i != -1 else 1 for i in test_pred]
  submit['Class'] = test_pred
  submit.to_csv(f'./{args[0]}.csv', index=False)
      
  print(model.labels_)
  print(f"0의 개수 : {list(model.labels_).count(0)}, 1의 개수 : {list(model.labels_).count(1)}")


  submit.to_csv('./submit_DBSCAN1.csv', index=False)

In [None]:
clusters = DBSCAN(eps = 1.5, min_samples = 2)
training_ds(clusters, train_x_scaleN, val_x_scaleN, test_x_scaleN,'submit_KM1')

clusters2 = DBSCAN(eps = 1.5, min_samples = 2)
training_ds(clusters2, train_x_scaleM, val_x_scaleM, test_x_scaleM,'submit_KM2')

clusters3 = DBSCAN(eps = 1.5, min_samples = 2)
training_km(clusters, train_x_pca_scaleN, val_x_pca_scaleN, test_x_pca_scaleN,'submit_KM1_pca')

clusters4 = DBSCAN(eps = 1.5, min_samples = 2)
training_km(clusters4, train_x_pca_scaleM, val_x_pca_scaleM, test_x_pca_scaleM,'submit_KM2_pca')

In [None]:
#---# standard scaler #---#
from sklearn.cluster import DBSCAN
clusters = DBSCAN(eps = 1.5, min_samples = 2).fit(train_x_scaleN)
print(clusters.labels_)
print(list(clusters.labels_).count(-1))

val_pred = clusters.fit_predict(val_x) # model prediction
val_pred = [0if i != -1 else 1 for i in val_pred]

print_score(val_y, val_pred)

test_pred = clusters.fit_predict(test_x) # model prediction
test_pred = [0if i != -1 else 1 for i in test_pred]
submit['Class'] = test_pred
submit.to_csv('./submit_DBSCAN1.csv', index=False)
################################################################################
print("=======================================================================")
#---# minmax scaler #---#
# clusters2 = DBSCAN(eps = 1.5, min_samples = 2).fit(train_x_scaleM)
# print(clusters2.labels_)
# print(list(clusters2.labels_).count(-1))

# val_pred = clusters2.fit_predict(val_x) # model prediction
# val_pred = [0if i != -1 else 1 for i in val_pred]

# print_score(val_y, val_pred)
# test_pred = clusters2.fit_predict(test_x) # model prediction
# test_pred = [0if i != -1 else 1 for i in test_pred]
# submit['Class'] = test_pred
# submit.to_csv('./submit_DBSCAN2.csv', index=False)



GMM

In [None]:
#---# standard scaler #---#
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components = 2, random_state = 1004).fit(train_x_scaleN)
gmm_labels = gmm.predict(train_x_scaleN) # label 이용법

print(f"0의 개수 : {list(gmm_labels).count(0)}, 1의 개수 : {list(gmm_labels).count(1)}")

val_pred = gmm.predict(val_x_scaleN) # model prediction
print_score(val_y, val_pred)

test_pred = gmm.predict(test_x_scaleN) # model prediction
submit['Class'] = test_pred
submit.to_csv('./submit_GMM1.csv', index=False)
################################################################################
print("=======================================================================")
#---# minmax scaler #---#
gmm2 = GaussianMixture(n_components = 2, random_state = 1004).fit(train_x_scaleM)
gmm_labels = gmm2.predict(train_x_scaleM) # label 이용법
print(f"0의 개수 : {list(gmm_labels).count(0)}, 1의 개수 : {list(gmm_labels).count(1)}")

val_pred = gmm2.predict(val_x_scaleM) # model prediction
print_score(val_y, val_pred)

test_pred = gmm2.predict(test_x_scaleM) # model prediction
submit['Class'] = test_pred
submit.to_csv('./submit_GMM2.csv', index=False)
################################################################################
print("=======================================================================")

gmm3 = GaussianMixture(n_components = 2, random_state = 1004).fit(train_x_scaleN)
gmm_labels = gmm3.predict(train_x_pca_scaleN) # label 이용법

print(f"0의 개수 : {list(gmm_labels).count(0)}, 1의 개수 : {list(gmm_labels).count(1)}")

val_pred = gmm3.predict(val_x_pca_scaleN) # model prediction
print_score(val_y, val_pred)

test_pred = gmm3.predict(test_x_pca_scaleN) # model prediction
submit['Class'] = test_pred
submit.to_csv('./submit_GMM1_pca.csv', index=False)

################################################################################
print("=======================================================================")
gmm4 = GaussianMixture(n_components = 2, random_state = 1004).fit(train_x_scaleM)
gmm_labels = gmm4.predict(train_x_pca_scaleM) # label 이용법
print(f"0의 개수 : {list(gmm_labels).count(0)}, 1의 개수 : {list(gmm_labels).count(1)}")

val_pred = gmm4.predict(val_x_pca_scaleM) # model prediction
print_score(val_y, val_pred)

test_pred = gmm4.predict(test_x_pca_scaleM) # model prediction
submit['Class'] = test_pred
submit.to_csv('./submit_GMM2_pca.csv', index=False)

0의 개수 : 59454, 1의 개수 : 54388
Validation F1 Score : [0.3436073870021054]
              precision    recall  f1-score   support

           0       1.00      0.52      0.69     28432
           1       0.00      0.50      0.00        30

    accuracy                           0.52     28462
   macro avg       0.50      0.51      0.34     28462
weighted avg       1.00      0.52      0.68     28462

0의 개수 : 20761, 1의 개수 : 93081
Validation F1 Score : [0.1526835794354364]
              precision    recall  f1-score   support

           0       0.99      0.18      0.31     28432
           1       0.00      0.07      0.00        30

    accuracy                           0.18     28462
   macro avg       0.50      0.12      0.15     28462
weighted avg       0.99      0.18      0.30     28462

0의 개수 : 59983, 1의 개수 : 53859
Validation F1 Score : [0.34677865275890574]
              precision    recall  f1-score   support

           0       1.00      0.53      0.69     28432
           1       0

IsolationForest

In [None]:
# 가설 설정 : Train dataset도 Validation dataset과 동일한 비율로 사기거래가 발생 했을 것이다. -> model parameter : contamination=val_contamination(=0.001055) 적용
model = IsolationForest(n_estimators=125, max_samples=len(train_x_scaleN), contamination=val_contamination, random_state=42, verbose=0)
model.fit(train_x_scaleN)

model2 = IsolationForest(n_estimators=125, max_samples=len(train_x_scaleM), contamination=val_contamination, random_state=42, verbose=0)
model2.fit(train_x_scaleM)

model3 = IsolationForest(n_estimators=125, max_samples=len(train_x_scaleM), contamination=val_contamination, random_state=42, verbose=0)
model3.fit(train_x_scaleN)

model4 = IsolationForest(n_estimators=125, max_samples=len(train_x_scaleM), contamination=val_contamination, random_state=42, verbose=0)
model4.fit(train_x_scaleM)

IsolationForest(contamination=0.0010551491277433877, max_samples=113842,
                n_estimators=125, random_state=42)

In [None]:
def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량(사기)) 이므로 (0:정상, 1:불량(사기))로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

#---# standard scaler #---#
val_pred = model.predict(val_x_scaleN) # model prediction
val_pred = get_pred_label(val_pred)
print_score(val_y, val_pred)

test_pred = model.predict(test_x_scaleN) # model prediction
test_pred = get_pred_label(test_pred)
submit['Class'] = test_pred
submit.to_csv('./submit_IF1.csv', index=False)

################################################################################
print("=======================================================================")
#---# minmax scaler #---#
val_pred = model2.predict(val_x_scaleM) # model prediction
val_pred = get_pred_label(val_pred)
print_score(val_y, val_pred)

test_pred = model2.predict(test_x_scaleM) # model prediction
test_pred = get_pred_label(test_pred)
submit['Class'] = test_pred
submit.to_csv('./submit_IF2.csv', index=False)

################################################################################
print("=======================================================================")
#---# standard scaler #---#
val_pred = model3.predict(val_x_pca_scaleN) # model prediction
val_pred = get_pred_label(val_pred)
print_score(val_y, val_pred)

test_pred = model3.predict(test_x_pca_scaleN) # model prediction
test_pred = get_pred_label(test_pred)
submit['Class'] = test_pred
submit.to_csv('./submit_IF1_pca.csv', index=False)

################################################################################
print("=======================================================================")
val_pred = model4.predict(val_x_pca_scaleM) # model prediction
val_pred = get_pred_label(val_pred)
print_score(val_y, val_pred)

test_pred = model4.predict(test_x_pca_scaleM) # model prediction
test_pred = get_pred_label(test_pred)
submit['Class'] = test_pred
submit.to_csv('./submit_IF2_pca.csv', index=False)

Validation F1 Score : [0.7030820840915222]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.41      0.40      0.41        30

    accuracy                           1.00     28462
   macro avg       0.71      0.70      0.70     28462
weighted avg       1.00      1.00      1.00     28462

Validation F1 Score : [0.7030820840915222]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.41      0.40      0.41        30

    accuracy                           1.00     28462
   macro avg       0.71      0.70      0.70     28462
weighted avg       1.00      1.00      1.00     28462

Validation F1 Score : [0.5302328161295079]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.06      0.07      0.06        30

    accuracy                           1.00     28462

# 7/12

In [45]:
from sklearn.svm import OneClassSVM
anomal_idx = list(y_tv == 1)
anomal_idx = (np.where(anomal_idx)[0]).tolist()
nomal_idx = list(set(range(len(y_tv))) - set(anomal_idx))
svm = OneClassSVM(kernel = 'rbf', nu = val_contamination, gamma = 0.00001)
svm.fit(x_scaleN.loc[nomal_idx])

OneClassSVM(gamma=1e-05, nu=0.0010551491277433877)

In [48]:
pred = svm.predict(test_x_scaleN)
pred = pred.tolist()
pred = i if i == 1 for 

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,


In [47]:
sum(svm.predict(test_x_scaleN))

142137