In [1]:
#Importing librairies
import pandas as pd
import numpy as np

# Scikit-learn library: For SVM
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn import svm

import itertools

# Matplotlib library to plot the charts
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

# Library for the statistic data vizualisation
import seaborn as sns

%matplotlib inline

In [2]:
# 데이터 불러오기
data = pd.read_csv('creditcard_data.csv')
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284806 entries, 0 to 284805
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284806 non-null  float64
 1   V1      284806 non-null  float64
 2   V2      284806 non-null  float64
 3   V3      284806 non-null  float64
 4   V4      284806 non-null  float64
 5   V5      284806 non-null  float64
 6   V6      284806 non-null  float64
 7   V7      284806 non-null  float64
 8   V8      284806 non-null  float64
 9   V9      284806 non-null  float64
 10  V10     284806 non-null  float64
 11  V11     284806 non-null  float64
 12  V12     284806 non-null  float64
 13  V13     284806 non-null  float64
 14  V14     284806 non-null  float64
 15  V15     284806 non-null  float64
 16  V16     284806 non-null  float64
 17  V17     284806 non-null  float64
 18  V18     284806 non-null  float64
 19  V19     284806 non-null  float64
 20  V20     284806 non-null  float64
 21  V21     28

In [4]:
data.shape

(284806, 31)

# 전처리

## 1) 중복 제거

In [5]:
# 중복된 row 이는지 확인
print(f"Total duplicate rows = {data.duplicated().sum()}")
print(f"Data shape before removing duplicates = {data.shape}")

Total duplicate rows = 1081
Data shape before removing duplicates = (284806, 31)


In [6]:
# 중복된 행 제거
data_cleaned = data.drop_duplicates()

# 중복 제거 후의 데이터셋 크기 확인
print(f"Data shape after removing duplicates = {data_cleaned.shape}")

Data shape after removing duplicates = (283725, 31)


## 데이터 불균형 처리 - SMOTE

In [7]:
!pip install imbalanced-learn




[notice] A new release of pip is available: 23.1.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
data_cleaned['Class'].value_counts()

Class
0    283252
1       473
Name: count, dtype: int64

In [9]:
y = data_cleaned['Class']
X = data_cleaned.drop(['Class'], axis = 1)
print(X.info())
print(y)

<class 'pandas.core.frame.DataFrame'>
Index: 283725 entries, 0 to 284805
Data columns (total 30 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    283725 non-null  float64
 1   V1      283725 non-null  float64
 2   V2      283725 non-null  float64
 3   V3      283725 non-null  float64
 4   V4      283725 non-null  float64
 5   V5      283725 non-null  float64
 6   V6      283725 non-null  float64
 7   V7      283725 non-null  float64
 8   V8      283725 non-null  float64
 9   V9      283725 non-null  float64
 10  V10     283725 non-null  float64
 11  V11     283725 non-null  float64
 12  V12     283725 non-null  float64
 13  V13     283725 non-null  float64
 14  V14     283725 non-null  float64
 15  V15     283725 non-null  float64
 16  V16     283725 non-null  float64
 17  V17     283725 non-null  float64
 18  V18     283725 non-null  float64
 19  V19     283725 non-null  float64
 20  V20     283725 non-null  float64
 21  V21     283725 

In [10]:
from imblearn.over_sampling import SMOTE
# SMOTE 객체 생성
smote = SMOTE(random_state=42)

# 데이터 업샘플링 수행
X_resampled, y_resampled = smote.fit_resample(X, y)

# 업샘플링된 데이터의 클래스 분포 확인
resampled_class_distribution = pd.Series(y_resampled).value_counts()

resampled_class_distribution

Class
0    283252
1    283252
Name: count, dtype: int64

In [11]:
data_cleaned = pd.concat([X_resampled, y_resampled], axis=1)
data_cleaned.shape

(566504, 31)

## 스케일링

In [12]:
from sklearn.preprocessing import MinMaxScaler
#꼬리가 오른쪽으로 긴 변수
pro_skew = ['V3', 'V5', 'V7','V9', 'V10', 'V12', 'V14', 'V15', 'V16', 'V17', 'V18', 'V24','V27', 'V28']
neg_skew = ['V2', 'V11', 'V19', 'V20', 'V21']
normal_col = ['V1', 'V4','V6', 'V8', 'V13', 'V22', 'V23', 'V25', 'V26' ]

# pro_skew 로그변환 적용
# 변수 내의 최소값이 0 이하인 경우, 모든 값을 조금 올려서 로그 변환이 가능하도록 조정
if (data_cleaned[pro_skew].min() <= 0).any():  # .any()를 추가하여 Series 전체에서 하나라도 조건을 만족하는지 확인
    for col in pro_skew:
        if data_cleaned[col].min() <= 0:
            data_cleaned[col] += abs(data_cleaned[col].min()) + 0.1

for col in pro_skew:
    data_cleaned[col + '_log_transformed'] = np.log(data_cleaned[col])


# neg_skew는 음의 왜도.
# 제곱 변환 적용
for col in neg_skew:
  data_cleaned[col + '_squared_transformed'] = data_cleaned[col] ** 2

#normal_col
# 정규화할 변수 리스트
scaler = MinMaxScaler()
for col in normal_col:
    data_cleaned[col + '_minmax_transformed'] = scaler.fit_transform(data_cleaned[col].values.reshape(-1, 1)) # 2차원 배열로 바꿔서 실행


# 적용 다한 column은 drop
data_cleaned.drop(columns = pro_skew + normal_col + neg_skew, inplace = True)

data_cleaned.head()

Unnamed: 0,Time,Amount,Class,V3_log_transformed,V5_log_transformed,V7_log_transformed,V9_log_transformed,V10_log_transformed,V12_log_transformed,V14_log_transformed,...,V21_squared_transformed,V1_minmax_transformed,V4_minmax_transformed,V6_minmax_transformed,V8_minmax_transformed,V13_minmax_transformed,V22_minmax_transformed,V23_minmax_transformed,V25_minmax_transformed,V26_minmax_transformed
0,0.0,149.62,0,3.931079,4.731847,3.781842,2.631734,3.209999,2.899547,2.944605,...,0.000335,0.935192,0.313023,0.267669,0.786444,0.371591,0.522992,0.663793,0.585122,0.394557
1,0.0,2.69,0,3.88346,4.73535,3.774563,2.586157,3.199542,2.988151,2.953375,...,0.050974,0.978542,0.271796,0.262192,0.786298,0.48619,0.480237,0.666938,0.58729,0.446013
2,1.0,378.66,0,3.915991,4.730393,3.794336,2.486523,3.214703,2.936502,2.952218,...,0.061503,0.935217,0.268766,0.281122,0.788042,0.503854,0.54603,0.678939,0.559515,0.402727
3,1.0,123.5,0,3.916385,4.734732,3.781797,2.497086,3.2041,2.942434,2.945828,...,0.011729,0.941878,0.213661,0.275559,0.789434,0.487635,0.510277,0.662607,0.614245,0.389197
4,2.0,69.99,0,3.911509,4.73124,3.789859,2.663876,3.236375,2.96124,2.901128,...,8.9e-05,0.938617,0.269796,0.263984,0.782484,0.552509,0.547271,0.663392,0.566343,0.507497


In [13]:
data_cleaned.shape

(566504, 31)

In [14]:
data_cleaned.drop(columns=['Time'], inplace = True)

data_cleaned.head()

Unnamed: 0,Amount,Class,V3_log_transformed,V5_log_transformed,V7_log_transformed,V9_log_transformed,V10_log_transformed,V12_log_transformed,V14_log_transformed,V15_log_transformed,...,V21_squared_transformed,V1_minmax_transformed,V4_minmax_transformed,V6_minmax_transformed,V8_minmax_transformed,V13_minmax_transformed,V22_minmax_transformed,V23_minmax_transformed,V25_minmax_transformed,V26_minmax_transformed
0,149.62,0,3.931079,4.731847,3.781842,2.631734,3.209999,2.899547,2.944605,1.802884,...,0.000335,0.935192,0.313023,0.267669,0.786444,0.371591,0.522992,0.663793,0.585122,0.394557
1,2.69,0,3.88346,4.73535,3.774563,2.586157,3.199542,2.988151,2.953375,1.655272,...,0.050974,0.978542,0.271796,0.262192,0.786298,0.48619,0.480237,0.666938,0.58729,0.446013
2,378.66,0,3.915991,4.730393,3.794336,2.486523,3.214703,2.936502,2.952218,1.937995,...,0.061503,0.935217,0.268766,0.281122,0.788042,0.503854,0.54603,0.678939,0.559515,0.402727
3,123.5,0,3.916385,4.734732,3.781797,2.497086,3.2041,2.942434,2.945828,1.378143,...,0.011729,0.941878,0.213661,0.275559,0.789434,0.487635,0.510277,0.662607,0.614245,0.389197
4,69.99,0,3.911509,4.73124,3.789859,2.663876,3.236375,2.96124,2.901128,1.563198,...,8.9e-05,0.938617,0.269796,0.263984,0.782484,0.552509,0.547271,0.663392,0.566343,0.507497


# Modeling

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
SEED = 42

# Split the data into features and target variable
X = data_cleaned.drop(columns=[ 'Class'])  # Features
y = data_cleaned['Class']
                   # Target variable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 비선형 SVM 분류기 초기화 (RBF 커널 사용)
svm_model = SVC(kernel='poly', random_state=SEED)

# 모델 훈련
svm_model.fit(X_train, y_train)

# 검증 세트에 대한 예측 수행
y_val_pred = svm_model.predict(X_val)

# 검증 세트에 대한 모델 성능 평가
val_accuracy = accuracy_score(y_val, y_val_pred)
val_classification_report = classification_report(y_val, y_val_pred)

print("Validation Accuracy:", val_accuracy)
print("Validation Classification Report:\n", val_classification_report)



Validation Accuracy: 0.6954395812923099
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.99      0.76     56517
           1       0.97      0.40      0.57     56784

    accuracy                           0.70    113301
   macro avg       0.80      0.70      0.67    113301
weighted avg       0.80      0.70      0.67    113301



In [16]:
print(val_accuracy)
print('---------------------------------------------------------')
print(val_classification_report)

0.6954395812923099
---------------------------------------------------------
              precision    recall  f1-score   support

           0       0.62      0.99      0.76     56517
           1       0.97      0.40      0.57     56784

    accuracy                           0.70    113301
   macro avg       0.80      0.70      0.67    113301
weighted avg       0.80      0.70      0.67    113301

