In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from imblearn.over_sampling import SMOTE
import pandas as pd

df_credit = pd.read_csv("../../data/creditcard.csv")
df_credit.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [2]:
X = df_credit.drop("Class", axis=1)
y = df_credit["Class"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

smote = SMOTE(random_state=11)
X_train_upsampled, y_train_upsampled = smote.fit_resample(X_train, y_train)


In [3]:
y_train_upsampled.value_counts()


Class
0    227461
1    227461
Name: count, dtype: int64

In [4]:
model = DecisionTreeClassifier(random_state=11)
param = model.get_params()
print(param)

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': 11, 'splitter': 'best'}


In [9]:
# 그리드 서치

# 하이퍼파라미터 설정
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 5, 10, None],
    'max_leaf_nodes': [None, 10, 20, 50],
    'min_samples_leaf': [1, 2, 5]
}

# 평가지표 설정
scorer = make_scorer(f1_score, pos_label = 1)

In [10]:
from sklearn.model_selection import GridSearchCV

# 그리드서치 객체 생성성
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scorer,
    cv=5,  # k-fold 분할 방식식
    n_jobs=-1,  # 
    verbose=2
)


In [11]:
# 그리드서치 학습
grid_search.fit(X_train_upsampled, y_train_upsampled)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


In [14]:
# 베스트 모델에 대한 파라미터

best_params = grid_search.best_params_
print(best_params)
best_model = grid_search.best_estimator_
print(best_model)


{'criterion': 'entropy', 'max_depth': None, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'splitter': 'best'}
DecisionTreeClassifier(criterion='entropy', random_state=11)


array([8.79405663e-03, 4.03669359e-03, 2.16468195e-03, 1.21420831e-02,
       6.16363583e-02, 2.01383926e-03, 1.54917427e-03, 8.85289288e-03,
       1.17564447e-02, 2.89325087e-03, 6.49826423e-03, 1.35729168e-02,
       5.50309166e-02, 7.97189744e-03, 7.46831925e-01, 1.04626390e-03,
       4.24773334e-03, 5.73790571e-03, 2.67991390e-03, 1.07452220e-03,
       2.95936051e-03, 5.29289228e-03, 4.32873615e-03, 3.03763143e-04,
       1.46860624e-03, 2.85177288e-03, 6.72706949e-03, 8.91681019e-04,
       7.28418221e-04, 1.39159657e-02])

In [20]:
y_pred = best_model.predict(X_test)
y_pred
print("정확도:", accuracy_score(y_test, y_pred))
print("재현율:", recall_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("f1점수:", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

정확도: 0.9977177767634563
재현율: 0.7777777777777778
정밀도: 0.4421052631578947
f1점수: 0.5637583892617449
[[56748   106]
 [   24    84]]


In [19]:
import numpy as np

# 샘플 직접 입력하기
my_data = np.array([[0, 1, 2, 3, 2, 1, 2, 1, 2, 1, 0, 1, 2, 3, 2, 1, 2, 1, 2, 1, 0, 1, 2, 3, 2, 1, 2, 1, 2, 10000]])
best_model.predict(my_data)



array([0])