# [과제 3] 로지스틱 회귀분석
### - sklearn 패키지를 사용해 로지스틱 회귀분석을 진행해주세요.
### - 성능지표를 계산하고 이에 대해 해석해주세요.
### - 성능 개선을 시도해주세요. (어떠한 성능지표를 기준으로 개선을 시도했는지, 그 이유도 함께 적어주세요.)
### - 주석으로 설명 및 근거 자세하게 달아주시면 감사하겠습니다. :)

## Data

출처 : https://www.kaggle.com/mlg-ulb/creditcardfraud


* V1 ~ V28 : 비식별화 된 개인정보
* **Class** : Target 변수  
  - 1 : fraudulent transactions (사기)
  - 0 : otherwise

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
data = pd.read_csv("C:/Users/inho0/OneDrive/문서/GitHub/tobigs-21st/Week2/Regression/assignment3_creditcard.csv")

In [4]:
data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,-1.848212,2.3849,0.379573,1.048381,-0.84507,2.537837,-4.542983,-10.201458,-1.504967,-2.234167,...,2.585817,-5.29169,0.859364,0.423231,-0.506985,1.020052,-0.627751,-0.017753,0.280982,0
1,2.071805,-0.477943,-1.444444,-0.548657,0.010036,-0.582242,-0.042878,-0.24716,1.171923,-0.342382,...,-0.077306,0.042858,0.390125,0.041569,0.598427,0.098803,0.979686,-0.093244,-0.065615,0
2,-2.985294,-2.747472,1.194068,-0.003036,-1.151041,-0.263559,0.5535,0.6356,0.438545,-1.806488,...,1.345776,0.37376,-0.385777,1.197596,0.407229,0.008013,0.762362,-0.299024,-0.303929,0
3,-1.479452,1.542874,0.290895,0.838142,-0.52929,-0.717661,0.484516,0.545092,-0.780767,0.324804,...,0.038397,0.116771,0.40556,-0.116453,0.541275,-0.216665,-0.415578,0.027126,-0.150347,0
4,-0.281976,-0.309699,-2.162299,-0.851514,0.106167,-1.483888,1.930994,-0.843049,-1.249272,1.079608,...,-0.875516,-0.004199,1.015108,-0.026748,0.077115,-1.468822,0.7517,0.496732,0.331001,0


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score


# 독립변수와 종속변수 분리
X = data.iloc[:, :-1]
y = data['Class']

# 학습 데이터와 테스트 데이터 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2024)

# 기본 모델로 로지스틱 회귀 모델 학습
model = LogisticRegression()
model.fit(X_train, y_train)

# 예측
y_pred = model.predict(X_test)

# 성능 평가
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))
print("ROC AUC Score: ", roc_auc_score(y_test, y_pred))

Confusion Matrix: 
[[5685    1]
 [   9   41]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5686
           1       0.98      0.82      0.89        50

    accuracy                           1.00      5736
   macro avg       0.99      0.91      0.95      5736
weighted avg       1.00      1.00      1.00      5736

ROC AUC Score:  0.9099120647203658


In [None]:
# ROC AUC Score:  0.9099120647203658

In [7]:
# SMOTE를 사용하여 소수 클래스의 데이터를 증가시켜 클래스 불균형 문제를 해결
from imblearn.over_sampling import SMOTE

# 오버샘플링
smote = SMOTE(random_state=2024)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

# 로지스틱 회귀 모델 학습
model = LogisticRegression()
model.fit(X_smote, y_smote)

# 예측 및 성능 평가
y_pred = model.predict(X_test)
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))
print("ROC AUC Score: ", roc_auc_score(y_test, y_pred))


Confusion Matrix: 
[[5531  155]
 [   3   47]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.97      0.99      5686
           1       0.23      0.94      0.37        50

    accuracy                           0.97      5736
   macro avg       0.62      0.96      0.68      5736
weighted avg       0.99      0.97      0.98      5736

ROC AUC Score:  0.9563700316567008


In [None]:
# ROC AUC Score:  0.9563700316567008
# 사기 거래를 놓치는 것이 일반 거래를 사기로 잘못 분류하는 것보다 더 큰 문제가 될 수 있어서 재현율( 실제 Positive인 것 중에서 모델이 Positive라고 예측한 비율)을 높이는 것 중요
# -> 임계값 조정 방식 고려하기

# y_pred_proba = model.predict_proba(X_test)
# threshold = 0.7
# y_pred = (y_pred_proba[:,1] >= threshold).astype('int')

In [8]:
# 파라미터 개선

model = LogisticRegression(penalty='l2', C=0.1, class_weight='balanced')

# penalty: 규제의 종류를 설정합니다. 
# C: C 값이 작을수록 규제 강도가 크다.
# class_weight: 클래스의 가중치를 설정한다. 불균형 데이터셋을 다룰 때 유용하다. balanced를 설정시 클래스의 빈도에 반비례하게 가중치를 자동으로 조정
model.fit(X_smote, y_smote)

y_pred = model.predict(X_test)
print("Confusion Matrix: ")
print(confusion_matrix(y_test, y_pred))
print("Classification Report: ")
print(classification_report(y_test, y_pred))
print("ROC AUC Score: ", roc_auc_score(y_test, y_pred))

Confusion Matrix: 
[[5540  146]
 [   3   47]]
Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.97      0.99      5686
           1       0.24      0.94      0.39        50

    accuracy                           0.97      5736
   macro avg       0.62      0.96      0.69      5736
weighted avg       0.99      0.97      0.98      5736

ROC AUC Score:  0.9571614491734084


In [None]:
# ROC AUC Score:  0.9571614491734084 (약하게 개선, 불균형률 개선이 유의미하다.)