# SGDClassifier

In [1]:
from sklearn.linear_model import SGDClassifier
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error

In [4]:
# Read the churn dataset from the project-relative CSV and preview the first rows.
data = pd.read_csv(filepath_or_buffer='data/Churn_Modelling.csv')
data.head(n=5)

## 1. 데이터 전처리
### 1-1 오버샘플링
def perform_oversampling(data, label_column, majority_label=0, minority_label=1, random_state=42):
    """Balance two classes by oversampling the minority class with replacement.

    Rows labelled ``minority_label`` are drawn with replacement until there
    are as many of them as rows labelled ``majority_label``. The two groups
    are then concatenated, shuffled, and given a fresh 0..n-1 index.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``label_column``.
    label_column : str
        Name of the column holding the class labels.
    majority_label : scalar, default 0
        Label value of the class whose size is the resampling target.
    minority_label : scalar, default 1
        Label value of the class that is oversampled.
    random_state : int, default 42
        Seed used for both the resampling and the final shuffle.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with ``len(majority)`` rows of each of the two labels.
        Rows carrying any other label value are not included.

    Raises
    ------
    ValueError
        If there are majority rows but no minority rows to draw from.
    """
    # Separate the two classes.
    majority = data[data[label_column] == majority_label]
    minority = data[data[label_column] == minority_label]

    # Sampling from an empty frame cannot produce rows; fail with a clear message
    # instead of the opaque error raised deep inside the resampling call.
    if minority.empty and not majority.empty:
        raise ValueError(
            f"No rows with {label_column!r} == {minority_label!r} to oversample from."
        )

    # Draw minority rows with replacement until they match the majority count.
    minority_upsampled = resample(
        minority,
        replace=True,
        n_samples=len(majority),
        random_state=random_state,
    )

    # Merge both classes, then shuffle so the classes are interleaved.
    upsampled_data = pd.concat([majority, minority_upsampled])
    return upsampled_data.sample(frac=1, random_state=random_state).reset_index(drop=True)


# NOTE: oversampling is deliberately NOT applied to the full dataset here.
# Duplicating minority rows before the train/test split puts copies of the same
# customer in both splits, which leaks training data into the test set and
# inflates the test metrics. The training split is oversampled further below.

### 1-2. Feature engineering
# One-hot encode the categorical columns.
data = pd.get_dummies(data, columns=['Geography', 'Gender'], dtype=int)

# Log-transform the continuous columns ('Age', 'Balance') with log(1 + x).
data['LogAge'] = np.log1p(data['Age'])
data['LogBalance'] = np.log1p(data['Balance'])

### 1-3. Train/test split, oversampling and scaling
# Drop the columns that are not used as features.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'])

# Use the log-transformed columns in place of the raw 'Balance' / 'Age'.
X = data.drop(columns=['Exited', 'Balance', 'Age'])
y = data['Exited']

Banking_df = pd.DataFrame(X, columns=X.columns)  # keeps the column names

# Split first; stratify so both splits keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    Banking_df, y, test_size=0.2, random_state=42, stratify=y
)

# Oversample the minority class in the training split only; the test split
# keeps its original rows and class distribution.
upsampled_df = perform_oversampling(X_train.assign(Exited=y_train), label_column='Exited')
X_train = upsampled_df.drop(columns=['Exited'])
y_train = upsampled_df['Exited']

# Fit the scaler on the training split only, then apply it to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter search space for the SGD-trained linear classifier.
param_grid = dict(
    loss=['log_loss', 'hinge'],  # 'log_loss' = logistic regression, 'hinge' = SVM-style
    penalty=['l2', 'l1', 'elasticnet'],
    alpha=[0.0001, 0.001, 0.01],
    learning_rate=['constant', 'adaptive'],
    eta0=[0.01, 0.1],
    max_iter=[1000],
    tol=[1e-3],
)

# Exhaustive 5-fold cross-validated search over the grid, scored by accuracy.
# NOTE(review): if X_train contains oversampled (duplicated) rows, the CV folds
# share duplicates, so best_score_ may be optimistic.
grid_search = GridSearchCV(
    SGDClassifier(random_state=42),
    param_grid,
    scoring='accuracy',  # alternatives: 'f1', 'roc_auc'
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training split.
grid_search.fit(X=X_train, y=y_train)

print('best_params_:', grid_search.best_params_)
print('best_score_:', grid_search.best_score_)

best_model = grid_search.best_estimator_

# Predict on the held-out test split and report the metrics.
y_pred = best_model.predict(X=X_test)
print("Accuracy (Test):", accuracy_score(y_true=y_test, y_pred=y_pred))
print("\nClassification Report:\n", classification_report(y_true=y_test, y_pred=y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test, y_pred=y_pred))

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV] END alpha=0.0001, eta0=0.01, learning_rate=constant, loss=log_loss, max_iter=1000, penalty=l1, tol=0.001; total time=   0.0s
[CV] END alpha=0.0001, eta0=0.01, learning_rate=constant, loss=hinge, max_iter=1000, penalty=elasticnet, tol=0.001; total time=   0.0s
[CV] END alpha=0.0001, eta0=0.01, learning_rate=adaptive, loss=log_loss, max_iter=1000, penalty=l2, tol=0.001; total time=   0.1s
[CV] END alpha=0.0001, eta0=0.01, learning_rate=adaptive, loss=log_loss, max_iter=1000, penalty=elasticnet, tol=0.001; total time=   0.1s
[CV] END alpha=0.0001, eta0=0.01, learning_rate=adaptive, loss=hinge, max_iter=1000, penalty=elasticnet, tol=0.001; total time=   0.0s
[CV] END alpha=0.0001, eta0=0.01, learning_rate=adaptive, loss=hinge, max_iter=1000, penalty=elasticnet, tol=0.001; total time=   0.0s
[CV] END alpha=0.0001, eta0=0.1, learning_rate=constant, loss=log_loss, max_iter=1000, penalty=elasticnet, tol=0.001; total time=   0.1