# 머신러닝 로지스틱 회귀

In [None]:
import os
import pandas as pd
import numpy as np
from plt_rcs import *
import hds

In [None]:
# Load the admission dataset from the short link (requires network access).
df = pd.read_csv('https://bit.ly/UnivAdmit')

In [None]:
# Preview the first rows.
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Store rank as strings so it is handled as a categorical (object) column below.
df['rank'] = df['rank'].astype(str)

In [None]:
# Confirm the dtype change.
df.dtypes

In [None]:
# Summary statistics for the numeric columns.
df.describe()

In [None]:
# Summary (count / unique / top / freq) for the object-typed columns.
df.describe(include=object)

In [None]:
# Absolute frequency of each rank, ordered by rank label.
df['rank'].value_counts().sort_index()

In [None]:
# Relative frequency of each rank, ordered by rank label.
df['rank'].value_counts(normalize=True).sort_index()

In [None]:
# Default figure size (4x4) for the plots that follow.
# NOTE(review): `plt` is not imported explicitly; presumably it is provided by
# `from plt_rcs import *` — confirm.
plt.rc(group='figure', figsize=(4,4))

In [None]:
# NOTE(review): the hds.plot helpers are project-specific; the descriptions in this
# section are inferred from their names and arguments — confirm against the hds docs.
# Frequency bar chart of the target column.
hds.plot.bar_freq(data=df, x='admit', palette=['skyblue', 'orange'])

In [None]:
# Box plot of gpa grouped by admission outcome.
hds.plot.box_group(data=df, x='admit', y='gpa', palette=['skyblue', 'orange'])

In [None]:
# Side-by-side (dodged) frequency bars of admit within each rank.
hds.plot.bar_dodge_freq(data=df, x='rank', g='admit')

In [None]:
# Stacked frequency bars of admit within each rank.
hds.plot.bar_stack_freq(data=df, x='rank', g='admit')

In [None]:
# Stacked proportion bars of admit within each rank.
hds.plot.bar_stack_prop(data=df, x='rank', g='admit', palette=['skyblue', 'orange'])

## 원 핫 인코딩

In [None]:
# One-hot encode rank as integer 0/1 indicator columns.
# NOTE: this rebinds `df`; the original 'rank' column is replaced by the indicator columns.
df = pd.get_dummies(data=df, columns=['rank'], dtype=int)
df.head()

In [None]:
# Separate the feature matrix X from the target y.
yvar = 'admit'
X = df.drop(columns=yvar)
# .copy() so y is an independent Series rather than a view into df.
y = df[yvar].copy()
display(X)
display(y)

## 데이터 분할

In [None]:
from sklearn.model_selection import train_test_split

- train_test_split 함수의 stratify 매개변수에 범주형 시리즈를 지정하면 해당 변수의 범주별 상대도수가 훈련셋과 검증셋에서 동일하게 유지되도록 층화추출을 실행

In [None]:
# Stratified 80/20 split: stratify on the admit column so both parts keep its class proportions.
X_train_1, X_valid_1, y_train_1, y_valid_1 = train_test_split(X, y, test_size=0.2, random_state=1, stratify=df['admit'])

# Class proportions in the training part (recorded output follows as comments).
display(y_train_1.value_counts(normalize=True).sort_index())
# admit
# Fail    0.6894
# Pass    0.3106
# Name: proportion, dtype: float64
# Class proportions in the validation part (recorded output follows as comments).
display(y_valid_1.value_counts(normalize=True).sort_index())
# admit
# Fail    0.689349
# Pass    0.310651
# Name: proportion, dtype: float64

In [None]:
# Plain random 80/20 split (no stratification); these four objects are used by every model below.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1234)

In [None]:
# (rows, columns) of the training features.
X_train.shape

In [None]:
# (rows, columns) of the validation features.
X_valid.shape

In [None]:
# A notebook cell only echoes its last expression, so the training-set proportions were
# never shown. Wrap both in display(), as the stratified-split cell does.
display(y_train.value_counts(normalize=True).sort_index())
# NOTE(review): the recorded outputs below are character-for-character identical to those of
# the stratified split (random_state=1) and look copied; re-run this cell and refresh them.
# admit
# Fail    0.6894
# Pass    0.3106
# Name: proportion, dtype: float64
display(y_valid.value_counts(normalize=True).sort_index())
# admit
# Fail    0.689349
# Pass    0.310651
# Name: proportion, dtype: float64

In [None]:
def make_train_test(data, yvar, test_size=0.2, seed=1, stratify=False):
    """Split a DataFrame into training/validation features and targets.

    Args:
        data: DataFrame containing the feature columns and the target column.
        yvar: Name of the target column in ``data``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        seed: Forwarded to ``train_test_split`` as ``random_state``.
        stratify: If True, stratify the split on the target column so both
            parts keep its class proportions. The default (False) keeps the
            original plain random split.

    Returns:
        The four objects ``X_train, X_valid, y_train, y_valid``.
    """
    X = data.drop(columns=yvar)
    # Copy so the returned targets do not alias the caller's DataFrame.
    y = data[yvar].copy()
    X_train, X_valid, y_train, y_valid = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=seed,
        stratify=y if stratify else None,
    )
    return X_train, X_valid, y_train, y_valid

In [None]:
# Same split via the helper, using the same seed (1234) as the manual random split above.
X_tr, X_vl, y_tr, y_vl = make_train_test(data=df, yvar='admit', seed=1234)

In [None]:
# (rows, columns) of the training features.
X_tr.shape

In [None]:
# (rows, columns) of the validation features.
X_vl.shape

In [None]:
# Class proportions in the training targets.
y_tr.value_counts(normalize=True).sort_index()

In [None]:
# Class proportions in the validation targets.
y_vl.value_counts(normalize=True).sort_index()

## 로지스틱 회귀모델 학습

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
model_logit = LogisticRegression(C=np.inf, max_iter=1000, random_state=0)

In [None]:
model_logit.fit(X=X_train, y=y_train)

In [None]:
model_logit.score(X=X_train, y=y_train)
# 0.7323943661971831
model_logit.score(X=X_valid, y=y_valid)
# 0.6863905325443787

In [None]:
model_logit.intercept_
# array([-5.35142011])

In [None]:
model_logit.coef_[0]
# array([ 0.0038324 ,  1.07656158, -0.2272636 , -1.1787027 , -1.78295985,
#        -2.16249395])

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
scaler = StandardScaler()

In [None]:
X_train_scaled = scaler.fit_transform(X=X_train)
X_valid_scaled = scaler.transform(X=X_valid)

In [None]:
X_train_scaled = pd.DataFrame(data=X_train_scaled, columns=X_train.columns)
X_valid_scaled = pd.DataFrame(data=X_valid_scaled, columns=X_valid.columns)

In [None]:
model_scaled = LogisticRegression(C=np.inf, max_iter=100, random_state=0)
model_scaled.fit(X=X_train_scaled, y=y_train)

In [None]:
model_scaled.score(X=X_train_scaled, y=y_train)

In [None]:
model_scaled.score(X=X_valid_scaled, y=y_valid)

## 릿지 로지스틱 회귀모델

In [None]:
model_ridge = LogisticRegression(l1_ratio=0, max_iter=1000, random_state=0, C=0.1, solver='lbfgs')

In [None]:
model_ridge.fit(X=X_train, y=y_train)

In [None]:
model_ridge.score(X=X_train, y=y_train)
# 0.7249814677538917
model_ridge.score(X=X_valid, y=y_valid)
# 0.6952662721893491

## 라쏘 로지스틱 회귀모델

In [None]:
model_lasso = LogisticRegression(l1_ratio=1, max_iter=1000, random_state=0, solver='liblinear', C=0.1)

In [None]:
model_lasso.fit(X=X_train, y=y_train)

In [None]:
model_lasso.score(X=X_train, y=y_train)
# 0.7220163083765753
model_lasso.score(X=X_valid, y=y_valid)
# 0.6804733727810651

## 세 모델의 회귀계수 비교

In [None]:
# Coefficient table: one row per feature, one column per fitted model.
pd.DataFrame(
    data=dict(zip(
        ('Logit', 'Ridge', 'Lasso'),
        (fitted.coef_[0] for fitted in (model_logit, model_ridge, model_lasso)),
    )),
    index=X_train.columns,
)

## 예측값 생성

In [None]:
# Predicted class labels on the validation set, in the order logit / ridge / lasso.
y_pred_logit, y_pred_ridge, y_pred_lasso = (
    fitted.predict(X=X_valid)
    for fitted in (model_logit, model_ridge, model_lasso)
)

In [None]:
# NOTE(review): hds.stat.clfmetrics is a project helper; presumably it reports
# classification metrics for the given true/predicted labels — confirm.
# Unpenalised logistic regression.
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_logit)

In [None]:
# Ridge model.
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_ridge)

In [None]:
# Lasso model.
hds.stat.clfmetrics(y_true=y_valid, y_pred=y_pred_lasso)

## 예측 확률 생성

In [None]:
# Predicted class probabilities on the validation set, in the order logit / ridge / lasso.
y_prob_logit, y_prob_ridge, y_prob_lasso = (
    fitted.predict_proba(X=X_valid)
    for fitted in (model_logit, model_ridge, model_lasso)
)

In [None]:
# Second column of predict_proba: probability of the second class in model_logit.classes_
# (presumably 'Pass' — confirm).
y_prob_logit[:, 1]

In [None]:
# ROC curves of the three models on the validation set.
# NOTE(review): hds.plot.roc_curve is a project helper; it is passed the full
# probability array rather than a single column — confirm that is what it expects.
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_logit, color='red')
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_ridge, color='green')
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_lasso, color='blue')

- 분류 기준점(cut-off)의 기본값은 0.5이지만, 이 값이 항상 최적인 것은 아님
- 따라서 0.0부터 1.0까지 기준점을 조금씩 바꿔 가며 성능이 가장 좋은 분류 기준점을 탐색하는 그리드 서치 방식이 필요

In [None]:
# Distribution of the predicted second-class probability, grouped by the true label.
# NOTE(review): `sns` and `plt` are not imported explicitly; presumably they come from
# `from plt_rcs import *` — confirm.
sns.boxplot(x=y_valid, y=y_prob_logit[:, 1])
# Grey dashed line: the default 0.5 cut-off.
plt.axhline(y=0.5, color='0.5', linestyle='--')
# Red line: alternative cut-off at 0.31 (presumably chosen near the recorded 'Pass'
# share of about 0.31 — confirm).
plt.axhline(y=0.31, color='red', linestyle='-')
plt.show()

## PR곡선

In [None]:
# Precision-recall curves of the three models on the validation set
# (hds.plot.pr_curve is a project helper; behaviour inferred from its name — confirm).
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob_logit, color='red')
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob_ridge, color='green')
hds.plot.pr_curve(y_true=y_valid, y_prob=y_prob_lasso, color='blue')

## 오디널 인코딩

In [None]:
df.head()

In [None]:
rank_gb = {
    1: 'S',
    2: 'A',
    3: 'B',
    4: 'C'
}

In [None]:
df['rank'] = df['rank'].map(rank_gb)

In [None]:
df.head()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Ordinal-encode the letter grades with an explicit category order S, A, B, C
# (encoded as 0, 1, 2, 3 in that order).
oe = OrdinalEncoder(categories=[['S', 'A', 'B', 'C']])
# fit_transform returns a 2-D array; [:, 0] takes its single column as 1-D.
df['rank_oe'] = oe.fit_transform(df[['rank']])[:, 0]

In [None]:
df.head()

In [None]:
# Drop the letter-grade column so only the encoded rank_oe enters the model; the same
# seed (1234) as the earlier splits is used.
X_oe_tr, X_oe_vl, y_oe_tr, y_oe_vl = make_train_test(data=df.drop(columns='rank'), yvar='admit', seed=1234)

In [None]:
# (rows, columns) of the training features.
X_oe_tr.shape

In [None]:
# (rows, columns) of the validation features.
X_oe_vl.shape

In [None]:
# Class proportions in the training targets (ordered by frequency, not by label).
y_oe_tr.value_counts(normalize=True)

In [None]:
# Class proportions in the validation targets (ordered by frequency, not by label).
y_oe_vl.value_counts(normalize=True)

In [None]:
# Unpenalised logistic regression (C=np.inf) on the ordinal-encoded features.
model_logit_oe = LogisticRegression(C=np.inf, max_iter=1000, random_state=0)

In [None]:
model_logit_oe.fit(X=X_oe_tr, y=y_oe_tr)

In [None]:
# Validation accuracy.
model_logit_oe.score(X=X_oe_vl, y=y_oe_vl)

In [None]:
# Predicted class probabilities on the ordinal-encoded validation set.
y_prob_logit_oe = model_logit_oe.predict_proba(X=X_oe_vl)

In [None]:
# Compare ROC curves: ridge (one-hot), unpenalised logit (ordinal) and unpenalised logit (one-hot).
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_ridge, color='red')
# Pair the ordinal model's probabilities with the labels from its own split (y_oe_vl),
# not with y_valid. The two only coincide while both splits use the same seed and row order.
hds.plot.roc_curve(y_true=y_oe_vl, y_prob=y_prob_logit_oe, color='blue')
hds.plot.roc_curve(y_true=y_valid, y_prob=y_prob_logit, color='green')