# 로지스틱 회귀

In [None]:
import os 
import numpy as np
import pandas as pd
import hds
from plt_rcs import *
from scipy import stats
import pingouin as pg

In [None]:
# Load the data set behind the short URL into a DataFrame (needs network access).
df = pd.read_csv('https://bit.ly/UnivAdmit')

In [None]:
# Preview the first rows to check the columns and their values.
df.head()

In [None]:
# Print the column dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics of the numeric columns, rounded to two decimals.
df.describe().round(2)

In [None]:
# Cast rank to str so it is handled as a categorical label, not a number
# (it is later cross-tabulated and dummy-encoded).
df['rank'] = df['rank'].astype(str)

In [None]:
# Check the column dtypes after the rank conversion.
df.dtypes

In [None]:
# Descriptive statistics restricted to the object-typed columns.
df.describe(include=object)

In [None]:
# Relative frequency of each rank, listed in rank order.
(
    df['rank']
    .value_counts(normalize=True)
    .sort_index()
)

In [None]:
# Relative frequency of each admit outcome, listed in label order.
(
    df['admit']
    .value_counts(normalize=True)
    .sort_index()
)

In [None]:
# Set the default figure size to 4x4 for the plots that follow.
# NOTE(review): plt is presumably matplotlib.pyplot re-exported by the
# star import from plt_rcs — confirm.
plt.rc(group='figure', figsize=(4,4))

In [None]:
# Plot the admit column with a two-colour palette.
# NOTE(review): judging by its name, bar_freq draws a frequency bar chart —
# confirm against the hds documentation.
hds.plot.bar_freq(
    data=df, x='admit', palette=['skyblue', 'orange']
)

In [None]:
# Compare gre across the admit groups.
# NOTE(review): box_group presumably draws one box plot per x group —
# confirm against the hds documentation.
hds.plot.box_group(
    data=df, x='admit', y='gre',
    palette=['skyblue', 'orange']
)

In [None]:
# Compare gpa across the admit groups.
# NOTE(review): box_group presumably draws one box plot per x group —
# confirm against the hds documentation.
hds.plot.box_group(
    data=df, x='admit', y='gpa',
    palette=['skyblue', 'orange']
)

In [None]:
# Filled kernel-density curves of gre, coloured by admit outcome.
sns.kdeplot(data=df, x='gre', hue='admit', palette=['skyblue', 'orange'], fill=True)
plt.show()

In [None]:
# Filled kernel-density curves of gpa, coloured by admit outcome.
sns.kdeplot(data=df, x='gpa', hue='admit', palette=['skyblue', 'orange'], fill=True)
plt.show()

In [None]:
# Plot rank on the x axis, grouped by admit.
# NOTE(review): judging by its name, bar_dodge_freq draws side-by-side
# frequency bars per group — confirm against the hds documentation.
hds.plot.bar_dodge_freq(
    data=df, x='rank', g='admit',
    palette=['skyblue', 'orange']
)

In [None]:
# Plot rank on the x axis, grouped by admit.
# NOTE(review): judging by its name, bar_stack_freq draws stacked
# frequency bars — confirm against the hds documentation.
hds.plot.bar_stack_freq(
    data=df, x='rank', g='admit',
    palette=['skyblue', 'orange']
)

In [None]:
# Plot rank on the x axis, grouped by admit.
# NOTE(review): judging by its name, bar_stack_prop draws stacked bars of
# within-rank proportions — confirm against the hds documentation.
hds.plot.bar_stack_prop(
    data=df, x='rank', g='admit',
    palette=['skyblue', 'orange']
)

## t-검정

In [None]:
# Count, mean and standard deviation of gre for each admit group;
# margins=True appends the overall "All" row. Rounded to two decimals.
df.pivot_table(
    index='admit',
    values='gre',
    aggfunc=['count', 'mean', 'std'],
    margins=True,
).round(2)
# Recorded output:
#         count    mean    std
#           gre     gre    gre
# admit
# Fail     1163  556.08  96.36
# Pass      524  614.75  88.92
# All      1687  574.30  97.92

In [None]:
# Normality test of gre within each admit group.
# In the recorded output below, normality is rejected for both groups.
pg.normality(data=df, dv='gre', group='admit')
# Recorded output:
#               W      pval  normal
# admit
# Fail   0.990857  0.000001   False
# Pass   0.992274  0.008138   False

In [None]:
# Equal-variance test of gre across the admit groups.
# In the recorded output below (Levene), equal variance is not rejected.
pg.homoscedasticity(data=df, dv='gre', group='admit')
# Recorded output:
#                W      pval  equal_var
# levene  3.596208  0.058082       True

In [None]:
# Two-sample t-test of gre between the Fail and Pass groups.
# correction=False is passed because the Levene test recorded earlier in
# this notebook did not reject equal variances.
y1 = df.loc[df['admit'] == 'Fail', 'gre']
y2 = df.loc[df['admit'] == 'Pass', 'gre']
pg.ttest(x=y1, y=y2, correction=False)

In [None]:
# Count, mean and standard deviation of gpa for each admit group;
# margins=True appends the overall "All" row. Rounded to two decimals so the
# table matches the presentation of the gre summary.
pd.pivot_table(
    data=df, index='admit', values='gpa',
    aggfunc=['count', 'mean', 'std'], margins=True
).round(2)

In [None]:
# Normality test of gpa within each admit group.
pg.normality(data=df, dv='gpa', group='admit')

In [None]:
# Equal-variance test of gpa across the admit groups.
pg.homoscedasticity(data=df, dv='gpa', group='admit')

In [None]:
# Two-sample t-test of gpa between the Fail and Pass groups.
# NOTE(review): correction=False assumes equal variances — confirm with the
# homoscedasticity result for gpa before relying on this.
y1 = df.loc[df['admit'] == 'Fail', 'gpa']
y2 = df.loc[df['admit'] == 'Pass', 'gpa']
pg.ttest(x=y1, y=y2, correction=False)

## 교차분석

In [None]:
# Cross-tabulation of rank by admit, normalised within each row so every
# rank's Fail/Pass shares sum to 1; margins=True adds the overall "All" row.
pd.crosstab(
    index=df['rank'],
    columns=df['admit'],
    normalize='index',
    margins=True,
)
# Recorded output:
# admit      Fail      Pass
# rank
# 1      0.450549  0.549451
# 2      0.644366  0.355634
# 3      0.779630  0.220370
# 4      0.826797  0.173203
# All    0.689389  0.310611

In [None]:
# Chi-squared test of independence between rank and admit.
# Index [2] keeps only the third element of the returned tuple, which per
# pingouin's documentation is the table of test statistics.
pg.chi2_independence(data=df, x='rank', y='admit')[2]

## 범주형 입력변수의 더미 변수 변환

In [None]:
# Dummy-encode the two categorical columns as 0/1 integers.
# - drop_first=True drops the first level of each column, so rank yields
#   rank_2..rank_4 and admit yields a single column.
# - The prefix None for admit leaves that column named after its level,
#   'Pass', which is the target name used later.
df = pd.get_dummies(
    data=df, columns=['rank', 'admit'],
    prefix=['rank', None],
    dtype=int, drop_first=True
)
# This cell is an assignment and displays nothing; the first rows of the
# resulting df look like this:
#      gre   gpa  rank_2  rank_3  rank_4  Pass
# 0  380.0  3.61       0       1       0     0
# 1  660.0  3.67       0       1       0     1
# 2  800.0  4.00       0       0       0     1
# 3  640.0  3.19       0       0       1     1
# 4  520.0  2.93       0       0       1     0

In [None]:
# Preview the first rows of the dummy-encoded DataFrame.
df.head()

## 입력변수 행렬과 목표변수 벡터로 분리

In [None]:
# Separate the target vector from the input matrix.
yvar = 'Pass'
y = df[yvar].copy()          # target: the 0/1 Pass column
X = df.drop(columns=[yvar])  # inputs: every remaining column
display(X)
display(y)

## 로지스틱 회귀 모형

In [None]:
# Fit the model of y on X and show its summary table.
# NOTE(review): hds.stat.glm presumably fits a binomial GLM (logistic
# regression), given this section's title — confirm against the hds docs.
model = hds.stat.glm(X=X, y=y)
model.summary()

## 로지스틱 회귀 모형의 유의성 검정

In [None]:
# Test statistic for the overall model significance test: the drop in
# deviance from the null model to the fitted model.
devGap = model.null_deviance - model.deviance
devGap
# Recorded output: np.float64(259.97760909804174)

In [None]:
# Degrees of freedom for the test: the model's df_model, which is 5 in the
# recorded output and matches the five input columns.
dofGap = model.df_model
dofGap
# Recorded output: np.int64(5)

In [None]:
# p-value of the deviance test: the upper-tail probability of the
# chi-squared distribution at devGap with dofGap degrees of freedom.
# The survival function is used instead of `1 - cdf`, because that
# subtraction loses all precision for very small p-values and collapsed to
# exactly 0.0 here; sf returns the tiny positive tail probability itself.
stats.chi2.sf(x=devGap, df=dofGap)

## 다중공선성 확인

- 더미변수 중 일부에서라도 다중공선성 문제가 발생하면, 같은 범주형 변수에서 만들어진 더미변수를 모두 제거합니다.

In [None]:
# Check the fitted model's inputs for multicollinearity.
# NOTE(review): judging by its name, hds.stat.vif reports variance
# inflation factors — confirm against the hds documentation.
hds.stat.vif(model=model)