# 선형 회귀

In [None]:
import os
import numpy as np
import pandas as pd
from scipy import stats
import pingouin as pg
from plt_rcs import *
import hds

In [None]:
# Show the current working directory before the relative chdir in the next cell.
os.getcwd()

In [None]:
# Move into the sibling 'data' folder so the file below can be read by bare name.
# NOTE(review): the path is relative, so re-running this cell resolves '../data'
# from the new location and may fail or end up in a different directory.
os.chdir('../data')

In [None]:
# List the entries of the current working directory in sorted order.
sorted(os.listdir())

In [None]:
# Load the data set from a pickle file in the current directory.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle('Used_Cars_Prep.pkl')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns.
df.describe()

## 범주형 입력변수의 더미 변수 변환

In [None]:
# Toy example: dummy-encode a Series holding four category labels.
# With drop_first=True the first level ('A') gets no column of its own, so
# the result has one 0/1 column for each of the remaining levels.
sr1 = pd.Series(['A', 'B', 'O', 'AB'])
pd.get_dummies(sr1, drop_first=True, dtype=int)
# Recorded output:
#   AB	B	O
# 0	0	0	0
# 1	0	1	0
# 2	0	0	1
# 3	1	0	0

In [None]:
# Cast MetColor to int before the data-frame-wide get_dummies call further down.
# NOTE(review): presumably done so MetColor is kept as a single numeric 0/1
# column instead of being dummy-encoded — confirm against its original dtype.
df = df.astype(dtype={'MetColor': int})

- pd.get_dummies의 data 매개변수에 데이터프레임을 지정하면 해당 데이터프레임에 있는 모든 범주형 변수를 일괄적으로 원-핫 인코딩 또는 더미 변수로 변환한다.
- 일부 범주형 변수만 변환하려면 columns 매개변수에 변환할 열 이름을 리스트로 지정한다.

In [None]:
# Dummy-encode the data frame in one call, rebinding df to the result.
# prefix='' and prefix_sep='' make each dummy column carry only the level name
# (the recorded output shows a column called 'Petrol'); drop_first=True omits
# the first level of each encoded variable.
df = pd.get_dummies(data=df, dtype=int, drop_first=True, prefix='', prefix_sep='')
df.head()
# Recorded output:
# Price	Age	KM	HP	MetColor	Doors	Weight	Petrol
# 0	13500	23	46986	90	1	3	1165	0
# 1	13750	23	72937	90	1	3	1165	0
# 2	13950	24	41711	90	1	3	1165	0
# 3	14950	26	48000	90	0	3	1165	0
# 4	13750	30	38500	90	0	3	1170	0

In [None]:
# Check the column dtypes after the conversion.
df.dtypes

## 입력변수 행렬과 목표변수 벡터로 분리

In [None]:
# Name of the target column.
yvar = 'Price'

# Target vector: an independent copy of the target column.
y = df[yvar].copy()
# Input matrix: every column of df except the target.
X = df.drop(columns=[yvar])

# Show both pieces to confirm the split.
display(X)
display(y)

## 선형 회귀 모형 적합

In [None]:
# Fit a regression of y on X with hds.stat.ols and show the fitted model's summary.
# NOTE(review): presumably ordinary least squares, judging by the helper's name.
model = hds.stat.ols(y=y, X=X)
model.summary()

## 회귀진단 : 잔차 그래프

In [None]:
# Produce the regression diagnostics (residual plots, per the section heading)
# for the fitted model.
hds.stat.regressionDiagnosis(model=model)

## 잔차의 분포 확인

In [None]:
# Draw 10,000 values from a normal distribution with mean 0 and the same
# standard deviation as the model residuals; random_state=1 fixes the seed.
# The sample serves as a reference shape in the density plot that follows.
rnorm_resid = stats.norm.rvs(loc=0, scale=model.resid.std(), size=10000, random_state=1)

In [None]:
# Overlay the density of the model residuals (black) on the density of the
# simulated normal sample (red) to compare their shapes.
# Each curve is labelled and a legend is drawn so the two densities can be
# told apart on the figure itself, as the scatter plots in this file do.
sns.kdeplot(x=model.resid, color='0', fill=True, label='Residuals')
sns.kdeplot(x=rnorm_resid, color='red', fill=True, label='Normal reference')

# Dashed vertical guide at zero.
plt.axvline(x=0, color='0.5', linestyle='--')

plt.legend()
plt.show()

In [None]:
# Shapiro-Wilk normality test on the model residuals.
stats.shapiro(x=model.resid)

In [None]:
# Run hds.stat.breushpagan on the fitted model.
# NOTE(review): presumably the Breusch-Pagan heteroscedasticity test — confirm
# in the hds documentation. The recorded P-Value below is about 2.2e-12.
hds.stat.breushpagan(model=model)
# Recorded output:
#   Statistic	P-Value	        F-Value	    F P-Value
# 0	69.137164	2.206962e-12	10.387656	1.014454e-12

## 영향점 확인

In [None]:
# Build the per-observation table for the model; its 'cooksd' column is used
# below to flag influential points.
aug = hds.stat.augment(model=model)

In [None]:
# Display the table returned by hds.stat.augment.
aug

In [None]:
# Index labels of the rows whose 'cooksd' value (presumably Cook's distance)
# is greater than 4 / n, where n is the number of rows in X.
out_index = aug[aug['cooksd'] > 4 / len(X)].index
out_index

## 영향점 시각화

In [None]:
# Price against Age for every row of df: light-grey points with a red fitted
# line and no confidence band.
plt.figure(figsize=(4, 4))
sns.regplot(
    x='Age', y='Price', data=df, ci=None,
    line_kws=dict(color='red', lw=1.5),
    scatter_kws=dict(color='0.8', s=10, ec='0.8'),
)
# Overlay the rows listed in out_index as red markers.
sns.scatterplot(
    x='Age', y='Price', data=df.loc[out_index],
    s=20, fc='red', ec='red', label='Outlier',
)
plt.legend()
plt.show()

### Age - Price 만 비교하는 모델

In [None]:
# Single-predictor model: Age is the only input column (the double brackets
# keep X_1 a DataFrame), fitted against the same target y.
X_1 = df[['Age']]
model_1 = hds.stat.ols(X=X_1, y=y)
model_1.summary()

In [None]:
# Per-observation table for the Age-only model.
aug_1 = hds.stat.augment(model=model_1)

In [None]:
# Index labels of the rows of the Age-only model whose 'cooksd' value
# (presumably Cook's distance) is greater than 4 / n, n being the rows of X_1.
out_index_1 = aug_1[aug_1['cooksd'] > 4 / len(X_1)].index

In [None]:
# Same Price-vs-Age picture, this time marking the rows flagged by the
# Age-only model.
plt.figure(figsize=(4, 4))
sns.regplot(
    x='Age', y='Price', data=df, ci=None,
    line_kws=dict(color='red', lw=1.5),
    scatter_kws=dict(color='0.8', s=10, ec='0.8'),
)
sns.scatterplot(
    x='Age', y='Price', data=df.loc[out_index_1],
    s=20, fc='red', ec='red', label='Outlier',
)
# Dashed vertical guide at the mean Age.
plt.axvline(x=df['Age'].mean(), linestyle='--', color='0.5')

plt.legend()
plt.show()

## 영향점 제거

In [None]:
# Number of observations flagged by the full model.
len(out_index)

In [None]:
# Remove the flagged rows from both the inputs and the target, then show the
# remaining shape of X.
# NOTE(review): drop() raises KeyError for labels that are no longer present,
# so this cell fails if it is executed a second time without rebuilding X and y.
X = X.drop(index=out_index)
y = y.drop(index=out_index)
X.shape

## 영향점 제거한 모형

In [None]:
# Refit on the data with the flagged rows removed; this rebinds `model`.
model = hds.stat.ols(X=X, y=y)
model.summary()

In [None]:
# Shapiro-Wilk normality test on the residuals of the refitted model.
stats.shapiro(model.resid)
# Recorded output:
# ShapiroResult(statistic=np.float64(0.9987108551069022), pvalue=np.float64(0.5564148139317799))

In [None]:
# Repeat the hds.stat.breushpagan check on the refitted model.
# The recorded P-Value below is about 0.049.
hds.stat.breushpagan(model=model)
# Recorded output:
#   Statistic	P-Value	    F-Value	    F P-Value
# 0	14.124654	0.049008	2.028417	0.048685

In [None]:
# Regression diagnostics for the refitted model.
hds.stat.regressionDiagnosis(model=model)

## 다중공선성 확인

In [None]:
# Per-predictor values from hds.stat.vif (presumably variance inflation
# factors, per the section heading on multicollinearity).
# In the recorded output Petrol has the largest value (about 6.62).
hds.stat.vif(model=model)
# Recorded output:
#   Age	        KM	        HP	        MetColor	Doors	    Weight	    Petrol
# 0	1.335567	1.609369	3.123682	1.015065	1.633648	4.762933	6.615611

In [None]:
# Drop Petrol (the largest value in the previous recorded output), refit, and
# recompute the per-predictor values; every recorded value is now below 1.5.
X = X.drop(columns='Petrol')
model = hds.stat.ols(X=X, y=y)
hds.stat.vif(model=model)
# Recorded output:
#   Age	        KM	        HP	    MetColor	Doors	    Weight
# 0	1.333597	1.498871	1.13305	1.013634	1.279331	1.457494

In [None]:
# Correlation heatmap drawn from df.
# NOTE(review): df is the full encoded frame, so it still contains the rows
# dropped from X and y as well as the Petrol and Price columns — confirm that
# this is the intended input rather than the filtered data.
plt.figure(figsize=(6, 4))
hds.plot.corr_heatmap(data=df)

In [None]:
# Summary of the model refitted without Petrol.
model.summary()

## 단계적방법으로 모형 적합

In [None]:
# Select and fit a model with hds.stat.stepwise; this rebinds `model`.
# NOTE(review): direction='both' presumably allows both adding and removing
# predictors at each step — confirm in the hds documentation.
model = hds.stat.stepwise(X=X, y=y, direction='both')
model.summary()

In [None]:
# hds.stat.breushpagan check on the stepwise model.
# The recorded P-Value below is about 0.069.
hds.stat.breushpagan(model=model)
# Recorded output:
#   Statistic	P-Value	    F-Value 	F P-Value
# 0	10.220546	0.069222	2.051491	0.069082

In [None]:
# Regression diagnostics for the stepwise model.
hds.stat.regressionDiagnosis(model=model)