In [26]:
import pandas as pd
from statsmodels.formula.api import ols

In [27]:
# Load winequality-both.csv, then swap spaces in the column names for
# underscores so each column can be referenced by name in a model formula.
wine = (
    pd.read_csv('winequality-both.csv', sep=',', header=0)
    .rename(columns=lambda name: name.replace(' ', '_'))
)

### 독립변수 분석

In [28]:
# Formula syntax for a linear model:
#   dependent_variable ~ predictor_1 + predictor_2 + ... + predictor_N
predictors = [
    'alcohol',
    'chlorides',
    'citric_acid',
    'density',
    'fixed_acidity',
    'free_sulfur_dioxide',
    'pH',
    'residual_sugar',
    'sulphates',
    'total_sulfur_dioxide',
    'volatile_acidity',
]
my_formula = 'quality ~ ' + ' + '.join(predictors)

# ols() constructs the linear regression model from the formula and the data;
# fit() estimates the regression equation and returns the fitted result,
# which is what lm refers to from here on.
lm = ols(my_formula, data=wine).fit()

In [29]:
# Display the regression summary table of the fitted model.
lm.summary()

0,1,2,3
Dep. Variable:,quality,R-squared:,0.292
Model:,OLS,Adj. R-squared:,0.291
Method:,Least Squares,F-statistic:,243.3
Date:,"Wed, 18 Jan 2023",Prob (F-statistic):,0.0
Time:,14:47:16,Log-Likelihood:,-7215.5
No. Observations:,6497,AIC:,14450.0
Df Residuals:,6485,BIC:,14540.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,55.7627,11.894,4.688,0.000,32.447,79.079
alcohol,0.2670,0.017,15.963,0.000,0.234,0.300
chlorides,-0.4837,0.333,-1.454,0.146,-1.136,0.168
citric_acid,-0.1097,0.080,-1.377,0.168,-0.266,0.046
density,-54.9669,12.137,-4.529,0.000,-78.760,-31.173
fixed_acidity,0.0677,0.016,4.346,0.000,0.037,0.098
free_sulfur_dioxide,0.0060,0.001,7.948,0.000,0.004,0.007
pH,0.4393,0.090,4.861,0.000,0.262,0.616
residual_sugar,0.0436,0.005,8.449,0.000,0.033,0.054

0,1,2,3
Omnibus:,144.075,Durbin-Watson:,1.646
Prob(Omnibus):,0.0,Jarque-Bera (JB):,324.712
Skew:,-0.006,Prob(JB):,3.09e-71
Kurtosis:,4.095,Cond. No.,249000.0


### 확인해야 할 사항

* Dep. Variable : 종속변수
* No. Observations : 관측개수 => 행개수
* R-squared(결정계수) : 예를 들어 0.3인 경우 독립변수들이 종속변수 변동의 약 30%를 설명한다는 의미. 일반적으로 20% 이상이면 유의미한 수치로 받아들임
* Prob (F-statistic) : 0.05보다 작은 경우 회귀모형이 통계적으로 유의미한 것으로 해석
* Intercept : y절편 => 고정 상수
* **coef(회귀계수)** : 다른 독립변수는 고정되어 있고 특정 독립변수가 1단위 변할 때 종속변수가 평균적으로 변화하는 양

---

### 데이터 표준화 (Data Standardization)
* 서로 다른 독립변수의 값의 Scale을 표준화하는 작업

In [30]:
# Show the first row to inspect the raw scale of each column.
wine.head(1)

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,red,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [34]:
# The regression target: the 'quality' column of the wine table.
dependent_variable = wine.loc[:, 'quality']

In [35]:
# Every column except the target ('quality') and the 'type' label is used as a
# predictor. Index.difference returns the remaining column names in sorted order.
predictor_columns = wine.columns.difference(['quality', 'type'])
independent_variables2 = wine[predictor_columns]
independent_variables2.head(1)

Unnamed: 0,alcohol,chlorides,citric_acid,density,fixed_acidity,free_sulfur_dioxide,pH,residual_sugar,sulphates,total_sulfur_dioxide,volatile_acidity
0,9.4,0.076,0.0,0.9978,7.4,11.0,3.51,1.9,0.56,34.0,0.7


* 표준화 공식 : (열 데이터 - 열 데이터의 평균) / 열 데이터의 표준편차

In [36]:
# Standardize each predictor column: subtract the column mean, then divide by
# the column standard deviation (both computed per column by pandas).
column_means = independent_variables2.mean()
column_stds = independent_variables2.std()
independent_variables_standardized = independent_variables2.sub(column_means).div(column_stds)

In [37]:
# Place the unscaled target next to the standardized predictors, column-wise,
# with the target as the first column.
standardized_parts = [dependent_variable, independent_variables_standardized]
wine_standardized = pd.concat(standardized_parts, axis='columns')
wine_standardized.head()

Unnamed: 0,quality,alcohol,chlorides,citric_acid,density,fixed_acidity,free_sulfur_dioxide,pH,residual_sugar,sulphates,total_sulfur_dioxide,volatile_acidity
0,5,-0.915394,0.569914,-2.192664,1.034913,0.142462,-1.100055,1.81295,-0.744721,0.193082,-1.446247,2.188664
1,5,-0.580023,1.197883,-2.192664,0.701432,0.451001,-0.311296,-0.115064,-0.597594,0.999502,-0.862402,3.281982
2,5,-0.580023,1.026618,-1.917405,0.768128,0.451001,-0.874695,0.2581,-0.660648,0.797897,-1.092402,2.553104
3,6,-0.580023,0.54137,1.660957,1.101609,3.07358,-0.762016,-0.36384,-0.744721,0.327485,-0.986248,-0.362411
4,5,-0.915394,0.569914,-2.192664,1.034913,0.142462,-1.100055,1.81295,-0.744721,0.193082,-1.446247,2.188664


In [38]:
# Fit the same formula on the standardized data. Because every predictor is now
# measured in standard deviations, the coefficients are on a common scale.
standardized_model = ols(formula=my_formula, data=wine_standardized)
lm_standardized = standardized_model.fit()
lm_standardized.summary()

0,1,2,3
Dep. Variable:,quality,R-squared:,0.292
Model:,OLS,Adj. R-squared:,0.291
Method:,Least Squares,F-statistic:,243.3
Date:,"Wed, 18 Jan 2023",Prob (F-statistic):,0.0
Time:,14:48:13,Log-Likelihood:,-7215.5
No. Observations:,6497,AIC:,14450.0
Df Residuals:,6485,BIC:,14540.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.8184,0.009,637.785,0.000,5.800,5.836
alcohol,0.3185,0.020,15.963,0.000,0.279,0.358
chlorides,-0.0169,0.012,-1.454,0.146,-0.040,0.006
citric_acid,-0.0159,0.012,-1.377,0.168,-0.039,0.007
density,-0.1648,0.036,-4.529,0.000,-0.236,-0.093
fixed_acidity,0.0877,0.020,4.346,0.000,0.048,0.127
free_sulfur_dioxide,0.1060,0.013,7.948,0.000,0.080,0.132
pH,0.0706,0.015,4.861,0.000,0.042,0.099
residual_sugar,0.2072,0.025,8.449,0.000,0.159,0.255

0,1,2,3
Omnibus:,144.075,Durbin-Watson:,1.646
Prob(Omnibus):,0.0,Jarque-Bera (JB):,324.712
Skew:,-0.006,Prob(JB):,3.09e-71
Kurtosis:,4.095,Cond. No.,9.61


### [질문] 위 분석에서 와인의 품질(quality)에 가장 큰 영향을 미치는 독립변수는 무엇인가요?

---