In [13]:
import pandas as pd
from statsmodels.formula.api import ols

In [14]:
# Load the wine-quality CSV (first row is the header) and replace spaces in
# the column labels with underscores so the names are valid in model formulas.
wine = (
    pd.read_csv('winequality-both.csv', sep=',', header=0)
    .rename(columns=lambda label: label.replace(' ', '_'))
)

* 용어 정리
    * 선형회귀분석(Linear Regression Analysis): 선형데이터간의 관계 분석, 선형데이터 값을 예측하기 위한 분석을 지원
    * 선형데이터: 값이 증가 하거나 감소하는 값 (아파트값, 와인품질, 주식, ...)
    * 종속변수 (Dependent Variable): 예측하고자 하는 값 (여기서는 선형 데이터), 독립변수에 영향을 받는다.
    * 독립변수 (Independent Variable): 예측하고자 하는 값 (종속변수)에 영향을 미치는 외부적인 요소

### 독립변수 분석

In [15]:
# List the distinct quality scores present in the data.
wine['quality'].unique()

array([5, 6, 7, 4, 8, 3, 9], dtype=int64)

In [16]:
# Preview the first rows of the frame (head() shows five rows by default).
wine.head()

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,red,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,red,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,red,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,red,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,red,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [17]:
# Dimensions of the frame as (number of rows, number of columns).
wine.shape

(6497, 13)

In [18]:
# Formula describing the linear model:
#   dependent_variable ~ predictor_1 + predictor_2 + ... + predictor_N
# The literal is split across lines with implicit string concatenation; the
# resulting string is a single-line formula.
my_formula = (
    'quality ~ alcohol + chlorides + citric_acid + density'
    ' + fixed_acidity + free_sulfur_dioxide + pH + residual_sugar'
    ' + sulphates + total_sulfur_dioxide + volatile_acidity'
)
# ols(...) builds the ordinary-least-squares model from the formula and data;
# .fit() estimates the coefficients and returns the fitted result.
lm = ols(formula=my_formula, data=wine).fit()
# `lm` now holds the fitted linear regression model.

In [19]:
lm.summary()

0,1,2,3
Dep. Variable:,quality,R-squared:,0.292
Model:,OLS,Adj. R-squared:,0.291
Method:,Least Squares,F-statistic:,243.3
Date:,"Fri, 09 Aug 2024",Prob (F-statistic):,0.0
Time:,11:33:07,Log-Likelihood:,-7215.5
No. Observations:,6497,AIC:,14450.0
Df Residuals:,6485,BIC:,14540.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,55.7627,11.894,4.688,0.000,32.447,79.079
alcohol,0.2670,0.017,15.963,0.000,0.234,0.300
chlorides,-0.4837,0.333,-1.454,0.146,-1.136,0.168
citric_acid,-0.1097,0.080,-1.377,0.168,-0.266,0.046
density,-54.9669,12.137,-4.529,0.000,-78.760,-31.173
fixed_acidity,0.0677,0.016,4.346,0.000,0.037,0.098
free_sulfur_dioxide,0.0060,0.001,7.948,0.000,0.004,0.007
pH,0.4393,0.090,4.861,0.000,0.262,0.616
residual_sugar,0.0436,0.005,8.449,0.000,0.033,0.054

0,1,2,3
Omnibus:,144.075,Durbin-Watson:,1.646
Prob(Omnibus):,0.0,Jarque-Bera (JB):,324.712
Skew:,-0.006,Prob(JB):,3.09e-71
Kurtosis:,4.095,Cond. No.,249000.0


### 확인해야 할 사항

#### 핵심사항

* Dep. Variable: 종속변수
* No. Observations: 관측개수
* R-squared(결정계수): 독립변수들이 종속변수의 변동(분산)을 설명하는 비율 (0~1 사이의 값). 예를 들어 0.3인 경우 독립변수가 종속변수 변동의 30% 정도를 설명.
    * 1에 가까울 수록 모델이 종속 변수를 잘 반영했다고 해석
    * 일반적으로 최소 0.2(20%) 이상 수치가 유의미한 수치로 받아들임
* Prob (F-statistic): 모델 전체에 대한 F-검정의 p-value로, 여기서는 모델의 신뢰도를 평가하는 데 사용. 0.05보다 작으면 모델을 통계적으로 유의한(신뢰할 수 있는) 모델로 판단.
* coef(회귀계수): 다른 독립 변수는 고정되어 있고 특정 독립변수가 1단위 변할 때 종속변수가 변화하는 평균
* Df Model: 독립변수 개수
* Intercept: y절편 => 고정 상수 (내부 선형회귀모델에서 사용: 참고용)

#### 기타 참고사항
* 절대적인 기준 값이 없음
* 기존 분석 결과에서 새로운 데이터 수집 또는 전처리 전후의 값으로 통계모델 분석력을 참고하는 보조지표로 활용

#### 예)

* F-statistic: 모델의 적합도 평가. 클 수록 신뢰도가 올라감 
* Log-Likelihood: 모델의 적합도 평가. 클 수록 신뢰도가 올라감 
* AIC/BIC: 모델의 적합도 평가. 낮을 수록 신뢰도가 올라감 
* Df Residuals: 잔차의 자유도 (관측 개수 - 독립변수 개수 - 1, 여기서는 6497 - 11 - 1 = 6485). 적합도 자체를 나타내는 지표라기보다는 모델 복잡도 대비 데이터의 양을 보여 주는 참고값
* std err: 회귀계수 추정치의 표준오차, 낮을 수록 신뢰도가 올라감

---

### 데이터 정규화, 표준화 (Data Standardization)
* 서로 다른 독립변수의 값의 Scale을 표준화하는 작업

In [20]:
# Show only the first row as a reminder of the columns and their raw scales.
wine.head(1)

Unnamed: 0,type,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,red,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns, to compare the value ranges of the different variables.
wine.describe()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,7.215307,0.339666,0.318633,5.443235,0.056034,30.525319,115.744574,0.994697,3.218501,0.531268,10.491801,5.818378
std,1.296434,0.164636,0.145318,4.757804,0.035034,17.7494,56.521855,0.002999,0.160787,0.148806,1.192712,0.873255
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.72,0.22,8.0,3.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,77.0,0.99234,3.11,0.43,9.5,5.0
50%,7.0,0.29,0.31,3.0,0.047,29.0,118.0,0.99489,3.21,0.51,10.3,6.0
75%,7.7,0.4,0.39,8.1,0.065,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,15.9,1.58,1.66,65.8,0.611,289.0,440.0,1.03898,4.01,2.0,14.9,9.0


In [22]:
# Target of the regression: the 'quality' column, kept on its original scale.
dependent_variable = wine.loc[:, 'quality']

In [23]:
# Predictor columns: everything except 'quality' (the target) and 'type'.
# Index.difference returns the remaining labels in sorted order.
independent_variables2 = wine.loc[:, wine.columns.difference(['quality', 'type'])]
independent_variables2.head(n=1)

Unnamed: 0,alcohol,chlorides,citric_acid,density,fixed_acidity,free_sulfur_dioxide,pH,residual_sugar,sulphates,total_sulfur_dioxide,volatile_acidity
0,9.4,0.076,0.0,0.9978,7.4,11.0,3.51,1.9,0.56,34.0,0.7


* 표준화 공식: (열 데이터 - 열 데이터의 평균) / 열 데이터의 표준편차

In [25]:
# Standardize each predictor column: subtract the column mean, then divide by
# the column standard deviation (as computed by DataFrame.std()).
independent_variables_standardized = (
    independent_variables2
    .sub(independent_variables2.mean())
    .div(independent_variables2.std())
)

In [26]:
# Place the unscaled target next to the standardized predictors, column-wise,
# then preview the first five rows of the combined frame.
wine_standardized = pd.concat(
    [dependent_variable, independent_variables_standardized],
    axis='columns',
)
wine_standardized.head(n=5)

Unnamed: 0,quality,alcohol,chlorides,citric_acid,density,fixed_acidity,free_sulfur_dioxide,pH,residual_sugar,sulphates,total_sulfur_dioxide,volatile_acidity
0,5,-0.915394,0.569914,-2.192664,1.034913,0.142462,-1.100055,1.81295,-0.744721,0.193082,-1.446247,2.188664
1,5,-0.580023,1.197883,-2.192664,0.701432,0.451001,-0.311296,-0.115064,-0.597594,0.999502,-0.862402,3.281982
2,5,-0.580023,1.026618,-1.917405,0.768128,0.451001,-0.874695,0.2581,-0.660648,0.797897,-1.092402,2.553104
3,6,-0.580023,0.54137,1.660957,1.101609,3.07358,-0.762016,-0.36384,-0.744721,0.327485,-0.986248,-0.362411
4,5,-0.915394,0.569914,-2.192664,1.034913,0.142462,-1.100055,1.81295,-0.744721,0.193082,-1.446247,2.188664


In [27]:
# Sanity check: each standardized predictor should show mean ~0 and std 1,
# while 'quality' keeps its original scale.
wine_standardized.describe()

Unnamed: 0,quality,alcohol,chlorides,citric_acid,density,fixed_acidity,free_sulfur_dioxide,pH,residual_sugar,sulphates,total_sulfur_dioxide,volatile_acidity
count,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0,6497.0
mean,5.818378,6.386901e-16,3.499672e-17,0.0,-3.534668e-15,-4.89954e-16,-8.749179e-17,2.712246e-15,3.499672e-17,-5.599475e-16,-6.999344e-17,1.049902e-16
std,0.873255,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,3.0,-2.089189,-1.342536,-2.192664,-2.529997,-2.634386,-1.663455,-3.100376,-1.017956,-2.091774,-1.941631,-1.577208
25%,5.0,-0.8315512,-0.514759,-0.472297,-0.7858922,-0.6288845,-0.7620156,-0.6748102,-0.7657389,-0.6805395,-0.6854795,-0.66611
50%,6.0,-0.1608107,-0.2578628,-0.059409,0.06448391,-0.1660764,-0.08593639,-0.05287017,-0.5135217,-0.1429263,0.0399036,-0.3016707
75%,6.0,0.6776148,0.2559297,0.491108,0.7647937,0.3738663,0.5901428,0.6312639,0.5584015,0.4618885,0.7122099,0.366468
max,9.0,3.695947,15.84097,9.23057,14.76765,6.69891,14.56245,4.92265,12.68585,9.870119,5.736815,7.533774


In [19]:
# Refit the same formula on the standardized predictors so the coefficient
# magnitudes are expressed per one standard deviation of each predictor.
# NOTE(review): the saved execution count and output date of this cell are
# older than those of the cells above it; re-run with Restart & Run All to
# refresh the stored output.
lm_standardized = ols(formula=my_formula, data=wine_standardized).fit()
lm_standardized.summary()

0,1,2,3
Dep. Variable:,quality,R-squared:,0.292
Model:,OLS,Adj. R-squared:,0.291
Method:,Least Squares,F-statistic:,243.3
Date:,"Fri, 13 Jan 2023",Prob (F-statistic):,0.0
Time:,14:58:11,Log-Likelihood:,-7215.5
No. Observations:,6497,AIC:,14450.0
Df Residuals:,6485,BIC:,14540.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.8184,0.009,637.785,0.000,5.800,5.836
alcohol,0.3185,0.020,15.963,0.000,0.279,0.358
chlorides,-0.0169,0.012,-1.454,0.146,-0.040,0.006
citric_acid,-0.0159,0.012,-1.377,0.168,-0.039,0.007
density,-0.1648,0.036,-4.529,0.000,-0.236,-0.093
fixed_acidity,0.0877,0.020,4.346,0.000,0.048,0.127
free_sulfur_dioxide,0.1060,0.013,7.948,0.000,0.080,0.132
pH,0.0706,0.015,4.861,0.000,0.042,0.099
residual_sugar,0.2072,0.025,8.449,0.000,0.159,0.255

0,1,2,3
Omnibus:,144.075,Durbin-Watson:,1.646
Prob(Omnibus):,0.0,Jarque-Bera (JB):,324.712
Skew:,-0.006,Prob(JB):,3.09e-71
Kurtosis:,4.095,Cond. No.,9.61


### 질문] 위 분석에서 와인의 품질 속성에 가장 영향을 미치는 독립변수는 무엇인가요?

---