# Multi Linear Regression을 이용하여 p-value 계산하기
## p-value : 귀무가설을 가정하였을 때 표본 이상으로 극단적인 결과를 얻을 확률
## 귀무가설 : Multi Linear Regression을 사용하여 회귀모델을 설계할 때 각 feature의 coefficient가 0이 됨
- 따라서 p-value가 작을수록(귀무가설이 참일 때 이처럼 극단적인 결과를 얻을 확률이 낮을수록) 해당 feature의 coefficient가 0이 아니라고, 즉 해당 feature가 통계적으로 유의하다고 할 수 있다.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import sklearn
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

import scipy.stats as stats
import urllib
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

In [3]:
# p-values for roughly 20+ features have to be checked for every age group,
# so each age group is saved in a separate file.
df_50 = pd.read_csv('../data/innovation/Merge_data/df_age50_nowon.csv')

In [4]:
df_50.columns

Index(['age50accum', 'age50leisure goods', 'age50leisure busi', 'age50culture',
       'age50furniture', 'age50electronic', 'age50kitchen', 'age50fuel',
       'age50optic', 'age50Appliances', 'age50circul', 'age50cloth',
       'age50textile', 'age50stuff', 'age50book', 'age50affair',
       'age50car sell', 'age50car repair', 'age50medical',
       'age50public health', 'age50food', 'age50grocery',
       'age50repair survice', 'age50', 'LCLS_10_P', 'LCLS_20_P', 'LCLS_30_P',
       'LCLS_40_P', 'LCLS_50_P', 'LCLS_60_P', 'LCLS_70_P', 'LCLS_80_P', 'pm10',
       'pm25', 'humi', 'temp', 'CONTENT', 'rain'],
      dtype='object')

In [5]:
# Reassign the column names positionally (the order must match the CSV).
# Compared with the raw names, spaces become underscores and
# 'age50repair survice' is respelled 'age50repair_service'.
# NOTE(review): presumably done so that each name can be pasted verbatim
# into the regression formula string -- confirm.

# Per-category columns.
_category_cols = [
    'age50accum',
    'age50leisure_goods',
    'age50leisure_busi',
    'age50culture',
    'age50furniture',
    'age50electronic',
    'age50kitchen',
    'age50fuel',
    'age50optic',
    'age50Appliances',
    'age50circul',
    'age50cloth',
    'age50textile',
    'age50stuff',
    'age50book',
    'age50affair',
    'age50car_sell',
    'age50car_repair',
    'age50medical',
    'age50public_health',
    'age50food',
    'age50grocery',
    'age50repair_service',
]

# 'age50' followed by LCLS_10_P ... LCLS_80_P.
_lcls_cols = ['age50'] + ['LCLS_%d_P' % band for band in range(10, 90, 10)]

# Columns used on the right-hand side of the regression formula.
_predictor_cols = ['pm10', 'pm25', 'humi', 'temp', 'CONTENT', 'rain']

df_50.columns = _category_cols + _lcls_cols + _predictor_cols

In [6]:
def pvalue(col_name, data=None):
    """Fit an OLS regression of ``col_name`` and return its summary table.

    The response ``col_name`` is regressed on the main effects ``pm10``,
    ``pm25``, ``temp``, ``humi``, ``CONTENT`` and ``rain`` plus the
    interaction terms ``CONTENT:temp``, ``CONTENT:rain`` and
    ``CONTENT:humi``.

    Despite the name, this returns the full statsmodels summary (which
    contains the per-coefficient ``P>|t|`` column), not the bare p-values.

    Args:
        col_name: Name of the response column. It is concatenated directly
            into the formula string, so it must be a valid identifier in
            the formula language (no spaces).
        data: DataFrame to fit on. Defaults to the notebook-level ``df_50``
            so existing ``pvalue("...")`` calls keep working; pass another
            frame to reuse the same model for a different data set.

    Returns:
        The summary object produced by the fitted model's ``summary()``.
    """
    if data is None:
        # Fall back to the frame loaded earlier in the notebook.
        data = df_50

    predictors = (
        'pm10 + pm25 + temp + humi + CONTENT + rain'
        ' + CONTENT:temp + CONTENT:rain + CONTENT:humi'
    )
    formula = col_name + ' ~ ' + predictors
    lm = ols(formula, data).fit()
    return lm.summary()

# p-value확인

* 10 숙박

In [7]:
print(pvalue("age50accum")) # pm10 pm25

                            OLS Regression Results                            
Dep. Variable:             age50accum   R-squared:                       0.132
Model:                            OLS   Adj. R-squared:                  0.110
Method:                 Least Squares   F-statistic:                     5.998
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           7.81e-08
Time:                        05:52:23   Log-Likelihood:                -1541.9
No. Observations:                 365   AIC:                             3104.
Df Residuals:                     355   BIC:                             3143.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       53.2948      8.476      6.288   

* 20 레저용품

In [9]:
print(pvalue("age50leisure_goods")) # pm10 pm25

                            OLS Regression Results                            
Dep. Variable:     age50leisure_goods   R-squared:                       0.234
Model:                            OLS   Adj. R-squared:                  0.215
Method:                 Least Squares   F-statistic:                     12.06
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.21e-16
Time:                        05:52:39   Log-Likelihood:                -1939.8
No. Observations:                 365   AIC:                             3900.
Df Residuals:                     355   BIC:                             3939.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      260.8802     25.211     10.348   

* 21 레저업소

In [10]:
print(pvalue("age50leisure_busi")) # pm25

                            OLS Regression Results                            
Dep. Variable:      age50leisure_busi   R-squared:                       0.354
Model:                            OLS   Adj. R-squared:                  0.338
Method:                 Least Squares   F-statistic:                     21.65
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           3.45e-29
Time:                        05:52:51   Log-Likelihood:                -2403.5
No. Observations:                 365   AIC:                             4827.
Df Residuals:                     355   BIC:                             4866.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1061.8482     89.820     11.822   

* 22 문화취미

In [12]:
print(pvalue('age50culture'))

                            OLS Regression Results                            
Dep. Variable:           age50culture   R-squared:                       0.273
Model:                            OLS   Adj. R-squared:                  0.254
Method:                 Least Squares   F-statistic:                     14.80
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           2.03e-20
Time:                        05:54:04   Log-Likelihood:                -2302.7
No. Observations:                 365   AIC:                             4625.
Df Residuals:                     355   BIC:                             4664.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      456.0919     68.145      6.693   

* 30 가구

In [14]:
print(pvalue('age50furniture')) 

                            OLS Regression Results                            
Dep. Variable:         age50furniture   R-squared:                       0.052
Model:                            OLS   Adj. R-squared:                  0.027
Method:                 Least Squares   F-statistic:                     2.143
Date:                Tue, 10 Sep 2019   Prob (F-statistic):             0.0255
Time:                        07:07:07   Log-Likelihood:                -1371.1
No. Observations:                 365   AIC:                             2762.
Df Residuals:                     355   BIC:                             2801.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       12.1019      5.309      2.280   

* 31 전기

In [15]:
print(pvalue('age50electronic'))  

                            OLS Regression Results                            
Dep. Variable:        age50electronic   R-squared:                       0.024
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9811
Date:                Tue, 10 Sep 2019   Prob (F-statistic):              0.455
Time:                        07:07:25   Log-Likelihood:                -1222.5
No. Observations:                 365   AIC:                             2465.
Df Residuals:                     355   BIC:                             2504.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        2.6134      3.533      0.740   

* 32 주방용구

In [16]:
print(pvalue("age50kitchen")) # pm10

                            OLS Regression Results                            
Dep. Variable:           age50kitchen   R-squared:                       0.043
Model:                            OLS   Adj. R-squared:                  0.018
Method:                 Least Squares   F-statistic:                     1.761
Date:                Tue, 10 Sep 2019   Prob (F-statistic):             0.0745
Time:                        07:07:32   Log-Likelihood:                -1428.1
No. Observations:                 365   AIC:                             2876.
Df Residuals:                     355   BIC:                             2915.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        1.3592      6.205      0.219   

* 33 연료

In [17]:
print(pvalue("age50fuel")) # temp rain

                            OLS Regression Results                            
Dep. Variable:              age50fuel   R-squared:                       0.201
Model:                            OLS   Adj. R-squared:                  0.180
Method:                 Least Squares   F-statistic:                     9.900
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.43e-13
Time:                        07:07:45   Log-Likelihood:                -2437.7
No. Observations:                 365   AIC:                             4895.
Df Residuals:                     355   BIC:                             4934.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     2367.5999     98.648     24.000   

* 34 광학

In [18]:
print(pvalue("age50optic"))  

                            OLS Regression Results                            
Dep. Variable:             age50optic   R-squared:                       0.185
Model:                            OLS   Adj. R-squared:                  0.164
Method:                 Least Squares   F-statistic:                     8.943
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           3.51e-12
Time:                        07:07:57   Log-Likelihood:                -1726.0
No. Observations:                 365   AIC:                             3472.
Df Residuals:                     355   BIC:                             3511.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       16.6516     14.037      1.186   

* 35 가전

In [19]:
print(pvalue("age50Appliances"))  # temp

                            OLS Regression Results                            
Dep. Variable:        age50Appliances   R-squared:                       0.114
Model:                            OLS   Adj. R-squared:                  0.091
Method:                 Least Squares   F-statistic:                     5.068
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.88e-06
Time:                        07:08:07   Log-Likelihood:                -1872.0
No. Observations:                 365   AIC:                             3764.
Df Residuals:                     355   BIC:                             3803.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      116.5556     20.936      5.567   

* 40 유통업

In [21]:
print(pvalue("age50circul")) # pm10, pm25 <<temp

                            OLS Regression Results                            
Dep. Variable:            age50circul   R-squared:                       0.289
Model:                            OLS   Adj. R-squared:                  0.271
Method:                 Least Squares   F-statistic:                     16.01
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           4.99e-22
Time:                        07:08:21   Log-Likelihood:                -3327.8
No. Observations:                 365   AIC:                             6676.
Df Residuals:                     355   BIC:                             6715.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     2.606e+04   1130.271     23.053   

* 42 의복

In [22]:
print(pvalue("age50cloth")) # rain

                            OLS Regression Results                            
Dep. Variable:             age50cloth   R-squared:                       0.256
Model:                            OLS   Adj. R-squared:                  0.237
Method:                 Least Squares   F-statistic:                     13.54
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.07e-18
Time:                        07:08:33   Log-Likelihood:                -2136.0
No. Observations:                 365   AIC:                             4292.
Df Residuals:                     355   BIC:                             4331.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      316.3248     43.158      7.330   

* 43 직물

In [23]:
print(pvalue("age50textile")) 

                            OLS Regression Results                            
Dep. Variable:           age50textile   R-squared:                       0.044
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     1.809
Date:                Tue, 10 Sep 2019   Prob (F-statistic):             0.0654
Time:                        07:08:43   Log-Likelihood:                -1463.1
No. Observations:                 365   AIC:                             2946.
Df Residuals:                     355   BIC:                             2985.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       19.6435      6.829      2.876   

* 44 신변잡화

In [25]:
print(pvalue("age50stuff")) 

                            OLS Regression Results                            
Dep. Variable:             age50stuff   R-squared:                       0.054
Model:                            OLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     2.252
Date:                Tue, 10 Sep 2019   Prob (F-statistic):             0.0185
Time:                        07:08:56   Log-Likelihood:                -1732.1
No. Observations:                 365   AIC:                             3484.
Df Residuals:                     355   BIC:                             3523.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      108.9741     14.273      7.635   

* 50 서적문구

In [26]:
print(pvalue("age50book"))  # pm25

                            OLS Regression Results                            
Dep. Variable:              age50book   R-squared:                       0.188
Model:                            OLS   Adj. R-squared:                  0.168
Method:                 Least Squares   F-statistic:                     9.150
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.75e-12
Time:                        07:09:05   Log-Likelihood:                -2493.4
No. Observations:                 365   AIC:                             5007.
Df Residuals:                     355   BIC:                             5046.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      528.7515    114.905      4.602   

* 52 사무통신

In [27]:
print(pvalue('age50affair'))  

                            OLS Regression Results                            
Dep. Variable:            age50affair   R-squared:                       0.118
Model:                            OLS   Adj. R-squared:                  0.096
Method:                 Least Squares   F-statistic:                     5.275
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           9.25e-07
Time:                        07:09:38   Log-Likelihood:                -1430.3
No. Observations:                 365   AIC:                             2881.
Df Residuals:                     355   BIC:                             2920.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       13.9138      6.242      2.229   

* 60 자동차판매

In [28]:
print(pvalue("age50car_sell"))

                            OLS Regression Results                            
Dep. Variable:          age50car_sell   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                 -0.009
Method:                 Least Squares   F-statistic:                    0.6407
Date:                Tue, 10 Sep 2019   Prob (F-statistic):              0.762
Time:                        07:09:44   Log-Likelihood:                -936.84
No. Observations:                 365   AIC:                             1894.
Df Residuals:                     355   BIC:                             1933.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        0.1770      1.615      0.110   

* 62 자동차정비

In [30]:
print(pvalue("age50car_repair"))

                            OLS Regression Results                            
Dep. Variable:        age50car_repair   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.047
Method:                 Least Squares   F-statistic:                     2.991
Date:                Tue, 10 Sep 2019   Prob (F-statistic):            0.00189
Time:                        07:09:59   Log-Likelihood:                -2068.8
No. Observations:                 365   AIC:                             4158.
Df Residuals:                     355   BIC:                             4197.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      251.2878     35.897      7.000   

* 70 의료기관

In [31]:
print(pvalue("age50medical")) 

                            OLS Regression Results                            
Dep. Variable:           age50medical   R-squared:                       0.184
Model:                            OLS   Adj. R-squared:                  0.164
Method:                 Least Squares   F-statistic:                     8.920
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           3.79e-12
Time:                        07:10:05   Log-Likelihood:                -3358.7
No. Observations:                 365   AIC:                             6737.
Df Residuals:                     355   BIC:                             6776.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     4911.4948   1229.834      3.994   

* 71 보건위생

In [32]:
print(pvalue("age50public_health")) # pm10 pm25 temp humi

                            OLS Regression Results                            
Dep. Variable:     age50public_health   R-squared:                       0.350
Model:                            OLS   Adj. R-squared:                  0.333
Method:                 Least Squares   F-statistic:                     21.22
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.14e-28
Time:                        07:10:19   Log-Likelihood:                -2533.1
No. Observations:                 365   AIC:                             5086.
Df Residuals:                     355   BIC:                             5125.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1558.5567    128.115     12.165   

* 80 요식업소

In [33]:
print(pvalue("age50food"))# pm10 pm25 temp

                            OLS Regression Results                            
Dep. Variable:              age50food   R-squared:                       0.196
Model:                            OLS   Adj. R-squared:                  0.176
Method:                 Least Squares   F-statistic:                     9.628
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           3.54e-13
Time:                        07:10:40   Log-Likelihood:                -3103.5
No. Observations:                 365   AIC:                             6227.
Df Residuals:                     355   BIC:                             6266.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1.471e+04    611.257     24.072   

* 81 음료식품

In [34]:
print(pvalue('age50grocery'))  # temp humi CONTENT

                            OLS Regression Results                            
Dep. Variable:           age50grocery   R-squared:                       0.088
Model:                            OLS   Adj. R-squared:                  0.065
Method:                 Least Squares   F-statistic:                     3.818
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           0.000127
Time:                        07:10:54   Log-Likelihood:                -2742.9
No. Observations:                 365   AIC:                             5506.
Df Residuals:                     355   BIC:                             5545.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     2976.4068    227.590     13.078   

* 92 수리서비스

In [35]:
print(pvalue('age50repair_service')) # temp

                             OLS Regression Results                            
Dep. Variable:     age50repair_service   R-squared:                       0.107
Model:                             OLS   Adj. R-squared:                  0.084
Method:                  Least Squares   F-statistic:                     4.722
Date:                 Tue, 10 Sep 2019   Prob (F-statistic):           6.08e-06
Time:                         07:11:10   Log-Likelihood:                -2197.4
No. Observations:                  365   AIC:                             4415.
Df Residuals:                      355   BIC:                             4454.
Df Model:                            9                                         
Covariance Type:             nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      273.2330     51.068    