# Multiple Linear Regression을 이용하여 p-value 계산하기
## p-value : 귀무가설을 가정하였을 때 표본 이상으로 극단적인 결과를 얻을 확률
## 귀무가설 : Multiple Linear Regression으로 회귀모델을 설계할 때 각 feature의 coefficient가 0이다 (즉, 해당 feature는 종속변수에 영향을 주지 않는다)
- 따라서 p-value가 작을수록(일반적으로 0.05 미만) 귀무가설을 기각할 수 있고, 해당 feature가 종속변수에 통계적으로 유의한 영향을 준다고 할 수 있다.

In [9]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import sklearn
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

import scipy.stats as stats
import urllib
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

In [10]:
# p-values for 20+ features have to be checked for every age group, so the data
# for each age group is stored in its own file; this loads the age-30 file.
df_30 = pd.read_csv('../data/innovation/Merge_data/df_age30_nowon.csv')

In [11]:
df_30.head()

Unnamed: 0,age30accum,age30leisure goods,age30leisure busi,age30culture,age30furniture,age30electronic,age30kitchen,age30fuel,age30optic,age30Appliances,...,LCLS_50_P,LCLS_60_P,LCLS_70_P,LCLS_80_P,pm10,pm25,humi,temp,CONTENT,rain
0,170.0,588.0,1435.0,481.0,8.0,0.0,0.0,1346.0,45.0,21.0,...,46787,5417,8331,6619,68.328472,35.256944,56.769284,16.578527,515,0.0
1,33.0,144.0,905.0,319.0,8.0,0.0,16.0,1198.0,70.0,95.0,...,38527,5108,9985,8507,72.005556,29.909028,57.838777,19.628631,888,0.0
2,74.0,86.0,875.0,270.0,0.0,0.0,12.0,953.0,40.0,86.0,...,36532,6031,7723,4392,72.838194,24.295833,61.681723,19.191452,856,0.1
3,62.0,85.0,877.0,155.0,0.0,0.0,0.0,1051.0,53.0,74.0,...,48128,5592,7261,4501,17.746352,10.958304,60.144545,14.073384,893,18.5
4,38.0,129.0,915.0,294.0,0.0,0.0,8.0,1019.0,58.0,56.0,...,57862,6960,7867,4693,9.92616,3.922644,77.163194,8.62,864,10.5


In [12]:
df_30.columns

Index(['age30accum', 'age30leisure goods', 'age30leisure busi', 'age30culture',
       'age30furniture', 'age30electronic', 'age30kitchen', 'age30fuel',
       'age30optic', 'age30Appliances', 'age30circul', 'age30cloth',
       'age30textile', 'age30stuff', 'age30book', 'age30affair',
       'age30car sell', 'age30car repair', 'age30medical',
       'age30public health', 'age30food', 'age30grocery',
       'age30repair survice', 'age30', 'LCLS_10_P', 'LCLS_20_P', 'LCLS_30_P',
       'LCLS_40_P', 'LCLS_50_P', 'LCLS_60_P', 'LCLS_70_P', 'LCLS_80_P', 'pm10',
       'pm25', 'humi', 'temp', 'CONTENT', 'rain'],
      dtype='object')

In [13]:
# Normalise the column labels so they can be written directly in the formula
# strings built by `pvalue` (labels containing spaces cannot be).
# The new labels are derived from the existing ones instead of assigning a
# hard-coded positional list: that way every label stays attached to its own
# column even if the CSV gains, loses or reorders columns, and re-running the
# cell is a no-op.
df_30.columns = df_30.columns.str.replace(' ', '_', regex=False)
# The source data misspells "service" in one label; fix that name explicitly.
df_30.rename(columns={'age30repair_survice': 'age30repair_service'}, inplace=True)

In [16]:
def pvalue(col_name, data=None):
    """Fit an OLS regression of one column on the weather/air-quality terms.

    The model regresses ``col_name`` on ``pm10``, ``pm25``, ``temp``, ``humi``,
    ``CONTENT`` and ``rain``, plus the interactions of ``CONTENT`` with
    ``temp``, ``rain`` and ``humi``.

    Args:
        col_name: Name of the dependent-variable column. It is inserted
            verbatim on the left-hand side of the formula, so it must be
            usable as a formula term (e.g. contain no spaces).
        data: DataFrame holding ``col_name`` and the predictor columns.
            Defaults to the notebook-level ``df_30`` so existing calls keep
            working; pass another frame to run the same model on a different
            data set.

    Returns:
        The object returned by ``.summary()`` on the fitted model; printing it
        shows the coefficient table including the ``P>|t|`` column.
    """
    if data is None:
        # Resolved at call time, exactly like the original global lookup.
        data = df_30
    predictors = (
        'pm10 + pm25 + temp + humi + CONTENT + rain'
        ' + CONTENT:temp + CONTENT:rain + CONTENT:humi'
    )
    formula = col_name + ' ~ ' + predictors
    lm = ols(formula, data=data).fit()
    return lm.summary()

# p-value확인

* 10 숙박

In [17]:
print(pvalue("age30accum")) # temp

                            OLS Regression Results                            
Dep. Variable:             age30accum   R-squared:                       0.312
Model:                            OLS   Adj. R-squared:                  0.294
Method:                 Least Squares   F-statistic:                     17.85
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.97e-24
Time:                        05:42:50   Log-Likelihood:                -1871.3
No. Observations:                 365   AIC:                             3763.
Df Residuals:                     355   BIC:                             3802.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      110.2793     20.897      5.277   

* 20 레저용품

In [50]:
print(pvalue("age30leisure_goods")) # pm10 pm25

                            OLS Regression Results                            
Dep. Variable:     age30leisure_goods   R-squared:                       0.385
Model:                            OLS   Adj. R-squared:                  0.369
Method:                 Least Squares   F-statistic:                     24.66
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           8.99e-33
Time:                        07:16:05   Log-Likelihood:                -2345.7
No. Observations:                 365   AIC:                             4711.
Df Residuals:                     355   BIC:                             4750.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      423.7932     76.668      5.528   

* 21 레저업소

In [21]:
print(pvalue("age30leisure_busi")) # pm25

                            OLS Regression Results                            
Dep. Variable:      age30leisure_busi   R-squared:                       0.391
Model:                            OLS   Adj. R-squared:                  0.375
Method:                 Least Squares   F-statistic:                     25.28
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.71e-33
Time:                        05:45:22   Log-Likelihood:                -2572.0
No. Observations:                 365   AIC:                             5164.
Df Residuals:                     355   BIC:                             5203.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1411.5806    142.507      9.905   

* 22 문화취미

In [22]:
print(pvalue('age30culture'))  # pm25

                            OLS Regression Results                            
Dep. Variable:           age30culture   R-squared:                       0.246
Model:                            OLS   Adj. R-squared:                  0.227
Method:                 Least Squares   F-statistic:                     12.90
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           8.08e-18
Time:                        05:45:33   Log-Likelihood:                -2449.9
No. Observations:                 365   AIC:                             4920.
Df Residuals:                     355   BIC:                             4959.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      672.9740    102.000      6.598   

* 30 가구

In [23]:
print(pvalue('age30furniture'))  # temp

                            OLS Regression Results                            
Dep. Variable:         age30furniture   R-squared:                       0.026
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     1.072
Date:                Tue, 10 Sep 2019   Prob (F-statistic):              0.383
Time:                        05:45:47   Log-Likelihood:                -1113.3
No. Observations:                 365   AIC:                             2247.
Df Residuals:                     355   BIC:                             2286.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        3.6546      2.619      1.395   

* 31 전기

In [24]:
print(pvalue('age30electronic'))  # pm10 pm25 humi

                            OLS Regression Results                            
Dep. Variable:        age30electronic   R-squared:                       0.057
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     2.390
Date:                Tue, 10 Sep 2019   Prob (F-statistic):             0.0123
Time:                        05:45:58   Log-Likelihood:                -933.27
No. Observations:                 365   AIC:                             1887.
Df Residuals:                     355   BIC:                             1926.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -3.2688      1.600     -2.044   

* 32 주방용구

In [25]:
print(pvalue("age30kitchen"))

                            OLS Regression Results                            
Dep. Variable:           age30kitchen   R-squared:                       0.046
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                     1.920
Date:                Tue, 10 Sep 2019   Prob (F-statistic):             0.0481
Time:                        05:46:13   Log-Likelihood:                -1263.0
No. Observations:                 365   AIC:                             2546.
Df Residuals:                     355   BIC:                             2585.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       12.2713      3.947      3.109   

* 33 연료

In [27]:
print(pvalue("age30fuel")) # temp CONTENT

                            OLS Regression Results                            
Dep. Variable:              age30fuel   R-squared:                       0.419
Model:                            OLS   Adj. R-squared:                  0.404
Method:                 Least Squares   F-statistic:                     28.39
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           5.18e-37
Time:                        05:46:22   Log-Likelihood:                -2428.4
No. Observations:                 365   AIC:                             4877.
Df Residuals:                     355   BIC:                             4916.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1563.5309     96.150     16.261   

* 34 광학

In [28]:
print(pvalue("age30optic"))  

                            OLS Regression Results                            
Dep. Variable:             age30optic   R-squared:                       0.162
Model:                            OLS   Adj. R-squared:                  0.140
Method:                 Least Squares   F-statistic:                     7.603
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           3.26e-10
Time:                        05:46:46   Log-Likelihood:                -1678.3
No. Observations:                 365   AIC:                             3377.
Df Residuals:                     355   BIC:                             3416.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       63.3439     12.315      5.144   

* 35 가전

In [29]:
print(pvalue("age30Appliances"))

                            OLS Regression Results                            
Dep. Variable:        age30Appliances   R-squared:                       0.077
Model:                            OLS   Adj. R-squared:                  0.054
Method:                 Least Squares   F-statistic:                     3.290
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           0.000722
Time:                        05:46:52   Log-Likelihood:                -1720.5
No. Observations:                 365   AIC:                             3461.
Df Residuals:                     355   BIC:                             3500.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       63.7197     13.828      4.608   

* 40 유통업

In [30]:
print(pvalue("age30circul")) # pm10, pm25 temp

                            OLS Regression Results                            
Dep. Variable:            age30circul   R-squared:                       0.456
Model:                            OLS   Adj. R-squared:                  0.442
Method:                 Least Squares   F-statistic:                     33.07
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           5.02e-42
Time:                        05:46:56   Log-Likelihood:                -3368.9
No. Observations:                 365   AIC:                             6758.
Df Residuals:                     355   BIC:                             6797.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     2.957e+04   1264.975     23.378   

* 42 의복

In [31]:
print(pvalue("age30cloth")) # pm10 pm25 rain

                            OLS Regression Results                            
Dep. Variable:             age30cloth   R-squared:                       0.208
Model:                            OLS   Adj. R-squared:                  0.188
Method:                 Least Squares   F-statistic:                     10.37
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           3.02e-14
Time:                        05:47:07   Log-Likelihood:                -2201.8
No. Observations:                 365   AIC:                             4424.
Df Residuals:                     355   BIC:                             4463.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      258.7977     51.685      5.007   

* 43 직물

In [33]:
print(pvalue("age30textile")) 

                            OLS Regression Results                            
Dep. Variable:           age30textile   R-squared:                       0.040
Model:                            OLS   Adj. R-squared:                  0.015
Method:                 Least Squares   F-statistic:                     1.623
Date:                Tue, 10 Sep 2019   Prob (F-statistic):              0.107
Time:                        05:47:28   Log-Likelihood:                -1273.5
No. Observations:                 365   AIC:                             2567.
Df Residuals:                     355   BIC:                             2606.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        8.0270      4.062      1.976   

* 44 신변잡화

In [34]:
print(pvalue("age30stuff"))  # rain

                            OLS Regression Results                            
Dep. Variable:             age30stuff   R-squared:                       0.164
Model:                            OLS   Adj. R-squared:                  0.143
Method:                 Least Squares   F-statistic:                     7.726
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           2.14e-10
Time:                        05:47:40   Log-Likelihood:                -1809.6
No. Observations:                 365   AIC:                             3639.
Df Residuals:                     355   BIC:                             3678.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      132.9164     17.646      7.532   

* 50 서적문구

In [35]:
print(pvalue("age30book"))  # pm25

                            OLS Regression Results                            
Dep. Variable:              age30book   R-squared:                       0.229
Model:                            OLS   Adj. R-squared:                  0.209
Method:                 Least Squares   F-statistic:                     11.70
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           3.83e-16
Time:                        05:47:49   Log-Likelihood:                -2346.2
No. Observations:                 365   AIC:                             4712.
Df Residuals:                     355   BIC:                             4751.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      622.0605     76.768      8.103   

* 52 사무통신

In [36]:
print(pvalue('age30affair'))  

                            OLS Regression Results                            
Dep. Variable:            age30affair   R-squared:                       0.057
Model:                            OLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     2.398
Date:                Tue, 10 Sep 2019   Prob (F-statistic):             0.0120
Time:                        05:47:58   Log-Likelihood:                -1285.2
No. Observations:                 365   AIC:                             2590.
Df Residuals:                     355   BIC:                             2629.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       10.0916      4.195      2.406   

* 60 자동차판매

In [37]:
print(pvalue("age30car_sell"))

                            OLS Regression Results                            
Dep. Variable:          age30car_sell   R-squared:                       0.024
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.9629
Date:                Tue, 10 Sep 2019   Prob (F-statistic):              0.471
Time:                        05:48:06   Log-Likelihood:                -823.42
No. Observations:                 365   AIC:                             1667.
Df Residuals:                     355   BIC:                             1706.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -0.3919      1.184     -0.331   

* 62 자동차정비

In [38]:
print(pvalue("age30car_repair"))  # pm10 pm25

                            OLS Regression Results                            
Dep. Variable:        age30car_repair   R-squared:                       0.132
Model:                            OLS   Adj. R-squared:                  0.110
Method:                 Least Squares   F-statistic:                     5.983
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           8.22e-08
Time:                        05:48:12   Log-Likelihood:                -1978.7
No. Observations:                 365   AIC:                             3977.
Df Residuals:                     355   BIC:                             4016.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      277.0346     28.050      9.876   

* 70 의료기관

In [39]:
print(pvalue("age30medical")) # temp

                            OLS Regression Results                            
Dep. Variable:           age30medical   R-squared:                       0.147
Model:                            OLS   Adj. R-squared:                  0.125
Method:                 Least Squares   F-statistic:                     6.781
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           5.37e-09
Time:                        05:48:22   Log-Likelihood:                -3286.2
No. Observations:                 365   AIC:                             6592.
Df Residuals:                     355   BIC:                             6631.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     5861.3173   1008.286      5.813   

* 71 보건위생

In [40]:
print(pvalue("age30public_health")) # pm25

                            OLS Regression Results                            
Dep. Variable:     age30public_health   R-squared:                       0.307
Model:                            OLS   Adj. R-squared:                  0.289
Method:                 Least Squares   F-statistic:                     17.46
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           6.21e-24
Time:                        05:48:42   Log-Likelihood:                -2613.2
No. Observations:                 365   AIC:                             5246.
Df Residuals:                     355   BIC:                             5285.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1539.9932    159.555      9.652   

* 80 요식업소

In [41]:
print(pvalue("age30food"))# pm10 pm25 temp rain

                            OLS Regression Results                            
Dep. Variable:              age30food   R-squared:                       0.398
Model:                            OLS   Adj. R-squared:                  0.382
Method:                 Least Squares   F-statistic:                     26.03
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           2.36e-34
Time:                        05:48:52   Log-Likelihood:                -3227.3
No. Observations:                 365   AIC:                             6475.
Df Residuals:                     355   BIC:                             6514.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1.716e+04    858.163     19.992   

* 81 음료식품

In [43]:
print(pvalue('age30grocery'))  # temp

                            OLS Regression Results                            
Dep. Variable:           age30grocery   R-squared:                       0.157
Model:                            OLS   Adj. R-squared:                  0.136
Method:                 Least Squares   F-statistic:                     7.371
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           7.17e-10
Time:                        05:49:05   Log-Likelihood:                -2662.2
No. Observations:                 365   AIC:                             5344.
Df Residuals:                     355   BIC:                             5383.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     3198.6737    182.460     17.531   

* 92 수리서비스

In [44]:
print(pvalue('age30repair_service')) # temp

                             OLS Regression Results                            
Dep. Variable:     age30repair_service   R-squared:                       0.064
Model:                             OLS   Adj. R-squared:                  0.041
Method:                  Least Squares   F-statistic:                     2.716
Date:                 Tue, 10 Sep 2019   Prob (F-statistic):            0.00451
Time:                         05:49:15   Log-Likelihood:                -2237.9
No. Observations:                  365   AIC:                             4496.
Df Residuals:                      355   BIC:                             4535.
Df Model:                            9                                         
Covariance Type:             nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      304.7117     57.062    