# Multi Linear Regression을 이용하여 p-value 계산하기
## p-value : 귀무가설을 가정하였을 때 표본 이상으로 극단적인 결과를 얻을 확률
## 귀무가설 : Multi Linear Regression을 사용하여 회귀모델을 설계할 때 각 feature의 coefficient가 0이 됨
- 따라서 p-value가 작을수록(일반적으로 0.05 미만) 귀무가설을 기각할 수 있으며, 해당 feature가 통계적으로 유의(안정적)하다고 할 수 있다.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import sklearn
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

import scipy.stats as stats
import urllib
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

In [3]:
# p-values have to be checked for roughly 20+ features in every age group,
# so each age group is kept in its own file.
# NOTE(review): judging by the file name this is the 40s age group for Nowon — confirm.
df_40 = pd.read_csv('../data/innovation/Merge_data/df_age40_nowon.csv')

In [4]:
df_40.columns

Index(['age40accum', 'age40leisure goods', 'age40leisure busi', 'age40culture',
       'age40furniture', 'age40electronic', 'age40kitchen', 'age40fuel',
       'age40optic', 'age40Appliances', 'age40circul', 'age40cloth',
       'age40textile', 'age40stuff', 'age40book', 'age40affair',
       'age40car sell', 'age40car repair', 'age40medical',
       'age40public health', 'age40food', 'age40grocery',
       'age40repair survice', 'age40', 'LCLS_10_P', 'LCLS_20_P', 'LCLS_30_P',
       'LCLS_40_P', 'LCLS_50_P', 'LCLS_60_P', 'LCLS_70_P', 'LCLS_80_P', 'pm10',
       'pm25', 'humi', 'temp', 'CONTENT', 'rain'],
      dtype='object')

In [6]:
# Normalise the column labels so they can be spliced verbatim into a model
# formula: spaces become underscores and the "survice" misspelling is fixed.
# Renaming by name (rather than assigning a positional list of all 38 labels)
# keeps every label attached to its own column even if the CSV column order
# changes, and leaves any column not listed here untouched.
_COLUMN_RENAMES = {
    'age40leisure goods': 'age40leisure_goods',
    'age40leisure busi': 'age40leisure_busi',
    'age40car sell': 'age40car_sell',
    'age40car repair': 'age40car_repair',
    'age40public health': 'age40public_health',
    'age40repair survice': 'age40repair_service',
}
df_40.columns = [_COLUMN_RENAMES.get(name, name) for name in df_40.columns]

In [7]:
def pvalue(col_name, data=None):
    """Fit an OLS model of one column on the weather and air-quality terms.

    The response ``col_name`` is regressed on pm10, pm25, temp, humi, CONTENT
    and rain, plus the CONTENT:temp, CONTENT:rain and CONTENT:humi
    interaction terms.

    Args:
        col_name: Name of the response column. It is concatenated into the
            formula string verbatim, so it must be usable as a formula term
            as written (for example, no spaces).
        data: Data frame holding the response and predictor columns. When
            omitted, the module-level ``df_40`` is used, so existing
            single-argument calls behave exactly as before.

    Returns:
        The summary object of the fitted model; printing it shows the
        coefficient table, including the ``P>|t|`` column.
    """
    if data is None:
        # Resolve the default at call time so the current global frame
        # (after any column renaming) is the one that gets used.
        data = df_40
    formula = col_name + ' ~ pm10 + pm25 + temp + humi + CONTENT + rain + CONTENT:temp + CONTENT:rain + CONTENT:humi'
    lm = ols(formula, data).fit()
    return lm.summary()

# p-value 확인

* 10 숙박

In [8]:
print(pvalue("age40accum"))

                            OLS Regression Results                            
Dep. Variable:             age40accum   R-squared:                       0.127
Model:                            OLS   Adj. R-squared:                  0.105
Method:                 Least Squares   F-statistic:                     5.758
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.78e-07
Time:                        06:49:25   Log-Likelihood:                -1581.0
No. Observations:                 365   AIC:                             3182.
Df Residuals:                     355   BIC:                             3221.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       73.6517      9.435      7.806   

* 20 레저용품

In [11]:
print(pvalue("age40leisure_goods")) # pm10 pm25

                            OLS Regression Results                            
Dep. Variable:     age40leisure_goods   R-squared:                       0.435
Model:                            OLS   Adj. R-squared:                  0.421
Method:                 Least Squares   F-statistic:                     30.38
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           3.57e-39
Time:                        06:56:43   Log-Likelihood:                -2276.5
No. Observations:                 365   AIC:                             4573.
Df Residuals:                     355   BIC:                             4612.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      415.9408     63.424      6.558   

* 21 레저업소

In [12]:
print(pvalue("age40leisure_busi")) # pm10 pm25 temp

                            OLS Regression Results                            
Dep. Variable:      age40leisure_busi   R-squared:                       0.404
Model:                            OLS   Adj. R-squared:                  0.389
Method:                 Least Squares   F-statistic:                     26.78
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           3.29e-35
Time:                        06:56:50   Log-Likelihood:                -2469.3
No. Observations:                 365   AIC:                             4959.
Df Residuals:                     355   BIC:                             4998.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1320.6461    107.566     12.278   

* 22 문화취미

In [13]:
print(pvalue('age40culture'))  # pm25 temp humi

                            OLS Regression Results                            
Dep. Variable:           age40culture   R-squared:                       0.311
Model:                            OLS   Adj. R-squared:                  0.294
Method:                 Least Squares   F-statistic:                     17.81
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           2.20e-24
Time:                        06:57:05   Log-Likelihood:                -2586.6
No. Observations:                 365   AIC:                             5193.
Df Residuals:                     355   BIC:                             5232.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1014.5932    148.315      6.841   

* 30 가구

In [15]:
print(pvalue('age40furniture')) 

                            OLS Regression Results                            
Dep. Variable:         age40furniture   R-squared:                       0.047
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                     1.924
Date:                Tue, 10 Sep 2019   Prob (F-statistic):             0.0475
Time:                        07:02:17   Log-Likelihood:                -1443.0
No. Observations:                 365   AIC:                             2906.
Df Residuals:                     355   BIC:                             2945.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       15.7954      6.465      2.443   

* 31 전기

In [16]:
print(pvalue('age40electronic'))  

                            OLS Regression Results                            
Dep. Variable:        age40electronic   R-squared:                       0.053
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     2.200
Date:                Tue, 10 Sep 2019   Prob (F-statistic):             0.0216
Time:                        07:02:38   Log-Likelihood:                -1165.0
No. Observations:                 365   AIC:                             2350.
Df Residuals:                     355   BIC:                             2389.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        3.2416      3.018      1.074   

* 32 주방용구

In [17]:
print(pvalue("age40kitchen"))

                            OLS Regression Results                            
Dep. Variable:           age40kitchen   R-squared:                       0.024
Model:                            OLS   Adj. R-squared:                 -0.000
Method:                 Least Squares   F-statistic:                    0.9837
Date:                Tue, 10 Sep 2019   Prob (F-statistic):              0.453
Time:                        07:02:47   Log-Likelihood:                -1316.9
No. Observations:                 365   AIC:                             2654.
Df Residuals:                     355   BIC:                             2693.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept        7.9001      4.575      1.727   

* 33 연료

In [18]:
print(pvalue("age40fuel")) # temp  rain

                            OLS Regression Results                            
Dep. Variable:              age40fuel   R-squared:                       0.371
Model:                            OLS   Adj. R-squared:                  0.355
Method:                 Least Squares   F-statistic:                     23.28
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           3.77e-31
Time:                        07:02:52   Log-Likelihood:                -2452.7
No. Observations:                 365   AIC:                             4925.
Df Residuals:                     355   BIC:                             4964.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1929.8917    102.773     18.778   

* 34 광학

In [19]:
print(pvalue("age40optic"))  # pm25

                            OLS Regression Results                            
Dep. Variable:             age40optic   R-squared:                       0.210
Model:                            OLS   Adj. R-squared:                  0.189
Method:                 Least Squares   F-statistic:                     10.46
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           2.27e-14
Time:                        07:03:06   Log-Likelihood:                -1754.2
No. Observations:                 365   AIC:                             3528.
Df Residuals:                     355   BIC:                             3567.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       51.5077     15.161      3.397   

* 35 가전

In [20]:
print(pvalue("age40Appliances")) # temp

                            OLS Regression Results                            
Dep. Variable:        age40Appliances   R-squared:                       0.140
Model:                            OLS   Adj. R-squared:                  0.118
Method:                 Least Squares   F-statistic:                     6.425
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.82e-08
Time:                        07:03:24   Log-Likelihood:                -1889.9
No. Observations:                 365   AIC:                             3800.
Df Residuals:                     355   BIC:                             3839.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      133.1414     21.991      6.054   

* 40 유통업

In [23]:
print(pvalue("age40circul")) # pm10, pm25 temp

                            OLS Regression Results                            
Dep. Variable:            age40circul   R-squared:                       0.389
Model:                            OLS   Adj. R-squared:                  0.374
Method:                 Least Squares   F-statistic:                     25.12
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           2.63e-33
Time:                        07:04:11   Log-Likelihood:                -3517.6
No. Observations:                 365   AIC:                             7055.
Df Residuals:                     355   BIC:                             7094.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     3.818e+04   1900.992     20.086   

* 42 의복

In [25]:
print(pvalue("age40cloth")) # rain

                            OLS Regression Results                            
Dep. Variable:             age40cloth   R-squared:                       0.093
Model:                            OLS   Adj. R-squared:                  0.070
Method:                 Least Squares   F-statistic:                     4.042
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           6.02e-05
Time:                        07:04:21   Log-Likelihood:                -2309.3
No. Observations:                 365   AIC:                             4639.
Df Residuals:                     355   BIC:                             4678.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      313.0453     69.382      4.512   

* 43 직물

In [26]:
print(pvalue("age40textile")) # temp

                            OLS Regression Results                            
Dep. Variable:           age40textile   R-squared:                       0.049
Model:                            OLS   Adj. R-squared:                  0.025
Method:                 Least Squares   F-statistic:                     2.047
Date:                Tue, 10 Sep 2019   Prob (F-statistic):             0.0336
Time:                        07:04:32   Log-Likelihood:                -1414.7
No. Observations:                 365   AIC:                             2849.
Df Residuals:                     355   BIC:                             2888.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       16.8406      5.981      2.815   

* 44 신변잡화

In [27]:
print(pvalue("age40stuff"))  # pm10 pm25

                            OLS Regression Results                            
Dep. Variable:             age40stuff   R-squared:                       0.224
Model:                            OLS   Adj. R-squared:                  0.205
Method:                 Least Squares   F-statistic:                     11.40
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           1.03e-15
Time:                        07:04:41   Log-Likelihood:                -1902.7
No. Observations:                 365   AIC:                             3825.
Df Residuals:                     355   BIC:                             3864.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      203.8518     22.773      8.951   

* 50 서적문구

In [28]:
print(pvalue("age40book"))  # pm25 temp

                            OLS Regression Results                            
Dep. Variable:              age40book   R-squared:                       0.241
Model:                            OLS   Adj. R-squared:                  0.222
Method:                 Least Squares   F-statistic:                     12.54
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           2.62e-17
Time:                        07:04:53   Log-Likelihood:                -2809.0
No. Observations:                 365   AIC:                             5638.
Df Residuals:                     355   BIC:                             5677.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1316.2751    272.760      4.826   

* 52 사무통신

In [29]:
print(pvalue('age40affair'))  # temp

                            OLS Regression Results                            
Dep. Variable:            age40affair   R-squared:                       0.086
Model:                            OLS   Adj. R-squared:                  0.063
Method:                 Least Squares   F-statistic:                     3.707
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           0.000184
Time:                        07:05:06   Log-Likelihood:                -1443.0
No. Observations:                 365   AIC:                             2906.
Df Residuals:                     355   BIC:                             2945.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       11.2203      6.465      1.736   

* 60 자동차판매

In [30]:
print(pvalue("age40car_sell")) # 

                            OLS Regression Results                            
Dep. Variable:          age40car_sell   R-squared:                       0.016
Model:                            OLS   Adj. R-squared:                 -0.009
Method:                 Least Squares   F-statistic:                    0.6558
Date:                Tue, 10 Sep 2019   Prob (F-statistic):              0.749
Time:                        07:05:17   Log-Likelihood:                -874.67
No. Observations:                 365   AIC:                             1769.
Df Residuals:                     355   BIC:                             1808.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -1.6364      1.362     -1.201   

* 62 자동차정비

In [31]:
print(pvalue("age40car_repair"))  

                            OLS Regression Results                            
Dep. Variable:        age40car_repair   R-squared:                       0.080
Model:                            OLS   Adj. R-squared:                  0.057
Method:                 Least Squares   F-statistic:                     3.435
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           0.000450
Time:                        07:05:25   Log-Likelihood:                -2077.2
No. Observations:                 365   AIC:                             4174.
Df Residuals:                     355   BIC:                             4213.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      263.8246     36.735      7.182   

* 70 의료기관

In [33]:
print(pvalue("age40medical")) # temp

                            OLS Regression Results                            
Dep. Variable:           age40medical   R-squared:                       0.138
Model:                            OLS   Adj. R-squared:                  0.116
Method:                 Least Squares   F-statistic:                     6.314
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           2.65e-08
Time:                        07:05:45   Log-Likelihood:                -3407.8
No. Observations:                 365   AIC:                             6836.
Df Residuals:                     355   BIC:                             6875.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     7357.8674   1406.935      5.230   

* 71 보건위생

In [35]:
print(pvalue("age40public_health")) # pm10 pm25

                            OLS Regression Results                            
Dep. Variable:     age40public_health   R-squared:                       0.320
Model:                            OLS   Adj. R-squared:                  0.303
Method:                 Least Squares   F-statistic:                     18.58
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           2.31e-25
Time:                        07:05:52   Log-Likelihood:                -2739.7
No. Observations:                 365   AIC:                             5499.
Df Residuals:                     355   BIC:                             5538.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     2137.7728    225.646      9.474   

* 80 요식업소

In [36]:
print(pvalue("age40food"))# pm10 pm25 temp rain

                            OLS Regression Results                            
Dep. Variable:              age40food   R-squared:                       0.336
Model:                            OLS   Adj. R-squared:                  0.319
Method:                 Least Squares   F-statistic:                     19.92
Date:                Tue, 10 Sep 2019   Prob (F-statistic):           4.67e-27
Time:                        07:06:13   Log-Likelihood:                -3372.0
No. Observations:                 365   AIC:                             6764.
Df Residuals:                     355   BIC:                             6803.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     2.198e+04   1275.544     17.235   

* 81 음료식품

In [37]:
print(pvalue('age40grocery'))  # temp

                            OLS Regression Results                            
Dep. Variable:           age40grocery   R-squared:                       0.073
Model:                            OLS   Adj. R-squared:                  0.049
Method:                 Least Squares   F-statistic:                     3.098
Date:                Tue, 10 Sep 2019   Prob (F-statistic):            0.00135
Time:                        07:06:28   Log-Likelihood:                -2790.0
No. Observations:                 365   AIC:                             5600.
Df Residuals:                     355   BIC:                             5639.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     4511.5042    258.975     17.421   

* 92 수리서비스

In [38]:
print(pvalue('age40repair_service')) # temp

                             OLS Regression Results                            
Dep. Variable:     age40repair_service   R-squared:                       0.063
Model:                             OLS   Adj. R-squared:                  0.040
Method:                  Least Squares   F-statistic:                     2.671
Date:                 Tue, 10 Sep 2019   Prob (F-statistic):            0.00520
Time:                         07:06:38   Log-Likelihood:                -2408.1
No. Observations:                  365   AIC:                             4836.
Df Residuals:                      355   BIC:                             4875.
Df Model:                            9                                         
Covariance Type:             nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      502.8295     90.946    