# Multi Linear Regression을 이용하여 p-value 계산하기
## p-value : 귀무가설을 가정하였을 때 표본 이상으로 극단적인 결과를 얻을 확률
## 귀무가설 : Multi Linear Regression을 사용하여 회귀모델을 설계할 때 각 feature의 coefficient가 0이 됨
- 따라서 p-value가 작을수록 귀무가설(해당 feature의 coefficient가 0)을 기각할 근거가 강해지므로, 해당 feature가 통계적으로 유의미하다고 할 수 있다.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import scipy.stats as stats
import urllib
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
from sklearn.model_selection import train_test_split

In [2]:
# More than ~20 features need their p-values checked for every age group,
# so each age group's data is stored in its own file.
df_30 = pd.read_csv(
    filepath_or_buffer='../data/innovation/Merge_data/df_age30_jongro.csv',
)

In [3]:
df_30.head()

Unnamed: 0,age30accum,age30leisure goods,age30leisure busi,age30culture,age30furniture,age30electronic,age30kitchen,age30fuel,age30optic,age30Appliances,...,LCLS_50_P,LCLS_60_P,LCLS_70_P,LCLS_80_P,pm10,pm25,humi,temp,CONTENT,rain
0,585.0,107.0,672.0,862.0,0.0,0.0,0.0,112.0,20.0,92.0,...,68124,10029,15065,4995,84.070139,43.877083,61.370139,16.692569,515,0.0
1,332.0,103.0,825.0,244.0,0.0,0.0,17.0,106.0,43.0,35.0,...,66428,9316,15306,3638,72.274306,30.051389,58.825694,19.561181,888,0.0
2,328.0,150.0,664.0,384.0,0.0,0.0,12.0,173.0,41.0,60.0,...,66698,7793,13618,2648,66.1625,19.478472,63.531944,19.282292,856,0.1
3,250.0,133.0,681.0,409.0,17.0,0.0,30.0,113.0,39.0,25.0,...,83667,9649,19985,3078,18.021097,10.726442,62.014583,14.202361,893,18.5
4,399.0,171.0,666.0,269.0,0.0,0.0,21.0,120.0,42.0,35.0,...,108727,9057,13849,3796,11.831711,8.579972,79.125,8.963056,864,10.5


In [4]:
df_30.columns

Index(['age30accum', 'age30leisure goods', 'age30leisure busi', 'age30culture',
       'age30furniture', 'age30electronic', 'age30kitchen', 'age30fuel',
       'age30optic', 'age30Appliances', 'age30circul', 'age30cloth',
       'age30textile', 'age30stuff', 'age30book', 'age30affair',
       'age30car sell', 'age30car repair', 'age30medical',
       'age30public health', 'age30food', 'age30grocery',
       'age30repair survice', 'age30', 'LCLS_10_P', 'LCLS_20_P', 'LCLS_30_P',
       'LCLS_40_P', 'LCLS_50_P', 'LCLS_60_P', 'LCLS_70_P', 'LCLS_80_P', 'pm10',
       'pm25', 'humi', 'temp', 'CONTENT', 'rain'],
      dtype='object')

In [6]:
# Replace spaces in the column names with underscores so that every name can
# be pasted directly into a regression formula string.
# The names are derived from the existing columns instead of being assigned
# from a hard-coded list, so each label stays attached to its own column even
# if the CSV's column order or column count changes.
df_30.columns = df_30.columns.str.replace(' ', '_', regex=False)

In [7]:
def anova(col_name, data=None):
    """Fit an OLS regression for one response column and return its summary.

    The response ``col_name`` is regressed on the main effects ``pm10``,
    ``pm25``, ``temp``, ``humi``, ``CONTENT`` and ``rain`` plus the
    interaction terms ``CONTENT:temp``, ``CONTENT:rain`` and ``CONTENT:humi``.

    Despite its name, this function returns the regression summary of the
    fitted model (coefficients, t statistics, p-values), not an ANOVA table.

    Args:
        col_name: Name of the response column. It is concatenated into the
            formula verbatim, so a name containing spaces will not parse as
            a single term.
        data: DataFrame holding the response and all predictor columns.
            When omitted, the module-level ``df_30`` is used, so existing
            single-argument calls behave exactly as before.

    Returns:
        The summary object produced by calling ``.summary()`` on the fitted
        OLS results.
    """
    if data is None:
        # Fall back to the notebook-level frame to stay backward-compatible.
        data = df_30

    formula = col_name + ' ~ pm10 + pm25 + temp + humi + CONTENT + rain + CONTENT:temp + CONTENT:rain + CONTENT:humi'
    fitted = ols(formula, data).fit()
    return fitted.summary()

In [8]:
print(anova('age30accum')) # CONTENT

                            OLS Regression Results                            
Dep. Variable:             age30accum   R-squared:                       0.321
Model:                            OLS   Adj. R-squared:                  0.304
Method:                 Least Squares   F-statistic:                     18.64
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           1.92e-25
Time:                        23:07:05   Log-Likelihood:                -2234.8
No. Observations:                 365   AIC:                             4490.
Df Residuals:                     355   BIC:                             4529.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      520.0277     56.532      9.199   

In [9]:
print(anova("age30book")) # CONTENT

                            OLS Regression Results                            
Dep. Variable:              age30book   R-squared:                       0.427
Model:                            OLS   Adj. R-squared:                  0.412
Method:                 Least Squares   F-statistic:                     29.34
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           4.71e-38
Time:                        23:07:11   Log-Likelihood:                -3251.2
No. Observations:                 365   AIC:                             6522.
Df Residuals:                     355   BIC:                             6561.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     8995.9607    915.494      9.826   

In [10]:
print(anova("age30culture")) # CONTENT

                            OLS Regression Results                            
Dep. Variable:           age30culture   R-squared:                       0.267
Model:                            OLS   Adj. R-squared:                  0.248
Method:                 Least Squares   F-statistic:                     14.35
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           8.27e-20
Time:                        23:07:16   Log-Likelihood:                -2484.6
No. Observations:                 365   AIC:                             4989.
Df Residuals:                     355   BIC:                             5028.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      755.9828    112.057      6.746   

In [11]:
print(anova('age30stuff')) # CONTENT

                            OLS Regression Results                            
Dep. Variable:             age30stuff   R-squared:                       0.272
Model:                            OLS   Adj. R-squared:                  0.254
Method:                 Least Squares   F-statistic:                     14.75
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           2.37e-20
Time:                        23:07:20   Log-Likelihood:                -2340.5
No. Observations:                 365   AIC:                             4701.
Df Residuals:                     355   BIC:                             4740.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      600.5800     75.517      7.953   

In [12]:
 print(anova('age30cloth')) # CONTENT

                            OLS Regression Results                            
Dep. Variable:             age30cloth   R-squared:                       0.299
Model:                            OLS   Adj. R-squared:                  0.281
Method:                 Least Squares   F-statistic:                     16.84
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           4.03e-23
Time:                        23:07:27   Log-Likelihood:                -2444.9
No. Observations:                 365   AIC:                             4910.
Df Residuals:                     355   BIC:                             4949.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      966.2735    100.511      9.614   

In [13]:
print(anova('age30medical')) # CONTENT

                            OLS Regression Results                            
Dep. Variable:           age30medical   R-squared:                       0.282
Model:                            OLS   Adj. R-squared:                  0.264
Method:                 Least Squares   F-statistic:                     15.52
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           2.23e-21
Time:                        23:07:35   Log-Likelihood:                -3312.9
No. Observations:                 365   AIC:                             6646.
Df Residuals:                     355   BIC:                             6685.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     4379.0879   1084.071      4.039   

In [14]:
print(anova('LCLS_30_P')) # pm10, pm25

                            OLS Regression Results                            
Dep. Variable:              LCLS_30_P   R-squared:                       0.830
Model:                            OLS   Adj. R-squared:                  0.826
Method:                 Least Squares   F-statistic:                     192.8
Date:                Mon, 09 Sep 2019   Prob (F-statistic):          7.51e-131
Time:                        23:07:51   Log-Likelihood:                -4266.4
No. Observations:                 365   AIC:                             8553.
Df Residuals:                     355   BIC:                             8592.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     4.049e+05   1.48e+04     27.404   

In [15]:
print(anova('age30car_repair')) # pm10, pm25

                            OLS Regression Results                            
Dep. Variable:        age30car_repair   R-squared:                       0.274
Model:                            OLS   Adj. R-squared:                  0.255
Method:                 Least Squares   F-statistic:                     14.88
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           1.62e-20
Time:                        23:08:04   Log-Likelihood:                -2899.2
No. Observations:                 365   AIC:                             5818.
Df Residuals:                     355   BIC:                             5857.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     2971.1999    348.950      8.515   

In [16]:
print(anova('age30leisure_busi')) # pm10, pm25

                            OLS Regression Results                            
Dep. Variable:      age30leisure_busi   R-squared:                       0.287
Model:                            OLS   Adj. R-squared:                  0.269
Method:                 Least Squares   F-statistic:                     15.87
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           7.53e-22
Time:                        23:08:17   Log-Likelihood:                -2362.3
No. Observations:                 365   AIC:                             4745.
Df Residuals:                     355   BIC:                             4784.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      894.9656     80.155     11.165   