# Multiple Linear Regression을 이용하여 p-value 계산하기
## p-value : 귀무가설을 가정하였을 때 표본 이상으로 극단적인 결과를 얻을 확률
## 귀무가설 : Multiple Linear Regression으로 회귀모델을 적합했을 때 각 feature의 coefficient가 0이다
- 따라서 p-value가 작을수록(예: 0.05 미만) 귀무가설을 기각할 수 있으며, 해당 feature의 coefficient가 0이 아니라고, 즉 해당 feature가 통계적으로 유의하다고 할 수 있다.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import sklearn
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

import scipy.stats as stats
import urllib
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

In [2]:
# Each age group has 20+ features whose p-values must be checked, so every
# age group is stored in its own file. This cell loads the age-20 file.
df_20 = pd.read_csv('../data/innovation/Merge_data/df_age20_jongro.csv')

In [3]:
# Preview the first rows to sanity-check the loaded data.
df_20.head()

Unnamed: 0,age20accum,age20leisure goods,age20leisure busi,age20culture,age20furniture,age20electronic,age20kitchen,age20fuel,age20optic,age20Appliances,...,LCLS_50_P,LCLS_60_P,LCLS_70_P,LCLS_80_P,pm10,pm25,humi,temp,CONTENT,rain
0,685.0,318.0,3730.0,1977.0,0.0,0.0,0.0,59.0,99.0,17.0,...,68124,10029,15065,4995,84.070139,43.877083,61.370139,16.692569,515,0.0
1,212.0,188.0,2626.0,596.0,0.0,0.0,0.0,55.0,91.0,0.0,...,66428,9316,15306,3638,72.274306,30.051389,58.825694,19.561181,888,0.0
2,257.0,192.0,2831.0,782.0,12.0,0.0,0.0,64.0,113.0,51.0,...,66698,7793,13618,2648,66.1625,19.478472,63.531944,19.282292,856,0.1
3,205.0,202.0,2716.0,772.0,8.0,0.0,17.0,0.0,106.0,56.0,...,83667,9649,19985,3078,18.021097,10.726442,62.014583,14.202361,893,18.5
4,273.0,184.0,2898.0,1001.0,0.0,0.0,21.0,47.0,60.0,21.0,...,108727,9057,13849,3796,11.831711,8.579972,79.125,8.963056,864,10.5


In [10]:
# List the current column names; several contain spaces
# (e.g. 'age20leisure goods').
df_20.columns

Index(['age20accum', 'age20leisure goods', 'age20leisure busi', 'age20culture',
       'age20furniture', 'age20electronic', 'age20kitchen', 'age20fuel',
       'age20optic', 'age20Appliances', 'age20circul', 'age20cloth',
       'age20textile', 'age20stuff', 'age20book', 'age20affair',
       'age20car sell', 'age20car repair', 'age20medical',
       'age20public health', 'age20food', 'age20grocery',
       'age20repair survice', 'age20', 'LCLS_10_P', 'LCLS_20_P', 'LCLS_30_P',
       'LCLS_40_P', 'LCLS_50_P', 'LCLS_60_P', 'LCLS_70_P', 'LCLS_80_P', 'pm10',
       'pm25', 'humi', 'temp', 'CONTENT', 'rain'],
      dtype='object')

In [11]:
# Replace spaces in column names with underscores (e.g. 'age20leisure goods'
# -> 'age20leisure_goods') so each column name can be inserted directly into
# a regression formula string. Deriving the new names from the existing ones
# avoids a hand-copied list that could drift out of sync with the data or
# mislabel columns if their order ever changes.
df_20.columns = [name.replace(' ', '_') for name in df_20.columns]

In [12]:
def anova(col_name, data=None):
    """Fit an OLS regression of ``col_name`` and return its summary.

    The response ``col_name`` is regressed on pm10, pm25, temp, humi,
    CONTENT and rain, plus the interactions of CONTENT with temp, rain
    and humi. Despite the function name, the return value is the OLS
    regression summary (coefficients with their p-values), not an ANOVA
    table.

    Parameters
    ----------
    col_name : str
        Name of the response column. It is inserted verbatim into the
        formula string, so it must be a name the formula parser accepts
        (for example, it must not contain spaces).
    data : pandas.DataFrame, optional
        Data to fit on. Defaults to the module-level ``df_20`` so existing
        calls keep working; pass another frame (e.g. a different age
        group) to reuse the same model specification.

    Returns
    -------
    The object returned by ``.summary()`` on the fitted model.
    """
    if data is None:
        # Preserve the original behaviour: fall back to the age-20 frame.
        data = df_20
    predictors = (
        'pm10 + pm25 + temp + humi + CONTENT + rain'
        ' + CONTENT:temp + CONTENT:rain + CONTENT:humi'
    )
    formula = col_name + ' ~ ' + predictors
    fitted = ols(formula, data).fit()
    return fitted.summary()

## 20대 숙박

In [13]:
# Print the OLS summary with age20accum as the response.
# NOTE(review): the trailing "CONTENT" tag presumably names the predictor
# judged significant for this response — confirm against the full output.
print(anova("age20accum")) # CONTENT

                            OLS Regression Results                            
Dep. Variable:             age20accum   R-squared:                       0.398
Model:                            OLS   Adj. R-squared:                  0.383
Method:                 Least Squares   F-statistic:                     26.08
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           2.08e-34
Time:                        23:04:26   Log-Likelihood:                -2397.8
No. Observations:                 365   AIC:                             4816.
Df Residuals:                     355   BIC:                             4855.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      569.5014     88.338      6.447   

In [14]:
# Print the OLS summary with age20cloth as the response.
# NOTE(review): the trailing "CONTENT" tag presumably names the predictor
# judged significant for this response — confirm against the full output.
print(anova('age20cloth')) # CONTENT

                            OLS Regression Results                            
Dep. Variable:             age20cloth   R-squared:                       0.366
Model:                            OLS   Adj. R-squared:                  0.350
Method:                 Least Squares   F-statistic:                     22.81
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           1.38e-30
Time:                        23:04:26   Log-Likelihood:                -2643.6
No. Observations:                 365   AIC:                             5307.
Df Residuals:                     355   BIC:                             5346.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     1590.2234    173.237      9.179   

In [17]:
# Print the OLS summary with age20leisure_goods as the response.
# NOTE(review): the trailing "CONTENT" tag presumably names the predictor
# judged significant for this response — confirm against the full output.
print(anova('age20leisure_goods')) # CONTENT

                            OLS Regression Results                            
Dep. Variable:     age20leisure_goods   R-squared:                       0.349
Model:                            OLS   Adj. R-squared:                  0.332
Method:                 Least Squares   F-statistic:                     21.13
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           1.51e-28
Time:                        23:04:34   Log-Likelihood:                -2190.9
No. Observations:                 365   AIC:                             4402.
Df Residuals:                     355   BIC:                             4441.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      263.6827     50.113      5.262   

In [18]:
# Print the OLS summary with age20stuff as the response.
# NOTE(review): the trailing "CONTENT" tag presumably names the predictor
# judged significant for this response — confirm against the full output.
print(anova('age20stuff')) # CONTENT

                            OLS Regression Results                            
Dep. Variable:             age20stuff   R-squared:                       0.299
Model:                            OLS   Adj. R-squared:                  0.282
Method:                 Least Squares   F-statistic:                     16.85
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           3.86e-23
Time:                        23:04:46   Log-Likelihood:                -2538.7
No. Observations:                 365   AIC:                             5097.
Df Residuals:                     355   BIC:                             5136.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      963.5266    129.959      7.414   

In [20]:
# Print the OLS summary with LCLS_30_P as the response.
# NOTE(review): the trailing "pm10" tag presumably names the predictor
# judged significant for this response — confirm against the full output.
print(anova("LCLS_30_P")) # pm10

                            OLS Regression Results                            
Dep. Variable:              LCLS_30_P   R-squared:                       0.830
Model:                            OLS   Adj. R-squared:                  0.826
Method:                 Least Squares   F-statistic:                     192.8
Date:                Mon, 09 Sep 2019   Prob (F-statistic):          7.51e-131
Time:                        23:05:05   Log-Likelihood:                -4266.4
No. Observations:                 365   AIC:                             8553.
Df Residuals:                     355   BIC:                             8592.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     4.049e+05   1.48e+04     27.404   

In [21]:
# Print the OLS summary with age20car_repair as the response.
# NOTE(review): the trailing "pm10" tag presumably names the predictor
# judged significant for this response — confirm against the full output.
print(anova("age20car_repair")) # pm10

                            OLS Regression Results                            
Dep. Variable:        age20car_repair   R-squared:                       0.304
Model:                            OLS   Adj. R-squared:                  0.286
Method:                 Least Squares   F-statistic:                     17.23
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           1.25e-23
Time:                        23:05:20   Log-Likelihood:                -2409.8
No. Observations:                 365   AIC:                             4840.
Df Residuals:                     355   BIC:                             4879.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      896.8064     91.297      9.823   