# Multi Linear Regression을 이용하여 p-value 계산하기
## p-value : 귀무가설을 가정하였을 때 표본 이상으로 극단적인 결과를 얻을 확률
## 귀무가설 : Multi Linear Regression을 사용하여 회귀모델을 설계할 때 각 feature의 coefficient가 0이 됨
- 따라서 p-value가 작을수록(귀무가설 하에서 이처럼 극단적인 결과를 얻을 확률이 낮을수록) 귀무가설을 기각할 수 있으므로, 해당 feature가 통계적으로 유의하다고 할 수 있다.

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import scipy.stats as stats
import urllib
from statsmodels.formula.api import ols
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

In [2]:
# Each age band needs p-values checked for roughly 20+ features, so the data
# for each age band is stored in a separate file.
df_40 = pd.read_csv('../data/innovation/Merge_data/df_age40_jongro.csv')

In [3]:
# Display the column names exactly as they were loaded from the CSV.
df_40.columns

Index(['age40accum', 'age40leisure goods', 'age40leisure busi', 'age40culture',
       'age40furniture', 'age40electronic', 'age40kitchen', 'age40fuel',
       'age40optic', 'age40Appliances', 'age40circul', 'age40cloth',
       'age40textile', 'age40stuff', 'age40book', 'age40affair',
       'age40car sell', 'age40car repair', 'age40medical',
       'age40public health', 'age40food', 'age40grocery',
       'age40repair survice', 'age40', 'LCLS_10_P', 'LCLS_20_P', 'LCLS_30_P',
       'LCLS_40_P', 'LCLS_50_P', 'LCLS_60_P', 'LCLS_70_P', 'LCLS_80_P', 'pm10',
       'pm25', 'humi', 'temp', 'CONTENT', 'rain'],
      dtype='object')

In [4]:
# The regression formulas in this notebook refer to columns by bare name, so
# the spaces in the raw CSV headers are replaced with underscores. Deriving
# the new names from the existing ones (instead of retyping all of them)
# keeps every label attached to its own column even if the CSV column order
# or count changes.
df_40.columns = [name.replace(' ', '_') for name in df_40.columns]

In [5]:
def anova(col_name, data=None):
    """Fit an OLS regression of ``col_name`` and return its summary table.

    The response named by ``col_name`` is regressed on the main effects
    pm10, pm25, temp, humi, CONTENT and rain, plus the interactions of
    CONTENT with temp, rain and humi.

    Note: despite its name, this function returns the fitted model's
    ``summary()`` (coefficients, t statistics, p-values), not an ANOVA table.

    Args:
        col_name: Name of the response column; it is pasted into the formula
            as-is, so it must be usable as a bare name in a formula.
        data: DataFrame to fit on. Defaults to the notebook-level ``df_40``
            so existing ``anova(col_name)`` calls behave as before.

    Returns:
        The summary object produced by the fitted OLS model.
    """
    if data is None:
        # Fall back to the frame loaded earlier in the notebook.
        data = df_40

    terms = [
        'pm10', 'pm25', 'temp', 'humi', 'CONTENT', 'rain',
        'CONTENT:temp', 'CONTENT:rain', 'CONTENT:humi',
    ]
    formula = col_name + ' ~ ' + ' + '.join(terms)
    return ols(formula, data).fit().summary()

In [6]:
# Print the OLS summary for the 'age40culture' column.
# NOTE(review): the trailing "CONTENT" tag presumably names the predictor
# found significant for this target — confirm against the full output.
print(anova('age40culture')) # CONTENT

                            OLS Regression Results                            
Dep. Variable:           age40culture   R-squared:                       0.269
Model:                            OLS   Adj. R-squared:                  0.250
Method:                 Least Squares   F-statistic:                     14.50
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           5.20e-20
Time:                        23:11:32   Log-Likelihood:                -2408.9
No. Observations:                 365   AIC:                             4838.
Df Residuals:                     355   BIC:                             4877.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept      559.1367     91.083      6.139   

In [7]:
# Print the OLS summary for the 'age40medical' column.
# NOTE(review): the trailing "CONTENT" tag presumably names the predictor
# found significant for this target — confirm against the full output.
print(anova("age40medical")) # CONTENT

                            OLS Regression Results                            
Dep. Variable:           age40medical   R-squared:                       0.282
Model:                            OLS   Adj. R-squared:                  0.264
Method:                 Least Squares   F-statistic:                     15.48
Date:                Mon, 09 Sep 2019   Prob (F-statistic):           2.56e-21
Time:                        23:11:38   Log-Likelihood:                -3375.4
No. Observations:                 365   AIC:                             6771.
Df Residuals:                     355   BIC:                             6810.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     4861.9293   1286.378      3.780   

In [9]:
# Print the OLS summary for the 'LCLS_30_P' column.
# NOTE(review): the trailing "pm10" tag presumably names the predictor
# found significant for this target — confirm against the full output.
print(anova("LCLS_30_P")) # pm10

                            OLS Regression Results                            
Dep. Variable:              LCLS_30_P   R-squared:                       0.830
Model:                            OLS   Adj. R-squared:                  0.826
Method:                 Least Squares   F-statistic:                     192.8
Date:                Mon, 09 Sep 2019   Prob (F-statistic):          7.51e-131
Time:                        23:11:52   Log-Likelihood:                -4266.4
No. Observations:                 365   AIC:                             8553.
Df Residuals:                     355   BIC:                             8592.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept     4.049e+05   1.48e+04     27.404   