<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc" style="margin-top: 1em;"><ul class="toc-item"></ul></div>

In [35]:
%matplotlib inline

import sys
sys.path.append("../..")

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.formula.api as smf
from bld.project_paths import project_paths_join as ppj

In [36]:
COVARIATES_CATEGORICAL = [
    "AGE_GROUPS", "EDUCATION_GROUPS_ISCED97", "EMPLOYMENT_STATUS",
    "GENDER", "MARITAL_STATUS", "MIGRATION_STATUS",
]

COVARIATES_CONTINUOUS = ["HH_NET_INCOME_YEAR"]

df = pd.read_pickle(ppj("OUT_ANALYSIS", "panel_reg.pkl"))

df = df[sorted(df)]

In [37]:
def make_regression_formula(y, x_continuous, x_discrete):
    x_discrete_mod = [f"C({i})" for i in x_discrete]
    
    return f"{y} ~ " + " + ".join(x_continuous + x_discrete_mod)

In [38]:
formula = make_regression_formula("FIRST_FACTOR_DELTA", ["EVENT_ANY"] + COVARIATES_CONTINUOUS, COVARIATES_CATEGORICAL)

model_1 = smf.ols(formula, df)
res_1 = model_1.fit(cov_type="cluster", cov_kwds={"groups": df["ID"]}, use_t=True)

In [39]:
res_1.summary()

0,1,2,3
Dep. Variable:,FIRST_FACTOR_DELTA,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,3.543
Date:,"Fri, 29 Mar 2019",Prob (F-statistic):,2.68e-07
Time:,01:05:28,Log-Likelihood:,-26877.0
No. Observations:,21371,AIC:,53790.0
Df Residuals:,21351,BIC:,53950.0
Df Model:,19,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1771,0.069,2.557,0.011,0.041,0.313
EVENT_ANY[T.True],0.0132,0.013,1.037,0.300,-0.012,0.038
"C(AGE_GROUPS)[T.Interval(30, 40, closed='right')]",-0.0252,0.027,-0.947,0.344,-0.077,0.027
"C(AGE_GROUPS)[T.Interval(40, 50, closed='right')]",-0.0695,0.024,-2.860,0.004,-0.117,-0.022
"C(AGE_GROUPS)[T.Interval(50, 60, closed='right')]",-0.0580,0.024,-2.369,0.018,-0.106,-0.010
"C(AGE_GROUPS)[T.Interval(60, 70, closed='right')]",-0.0391,0.026,-1.514,0.130,-0.090,0.012
"C(AGE_GROUPS)[T.Interval(70, 80, closed='right')]",-0.0665,0.029,-2.321,0.020,-0.123,-0.010
"C(AGE_GROUPS)[T.Interval(80, 105, closed='right')]",-0.1187,0.036,-3.299,0.001,-0.189,-0.048
C(EDUCATION_GROUPS_ISCED97)[T.[1] inadequately],-0.0677,0.088,-0.770,0.441,-0.240,0.105

0,1,2,3
Omnibus:,317.658,Durbin-Watson:,2.241
Prob(Omnibus):,0.0,Jarque-Bera (JB):,607.352
Skew:,-0.032,Prob(JB):,1.3e-132
Kurtosis:,3.823,Cond. No.,1430000.0


In [40]:
formula = make_regression_formula("FIRST_FACTOR_DELTA", ["EVENT_COUNT"] + COVARIATES_CONTINUOUS, COVARIATES_CATEGORICAL)

model_2 = smf.ols(formula, df)
res_2 = model_2.fit(cov_type="cluster", cov_kwds={"groups": df["ID"]}, use_t=True)

In [41]:
res_2.summary()

0,1,2,3
Dep. Variable:,FIRST_FACTOR_DELTA,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,3.511
Date:,"Fri, 29 Mar 2019",Prob (F-statistic):,3.36e-07
Time:,01:05:29,Log-Likelihood:,-26877.0
No. Observations:,21371,AIC:,53790.0
Df Residuals:,21351,BIC:,53950.0
Df Model:,19,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1789,0.069,2.583,0.010,0.043,0.315
"C(AGE_GROUPS)[T.Interval(30, 40, closed='right')]",-0.0250,0.027,-0.942,0.346,-0.077,0.027
"C(AGE_GROUPS)[T.Interval(40, 50, closed='right')]",-0.0697,0.024,-2.867,0.004,-0.117,-0.022
"C(AGE_GROUPS)[T.Interval(50, 60, closed='right')]",-0.0583,0.024,-2.381,0.017,-0.106,-0.010
"C(AGE_GROUPS)[T.Interval(60, 70, closed='right')]",-0.0392,0.026,-1.514,0.130,-0.090,0.012
"C(AGE_GROUPS)[T.Interval(70, 80, closed='right')]",-0.0685,0.029,-2.389,0.017,-0.125,-0.012
"C(AGE_GROUPS)[T.Interval(80, 105, closed='right')]",-0.1208,0.036,-3.356,0.001,-0.191,-0.050
C(EDUCATION_GROUPS_ISCED97)[T.[1] inadequately],-0.0675,0.088,-0.767,0.443,-0.240,0.105
C(EDUCATION_GROUPS_ISCED97)[T.[2] general elementary],-0.1088,0.071,-1.531,0.126,-0.248,0.030

0,1,2,3
Omnibus:,317.736,Durbin-Watson:,2.241
Prob(Omnibus):,0.0,Jarque-Bera (JB):,607.584
Skew:,-0.032,Prob(JB):,1.1600000000000001e-132
Kurtosis:,3.823,Cond. No.,1430000.0


In [43]:
formula = make_regression_formula("FIRST_FACTOR_DELTA", ["EVENT_SOCIAL + EVENT_ECONOMIC + EVENT_HEALTH"] + COVARIATES_CONTINUOUS, COVARIATES_CATEGORICAL)

model_3 = smf.ols(formula, df)
res_3 = model_3.fit(cov_type="cluster", cov_kwds={"groups": df["ID"]}, use_t=True)

In [44]:
res_3.summary()

0,1,2,3
Dep. Variable:,FIRST_FACTOR_DELTA,R-squared:,0.003
Model:,OLS,Adj. R-squared:,0.002
Method:,Least Squares,F-statistic:,3.337
Date:,"Fri, 29 Mar 2019",Prob (F-statistic):,3.57e-07
Time:,01:05:48,Log-Likelihood:,-26875.0
No. Observations:,21371,AIC:,53790.0
Df Residuals:,21349,BIC:,53970.0
Df Model:,21,,
Covariance Type:,cluster,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.1763,0.069,2.543,0.011,0.040,0.312
EVENT_SOCIAL[T.True],0.0044,0.015,0.284,0.777,-0.026,0.034
EVENT_ECONOMIC[T.True],0.0203,0.015,1.394,0.163,-0.008,0.049
EVENT_HEALTH[T.True],-0.0568,0.041,-1.376,0.169,-0.138,0.024
"C(AGE_GROUPS)[T.Interval(30, 40, closed='right')]",-0.0228,0.027,-0.855,0.393,-0.075,0.029
"C(AGE_GROUPS)[T.Interval(40, 50, closed='right')]",-0.0676,0.025,-2.746,0.006,-0.116,-0.019
"C(AGE_GROUPS)[T.Interval(50, 60, closed='right')]",-0.0560,0.025,-2.240,0.025,-0.105,-0.007
"C(AGE_GROUPS)[T.Interval(60, 70, closed='right')]",-0.0384,0.026,-1.466,0.143,-0.090,0.013
"C(AGE_GROUPS)[T.Interval(70, 80, closed='right')]",-0.0641,0.029,-2.186,0.029,-0.122,-0.007

0,1,2,3
Omnibus:,316.539,Durbin-Watson:,2.241
Prob(Omnibus):,0.0,Jarque-Bera (JB):,604.387
Skew:,-0.033,Prob(JB):,5.74e-132
Kurtosis:,3.821,Cond. No.,1430000.0


In [48]:
pd.get_dummies(COVARIATES_CATEGORICAL).corr()

Unnamed: 0,AGE_GROUPS,EDUCATION_GROUPS_ISCED97,EMPLOYMENT_STATUS,GENDER,MARITAL_STATUS,MIGRATION_STATUS
AGE_GROUPS,1.0,-0.2,-0.2,-0.2,-0.2,-0.2
EDUCATION_GROUPS_ISCED97,-0.2,1.0,-0.2,-0.2,-0.2,-0.2
EMPLOYMENT_STATUS,-0.2,-0.2,1.0,-0.2,-0.2,-0.2
GENDER,-0.2,-0.2,-0.2,1.0,-0.2,-0.2
MARITAL_STATUS,-0.2,-0.2,-0.2,-0.2,1.0,-0.2
MIGRATION_STATUS,-0.2,-0.2,-0.2,-0.2,-0.2,1.0
