In [277]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import xport
from patsy import dmatrices
from patsy import dmatrix
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn import preprocessing

In [278]:
!python -m xport ALQ_D.XPT > ALQ_D.csv
!python -m xport PAQ_D.XPT > PAQ_D.csv
!python -m xport PAQIAF_D.XPT > PAQIAF_D.csv
!python -m xport SLQ_D.XPT > SLQ_D.csv
!python -m xport DR1TOT_D.XPT > DR1TOT_D.csv
!python -m xport DEMO_D.XPT > DEMO_D.csv

In [279]:
data1=pd.read_csv('ALQ_D.csv')
data2=pd.read_csv('PAQ_D.csv')
data3=pd.read_csv('PAQIAF_D.csv')
data4=pd.read_csv('SLQ_D.csv')
data5=pd.read_csv('DR1TOT_D.csv')
data6=pd.read_csv('DEMO_D.csv')

In [280]:
data1=data1[['SEQN','ALQ130']]
data4=data4[['SEQN','SLD010H']]
data2=data2[['SEQN','PAQ520']]
data3=data3[['SEQN','PADTIMES','PADDURAT']]
data5=data5[['SEQN','DR1TKCAL','DR1TSUGR','DR1TCAFF']]
data6=data6[['SEQN','RIAGENDR','RIDAGEYR']]

In [281]:
data=pd.merge(data1,data3,on='SEQN',how='inner')
data=pd.merge(data,data2,on='SEQN',how='inner')
data=pd.merge(data,data4,on='SEQN',how='inner')
data=pd.merge(data,data5,on='SEQN',how='inner')
data=pd.merge(data,data6,on='SEQN',how='inner')

In [282]:
data=data.dropna()

In [283]:
data=data.groupby('SEQN').mean()

In [284]:
data['act_time']=data['PADTIMES']*data['PADDURAT']
data=data.drop(['PADTIMES','PADDURAT'],axis=1)
data.columns=['alcohol','act_level','sleep_time','energy','sugars','caffeine','gender','age','act_time']

In [285]:
data['gender']=data['gender'].apply(str)

In [286]:
data1 = data._get_numeric_data()

In [287]:

# get y and X dataframes based on this regression:
y, X = dmatrices('sleep_time ~ alcohol + act_level + energy + sugars + caffeine + age + act_time - 1', data1, return_type='dataframe')

In [288]:
vif=pd.DataFrame()
vif["VIF Factor"] = [variance_inflation_factor(X.values,i) for i in range(X.shape[1])]
vif["features"]=X.columns

In [289]:
print(vif)

   VIF Factor   features
0    1.020760    alcohol
1    3.135204  act_level
2    7.964849     energy
3    5.484066     sugars
4    1.754082   caffeine
5    4.072600        age
6    1.343241   act_time


In [290]:
data1.drop('sleep_time',axis=1).corr()

Unnamed: 0,alcohol,act_level,energy,sugars,caffeine,age,act_time
alcohol,1.0,0.021607,-0.001534,-0.00536,-0.016846,-0.060884,0.023307
act_level,0.021607,1.0,-0.043153,0.0122,-0.042291,-0.134242,-0.0641
energy,-0.001534,-0.043153,1.0,0.621193,0.101462,-0.190339,0.050308
sugars,-0.00536,0.0122,0.621193,1.0,0.041357,-0.162702,-0.016169
caffeine,-0.016846,-0.042291,0.101462,0.041357,1.0,0.138145,-0.01933
age,-0.060884,-0.134242,-0.190339,-0.162702,0.138145,1.0,0.038839
act_time,0.023307,-0.0641,0.050308,-0.016169,-0.01933,0.038839,1.0


In [291]:
mod = smf.ols(formula='sleep_time ~ age + act_level + sugars -1', data=data)
res = mod.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:             sleep_time   R-squared (uncentered):                   0.722
Model:                            OLS   Adj. R-squared (uncentered):              0.722
Method:                 Least Squares   F-statistic:                              1594.
Date:                Mon, 02 Dec 2019   Prob (F-statistic):                        0.00
Time:                        20:37:40   Log-Likelihood:                         -5280.6
No. Observations:                1844   AIC:                                  1.057e+04
Df Residuals:                    1841   BIC:                                  1.058e+04
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

In [292]:
transformed_x1 = dmatrix("bs(data.sugars, df=3, include_intercept=False)",
                        {"data.sugars": data.sugars}, return_type='dataframe')
transformed_x1 = transformed_x1.join(data[['sleep_time','age','act_level']]).drop('Intercept',axis=1)
transformed_x1.columns = ['sugars1','sugars2','sugars3','sleep_time','age','act_level']

In [293]:
mod2 = smf.ols(formula='sleep_time ~ sugars1 + sugars2 + sugars3  + age + act_level -1 ', data=transformed_x1)
res2 = mod2.fit()
print(res2.summary())

                                 OLS Regression Results                                
Dep. Variable:             sleep_time   R-squared (uncentered):                   0.734
Model:                            OLS   Adj. R-squared (uncentered):              0.733
Method:                 Least Squares   F-statistic:                              1015.
Date:                Mon, 02 Dec 2019   Prob (F-statistic):                        0.00
Time:                        20:37:44   Log-Likelihood:                         -5240.0
No. Observations:                1844   AIC:                                  1.049e+04
Df Residuals:                    1839   BIC:                                  1.052e+04
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------