In [1]:
import os
import sys
import csv
import json
import moment
import pymysql
import datetime

import numpy as np
import scipy as sp
import scipy.stats as stats
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict

from lifelines import CoxPHFitter
from lifelines import KaplanMeierFitter

%matplotlib inline

## This script can generate the results data file required for plots and tables

Step 1. Set which outcome variable you want to generate data for.

In [2]:
# Input data: a pickled pandas DataFrame. The file name is expected to look like
# "<prefix>_df_<version>_<date>.<ext>"; version and date are parsed from it below.
# NOTE(review): pickle files can execute arbitrary code on load -- only read trusted files.
# Alternative input: 'data/pandas_df_v3-1_2020-04-25.pkl'
data_file_name = 'data/pandas_df_v4_2020-04-25.pkl'
df = pd.read_pickle(data_file_name)

# Take everything after "_df_", drop the extension, then split into version and date.
after_marker = data_file_name.split('_df_')[1]
stem = after_marker.split('.')[0]
_version, date_retrieved = stem.split('_')
print(_version, date_retrieved)

v4 2020-04-25


In [3]:
# Outcome to model, given as (event indicator column, time-to-event column).
# To model intubation instead, use:
#   outcome, days_to = 'intubated', 'days_to_intubation'
outcome, days_to = 'died', 'days_to_death'

In [4]:
# Build the analysis cohort from the full data frame `df`.
# Each filtering step prints the number of rows it is about to drop.

# Follow-up window: events at this many days or later are excluded.
MAX_FOLLOWUP_DAYS = 90

# NaN in "days_to_intubation" indicates people who were not covid positive
covidpos = df[~pd.isna(df["days_to_intubation"])]

# remove patients who were intubated before they were diagnosed
print((covidpos["days_to_intubation"] < 0).sum())
covidpos = covidpos[covidpos["days_to_intubation"] >= 0]

# remove patients that died before they were diagnosed
# (rows with NaN "days_to_death" also fail the >= 0 test and are dropped)
print((covidpos["days_to_death"] < 0).sum())
covidpos = covidpos[covidpos["days_to_death"] >= 0]

# remove patients who were intubated or died MAX_FOLLOWUP_DAYS or more days later.
# The printed count is the number of rows dropped by this step (the earlier
# version re-printed the "< 0" intubation count, which is always 0 by now).
within_followup = (
    (covidpos["days_to_intubation"] < MAX_FOLLOWUP_DAYS)
    & (covidpos["days_to_death"] < MAX_FOLLOWUP_DAYS)
)
print((~within_followup).sum())
# .copy() gives an independent frame so the column added below does not
# write into a slice of `df` (avoids SettingWithCopyWarning).
covidpos = covidpos[within_followup].copy()

# 0/1 indicator for patients strictly older than 65
covidpos['age_over_65'] = (covidpos['age'] > 65).astype(int)

covidpos.describe()

162
12
0


Unnamed: 0,pat_mrn_id,intubated,days_to_intubation,died,days_to_death,age,sex,smoker,macula,compl_def,...,race_asian,race_white,race_other,race_declined,eth_hispanic,eth_nonhispanic,eth_declinedother,hx_data,co_data,age_over_65
count,6393.0,6393.0,6393.0,6393.0,6393.0,6393.0,6393.0,2817.0,6393.0,6393.0,...,6393.0,6393.0,6393.0,6393.0,6393.0,6393.0,6393.0,6393.0,6393.0,6393.0
mean,1094742000.0,0.075708,19.596903,0.096668,19.983576,57.142709,0.496637,0.256656,0.013765,0.000626,...,0.02362,0.284374,0.278742,0.191303,0.341624,0.369936,0.28844,0.629282,0.680119,0.375411
std,131800700.0,0.264551,11.343829,0.295529,11.176388,19.885933,0.500028,0.436865,0.116523,0.025008,...,0.151873,0.451151,0.448416,0.393358,0.474291,0.482825,0.453072,0.483035,0.466467,0.484267
min,1000011000.0,0.0,0.0,0.0,0.0,0.002738,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1004369000.0,0.0,11.0,0.0,11.0,41.470226,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1009216000.0,0.0,20.0,0.0,20.0,58.527036,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
75%,1200197000.0,0.0,28.0,0.0,28.0,72.005476,1.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1400038000.0,1.0,84.0,1.0,83.0,120.202601,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Iterate through each of the covariates to model and save the summary for later plotting

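Covariates
- macula
- compl_def
- coagulation
- hypertension
- type2_diabetes
- obesity
- cad
- age over 65 (`age_over_65`)
- cough (not currently included in the `covariates` list in the code below)
- refctrl
- smoker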

In [122]:
# Covariates of interest; each one is modeled separately.
covariates = ['macula', 'age_over_65', 'refctrl', 'compl_def', 'coagulation', 'hypertension', 'type2_diabetes', 'obesity', 'cad', 'smoker']

# Model specifications as (key under which the result is stored, extra adjustment columns).
model_specs = (
    ('univariate', []),
    ('age_sex_corrected', ['age', 'sex']),
)

# results[covariate][model key] -> dict of summary statistics for that covariate's row
results = defaultdict(dict)

# NOTE(review): in the saved output, compl_def has se(coef) in the hundreds and
# exp(coef) bounds of 0 / inf, so that estimate does not look reliable -- confirm
# before using it downstream.
for covar in covariates:
    for model_key, adjust_for in model_specs:
        # Keep only the columns this model needs and drop rows with missing values
        # in any of them (so the sample size can differ between covariates).
        model_df = covidpos[[outcome, days_to, covar] + adjust_for].dropna()

        cph = CoxPHFitter()
        cph.fit(model_df, duration_col=days_to, event_col=outcome)
        cph.print_summary()

        # Store only the row for the covariate of interest, not the adjustment terms.
        results[covar][model_key] = cph.summary.loc[covar].to_dict()

0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-5184.97
time fit was run,2020-05-22 00:22:31 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
macula,1.1,2.99,0.22,0.67,1.52,1.96,4.58,5.05,<0.005,21.12

0,1
Concordance,0.51
Log-likelihood ratio test,18.60 on 1 df
-log2(p) of ll-ratio test,15.92


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-4808.32
time fit was run,2020-05-22 00:22:31 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
macula,0.43,1.53,0.22,-0.0,0.85,1.0,2.35,1.95,0.05,4.3
age,0.07,1.07,0.0,0.06,0.07,1.06,1.07,24.24,<0.005,428.81
sex,0.45,1.56,0.08,0.28,0.61,1.33,1.84,5.39,<0.005,23.79

0,1
Concordance,0.82
Log-likelihood ratio test,771.91 on 3 df
-log2(p) of ll-ratio test,552.34


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-4901.67
time fit was run,2020-05-22 00:22:31 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_over_65,2.18,8.8,0.11,1.97,2.39,7.14,10.86,20.29,<0.005,301.7

0,1
Concordance,0.74
Log-likelihood ratio test,585.21 on 1 df
-log2(p) of ll-ratio test,427.06


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-4804.37
time fit was run,2020-05-22 00:22:31 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_over_65,0.52,1.68,0.16,0.21,0.82,1.23,2.28,3.3,<0.005,10.04
age,0.06,1.06,0.0,0.05,0.06,1.05,1.07,13.55,<0.005,136.51
sex,0.42,1.52,0.08,0.25,0.58,1.29,1.79,5.03,<0.005,20.98

0,1
Concordance,0.82
Log-likelihood ratio test,779.81 on 3 df
-log2(p) of ll-ratio test,558.03


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-5191.41
time fit was run,2020-05-22 00:22:31 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
refctrl,0.28,1.32,0.11,0.06,0.5,1.06,1.65,2.47,0.01,6.21

0,1
Concordance,0.51
Log-likelihood ratio test,5.71 on 1 df
-log2(p) of ll-ratio test,5.89


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-4807.22
time fit was run,2020-05-22 00:22:31 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
refctrl,0.28,1.32,0.11,0.05,0.5,1.06,1.65,2.43,0.01,6.07
age,0.07,1.07,0.0,0.06,0.07,1.06,1.08,24.44,<0.005,435.97
sex,0.44,1.55,0.08,0.28,0.6,1.32,1.83,5.34,<0.005,23.34

0,1
Concordance,0.82
Log-likelihood ratio test,774.11 on 3 df
-log2(p) of ll-ratio test,553.92


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-5193.84
time fit was run,2020-05-22 00:22:31 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
compl_def,-11.9,0.0,586.81,-1162.02,1138.22,0.0,inf,-0.02,0.98,0.02

0,1
Concordance,0.50
Log-likelihood ratio test,0.86 on 1 df
-log2(p) of ll-ratio test,1.49


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-4809.58
time fit was run,2020-05-22 00:22:31 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
compl_def,-12.86,0.0,959.95,-1894.33,1868.6,0.0,inf,-0.01,0.99,0.02
age,0.07,1.07,0.0,0.06,0.07,1.06,1.08,24.46,<0.005,436.41
sex,0.44,1.56,0.08,0.28,0.61,1.33,1.83,5.37,<0.005,23.63

0,1
Concordance,0.82
Log-likelihood ratio test,769.38 on 3 df
-log2(p) of ll-ratio test,550.52


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-5149.62
time fit was run,2020-05-22 00:22:32 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
coagulation,0.85,2.33,0.08,0.68,1.01,1.98,2.76,9.98,<0.005,75.52

0,1
Concordance,0.58
Log-likelihood ratio test,89.30 on 1 df
-log2(p) of ll-ratio test,68.00


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-4787.44
time fit was run,2020-05-22 00:22:32 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
coagulation,0.6,1.81,0.09,0.43,0.76,1.53,2.14,6.96,<0.005,38.09
age,0.07,1.07,0.0,0.06,0.07,1.06,1.07,23.56,<0.005,405.39
sex,0.47,1.59,0.08,0.3,0.63,1.36,1.87,5.65,<0.005,25.86

0,1
Concordance,0.82
Log-likelihood ratio test,813.67 on 3 df
-log2(p) of ll-ratio test,582.43


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-5062.63
time fit was run,2020-05-22 00:22:32 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
hypertension,1.32,3.75,0.08,1.16,1.48,3.19,4.41,16.07,<0.005,190.71

0,1
Concordance,0.66
Log-likelihood ratio test,263.28 on 1 df
-log2(p) of ll-ratio test,194.27


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-4758.17
time fit was run,2020-05-22 00:22:32 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
hypertension,0.83,2.3,0.08,0.67,1.0,1.96,2.71,10.04,<0.005,76.37
age,0.06,1.07,0.0,0.06,0.07,1.06,1.07,22.16,<0.005,359.14
sex,0.47,1.6,0.08,0.31,0.63,1.36,1.88,5.66,<0.005,25.94

0,1
Concordance,0.83
Log-likelihood ratio test,872.20 on 3 df
-log2(p) of ll-ratio test,624.60


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-5129.54
time fit was run,2020-05-22 00:22:32 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
type2_diabetes,1.08,2.93,0.09,0.91,1.25,2.47,3.48,12.35,<0.005,114.07

0,1
Concordance,0.59
Log-likelihood ratio test,129.45 on 1 df
-log2(p) of ll-ratio test,97.22


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-4782.28
time fit was run,2020-05-22 00:22:32 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
type2_diabetes,0.68,1.98,0.09,0.51,0.86,1.67,2.35,7.82,<0.005,47.42
age,0.07,1.07,0.0,0.06,0.07,1.06,1.07,23.5,<0.005,403.33
sex,0.44,1.55,0.08,0.28,0.6,1.32,1.83,5.32,<0.005,23.22

0,1
Concordance,0.83
Log-likelihood ratio test,823.98 on 3 df
-log2(p) of ll-ratio test,589.85


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-5184.63
time fit was run,2020-05-22 00:22:32 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
obesity,0.48,1.61,0.1,0.28,0.68,1.32,1.98,4.62,<0.005,18.0

0,1
Concordance,0.53
Log-likelihood ratio test,19.28 on 1 df
-log2(p) of ll-ratio test,16.43


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-4793.33
time fit was run,2020-05-22 00:22:32 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
obesity,0.65,1.92,0.11,0.44,0.86,1.56,2.36,6.16,<0.005,30.39
age,0.07,1.07,0.0,0.06,0.07,1.06,1.08,24.48,<0.005,437.13
sex,0.52,1.68,0.08,0.35,0.68,1.42,1.98,6.17,<0.005,30.42

0,1
Concordance,0.82
Log-likelihood ratio test,801.88 on 3 df
-log2(p) of ll-ratio test,573.93


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-5067.31
time fit was run,2020-05-22 00:22:32 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
cad,1.31,3.69,0.08,1.15,1.47,3.15,4.33,16.13,<0.005,192.13

0,1
Concordance,0.65
Log-likelihood ratio test,253.92 on 1 df
-log2(p) of ll-ratio test,187.49


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,6393
number of events observed,618
partial log-likelihood,-4763.08
time fit was run,2020-05-22 00:22:32 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
cad,0.8,2.23,0.08,0.64,0.96,1.89,2.62,9.68,<0.005,71.19
age,0.06,1.06,0.0,0.06,0.07,1.06,1.07,22.1,<0.005,357.23
sex,0.46,1.59,0.08,0.3,0.63,1.35,1.87,5.59,<0.005,25.41

0,1
Concordance,0.83
Log-likelihood ratio test,862.37 on 3 df
-log2(p) of ll-ratio test,617.52


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,2817
number of events observed,326
partial log-likelihood,-2450.98
time fit was run,2020-05-22 00:22:33 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
smoker,0.42,1.53,0.12,0.19,0.65,1.21,1.92,3.6,<0.005,11.64

0,1
Concordance,0.54
Log-likelihood ratio test,12.33 on 1 df
-log2(p) of ll-ratio test,11.13


0,1
model,lifelines.CoxPHFitter
duration col,'days_to_death'
event col,'died'
baseline estimation,breslow
number of observations,2817
number of events observed,326
partial log-likelihood,-2253.70
time fit was run,2020-05-22 00:22:33 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
smoker,0.08,1.08,0.12,-0.15,0.31,0.86,1.36,0.66,0.51,0.97
age,0.07,1.08,0.0,0.07,0.08,1.07,1.09,17.5,<0.005,225.25
sex,0.35,1.42,0.11,0.13,0.58,1.14,1.78,3.08,<0.005,8.92

0,1
Concordance,0.81
Log-likelihood ratio test,406.89 on 3 df
-log2(p) of ll-ratio test,289.50


In [123]:
# print(json.dumps(results, indent=2))

In [107]:
# Save the per-covariate Cox model summaries to a JSON file whose name encodes
# the outcome, the data version and the data retrieval date.
results_path = 'results/coxph_%s_%s_%s.json' % (outcome, _version, date_retrieved)

# Create the output directory if needed so a run from a fresh checkout does not fail.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# The context manager closes the file even if serialization raises.
with open(results_path, 'w') as outfh:
    json.dump(results, outfh, indent=2)