In [1]:
import os
import sys
import csv
import json
import moment
import pymysql
import datetime

import numpy as np
import scipy as sp
import scipy.stats as stats
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from collections import defaultdict

%matplotlib inline

# Create a file that can make a pretty Table 1 Section A for the manuscript

## Groups of patients (columns)

- All
- COVID+ 
- COVID- 
- Macula 
- Complement deficiency
- Coagulation
- Hypertension
- Diabetes
- Obesity
- CAD

## Features (rows)

Demographics
- N
- Age (IQR)
- Sex (%)
- Race/Ethnicity

Comorbidities
- hypertension
- t2dm
- obesity
- cad

Outcomes
- death
- intubation


In [2]:
# Snapshot to analyse. Earlier snapshots, kept here for provenance:
#   data/pandas_df_v2_2020-04-25.pkl
#   data/pandas_df_v3_2020-04-25.pkl
#   data/pandas_df_v3-1_2020-04-25.pkl
data_file_name = 'data/pandas_df_v4_2020-04-25.pkl'

# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(data_file_name)

# The part of the file name after '_df_' (extension removed) is '<version>_<date>'.
_version, date_retrieved = (
    os.path.splitext(data_file_name)[0]
    .split('_df_')[1]
    .split('_')
)
print(_version, date_retrieved)
df.head()

v4 2020-04-25


Unnamed: 0,pat_mrn_id,intubated,days_to_intubation,died,days_to_death,age,sex,smoker,macula,compl_def,...,race_black,race_asian,race_white,race_other,race_declined,eth_hispanic,eth_nonhispanic,eth_declinedother,hx_data,co_data
0,1000000661,,,,,74.288843,0,,0,0,...,0,0,1,0,0,0,1,0,1,1
1,1000011330,0.0,3.0,0.0,3.0,62.469541,0,0.0,0,0,...,1,0,0,0,0,0,1,0,1,1
2,1000013538,,,,,57.130732,1,1.0,0,0,...,0,0,0,1,0,1,0,0,1,1
3,1000015181,0.0,27.0,0.0,27.0,28.016427,1,,0,0,...,0,0,1,0,0,0,1,0,0,1
4,1000023814,,,,,54.855578,0,,0,0,...,0,0,0,1,0,0,0,1,1,1


## Derive a covid-positive only data frame for easy access later

In [12]:
# Number of patients in the full cohort whose smoker value is at least 1
# (rows with a missing smoker value compare as False and are not counted).
(df['smoker'] >= 1).sum()

1359

In [3]:
# Build the COVID-positive cohort from a copy of the full frame, applying the
# exclusions one after another, then attach a 0/1 flag for age above 65.
covidpos = (
    df.copy()
    # NaN indicates people who were not covid positive
    .loc[lambda frame: frame["days_to_intubation"].notna()]
    # drop patients who were intubated before they were diagnosed
    .loc[lambda frame: frame["days_to_intubation"] >= 0]
    # drop patients who died before they were diagnosed
    .loc[lambda frame: frame["days_to_death"] >= 0]
    # integer (0/1) indicator rather than a boolean column
    .assign(age_over_65=lambda frame: (frame['age'] > 65).astype(int))
)

In [13]:
# Number of patients in the COVID-positive cohort whose smoker value is at least 1
# (rows with a missing smoker value compare as False and are not counted).
(covidpos['smoker'] >= 1).sum()

723

## This is the main function that will build the rows for a given dataset

Notes:
- `propci` is a function to compute the 95% confidence interval of a proportion
- `build_features` is a function that generates a dictionary of row values

In [5]:
def propci(p, n):
    """Return a proportion together with its 95% confidence bounds.

    The bounds are ``p -/+ 1.96 * sqrt(p * (1 - p) / n)`` (normal
    approximation). They are not clipped, so they can fall below 0 or
    above 1 when ``p`` is near the boundary or ``n`` is small.

    Parameters
    ----------
    p : float
        Observed proportion (as a fraction, not a percentage).
    n : int
        Number of observations the proportion was computed from.

    Returns
    -------
    tuple
        ``(p, lower, upper)``.
    """
    margin = 1.96 * np.sqrt(p * (1 - p) / n)
    return (p, p - margin, p + margin)


def _pct(series):
    """Format the mean of a column as a percentage string with one decimal."""
    return "%.1f" % (100. * series.mean())


def _pct_with_ci(series):
    """Format a proportion and its 95% CI (via ``propci``) as 'p (lo-hi)' in percent."""
    # Series.mean() skips NaN, so the matching denominator for the interval is
    # the number of non-missing values, not the number of rows in the frame.
    n_observed = series.count()
    if n_observed == 0:
        # Nothing to estimate from; avoid dividing by zero inside propci.
        return "%.1f (%.1f-%.1f)" % (np.nan, np.nan, np.nan)
    return "%.1f (%.1f-%.1f)" % tuple(100 * x for x in propci(series.mean(), n_observed))


def build_features(df):
    """Build the Table 1 row values for one group of patients.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per patient. Columns read: ``age``, ``sex``, ``smoker``,
        ``hx_data``, ``hypertension``, ``type2_diabetes``, ``obesity``,
        ``cad``, the ``race_*`` and ``eth_*`` indicators, ``died`` and
        ``intubated``. Percentages are column means times 100, so the
        indicator columns are assumed to be 0/1 coded -- TODO confirm.

    Returns
    -------
    dict
        Maps a feature id (e.g. ``'age'``, ``'htn'``, ``'death'``) to a dict
        with a display ``'name'`` and a pre-formatted string ``'value'``.
        Missing values are ignored by the means; the outcome rows
        (``'death'``, ``'intub'``) report the proportion with a 95%
        confidence interval computed over the non-missing observations.
    """
    features = {
        'n': {'value': "%d" % df.shape[0], 'name': 'N'},
        # NOTE(review): the central value is the mean, shown next to the
        # 25th-75th percentile range -- confirm whether the median is intended.
        'age': {
            'name': 'Age (IQR)',
            'value': "%.1f (%.1f-%.1f)" % (
                df['age'].mean(),
                df['age'].quantile(q=0.25),
                df['age'].quantile(q=0.75),
            ),
        },
        'sex': {'name': 'Sex (% Male)', 'value': _pct(df['sex'])},
        'smoker': {'name': 'Past/Current Smoker (%)', 'value': _pct(df['smoker'])},
        'hx_data': {'name': 'Data Source Historical (%)', 'value': _pct(df['hx_data'])},

        'htn': {'name': 'Hypertension (%)', 'value': _pct(df['hypertension'])},
        't2dm': {'name': 'Type 2 Diabetes (%)', 'value': _pct(df['type2_diabetes'])},
        'ob': {'name': 'Obesity (%)', 'value': _pct(df['obesity'])},
        'cad': {'name': 'CAD (%)', 'value': _pct(df['cad'])},

        'race_asian': {'name': 'Asian (%)', 'value': _pct(df['race_asian'])},
        'race_black': {'name': 'Black/African American (%)', 'value': _pct(df['race_black'])},
        'race_white': {'name': 'White (%)', 'value': _pct(df['race_white'])},
        'race_other': {'name': 'Other (%)', 'value': _pct(df['race_other'])},
        'race_declined': {'name': 'Declined (%)', 'value': _pct(df['race_declined'])},

        'eth_hispanic': {'name': 'Hispanic (%)', 'value': _pct(df['eth_hispanic'])},
        'eth_nonhispanic': {'name': 'Not Hispanic (%)', 'value': _pct(df['eth_nonhispanic'])},
        'eth_declinedother': {'name': 'Declined/Other (%)', 'value': _pct(df['eth_declinedother'])},

        'death': {'name': 'Mortality', 'value': _pct_with_ci(df['died'])},
        'intub': {'name': 'Mech Ventilation', 'value': _pct_with_ci(df['intubated'])},
    }
    return features

# Preview the generated rows for the full cohort as a quick sanity check.
# To inspect a single subgroup instead, pass a filtered frame,
# e.g. build_features(df[df['died']==1]).
build_features(df)


{'n': {'value': '11116', 'name': 'N'},
 'age': {'name': 'Age (IQR)', 'value': '52.0 (34.7-69.5)'},
 'sex': {'name': 'Sex (% Male)', 'value': '44.8'},
 'smoker': {'name': 'Past/Current Smoker (%)', 'value': '26.8'},
 'hx_data': {'name': 'Data Source Historical (%)', 'value': '61.7'},
 'htn': {'name': 'Hypertension (%)', 'value': '28.2'},
 't2dm': {'name': 'Type 2 Daibetes (%)', 'value': '12.6'},
 'ob': {'name': 'Obesity (%)', 'value': '12.0'},
 'cad': {'name': 'CAD (%)', 'value': '26.8'},
 'race_asian': {'name': 'Asian (%)', 'value': '2.7'},
 'race_black': {'name': 'Black/African American (%)', 'value': '21.2'},
 'race_white': {'name': 'White (%)', 'value': '31.3'},
 'race_other': {'name': 'Other (%)', 'value': '26.6'},
 'race_declined': {'name': 'Declined (%)', 'value': '18.1'},
 'eth_hispanic': {'name': 'Hispanic (%)', 'value': '31.8'},
 'eth_nonhispanic': {'name': 'Not Hispanic (%)', 'value': '39.5'},
 'eth_declinedother': {'name': 'Declined/Other (%)', 'value': '28.7'},
 'death': {'

In [29]:
# Row ids (keys of the dict returned by build_features), grouped by table section.
demogs = [
    'n', 'age', 'sex', 'smoker', 'hx_data',
    'race_asian', 'race_black', 'race_white', 'race_other', 'race_declined',
    'eth_hispanic', 'eth_nonhispanic', 'eth_declinedother',
]
covars = ['htn', 't2dm', 'ob', 'cad']
outcomes = ['intub', 'death']

# Table columns. The first two use whole cohorts; every remaining column is the
# subset of covidpos whose indicator column equals 1. Insertion order is kept
# identical to the original literal because later cells iterate over the keys.
table_data = {
    'all': {
        'name': 'All Patients',
        'features': build_features(df),
    },
    'covidpos': {
        'name': 'SARS-Cov-2 Positive (C19+)',
        'features': build_features(covidpos),
    },
}
table_data.update({
    group_key: {
        'name': label,
        'features': build_features(covidpos[covidpos[indicator] == 1]),
    }
    # (group key, column label, indicator column in covidpos)
    for group_key, label, indicator in [
        ('intubated', 'Intubated and C19+', 'intubated'),
        ('death', 'Mortality and C19+', 'died'),
        ('macula', 'Macula and C19+', 'macula'),
        ('compl_def', 'Complement Def. and C19+', 'compl_def'),
        ('coagulation', 'Coagulation and C19+', 'coagulation'),
        ('hypertension', 'Hypertension and C19+', 'hypertension'),
        ('type2_diabetes', 'Type 2 Diabetes and C19+', 'type2_diabetes'),
        ('obesity', 'Obesity and C19+', 'obesity'),
        ('cad', 'Coronary Artery Disease and C19+', 'cad'),
        ('age_over_65', 'Over 65 Years', 'age_over_65'),
        ('refctrl', 'Cough (Reference) and C19+', 'refctrl'),
        ('smoker', 'Smokers and C19+', 'smoker'),
    ]
})

In [30]:
# Column order of the output table. Both the header row and every data row are
# generated from this one list so that labels and values cannot drift apart.
groups = ['all', 'covidpos', 'intubated', 'death', 'macula', 'compl_def', 'coagulation', 'hypertension', 'type2_diabetes', 'obesity', 'cad', 'age_over_65', 'refctrl', 'smoker']

# newline='' lets the csv module control line endings (otherwise Windows gets
# blank lines between rows); the with-block closes the file even on error.
with open('results/table1_sectionA_%s_%s.csv' % (_version, date_retrieved), 'w', newline='') as outfh:
    writer = csv.writer(outfh)

    # Header: one label per patient group
    writer.writerow(['Variable'] + [table_data[group]['name'] for group in groups])

    # The basic statistics rows
    for row in (demogs + covars + outcomes):
        # The display name of a row is the same in every group; take it from the first.
        row_name = table_data[groups[0]]['features'][row]['name']
        row_data = [table_data[group]['features'][row]['value'] for group in groups]
        writer.writerow([row_name] + row_data)