In [1]:
import pymysql 
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
from collections import defaultdict, Counter
import _pickle as pickle
import random
from scipy.stats import epps_singleton_2samp, wasserstein_distance, ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns
from random import sample
from lifelines import KaplanMeierFitter, CoxPHFitter

In [2]:
# Build a lookup from visit id to the model prediction stored in the CSV.
# The file is read without a header row and each row is unpacked as
# (visit id, prediction); ids are cast to int so they can be used as dict keys.
predictions_table = pd.read_csv('data/rfr_model_depth_69_trees_190_preds.csv', header=None)
visit_probability = {
    int(row_visit_id): row_prob
    for row_visit_id, row_prob in np.array(predictions_table)
}

len(visit_probability)

1573113

In [3]:
# Load the pre-computed 1-year phenotype / follow-up artifacts.
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('data/phenotype_visits_1y_all.p', 'rb') as fh:
    phenotype_visits_1y_all = pickle.load(fh)
with open('data/followup_visits_1y.p', 'rb') as fh:
    followup_visits_1y = pickle.load(fh)
with open('data/followup_tm_1y.p', 'rb') as fh:
    followup_tm_1y = pickle.load(fh)

In [4]:
# Load the column-name lists for the demographic and diagnosis matrices.
# Each file is opened in a `with` block so its handle is closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('data/demographic_variables.p', 'rb') as fh:
    demographics_variables = pickle.load(fh)
with open('data/diag_variables.p', 'rb') as fh:
    diag_variables = pickle.load(fh)

In [5]:
# Set of the keys of followup_tm_1y; the matrix loaders below keep only
# rows whose visit id is in this set.
all_visit_ids = set(followup_tm_1y)

len(all_visit_ids)

525287

In [6]:
# Load the demographic feature row of every visit in `all_visit_ids`.
# Each CSV line is "<visit_id>,<int>,<int>,..."; rows for other visits are skipped.
demographics_data = {}

# The context manager closes the handle even if parsing fails part-way.
# `file` stays bound afterwards, so a later `file.close()` is a harmless no-op.
with open('data/all_visit_demographic_matrix.csv', 'r') as file:
    for line in tqdm(file):
        # Split once and reuse the fields for both the id and the values.
        fields = line.strip().split(',')
        v_id = int(fields[0])
        if v_id not in all_visit_ids:
            continue
        demographics_data[v_id] = [int(i) for i in fields[1:]]

1573113it [00:03, 468830.80it/s]


In [7]:
# Close the demographics matrix CSV handle bound to `file`
# (calling close() on an already-closed file is a no-op).
file.close()

In [8]:
# Load the diagnosis feature row of every visit in `all_visit_ids`.
# Each CSV line is "<visit_id>,<int>,<int>,..."; rows for other visits are skipped.
diag_data = {}

# The context manager closes the handle even if parsing fails part-way.
# `file` stays bound afterwards, so a later `file.close()` is a harmless no-op.
with open('data/all_visit_diag_matrix.csv', 'r') as file:
    for line in tqdm(file):
        # Split once and reuse the fields for both the id and the values.
        fields = line.strip().split(',')
        v_id = int(fields[0])
        if v_id not in all_visit_ids:
            continue
        diag_data[v_id] = [int(i) for i in fields[1:]]

1573113it [02:26, 10747.44it/s]


In [9]:
# Close the diagnosis matrix CSV handle bound to `file`
# (calling close() on an already-closed file is a no-op).
file.close()

In [10]:
# Number of visits loaded into each feature table.
tuple(len(feature_table) for feature_table in (demographics_data, diag_data))

(525287, 525287)

In [11]:
# Collect, for every visit in `all_visit_ids`, the distinct phecodes listed in the
# previous-conditions CSV. Column 0 holds the phecode and column 1 the visit id.
# Values are lists without duplicates, in first-seen order.
previous_conditions = defaultdict(list)

# The context manager closes the handle even if parsing fails part-way.
# `file` stays bound afterwards, so a later `file.close()` is a harmless no-op.
with open('data/previous_conditions_phe.csv', 'r') as file:
    for line in tqdm(file):
        # Skip the header (any line that mentions the column name).
        if 'phecode' in line:
            continue
        # Split once and reuse the fields for both columns.
        fields = line.split(',')
        v_id = int(fields[1])
        if v_id not in all_visit_ids:
            continue
        phe = float(fields[0])
        if phe not in previous_conditions[v_id]:
            previous_conditions[v_id].append(phe)
        
    

780740814it [13:10, 987574.00it/s] 


In [12]:
# Close the previous-conditions CSV handle bound to `file`
# (calling close() on an already-closed file is a no-op).
file.close()

In [13]:
def _cases_within_1y(phecode):
    """Return an (n, 2) array of [visit_id, days_to_diagnosis] rows for `phecode`.

    Iterates the (visit_id, diag_date, ed_date) triples stored under
    `phenotype_visits_1y_all[phecode]`, drops visits that have no entry in
    `visit_probability`, and keeps rows where `(diag_date - ed_date).days <= 365`.
    A visit id may appear in several rows.

    NOTE(review): there is no lower bound on the day offset, so rows with a
    negative offset (diagnosis dated before ed_date) are kept; confirm this is intended.
    """
    rows = []
    for visit_id, diag_date, ed_date in phenotype_visits_1y_all[phecode]:
        if visit_id not in visit_probability:
            continue
        days_to_diag = (diag_date - ed_date).days
        if days_to_diag <= 365:
            rows.append([visit_id, days_to_diag])
    # reshape keeps the result two-dimensional even when no row qualifies,
    # so the `[:, 0]` / `[:, 1]` column slices never fail on an empty phenotype.
    return np.array(pd.DataFrame(rows)).reshape(-1, 2)


# Ids from the first column of followup_visits_1y, built once and shared by
# all four phenotypes instead of being rebuilt for each.
_followup_visit_ids = set(followup_visits_1y[:,0])

# For each phenotype: the case rows, and the follow-up visits that are not cases.
cases_411_2 = _cases_within_1y('411.2')
non_cases_411_2 = list(_followup_visit_ids - set(cases_411_2[:,0]))

cases_591 = _cases_within_1y('591')
non_cases_591 = list(_followup_visit_ids - set(cases_591[:,0]))

cases_585_1 = _cases_within_1y('585.1')
non_cases_585_1 = list(_followup_visit_ids - set(cases_585_1[:,0]))

cases_250_2 = _cases_within_1y('250.2')
non_cases_250_2 = list(_followup_visit_ids - set(cases_250_2[:,0]))

In [14]:
# Row counts of the four case arrays, followed by the sizes of the four non-case lists.
tuple(
    len(group)
    for group in (
        cases_411_2, cases_591, cases_585_1, cases_250_2,
        non_cases_411_2, non_cases_591, non_cases_585_1, non_cases_250_2,
    )
)

(11402, 28301, 38400, 47182, 523983, 522909, 522213, 520660)

In [20]:
# For each phecode, count how many follow-up visits list it among their previous conditions.
condition_cts = Counter()
for visit_id in set(followup_visits_1y[:,0]):
    # De-duplicate per visit so a phecode is counted at most once per visit.
    condition_cts.update(set(previous_conditions[visit_id]))

In [21]:
# Table of the ten phecodes with the highest visit counts.
condition_counts_table = pd.DataFrame(list(condition_cts.items()), columns=['phe', 'cts'])
condition_counts_table.sort_values(by='cts', ascending=False).head(10)


Unnamed: 0,phe,cts
4,401.1,226903
0,512.7,178181
3,272.1,156472
223,646.0,154584
84,512.8,149556
72,760.0,138735
212,1009.0,132551
5,530.11,126552
124,339.0,117382
20,508.0,115328


In [22]:
# The same top-ten ranking, reduced to a plain list of phecodes.
(
    pd.DataFrame(list(condition_cts.items()), columns=['phe', 'cts'])
    .sort_values(by='cts', ascending=False)['phe']
    .head(10)
    .tolist()
)


[401.1, 512.7, 272.1, 646.0, 512.8, 760.0, 1009.0, 530.11, 339.0, 508.0]

In [24]:
# The ten most frequent previous-condition phecodes, copied from the ranking
# displayed earlier in this notebook. Defined once so the indicator columns and
# their names cannot drift apart.
TOP_10_PHECODES = [401.1, 512.7, 272.1, 646.0, 512.8, 760.0, 1009.0, 530.11, 339.0, 508.0]

# visit id -> list of ten 0/1 flags, one per phecode in TOP_10_PHECODES (same order).
previous_conditions_top_10 = {}

for visit_id in tqdm(set(followup_visits_1y[:,0])):
    # Convert to a set once so each of the ten membership tests is O(1).
    visit_conditions = set(previous_conditions[visit_id])
    previous_conditions_top_10[visit_id] = [int(phe in visit_conditions) for phe in TOP_10_PHECODES]

# Column names for the ten indicator flags.
previous_top_10_variables = [str(phe) for phe in TOP_10_PHECODES]

100%|██████████| 525287/525287 [00:09<00:00, 52843.59it/s] 


In [37]:
# Multivariable Cox proportional-hazards fit for phecode 411.2.
# Each row holds: the demographic values with the first column dropped, the ten
# previous-condition flags, covid_prob, the event flag (`phenotype`: 1 for cases,
# 0 for non-cases) and the duration in `days`.
# NOTE(review): the first demographic column is presumably dropped as the
# reference category — confirm.

# Earliest days-to-diagnosis per case visit, found in one pass over the case rows
# instead of re-scanning the whole array for every visit id.
first_diag_day = {}
for case_visit_id, case_days in cases_411_2:
    if case_visit_id not in first_diag_day or case_days < first_diag_day[case_visit_id]:
        first_diag_day[case_visit_id] = case_days

cox_rows = []
for visit_id in set(cases_411_2[:,0]):
    cox_rows.append(
        demographics_data[visit_id][1:]
        + previous_conditions_top_10[visit_id]
        + [visit_probability[visit_id], 1, first_diag_day[visit_id]]
    )

for visit_id in non_cases_411_2:
    # Non-cases get the smallest of 365 and the values in followup_tm_1y[visit_id]
    # (presumably follow-up times in days — confirm).
    censor_day = min([365] + followup_tm_1y[visit_id])
    cox_rows.append(
        demographics_data[visit_id][1:]
        + previous_conditions_top_10[visit_id]
        + [visit_probability[visit_id], 0, censor_day]
    )

coxph_model_data = pd.DataFrame(cox_rows, columns=(demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']))
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/411_2_multi_cox.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,525287
number of events observed,1304
partial log-likelihood,-14539.30
time fit was run,2022-06-14 16:58:57 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,-1.6,0.2,0.52,-2.62,-0.58,0.07,0.56,-3.08,<0.005,8.91
age_adult,-0.04,0.96,0.16,-0.34,0.27,0.71,1.31,-0.23,0.81,0.3
age_senior,0.76,2.14,0.15,0.46,1.06,1.58,2.89,4.92,<0.005,20.17
sex_1,-0.67,0.51,0.06,-0.79,-0.56,0.45,0.57,-11.22,<0.005,94.64
AMERICAN INDIAN OR ALASKA,1.36,3.89,0.26,0.85,1.87,2.33,6.49,5.19,<0.005,22.2
ASIAN,-1.1,0.33,0.28,-1.65,-0.54,0.19,0.58,-3.87,<0.005,13.19
BLACK OR AFRICAN AMERICAN,-0.29,0.75,0.09,-0.47,-0.11,0.63,0.9,-3.19,<0.005,9.43
NAT.HAWAIIAN/OTH.PACIFIC,0.28,1.32,0.71,-1.11,1.67,0.33,5.31,0.4,0.69,0.53
WHITE,-0.25,0.78,0.07,-0.37,-0.12,0.69,0.89,-3.7,<0.005,12.19
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.27,0.77,0.07,-0.41,-0.13,0.67,0.88,-3.74,<0.005,12.4

0,1
Concordance,0.84
Partial AIC,29120.60
log-likelihood ratio test,2091.95 on 21 df
-log2(p) of ll-ratio test,inf


In [38]:
# Multivariable Cox proportional-hazards fit for phecode 591.
# Each row holds: the demographic values with the first column dropped, the ten
# previous-condition flags, covid_prob, the event flag (`phenotype`: 1 for cases,
# 0 for non-cases) and the duration in `days`.
# NOTE(review): the first demographic column is presumably dropped as the
# reference category — confirm.

# Earliest days-to-diagnosis per case visit, found in one pass over the case rows
# instead of re-scanning the whole array for every visit id.
first_diag_day = {}
for case_visit_id, case_days in cases_591:
    if case_visit_id not in first_diag_day or case_days < first_diag_day[case_visit_id]:
        first_diag_day[case_visit_id] = case_days

cox_rows = []
for visit_id in set(cases_591[:,0]):
    cox_rows.append(
        demographics_data[visit_id][1:]
        + previous_conditions_top_10[visit_id]
        + [visit_probability[visit_id], 1, first_diag_day[visit_id]]
    )

for visit_id in non_cases_591:
    # Non-cases get the smallest of 365 and the values in followup_tm_1y[visit_id]
    # (presumably follow-up times in days — confirm).
    censor_day = min([365] + followup_tm_1y[visit_id])
    cox_rows.append(
        demographics_data[visit_id][1:]
        + previous_conditions_top_10[visit_id]
        + [visit_probability[visit_id], 0, censor_day]
    )

coxph_model_data = pd.DataFrame(cox_rows, columns=(demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']))
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/591_multi_cox.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,525287
number of events observed,2378
partial log-likelihood,-27632.49
time fit was run,2022-06-14 16:59:19 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,-0.76,0.47,0.17,-1.1,-0.43,0.33,0.65,-4.49,<0.005,17.1
age_adult,-0.5,0.61,0.08,-0.66,-0.34,0.52,0.71,-6.12,<0.005,29.98
age_senior,-0.4,0.67,0.09,-0.56,-0.23,0.57,0.8,-4.62,<0.005,17.99
sex_1,0.34,1.41,0.04,0.25,0.43,1.29,1.53,7.68,<0.005,45.83
AMERICAN INDIAN OR ALASKA,0.02,1.02,0.38,-0.72,0.76,0.49,2.14,0.05,0.96,0.06
ASIAN,0.17,1.19,0.14,-0.09,0.44,0.91,1.55,1.26,0.21,2.28
BLACK OR AFRICAN AMERICAN,0.08,1.08,0.06,-0.04,0.21,0.96,1.23,1.28,0.20,2.33
NAT.HAWAIIAN/OTH.PACIFIC,-0.24,0.79,0.71,-1.62,1.15,0.2,3.16,-0.33,0.74,0.44
WHITE,0.12,1.13,0.05,0.03,0.22,1.03,1.25,2.48,0.01,6.26
HISPANIC OR LATINO OR SPANISH ORIGIN,0.1,1.1,0.05,0.0,0.2,1.0,1.22,2.01,0.04,4.48

0,1
Concordance,0.80
Partial AIC,55306.97
log-likelihood ratio test,2446.04 on 21 df
-log2(p) of ll-ratio test,inf


In [39]:
# Multivariable Cox proportional-hazards fit for phecode 585.1.
# Each row holds: the demographic values with the first column dropped, the ten
# previous-condition flags, covid_prob, the event flag (`phenotype`: 1 for cases,
# 0 for non-cases) and the duration in `days`.
# NOTE(review): the first demographic column is presumably dropped as the
# reference category — confirm.

# Earliest days-to-diagnosis per case visit, found in one pass over the case rows
# instead of re-scanning the whole array for every visit id.
first_diag_day = {}
for case_visit_id, case_days in cases_585_1:
    if case_visit_id not in first_diag_day or case_days < first_diag_day[case_visit_id]:
        first_diag_day[case_visit_id] = case_days

cox_rows = []
for visit_id in set(cases_585_1[:,0]):
    cox_rows.append(
        demographics_data[visit_id][1:]
        + previous_conditions_top_10[visit_id]
        + [visit_probability[visit_id], 1, first_diag_day[visit_id]]
    )

for visit_id in non_cases_585_1:
    # Non-cases get the smallest of 365 and the values in followup_tm_1y[visit_id]
    # (presumably follow-up times in days — confirm).
    censor_day = min([365] + followup_tm_1y[visit_id])
    cox_rows.append(
        demographics_data[visit_id][1:]
        + previous_conditions_top_10[visit_id]
        + [visit_probability[visit_id], 0, censor_day]
    )

coxph_model_data = pd.DataFrame(cox_rows, columns=(demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']))
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/585_1_multi_cox.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,525287
number of events observed,3074
partial log-likelihood,-32861.58
time fit was run,2022-06-14 16:59:51 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,1.03,2.8,0.18,0.67,1.39,1.96,4.02,5.61,<0.005,25.55
age_adult,0.97,2.64,0.14,0.7,1.24,2.01,3.47,7.0,<0.005,38.53
age_senior,1.34,3.81,0.14,1.06,1.61,2.9,5.0,9.61,<0.005,70.16
sex_1,-0.54,0.58,0.04,-0.61,-0.46,0.54,0.63,-14.09,<0.005,147.27
AMERICAN INDIAN OR ALASKA,0.16,1.18,0.33,-0.49,0.82,0.61,2.26,0.48,0.63,0.67
ASIAN,0.56,1.75,0.1,0.37,0.75,1.45,2.11,5.87,<0.005,27.77
BLACK OR AFRICAN AMERICAN,0.29,1.33,0.05,0.18,0.39,1.2,1.48,5.32,<0.005,23.19
NAT.HAWAIIAN/OTH.PACIFIC,-0.44,0.64,0.71,-1.83,0.94,0.16,2.57,-0.63,0.53,0.91
WHITE,0.03,1.03,0.05,-0.06,0.12,0.94,1.13,0.62,0.54,0.9
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.32,0.73,0.05,-0.41,-0.23,0.66,0.8,-6.85,<0.005,36.93

0,1
Concordance,0.92
Partial AIC,65765.17
log-likelihood ratio test,10099.53 on 21 df
-log2(p) of ll-ratio test,inf


In [40]:
# Multivariable Cox proportional-hazards fit for phecode 250.2.
# Each row holds: the demographic values with the first column dropped, the ten
# previous-condition flags, covid_prob, the event flag (`phenotype`: 1 for cases,
# 0 for non-cases) and the duration in `days`.
# NOTE(review): the first demographic column is presumably dropped as the
# reference category — confirm.

# Earliest days-to-diagnosis per case visit, found in one pass over the case rows
# instead of re-scanning the whole array for every visit id.
first_diag_day = {}
for case_visit_id, case_days in cases_250_2:
    if case_visit_id not in first_diag_day or case_days < first_diag_day[case_visit_id]:
        first_diag_day[case_visit_id] = case_days

cox_rows = []
for visit_id in set(cases_250_2[:,0]):
    cox_rows.append(
        demographics_data[visit_id][1:]
        + previous_conditions_top_10[visit_id]
        + [visit_probability[visit_id], 1, first_diag_day[visit_id]]
    )

for visit_id in non_cases_250_2:
    # Non-cases get the smallest of 365 and the values in followup_tm_1y[visit_id]
    # (presumably follow-up times in days — confirm).
    censor_day = min([365] + followup_tm_1y[visit_id])
    cox_rows.append(
        demographics_data[visit_id][1:]
        + previous_conditions_top_10[visit_id]
        + [visit_probability[visit_id], 0, censor_day]
    )

coxph_model_data = pd.DataFrame(cox_rows, columns=(demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']))
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/250_2_multi_cox.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,525287
number of events observed,4627
partial log-likelihood,-51277.38
time fit was run,2022-06-14 17:00:01 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,2.24,9.43,0.46,1.35,3.14,3.84,23.16,4.89,<0.005,19.95
age_adult,3.32,27.68,0.41,2.52,4.12,12.4,61.76,8.11,<0.005,50.79
age_senior,3.65,38.36,0.41,2.84,4.45,17.18,85.64,8.9,<0.005,60.65
sex_1,-0.09,0.91,0.03,-0.15,-0.03,0.86,0.97,-2.99,<0.005,8.49
AMERICAN INDIAN OR ALASKA,0.32,1.38,0.24,-0.14,0.79,0.87,2.2,1.37,0.17,2.56
ASIAN,-0.08,0.92,0.09,-0.26,0.1,0.77,1.11,-0.87,0.38,1.39
BLACK OR AFRICAN AMERICAN,-0.2,0.82,0.05,-0.29,-0.11,0.75,0.9,-4.27,<0.005,15.65
NAT.HAWAIIAN/OTH.PACIFIC,0.05,1.05,0.41,-0.75,0.85,0.47,2.34,0.12,0.90,0.15
WHITE,-0.26,0.77,0.04,-0.34,-0.19,0.72,0.82,-7.26,<0.005,41.28
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.23,0.79,0.04,-0.31,-0.16,0.74,0.85,-6.24,<0.005,31.05

0,1
Concordance,0.87
Partial AIC,102596.76
log-likelihood ratio test,9378.71 on 21 df
-log2(p) of ll-ratio test,inf


In [42]:
def _assemble_cox_frame(cases, non_cases):
    """Build the Cox model input table for one phenotype.

    Parameters
    ----------
    cases : 2-D array whose rows are [visit_id, days_to_diagnosis]; a visit id
        may appear in several rows, in which case the smallest day value is used.
    non_cases : iterable of visit ids that are not cases for this phenotype.

    Returns
    -------
    pandas.DataFrame with the full demographic columns, the ten previous-condition
    flags, and 'covid_prob', 'phenotype' (1 for cases, 0 for non-cases) and 'days'.
    Case rows come first, followed by non-case rows.
    """
    # Earliest day per case visit, found in one pass instead of re-scanning
    # the whole case array for every visit id.
    first_diag_day = {}
    for case_visit_id, case_days in cases:
        if case_visit_id not in first_diag_day or case_days < first_diag_day[case_visit_id]:
            first_diag_day[case_visit_id] = case_days

    rows = []
    for visit_id in set(cases[:,0]):
        rows.append(
            demographics_data[visit_id]
            + previous_conditions_top_10[visit_id]
            + [visit_probability[visit_id], 1, first_diag_day[visit_id]]
        )

    for visit_id in non_cases:
        # Non-cases get the smallest of 365 and the values in followup_tm_1y[visit_id]
        # (presumably follow-up times in days — confirm).
        censor_day = min([365] + followup_tm_1y[visit_id])
        rows.append(
            demographics_data[visit_id]
            + previous_conditions_top_10[visit_id]
            + [visit_probability[visit_id], 0, censor_day]
        )

    return pd.DataFrame(rows, columns=(demographics_variables + previous_top_10_variables + ['covid_prob','phenotype', 'days']))


# Write one model-input CSV per phenotype. Unlike the fitted models above, these
# tables keep every demographic column. `coxph_model_data` ends up bound to the
# last (250.2) table, as before.
for cases, non_cases, out_path in [
    (cases_411_2, non_cases_411_2, 'data/411_2_multi_cox_data.csv'),
    (cases_591, non_cases_591, 'data/591_multi_cox_data.csv'),
    (cases_585_1, non_cases_585_1, 'data/585_1_multi_cox_data.csv'),
    (cases_250_2, non_cases_250_2, 'data/250_2_multi_cox_data.csv'),
]:
    coxph_model_data = _assemble_cox_frame(cases, non_cases)
    coxph_model_data.to_csv(out_path)