In [1]:
import pymysql 
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
from collections import defaultdict, Counter
import _pickle as pickle
import random
from scipy.stats import epps_singleton_2samp, wasserstein_distance, ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns
from random import sample
from lifelines import KaplanMeierFitter, CoxPHFitter

In [2]:
# Build a visit id -> predicted value lookup from a headerless CSV whose rows
# unpack into exactly two fields (id, value). The value is used further down
# as the `covid_prob` covariate.
predictions = pd.read_csv('data/rfr_model_depth_69_trees_190_preds.csv', header=None)
visit_probability = {
    int(visit_id): prob
    for visit_id, prob in np.array(predictions)
}

len(visit_probability)

1573113

In [3]:
# Load the pickled one-year phenotype / follow-up structures.
# Each file is opened in a `with` block so the handle is closed as soon as the
# object has been read, instead of being left to the garbage collector.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles produced by a trusted pipeline.
with open('data/phenotype_visits_1y.p', 'rb') as fh:
    phenotype_visits_1y = pickle.load(fh)
with open('data/followup_visits_1y.p', 'rb') as fh:
    followup_visits_1y = pickle.load(fh)
with open('data/followup_tm_1y.p', 'rb') as fh:
    followup_tm_1y = pickle.load(fh)

In [4]:
# Load the pickled column-name lists for the demographic and diagnosis matrices.
# `with` guarantees each handle is closed right after reading.
# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load pickles produced by a trusted pipeline.
with open('data/demographic_variables.p', 'rb') as fh:
    demographics_variables = pickle.load(fh)
with open('data/diag_variables.p', 'rb') as fh:
    diag_variables = pickle.load(fh)

In [8]:
# Every visit id that has an entry in the follow-up table; the matrix loaders
# below keep only rows whose id is in this set.
all_visit_ids = {visit_id for visit_id in followup_tm_1y.keys()}

len(all_visit_ids)

525287

In [9]:
# Read the demographic matrix (one CSV row per visit: id followed by integer
# columns) and keep only the visits present in `all_visit_ids`.
demographics_data = {}

# The handle stays bound to `file` (a later cell calls file.close(), which is a
# harmless no-op on an already closed file), but `with` guarantees it is
# released even if a malformed row raises inside the loop.
with open('data/all_visit_demographic_matrix.csv', 'r') as file:
    for line in tqdm(file):
        # Only split off the leading id first, so skipped rows are not fully parsed.
        v_id = int(line.split(',', 1)[0])
        if v_id not in all_visit_ids:
            continue
        demographics_data[v_id] = [int(i) for i in line.strip().split(',')[1:]]

1573113it [00:02, 597581.00it/s]


In [10]:
# Close the handle bound to `file` by the preceding load cell
# (calling close() on an already closed file is a no-op).
file.close()

In [11]:
# Read the diagnosis matrix (one CSV row per visit: id followed by integer
# columns) and keep only the visits present in `all_visit_ids`.
diag_data = {}

# The handle stays bound to `file` (a later cell calls file.close(), which is a
# harmless no-op on an already closed file), but `with` guarantees it is
# released even if a malformed row raises inside the loop.
with open('data/all_visit_diag_matrix.csv', 'r') as file:
    for line in tqdm(file):
        # Only split off the leading id first: rows that are filtered out are
        # never tokenised in full.
        v_id = int(line.split(',', 1)[0])
        if v_id not in all_visit_ids:
            continue
        diag_data[v_id] = [int(i) for i in line.strip().split(',')[1:]]

1573113it [02:22, 11046.43it/s]


In [12]:
# Close the handle bound to `file` by the preceding load cell
# (calling close() on an already closed file is a no-op).
file.close()

In [13]:
# Row counts of the two filtered matrices, shown side by side.
tuple(len(table) for table in (demographics_data, diag_data))

(525287, 525287)

In [14]:
# Collect, per visit, the distinct phecodes listed in the previous-conditions
# CSV (column 0 = phecode, column 1 = visit id), restricted to visits in
# `all_visit_ids`. Values stay lists in first-seen order, as before.
previous_conditions = defaultdict(list)

# Companion sets give O(1) duplicate checks; scanning the per-visit list for
# every input row is needlessly slow on a file of this size.
_seen_phecodes = defaultdict(set)

# `with` guarantees the handle is released even if a row fails to parse. The
# name `file` is kept because a later cell still calls file.close() (a no-op
# once the file is closed).
with open('data/previous_conditions_phe.csv', 'r') as file:
    for line in tqdm(file):
        # Skip the header row (any line mentioning the column name).
        if 'phecode' in line:
            continue
        fields = line.split(',')
        v_id = int(fields[1])
        if v_id not in all_visit_ids:
            continue
        phe = float(fields[0])
        seen = _seen_phecodes[v_id]
        if phe not in seen:
            seen.add(phe)
            previous_conditions[v_id].append(phe)

# The helper sets are only needed while loading.
del _seen_phecodes
        
    

780740814it [13:02, 997989.44it/s] 


In [15]:
# Close the handle bound to `file` by the preceding load cell
# (calling close() on an already closed file is a no-op).
file.close()

In [16]:
# Invert the visit -> phecodes mapping into phecode -> visits.
previous_conditions_phe_visit = defaultdict(list)

for visit_id, phecodes in tqdm(previous_conditions.items()):
    for phe in phecodes:
        previous_conditions_phe_visit[phe].append(visit_id)

100%|██████████| 444611/444611 [00:04<00:00, 108221.59it/s]


In [18]:
def _split_cases(phecode, candidate_visits, window_days=365):
    """Return ``(cases, non_cases)`` for one phecode.

    Reads the notebook globals ``phenotype_visits_1y``, ``visit_probability``
    and ``previous_conditions_phe_visit``.

    Parameters
    ----------
    phecode : str
        Key into ``phenotype_visits_1y`` (e.g. ``'411.2'``); its float value is
        used to look up visits that already carried the condition.
    candidate_visits : set
        Visit ids eligible to be non-cases.
    window_days : int
        Largest diagnosis-minus-ED-date gap (in days) that counts as a case.

    Returns
    -------
    cases : numpy.ndarray
        Two columns: visit id and days from ED date to diagnosis. A visit may
        appear on several rows. Shape is ``(0, 2)`` when nothing qualifies, so
        column indexing keeps working.
    non_cases : list
        Candidate visits that are neither cases nor previously diagnosed.
    """
    rows = []
    for visit_id, diag_date, ed_date in phenotype_visits_1y[phecode]:
        if visit_id not in visit_probability:
            continue
        days_to_diag = (diag_date - ed_date).days
        if days_to_diag <= window_days:
            rows.append([visit_id, days_to_diag])

    # Naming the columns keeps the array two-dimensional even with zero rows.
    cases = np.array(pd.DataFrame(rows, columns=['visit_id', 'days']))

    # .get() avoids inserting an empty entry into the defaultdict on a miss.
    already_diagnosed = previous_conditions_phe_visit.get(float(phecode), ())
    non_cases = list((candidate_visits - set(cases[:, 0])) - set(already_diagnosed))
    return cases, non_cases


# Built once and shared by all four phenotypes.
followup_visit_ids = set(followup_visits_1y[:, 0])

cases_411_2, non_cases_411_2 = _split_cases('411.2', followup_visit_ids)
cases_591, non_cases_591 = _split_cases('591', followup_visit_ids)
cases_585_1, non_cases_585_1 = _split_cases('585.1', followup_visit_ids)
cases_250_2, non_cases_250_2 = _split_cases('250.2', followup_visit_ids)

In [19]:
# Sizes of the four case arrays followed by the four non-case lists.
tuple(
    len(group)
    for group in (
        cases_411_2, cases_591, cases_585_1, cases_250_2,
        non_cases_411_2, non_cases_591, non_cases_585_1, non_cases_250_2,
    )
)

(5642, 14059, 19604, 15156, 488748, 420121, 450594, 420652)

In [20]:
# Count, for every phecode, how many follow-up visits list it as a previous
# condition (each visit contributes a phecode at most once).
condition_cts = Counter()
for visit_id in set(followup_visits_1y[:,0]):
    condition_cts.update(set(previous_conditions[visit_id]))

In [21]:
# Ten most frequent previous conditions, as a (phe, cts) table.
condition_table = pd.DataFrame(list(condition_cts.items()), columns=['phe', 'cts'])
condition_table.sort_values(by='cts', ascending=False).head(10)


Unnamed: 0,phe,cts
4,401.1,226903
0,512.7,178181
3,272.1,156472
223,646.0,154584
84,512.8,149556
72,760.0,138735
212,1009.0,132551
5,530.11,126552
124,339.0,117382
20,508.0,115328


In [22]:
# Phecodes of the ten most frequent previous conditions, most frequent first.
ranked_conditions = pd.DataFrame(
    list(condition_cts.items()), columns=['phe', 'cts']
).sort_values(by='cts', ascending=False)
ranked_conditions['phe'].head(10).tolist()


[401.1, 512.7, 272.1, 646.0, 512.8, 760.0, 1009.0, 530.11, 339.0, 508.0]

In [23]:
# The ten phecodes used as prior-condition covariates. Defined once so the
# indicator vectors and their column names cannot drift apart.
# NOTE(review): these values match the ranking printed by the cell above;
# re-check them if the input data changes.
top_10_phecodes = [401.1, 512.7, 272.1, 646.0, 512.8, 760.0, 1009.0, 530.11, 339.0, 508.0]

# For every follow-up visit, a 0/1 indicator per phecode in `top_10_phecodes`.
previous_conditions_top_10 = {}

for visit_id in tqdm(set(followup_visits_1y[:,0])):
    # .get() avoids inserting empty lists into the defaultdict for visits with
    # no recorded history; the set gives constant-time membership tests.
    prior = set(previous_conditions.get(visit_id, ()))
    previous_conditions_top_10[visit_id] = [int(phe in prior) for phe in top_10_phecodes]

# Column names for the indicator vector, in the same order.
previous_top_10_variables = [str(phe) for phe in top_10_phecodes]

100%|██████████| 525287/525287 [00:07<00:00, 71495.95it/s] 


In [32]:
# Multivariable Cox model for phecode 411.2.
# Row layout: demographic columns (first one dropped) + top-10 prior-condition
# indicators + covid_prob + event flag + duration in days.
event_visits = cases_411_2[:,0]
event_days = cases_411_2[:,1]

# Cases: event flag 1, duration = smallest recorded days-to-diagnosis.
case_rows = [
    demographics_data[vid][1:]
    + previous_conditions_top_10[vid]
    + [visit_probability[vid], 1, min(event_days[event_visits == vid])]
    for vid in set(event_visits)
]

# Non-cases: event flag 0, duration = follow-up time capped at 365 days.
censored_rows = [
    demographics_data[vid][1:]
    + previous_conditions_top_10[vid]
    + [visit_probability[vid], 0, min([365] + followup_tm_1y[vid])]
    for vid in non_cases_411_2
]

model_columns = demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']
coxph_model_data = pd.DataFrame(case_rows + censored_rows, columns=model_columns)

cph = CoxPHFitter()
cph.fit(coxph_model_data, duration_col='days', event_col='phenotype')
cph.summary.to_csv('data/411_2_multi_cox_new.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,489537
number of events observed,789
partial log-likelihood,-8461.39
time fit was run,2022-06-14 17:00:26 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,-0.96,0.38,0.76,-2.45,0.53,0.09,1.7,-1.27,0.21,2.28
age_adult,0.66,1.94,0.29,0.1,1.23,1.1,3.44,2.29,0.02,5.5
age_senior,1.61,4.98,0.29,1.04,2.17,2.83,8.76,5.57,<0.005,25.2
sex_1,-0.76,0.47,0.08,-0.91,-0.61,0.4,0.54,-9.96,<0.005,75.16
AMERICAN INDIAN OR ALASKA,-0.81,0.44,1.0,-2.77,1.15,0.06,3.17,-0.81,0.42,1.26
ASIAN,-0.97,0.38,0.34,-1.63,-0.3,0.2,0.74,-2.83,<0.005,7.75
BLACK OR AFRICAN AMERICAN,-0.21,0.81,0.12,-0.43,0.02,0.65,1.02,-1.8,0.07,3.79
NAT.HAWAIIAN/OTH.PACIFIC,0.76,2.14,0.71,-0.63,2.15,0.53,8.61,1.07,0.28,1.82
WHITE,-0.22,0.8,0.09,-0.39,-0.05,0.68,0.95,-2.57,0.01,6.63
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.31,0.73,0.09,-0.49,-0.13,0.61,0.88,-3.33,<0.005,10.16

0,1
Concordance,0.87
Partial AIC,16964.78
log-likelihood ratio test,1622.59 on 21 df
-log2(p) of ll-ratio test,inf


In [33]:
# Multivariable Cox model for phecode 591.
# Row layout: demographic columns (first one dropped) + top-10 prior-condition
# indicators + covid_prob + event flag + duration in days.
event_visits = cases_591[:,0]
event_days = cases_591[:,1]

# Cases: event flag 1, duration = smallest recorded days-to-diagnosis.
case_rows = [
    demographics_data[vid][1:]
    + previous_conditions_top_10[vid]
    + [visit_probability[vid], 1, min(event_days[event_visits == vid])]
    for vid in set(event_visits)
]

# Non-cases: event flag 0, duration = follow-up time capped at 365 days.
censored_rows = [
    demographics_data[vid][1:]
    + previous_conditions_top_10[vid]
    + [visit_probability[vid], 0, min([365] + followup_tm_1y[vid])]
    for vid in non_cases_591
]

model_columns = demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']
coxph_model_data = pd.DataFrame(case_rows + censored_rows, columns=model_columns)

cph = CoxPHFitter()
cph.fit(coxph_model_data, duration_col='days', event_col='phenotype')
cph.summary.to_csv('data/591_multi_cox_new.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,421798
number of events observed,1677
partial log-likelihood,-18704.37
time fit was run,2022-06-14 17:00:38 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,-0.62,0.54,0.2,-1.01,-0.23,0.36,0.79,-3.14,<0.005,9.21
age_adult,-0.43,0.65,0.1,-0.63,-0.23,0.53,0.8,-4.21,<0.005,15.26
age_senior,-0.08,0.92,0.1,-0.29,0.12,0.75,1.13,-0.78,0.44,1.2
sex_1,0.49,1.63,0.05,0.39,0.59,1.47,1.81,9.3,<0.005,65.95
AMERICAN INDIAN OR ALASKA,0.26,1.3,0.41,-0.54,1.06,0.58,2.89,0.63,0.53,0.93
ASIAN,0.21,1.23,0.16,-0.11,0.53,0.89,1.7,1.26,0.21,2.27
BLACK OR AFRICAN AMERICAN,0.04,1.04,0.08,-0.11,0.19,0.9,1.21,0.55,0.59,0.77
NAT.HAWAIIAN/OTH.PACIFIC,-0.07,0.94,0.71,-1.45,1.32,0.23,3.75,-0.09,0.93,0.11
WHITE,0.16,1.17,0.06,0.04,0.28,1.05,1.32,2.72,0.01,7.24
HISPANIC OR LATINO OR SPANISH ORIGIN,0.14,1.15,0.06,0.02,0.25,1.02,1.29,2.35,0.02,5.75

0,1
Concordance,0.82
Partial AIC,37450.74
log-likelihood ratio test,2158.53 on 21 df
-log2(p) of ll-ratio test,inf


In [34]:
# Multivariable Cox model for phecode 585.1.
# Row layout: demographic columns (first one dropped) + top-10 prior-condition
# indicators + covid_prob + event flag + duration in days.
event_visits = cases_585_1[:,0]
event_days = cases_585_1[:,1]

# Cases: event flag 1, duration = smallest recorded days-to-diagnosis.
case_rows = [
    demographics_data[vid][1:]
    + previous_conditions_top_10[vid]
    + [visit_probability[vid], 1, min(event_days[event_visits == vid])]
    for vid in set(event_visits)
]

# Non-cases: event flag 0, duration = follow-up time capped at 365 days.
censored_rows = [
    demographics_data[vid][1:]
    + previous_conditions_top_10[vid]
    + [visit_probability[vid], 0, min([365] + followup_tm_1y[vid])]
    for vid in non_cases_585_1
]

model_columns = demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']
coxph_model_data = pd.DataFrame(case_rows + censored_rows, columns=model_columns)

cph = CoxPHFitter()
cph.fit(coxph_model_data, duration_col='days', event_col='phenotype')
cph.summary.to_csv('data/585_1_multi_cox_new.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,452706
number of events observed,2112
partial log-likelihood,-21384.45
time fit was run,2022-06-14 17:01:04 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,1.09,2.97,0.22,0.66,1.51,1.94,4.55,5.0,<0.005,20.74
age_adult,0.99,2.7,0.16,0.67,1.31,1.95,3.72,6.02,<0.005,29.12
age_senior,1.34,3.83,0.16,1.02,1.67,2.77,5.29,8.14,<0.005,51.2
sex_1,-0.6,0.55,0.05,-0.69,-0.51,0.5,0.6,-13.01,<0.005,126.12
AMERICAN INDIAN OR ALASKA,0.22,1.24,0.41,-0.59,1.02,0.56,2.77,0.53,0.60,0.75
ASIAN,0.14,1.15,0.15,-0.15,0.43,0.86,1.54,0.93,0.35,1.51
BLACK OR AFRICAN AMERICAN,0.38,1.46,0.06,0.25,0.5,1.29,1.66,5.95,<0.005,28.5
NAT.HAWAIIAN/OTH.PACIFIC,-1.07,0.34,1.0,-3.03,0.89,0.05,2.45,-1.07,0.29,1.8
WHITE,-0.03,0.97,0.06,-0.13,0.08,0.87,1.09,-0.46,0.64,0.64
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.29,0.75,0.06,-0.4,-0.18,0.67,0.84,-5.17,<0.005,22.03

0,1
Concordance,0.94
Partial AIC,42810.89
log-likelihood ratio test,8285.42 on 21 df
-log2(p) of ll-ratio test,inf


In [35]:
# Multivariable Cox model for phecode 250.2.
# Row layout: demographic columns (first one dropped) + top-10 prior-condition
# indicators + covid_prob + event flag + duration in days.
event_visits = cases_250_2[:,0]
event_days = cases_250_2[:,1]

# Cases: event flag 1, duration = smallest recorded days-to-diagnosis.
case_rows = [
    demographics_data[vid][1:]
    + previous_conditions_top_10[vid]
    + [visit_probability[vid], 1, min(event_days[event_visits == vid])]
    for vid in set(event_visits)
]

# Non-cases: event flag 0, duration = follow-up time capped at 365 days.
censored_rows = [
    demographics_data[vid][1:]
    + previous_conditions_top_10[vid]
    + [visit_probability[vid], 0, min([365] + followup_tm_1y[vid])]
    for vid in non_cases_250_2
]

model_columns = demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']
coxph_model_data = pd.DataFrame(case_rows + censored_rows, columns=model_columns)

cph = CoxPHFitter()
cph.fit(coxph_model_data, duration_col='days', event_col='phenotype')
cph.summary.to_csv('data/250_2_multi_cox_new.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,423193
number of events observed,2541
partial log-likelihood,-26238.56
time fit was run,2022-06-14 17:01:12 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,1.51,4.54,0.56,0.42,2.61,1.52,13.54,2.71,0.01,7.22
age_adult,2.88,17.85,0.45,2.0,3.76,7.4,43.08,6.41,<0.005,32.69
age_senior,3.51,33.46,0.45,2.63,4.39,13.86,80.76,7.81,<0.005,47.28
sex_1,-0.23,0.8,0.04,-0.31,-0.15,0.73,0.86,-5.49,<0.005,24.56
AMERICAN INDIAN OR ALASKA,0.01,1.01,0.38,-0.74,0.75,0.48,2.12,0.02,0.99,0.02
ASIAN,0.09,1.1,0.12,-0.14,0.33,0.87,1.39,0.79,0.43,1.21
BLACK OR AFRICAN AMERICAN,-0.06,0.94,0.06,-0.18,0.06,0.84,1.06,-0.99,0.32,1.64
NAT.HAWAIIAN/OTH.PACIFIC,0.31,1.37,0.45,-0.57,1.19,0.57,3.29,0.7,0.49,1.04
WHITE,-0.46,0.63,0.05,-0.56,-0.36,0.57,0.7,-9.26,<0.005,65.43
HISPANIC OR LATINO OR SPANISH ORIGIN,0.01,1.01,0.05,-0.09,0.1,0.91,1.11,0.11,0.92,0.13

0,1
Concordance,0.91
Partial AIC,52519.12
log-likelihood ratio test,7111.72 on 21 df
-log2(p) of ll-ratio test,inf


In [37]:
def _build_cox_frame(cases, non_cases):
    """Assemble the Cox design matrix for one phenotype.

    Reads the notebook globals ``demographics_data``,
    ``previous_conditions_top_10``, ``visit_probability``, ``followup_tm_1y``,
    ``demographics_variables`` and ``previous_top_10_variables``.

    Parameters
    ----------
    cases : numpy.ndarray
        Two columns: visit id and days to diagnosis. A visit may appear on
        several rows; the smallest day count is used.
    non_cases : list
        Visit ids without the diagnosis; their duration is the follow-up time
        capped at 365 days.

    Returns
    -------
    pandas.DataFrame
        One row per visit: all demographic columns, the top-10 prior-condition
        indicators, ``covid_prob``, ``phenotype`` (1 = case, 0 = non-case) and
        ``days``.
    """
    # Earliest diagnosis day per case visit, gathered in a single pass instead
    # of re-scanning the whole array with a boolean mask for every visit.
    first_event_day = {}
    for visit_id, days in cases:
        if visit_id not in first_event_day or days < first_event_day[visit_id]:
            first_event_day[visit_id] = days

    rows = []
    for visit_id in set(cases[:,0]):
        rows.append(
            demographics_data[visit_id]
            + previous_conditions_top_10[visit_id]
            + [visit_probability[visit_id], 1, first_event_day[visit_id]]
        )
    for visit_id in non_cases:
        rows.append(
            demographics_data[visit_id]
            + previous_conditions_top_10[visit_id]
            + [visit_probability[visit_id], 0, min([365] + followup_tm_1y[visit_id])]
        )

    columns = demographics_variables + previous_top_10_variables + ['covid_prob','phenotype', 'days']
    return pd.DataFrame(rows, columns=columns)


# Write one CSV per phenotype. Unlike the fitting cells, these frames keep the
# first demographic column. `coxph_model_data` ends up holding the last frame
# (250.2), as it did before.
for _cases, _non_cases, _out_path in (
    (cases_411_2, non_cases_411_2, 'data/411_2_multi_cox_data_new.csv'),
    (cases_591, non_cases_591, 'data/591_multi_cox_data_new.csv'),
    (cases_585_1, non_cases_585_1, 'data/585_1_multi_cox_data_new.csv'),
    (cases_250_2, non_cases_250_2, 'data/250_2_multi_cox_data_new.csv'),
):
    coxph_model_data = _build_cox_frame(_cases, _non_cases)
    coxph_model_data.to_csv(_out_path)