In [1]:
import pymysql 
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
from collections import defaultdict, Counter
import _pickle as pickle
import random
from scipy.stats import epps_singleton_2samp, wasserstein_distance, ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns
from random import sample
from lifelines import KaplanMeierFitter, CoxPHFitter

In [2]:
# Map each visit id (first CSV column, cast to int) to the value stored in the
# second column of the calibrated-predictions file, which is read without a
# header row.  A visit id that occurs more than once keeps its last value.
visit_probability = {
    int(visit_id): prob
    for visit_id, prob in np.array(
        pd.read_csv('data/all_data_predictions_calibrated.csv', header=None)
    )
}

len(visit_probability)

1573113

In [3]:
# Load the pre-computed one-year follow-up artefacts.  Each file is opened in a
# `with` block so its handle is closed as soon as the object is deserialised.
# NOTE: unpickling can execute arbitrary code -- only load files you produced
# yourself or otherwise trust.
with open('data/phenotype_visits_1y_all.p', 'rb') as fh:
    phenotype_visits_1y_all = pickle.load(fh)
with open('data/followup_visits_1y.p', 'rb') as fh:
    followup_visits_1y = pickle.load(fh)
with open('data/followup_tm_1y.p', 'rb') as fh:
    followup_tm_1y = pickle.load(fh)

In [4]:
# Column-name lists for the demographic and diagnosis matrices.  Files are
# opened in `with` blocks so the handles are closed right after loading.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('../vfinal_1/data/demographic_variables.p', 'rb') as fh:
    demographics_variables = pickle.load(fh)
with open('../vfinal_1/data/diag_variables.p', 'rb') as fh:
    diag_variables = pickle.load(fh)

In [5]:
# Set of the keys of `followup_tm_1y`; the CSV-reading cells below keep only
# rows whose visit id is in this set.
all_visit_ids = set(followup_tm_1y.keys())

# Display how many visit ids were collected.
len(all_visit_ids)

525287

In [6]:
# visit id -> list of integer demographic indicators, restricted to the visits
# in `all_visit_ids`.
demographics_data = {}

# The `with` block closes the file even if a row fails to parse, instead of
# relying on a later cell to call close().
with open('../vfinal_1/data/all_visit_demographic_matrix.csv', 'r') as file:
    for line in tqdm(file):
        # Split off only the first field (the visit id) before filtering, so
        # rows that are skipped are never fully tokenised.
        v_id = int(line.split(',', 1)[0])
        if v_id not in all_visit_ids:
            continue
        # Remaining comma-separated fields are the integer feature values.
        demographics_data[v_id] = [int(i) for i in line.strip().split(',')[1:]]

1573113it [00:04, 322195.54it/s]


In [7]:
# Release the handle bound to `file` (closing an already-closed file is a no-op).
file.close()

In [8]:
# visit id -> list of integer diagnosis indicators, restricted to the visits
# in `all_visit_ids`.
diag_data = {}

# The `with` block closes the file even if a row fails to parse, instead of
# relying on a later cell to call close().
with open('../vfinal_1/data/all_visit_diag_matrix.csv', 'r') as file:
    for line in tqdm(file):
        # Split off only the first field (the visit id) before filtering, so
        # rows that are skipped are never fully tokenised.
        v_id = int(line.split(',', 1)[0])
        if v_id not in all_visit_ids:
            continue
        # Remaining comma-separated fields are the integer feature values.
        diag_data[v_id] = [int(i) for i in line.strip().split(',')[1:]]

1573113it [03:56, 6652.60it/s] 


In [9]:
# Release the handle bound to `file` (closing an already-closed file is a no-op).
file.close()

In [10]:
# Sanity check: number of visits kept from each matrix.
len(demographics_data), len(diag_data)

(525287, 525287)

In [11]:
# visit id -> list of distinct phecodes (as floats) recorded for that visit in
# the previous-conditions file, restricted to the visits in `all_visit_ids`.
previous_conditions = defaultdict(list)

# The `with` block closes the file even if a row fails to parse, instead of
# relying on a later cell to call close().
with open('data/previous_conditions_phe.csv', 'r') as file:
    for line in tqdm(file):
        # Skip any line containing the header token.
        if 'phecode' in line:
            continue
        # Tokenise once: field 0 is the phecode, field 1 is the visit id.
        fields = line.split(',')
        v_id = int(fields[1])
        if v_id not in all_visit_ids:
            continue
        phe = float(fields[0])
        known = previous_conditions[v_id]
        # Keep each phecode at most once per visit, preserving first-seen order.
        if phe not in known:
            known.append(phe)
        
    

780740814it [15:28, 840781.56it/s] 


In [12]:
# Release the handle bound to `file` (closing an already-closed file is a no-op).
file.close()

In [14]:
def _cases_within_window(phecode, max_days=365):
    """Collect visits diagnosed with `phecode` at most `max_days` after the ED date.

    Reads the notebook-level `phenotype_visits_1y_all[phecode]`, whose entries
    unpack as (visit id, diagnosis date, ED date), and skips visits that have
    no entry in `visit_probability`.

    Returns a 2-column array of [visit_id, days from ED date to diagnosis],
    one row per qualifying entry (a visit id may therefore repeat).  When
    nothing qualifies the array has shape (0, 2), so column slicing such as
    `[:, 0]` still works.
    """
    rows = []
    for visit_id, diag_date, ed_date in phenotype_visits_1y_all[phecode]:
        if visit_id not in visit_probability:
            continue
        days_to_diag = (diag_date - ed_date).days
        if days_to_diag <= max_days:
            rows.append([visit_id, days_to_diag])
    # Naming the columns keeps the two-column shape even for an empty list.
    return np.array(pd.DataFrame(rows, columns=['visit_id', 'days']))


# Every visit id in the first column of the follow-up table; built once and
# reused for all five phecodes.
followup_visit_ids = set(followup_visits_1y[:,0])

# For each phecode: the case rows, and the follow-up visits that are not cases.
cases_411_2 = _cases_within_window('411.2')
non_cases_411_2 = list(followup_visit_ids - set(cases_411_2[:,0]))

cases_591 = _cases_within_window('591')
non_cases_591 = list(followup_visit_ids - set(cases_591[:,0]))

cases_585_1 = _cases_within_window('585.1')
non_cases_585_1 = list(followup_visit_ids - set(cases_585_1[:,0]))

cases_250_2 = _cases_within_window('250.2')
non_cases_250_2 = list(followup_visit_ids - set(cases_250_2[:,0]))

cases_282_5 = _cases_within_window('282.5')
non_cases_282_5 = list(followup_visit_ids - set(cases_282_5[:,0]))

In [15]:
# Row counts of the five case arrays followed by the sizes of the five non-case lists.
len(cases_411_2), len(cases_591), len(cases_585_1), len(cases_250_2), len(cases_282_5), len(non_cases_411_2), len(non_cases_591), len(non_cases_585_1), len(non_cases_250_2), len(non_cases_282_5)

(11402, 28301, 38400, 47182, 45120, 523983, 522909, 522213, 520660, 522559)

In [16]:
# Count, for every phecode, how many distinct follow-up visits list it among
# their previous conditions (each visit contributes a phecode at most once).
condition_cts = Counter()
for visit_id in set(followup_visits_1y[:,0]):
    condition_cts.update(set(previous_conditions[visit_id]))

In [17]:
# Preview the 30 phecodes with the highest visit counts.
(
    pd.DataFrame(
        [[phe, n_visits] for phe, n_visits in condition_cts.items()],
        columns=['phe', 'cts'],
    )
    .sort_values(by='cts', ascending=False)
    .head(30)
)


Unnamed: 0,phe,cts
4,401.1,226903
0,512.7,178181
3,272.1,156472
223,646.0,154584
84,512.8,149556
72,760.0,138735
212,1009.0,132551
5,530.11,126552
124,339.0,117382
20,508.0,115328


In [18]:
# The 30 phecodes with the highest visit counts, most frequent first.
condition_table = pd.DataFrame(
    [[phe, n_visits] for phe, n_visits in condition_cts.items()],
    columns=['phe', 'cts'],
)
top_30 = list(
    condition_table.sort_values(by='cts', ascending=False).head(30)['phe']
)


In [19]:
# For every follow-up visit, a 0/1 vector aligned with `top_30`: 1 when the
# phecode is among the visit's previous conditions, otherwise 0.
# NOTE: despite the "top_10" in the names, these hold all 30 phecodes of `top_30`.
previous_conditions_top_10 = {}

for visit_id in tqdm(set(followup_visits_1y[:,0])):
    history = previous_conditions[visit_id]
    previous_conditions_top_10[visit_id] = [int(phe in history) for phe in top_30]

# Column labels for the indicator vector above.
previous_top_10_variables = top_30

100%|██████████| 525287/525287 [00:06<00:00, 83302.09it/s]


In [20]:
# Cox proportional-hazards fit for phecode 411.2.
# One row per follow-up visit: demographic indicators (first entry dropped),
# prior-condition indicators, covid_prob, event flag and duration in days.

# Earliest days-to-diagnosis per case visit, gathered in one pass over the
# case rows instead of re-scanning the whole array with a boolean mask for
# every visit.
first_diag_days = {}
for visit_id, days in cases_411_2:
    if visit_id not in first_diag_days or days < first_diag_days[visit_id]:
        first_diag_days[visit_id] = days

coxph_model_data = []

# Cases: event flag 1, duration = earliest diagnosis for the visit.
for visit_id in set(cases_411_2[:,0]):
    time_to_diag = first_diag_days[visit_id]
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append(demographics_data[visit_id][1:] + previous_conditions_top_10[visit_id] + [covid_prob, 1, time_to_diag])

# Non-cases: event flag 0, duration = smallest of 365 and the values stored in
# followup_tm_1y for the visit.
for visit_id in non_cases_411_2:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append(demographics_data[visit_id][1:] + previous_conditions_top_10[visit_id] + [covid_prob, 0, time_to_diag])

coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']))
coxph_model_data.to_csv('data/411_2_multi_30_cox_data.csv')

# Fit, persist the coefficient table, and display the summary.
cph = CoxPHFitter()
cph.fit(coxph_model_data, duration_col='days', event_col='phenotype')
cph.summary.to_csv('data/411_2_multi_30_cox.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,525287
number of events observed,1304
partial log-likelihood,-14443.69
time fit was run,2023-06-13 22:08:21 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,-1.49,0.22,0.52,-2.51,-0.47,0.08,0.62,-2.87,<0.005,7.91
age_adult,0.03,1.03,0.16,-0.28,0.35,0.75,1.41,0.2,0.84,0.25
age_senior,0.76,2.14,0.16,0.44,1.08,1.56,2.93,4.71,<0.005,18.63
sex_1,-0.58,0.56,0.06,-0.71,-0.46,0.49,0.63,-9.17,<0.005,64.24
AMERICAN INDIAN OR ALASKA,1.37,3.92,0.26,0.85,1.88,2.34,6.57,5.2,<0.005,22.27
ASIAN,-1.07,0.34,0.28,-1.63,-0.52,0.2,0.6,-3.8,<0.005,12.73
BLACK OR AFRICAN AMERICAN,-0.26,0.77,0.09,-0.44,-0.08,0.65,0.92,-2.85,<0.005,7.86
NAT.HAWAIIAN/OTH.PACIFIC,0.29,1.33,0.71,-1.1,1.68,0.33,5.35,0.41,0.69,0.55
WHITE,-0.24,0.79,0.07,-0.37,-0.11,0.69,0.9,-3.62,<0.005,11.72
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.24,0.79,0.07,-0.38,-0.1,0.68,0.91,-3.33,<0.005,10.16

0,1
Concordance,0.85
Partial AIC,28969.37
log-likelihood ratio test,2283.17 on 41 df
-log2(p) of ll-ratio test,inf


In [21]:
# Cox proportional-hazards fit for phecode 591.
# One row per follow-up visit: demographic indicators (first entry dropped),
# prior-condition indicators, covid_prob, event flag and duration in days.

# Earliest days-to-diagnosis per case visit, gathered in one pass over the
# case rows instead of re-scanning the whole array with a boolean mask for
# every visit.
first_diag_days = {}
for visit_id, days in cases_591:
    if visit_id not in first_diag_days or days < first_diag_days[visit_id]:
        first_diag_days[visit_id] = days

coxph_model_data = []

# Cases: event flag 1, duration = earliest diagnosis for the visit.
for visit_id in set(cases_591[:,0]):
    time_to_diag = first_diag_days[visit_id]
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append(demographics_data[visit_id][1:] + previous_conditions_top_10[visit_id] + [covid_prob, 1, time_to_diag])

# Non-cases: event flag 0, duration = smallest of 365 and the values stored in
# followup_tm_1y for the visit.
for visit_id in non_cases_591:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append(demographics_data[visit_id][1:] + previous_conditions_top_10[visit_id] + [covid_prob, 0, time_to_diag])

coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']))
coxph_model_data.to_csv('data/591_multi_30_cox_data.csv')

# Fit, persist the coefficient table, and display the summary.
cph = CoxPHFitter()
cph.fit(coxph_model_data, duration_col='days', event_col='phenotype')
cph.summary.to_csv('data/591_multi_30_cox.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,525287
number of events observed,2378
partial log-likelihood,-26722.86
time fit was run,2023-06-13 22:08:34 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,-0.74,0.47,0.17,-1.08,-0.41,0.34,0.66,-4.36,<0.005,16.25
age_adult,-0.49,0.61,0.09,-0.66,-0.32,0.52,0.72,-5.71,<0.005,26.37
age_senior,-0.33,0.72,0.09,-0.51,-0.15,0.6,0.86,-3.63,<0.005,11.77
sex_1,0.28,1.32,0.05,0.19,0.37,1.2,1.44,5.94,<0.005,28.35
AMERICAN INDIAN OR ALASKA,-0.22,0.8,0.38,-0.97,0.52,0.38,1.68,-0.59,0.56,0.84
ASIAN,-0.02,0.98,0.14,-0.28,0.25,0.75,1.28,-0.12,0.90,0.15
BLACK OR AFRICAN AMERICAN,0.04,1.04,0.06,-0.09,0.16,0.91,1.17,0.55,0.58,0.78
NAT.HAWAIIAN/OTH.PACIFIC,-0.3,0.74,0.71,-1.69,1.09,0.18,2.97,-0.42,0.67,0.57
WHITE,0.08,1.08,0.05,-0.02,0.18,0.98,1.19,1.57,0.12,3.09
HISPANIC OR LATINO OR SPANISH ORIGIN,0.02,1.02,0.05,-0.07,0.12,0.93,1.13,0.49,0.62,0.69

0,1
Concordance,0.84
Partial AIC,53527.71
log-likelihood ratio test,4265.30 on 41 df
-log2(p) of ll-ratio test,inf


In [22]:
# Cox proportional-hazards fit for phecode 585.1.
# One row per follow-up visit: demographic indicators (first entry dropped),
# prior-condition indicators, covid_prob, event flag and duration in days.

# Earliest days-to-diagnosis per case visit, gathered in one pass over the
# case rows instead of re-scanning the whole array with a boolean mask for
# every visit.
first_diag_days = {}
for visit_id, days in cases_585_1:
    if visit_id not in first_diag_days or days < first_diag_days[visit_id]:
        first_diag_days[visit_id] = days

coxph_model_data = []

# Cases: event flag 1, duration = earliest diagnosis for the visit.
for visit_id in set(cases_585_1[:,0]):
    time_to_diag = first_diag_days[visit_id]
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append(demographics_data[visit_id][1:] + previous_conditions_top_10[visit_id] + [covid_prob, 1, time_to_diag])

# Non-cases: event flag 0, duration = smallest of 365 and the values stored in
# followup_tm_1y for the visit.
for visit_id in non_cases_585_1:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append(demographics_data[visit_id][1:] + previous_conditions_top_10[visit_id] + [covid_prob, 0, time_to_diag])

coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']))
coxph_model_data.to_csv('data/585_1_multi_30_cox_data.csv')

# Fit, persist the coefficient table, and display the summary.
cph = CoxPHFitter()
cph.fit(coxph_model_data, duration_col='days', event_col='phenotype')
cph.summary.to_csv('data/585_1_multi_30_cox.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,525287
number of events observed,3074
partial log-likelihood,-32768.07
time fit was run,2023-06-13 22:08:47 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,1.07,2.92,0.18,0.71,1.43,2.03,4.19,5.79,<0.005,27.07
age_adult,1.08,2.96,0.14,0.81,1.36,2.24,3.9,7.67,<0.005,45.7
age_senior,1.47,4.36,0.14,1.19,1.75,3.29,5.76,10.29,<0.005,80.04
sex_1,-0.45,0.63,0.04,-0.53,-0.38,0.59,0.69,-11.33,<0.005,96.43
AMERICAN INDIAN OR ALASKA,0.13,1.14,0.33,-0.52,0.79,0.59,2.2,0.4,0.69,0.53
ASIAN,0.47,1.6,0.1,0.28,0.66,1.33,1.93,4.91,<0.005,20.09
BLACK OR AFRICAN AMERICAN,0.23,1.26,0.05,0.13,0.34,1.14,1.4,4.34,<0.005,16.08
NAT.HAWAIIAN/OTH.PACIFIC,-0.53,0.59,0.71,-1.92,0.86,0.15,2.35,-0.75,0.45,1.14
WHITE,0.01,1.01,0.05,-0.08,0.11,0.93,1.11,0.32,0.75,0.42
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.24,0.79,0.05,-0.33,-0.15,0.72,0.86,-5.1,<0.005,21.48

0,1
Concordance,0.93
Partial AIC,65618.14
log-likelihood ratio test,10286.55 on 41 df
-log2(p) of ll-ratio test,inf


In [23]:
# Cox proportional-hazards fit for phecode 250.2.
# One row per follow-up visit: demographic indicators (first entry dropped),
# prior-condition indicators, covid_prob, event flag and duration in days.

# Earliest days-to-diagnosis per case visit, gathered in one pass over the
# case rows instead of re-scanning the whole array with a boolean mask for
# every visit.
first_diag_days = {}
for visit_id, days in cases_250_2:
    if visit_id not in first_diag_days or days < first_diag_days[visit_id]:
        first_diag_days[visit_id] = days

coxph_model_data = []

# Cases: event flag 1, duration = earliest diagnosis for the visit.
for visit_id in set(cases_250_2[:,0]):
    time_to_diag = first_diag_days[visit_id]
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append(demographics_data[visit_id][1:] + previous_conditions_top_10[visit_id] + [covid_prob, 1, time_to_diag])

# Non-cases: event flag 0, duration = smallest of 365 and the values stored in
# followup_tm_1y for the visit.
for visit_id in non_cases_250_2:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append(demographics_data[visit_id][1:] + previous_conditions_top_10[visit_id] + [covid_prob, 0, time_to_diag])

coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']))
coxph_model_data.to_csv('data/250_2_multi_30_cox_data.csv')

# Fit, persist the coefficient table, and display the summary.
cph = CoxPHFitter()
cph.fit(coxph_model_data, duration_col='days', event_col='phenotype')
cph.summary.to_csv('data/250_2_multi_30_cox.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,525287
number of events observed,4627
partial log-likelihood,-48503.29
time fit was run,2023-06-13 22:09:00 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,2.02,7.52,0.46,1.12,2.92,3.06,18.48,4.4,<0.005,16.49
age_adult,2.75,15.61,0.41,1.94,3.55,6.99,34.88,6.7,<0.005,35.48
age_senior,2.94,18.89,0.41,2.13,3.74,8.45,42.25,7.16,<0.005,40.13
sex_1,0.06,1.06,0.03,-0.01,0.12,0.99,1.13,1.79,0.07,3.77
AMERICAN INDIAN OR ALASKA,0.16,1.18,0.24,-0.3,0.63,0.74,1.88,0.69,0.49,1.03
ASIAN,-0.14,0.87,0.09,-0.32,0.04,0.72,1.04,-1.52,0.13,2.97
BLACK OR AFRICAN AMERICAN,-0.33,0.72,0.05,-0.42,-0.23,0.66,0.79,-7.02,<0.005,38.68
NAT.HAWAIIAN/OTH.PACIFIC,0.0,1.0,0.41,-0.8,0.81,0.45,2.24,0.01,0.99,0.01
WHITE,-0.16,0.85,0.04,-0.23,-0.09,0.79,0.91,-4.44,<0.005,16.73
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.37,0.69,0.04,-0.44,-0.3,0.64,0.74,-9.86,<0.005,73.73

0,1
Concordance,0.92
Partial AIC,97088.59
log-likelihood ratio test,14926.88 on 41 df
-log2(p) of ll-ratio test,inf


In [29]:
# Cox proportional-hazards fit for phecode 282.5.
# One row per follow-up visit: demographic indicators (first entry dropped),
# prior-condition indicators, covid_prob, event flag and duration in days.

# Earliest days-to-diagnosis per case visit, gathered in one pass over the
# case rows instead of re-scanning the whole array with a boolean mask for
# every visit.
first_diag_days = {}
for visit_id, days in cases_282_5:
    if visit_id not in first_diag_days or days < first_diag_days[visit_id]:
        first_diag_days[visit_id] = days

coxph_model_data = []

# Cases: event flag 1, duration = earliest diagnosis for the visit.
for visit_id in set(cases_282_5[:,0]):
    time_to_diag = first_diag_days[visit_id]
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append(demographics_data[visit_id][1:] + previous_conditions_top_10[visit_id] + [covid_prob, 1, time_to_diag])

# Non-cases: event flag 0, duration = smallest of 365 and the values stored in
# followup_tm_1y for the visit.
for visit_id in non_cases_282_5:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append(demographics_data[visit_id][1:] + previous_conditions_top_10[visit_id] + [covid_prob, 0, time_to_diag])

coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[1:] + previous_top_10_variables + ['covid_prob','phenotype', 'days']))
coxph_model_data.to_csv('data/282_5_multi_30_cox_data.csv')

# Fit, persist the coefficient table, and display the summary.
cph = CoxPHFitter()
cph.fit(coxph_model_data, duration_col='days', event_col='phenotype')
cph.summary.to_csv('data/282_5_multi_30_cox.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,525287
number of events observed,2728
partial log-likelihood,-28427.56
time fit was run,2023-06-13 22:13:05 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,0.07,1.07,0.07,-0.06,0.2,0.94,1.22,1.04,0.30,1.75
age_adult,-0.95,0.39,0.06,-1.06,-0.83,0.35,0.43,-16.18,<0.005,193.15
age_senior,-2.86,0.06,0.12,-3.08,-2.63,0.05,0.07,-24.75,<0.005,446.82
sex_1,-0.27,0.77,0.04,-0.35,-0.19,0.7,0.83,-6.36,<0.005,32.2
AMERICAN INDIAN OR ALASKA,-14.12,0.0,399.36,-796.84,768.6,0.0,inf,-0.04,0.97,0.04
ASIAN,-0.43,0.65,0.2,-0.82,-0.03,0.44,0.97,-2.11,0.03,4.84
BLACK OR AFRICAN AMERICAN,1.31,3.7,0.05,1.21,1.41,3.36,4.08,26.64,<0.005,517.08
NAT.HAWAIIAN/OTH.PACIFIC,-13.55,0.0,786.37,-1554.81,1527.72,0.0,inf,-0.02,0.99,0.02
WHITE,-0.63,0.53,0.06,-0.75,-0.51,0.47,0.6,-10.28,<0.005,79.95
HISPANIC OR LATINO OR SPANISH ORIGIN,0.2,1.22,0.05,0.11,0.29,1.12,1.34,4.47,<0.005,16.97

0,1
Concordance,0.93
Partial AIC,56937.11
log-likelihood ratio test,10416.89 on 41 df
-log2(p) of ll-ratio test,inf
