In [1]:
import pymysql 
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
from collections import defaultdict, Counter
import _pickle as pickle
import random
from scipy.stats import epps_singleton_2samp, wasserstein_distance, ks_2samp
import matplotlib.pyplot as plt
import seaborn as sns
from random import sample
from lifelines import KaplanMeierFitter, CoxPHFitter

In [2]:
visit_probability = {}
for visit_id, prob in np.array(pd.read_csv('data/all_data_predictions_calibrated.csv', header=None)):
    visit_probability[int(visit_id)] = prob
    
len(visit_probability)

1573113

In [4]:
phenotype_visits_1y = pickle.load(open('data/phenotype_visits_1y_new.p', 'rb'))
followup_visits_1y = pickle.load(open('data/followup_visits_1y_new.p', 'rb'))
followup_tm_1y = pickle.load(open('data/followup_tm_1y_new.p', 'rb'))

In [5]:
demographics_variables = pickle.load(open('../vfinal_1/data/demographic_variables.p', 'rb'))
diag_variables = pickle.load(open('../vfinal_1/data/diag_variables.p', 'rb'))

In [6]:
all_visit_ids = set(followup_tm_1y.keys())

len(all_visit_ids)

525287

In [7]:
demographics_data = {}

file = open('../vfinal_1/data/all_visit_demographic_matrix.csv', 'r')

for line in tqdm(file):
    v_id = int(line.split(',')[0])
    if v_id not in all_visit_ids:
        continue
    dt = [int(i) for i in line.strip().split(',')[1:]]
    demographics_data[v_id]=dt

1573113it [00:02, 548137.32it/s]


In [8]:
file.close()

In [9]:
diag_data = {}

file = open('../vfinal_1/data/all_visit_diag_matrix.csv', 'r')

for line in tqdm(file):
    v_id = int(line.split(',')[0])
    if v_id not in all_visit_ids:
        continue
    dt = [int(i) for i in line.strip().split(',')[1:]]
    diag_data[v_id]=dt

1573113it [03:32, 7404.65it/s] 


In [10]:
file.close()

In [11]:
len(demographics_data), len(diag_data)

(525287, 525287)

In [12]:
previous_conditions = defaultdict(list)

file = open('data/previous_conditions_phe.csv', 'r')

for line in tqdm(file):
    if 'phecode' in line:
        continue
    v_id = int(line.split(',')[1])
    if v_id not in all_visit_ids:
        continue
    phe = float(line.split(',')[0])
    if phe not in previous_conditions[v_id]:
        previous_conditions[v_id].append(phe)
        
    

780740814it [22:37, 575119.35it/s]


In [13]:
file.close()

In [14]:
previous_conditions_phe_visit = defaultdict(list)

for visit_id in tqdm(previous_conditions):
    for phe in previous_conditions[visit_id]:
        previous_conditions_phe_visit[phe].append(visit_id)

100%|██████████| 444611/444611 [00:05<00:00, 80587.12it/s] 


In [15]:
cases_411_2 = []
for visit_id, diag_date, ed_date in phenotype_visits_1y['411.2']:
    if visit_id not in visit_probability:
        continue
    if (diag_date-ed_date).days <= 365:
        cases_411_2.append([visit_id, (diag_date-ed_date).days])
            
cases_411_2 = np.array(pd.DataFrame(cases_411_2))
non_cases_411_2 = list((set(followup_visits_1y[:,0])-set(cases_411_2[:,0]))-set(previous_conditions_phe_visit[411.2]))


cases_591 = []
for visit_id, diag_date, ed_date in phenotype_visits_1y['591']:
    if visit_id not in visit_probability:
        continue
    if (diag_date-ed_date).days <= 365:
        cases_591.append([visit_id, (diag_date-ed_date).days])
            
cases_591 = np.array(pd.DataFrame(cases_591))
non_cases_591 = list((set(followup_visits_1y[:,0])-set(cases_591[:,0]))-set(previous_conditions_phe_visit[591]))

cases_585_1 = []
for visit_id, diag_date, ed_date in phenotype_visits_1y['585.1']:
    if visit_id not in visit_probability:
        continue
    if (diag_date-ed_date).days <= 365:
        cases_585_1.append([visit_id, (diag_date-ed_date).days])
            
cases_585_1 = np.array(pd.DataFrame(cases_585_1))
non_cases_585_1 = list((set(followup_visits_1y[:,0])-set(cases_585_1[:,0]))-set(previous_conditions_phe_visit[585.1]))


cases_250_2 = []
for visit_id, diag_date, ed_date in phenotype_visits_1y['250.2']:
    if visit_id not in visit_probability:
        continue
    if (diag_date-ed_date).days <= 365:
        cases_250_2.append([visit_id, (diag_date-ed_date).days])
            
cases_250_2 = np.array(pd.DataFrame(cases_250_2))
non_cases_250_2 = list((set(followup_visits_1y[:,0])-set(cases_250_2[:,0]))-set(previous_conditions_phe_visit[250.2]))

cases_282_5 = []
for visit_id, diag_date, ed_date in phenotype_visits_1y['282.5']:
    if visit_id not in visit_probability:
        continue
    if (diag_date-ed_date).days <= 365:
        cases_282_5.append([visit_id, (diag_date-ed_date).days])
            
cases_282_5 = np.array(pd.DataFrame(cases_282_5))
non_cases_282_5 = list((set(followup_visits_1y[:,0])-set(cases_282_5[:,0]))-set(previous_conditions_phe_visit[250.2]))

In [16]:
len(cases_411_2), len(cases_591), len(cases_585_1), len(cases_250_2), len(cases_282_5), len(non_cases_411_2), len(non_cases_591), len(non_cases_585_1), len(non_cases_250_2), len(non_cases_282_5)

(6513, 12163, 17603, 9674, 4163, 488569, 419962, 450359, 420373, 421048)

In [17]:
condition_cts = Counter([phe for visit_id in set(followup_visits_1y[:,0]) for phe in set(previous_conditions[visit_id])])

In [18]:
pd.DataFrame([[phe, condition_cts[phe]] for phe in condition_cts], columns=['phe', 'cts']).sort_values(by='cts', ascending=False)[:30]


Unnamed: 0,phe,cts
4,401.1,226903
0,512.7,178181
3,272.1,156472
223,646.0,154584
84,512.8,149556
72,760.0,138735
212,1009.0,132551
5,530.11,126552
124,339.0,117382
20,508.0,115328


In [19]:
top_30 = list(pd.DataFrame([[phe, condition_cts[phe]] for phe in condition_cts], columns=['phe', 'cts']).sort_values(by='cts', ascending=False)[:30]['phe'])


In [20]:
previous_conditions_top_30 = {}

for visit_id in tqdm(set(followup_visits_1y[:,0])):
    d = []
    for phe in top_30:
        if phe in previous_conditions[visit_id]:
            d.append(1)
        else:
            d.append(0)
    previous_conditions_top_30[visit_id] = d
    

100%|██████████| 525287/525287 [00:10<00:00, 51643.81it/s]


In [24]:
coxph_model_data = []
for visit_id in set(cases_411_2[:,0]):
    time_to_diag = min(cases_411_2[:,1][cases_411_2[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_411_2:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/411_2_multi_30_cox_new.csv')
cph.print_summary()

  problem_columns = (censors_only | deaths_only).difference(total).tolist()


ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [25]:
coxph_model_data = []
for visit_id in set(cases_411_2[:,0]):
    time_to_diag = min(cases_411_2[:,1][cases_411_2[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][1:] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_411_2:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][1:] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[1:] + top_30 + ['covid_prob','phenotype', 'days']))
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/411_2_multi_30_cox_new.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,489181
number of events observed,612
partial log-likelihood,-6698.85
time fit was run,2023-06-16 18:56:21 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,-1.46,0.23,0.74,-2.9,-0.01,0.05,0.99,-1.97,0.05,4.36
age_adult,0.39,1.48,0.23,-0.06,0.84,0.94,2.32,1.7,0.09,3.48
age_senior,1.37,3.92,0.23,0.92,1.82,2.5,6.15,5.96,<0.005,28.57
sex_1,-0.41,0.66,0.09,-0.58,-0.24,0.56,0.78,-4.78,<0.005,19.13
AMERICAN INDIAN OR ALASKA,-0.71,0.49,1.0,-2.67,1.26,0.07,3.51,-0.71,0.48,1.06
ASIAN,-1.18,0.31,0.39,-1.93,-0.42,0.15,0.66,-3.05,<0.005,8.77
BLACK OR AFRICAN AMERICAN,-0.37,0.69,0.13,-0.62,-0.11,0.54,0.89,-2.83,<0.005,7.75
NAT.HAWAIIAN/OTH.PACIFIC,0.24,1.27,1.0,-1.73,2.2,0.18,9.05,0.24,0.81,0.3
WHITE,-0.45,0.63,0.1,-0.64,-0.27,0.53,0.77,-4.74,<0.005,18.84
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.34,0.71,0.1,-0.54,-0.13,0.58,0.88,-3.22,<0.005,9.59

0,1
Concordance,0.87
Partial AIC,13479.70
log-likelihood ratio test,1233.31 on 41 df
-log2(p) of ll-ratio test,767.77


In [27]:
coxph_model_data = []
for visit_id in set(cases_591[:,0]):
    time_to_diag = min(cases_591[:,1][cases_591[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_591:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(591.0, axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/591_multi_30_cox_new.csv')
cph.print_summary()

  inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
  inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
  problem_columns = (censors_only | deaths_only).difference(total).tolist()

>>> events = df['phenotype'].astype(bool)
>>> print(df.loc[events, 'NAT.HAWAIIAN/OTH.PACIFIC '].var())
>>> print(df.loc[~events, 'NAT.HAWAIIAN/OTH.PACIFIC '].var())

A very low variance means that the column NAT.HAWAIIAN/OTH.PACIFIC  completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [28]:
coxph_model_data = []
for visit_id in set(cases_591[:,0]):
    time_to_diag = min(cases_591[:,1][cases_591[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][:8] + demographics_data[visit_id][9:] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_591:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][:8] + demographics_data[visit_id][9:] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[:8] + demographics_variables[9:] + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(591.0, axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/591_multi_30_cox_new.csv')
cph.print_summary()

  inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
  problem_columns = (censors_only | deaths_only).difference(total).tolist()


ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [30]:
coxph_model_data = []
for visit_id in set(cases_591[:,0]):
    time_to_diag = min(cases_591[:,1][cases_591[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][1:8] + demographics_data[visit_id][9:] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_591:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][1:8] + demographics_data[visit_id][9:] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[1:8]+ demographics_variables[9:] + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(591.0, axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/591_multi_30_cox_new.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,420988
number of events observed,1026
partial log-likelihood,-11641.85
time fit was run,2023-06-16 19:01:11 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,-1.02,0.36,0.25,-1.51,-0.54,0.22,0.58,-4.15,<0.005,14.86
age_adult,-0.73,0.48,0.11,-0.94,-0.51,0.39,0.6,-6.54,<0.005,33.9
age_senior,-0.3,0.74,0.12,-0.53,-0.07,0.59,0.93,-2.6,0.01,6.73
sex_1,0.43,1.54,0.07,0.3,0.56,1.35,1.75,6.37,<0.005,32.28
AMERICAN INDIAN OR ALASKA,0.16,1.18,0.5,-0.82,1.15,0.44,3.15,0.32,0.75,0.42
ASIAN,0.07,1.08,0.19,-0.3,0.45,0.74,1.57,0.39,0.70,0.52
BLACK OR AFRICAN AMERICAN,0.06,1.07,0.09,-0.12,0.25,0.89,1.28,0.68,0.50,1.0
WHITE,-0.07,0.94,0.07,-0.21,0.08,0.81,1.09,-0.87,0.38,1.38
HISPANIC OR LATINO OR SPANISH ORIGIN,0.16,1.18,0.08,0.01,0.31,1.01,1.36,2.14,0.03,4.95
401.1,-0.21,0.81,0.09,-0.39,-0.03,0.68,0.97,-2.34,0.02,5.71

0,1
Concordance,0.82
Partial AIC,23361.70
log-likelihood ratio test,1339.85 on 39 df
-log2(p) of ll-ratio test,847.40


In [32]:
coxph_model_data = []
for visit_id in set(cases_585_1[:,0]):
    time_to_diag = min(cases_585_1[:,1][cases_585_1[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_585_1:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/585_1_multi_30_cox_new.csv')
cph.print_summary()

  problem_columns = (censors_only | deaths_only).difference(total).tolist()


ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [33]:
coxph_model_data = []
for visit_id in set(cases_585_1[:,0]):
    time_to_diag = min(cases_585_1[:,1][cases_585_1[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][1:] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_585_1:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][1:] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[1:] + top_30 + ['covid_prob','phenotype', 'days']))
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/585_1_multi_30_cox_new.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,451912
number of events observed,1553
partial log-likelihood,-15930.88
time fit was run,2023-06-16 19:03:05 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,1.32,3.75,0.24,0.85,1.79,2.35,6.0,5.53,<0.005,24.86
age_adult,1.1,3.01,0.19,0.73,1.47,2.08,4.36,5.82,<0.005,27.31
age_senior,1.72,5.59,0.19,1.35,2.09,3.85,8.1,9.06,<0.005,62.8
sex_1,-0.53,0.59,0.06,-0.64,-0.42,0.53,0.66,-9.46,<0.005,68.21
AMERICAN INDIAN OR ALASKA,-0.23,0.8,0.58,-1.36,0.91,0.26,2.47,-0.4,0.69,0.53
ASIAN,0.04,1.04,0.17,-0.29,0.36,0.75,1.44,0.23,0.81,0.3
BLACK OR AFRICAN AMERICAN,0.27,1.31,0.08,0.12,0.42,1.13,1.52,3.52,<0.005,11.19
NAT.HAWAIIAN/OTH.PACIFIC,-0.74,0.48,1.0,-2.71,1.22,0.07,3.39,-0.74,0.46,1.12
WHITE,-0.05,0.96,0.06,-0.17,0.08,0.84,1.08,-0.71,0.48,1.06
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.17,0.84,0.07,-0.3,-0.04,0.74,0.96,-2.62,0.01,6.83

0,1
Concordance,0.94
Partial AIC,31943.76
log-likelihood ratio test,5990.79 on 41 df
-log2(p) of ll-ratio test,inf


In [34]:
coxph_model_data = []
for visit_id in set(cases_250_2[:,0]):
    time_to_diag = min(cases_250_2[:,1][cases_250_2[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_250_2:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/250_2_multi_30_cox_new.csv')
cph.print_summary()

  inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
  problem_columns = (censors_only | deaths_only).difference(total).tolist()

>>> events = df['phenotype'].astype(bool)
>>> print(df.loc[events, 'NAT.HAWAIIAN/OTH.PACIFIC '].var())
>>> print(df.loc[~events, 'NAT.HAWAIIAN/OTH.PACIFIC '].var())

A very low variance means that the column NAT.HAWAIIAN/OTH.PACIFIC  completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [36]:
coxph_model_data = []
for visit_id in set(cases_250_2[:,0]):
    time_to_diag = min(cases_250_2[:,1][cases_250_2[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][:8] + demographics_data[visit_id][9:] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_250_2:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][:8] + demographics_data[visit_id][9:] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[:8] +  demographics_variables[9:] + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/250_2_multi_30_cox_new.csv')
cph.print_summary()

  inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
  inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
  inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
  inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
  inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
  inv_h_dot_g_T = spsolve(-h, g, assume_a="pos", check_finite=False)
  problem_columns = (censors_only | deaths_only).difference(total).tolist()


ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [38]:
coxph_model_data = []
for visit_id in set(cases_250_2[:,0]):
    time_to_diag = min(cases_250_2[:,1][cases_250_2[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][1:8] + demographics_data[visit_id][9:] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_250_2:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id][1:8] + demographics_data[visit_id][9:] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables[1:8] +  demographics_variables[9:] + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/250_2_multi_30_cox_new.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,421267
number of events observed,894
partial log-likelihood,-9413.64
time fit was run,2023-06-16 19:06:07 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,1.74,5.71,0.71,0.35,3.13,1.43,22.86,2.46,0.01,6.18
age_adult,3.19,24.27,0.58,2.05,4.33,7.77,75.84,5.49,<0.005,24.53
age_senior,3.94,51.52,0.58,2.8,5.08,16.48,161.02,6.78,<0.005,36.27
sex_1,-0.13,0.88,0.07,-0.27,0.0,0.76,1.0,-1.89,0.06,4.1
AMERICAN INDIAN OR ALASKA,-0.94,0.39,1.0,-2.91,1.02,0.05,2.77,-0.94,0.35,1.53
ASIAN,-0.17,0.84,0.2,-0.55,0.21,0.57,1.24,-0.87,0.38,1.38
BLACK OR AFRICAN AMERICAN,-0.09,0.92,0.1,-0.29,0.11,0.75,1.11,-0.88,0.38,1.41
WHITE,-0.54,0.59,0.08,-0.7,-0.38,0.5,0.69,-6.56,<0.005,34.14
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.13,0.88,0.09,-0.3,0.04,0.74,1.04,-1.54,0.12,3.02
401.1,-0.48,0.62,0.1,-0.69,-0.28,0.5,0.75,-4.67,<0.005,18.35

0,1
Concordance,0.92
Partial AIC,18905.29
log-likelihood ratio test,2776.97 on 39 df
-log2(p) of ll-ratio test,inf


In [40]:
coxph_model_data = []
for visit_id in set(cases_282_5[:,0]):
    time_to_diag = min(cases_282_5[:,1][cases_282_5[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_282_5:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/282_5_multi_30_cox_new.csv')
cph.print_summary()


  problem_columns = (censors_only | deaths_only).difference(total).tolist()

>>> events = df['phenotype'].astype(bool)
>>> print(df.loc[events, '250.2'].var())
>>> print(df.loc[~events, '250.2'].var())

A very low variance means that the column 250.2 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [41]:
coxph_model_data = []
for visit_id in set(cases_282_5[:,0]):
    time_to_diag = min(cases_282_5[:,1][cases_282_5[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_282_5:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/282_5_multi_30_cox_new.csv')
cph.print_summary()

  problem_columns = (censors_only | deaths_only).difference(total).tolist()

>>> events = df['phenotype'].astype(bool)
>>> print(df.loc[events, '507.0'].var())
>>> print(df.loc[~events, '507.0'].var())

A very low variance means that the column 507.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [42]:
coxph_model_data = []
for visit_id in set(cases_282_5[:,0]):
    time_to_diag = min(cases_282_5[:,1][cases_282_5[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_282_5:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
coxph_model_data = coxph_model_data.drop(507.0, axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/282_5_multi_30_cox_new.csv')
cph.print_summary()

  problem_columns = (censors_only | deaths_only).difference(total).tolist()

>>> events = df['phenotype'].astype(bool)
>>> print(df.loc[events, '508.0'].var())
>>> print(df.loc[~events, '508.0'].var())

A very low variance means that the column 508.0 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [45]:
coxph_model_data = []
for visit_id in set(cases_282_5[:,0]):
    time_to_diag = min(cases_282_5[:,1][cases_282_5[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_282_5:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
coxph_model_data = coxph_model_data.drop(507.0, axis=1)
coxph_model_data = coxph_model_data.drop(508.0, axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/282_5_multi_30_cox_new.csv')
cph.print_summary()

  problem_columns = (censors_only | deaths_only).difference(total).tolist()

>>> events = df['phenotype'].astype(bool)
>>> print(df.loc[events, '716.9'].var())
>>> print(df.loc[~events, '716.9'].var())

A very low variance means that the column 716.9 completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [46]:
coxph_model_data = []
for visit_id in set(cases_282_5[:,0]):
    time_to_diag = min(cases_282_5[:,1][cases_282_5[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_282_5:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
coxph_model_data = coxph_model_data.drop(507.0, axis=1)
coxph_model_data = coxph_model_data.drop(508.0, axis=1)
coxph_model_data = coxph_model_data.drop(716.9, axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/282_5_multi_30_cox_new.csv')
cph.print_summary()

  problem_columns = (censors_only | deaths_only).difference(total).tolist()

>>> events = df['phenotype'].astype(bool)
>>> print(df.loc[events, 'AMERICAN INDIAN OR ALASKA'].var())
>>> print(df.loc[~events, 'AMERICAN INDIAN OR ALASKA'].var())

A very low variance means that the column AMERICAN INDIAN OR ALASKA completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [48]:
coxph_model_data = []
for visit_id in set(cases_282_5[:,0]):
    time_to_diag = min(cases_282_5[:,1][cases_282_5[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_282_5:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
coxph_model_data = coxph_model_data.drop(507.0, axis=1)
coxph_model_data = coxph_model_data.drop(508.0, axis=1)
coxph_model_data = coxph_model_data.drop(716.9, axis=1)
coxph_model_data = coxph_model_data.drop('AMERICAN INDIAN OR ALASKA', axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/282_5_multi_30_cox_new.csv')
cph.print_summary()

  problem_columns = (censors_only | deaths_only).difference(total).tolist()

>>> events = df['phenotype'].astype(bool)
>>> print(df.loc[events, 'ASIAN'].var())
>>> print(df.loc[~events, 'ASIAN'].var())

A very low variance means that the column ASIAN completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [51]:
coxph_model_data = []
for visit_id in set(cases_282_5[:,0]):
    time_to_diag = min(cases_282_5[:,1][cases_282_5[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_282_5:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
coxph_model_data = coxph_model_data.drop(507.0, axis=1)
coxph_model_data = coxph_model_data.drop(508.0, axis=1)
coxph_model_data = coxph_model_data.drop(716.9, axis=1)
coxph_model_data = coxph_model_data.drop('AMERICAN INDIAN OR ALASKA', axis=1)
coxph_model_data = coxph_model_data.drop('ASIAN', axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/282_5_multi_30_cox_new.csv')
cph.print_summary()

  problem_columns = (censors_only | deaths_only).difference(total).tolist()

>>> events = df['phenotype'].astype(bool)
>>> print(df.loc[events, 'NAT.HAWAIIAN/OTH.PACIFIC '].var())
>>> print(df.loc[~events, 'NAT.HAWAIIAN/OTH.PACIFIC '].var())

A very low variance means that the column NAT.HAWAIIAN/OTH.PACIFIC  completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.



ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [52]:
coxph_model_data = []
for visit_id in set(cases_282_5[:,0]):
    time_to_diag = min(cases_282_5[:,1][cases_282_5[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_282_5:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
coxph_model_data = coxph_model_data.drop(507.0, axis=1)
coxph_model_data = coxph_model_data.drop(508.0, axis=1)
coxph_model_data = coxph_model_data.drop(716.9, axis=1)
coxph_model_data = coxph_model_data.drop('AMERICAN INDIAN OR ALASKA', axis=1)
coxph_model_data = coxph_model_data.drop('ASIAN', axis=1)
coxph_model_data = coxph_model_data.drop('NAT.HAWAIIAN/OTH.PACIFIC ', axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/282_5_multi_30_cox_new.csv')
cph.print_summary()

  problem_columns = (censors_only | deaths_only).difference(total).tolist()


ConvergenceError: Convergence halted due to matrix inversion problems. Suspicion is high collinearity. Please see the following tips in the lifelines documentation: https://lifelines.readthedocs.io/en/latest/Examples.html#problems-with-convergence-in-the-cox-proportional-hazard-modelMatrix is singular.

In [53]:
coxph_model_data = []
for visit_id in set(cases_282_5[:,0]):
    time_to_diag = min(cases_282_5[:,1][cases_282_5[:,0]==visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] +  previous_conditions_top_30[visit_id] + [covid_prob, 1, time_to_diag]))
    
for visit_id in non_cases_282_5:
    time_to_diag = min([365] + followup_tm_1y[visit_id])
    covid_prob = visit_probability[visit_id]
    coxph_model_data.append((demographics_data[visit_id] + previous_conditions_top_30[visit_id] + [covid_prob, 0, time_to_diag]))
    
coxph_model_data = pd.DataFrame(coxph_model_data, columns=(demographics_variables + top_30 + ['covid_prob','phenotype', 'days']))
coxph_model_data = coxph_model_data.drop(250.2, axis=1)
coxph_model_data = coxph_model_data.drop(507.0, axis=1)
coxph_model_data = coxph_model_data.drop(508.0, axis=1)
coxph_model_data = coxph_model_data.drop(716.9, axis=1)
coxph_model_data = coxph_model_data.drop('AMERICAN INDIAN OR ALASKA', axis=1)
coxph_model_data = coxph_model_data.drop('ASIAN', axis=1)
coxph_model_data = coxph_model_data.drop('NAT.HAWAIIAN/OTH.PACIFIC ', axis=1)
coxph_model_data = coxph_model_data.drop('age_child', axis=1)
cph = CoxPHFitter()
cph.fit(coxph_model_data, 'days', 'phenotype')
cph.summary.to_csv('data/282_5_multi_30_cox_new.csv')
cph.print_summary()

0,1
model,lifelines.CoxPHFitter
duration col,'days'
event col,'phenotype'
baseline estimation,breslow
number of observations,421272
number of events observed,224
partial log-likelihood,-2166.45
time fit was run,2023-06-16 19:15:25 UTC

Unnamed: 0,coef,exp(coef),se(coef),coef lower 95%,coef upper 95%,exp(coef) lower 95%,exp(coef) upper 95%,z,p,-log2(p)
age_adol,-2.37,0.09,0.51,-3.37,-1.37,0.03,0.25,-4.65,<0.005,18.22
age_adult,-2.03,0.13,0.15,-2.33,-1.73,0.1,0.18,-13.36,<0.005,132.78
age_senior,-4.32,0.01,0.51,-5.33,-3.32,0.0,0.04,-8.43,<0.005,54.73
sex_1,-0.2,0.82,0.14,-0.48,0.07,0.62,1.07,-1.46,0.15,2.79
BLACK OR AFRICAN AMERICAN,2.63,13.89,0.17,2.29,2.97,9.9,19.48,15.24,<0.005,171.87
WHITE,-1.41,0.25,0.34,-2.08,-0.73,0.12,0.48,-4.08,<0.005,14.43
HISPANIC OR LATINO OR SPANISH ORIGIN,-0.23,0.79,0.17,-0.57,0.1,0.57,1.11,-1.35,0.18,2.51
401.1,-3.0,0.05,1.03,-5.01,-0.98,0.01,0.37,-2.92,<0.005,8.15
512.7,-0.74,0.48,0.53,-1.77,0.3,0.17,1.35,-1.39,0.16,2.62
272.1,-0.01,0.99,0.76,-1.5,1.48,0.22,4.41,-0.01,0.99,0.01

0,1
Concordance,0.96
Partial AIC,4400.91
log-likelihood ratio test,1171.25 on 34 df
-log2(p) of ll-ratio test,741.99
