In [1]:
import pymysql 
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
from collections import defaultdict, Counter
import _pickle as pickle
import random
from scipy.stats import epps_singleton_2samp, wasserstein_distance, ks_2samp, mannwhitneyu
import matplotlib.pyplot as plt
import seaborn as sns
from random import sample
from statsmodels.stats.multitest import fdrcorrection

In [2]:
# start tunnel first: ssh -f [uni]@mimir.dbmi.columbia.edu -L 3307:127.0.0.1:3306 -N

conn = pymysql.connect(host="127.0.0.1", 
                       user="", #uni
                       port = ,
                       passwd="", #sql password
                       db = "" ) #database
cur = conn.cursor()

In [3]:
visit_probability = {}
for visit_id, prob in np.array(pd.read_csv('data/rfr_model_depth_69_trees_190_preds.csv', header=None)):
    visit_probability[int(visit_id)] = prob
    
len(visit_probability)

1573113

In [4]:
phenotype_visits = []

In [5]:
cur.execute('''select phecode, a.visit_id, a.contact_date, b.ed_dt
                from user_vr2430.vfinal_1_predict_covid_conditions a
                join (select visit_id, pat_mrn_id, ed_dt from user_vr2430.vfinal_1_predict_covid_visits) b using (pat_mrn_id)
                join clinical_merge_v5_2022q1.phecode_icd10 using (icd10)
                where a.contact_date >= b.ed_dt and a.visit_id != b.visit_id;''')

for phe, visit, ct, ed_dt in cur.fetchall():
    if visit in visit_probability:
        phenotype_visits.append([phe,visit, ct, ed_dt])

phenotype_visits = np.array(pd.DataFrame(phenotype_visits))

len(phenotype_visits), len(set(phenotype_visits[:,0]))

(7825382, 1048)

In [6]:
cur.execute('''select distinct a.visit_id, a.contact_date, b.ed_dt
                from user_vr2430.vfinal_1_predict_covid_conditions a
                join (select visit_id, pat_mrn_id, ed_dt from user_vr2430.vfinal_1_predict_covid_visits) b using (pat_mrn_id)
                where a.contact_date >= b.ed_dt and a.visit_id != b.visit_id;''')

followup_visits = []
for visit, ct, ed_dt in cur.fetchall():
    if visit in visit_probability:
        followup_visits.append([visit, ct, ed_dt])
        
len(followup_visits)

5002965

In [7]:
followup_visits = np.array(pd.DataFrame(followup_visits))

In [8]:
len(set(phenotype_visits[:,1])), len(set(followup_visits[:,0]))

(277994, 529295)

In [9]:
previous_conditions = defaultdict(list)

file = open('data/previous_conditions_phe.csv', 'r')

for line in tqdm(file):
    if 'phecode' in line:
        continue
    phe = float(line.split(',')[0])
    visit_id = int(line.split(',')[1])
    if phe not in previous_conditions[visit_id]:
        previous_conditions[visit_id].append(phe)
        
    

780740814it [18:18, 710893.32it/s]


In [10]:
cur.execute('''select a.visit_id, phecode
                from clinical_merge_v5_2022q1.phecode_icd10
                inner join user_vr2430.vfinal_1_predict_covid_conditions b using (icd10)
                inner join user_vr2430.vfinal_1_predict_covid_visits a using (pat_mrn_id)
                where contact_date < st_dt;''')

for visit_id, phe in cur.fetchall():
    phe = float(line.split(',')[0])
    visit_id = int(line.split(',')[1])
    if phe not in previous_conditions[visit_id]:
        previous_conditions[visit_id].append(phe)

In [11]:
previous_conditions_phe_visit = defaultdict(list)

for visit_id in tqdm(previous_conditions):
    for phe in previous_conditions[visit_id]:
        previous_conditions_phe_visit[phe].append(visit_id)

100%|██████████| 1305172/1305172 [00:07<00:00, 173939.44it/s]


In [12]:
phenotype_visits_7 = defaultdict(list)
phenotype_visits_14 = defaultdict(list)
phenotype_visits_21 = defaultdict(list)
phenotype_visits_28 = defaultdict(list)
phenotype_visits_3m = defaultdict(list)
phenotype_visits_6m = defaultdict(list)
phenotype_visits_9m = defaultdict(list)
phenotype_visits_1y = defaultdict(list)

for phe, visit_id, diag_date, ed_date in tqdm(phenotype_visits):
        if visit_id not in visit_probability:
            continue
        if phe in previous_conditions[visit_id]:
            continue
        if (diag_date-ed_date).days <= 7:
            phenotype_visits_7[phe].append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 14:
            phenotype_visits_14[phe].append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 21:
            phenotype_visits_21[phe].append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 28:
            phenotype_visits_28[phe].append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 91:
            phenotype_visits_3m[phe].append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 183:
            phenotype_visits_6m[phe].append([visit_id, diag_date, ed_date])  
        if (diag_date-ed_date).days <= 274:
            phenotype_visits_9m[phe].append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 365:
            phenotype_visits_1y[phe].append([visit_id, diag_date, ed_date])

            
len(phenotype_visits_7), len(phenotype_visits_14), len(phenotype_visits_21), len(phenotype_visits_28), len(phenotype_visits_3m) ,len(phenotype_visits_6m), len(phenotype_visits_9m) ,len(phenotype_visits_1y)

100%|██████████| 7825382/7825382 [00:57<00:00, 136689.14it/s]


(984, 1002, 1013, 1018, 1037, 1043, 1048, 1048)

In [13]:
followup_visits_7 = []
followup_visits_14 = []
followup_visits_21 = []
followup_visits_28 = []
followup_visits_3m = []
followup_visits_6m = []
followup_visits_9m = []
followup_visits_1y = []

for visit_id, diag_date, ed_date in tqdm(followup_visits):
        if visit_id not in visit_probability:
            continue
        if (diag_date-ed_date).days <= 7:
            followup_visits_7.append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 14:
            followup_visits_14.append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 21:
            followup_visits_21.append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 28:
            followup_visits_28.append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 91:
            followup_visits_3m.append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 183:
            followup_visits_6m.append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <= 274:
            followup_visits_9m.append([visit_id, diag_date, ed_date])
        if (diag_date-ed_date).days <=365:
            followup_visits_1y.append([visit_id, diag_date, ed_date])

len(followup_visits_7), len(followup_visits_14), len(followup_visits_21), len(followup_visits_28), len(followup_visits_3m) ,len(followup_visits_6m), len(followup_visits_9m) ,len(followup_visits_1y)

100%|██████████| 5002965/5002965 [00:25<00:00, 196435.42it/s]


(266074, 502360, 717314, 922617, 2239595, 3471263, 4184347, 4589280)

In [14]:
results_7 = {}

for phe in tqdm(phenotype_visits_7):
    if phe == '':
        continue
    case_visits = []
    for visit_id, diag_date, ed_date in phenotype_visits_7[phe]:
        if visit_id not in visit_probability:
            continue
        if (diag_date-ed_date).days <= 7:
            case_visits.append(visit_id)
    case_visits = list(set(case_visits))
    non_case_visits = list(set(np.array(pd.DataFrame(followup_visits_7))[:,0])-set(case_visits)-set(previous_conditions_phe_visit[float(phe)]))
    phenotype_7 = [visit_probability[i] for i in case_visits]
    non_phenotype_7 = [visit_probability[i] for i in non_case_visits]
    results_7[float(phe)] = list(mannwhitneyu(phenotype_7, non_phenotype_7))
    

100%|██████████| 984/984 [07:52<00:00,  2.08it/s]


In [15]:
results_14 = {}

for phe in tqdm(phenotype_visits_14):
    if phe == '':
        continue
    case_visits = []
    for visit_id, diag_date, ed_date in phenotype_visits_14[phe]:
        if visit_id not in visit_probability:
            continue
        if (diag_date-ed_date).days <= 14:
            case_visits.append(visit_id)
    case_visits = list(set(case_visits))
    non_case_visits = list(set(np.array(pd.DataFrame(followup_visits_14))[:,0])-set(case_visits)-set(previous_conditions_phe_visit[float(phe)]))
    phenotype_14 = [visit_probability[i] for i in case_visits]
    non_phenotype_14 = [visit_probability[i] for i in non_case_visits]
    results_14[float(phe)] = list(mannwhitneyu(phenotype_14, non_phenotype_14))
    

100%|██████████| 1002/1002 [13:04<00:00,  1.28it/s]


In [16]:
results_21 = {}

for phe in tqdm(phenotype_visits_21):
    if phe == '':
        continue
    case_visits = []
    for visit_id, diag_date, ed_date in phenotype_visits_21[phe]:
        if visit_id not in visit_probability:
            continue
        if (diag_date-ed_date).days <= 21:
            case_visits.append(visit_id)
    case_visits = list(set(case_visits))
    non_case_visits = list(set(np.array(pd.DataFrame(followup_visits_21))[:,0])-set(case_visits)-set(previous_conditions_phe_visit[float(phe)]))
    phenotype_21 = [visit_probability[i] for i in case_visits]
    non_phenotype_21 = [visit_probability[i] for i in non_case_visits]
    results_21[float(phe)] = list(mannwhitneyu(phenotype_21, non_phenotype_21))
    

100%|██████████| 1013/1013 [17:15<00:00,  1.02s/it]


In [17]:
results_28 = {}

for phe in tqdm(phenotype_visits_28):
    if phe == '':
        continue
    case_visits = []
    for visit_id, diag_date, ed_date in phenotype_visits_28[phe]:
        if visit_id not in visit_probability:
            continue
        if (diag_date-ed_date).days <= 28:
            case_visits.append(visit_id)
    case_visits = list(set(case_visits))
    non_case_visits = list(set(np.array(pd.DataFrame(followup_visits_28))[:,0])-set(case_visits)-set(previous_conditions_phe_visit[float(phe)]))
    phenotype_28 = [visit_probability[i] for i in case_visits]
    non_phenotype_28 = [visit_probability[i] for i in non_case_visits]
    results_28[float(phe)] = list(mannwhitneyu(phenotype_28, non_phenotype_28))
    

100%|██████████| 1018/1018 [21:12<00:00,  1.25s/it]


In [18]:
results_91 = {}

for phe in tqdm(phenotype_visits_3m):
    if phe == '':
        continue
    case_visits = []
    for visit_id, diag_date, ed_date in phenotype_visits_3m[phe]:
        if visit_id not in visit_probability:
            continue
        if (diag_date-ed_date).days <= 91:
            case_visits.append(visit_id)
    case_visits = list(set(case_visits))
    non_case_visits = list(set(np.array(pd.DataFrame(followup_visits_3m))[:,0])-set(case_visits)-set(previous_conditions_phe_visit[float(phe)]))
    phenotype_91 = [visit_probability[i] for i in case_visits]
    non_phenotype_91 = [visit_probability[i] for i in non_case_visits]
    results_91[float(phe)] = list(mannwhitneyu(phenotype_91, non_phenotype_91))
    

100%|██████████| 1037/1037 [40:53<00:00,  2.37s/it]


In [19]:
results_183 = {}

for phe in tqdm(phenotype_visits_6m):
    if phe == '':
        continue
    case_visits = []
    for visit_id, diag_date, ed_date in phenotype_visits_6m[phe]:
        if visit_id not in visit_probability:
            continue
        if (diag_date-ed_date).days <= 183:
            case_visits.append(visit_id)
    case_visits = list(set(case_visits))
    non_case_visits = list(set(np.array(pd.DataFrame(followup_visits_6m))[:,0])-set(case_visits)-set(previous_conditions_phe_visit[float(phe)]))
    phenotype_183 = [visit_probability[i] for i in case_visits]
    non_phenotype_183 = [visit_probability[i] for i in non_case_visits]
    results_183[float(phe)] = list(mannwhitneyu(phenotype_183, non_phenotype_183))
    

100%|██████████| 1043/1043 [1:00:51<00:00,  3.50s/it]


In [20]:
results_274 = {}

for phe in tqdm(phenotype_visits_9m):
    if phe == '':
        continue
    case_visits = []
    for visit_id, diag_date, ed_date in phenotype_visits_9m[phe]:
        if visit_id not in visit_probability:
            continue
        if (diag_date-ed_date).days <= 274:
            case_visits.append(visit_id)
    case_visits = list(set(case_visits))
    non_case_visits = list(set(np.array(pd.DataFrame(followup_visits_9m))[:,0])-set(case_visits)-set(previous_conditions_phe_visit[float(phe)]))
    phenotype_274 = [visit_probability[i] for i in case_visits]
    non_phenotype_274 = [visit_probability[i] for i in non_case_visits]
    results_274[float(phe)] = list(mannwhitneyu(phenotype_274, non_phenotype_274))
    

100%|██████████| 1048/1048 [1:11:42<00:00,  4.11s/it]


In [21]:
results_365 = {}

for phe in tqdm(phenotype_visits_1y):
    if phe == '':
        continue
    case_visits = []
    for visit_id, diag_date, ed_date in phenotype_visits_1y[phe]:
        if visit_id not in visit_probability:
            continue
        if (diag_date-ed_date).days <= 365:
            case_visits.append(visit_id)
    case_visits = list(set(case_visits))
    non_case_visits = list(set(np.array(pd.DataFrame(followup_visits_1y))[:,0])-set(case_visits)-set(previous_conditions_phe_visit[float(phe)]))
    phenotype_365 = [visit_probability[i] for i in case_visits]
    non_phenotype_365 = [visit_probability[i] for i in non_case_visits]
    results_365[float(phe)] = list(mannwhitneyu(phenotype_365, non_phenotype_365))
    

100%|██████████| 1048/1048 [1:16:21<00:00,  4.37s/it]


In [22]:
pickle.dump(results_7, open('data/phenotype_mann_whitney_new_7.p', 'wb'))
pickle.dump(results_14, open('data/phenotype_mann_whitney_new_14.p', 'wb'))
pickle.dump(results_21, open('data/phenotype_mann_whitney_new_21.p', 'wb'))
pickle.dump(results_28, open('data/phenotype_mann_whitney_new_28.p', 'wb'))
pickle.dump(results_91, open('data/phenotype_mann_whitney_new_3m.p', 'wb'))
pickle.dump(results_183, open('data/phenotype_mann_whitney_new_6m.p', 'wb'))
pickle.dump(results_274, open('data/phenotype_mann_whitney_new_9m.p', 'wb'))
pickle.dump(results_365, open('data/phenotype_mann_whitney_new_1y.p', 'wb'))