In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import os
import sys
import multiprocessing as mp

sys.path.insert(0, os.path.abspath(".."))
from dotenv import load_dotenv
from psycopg2.extras import execute_batch
import seaborn as sns
import pandas as pd
import numpy as np

from db import PostgresDB
from schemas import Proband
from cohorts import Cohort
from matplotlib import pyplot as plt
import sql_queries as sq

In [3]:
from helper import psycop_to_asyncpg_string

In [6]:
psycop_to_asyncpg_string(sq.sepsis_cohort)

'SELECT sep.subject_id, sep.stay_id, sta.hadm_id FROM mimiciv_derived.sepsis3 sep, mimiciv_icu.icustays sta, mimiciv_derived.age a, mimiciv_hosp.patients p WHERE sep.stay_id = sta.stay_id AND sta.hadm_id = a.hadm_id AND p.subject_id = sta.subject_id AND a.age >= $0 AND a.age <= $1 AND p.gender = $2'

In [3]:
load_dotenv()


True

In [28]:
db = PostgresDB(
    db_name=os.getenv("DB_NAME"),
    host=os.getenv("DB_HOST"),
    port=os.getenv("DB_PORT"),
    user=os.getenv("DB_USER"),
)


Connected to database


In [5]:
cohort = Cohort.from_query(
    db=db,
    query="""
        SELECT sep.subject_id, sta.hadm_id
        FROM mimiciv_derived.sepsis3 sep, mimiciv_icu.icustays sta
        WHERE sep.stay_id = sta.stay_id LIMIT 100; 
    """,
    name="sepsis3",
)


In [6]:
len(cohort.participants)


100

In [7]:
cohort.initialize_data(with_tfidf_diagnoses=True)


In [10]:
similarity_scores = cohort.compare_encounters(
    scale_by_distribution=True, normalize_categories=True
)


Finished encounter 26184834
Finished encounter 23581541
Finished encounter 20345487
Finished encounter 23822395
Finished encounter 28994087
Finished encounter 22725460
Finished encounter 20321825
Finished encounter 23473524
Finished encounter 28662225
Finished encounter 21329021
Finished encounter 24982426
Finished encounter 28094813
Finished encounter 26048429
Finished encounter 20214994
Finished encounter 23559586
Finished encounter 23559586
Finished encounter 24181354
Finished encounter 22869003
Finished encounter 23920883
Finished encounter 25777141
Finished encounter 26488315
Finished encounter 28128182
Finished encounter 29988601
Finished encounter 23251352
Finished encounter 29242151
Finished encounter 22081550
Finished encounter 27411876
Finished encounter 24817563
Finished encounter 20626031
Finished encounter 28661809
Finished encounter 22942076
Finished encounter 23295760
Finished encounter 22987108
Finished encounter 20338077
Finished encounter 28324362
Finished encounter 2

In [15]:
def insert_demographics_sim(cur, input_batch):
    query = """
        INSERT INTO demographics_similarity (hadm_id_a, hadm_id_b, raw_similarity_value, cohort_name)
        VALUES (%s, %s, %s, %s);
    """
    execute_batch(cur, query, input_batch)


def insert_icd_sim(cur, input_batch):
    query = """
        INSERT INTO icd_diagnoses_similarity (hadm_id_a, hadm_id_b, raw_similarity_value, cohort_name)
        VALUES (%s, %s, %s, %s);
    """
    execute_batch(cur, query, input_batch)


def insert_lab_sim(cur, input_batch):
    query = """
        INSERT INTO labevents_similarity (hadm_id_a, hadm_id_b, raw_similarity_value, cohort_name)
        VALUES (%s, %s, %s, %s);
    """
    execute_batch(cur, query, input_batch)


def insert_vitalsign_sim(cur, input_batch):
    query = """
        INSERT INTO vitalsigns_similarity (hadm_id_a, hadm_id_b, raw_similarity_value, cohort_name)
        VALUES (%s, %s, %s, %s);
    """
    execute_batch(cur, query, input_batch)


def insert_inputevents_sim(cur, input_batch):
    query = """
        INSERT INTO inputevents_similarity (hadm_id_a, hadm_id_b, raw_similarity_value, cohort_name)
        VALUES (%s, %s, %s, %s);
    """
    execute_batch(cur, query, input_batch)


In [31]:
cur = db.conn.cursor()

seen_encounters = set()
demographics_sim_batch = []
diagnoses_sim_batch = []
labevents_sim_batch = []
vitalsigns_sim_batch = []
inputevents_sim_batch = []

for item in similarity_scores:
    encounter_a = item["encounter_a"]
    encounter_b = item["encounter_b"]

    if encounter_a == encounter_b:
        continue
    elif tuple(sorted([encounter_a, encounter_b])) in seen_encounters:
        continue
    else:
        seen_encounters.add(tuple(sorted([encounter_a, encounter_b])))
    demographics_sim = item["similarity"]["demographics_sim"]
    demographics_sim_batch.append(
        (encounter_a, encounter_b, demographics_sim, cohort.name)
    )
    diagnoses_sim = item["similarity"]["diagnoses_sim"]
    diagnoses_sim_batch.append((encounter_a, encounter_b, diagnoses_sim, cohort.name))
    labevents_sim = item["similarity"]["labevents_sim"]
    labevents_sim_batch.append((encounter_a, encounter_b, labevents_sim, cohort.name))
    vitalsigns_sim = item["similarity"]["vitalsigns_sim"]
    vitalsigns_sim_batch.append((encounter_a, encounter_b, vitalsigns_sim, cohort.name))
    inputevents_sim = item["similarity"]["inputevents_sim"]
    inputevents_sim_batch.append(
        (encounter_a, encounter_b, inputevents_sim, cohort.name)
    )
insert_demographics_sim(cur, demographics_sim_batch)
insert_icd_sim(cur, diagnoses_sim_batch)
insert_lab_sim(cur, labevents_sim_batch)
insert_vitalsign_sim(cur, vitalsigns_sim_batch)
insert_inputevents_sim(cur, inputevents_sim_batch)
db.conn.commit()
cur.close()


UniqueViolation: duplicate key value violates unique constraint "demographics_similarity_pkey"
DETAIL:  Key (hadm_id_a, hadm_id_b)=(26184834, 23581541) already exists.


In [26]:
db.close()
