In [209]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

In [210]:
file_dir="../filtered_data/"    #change this to your data directory

patients = file_dir + "selected_patients.csv"
encounters = file_dir + "selected_patients_encounter.csv"
conditions = file_dir + "selected_patients_conditions.csv"
procedures = file_dir + "selected_patients_procedures.csv"

In [211]:
first_encounters = pd.read_csv(file_dir + "first_encounter_per_patient.csv")
df_chained_fixed = pd.read_csv(file_dir + "chained_encounters_patient_specific.csv")
df_encounters = pd.read_csv(encounters)
df_conditions = pd.read_csv(conditions)
df_procedures = pd.read_csv(procedures)
df_patients = pd.read_csv(patients)

In [212]:
first_encounters_ids = set(first_encounters['ENCOUNTER_ID'])
procedures_ids = set(df_procedures['ENCOUNTER'])
conditions_ids = set(df_conditions['ENCOUNTER'])
death_certification_ids = set(df_encounters[df_encounters['CODE'] == 308646001]['Id'])

valid_encounters = first_encounters_ids.union(procedures_ids, conditions_ids, death_certification_ids)
filtered_encounters = df_encounters[df_encounters['Id'].isin(valid_encounters)]


In [214]:
#get count of nans in each column
filtered_encounters.isna().sum()

Id                   0
START                0
PATIENT              0
ENCOUNTERCLASS       0
CODE                 0
DESCRIPTION          0
REASONCODE           0
REASONDESCRIPTION    0
dtype: int64

In [215]:
df_procedures.isna().sum()

START                0
STOP                 0
PATIENT              0
ENCOUNTER            0
CODE                 0
DESCRIPTION          0
REASONCODE           0
REASONDESCRIPTION    0
dtype: int64

In [216]:
df_conditions.isna().sum()

START          0
PATIENT        0
ENCOUNTER      0
CODE           0
DESCRIPTION    0
dtype: int64

In [217]:
filtered_encounters['START'] = pd.to_datetime(filtered_encounters['START'])

enc_proc_map = (
    df_procedures.groupby('ENCOUNTER')['CODE']
    .apply(lambda x: frozenset(x))
    .to_dict()
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_encounters['START'] = pd.to_datetime(filtered_encounters['START'])


In [237]:
enc_proc_map

{'021795ad-8bf0-4ac9-ad45-7ef76f910a52': frozenset({398171003,
            703423002,
            16335031000119103}),
 '02944f04-9d27-ad60-f2f4-e9a7ec5ac17f': frozenset({398171003,
            703423002,
            16335031000119103}),
 '02dbf6e3-9fab-2db3-656c-f1da79519036': frozenset({398171003,
            703423002,
            16335031000119103}),
 '061de8b1-71e7-6e40-09d1-a64820e57b84': frozenset({398171003,
            703423002,
            16335031000119103}),
 '0812e504-1bb9-117d-c3b2-0fe5884b5cd0': frozenset({398171003,
            703423002,
            16335031000119103}),
 '09d3cf1b-22e9-f000-027b-d55bcbd7c5aa': frozenset({398171003,
            703423002,
            16335031000119103}),
 '0bcb63d9-3041-b6b1-86f8-875e32837674': frozenset({398171003, 703423002}),
 '0c756dd0-b3da-78fd-59e2-fb1b513823b4': frozenset({398171003,
            703423002,
            16335031000119103}),
 '0e790aa4-f927-3cef-67f2-6deb754f1176': frozenset({398171003, 703423002}),
 '0e9de9b7-1400

In [220]:
filtered_encounters["procedure_set"] = filtered_encounters['Id'].map(enc_proc_map)
filtered_encounters["procedure_set"] = filtered_encounters["procedure_set"].apply(
    lambda x: x if isinstance(x, frozenset) else frozenset()
) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_encounters["procedure_set"] = filtered_encounters['Id'].map(enc_proc_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_encounters["procedure_set"] = filtered_encounters["procedure_set"].apply(


In [221]:
filtered_encounters = filtered_encounters.sort_values(by=["PATIENT", "START"])

In [223]:
def deduplicate_encounters(df):
    to_keep = [True]
    for i in range(1, len(df)):
        prev_set = df.iloc[i - 1]["procedure_set"]
        curr_set = df.iloc[i]["procedure_set"]
        if(curr_set == frozenset() or prev_set == frozenset()):
            to_keep.append(True)
            continue
        same_set = prev_set == curr_set
        same_patient = df.iloc[i]["PATIENT"] == df.iloc[i - 1]["PATIENT"]
        same_prev = same_set and same_patient
        to_keep.append(not same_prev)
    return df[to_keep]


def deduplicate_and_reduce_encounters(df):
    to_keep = [True]
    for i in range(1, len(df)):
        curr = df.iloc[i]
        prev = df.iloc[i - 1]

        curr_set = curr["procedure_set"]
        prev_set = prev["procedure_set"]
        same_patient = curr["PATIENT"] == prev["PATIENT"]

        if curr_set == frozenset() or prev_set == frozenset():
            to_keep.append(True)
            continue

        if same_patient and curr_set.issubset(prev_set):
            to_keep.append(False)
        else:
            to_keep.append(True)
    
    return df[to_keep]


In [224]:
filtered_encounters_main = deduplicate_and_reduce_encounters(filtered_encounters)
filtered_encounters_main = deduplicate_and_reduce_encounters(filtered_encounters_main)
# filtered_encounters_main = deduplicate_encounters(filtered_encounters)

In [226]:

patients_df = df_patients
encounters_df = filtered_encounters_main

# Ensure datetime format
encounters_df['START'] = pd.to_datetime(encounters_df['START'])

# Get the first encounter per patient by earliest 'START' time
first_encounters_df = (
    encounters_df.sort_values(by='START')
                 .groupby('PATIENT', as_index=False)
              .first()
)

first_encounters_df = first_encounters_df[['PATIENT', 'Id']]
first_encounters_df.columns = ['PATIENT', 'ENCOUNTER_ID']



In [228]:
grouped = encounters_df.sort_values(by='START').groupby('PATIENT')

chained_encounters_fixed = []
for patient_id, group in grouped:
    sorted_group = group.sort_values(by='START')
    ids = sorted_group['Id'].tolist()
    for i in range(len(ids) - 1):
        chained_encounters_fixed.append((patient_id, ids[i], ids[i+1]))

df_chained_fixed = pd.DataFrame(chained_encounters_fixed, columns=["PATIENT", "ENCOUNTER_ID_1", "ENCOUNTER_ID_2"])

In [230]:
#get the unique procedure codes and their descriptions from the procedure df and put into csv file
# This will be used to create nodes in the graph
# and to create the procedure nodes in the graph
unique_procedure_nodes = df_procedures[['CODE', 'DESCRIPTION']].drop_duplicates().reset_index(drop=True)

In [231]:
unique_procedure_nodes

Unnamed: 0,CODE,DESCRIPTION
0,399208008,Plain chest X-ray (procedure)
1,418891003,Computed tomography of chest and abdomen
2,91602002,Thoracentesis (procedure)
3,14768001,Peripheral blood smear interpretation
4,415300000,Review of systems (procedure)
5,430193006,Medication Reconciliation (procedure)
6,162676008,Brief general examination (procedure)
7,698354004,Magnetic resonance imaging for measurement of ...
8,398171003,Hearing examination (procedure)
9,703423002,Combined chemotherapy and radiation therapy (p...


In [232]:
unique_condition_nodes = df_conditions[['CODE', 'DESCRIPTION']].drop_duplicates().reset_index(drop=True)

In [233]:
unique_condition_nodes

Unnamed: 0,CODE,DESCRIPTION
0,162573006,Suspected lung cancer (situation)
1,254632001,Small cell carcinoma of lung (disorder)
2,271737000,Anemia (disorder)
3,67811000119102,Primary small cell malignant neoplasm of lung ...
4,254637007,Non-small cell lung cancer (disorder)
5,424132000,Non-small cell carcinoma of lung TNM stage 1 ...


In [235]:
#load everything for graph to csv
graph_data = "../filtered_data/graph_data"
os.makedirs(graph_data, exist_ok=True)

first_encounters_df.to_csv(os.path.join(graph_data, "first_encounters.csv"), index=False)
df_chained_fixed.to_csv(os.path.join(graph_data, "chained_encounters_patient_specific.csv"), index=False)
unique_procedure_nodes.to_csv(os.path.join(graph_data, "unique_procedure_nodes.csv"), index=False)
unique_condition_nodes.to_csv(os.path.join(graph_data, "unique_condition_nodes.csv"), index=False)
filtered_encounters_main.to_csv(os.path.join(graph_data, "filtered_encounters.csv"), index=False)
df_patients.to_csv(os.path.join(graph_data, "patients.csv"), index=False)
df_conditions.to_csv(os.path.join(graph_data, "selected_patients_conditions.csv"), index=False)
df_procedures.to_csv(os.path.join(graph_data, "selected_patients_procedures.csv"), index=False)