In [None]:
! pip install -U kaleido



In [1]:
import pandas as pd
# Load the marker protein table. The TSV has no header row, so columns are
# addressed by integer position (e.g. 1 = protein name, 3 = condition, as
# used in later cells).
markerdb = pd.read_csv(
    'marker_proteins.tsv',
    sep='\t',
    header=None,
)
markerdb.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1,Glycated hemoglobin,115,Diabetes Mellitus Type 2,Adult: >=18 yrs old,Both,Serum,>6.5 %,%,The specific age range is for 19-79 yrs old,"Lee, J. M., Wu, E. L., Tarini, B., Herman, W. ...",21195416.0
1,1,Glycated hemoglobin,115,Diabetes Mellitus Type 2,Adolescent:13-18 yrs old,Both,Serum,>6.5 %,%,The specific age range is for 12-18 yrs old,"Lee, J. M., Wu, E. L., Tarini, B., Herman, W. ...",21195416.0


In [2]:

import pandas as pd
import ast

# Input file: one record per line in the form
#   <patient>, <imputation>, <marker_count>, <python list literal of markers>
file_path = "Imputed_vs_Non_Imputed_Protein_Markers.csv"

# The last field is itself a comma-separated list literal, so the line is
# split on at most the first three ', ' separators and the remainder is
# parsed with ast.literal_eval.
parsed_rows = []
with open(file_path, 'r') as handle:
    for raw_line in handle:
        fields = raw_line.strip().split(', ', 3)
        if len(fields) != 4:
            # Lines that do not have exactly four fields are skipped.
            continue
        patient_id, imputation, count_text, markers_text = fields
        parsed_rows.append(
            (patient_id, imputation, count_text, ast.literal_eval(markers_text))
        )

# One row per (patient, imputation) record; marker_count is kept as text.
patient_markers = pd.DataFrame(
    parsed_rows,
    columns=['Patient', 'Imputation', 'marker_count', 'markers'],
)

# Preview the parsed table
patient_markers.head(2)

Unnamed: 0,Patient,Imputation,marker_count,markers
0,HS2_RE2,wo_Imputation,12,"[P02649, P02647, P00450, P01024, P01008, P0279..."
1,HS2_RE2,w_Imputation,13,"[P02792, P61626, P02675, P02766, P01024, P0264..."


In [30]:
# For every unique patient, take the absolute difference between the imputed
# and non-imputed marker_count and accumulate it into a running total.
# The average is taken over the number of patients actually present in
# patient_markers rather than a hard-coded cohort size.
patients_compared = patient_markers['Patient'].unique()

total_difference = 0
for patient in patients_compared:
    patient_rows = patient_markers[patient_markers['Patient'] == patient]
    imputed = patient_rows.loc[patient_rows['Imputation'] == 'w_Imputation', 'marker_count'].values[0]
    non_imputed = patient_rows.loc[patient_rows['Imputation'] == 'wo_Imputation', 'marker_count'].values[0]
    # marker_count is stored as text, so cast before subtracting
    total_difference += abs(int(imputed) - int(non_imputed))

# Guard against an empty table so the cell does not fail with ZeroDivisionError
average_difference = total_difference / len(patients_compared) if len(patients_compared) else 0.0
print(f"Additional average biomarkers gained with imputation is {average_difference}")
print(total_difference)

Additional average biomarkers gained with imputation is 3.0
30


In [29]:
# Pool the marker IDs of every patient into two flat lists, one per
# imputation setting, then count the distinct markers in each.
imputed_biomarkers = []
non_imputed_biomarkers = []
for patient in patient_markers['Patient'].unique():
    is_patient = patient_markers['Patient'] == patient
    with_imputation = patient_markers.loc[
        is_patient & (patient_markers['Imputation'] == 'w_Imputation'), 'markers'
    ]
    without_imputation = patient_markers.loc[
        is_patient & (patient_markers['Imputation'] == 'wo_Imputation'), 'markers'
    ]
    # Only the first matching record per patient/setting is used; extending
    # (instead of appending) keeps the pooled lists flat.
    imputed_biomarkers.extend(with_imputation.values[0])
    non_imputed_biomarkers.extend(without_imputation.values[0])

# (distinct markers with imputation, distinct markers without imputation)
len(set(imputed_biomarkers)), len(set(non_imputed_biomarkers))

(20, 18)

In [3]:
# Map the original sample identifiers to consecutive display labels HS1..HS10.
patient_map_dict = {'HS2_RE2': "HS1",
 'HS3RE_sub1' : "HS2",
 'HS4': "HS3",
 'HS5' : "HS4",
 'HS6': "HS5",
 'HS7' : "HS6",
 'HS8': "HS7",
 'HS9' : "HS8",
 'HS10': "HS9",
 'HS11': "HS10"}

# The new labels overlap the original IDs (e.g. 'HS4' -> 'HS3'), so mapping the
# 'Patient' column onto itself gives different (partly NaN) results when this
# cell is re-run. Keep the original IDs in 'Patient_raw' and always map from
# that column, which makes the cell idempotent.
if 'Patient_raw' not in patient_markers.columns:
    patient_markers['Patient_raw'] = patient_markers['Patient']

mapped_patients = patient_markers['Patient_raw'].map(patient_map_dict)

# Series.map yields NaN for IDs missing from the dict; fail with a clear
# message instead of letting NaN patients break later cells.
unmapped_ids = patient_markers.loc[mapped_patients.isna(), 'Patient_raw'].unique().tolist()
if unmapped_ids:
    raise ValueError(f"No entry in patient_map_dict for patient IDs: {unmapped_ids}")

patient_markers['Patient'] = mapped_patients

In [4]:
import json

# Load the JSON mapping from file, then invert it so that the file's values
# become the lookup keys (later cells index uniprot_mapping by UniProt ID).
file_path = "MarkerDB_Proteins_with_Uniprot_Mapping.json"
with open(file_path) as mapping_file:
    loaded_mapping = json.load(mapping_file)

# Invert key/value pairs; if several keys share a value, the last one wins.
uniprot_mapping = {value: key for key, value in loaded_mapping.items()}

# Spot-check one entry of the inverted mapping
uniprot_mapping['P68871']

'Hemoglobin'

In [5]:
# Conditions (column 3) listed for every markerdb row whose protein name (column 1) is C-reactive protein
markerdb.loc[markerdb[1] == "C-reactive protein", 3].tolist()

['Inflammatory Bowel Disease',
 'Non-specific Inflammation',
 'Normal',
 'Normal',
 'Pregnancy',
 'Normal',
 'Viral Infection',
 'Bacterial Infections',
 'Diabetes Mellitus Type 2',
 'Hypertension',
 'Cardiovascular Disease',
 'Obstructive sleep apnea']

In [6]:
# Link colour for each imputation type
color_map = {
    'w_Imputation': 'orange',
    'wo_Imputation': 'lightblue'
}


def _protein_disease_records(patient, imputation):
  """Return one {'Protein', 'Disease', 'Imputation'} record per protein/disease pair.

  Uses the first patient_markers row matching `patient` and `imputation`.
  Raises IndexError if no such row exists and KeyError if a marker ID is
  missing from uniprot_mapping.
  """
  marker_ids = patient_markers.loc[
      (patient_markers['Patient'] == patient) & (patient_markers['Imputation'] == imputation),
      'markers'
  ].tolist()[0]

  # uniprot_mapping was inverted when it was loaded, so it is keyed by
  # UniProt ID and returns the protein name. dict.fromkeys removes repeated
  # names while keeping first-seen order.
  protein_names = dict.fromkeys(uniprot_mapping[uniprot_id] for uniprot_id in marker_ids)

  # markerdb column 1 holds the protein name and column 3 the associated condition
  return [
      {'Protein': protein, 'Disease': disease, 'Imputation': imputation}
      for protein in protein_names
      for disease in markerdb.loc[markerdb[1] == protein, 3].tolist()
  ]


markers_dict = {}

# Each patient has one row per imputation setting, so iterate over the unique
# patients instead of every row (which would rebuild each frame twice).
for patient in patient_markers['Patient'].unique():
  # Build all records first and create the DataFrame once, rather than
  # growing it with pd.concat one row at a time.
  records = (
      _protein_disease_records(patient, 'w_Imputation')
      + _protein_disease_records(patient, 'wo_Imputation')
  )
  df = pd.DataFrame(records, columns=['Protein', 'Disease', 'Imputation'])

  # wo_Imputation records come last, so keep='last' labels a protein/disease
  # pair found in both settings as 'wo_Imputation'; pairs that remain
  # 'w_Imputation' are those seen only with imputation.
  df = df.drop_duplicates(subset=['Protein', 'Disease'], keep='last')

  # Map colors to the DataFrame
  df['Color'] = df['Imputation'].map(color_map)

  #add to markers dictionary
  markers_dict[patient] = df

In [51]:
# Stack the per-patient frames from markers_dict into a single frame
combined_df = pd.concat(markers_dict.values(), ignore_index=True)

# Keep only the links labelled 'w_Imputation'
combined_df = combined_df[combined_df['Imputation'] == 'w_Imputation']

# Collect [protein, disease] pairs, one per remaining row
protein_disease = [
    [protein, disease]
    for protein, disease in zip(combined_df['Protein'], combined_df['Disease'])
]

len(protein_disease)

116

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pandas as pd
import kaleido

# Number of patients
num_patients = len(markers_dict)

# Grid layout: fixed number of columns, and as many rows as are needed to fit
# every patient (ceiling division; at least one row so make_subplots is valid).
num_cols = 2
num_rows = max(1, -(-num_patients // num_cols))

# Create subplots, one Sankey-typed cell per grid position. The specs grid is
# built with comprehensions so every cell gets its own dict and every row its
# own list (list multiplication would alias the same objects).
fig = make_subplots(rows=num_rows, cols=num_cols,
                    subplot_titles=list(markers_dict.keys()),
                    specs=[[{"type": "sankey"} for _ in range(num_cols)]
                           for _ in range(num_rows)])

# Loop through each patient and their corresponding DataFrame
for i, (patient, df) in enumerate(markers_dict.items()):
    # Fill the grid left-to-right, top-to-bottom (plotly rows/cols are 1-based)
    row = i // num_cols + 1
    col = i % num_cols + 1

    # Create a list of unique nodes (proteins first, then diseases)
    all_nodes = list(pd.concat([df['Protein'], df['Disease']]).unique())
    # Create a mapping from node name to index
    node_mapping = {node: idx for idx, node in enumerate(all_nodes)}

    # Translate link endpoints to node indices without adding columns to df,
    # so the frames stored in markers_dict are left unmodified.
    link_sources = df['Protein'].map(node_mapping).tolist()
    link_targets = df['Disease'].map(node_mapping).tolist()

    # Create the Sankey diagram
    sankey = go.Sankey(
        node=dict(
            pad=150,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_nodes,
        ),
        link=dict(
            source=link_sources,
            target=link_targets,
            value=[1] * len(df),  # The value is 1 for all of them
            color=df['Color'].tolist()
        )
    )

    # Add the Sankey diagram to the appropriate subplot
    fig.add_trace(sankey, row=row, col=col)

# Update layout
fig.update_layout(
    height=300 * num_rows,  # Adjust height based on number of rows
    font_size=6
)

# Display the figure
fig.show()


In [None]:
# Export the Sankey figure to a PDF file (kaleido is installed in the first
# cell of this notebook for this export).
sankey_pdf_path = "patient_marker_sankey_diagrams.pdf"
fig.write_image(sankey_pdf_path, format="pdf")