<a href="https://colab.research.google.com/github/yutaro-tanaka-yt2705/cbioportal_codebook/blob/main/widgets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pprint
!pip install bravado -q
from bravado.client import SwaggerClient

# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing else in this cell reads from Drive; the mount looks
# like it is only needed by later cells that load CSVs from MyDrive.
from google.colab import drive
drive.mount('/content/drive')

def launch_cbioportal():
  """Create a bravado client for the public cBioPortal REST API.

  Request, response and Swagger-spec validation are all switched off in the
  client configuration.

  Returns:
    The `SwaggerClient` built from the cBioPortal api-docs URL.
  """
  api_docs_url = 'https://www.cbioportal.org/api/api-docs'
  validation_off = {
    "validate_requests": False,
    "validate_responses": False,
    "validate_swagger_spec": False,
  }
  return SwaggerClient.from_url(api_docs_url, config=validation_off)

def select_studies(cbioportal):
  """Fetch the study catalogue and return the study id to analyse.

  The catalogue is loaded into a DataFrame restricted to the sample count,
  id and name columns, but no interactive selection is implemented yet:
  the function always returns the hard-coded id 'msk_impact_2017'.

  Args:
    cbioportal: client exposing `Studies.getAllStudiesUsingGET()`.

  Returns:
    The string 'msk_impact_2017'.
  """
  studies = cbioportal.Studies.getAllStudiesUsingGET().result()
  # dir() on the first model object lists the field names shared by all studies.
  field_names = dir(studies[0])
  study_df = pd.DataFrame(
    {str(field): [study[field] for study in studies] for field in field_names}
  )
  # Kept for the (not yet implemented) selection step; also validates that the
  # expected fields are present.
  study_df = study_df[['allSampleCount', 'studyId', 'name']]
  return 'msk_impact_2017'

def select_samples(cbioportal, studyId=None):
  """Fetch the sample lists of a study and return the sample list id to use.

  The sample lists are loaded into a DataFrame, but no interactive selection
  is implemented yet: the function always returns the hard-coded id
  'msk_impact_2017_all', whatever study was queried.

  Args:
    cbioportal: client exposing
      `Sample_Lists.getAllSampleListsInStudyUsingGET(studyId=...)`.
    studyId: id of the study to query. When omitted, the notebook-level
      `studyId` variable is used, which is what the original one-argument
      call relied on implicitly.

  Returns:
    The string 'msk_impact_2017_all'.

  Raises:
    NameError: if `studyId` is neither passed nor defined at notebook level.
  """
  if studyId is None:
    # Backward compatibility with `select_samples(cbioportal)`: fall back to
    # the global that the original body read directly.
    try:
      studyId = globals()['studyId']
    except KeyError:
      raise NameError(
        "name 'studyId' is not defined; pass studyId=... explicitly") from None

  samples = cbioportal.Sample_Lists.getAllSampleListsInStudyUsingGET(studyId=studyId).result()
  # dir() on the first model object lists the field names shared by all entries.
  sample_columns = dir(samples[0])
  sample_df = pd.DataFrame(
    {str(column): [x[column] for x in samples] for column in sample_columns}
  )
  #select sample_ids
  return 'msk_impact_2017_all'

def select_molprofile(cbioportal, studyId=None):
  """Fetch the molecular profiles of a study and return the profile id to use.

  The profiles are loaded into a DataFrame restricted to a few descriptive
  columns, but no interactive selection is implemented yet: the function
  always returns the hard-coded id 'msk_impact_2017_mutations', whatever
  study was queried.

  Args:
    cbioportal: client exposing
      `Molecular_Profiles.getAllMolecularProfilesInStudyUsingGET(studyId=...)`.
    studyId: id of the study to query. When omitted, the notebook-level
      `studyId` variable is used, which is what the original one-argument
      call relied on implicitly.

  Returns:
    The string 'msk_impact_2017_mutations'.

  Raises:
    NameError: if `studyId` is neither passed nor defined at notebook level.
  """
  if studyId is None:
    # Backward compatibility with `select_molprofile(cbioportal)`: fall back
    # to the global that the original body read directly.
    try:
      studyId = globals()['studyId']
    except KeyError:
      raise NameError(
        "name 'studyId' is not defined; pass studyId=... explicitly") from None

  mprofiles = cbioportal.Molecular_Profiles.getAllMolecularProfilesInStudyUsingGET(studyId=studyId).result()
  # dir() on the first model object lists the field names shared by all entries.
  mprofile_columns = dir(mprofiles[0])
  mprofile_df = pd.DataFrame(
    {str(column): [x[column] for x in mprofiles] for column in mprofile_columns}
  )
  mprofile_df = mprofile_df[['datatype', 'description', 'molecularAlterationType', 'molecularProfileId', 'name']]
  return 'msk_impact_2017_mutations'

def get_raw_mutation_table(cbioportal, mprofile, sample_list):
  """Download mutations and pivot them into a patient x gene 0/1 table.

  Fields are read directly from each mutation model instead of parsing its
  string representation, so values containing ', ' or a field-name substring
  can no longer shift the columns. The binary table is built with one
  vectorised cross-tabulation rather than a Python loop over every
  (patient, gene) pair.

  Args:
    cbioportal: client exposing
      `Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(...)`.
    mprofile: molecular profile id passed as `molecularProfileId`.
    sample_list: sample list id passed as `sampleListId`.

  Returns:
    DataFrame whose first column is 'pat_id' followed by one column per gene.
    A cell is 1 when the patient has at least one mutation in that gene and 0
    otherwise. Patients and genes keep their order of first appearance in the
    API response. With no mutations, an empty frame with only 'pat_id' is
    returned.
  """
  mutations = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
    molecularProfileId=mprofile,
    sampleListId=sample_list,
    projection='DETAILED'
  ).result()

  # The DETAILED projection nests the gene record, hence `mut.gene.hugoGeneSymbol`.
  records = [
    (mut.endPosition, mut.gene.hugoGeneSymbol, mut.patientId, mut.proteinChange)
    for mut in mutations
  ]
  mut_list_entry_df = pd.DataFrame(records, columns = ['endpos', 'gene', 'pat_id', 'prot_id'])

  # Order of first appearance, used below to lay out rows and columns.
  genes = mut_list_entry_df.gene.unique().tolist()
  patients = mut_list_entry_df.pat_id.unique().tolist()

  if not patients:
    return pd.DataFrame(columns = ['pat_id'] + genes)

  mutation_table_df = (
    pd.crosstab(mut_list_entry_df.pat_id, mut_list_entry_df.gene)
      .clip(upper=1)  # several mutations in one gene still count as "mutated"
      .reindex(index=patients, columns=genes, fill_value=0)
      .rename_axis(index='pat_id', columns=None)
      .reset_index()
  )

  return mutation_table_df

def obtain_clinical_data(cbioportal, studyId):
  """Return the cancer type recorded for each patient of a study.

  Args:
    cbioportal: client exposing
      `Clinical_Data.getAllClinicalDataInStudyUsingGET(studyId=...)`.
    studyId: id of the study whose clinical data is requested.

  Returns:
    DataFrame with columns 'pat_id' and 'cancer_type', holding the rows whose
    clinical attribute id is 'CANCER_TYPE'. The row index is the position of
    each record in the API response.
  """
  records = cbioportal.Clinical_Data.getAllClinicalDataInStudyUsingGET(studyId=studyId).result()
  # dir() on the first model object lists the field names shared by all records.
  field_names = dir(records[0])
  all_clinical = pd.DataFrame(
    {str(field): [record[field] for record in records] for field in field_names}
  )
  is_cancer_type = all_clinical.clinicalAttributeId == 'CANCER_TYPE'
  cancer_types = all_clinical.loc[is_cancer_type, ['patientId', 'value']]
  cancer_types.columns = ['pat_id', 'cancer_type']
  return cancer_types

def merge_tables(mutation_table, clinical_table):
  """Attach each patient's mutation columns to the clinical table.

  This is a left join on 'pat_id' with the clinical table on the left, so
  every clinical row is kept and patients absent from `mutation_table` get
  missing values in the mutation columns.

  Args:
    mutation_table: DataFrame with a 'pat_id' column plus mutation columns.
    clinical_table: DataFrame with a 'pat_id' column plus clinical columns.

  Returns:
    The merged DataFrame, clinical columns first.
  """
  return pd.merge(clinical_table, mutation_table, on='pat_id', how='left')

def gene_cancer_lists(mutation_table):
  """Extract the gene names and per-cancer-type patient counts.

  Args:
    mutation_table: DataFrame with a 'cancer_type' column. Every column from
      the third onwards is taken to be a gene, so the first two columns are
      skipped by position.

  Returns:
    A `(genes, cancers)` tuple. `genes` is the list of unique column labels
    after the first two. `cancers` maps each cancer type to its row count.
  """
  column_labels = mutation_table.columns.unique().tolist()
  cancer_counts = mutation_table['cancer_type'].value_counts()
  return column_labels[2:], cancer_counts.to_dict()

#full script
# End-to-end pipeline: connect to cBioPortal, pick study / sample list /
# molecular profile, download mutations and clinical data, then merge them.
cbioportal = launch_cbioportal()
studyId = select_studies(cbioportal)
# select_samples and select_molprofile are called without a study id, so the
# module-level `studyId` assigned just above must exist before these two run.
sample_list = select_samples(cbioportal)
mprofile = select_molprofile(cbioportal)
mutation_table_df = get_raw_mutation_table(cbioportal, mprofile, sample_list)
clinical_df = obtain_clinical_data(cbioportal, studyId)

# Left join onto the clinical table, then derive the gene list and the
# cancer-type counts that the widget cells further down read as globals.
mutation_table = merge_tables(mutation_table_df, clinical_df)
genes, cancers = gene_cancer_lists(mutation_table)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


100%|██████████| 78142/78142 [00:06<00:00, 12639.90it/s]
100%|██████████| 9593/9593 [01:28<00:00, 108.63it/s]


In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import math
from tqdm import tqdm, trange

from statsmodels.stats.multitest import multipletests
from google.colab import drive

# Mount Google Drive so the pre-computed mutation table can be read from it.
drive.mount('/content/drive')

# NOTE(review): the path is relative ("drive/..."), so it only resolves when
# the working directory is the parent of the mount point — presumably
# /content; confirm, or use the absolute path as the later cell does.
data = pd.read_csv(
    "drive/MyDrive/hackathon/msk_impact_mutation_table_wcancertype.csv")
# NOTE(review): this is not the last statement of the cell, so the preview
# is evaluated but not rendered.
data.head()

def construct_adjacency(data, list_of_genes = ["all"],
                        list_of_tumors = ["cohort"], metric = "chi-sq"):
  '''
  Build one gene-gene adjacency matrix per requested tumor type.

  data: pandas data frame with one row per patient, a "pat_id" column, a
    "cancer_type" column and one binary column per gene.
  list_of_genes: gene columns to keep. If it is empty or its first element
    is "all", every gene is used.
  list_of_tumors: cancer types to build matrices for, one matrix each. If it
    is empty or its first element is "cohort", a single matrix is built from
    all patients together.
  metric: which metric relates the genes; forwarded unchanged to
    choose_adjacency_method.

  Returns a list of adjacency matrices, in the order of list_of_tumors (or a
  one-element list for the whole cohort).
  '''
  data = data.drop("pat_id", axis= 1)

  if list_of_genes and list_of_genes[0] != "all":
    # Keep "cancer_type" alongside the selected genes: the tumor filter below
    # and chi_sq_adjacency both need it, and selecting only the genes used to
    # raise KeyError.
    selected = ["cancer_type"] + [g for g in list_of_genes if g != "cancer_type"]
    data = data[selected]

  if list_of_tumors and list_of_tumors[0] != "cohort":
    return [
      choose_adjacency_method(data[data["cancer_type"] == tumor], metric)
      for tumor in list_of_tumors
    ]
  return [choose_adjacency_method(data, metric)]

def choose_adjacency_method(data, metric = "chi-sq"):
  """Dispatch to the adjacency builder selected by `metric`.

  "proportion" uses calcProp (Kyle's function); any other value, including
  the default "chi-sq", uses chi_sq_adjacency (Aziz's function).

  Returns whatever the chosen builder returns for `data`.
  """
  build_adjacency = calcProp if metric == "proportion" else chi_sq_adjacency
  return build_adjacency(data)
def calcProp(df):
  '''
  Co-mutation proportion for every pair of genes.

  For a pair of genes let
      z = number of patients in which both genes are mutated (both 1)
      y = number of patients in which exactly one of them is mutated
          (one is 1, the other is 0)
  The edge weight is z / (z + y). Patients with a missing value in either
  gene count towards neither z nor y.

  df: data frame with one binary (0/1) column per gene. Columns named
    "pat_id" and "cancer_type" are ignored if present. They are excluded by
    name rather than by position, so the function works both on the full
    table and on the table construct_adjacency passes in (which has already
    lost "pat_id").

  Returns a genes x genes data frame of floats. Only the strict upper
  triangle (row gene listed before column gene) carries weights; the
  diagonal and lower triangle are 0.0, the same layout chi_sq_adjacency
  produces. A pair in which neither gene is ever mutated gets weight 0.0
  instead of raising ZeroDivisionError.
  '''
  genes = [col for col in df.columns if col not in ("pat_id", "cancer_type")]
  values = df[genes].to_numpy(dtype=float)

  # Indicator matrices; NaN is neither "mutated" nor "unmutated".
  mutated = (values == 1).astype(np.int64)
  unmutated = (values == 0).astype(np.int64)

  # Pairwise counts for all gene pairs at once.
  both = mutated.T @ mutated
  only_one = mutated.T @ unmutated + unmutated.T @ mutated
  total = both + only_one

  proportion = np.divide(
    both, total,
    out=np.zeros(both.shape, dtype=float),
    where=total > 0,  # leave 0.0 where the ratio is undefined
  )
  proportion = np.triu(proportion, k=1)

  return pd.DataFrame(proportion, columns=genes, index=genes)

def chi_sq_adjacency(data):
  """Adjacency matrix of -log10 FDR-adjusted chi-square p-values.

  Every unordered pair of columns (after dropping "cancer_type") is
  cross-tabulated and tested with a chi-square contingency test. The p-values
  are corrected together with Benjamini-Hochberg ("fdr_bh") and written into
  the upper triangle of the result as -log10(adjusted p). An adjusted p-value
  of exactly 0 is stored as 10000. The diagonal and lower triangle stay 0.

  Args:
    data: DataFrame with a "cancer_type" column plus one column per gene.

  Returns:
    Square DataFrame indexed and labelled by the gene columns.
  """
  gene_data = data.drop("cancer_type", axis = 1)
  labels = gene_data.columns
  n_cols = len(labels)
  adj_m = pd.DataFrame(np.zeros(shape=(n_cols, n_cols)),
                       columns = labels,
                       index = labels)

  # Pass 1: raw p-values, visiting pairs column by column (row < col).
  p_vals = []
  for col in trange(n_cols):
    for row in range(col):
      tab = pd.crosstab(index=gene_data[labels[row]],
                        columns=gene_data[labels[col]])
      _, p, _, _ = stats.chi2_contingency(tab)
      p_vals.append(p)

  _, adj_p, _, _ = multipletests(p_vals, method  = "fdr_bh")

  # Pass 2: walk the pairs in the same order and consume the adjusted
  # p-values one by one.
  remaining = iter(adj_p.tolist())
  for col in range(n_cols):
    for row in range(col):
      adj_val = next(remaining)
      if adj_val == 0:
        adj_m.iloc[row, col] = 10000
      else:
        adj_m.iloc[row, col] = -1 * math.log(adj_val, 10)
  return adj_m

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import requests
import random
import ipywidgets as wid

def multi_checkbox_widget(options_dict):
    """Build a searchable list of checkboxes, checked boxes shown first.

    Args:
        options_dict: mapping of label -> checkbox widget. The labels must be
            strings and must equal each checkbox's `description`, because the
            search matches on the dict keys and then selects checkboxes by
            description. Typical construction::

                options_dict = {
                    label: wid.Checkbox(description=label, value=False,
                                        style={"description_width": "0px"})
                    for label in labels
                }

    Returns:
        A VBox holding the search text field above the scrollable checkbox
        list. An Output widget capturing callback output is displayed as a
        side effect before returning.
    """
    search_widget = wid.Text()
    output_widget = wid.Output()
    options = list(options_dict.values())
    options_layout = wid.Layout(
        overflow='auto',
        border='1px solid black',
        width='300px',
        height='300px',
        flex_flow='column',
        display='flex'
    )

    options_widget = wid.VBox(options, layout=options_layout)
    multi_select = wid.VBox([search_widget, options_widget])

    def checked_first(checkboxes):
        # Stable sort: checked boxes move to the top, relative order is kept.
        return sorted(checkboxes, key=lambda box: box.value, reverse=True)

    @output_widget.capture()
    def on_checkbox_change(change):
        # Re-sort whatever is currently shown (possibly a filtered subset).
        options_widget.children = checked_first(options_widget.children)

    for checkbox in options:
        checkbox.observe(on_checkbox_change, names="value")

    # Wire the search field to the checkboxes
    @output_widget.capture()
    def on_text_change(change):
        # Trim surrounding whitespace; the previous `strip('')` removed
        # nothing, so a stray space made the search match no label.
        search_input = change['new'].strip().lower()
        if search_input == '':
            # Reset search field
            new_options = checked_first(options)
        else:
            # Case-insensitive substring match on the labels.
            matching = {label for label in options_dict
                        if search_input in label.lower()}
            new_options = checked_first(
                [box for box in options if box.description in matching]
            )
        options_widget.children = new_options

    search_widget.observe(on_text_change, names='value')
    display(output_widget)
    return multi_select

def f(**args):
    """Display the names of the keyword arguments whose value is truthy."""
    results = []
    for key, value in args.items():
        if value:
            results.append(key)
    display(results)

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

# Registry of the input widgets, keyed by a short name; later cells add
# further widgets to it and read selections back from it.
user_inputs = {}

# Radio buttons choosing how edge weights are computed. The option strings
# are the same ones the adjacency code compares its `metric` argument to.
user_inputs['edge_weight_rb'] = widgets.RadioButtons(
    options=['proportion', 'chi-sq'],
    description='Edge Weight Calculation:',
    style={"description_width":"40px"},
    disabled=False
)
# display(user_inputs['edge_weight_rb'])
# user_inputs['tumor_types_lb'] = widgets.Label(value="Tumor Types")
# display(widgets.Label(value="Tumor Types"))
# user_inputs['tumor_types_lb'].layout.visibility = 'hidden'

# tumor_types = ["data1", "data2", "data3", "data4"]
# selected_data = []
# user_inputs['checkboxes'] = [widgets.Checkbox(value=False, description=label) for label in tumor_types]
# for cb in user_inputs['checkboxes']:
#   display(cb)
# options_dict = {
#     x: widgets.Checkbox(
#         description=x, 
#         value=False,
#         style={"description_width":"0px"}
#     ) for x in tumor_types
# }
# user_inputs['tumor_cb'] = multi_checkbox_widget(options_dict)
# display(user_inputs['tumor_cb'])
# user_inputs['tumor_cb'].layout.visibility = 'hidden'

# NOTE(review): no module-level assignment of `options_dict` is visible in
# this notebook (the only live one sits inside multi_checkbox_widget), so on a
# fresh kernel this line raises NameError — define the checkbox dict first.
selected_tumor_types = wid.interactive_output(f, options_dict)
# display(wid.HBox(user_inputs['tumor_cb']))

In [None]:
def draw_graph():
  """Read the selected edge-weight method from the widgets.

  Stub: the selection is looked up but not used yet, and nothing is returned.
  """
  ### GET INPUT FROM WIDGETS STORED IN USER_INPUTS
  edge_weight_widget = user_inputs['edge_weight_rb']
  edge_weight_type = edge_weight_widget.value

In [None]:
import pandas as pd

def get_graphml(df, filename='graph.xml', percentile=99):
  """Threshold an adjacency matrix, write it as GraphML and download it.

  Weights below the given percentile of the non-zero weights are set to 0,
  nodes left without any edge are removed, and the resulting graph is written
  to `filename` and offered for download through Colab.

  Args:
    df: square adjacency DataFrame; its column labels are used as node names
      for both axes. It is not modified.
    filename: path of the GraphML file to write.
    percentile: percentile (0-100) of the non-zero weights used as the
      cut-off. Defaults to 99, the previously hard-coded value.
  """
  import pandas as pd
  import numpy as np
  import networkx as nx
  from google.colab import files

  # Work on a copy: to_numpy() may hand back a view of the frame's own data,
  # and thresholding that in place would silently edit the caller's matrix.
  weights = df.to_numpy(dtype=float, copy=True)
  # Missing weights mean "no edge"; left as NaN they would make the
  # percentile NaN and disable the thresholding altogether.
  weights[np.isnan(weights)] = 0

  nonzero = weights[weights != 0]
  if nonzero.size:  # nothing to threshold in an all-zero matrix
    thresh = np.percentile(nonzero, percentile)
    weights[weights < thresh] = 0

  pruned = pd.DataFrame(weights, columns=df.columns, index=df.columns)
  G = nx.from_pandas_adjacency(pruned)
  G.remove_nodes_from(list(nx.isolates(G)))
  nx.write_graphml(G, filename)
  files.download(filename)

def on_draw_click(btn):
  """Click handler for the draw button: export the stored chi-sq matrix.

  The matrix is read from a fixed CSV on Google Drive and passed to
  get_graphml. The widget selections are not consulted; the call that would
  compute the matrix from them is kept below, disabled.

  Args:
    btn: the clicked button (unused).
  """
  # adj_mat = construct_adjacency(mutation_table, list_of_genes = ["all"],
  #                               list_of_tumors = [user_inputs['tumors_rb'].value],
  #                               metric = user_inputs['edge_weight_rb'].value)
  matrix_path = "/content/drive/MyDrive/hackathon/chi-sq_matrix.csv"
  get_graphml(pd.read_csv(matrix_path, index_col=0))

def on_file_upload(value):
  """Observer for the file-upload widget; currently a no-op.

  Defined before the widget is wired up so that registering it with
  `observe` below does not raise NameError on a fresh kernel.

  Args:
    value: the change notification passed by the widget (unused).
  """
  return
  # Draft of the intended behaviour, kept for reference:
  # value = value['new']
  # csv_path = list(value.keys())[0]
  # cbioportal = launch_cbioportal()
  # studyId = select_studies(cbioportal)
  # sample_list = select_samples(cbioportal)
  # mprofile = select_molprofile(cbioportal)
  # mutation_table_df = get_raw_mutation_table(cbioportal, mprofile, sample_list)
  # clinical_df = obtain_clinical_data(cbioportal, studyId)
  # print(csv_path)
  # mutation_table = pd.read_csv(csv_path)
  # mutation_table = merge_tables(mutation_table_df, clinical_df)
  # genes, cancers = gene_cancer_lists(mutation_table)

user_inputs['input_file'] = widgets.FileUpload(
    accept='',
    multiple=False,
    description='Upload data'
)

user_inputs["input_file"].observe(on_file_upload, names='value')
display(user_inputs['input_file'])

# `cancers` is the cancer-type -> count mapping produced by gene_cancer_lists
# in the first cell; list the most frequent types first.
sorted_cancers = sorted(cancers.items(), key=lambda x: x[1], reverse=True)

display(user_inputs['edge_weight_rb'])

### GET TUMOR TYPES CODE
# Disabled multi-select variant (checkbox per tumor type):
# tumor_types = ["data1", "data2", "data3", "data4"]
# tumors_dict = {
#     x: widgets.Checkbox(
#         description=x[0] + " (" + str(x[1]) + ")", 
#         value=False,
#         style={"description_width":"0px"}
#     ) for x in sorted_cancers
# }

display(widgets.Label(value="Tumor Types"))
# user_inputs['tumor_cb'] = multi_checkbox_widget(tumors_dict)
# display(user_inputs['tumor_cb'])

# Human-readable labels of the form "<cancer type> (<count>)".
pty_sorted_cancers = []
for x in sorted_cancers:
  pty_sorted_cancers.append(x[0] + " (" + str(x[1]) + ")")

# NOTE(review): the radio button's `.value` is the full label including the
# " (<count>)" suffix, not the bare cancer type — strip the suffix before
# comparing it with the `cancer_type` column.
user_inputs['tumors_rb'] = widgets.RadioButtons(options=pty_sorted_cancers)
display(user_inputs['tumors_rb'])

# One unchecked checkbox per gene, keyed by gene name; `genes` comes from
# gene_cancer_lists in the first cell. Key and description are the same
# string, which the search box of multi_checkbox_widget relies on.
genes_dict = {
    x: widgets.Checkbox(
        description=x, 
        value=False,
        style={"description_width":"0px"}
    ) for x in genes
}

display(widgets.Label(value="Gene Types"))
user_inputs['gene_cb'] = multi_checkbox_widget(genes_dict)
display(user_inputs['gene_cb'])

# Button that triggers the GraphML export via on_draw_click.
user_inputs['draw_button'] = widgets.Button(
  description='Draw Graph',
  disabled=False,
  button_style='', # 'success', 'info', 'warning', 'danger' or ''
  icon='check' # (FontAwesome names without the `fa-` prefix)
)
user_inputs['draw_button'].on_click(on_draw_click)
display(user_inputs['draw_button'])


FileUpload(value={}, description='Upload data')

RadioButtons(description='Edge Weight Calculation:', options=('proportion', 'chi-sq'), style=DescriptionStyle(…

Label(value='Tumor Types')

RadioButtons(options=('Non-Small Cell Lung Cancer (1668)', 'Breast Cancer (1337)', 'Colorectal Cancer (1007)',…

Label(value='Gene Types')

Output()

VBox(children=(Text(value=''), VBox(children=(Checkbox(value=False, description='AKT1', style=DescriptionStyle…

Button(description='Draw Graph', icon='check', style=ButtonStyle())

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Whole-cohort chi-square adjacency over all genes. The result is not
# assigned to a variable, so it is only available as the cell's output.
# NOTE: slow — the recorded run was interrupted after about 10 minutes with
# 317 of 413 columns processed.
construct_adjacency(mutation_table, list_of_genes = ["all"],
                        list_of_tumors = ["cohort"], metric = "chi-sq")

 77%|███████▋  | 317/413 [09:59<03:01,  1.89s/it]


KeyboardInterrupt: ignored

In [None]:
# Design notes (pseudo-code, not meant to run):
# display box (input: array of options, initially empty)

# file upload () => {
#     update the array of options
# }

IndentationError: ignored

In [None]:
# Currently selected tumor label; requires the cell that creates
# user_inputs['tumors_rb'] to have been run first (otherwise KeyError).
user_inputs['tumors_rb'].value

KeyError: ignored