## Annotate the mutations of the `'msk_impact_2017'` study from cBioPortal using the OncoKB REST API.

First import `bravado` and `pandas`.

Initialize REST APIs for cBioPortal and OncoKB.

In [None]:
from bravado.client import SwaggerClient
from bravado.requests_client import RequestsClient
from bravado.swagger_model import load_file, Loader

from pprint import pprint
from copy import *

import matplotlib
import pandas

%matplotlib inline

# Default matplotlib figure size as (width, height).
plot_w, plot_h = matplotlib.rcParams['figure.figsize']

# cBioPortal client; request and response validation are switched off.
cbioportal = SwaggerClient.from_url('https://www.cbioportal.org/api/api-docs',
                                config={"validate_requests":False,"validate_responses":False})

# OncoKB swagger is generated separately from the main website, so the host in the
# loaded spec has to be changed manually to www.oncokb.org
oncokb_swagger_url='https://www.oncokb.org/api/v1/v2/api-docs?group=Public%20APIs'
http_client = RequestsClient()
loader = Loader(http_client)
# Load the spec from the URL defined above rather than repeating the literal.
spec_dict = loader.load_spec(oncokb_swagger_url)
spec_dict['host'] = 'www.oncokb.org'
oncokb = SwaggerClient.from_spec(spec_dict,
                                config={"validate_requests":False,"validate_responses":False})

### Define a function that sends annotation requests to OncoKB via POST.
#### If you don't have an OncoKB API token yet, request one and set `oncokb_token` below.

In [None]:
def make_oncokb_post_request(requests):
    """POST a batch of annotation queries to OncoKB and return the decoded result.

    Args:
        requests: the query objects sent as the request body of
            ``annotateMutationsByProteinChangePostUsingPOST_1``.

    Returns:
        The value produced by ``json.loads`` on the operation's response result.
    """
    # Imported locally so the function works even if no earlier cell imported json.
    import json

    # Set this to your OncoKB API token before running.
    oncokb_token=''
    oncokb_request_headers = {
      'Authorization': 'Bearer {}'.format(oncokb_token)
    }

    request_options = {
        # === bravado config ===
        'headers': oncokb_request_headers,
    }
    return json.loads(oncokb.Annotations.annotateMutationsByProteinChangePostUsingPOST_1(
            _request_options=request_options,
            body=requests
        ).response().result)

Define a function that submits the mutations to OncoKB in batches and collects their annotations.

In [None]:
# Maximum number of queries sent to OncoKB in a single POST request.
POST_LENGTH_THRESHOLD=1000
def annotate_mutations(mutations):
    """Annotate mutations with OncoKB, sending them in batches.

    Args:
        mutations: a sized sequence of objects exposing ``entrezGeneId`` and
            ``proteinChange``.

    Returns:
        A list of ``(mutation, annotation)`` tuples in input order. Pairs are built
        with ``zip``, so a response shorter than its batch yields fewer pairs.
    """
    # The model classes do not change between iterations, so look them up once.
    QueryGene = oncokb.get_model('QueryGene')
    AnnotateMutationByProteinChangeQuery = oncokb.get_model('AnnotateMutationByProteinChangeQuery')

    msk_mutation_annotations = []
    requests = []
    sub_mutations = []
    for idx, m in enumerate(mutations):
        requests.append(
            AnnotateMutationByProteinChangeQuery(gene=QueryGene(entrezGeneId=m.entrezGeneId), alteration=m.proteinChange))
        sub_mutations.append(m)
        # Flush as soon as a full batch has been collected.
        if len(requests) == POST_LENGTH_THRESHOLD:
            resp = make_oncokb_post_request(requests)
            msk_mutation_annotations.extend(zip(sub_mutations, resp))
            print('Annotated {}/{} mutations'.format(idx+1, len(mutations)))
            requests = []
            sub_mutations = []
    # Send whatever is left over as a final, smaller batch.
    if requests:
        resp = make_oncokb_post_request(requests)
        msk_mutation_annotations.extend(zip(sub_mutations, resp))
        print('Annotated {}/{} mutations'.format(len(mutations), len(mutations)))
    return msk_mutation_annotations

Retrieve all available clinical data in `'msk_impact_2017'` study:

In [None]:
# Download every clinical data entry recorded for the 'msk_impact_2017' study.
msk_clinical_data = cbioportal.Clinical_Data.getAllClinicalDataInStudyUsingGET(
    studyId='msk_impact_2017',
).result()

The clinical data returned from `getAllClinicalDataInStudyUsingGET()` can be inserted into a DataFrame.

In [None]:
%%time

# Collapse the per-attribute clinical entries into one record per sample.
parsed_clinical_data = {}
for cd in msk_clinical_data:
    sample_key = cd.uniqueSampleKey
    sample_record = parsed_clinical_data.get(sample_key)
    if sample_record is None:
        # First entry seen for this sample: store its identifiers with the attribute.
        parsed_clinical_data[sample_key] = {
            'uniqueSampleKey': sample_key,
            cd.clinicalAttributeId: cd.value,
            'patientId': cd.patientId,
            'sampleId': cd.sampleId,
            'studyId': cd.studyId,
            'uniquePatientKey': cd.uniquePatientKey,
        }
    else:
        # Later entries only contribute their attribute value.
        sample_record[cd.clinicalAttributeId] = cd.value

# One DataFrame row per sample record.
cd_df = pandas.DataFrame.from_dict(list(parsed_clinical_data.values()))

In [None]:
# parsed_clinical_data is keyed by uniqueSampleKey, so its length is the sample count.
sample_count = len(parsed_clinical_data)
print("number of clinical data endpoints:", sample_count)

Here, the study samples can be categorized under `'CANCER_TYPE'`.

In [None]:
# Count distinct samples per cancer type, largest groups first.
sample_keys_by_cancer_type = cd_df.groupby('CANCER_TYPE')['uniqueSampleKey']
msk_clinical_data_sorted_cancer_type = (
    sample_keys_by_cancer_type.nunique().sort_values(ascending=False)
)

Here is a plot of the study's samples sorted by `'CANCER_TYPE'`:

Obtain mutations for `'msk_impact_2017'`:

In [None]:
%%time

# Fetch the mutations of every sample in the study's "all" sample list.
mutation_query = {
    'molecularProfileId': 'msk_impact_2017_mutations',
    'sampleListId': 'msk_impact_2017_all',
    'projection': 'DETAILED',
}
mutations = cbioportal.Mutations.getMutationsInMolecularProfileBySampleListIdUsingGET(
    **mutation_query
).result()

Combine `'msk_impact_2017'` clinical data endpoints with `'msk_impact_2017'` mutations endpoints.

Obtain the set of mutation entrez gene IDs (this makes it easier to count the total number of `entrezGeneId`s):

In [None]:
# A set keeps each entrez gene ID only once, however many mutations share it.
mutations_entrezGeneId = {m.entrezGeneId for m in mutations}

gene_id_count = len(mutations_entrezGeneId)
print("There are {0} `'msk_impact_2017'` mutation entrez gene IDs!".format(gene_id_count))

Use gene mutation ID and proteinChange to filter OncoKB's Annotations:

In [None]:
# Work on a deep copy so parsed_clinical_data itself gains no 'mutations' entries.
msk_cd_with_mutations = deepcopy(parsed_clinical_data)
for m in mutations:
    # Indexing raises KeyError when the mutation's sample has no clinical record.
    msk_cd_with_mutations[m.uniqueSampleKey].setdefault('mutations', []).append(m)

samples_without_mutations = sum(
    1 for sample in msk_cd_with_mutations.values() if 'mutations' not in sample
)
print("There are {0} `'msk_impact_2017'` study samples without mutations.\n".format(samples_without_mutations))
print("Total number of mutations in `'msk_impact_2017'` study:", len(mutations))

## Requesting annotations for all `'msk_impact_2017'` study mutations (might take ~5 minutes)!

In [None]:
%%time

import json

# Annotate every mutation; the result pairs each mutation with its annotation.
msk_mutation_annotations = annotate_mutations(mutations)

# Every mutation is expected to come back with exactly one annotation.
annotated_count, expected_count = len(msk_mutation_annotations), len(mutations)
assert annotated_count == expected_count, "some annotation requests failed"

In [None]:
# Show the first (mutation, annotation) pair.
msk_mutation_annotations[0]

In [None]:
# One flat record per annotated mutation: its oncogenicity, its patient key, and
# the cancer type recorded for its sample.
msk_annotated_mutation_cancer_types = [
    {
        'oncogenic': annotation['oncogenic'],
        'uniquePatientKey': mutation['uniquePatientKey'],
        'CANCER_TYPE': parsed_clinical_data[mutation.uniqueSampleKey]['CANCER_TYPE'],
    }
    for mutation, annotation in msk_mutation_annotations
]

In [None]:
# Number of records built from the annotated mutations.
len(msk_annotated_mutation_cancer_types)

In [None]:
# Show the first record.
msk_annotated_mutation_cancer_types[0]

In [None]:
# Keep only the records whose oncogenicity label is one of these three values.
driver_labels = ('Oncogenic', 'Likely Oncogenic', 'Predicted Oncogenic')
msk_annotated_mutation_cancer_types_driver = [
    am for am in msk_annotated_mutation_cancer_types
    if am['oncogenic'] in driver_labels
]





In [None]:
# Number of records kept by the oncogenicity filter.
len(msk_annotated_mutation_cancer_types_driver)

In [None]:
# Show the first kept record.
msk_annotated_mutation_cancer_types_driver[0]

In [None]:
# Tabulate the kept records: one row per mutation, one column per record key.
df = pandas.DataFrame.from_dict(
    data=msk_annotated_mutation_cancer_types_driver,
)

In [None]:
# For every cancer type, collect the number of rows each patient contributes.
result = {}
for (cancer_type, _patient_key), patient_rows in df.groupby(["CANCER_TYPE","uniquePatientKey"]):
    result.setdefault(cancer_type, []).append(len(patient_rows))

# Split the mapping into parallel lists: cancer type names and their count lists.
columns = list(result.keys())
data = list(result.values())

In [None]:
# One list per cancer type, holding the per-patient row counts collected above.
print(data)

In [None]:
# Each list in data starts as a row; transpose so each one becomes a column.
counts_by_row = pandas.DataFrame(data)
reverted_df = counts_by_row.transpose()

In [None]:
# Label the columns with the cancer types, in the order their lists were collected.
reverted_df.columns=columns

In [None]:
# Median of each column, indexed by column label.
meds = reverted_df.median(axis=0)

In [None]:
# Per-column medians of reverted_df.
print(meds)

In [None]:

# Order the medians from lowest to highest, then reorder the columns to match.
meds = meds.sort_values(ascending=True)
column_order = meds.index
reverted_df = reverted_df[column_order]

In [None]:
# Horizontal box plot with one box per column of reverted_df.
reverted_df.plot.box(vert=False, figsize=(20, 20))