In [1]:
import requests
import json
import numpy as np 
import pandas as pd
import os
import pickle
import regex as re
from zipfile import ZipFile
import gzip
import tarfile
from fuzzywuzzy import process
from itertools import *
import collections

In [2]:
def get_clinical_data_online(filters_table, fields_table, cases_endpoint):
    params_table = {
    "filters": json.dumps(filters_table),
    "fields": ",".join(fields_table),
    "format": "TSV",
    "size": "5000"
    }
    response_table = requests.get(cases_endpoint, params = params_table)
    clinical_data = response_table.content.decode("utf-8")
    clinical_data_processed = clinical_data.replace("\t", ",")
    clinical_data_processed = clinical_data_processed.replace("\r\n", ",")
    cd = clinical_data_processed.split(",")
    
    #find index with first occurance of a number 
    for i, value in enumerate(cd):
        if re.search(r"\d", value)!= None:
            reshape_value = i
            break
    
    cd_array = np.asarray(cd[0:-1]) 
    cd_array = cd_array.reshape(-1, reshape_value)
    cd_df = pd.DataFrame(cd_array[1:], columns=cd_array[0])
    column_renaming_dictionary = {value:value.split('.')[-1] for value in cd_df.columns}
    cd_df.rename(columns = column_renaming_dictionary, inplace = True)
    return cd_df

In [3]:
def get_clinical_data_files_locally(root_dir):
    clinical_dfs = []
    for root, dirs, files in os.walk(root_dir):
        for file in files:
            if file.endswith(".txt") and "MANIFEST" not in file:
                clinical_dfs.append(pd.read_csv(os.path.join(root,file), sep = '\t', skiprows = [2], header = 1)) #skiprows= 2, header = 0
    return clinical_dfs

In [4]:
def lowercase_dataframe(dfs):
    lowercase_dfs = []
    for df in dfs:
        lowercase_dfs.append(df.applymap(lambda s:s.lower() if type(s) == str else s))
    return lowercase_dfs

In [5]:
def uuid_index(lowercase_clinical_dfs):
    for df in lowercase_clinical_dfs:
        if df.index.name!='bcr_patient_uuid':
            try: 
                df.set_index('bcr_patient_uuid', inplace = True) 
            except:
                continue

In [6]:
def limit_to_select_columns(column_set, lowercase_dfs):
    limited_dfs = []
    for df in lowercase_dfs:
        available_column = [column_name for column_name in column_set if column_name in df.columns]
        limited_dfs.append(df.loc[:,available_column])
    limited_dfs =[df for df in limited_dfs if df.shape[1]!=0]
    return limited_dfs

In [7]:
def concat_dfs_on_patient_uuid(dfs):
    df1 = dfs[0]
    i=1
    for df in dfs[1:]:
        df1= df1.join(df, on='bcr_patient_uuid', rsuffix= f'{i}', how='outer')
        i+=1
    df1.drop_duplicates(inplace=True)
    df1.set_index('bcr_patient_uuid', inplace=True)
    return df1

In [8]:
def push_columns_together(column_set, dataframe):
    for column_type in column_set: 
        subset_group = [column_name for column_name in dataframe.columns if column_type in column_name]
        if column_type!='karnofsky_performance_score':   
            dataframe[column_type] = dataframe[subset_group].fillna('').agg(','.join, axis=1).str.strip(',')
        if column_type=='karnofsky_performance_score':
            for column in subset_group:
                dataframe[column] = pd.to_numeric(dataframe[column],errors='coerce')
            dataframe[column_type]=dataframe[subset_group].max(axis=1)
    #dataframe['drug_name']=dataframe.drug_name.str.split(r"\||,")
    dataframe['vital_status']=['dead' if 'dead' in value else 'alive' for value in dataframe['vital_status']]
    #dataframe = dataframe.explode('drug_name')
    return dataframe[column_set]

In [9]:
def explode_drug_name_column(dataframe):
    df = dataframe.copy()
    df.drug_name = df.drug_name.str.split(r"\||,")
    exploded_dataframe = df.reset_index().explode('drug_name')
    exploded_dataframe.set_index('bcr_patient_uuid', inplace=True)
    #the explode function makes some accidental duplicates when there are duplicate indices, hence the index reset
    return exploded_dataframe

In [28]:
df5.drug_name.value_counts()

NameError: name 'df5' is not defined

In [29]:
df5 =explode_drug_name_column(df4)

NameError: name 'df4' is not defined

In [10]:
def get_caseid_txt_file(dataframe, filename):
    uuids = dataframe[(dataframe.drug_name!='')
                      &(dataframe.drug_name!='[not available]')
                      &(dataframe.drug_name!='[unknown]')].index
    with open(f'{filename}.txt','w') as f:
      f.write('\n'.join(uuids.unique()))
    return uuids

In [31]:
column_set

NameError: name 'column_set' is not defined

In [11]:
kidney_filters_table = {
    "op":"and",
    "content":[{"op":"in", 
                "content":{
        "field":"primary_site",
        "value": ["Kidney"]}}, 
                {"op":"in",
                 "content":{
                     "field":"files.experimental_strategy", 
                     "value":["RNA-Seq"]}}]}



fields_table = ["case_id","primary_site","project.project_id"] 

cases_endpoint ='https://api.gdc.cancer.gov/cases'

kidney_cd_df = get_clinical_data_online(kidney_filters_table, fields_table, cases_endpoint)

kidney_cd_df.project_id.value_counts() 
# this provides the list of projects from which to fetch clinical data  
# unfortunately, only TCGA provides drug information

TCGA-KIRC      530
TCGA-KIRP      289
TARGET-WT      125
CPTAC-3        110
TCGA-KICH       66
TARGET-RT       57
TARGET-CCSK     13
TARGET-NBL       6
TCGA-SARC        2
Name: project_id, dtype: int64

In [16]:
root_dir = '/Users/dinakats/Desktop/SPICED/final_proj_git_renew/Genetics-to-Therapuetics/Data/kidney2test'

In [13]:
def get_all_clinical_files_for_disease(clinical_data_dataframe, root_dir_for_files):
    files_endpt = "https://api.gdc.cancer.gov/files"
    #only tcga has the correctly formatted clinical files 
    tcga_project_ids =[project_id for project_id in kidney_cd_df.project_id.value_counts().index if "TCGA" in project_id]
    
    for project_id in tcga_project_ids: 
        print(project_id)
        filters = {
            "op": "and",
            "content":[
                {
                "op": "in",
                "content":{
                    "field": "cases.project.program.name",
                    "value": [project_id.split("-")[0]]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "cases.project.project_id",
                    "value": [project_id]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "files.data_category",
                    "value": ["clinical"]
                    }
                },
                {
                "op": "in",
                "content":{
                    "field": "files.data_format",
                    "value": ["bcr biotab"]
                    }}]
                }

        params = {
        "filters": json.dumps(filters),
        "fields": "file_id",
        "format": "JSON",
        "size": "20"
        }
        
        # Here a GET is used, so the filter parameters should be passed as a JSON string.
        response = requests.get(files_endpt, params = params)
        file_uuid_list = []

        # This step populates the download list with the file_ids from the previous query
        for file_entry in json.loads(response.content.decode("utf-8"))["data"]["hits"]:
            file_uuid_list.append(file_entry["file_id"])

        data_endpt = "https://api.gdc.cancer.gov/data"

        params = {"ids": file_uuid_list}

        response = requests.post(data_endpt, data = json.dumps(params), headers = {"Content-Type": "application/json"})
        response_head_cd = response.headers["Content-Disposition"]
        print(response_head_cd)

        file_name = f'{project_id}.tar.gz'

        with open(f'{root_dir_for_files}/{file_name}', "wb") as output_file:
            output_file.write(response.content)

In [14]:
get_all_clinical_files_for_disease(kidney_cd_df, root_dir)

TCGA-KIRC
attachment; filename=gdc_download_20200730_223315.823238.tar.gz


FileNotFoundError: [Errno 2] No such file or directory: '/Users/dinakats/Desktop/SPICED/final_proj_git_renew/Genetics-to-Therapuetics/Data/kidney3test/TCGA-KIRC.tar.gz'

In [17]:
unzip_clinical_files(root_dir)

[]

In [15]:
def unzip_clinical_files(root_dir):
    list_of_files_to_unpack =[]
    for filename in os.listdir(root_dir):
        #try:
        if ".gz" in filename: 
            list_of_files_to_unpack.append(os.path.join(root_dir,filename))
            full_path = root_dir+"/"+filename
            open_tar = tarfile.open(full_path)
            open_tar.extractall(f'{root_dir}/{filename.split(".")[0]}')
            open_tar.close()
        #except:
            #continue
    return list_of_files_to_unpack

In [37]:
def sort_drug_names(dataframe, drug_name_dictionary): 
    #create reverse dictionary from given dictlist
    drug_dict = {}

    for key, value in drug_name_dictionary.items():
        for item in value: 
            drug_dict[item]=key
    #explode out names in parenthesis
    df = dataframe.copy()
    df.drug_name = df.drug_name.str.split(r"\(")
    exploded_df = df.reset_index().explode('drug_name')
    exploded_df.drug_name = exploded_df.drug_name.str.strip('\)')
    
    #use lists and top drug names to correct for any spelling errors
    drug_name_value_counts= exploded_df[(exploded_df.drug_name!='')&(exploded_df.drug_name!='[not available]')].drug_name.value_counts()
    top_used_drugs = drug_name_value_counts[drug_name_value_counts>10].index.tolist()
    correctly_spelled_drug_names =set(top_used_drugs + list(drug_dict.keys())+ list(drug_dict.values()))
    
    fuzzy_match_dict = {}
    fuzzywuzzy_threshold = 85
    for drug in exploded_df.drug_name:
        if drug not in correctly_spelled_drug_names and drug!='':
            new_name, score = process.extractOne(drug, correctly_spelled_drug_names)
            if score>fuzzywuzzy_threshold:
                fuzzy_match_dict[drug]=new_name
    
    #use drug dictionary to replace drug names
    exploded_df['standard_drugs'] = exploded_df.drug_name.map(fuzzy_match_dict).fillna(exploded_df['drug_name'])
    exploded_df['standard_drugs'] = exploded_df.standard_drugs.map(drug_dict).fillna(exploded_df['standard_drugs'])
    exploded_df.drop_duplicates(inplace=True)
    
    return fuzzy_match_dict, exploded_df

In [38]:
alternative_drug_names = {'gemcitabine':['abine','accogem','acytabin','antoril','axigem','bendacitabin','biogem','boligem','celzar',
                          'citegin','cytigem','cytogem','daplax','dbl','demozar','dercin','emcitab','enekamub','eriogem','fotinex',
                          'gebina','gemalata','gembin','gembine','gembio','gemcel','gemcetin','gemcibine','gemcikal','gemcipen',
                          'gemcired','gemcirena','gemcit','gemcitabin','gemcitabina','gemcitabine','gemcitabinum','gemcitan',
                          'gemedac','gemflor','gemful','gemita','gemko','gemliquid','gemmis','gemnil','gempower','gemsol',
                          'gemstad','gemstada','gemtabine','gemtavis','gemtaz','gemtero','gemtra','gemtro','gemvic','gemxit',
                          'gemzar','gentabim','genuten','genvir','geroam','gestredos','getanosan','getmisi','gezt','gitrabin',
                          'gramagen','haxanit','jemta','kalbezar','medigem','meditabine','nabigem','nallian','oncogem','oncoril',
                          'pamigeno','ribozar','santabin','sitagem','symtabin','yu jie','ze fei','zefei'],
                          'sorafenib':['nexavar','bay-439006'],
                          'doxorubicin':['adriamycin','doxil','liposomal doxorubicin'],
                          'doxetaxel':['taxotere'],
                          'pazonib':['votrient'],
                          'sunitinib':['sutent'],
                          'temsirolimus':['torisel'],
                          'avastin':['bevacizumab'],
                          'interferon-alpha':['interferon'],
                          'capecitibine':['xeloda'],
                          'everolimus':['afinitor','rad001'],
                          'trabectedin':['et-743'],
                          'gefitinib':['iressa'],
                          'dacarbazine':['dtic'],
                          'letrozole':['femara'],
                          'il-2':['interleukin', 'interleukin-2', 'il 2', 'il2'],
                          'deforolimus':['ridaforolimus', 'MK-8669', 'AP23573','ap-23573'],
                          'cisplatin':['platinol'],
                          'carboplatin':['paraplatin']
                          }

In [39]:
column_set = ['drug_name', 'karnofsky_performance_score', 'therapy_type', 'vital_status']

In [49]:
df1 = get_clinical_data_files_locally(root_dir)
df1 = lowercase_dataframe(df1)
uuid_index(df1)

In [41]:
len(df1)

82

In [45]:
df1[0].index.name

'bcr_patient_uuid'

In [323]:
df2 = limit_to_select_columns(column_set, df1)

In [324]:
df2[12].index.name

'bcr_patient_uuid'

In [325]:
df3 = concat_dfs_on_patient_uuid(df2)

In [326]:
df3.shape

(1535, 26)

In [327]:
df4 = push_columns_together(column_set,df3)

In [328]:
df4.shape

(1535, 4)

In [330]:
df4.drug_name.value_counts()

                                870
[not available]                 170
gemcitabine                      40
adriamycin                       23
sunitinib                        23
                               ... 
arimidex                          1
temsirolimus,[not available]      1
adriamycin,sunitinib              1
[not available],rituxan           1
pazopanib,[not available]         1
Name: drug_name, Length: 162, dtype: int64

In [362]:
df5 = explode_drug_name_column(df4)

In [363]:
df5.drug_name.value_counts()

                    870
[not available]     206
gemcitabine          45
adriamycin           27
sunitinib            26
                   ... 
leuprolide            1
cyclophosphamide      1
zd6474                1
rad001                1
threshold-302         1
Name: drug_name, Length: 144, dtype: int64

In [370]:
df5.head()

Unnamed: 0,bcr_patient_uuid,drug_name,karnofsky_performance_score,therapy_type,vital_status
0,5cbf0134-023b-4bd0-97da-eb836ccbc729,lupron,,,alive
0,5cbf0134-023b-4bd0-97da-eb836ccbc729,leuprolide,,,alive
1,cff68090-09df-492b-874c-0caeb29f9361,[not available],,,alive
2,5570758c-1f07-4ff9-a570-b7ae39d15a89,[not available],,,alive
3,f9c593d1-f204-4026-8a9b-a6af0922c885,[not available],100.0,,alive


In [110]:
list_uuid = get_caseid_txt_file(df5, 'nope.txt')

In [382]:
df5.drug_name.value_counts()

                    870
[not available]     206
gemcitabine          45
adriamycin           27
sunitinib            26
                   ... 
leuprolide            1
cyclophosphamide      1
zd6474                1
rad001                1
threshold-302         1
Name: drug_name, Length: 144, dtype: int64

In [380]:
df5 =explode_drug_name_column(df4)

In [382]:
df5.drug_name.value_counts()

                    870
[not available]     206
gemcitabine          45
adriamycin           27
sunitinib            26
                   ... 
leuprolide            1
cyclophosphamide      1
zd6474                1
rad001                1
threshold-302         1
Name: drug_name, Length: 144, dtype: int64

In [639]:
misspell_dict, df6 = sort_drug_names(df5, alternative_drug_names)

In [641]:
df6.drug_name.value_counts()

                        848
[not available]         188
gemcitabine              45
sunitinib                29
adriamycin               27
                       ... 
zd6474                    1
pemetrexed injection      1
lyrpon                    1
cytoxan                   1
leuprolide                1
Name: drug_name, Length: 150, dtype: int64

In [642]:
df6.standard_drugs.value_counts()

                       848
[not available]        188
gemcitabine             58
doxorubicin             51
sunitinib               51
                      ... 
cyclophosphamide         1
zd6474                   1
lyrpon                   1
cytoxan                  1
lymphocyte infusion      1
Name: standard_drugs, Length: 108, dtype: int64

In [643]:
 pickle.dump(df6, open( "clinical_data_df6.pickle", "wb" ) )

# To get data files:
visit: https://portal.gdc.cancer.gov/repository?facetTab=cases  
click: Upload Case Set  
upload: text file from get_caseid_txt_file  
after upload click on files:  
 - choose Experimental Strategy - RNA-Seq
 - choose Workflow Type - HTSeq - FPKM  
 
click: Add all files to cart, then go to the cart (upper right corner)
 download the cart, and the metadata file 