In [9]:
# Import libraries
import re

import numpy as np
import pandas as pd

In [10]:
# Normalise sample-ID prefixes, then keep only the part after the prefix
def standardize(sample_ids):
    """Return the tail of every sample ID after rewriting its prefix.

    Each ID is handled in two steps:

    1. If characters 6-10 are ``"MSK-P"`` they are replaced by ``"MSKP"``;
       otherwise, if characters 6-8 are ``"UHN"`` they are replaced by
       ``"UHNA"``; any other ID is left as it is.
    2. The first 11 characters of the (possibly rewritten) ID are dropped.

    Parameters
    ----------
    sample_ids : iterable of str
        Sample IDs to process.

    Returns
    -------
    list of str
        One entry per input ID, in the same order. An ID that is 11
        characters or shorter after step 1 produces an empty string.

    Notes
    -----
    NOTE(review): an ID such as ``"GENIE-JHU-00006-00185"`` is not rewritten
    in step 1, so the fixed 11-character cut returns ``"0006-00185"`` (the
    first character after the second hyphen is lost). Confirm whether that
    is intended.
    """
    def _rewrite_prefix(sample):
        # The "MSK-P" test must come first, exactly as in the if/elif chain.
        if sample[6:11] == "MSK-P":
            return sample[:6] + "MSKP" + sample[11:]
        if sample[6:9] == "UHN":
            return sample[:6] + "UHNA" + sample[9:]
        return sample

    return [_rewrite_prefix(sample)[11:] for sample in sample_ids]

In [12]:
# Load the AACR clinical table and keep its sample IDs in two forms:
#   sample_ids     - the "Sample ID" column exactly as read from the CSV
#   sample_ids_std - the same IDs passed through standardize()
data = pd.read_csv('Data/primary_tumor_public_clinical_data_UPDATED.csv')
sample_ids = list(data['Sample ID'])
sample_ids_std = standardize(sample_ids)

In [13]:
# Read in mutation data
mutations = pd.read_excel('Data/mutation_data.xlsx')

# An .xlsx worksheet can hold at most 1,048,576 rows, i.e. a header plus
# 1,048,575 data rows. A frame that reaches that size was almost certainly
# cut off when the workbook was written, so every downstream count would be
# computed on incomplete data. Surface that loudly instead of continuing silently.
XLSX_MAX_DATA_ROWS = 1048575
if len(mutations) >= XLSX_MAX_DATA_ROWS:
    print(
        "WARNING: mutation table has {} rows, the maximum an .xlsx sheet can "
        "store; the source data is probably truncated. Re-export it as "
        "CSV/TSV and load that instead.".format(len(mutations))
    )

In [14]:
# Keep only the mutation rows whose full Tumor_Sample_Barcode is one of the
# clinical sample IDs (with GENIE Code), report the row counts, and save them.
print(len(mutations))
barcode_in_clinical = mutations["Tumor_Sample_Barcode"].isin(sample_ids)
matched = mutations[barcode_in_clinical]
print(len(matched))
matched.to_csv('Data/matched_genie.csv', index=True)

1048575
15560


In [15]:
# Reduce mutation file size to only have tumor sample barcodes in cleaned data (last numbers)
#
# A row is kept when its Tumor_Sample_Barcode contains any of the standardized
# ID suffixes as a substring.
#  - Empty suffixes are discarded: an empty regex alternative matches every
#    string, which would silently turn the filter into a no-op.
#  - Duplicates are removed (order preserved) to keep the pattern small.
#  - Each suffix is escaped so it is matched literally, not as regex syntax.
suffixes = list(dict.fromkeys(s for s in sample_ids_std if s))
pattern = '|'.join(re.escape(s) for s in suffixes)

if suffixes:
    # na=False: a missing barcode counts as "no match" instead of putting NaN
    # into the mask, which would make the boolean indexing below raise.
    suffix_hit = mutations['Tumor_Sample_Barcode'].str.contains(pattern, regex=True, na=False)
else:
    # Nothing to search for, so select no rows rather than all of them.
    suffix_hit = pd.Series(False, index=mutations.index)

new_mut = mutations[suffix_hit]
print(new_mut)

        Hugo_Symbol  Entrez_Gene_Id Center NCBI_Build  Chromosome  \
11345          TP53          7157.0    JHU     GRCh37        17.0   
14928           NF1          4763.0   DFCI     GRCh37        17.0   
14929         EPHA3          2042.0   DFCI     GRCh37         3.0   
14930           ATM           472.0   DFCI     GRCh37        11.0   
14931            AR           367.0   DFCI     GRCh37         NaN   
...             ...             ...    ...        ...         ...   
1022386       PREX2         80243.0   UCSF     GRCh37         8.0   
1022387       PTPRD          5789.0   UCSF     GRCh37         9.0   
1022388        FGF6          2251.0   UCSF     GRCh37        12.0   
1022389      COL2A1          1280.0   UCSF     GRCh37        12.0   
1022390        CDH1           999.0   UCSF     GRCh37        16.0   

         Start_Position  End_Position Strand  \
11345           7578534       7578534      +   
14928          29552161      29552161      +   
14929          89521655    

In [16]:
# Print the row count of the full mutation table and of the suffix-matched
# subset (one per line), then write the subset to CSV with its index column.
print(len(mutations), len(new_mut), sep='\n')
new_mut.to_csv('Data/matched_nums.csv', index=True)

1048575
15807
