In [2]:
import pandas as pd
from pathlib import Path

import numpy as np

In [3]:
pth = "/home/tsakalis/ntua/nestor_celvia/metadata/gene_adco.csv"

In [4]:
ref_df = pd.read_csv(pth)

In [5]:
base_metadata_pth = Path('/home/tsakalis/ntua/nestor_celvia/metadata')

basic_metadata_file = 'metadata.repository.2024-11-05.json'
clinical_cohort_file = 'clinical.cohort.2024-11-07.json'
biospecimen_file = 'biospecimen.cohort.2024-11-07.json'
gene_expr_file = 'Human__TCGA_OV__UNC__RNAseq__GA_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct'
# gene_expr_file = "Human__TCGA_OV__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct"

gene_mutation_file = 'Human__TCGA_OV__WUSM__Mutation__GAIIx__01_28_2016__BI__Gene__Firehose_MutSig2CV.cbt'

In [6]:
gene_names = pd.read_csv(base_metadata_pth / 'GeneNames.tsv', header=None)

In [7]:
metadata_df = pd.read_json(base_metadata_pth / basic_metadata_file)

clinical_cohort_df = pd.read_json(base_metadata_pth / clinical_cohort_file)

biospecimen_df = pd.read_json(base_metadata_pth / biospecimen_file)
gene_expr_df = pd.read_csv(base_metadata_pth / gene_expr_file, delimiter='\t')

gene_mutation_df = pd.read_csv(base_metadata_pth / gene_mutation_file,
                               delimiter='\t')

In [8]:
metadata_df['submitter_id'] = metadata_df.submitter_id.apply(
    lambda x: '.'.join(x.split('-')[:3]))

In [9]:
merged_genes = pd.merge(gene_expr_df.set_index('attrib_name'),
                        gene_names.set_index(0),
                        left_index=True,
                        right_index=True)


In [10]:
clinical_cohort_df['submitter_id'] = clinical_cohort_df['submitter_id'].apply(
    lambda x: x.replace('-', '.'))

In [11]:
gene_patient = pd.merge(metadata_df,
                        merged_genes.T,
                        left_on='submitter_id',
                        right_index=True)

In [12]:
columns_map = {"submitter_id": "TCGA patient", "fle_name_h5": "path", "file_name": "filename"}

In [13]:
gene_patient['fle_name_h5'] = gene_patient['file_name'].apply(lambda x: x.replace('svs', 'h5')).values


In [14]:
gene_patient.columns

Index(['data_format', 'access', 'associated_entities', 'file_name',
       'submitter_id', 'data_category', 'annotations', 'file_size', 'md5sum',
       'file_id', 'data_type', 'state', 'experimental_strategy', 'GNA11',
       'BTK', 'HRAS', 'RNF43', 'AKT2', 'EGFR', 'ARID1A', 'ERBB4', 'EZH2',
       'MAP2K2', 'DDR2', 'NOTCH3', 'CREBBP', 'MAPK1', 'MYCN', 'RAC1',
       'SMARCB1', 'TSC2', 'CDK6', 'FGFR2', 'PTCH1', 'MYD88', 'CBL', 'GNAQ',
       'PPP2R1A', 'TOP1', 'NTRK1', 'NFE2L2', 'NOTCH2', 'MYC', 'CHEK2', 'MAGOH',
       'PDGFRA', 'BRAF', 'ATM', 'fle_name_h5'],
      dtype='object')

In [15]:
gene_patient['file_id']

0       13c2fa97-02ed-4442-aad8-9c4e6b365adc
2       d93b71fa-bfe0-4402-876a-b51edef5ef86
3       9d990512-1576-4625-8fbd-a8b46fbbdf99
4       26438d97-7bd3-4f4c-9f09-38d5cef705a0
5       525e99fd-2d3f-49b9-bf2f-bbacce843c16
                        ...                 
1364    a20a6cb2-2da0-491d-a22a-3bd74fb3062e
1365    81a8c4e8-f40d-4e3e-b18c-f869917c50cb
1366    d1217464-aa0f-4f9e-aeb0-9513886939c5
1367    019607b4-e183-46ee-b062-9abcbe54ceb5
1368    c43c2fa4-1429-4959-a736-3a0a324f6208
Name: file_id, Length: 1337, dtype: object

In [16]:
gene_patient['TCGA ID'] = gene_patient['file_name'].apply(lambda x: '-'.join(x.split('-')[:4])).values

# gene_patient['TCGA patient'] = gene_patient['file_name'].apply(lambda x: '-'.join(x.split('-')[:4])).values

In [17]:
gene_patient_final = gene_patient.rename(columns_map).copy()

In [18]:
gene_patient_final

Unnamed: 0,data_format,access,associated_entities,file_name,submitter_id,data_category,annotations,file_size,md5sum,file_id,...,NFE2L2,NOTCH2,MYC,CHEK2,MAGOH,PDGFRA,BRAF,ATM,fle_name_h5,TCGA ID
0,SVS,open,[{'entity_submitter_id': 'TCGA-61-1903-01A-01-...,TCGA-61-1903-01A-01-BS1.77116a06-9e30-4bf6-885...,TCGA.61.1903,Biospecimen,"[{'entity_submitter_id': 'TCGA-61-1903', 'note...",200210513,05da084e2d65c34aa87bf865483f8b6d,13c2fa97-02ed-4442-aad8-9c4e6b365adc,...,7.6083,8.3437,7.8797,7.3016,8.3923,5.1404,7.7682,5.9685,TCGA-61-1903-01A-01-BS1.77116a06-9e30-4bf6-885...,TCGA-61-1903-01A
2,SVS,open,[{'entity_submitter_id': 'TCGA-42-2587-01A-01-...,TCGA-42-2587-01A-01-TS1.f9c60f94-e626-4e40-849...,TCGA.42.2587,Biospecimen,"[{'entity_submitter_id': 'TCGA-42-2587', 'note...",138771709,c9d89dc3808a0df5bcfb7fe39994b8ca,d93b71fa-bfe0-4402-876a-b51edef5ef86,...,7.9143,8.6136,7.5858,7.2941,8.6792,7.6029,7.9861,8.4471,TCGA-42-2587-01A-01-TS1.f9c60f94-e626-4e40-849...,TCGA-42-2587-01A
3,SVS,open,[{'entity_submitter_id': 'TCGA-29-2414-01A-02-...,TCGA-29-2414-01A-02-BS2.5006bae6-462f-4310-b8a...,TCGA.29.2414,Biospecimen,"[{'entity_submitter_id': 'TCGA-29-2414', 'note...",141220505,0ef0c1307832bc2c94ad65b2e64eedc4,9d990512-1576-4625-8fbd-a8b46fbbdf99,...,7.6193,10.4179,9.2474,6.6855,8.0525,6.3294,7.8052,6.5273,TCGA-29-2414-01A-02-BS2.5006bae6-462f-4310-b8a...,TCGA-29-2414-01A
4,SVS,open,[{'entity_submitter_id': 'TCGA-29-2414-02A-01-...,TCGA-29-2414-02A-01-TS1.9e8b6cda-a655-40f9-a44...,TCGA.29.2414,Biospecimen,"[{'entity_submitter_id': 'TCGA-29-2414', 'note...",99784483,04cb8fd6eb96121626209cea1cf993d5,26438d97-7bd3-4f4c-9f09-38d5cef705a0,...,7.6193,10.4179,9.2474,6.6855,8.0525,6.3294,7.8052,6.5273,TCGA-29-2414-02A-01-TS1.9e8b6cda-a655-40f9-a44...,TCGA-29-2414-02A
5,SVS,open,[{'entity_submitter_id': 'TCGA-42-2588-01A-01-...,TCGA-42-2588-01A-01-TS1.cc3b36dc-1ce9-4db3-998...,TCGA.42.2588,Biospecimen,"[{'entity_submitter_id': 'TCGA-42-2588', 'note...",116102537,9f04fbc5ce72fabcb0a1009372df12c5,525e99fd-2d3f-49b9-bf2f-bbacce843c16,...,7.6001,9.1204,8.6680,6.3689,8.5931,7.5047,7.1795,6.7160,TCGA-42-2588-01A-01-TS1.cc3b36dc-1ce9-4db3-998...,TCGA-42-2588-01A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364,SVS,open,[{'entity_submitter_id': 'TCGA-13-1484-01A-01-...,TCGA-13-1484-01A-01-BS1.dfebf9da-d2d8-42cd-a94...,TCGA.13.1484,Biospecimen,,271158307,17bad5ae203854c3fa27beeaeb1d51ce,a20a6cb2-2da0-491d-a22a-3bd74fb3062e,...,7.3916,8.7870,8.9912,6.3021,7.9721,5.7415,8.0873,7.1429,TCGA-13-1484-01A-01-BS1.dfebf9da-d2d8-42cd-a94...,TCGA-13-1484-01A
1365,SVS,open,[{'entity_submitter_id': 'TCGA-13-0906-01A-01-...,TCGA-13-0906-01A-01-BS1.e8aa0544-cc48-4941-9d7...,TCGA.13.0906,Biospecimen,,175722371,dff143da3e0fb16bf4b0559a2a92af77,81a8c4e8-f40d-4e3e-b18c-f869917c50cb,...,8.3973,8.8336,8.2922,6.1920,9.4318,6.2662,7.5769,5.9633,TCGA-13-0906-01A-01-BS1.e8aa0544-cc48-4941-9d7...,TCGA-13-0906-01A
1366,SVS,open,[{'entity_submitter_id': 'TCGA-13-0906-01A-01-...,TCGA-13-0906-01A-01-TS1.7be7e649-9db2-4a60-b12...,TCGA.13.0906,Biospecimen,,315044077,2e9988fa5644ccd92fcc669d9d369e24,d1217464-aa0f-4f9e-aeb0-9513886939c5,...,8.3973,8.8336,8.2922,6.1920,9.4318,6.2662,7.5769,5.9633,TCGA-13-0906-01A-01-TS1.7be7e649-9db2-4a60-b12...,TCGA-13-0906-01A
1367,SVS,open,[{'entity_submitter_id': 'TCGA-04-1348-01A-01-...,TCGA-04-1348-01A-01-TS1.ffb07f65-72b7-494c-abf...,TCGA.04.1348,Biospecimen,,109640757,1d974d73037217e9a9d97e08023d6eeb,019607b4-e183-46ee-b062-9abcbe54ceb5,...,8.2965,8.8037,9.1196,6.5004,8.4558,5.9298,7.5717,6.4380,TCGA-04-1348-01A-01-TS1.ffb07f65-72b7-494c-abf...,TCGA-04-1348-01A


In [19]:
gene_targets = ['GNA11',
       'BTK', 'HRAS', 'RNF43', 'AKT2', 'EGFR', 'ARID1A', 'ERBB4', 'EZH2',
       'MAP2K2', 'DDR2', 'NOTCH3', 'CREBBP', 'MAPK1', 'MYCN', 'RAC1',
       'SMARCB1', 'TSC2', 'CDK6', 'FGFR2', 'PTCH1', 'MYD88', 'CBL', 'GNAQ',
       'PPP2R1A', 'TOP1', 'NTRK1', 'NFE2L2', 'NOTCH2', 'MYC', 'CHEK2', 'MAGOH',
       'PDGFRA', 'BRAF', 'ATM']

In [20]:
gene_patient_final.columns

Index(['data_format', 'access', 'associated_entities', 'file_name',
       'submitter_id', 'data_category', 'annotations', 'file_size', 'md5sum',
       'file_id', 'data_type', 'state', 'experimental_strategy', 'GNA11',
       'BTK', 'HRAS', 'RNF43', 'AKT2', 'EGFR', 'ARID1A', 'ERBB4', 'EZH2',
       'MAP2K2', 'DDR2', 'NOTCH3', 'CREBBP', 'MAPK1', 'MYCN', 'RAC1',
       'SMARCB1', 'TSC2', 'CDK6', 'FGFR2', 'PTCH1', 'MYD88', 'CBL', 'GNAQ',
       'PPP2R1A', 'TOP1', 'NTRK1', 'NFE2L2', 'NOTCH2', 'MYC', 'CHEK2', 'MAGOH',
       'PDGFRA', 'BRAF', 'ATM', 'fle_name_h5', 'TCGA ID'],
      dtype='object')

In [21]:
gene_patient_final[gene_targets] = np.log(gene_patient_final[gene_targets].values)

In [22]:
gene_patient_final.to_csv('train_genes_adco.csv', index=False)

In [23]:
gene_patient_final.columns


Index(['data_format', 'access', 'associated_entities', 'file_name',
       'submitter_id', 'data_category', 'annotations', 'file_size', 'md5sum',
       'file_id', 'data_type', 'state', 'experimental_strategy', 'GNA11',
       'BTK', 'HRAS', 'RNF43', 'AKT2', 'EGFR', 'ARID1A', 'ERBB4', 'EZH2',
       'MAP2K2', 'DDR2', 'NOTCH3', 'CREBBP', 'MAPK1', 'MYCN', 'RAC1',
       'SMARCB1', 'TSC2', 'CDK6', 'FGFR2', 'PTCH1', 'MYD88', 'CBL', 'GNAQ',
       'PPP2R1A', 'TOP1', 'NTRK1', 'NFE2L2', 'NOTCH2', 'MYC', 'CHEK2', 'MAGOH',
       'PDGFRA', 'BRAF', 'ATM', 'fle_name_h5', 'TCGA ID'],
      dtype='object')

In [24]:
ref_df.columns

Index(['TCGA ID', 'MKI67', 'AURKA', 'BIRC5', 'CCNB1', 'MYBL2', 'MMP11', 'CTSV',
       'ESR1', 'PGR', 'BCL2', 'SCUBE2', 'GRB7', 'ERBB2', 'GSTM1', 'CD68',
       'BAG1', 'ACTB', 'GAPDH', 'RPLP0', 'GUSB', 'TFRC', 'filename', 'path',
       'slide_id', 'bcr_patient_barcode', 'OS', 'OS.time', 'PFI', 'PFI.time',
       'aperio.AppMag', 'tumor', 'TCGA patient'],
      dtype='object')