In [1]:
# Notebook bootstrap: make the parent directory importable, provide a default
# DJANGO_SETTINGS_MODULE, and silence two noisy warning categories.
import os
import sys
import warnings

current_path = os.path.abspath('.')
parent_path = os.path.dirname(current_path)

# Guard against piling up duplicate entries when this cell is re-run in the
# same kernel.
if parent_path not in sys.path:
    sys.path.append(parent_path)
# setdefault keeps any DJANGO_SETTINGS_MODULE that is already set in the environment.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'child.settings')

# SettingWithCopyWarning is exposed from pandas.errors in recent pandas
# releases; older releases only had it in pandas.core.common. Try the current
# location first and fall back so the cell runs on either.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # Neither location provides the class, so there is nothing to filter.
        SettingWithCopyWarning = None

warnings.filterwarnings("ignore", category=FutureWarning)
if SettingWithCopyWarning is not None:
    warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Both CSV files are read from the working directory; according to the
# original notes they have to be downloaded beforehand with an R script.
# The first column of each file is used as the row index.
brca_clinical_raw = pd.read_csv(
    'brca_clinical.csv',
    index_col=0,
)
brca_mrna_raw = pd.read_csv(
    'brca_mrna.csv',
    index_col=0,
)

In [4]:
# Number of clinical rows whose patient barcode repeats an earlier row.
brca_clinical_raw['bcr_patient_barcode'].duplicated().sum()

0

In [5]:
# Number of mRNA rows whose (full-length) barcode repeats an earlier row.
brca_mrna_raw['bcr_patient_barcode'].duplicated().sum()

0

In [6]:
# (rows, columns) of the clinical table followed by the mRNA table.
tuple(frame.shape for frame in (brca_clinical_raw, brca_mrna_raw))

((1098, 17), (379, 17816))

In [7]:
# Derive a new frame (the raw table is left untouched) in which each barcode is
# cut down to its first 12 characters so it matches the ids used in the
# clinical data.
brca_mrna = brca_mrna_raw.assign(
    bcr_patient_barcode=lambda frame: frame['bcr_patient_barcode'].str.slice(stop=12)
)

In [8]:
# Gene columns carried over from the mRNA table into the merged dataset.
selected_genes = [
    'MCM6',
    'MMP9',
    'RAB6B',
    'ESM1',
    'FLT1',
    'BRCA1',
    'BRCA2',
]

In [9]:
# Inner-join the clinical rows with the barcode plus selected gene columns of
# the mRNA table; only barcodes present in both tables survive.
brca = brca_clinical_raw.merge(
    right=brca_mrna.loc[:, ['bcr_patient_barcode', *selected_genes]],
    how='inner',
    on='bcr_patient_barcode',
)

In [10]:
# Collapse to one row per patient; the barcode becomes the index.
# NOTE(review): the 'last' aggregation takes the last non-null value of each
# column within a group, so a patient with several rows can end up with values
# drawn from different rows — confirm this is intended.
brca = brca.groupby(by='bcr_patient_barcode').agg('last')

In [11]:
brca.shape # (rows, columns) of the merged, one-row-per-patient table

(358, 23)

In [12]:
# Per-column count of missing values, ordered from most to least missing and
# then scaled by the number of rows to give a fraction between 0 and 1.
na_summary = (
    brca.isna()
    .sum()
    .sort_values(ascending=False)
    .div(len(brca))
)

In [13]:
na_summary # fraction (0-1, not percent) of missing values per column, largest first

patient.clinical_cqcf.country                                       0.603352
patient.drugs.drug.therapy_ongoing                                  0.368715
patient.drugs.drug.therapy_types.therapy_type                       0.363128
patient.ethnicity                                                   0.310056
patient.number_of_lymphnodes_positive_by_he                         0.256983
patient.race                                                        0.248603
patient.biospecimen_cqcf.tumor_samples.tumor_sample.tumor_weight    0.131285
patient.menopause_status                                            0.030726
patient.breast_carcinoma_estrogen_receptor_status                   0.008380
patient.breast_carcinoma_progesterone_receptor_status               0.008380
RAB6B                                                               0.002793
MCM6                                                                0.002793
BRCA2                                                               0.002793

In [14]:
# Keep only the columns whose missing fraction is strictly below 0.30.
selected_cols = na_summary.loc[na_summary.lt(0.30)].index

In [15]:
# Restrict to the retained columns, then discard every row that still has at
# least one missing value.
brca = brca.loc[:, selected_cols].dropna(axis=0, how='any')

In [16]:
# Keep only the text after the last '.' in each column name
# (names without a dot are left unchanged).
brca.columns = brca.columns.map(lambda name: name.rpartition('.')[2])

In [17]:
# Number of distinct values in each column, smallest first.
brca.nunique(axis=0).sort_values(ascending=True)

gender                                             1
vital_status                                       2
breast_carcinoma_estrogen_receptor_status          2
breast_carcinoma_progesterone_receptor_status      3
race                                               4
menopause_status                                   4
histological_type                                  5
pathologic_stage                                   9
number_of_lymphnodes_positive_by_he               18
age_at_initial_pathologic_diagnosis               53
tumor_weight                                     117
times                                            190
MCM6                                             211
FLT1                                             213
BRCA1                                            214
RAB6B                                            214
MMP9                                             214
ESM1                                             214
BRCA2                                         

In [18]:
# Drop the rows whose progesterone receptor status is 'indeterminate', then
# remove three columns that are not used in the final dataset.
brca = (
    brca[brca['breast_carcinoma_progesterone_receptor_status'].ne('indeterminate')]
    .drop(['gender', 'race', 'histological_type'], axis='columns')
)

In [19]:
# Frequency of each raw menopause label.
brca['menopause_status'].value_counts()

post (prior bilateral ovariectomy or >12 mo since lmp with no prior hysterectomy)               134
pre (<6 months since lmp and no prior bilateral ovariectomy and not on estrogen replacement)     67
peri (6-12 months since last menstrual period)                                                    9
indeterminate (neither pre or postmenopausal)                                                     3
Name: menopause_status, dtype: int64

In [20]:
# Long menopause labels -> short group names. Labels that are not listed here
# (e.g. the 'indeterminate' one) have no entry, so a lookup yields no value.
menopause_mapper = {
    **dict.fromkeys(
        [
            'post (prior bilateral ovariectomy or >12 mo since lmp with no prior hysterectomy)',
        ],
        'post',
    ),
    **dict.fromkeys(
        [
            'pre (<6 months since lmp and no prior bilateral ovariectomy and not on estrogen replacement)',
            'peri (6-12 months since last menstrual period)',
        ],
        'pre/peri',
    ),
}

# Detailed pathologic stage labels -> coarse stage groups.
stage_mapper = {
    **dict.fromkeys(['stage i', 'stage ia', 'stage ib'], 'I'),
    **dict.fromkeys(['stage iia', 'stage iib'], 'II'),
    # NOTE(review): 'stage iv' is grouped under 'III' — presumably a deliberate
    # pooling of the advanced stages; confirm.
    **dict.fromkeys(['stage iiia', 'stage iiib', 'stage iiic', 'stage iv'], 'III'),
}

In [21]:
# Replace the long stage / menopause labels with their short group names.
# Series.map with a dict is the vectorised lookup; any label that has no entry
# in the mapper becomes a missing value (NaN), which dropna() treats the same
# way as the None produced by a dict.get lookup.
brca['pathologic_stage'] = brca['pathologic_stage'].map(stage_mapper)
brca['menopause_status'] = brca['menopause_status'].map(menopause_mapper)

In [22]:
# Keep only the rows that have a (non-missing) menopause status.
brca = brca[brca['menopause_status'].notna()]

In [23]:
# Give the columns short names. The result is re-assigned instead of using
# inplace=True, so the cell produces a new frame rather than mutating the
# existing one.
brca = brca.rename(columns={
    'number_of_lymphnodes_positive_by_he': 'pos_lymphnodes',
    'breast_carcinoma_estrogen_receptor_status': 'estr_rec',
    'breast_carcinoma_progesterone_receptor_status': 'prog_rec',
    'age_at_initial_pathologic_diagnosis': 'age',
    'vital_status': 'status',
    'times': 'time',
    'menopause_status': 'menopause',
    'pathologic_stage': 'stage'
})

In [24]:
# Final column order: time and status first, then the clinical columns,
# then the selected gene columns.
brca = brca.loc[:, [
    'time',
    'status',
    'age',
    'pos_lymphnodes',
    'tumor_weight',
    'stage',
    'menopause',
    'estr_rec',
    'prog_rec',
    *selected_genes,
]]

In [25]:
brca # display the final BRCA dataset (rows are indexed by patient barcode)

Unnamed: 0_level_0,time,status,age,pos_lymphnodes,tumor_weight,stage,menopause,estr_rec,prog_rec,MCM6,MMP9,RAB6B,ESM1,FLT1,BRCA1,BRCA2
bcr_patient_barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
TCGA-A1-A0SE,1321,0,56,0.0,500.0,I,pre/peri,positive,positive,-0.975556,1.557818,-1.289684,-0.844625,0.83825,-1.387917,-1.34050
TCGA-A1-A0SH,1437,0,39,0.0,500.0,II,pre/peri,negative,positive,-1.463444,3.456000,-0.114474,-0.030750,0.18000,-1.924833,-1.54100
TCGA-A1-A0SO,852,0,67,1.0,500.0,II,post,negative,negative,0.846333,3.286000,0.052684,-2.638750,-0.32875,-0.136417,0.76800
TCGA-A2-A04N,3153,0,66,0.0,153.0,I,post,positive,positive,-1.699778,2.802545,-1.264211,0.503750,0.62900,-1.396583,-1.05250
TCGA-A2-A04P,548,1,36,18.0,276.0,III,pre/peri,negative,negative,-0.258222,3.512545,-0.281105,-0.378125,1.77150,-3.127000,1.54975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-BH-A0BR,1633,0,59,0.0,290.0,I,post,positive,positive,-1.210222,4.281273,-1.193316,0.050500,1.85000,-2.114917,-1.25525
TCGA-BH-A0BS,1641,0,55,0.0,260.0,III,post,positive,positive,-2.233333,3.336273,-2.137684,-1.362000,0.17000,-1.825833,-0.82350
TCGA-BH-A0BV,1519,0,78,2.0,470.0,II,post,positive,positive,-1.202000,4.159364,-1.488316,-0.425125,0.54300,-0.924917,-1.37025
TCGA-BH-A0BW,355,0,71,0.0,180.0,I,post,negative,negative,-0.161111,3.744818,0.629053,-0.840625,0.68575,-1.263083,-0.90925


In [26]:
# brca.to_csv('brca-v2.csv') # write to csv