# Breast Cancer Project

# Lymph Node Metastasis Prediction Using miRNA Expression Data

# Reading Step

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
import os

## Define Helper Functions

In [2]:
def read_object(obj_path):
    """Load and return the Python object pickled in the file at ``obj_path``.

    The file is opened in binary mode and is closed again before the
    function returns. Only call this on pickle files you trust: unpickling
    can execute arbitrary code.
    """
    with open(obj_path, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

def save_object(obj, dir_path, file_name, file_extension='pkl', create_dir=True):
    """Pickle ``obj`` to ``<dir_path>/<file_name>.<file_extension>``.

    Parameters
    ----------
    obj : object
        Any picklable object.
    dir_path : str
        Directory to write into. An empty string means the current
        working directory.
    file_name : str
        File name without extension.
    file_extension : str, optional
        Extension appended after a dot (default ``'pkl'``).
    create_dir : bool, optional
        When true (default), create ``dir_path`` and any missing parents
        before writing.

    Returns
    -------
    str
        The normalized path of the file that was written.

    Raises
    ------
    OSError
        If the directory cannot be created or the file cannot be opened
        for writing (e.g. ``create_dir=False`` and ``dir_path`` is missing).
    """
    # An empty dir_path refers to the current directory: there is nothing to
    # create, and os.makedirs('') would raise.
    if create_dir and dir_path:
        # exist_ok=True removes the check-then-create race of a separate
        # os.path.isdir() test.
        os.makedirs(dir_path, exist_ok=True)
    name = os.path.normpath(os.path.join(dir_path, file_name + '.' + file_extension))
    with open(name, 'wb') as output:  # overwrites any existing file
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
    # Return only after the file has been flushed and closed.
    return name

## Reading Data

### miRNA Data

In [3]:
# Load the miRNA expression table (tab-separated text file).
# In the displayed frame the first column ('attrib_name') holds the miRNA
# names and every remaining column is one TCGA patient ID.
dir_path = 'F:/internship/Breast Cancer Project (Lymph Node Metastasis)(miRNA)/Data/'
file_name = 'Human__TCGA_BRCA__BDGSC__miRNASeq__HS_miR__01_28_2016__BI__Gene__Firehose_RPKM_log2 (HiSeq_Gene level).cct'
miRNA_data = pd.read_table(
    dir_path + file_name,  # dir_path already ends with '/'
    sep='\t',
    low_memory=False,
)
miRNA_data

Unnamed: 0,attrib_name,TCGA.3C.AAAU,TCGA.3C.AALI,TCGA.3C.AALJ,TCGA.3C.AALK,TCGA.4H.AAAK,TCGA.5L.AAT0,TCGA.5L.AAT1,TCGA.5T.A9QA,TCGA.A1.A0SB,...,TCGA.UL.AAZ6,TCGA.UU.A93S,TCGA.V7.A7HQ,TCGA.W8.A86G,TCGA.WT.AB41,TCGA.WT.AB44,TCGA.XX.A899,TCGA.XX.A89A,TCGA.Z7.A8R5,TCGA.Z7.A8R6
0,hsa-let-7a-1,13.1299,12.9183,13.0122,13.1449,13.4118,13.3164,13.4454,13.7280,13.6016,...,11.3871,12.8293,13.0703,13.7166,13.0353,13.3759,14.0362,13.6797,12.9623,13.3498
1,hsa-let-7a-2,14.1180,13.9224,14.0101,14.1418,14.4136,14.3110,14.4486,14.7146,14.6088,...,12.3949,13.8227,14.0704,14.7092,14.0367,14.3667,15.0364,14.6849,13.9664,14.3500
2,hsa-let-7a-3,13.1479,12.9134,13.0287,13.1514,13.4206,13.3273,13.4469,13.7370,13.6132,...,11.4324,12.8242,13.0611,13.7177,13.0530,13.3700,14.0434,13.6916,12.9851,13.3786
3,hsa-let-7b,14.5952,14.5127,13.4197,14.6673,14.4386,14.5766,14.6112,15.0988,16.5058,...,11.3976,13.5017,13.6050,15.2094,13.9086,14.5141,14.3396,14.1983,14.3207,14.1173
4,hsa-let-7c,8.4191,9.6483,9.3147,11.5119,11.6944,11.1391,11.2850,9.2000,13.3923,...,7.3776,10.8293,10.6679,12.4698,11.2441,11.9267,12.3620,12.6844,11.9806,10.3791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
818,hsa-mir-95,2.3574,1.1753,1.4031,1.4538,0.8567,1.4006,1.2028,2.8734,1.6279,...,0.8669,1.0416,2.4242,1.1042,2.0449,1.1489,1.7304,0.9498,1.4097,1.9057
819,hsa-mir-96,6.9187,5.9366,6.6184,6.2011,4.3887,6.5431,4.8486,5.7052,5.6049,...,6.4282,4.7626,5.5974,6.0759,5.8115,6.3365,4.1400,5.4236,5.8132,5.5220
820,hsa-mir-98,5.7812,6.4437,6.8905,5.4122,5.1449,5.3830,5.9911,5.3635,6.0218,...,6.4690,5.5098,6.1216,5.1681,6.0045,5.8735,5.8666,5.7870,5.7553,6.3427
821,hsa-mir-99a,7.0356,7.8914,7.5882,10.0330,10.0795,9.3659,9.3093,6.9476,12.0124,...,5.4539,9.3879,9.2302,10.6873,9.7652,10.1058,10.9073,10.3566,10.1318,8.8803


### Clinical Data

In [4]:
# Load the clinical annotation table (tab-separated text file).
# In the displayed frame each row is one clinical attribute (named in the
# first column, 'attrib_name') and each remaining column is one TCGA patient ID.
dir_path = 'F:/internship/Breast Cancer Project (Lymph Node Metastasis)(miRNA)/Data/'
file_name = 'Human__TCGA_BRCA__MS__Clinical__Clinical__01_28_2016__BI__Clinical__Firehose.tsi'
clinical_data = pd.read_table(
    dir_path + file_name,  # dir_path already ends with '/'
    sep='\t',
    low_memory=False,
)
clinical_data

Unnamed: 0,attrib_name,TCGA.5L.AAT0,TCGA.5L.AAT1,TCGA.A1.A0SP,TCGA.A2.A04V,TCGA.A2.A04Y,TCGA.A2.A0CQ,TCGA.A2.A1G4,TCGA.A2.A25A,TCGA.A7.A0CD,...,TCGA.S3.AA11,TCGA.S3.AA14,TCGA.S3.AA15,TCGA.UL.AAZ6,TCGA.UU.A93S,TCGA.V7.A7HQ,TCGA.WT.AB44,TCGA.XX.A899,TCGA.XX.A89A,TCGA.Z7.A8R6
0,years_to_birth,42,63,40,39,53,62,71,44,66,...,67,47,51,73,63,75,,46,68,46
1,Tumor_purity,0.6501,0.5553,0.6913,0.8341,0.7318,0.7701,0.8879,0.595,0.8525,...,0.7638,0.7492,0.5387,0.8031,0.8339,0.7774,0.7493,0.6069,0.6067,0.8143
2,pathologic_stage,stageii,stageiv,stageii,stageii,stageii,stagei,stageiii,stageii,stagei,...,stageii,stagei,stageii,stageii,stageiv,stageiii,stagei,stageiii,stageii,stagei
3,pathology_T_stage,t2,t2,t2,t2,t2,t1,t3,t2,t1,...,t2,t1,t2,t2,t4,t1,t1,t1,t3,t1
4,pathology_N_stage,n0,n0,n0,n0,n1,n0,n1,n0,n0,...,n0,n0,n1,n0,n3,n2,n0,n2,n0,n0
5,pathology_M_stage,m0,m1,m0,m0,m0,m0,m0,m0,m0,...,m0,m0,m0,,m1,m0,,,,m0
6,histological_type,infiltratinglobularcarcinoma,infiltratinglobularcarcinoma,infiltratingductalcarcinoma,infiltratingductalcarcinoma,infiltratingductalcarcinoma,infiltratingductalcarcinoma,infiltratingductalcarcinoma,infiltratinglobularcarcinoma,infiltratingductalcarcinoma,...,infiltratingductalcarcinoma,infiltratingductalcarcinoma,infiltratingductalcarcinoma,infiltratingductalcarcinoma,infiltratingductalcarcinoma,infiltratingductalcarcinoma,infiltratinglobularcarcinoma,infiltratinglobularcarcinoma,infiltratinglobularcarcinoma,infiltratinglobularcarcinoma
7,number_of_lymph_nodes,0,0,0,0,1,0,2,0,0,...,0,0,2,,,5,0,5,0,0
8,PAM50,,,Basal,LumA,LumB,LumA,LumB,,LumA,...,,,,,,,,,,
9,ER.Status,,,,,,,,,Positive,...,,,,,,,,,,


In [5]:
# Column labels of the clinical table: 'attrib_name' followed by the patient IDs.
clinical_data.columns

Index(['attrib_name', 'TCGA.5L.AAT0', 'TCGA.5L.AAT1', 'TCGA.A1.A0SP',
       'TCGA.A2.A04V', 'TCGA.A2.A04Y', 'TCGA.A2.A0CQ', 'TCGA.A2.A1G4',
       'TCGA.A2.A25A', 'TCGA.A7.A0CD',
       ...
       'TCGA.S3.AA11', 'TCGA.S3.AA14', 'TCGA.S3.AA15', 'TCGA.UL.AAZ6',
       'TCGA.UU.A93S', 'TCGA.V7.A7HQ', 'TCGA.WT.AB44', 'TCGA.XX.A899',
       'TCGA.XX.A89A', 'TCGA.Z7.A8R6'],
      dtype='object', length=1098)

In [6]:
# Build one record per patient that has both an miRNA expression profile and
# an 'n0' / 'n1' pathology N-stage label in the clinical table.
samples = []
id_sample = miRNA_data.columns[1:].tolist()  # patient IDs; column 0 holds the miRNA names
samples_num = len(id_sample)

# Find the N-stage row by matching the attribute-name column (column 0).
n_stage_mask = (clinical_data.iloc[:, 0] == 'pathology_N_stage').to_numpy()
if not n_stage_mask.any():
    raise KeyError("clinical_data has no 'pathology_N_stage' row")
# Keep the row position and the row label apart: .iloc needs the position,
# and the two only coincide while clinical_data has a default RangeIndex.
n_stage_pos = int(np.flatnonzero(n_stage_mask)[0])
n_stage_idx = clinical_data.index[n_stage_pos]
n_stage_row = clinical_data.iloc[n_stage_pos, 1:].str.lower()  # indexed by patient ID

miRNA_id = np.array(miRNA_data.iloc[:, 0])  # miRNA names, in row order of miRNA_data

for sample_id in id_sample:
    # Skip patients that have no column in the clinical table.
    if sample_id not in n_stage_row.index:
        continue

    label = n_stage_row.loc[sample_id]
    # Missing labels (NaN) and every stage other than 'n0' / 'n1' are dropped.
    # NOTE(review): this also excludes 'n2' / 'n3' patients - confirm that is intended.
    if label not in ('n0', 'n1'):
        continue

    samples.append({
        'sample_id': sample_id,
        # 0-d int8 array: 0 for 'n0', 1 for 'n1'.
        'lymph_node_status': np.array(0 if label == 'n0' else 1, dtype=np.int8),
        # Expression values for this patient, in the same order as miRNA_id.
        'miRNA_values': np.array(miRNA_data[sample_id]),
    })

In [7]:
# Display the array of miRNA names taken from the first column of miRNA_data.
miRNA_id

array(['hsa-let-7a-1', 'hsa-let-7a-2', 'hsa-let-7a-3', 'hsa-let-7b',
       'hsa-let-7c', 'hsa-let-7d', 'hsa-let-7e', 'hsa-let-7f-1',
       'hsa-let-7f-2', 'hsa-let-7g', 'hsa-let-7i', 'hsa-mir-1-1',
       'hsa-mir-1-2', 'hsa-mir-100', 'hsa-mir-101-1', 'hsa-mir-101-2',
       'hsa-mir-103-1', 'hsa-mir-103-2', 'hsa-mir-105-1', 'hsa-mir-105-2',
       'hsa-mir-106a', 'hsa-mir-106b', 'hsa-mir-107', 'hsa-mir-10a',
       'hsa-mir-10b', 'hsa-mir-1178', 'hsa-mir-1179', 'hsa-mir-1180',
       'hsa-mir-1181', 'hsa-mir-1182', 'hsa-mir-1184-1', 'hsa-mir-1185-1',
       'hsa-mir-1185-2', 'hsa-mir-1193', 'hsa-mir-1197', 'hsa-mir-122',
       'hsa-mir-1224', 'hsa-mir-1225', 'hsa-mir-1226', 'hsa-mir-1227',
       'hsa-mir-1228', 'hsa-mir-1229', 'hsa-mir-1231', 'hsa-mir-1234',
       'hsa-mir-1236', 'hsa-mir-1237', 'hsa-mir-1238', 'hsa-mir-124-1',
       'hsa-mir-124-2', 'hsa-mir-124-3', 'hsa-mir-1243', 'hsa-mir-1244-1',
       'hsa-mir-1244-2', 'hsa-mir-1245', 'hsa-mir-1246', 'hsa-mir-1247',
      

In [8]:
# Number of sample records that were kept.
len(samples)

615

In [9]:
# Inspect the first record: its sample ID, lymph-node label and miRNA values.
samples[0]

{'sample_id': 'TCGA.3C.AALI',
 'lymph_node_status': array(1, dtype=int8),
 'miRNA_values': array([12.9183, 13.9224, 12.9134, 14.5127,  9.6483,  9.0065,  9.1343,
         4.4543, 12.6791,  8.4592,  8.3933,  0.    ,  1.2725, 10.4019,
        11.7145,  4.727 , 14.2665,  3.1224,  4.889 ,  5.0207,  4.9924,
         9.0846,  5.8527, 16.5135, 14.851 ,  0.    ,  0.    ,  3.9369,
         0.2108,  0.    ,  0.    ,  0.2108,  0.    ,  0.    ,  0.    ,
         0.    ,  1.0712,  0.    ,  0.8372,  0.    ,  0.3946,  0.3946,
         0.    ,  0.    ,  0.    ,  0.2108,  0.    ,  0.    ,  0.    ,
         0.    ,  0.    ,  0.    ,  0.    ,  1.2725,  0.    ,  2.1055,
         0.3946,  0.5577,  0.    ,  0.    ,  0.    ,  0.    ,  0.2108,
         0.2108,  0.    ,  0.    ,  0.5577,  8.5168,  8.4154,  3.0961,
        11.24  ,  0.    ,  0.    ,  0.    ,  0.    ,  0.    ,  5.8048,
         0.    ,  0.    ,  8.9031,  1.3635,  1.4491,  1.1753,  0.    ,
         0.    ,  0.    ,  0.3946,  0.7042,  0.    ,  6.30

## Saving Results

In [10]:
# Pickle the list of sample records; the cell output is the path that was written.
dir_path = 'F:/internship/Breast Cancer Project (Lymph Node Metastasis)(miRNA)/reading_breast_cancer/'
file_name = 'Samples'
save_object(samples, dir_path, file_name)

'F:\\internship\\Breast Cancer Project (Lymph Node Metastasis)(miRNA)\\reading_breast_cancer\\Samples.pkl'

In [None]:
# Pickle the array of miRNA names; the cell output is the path that was written.
# NOTE(review): the file is called 'mRNA ID' although it holds miRNA names.
# The name is left unchanged here; check for other code that loads it by this
# name before renaming.
dir_path = 'F:/internship/Breast Cancer Project (Lymph Node Metastasis)(miRNA)/reading_breast_cancer/'
file_name = 'mRNA ID'
save_object(miRNA_id, dir_path, file_name)

'F:\\internship\\Breast Cancer Project (Lymph Node Metastasis)(miRNA)\\reading_breast_cancer\\mRNA ID.pkl'