# Prostate Cancer Project
# Metastasis Prediction Using mRNA Expression Data 
# Reading Step

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import pickle
import os

## Define Helper Functions

In [2]:
def read_object(obj_path):
    """Load and return the object pickled in the file at ``obj_path``.

    The file is opened in binary mode and is closed again before the
    function returns.

    NOTE: unpickling can execute arbitrary code, so only call this on
    files that come from a trusted source.
    """
    with open(obj_path, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

def save_object(obj, dir_path, file_name, file_extension='pkl', create_dir=True):
    """Pickle ``obj`` to ``<dir_path>/<file_name>.<file_extension>``.

    Parameters
    ----------
    obj : any picklable object
        The object to serialise.
    dir_path : str
        Directory that will contain the file. An empty string means the
        current working directory.
    file_name : str
        File name without extension.
    file_extension : str, default 'pkl'
        Extension appended after a dot.
    create_dir : bool, default True
        When true, create ``dir_path`` (including parents) if it is missing.

    Returns
    -------
    str
        The normalised path of the file that was written.

    An existing file at the target path is overwritten.
    """
    # An empty dir_path refers to the current directory: there is nothing to
    # create, and os.makedirs('') would raise FileNotFoundError.
    if create_dir and dir_path:
        # exist_ok=True replaces the isdir()-then-makedirs() pair, which could
        # fail if the directory appeared between the check and the creation.
        os.makedirs(dir_path, exist_ok=True)
    name = os.path.normpath(os.path.join(dir_path, file_name + '.' + file_extension))
    with open(name, 'wb') as output:  # overwrites any existing file
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
    return name

## Reading Data

### mRNA Data

In [None]:
# Load the expression table for GEO series GSE46691.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook runs elsewhere.
dir_path = 'D:/VSCodes (Saeed)/Prostate Cancer Project (Metastasis Status)(mRNA)/Data/NCBI_GEO/GSE46691/'
file_name = 'GSE46691_quantile_normalized.txt.gz'
# Tab-separated text; pandas infers gzip compression from the '.gz' suffix.
mRNA_data = pd.read_csv(os.path.join(dir_path, file_name), sep='\t')
# Last expression of the cell: rendered as a (truncated) table preview.
mRNA_data

Unnamed: 0,ID_REF,GBX.DISC.PCA107.CEL,GBX.DISC.PCA494.CEL,GBX.DISC.PCA62.CEL,GBX.DISC.PCA74.CEL,GBX.DISC.PCA338.CEL,GBX.DISC.PCA255.CEL,GBX.DISC.PCA128.CEL,GBX.DISC.PCA344.CEL,GBX.DISC.PCA87.CEL,...,GBX.DISC.PCA148.CEL,GBX.DISC.PCA139.CEL,GBX.DISC.PCA217.CEL,GBX.DISC.PCA48.CEL,GBX.DISC.PCA431.CEL,GBX.DISC.PCA367.CEL,GBX.DISC.PCA91.CEL,GBX.DISC.PCA455.CEL,GBX.DISC.PCA224.CEL,GBX.DISC.PCA46.CEL
0,2315101,8.266755,6.399058,6.016815,6.459906,6.058099,6.274884,7.391515,6.928312,6.586673,...,6.725046,7.508285,7.795175,7.299218,7.092828,6.861507,6.345222,7.631274,7.838648,7.201566
1,2315102,4.340586,4.975432,4.081496,4.426973,4.465987,3.764659,6.556342,4.827751,5.096363,...,5.412221,4.364517,5.301507,5.592826,5.136573,3.620280,5.090039,6.408558,5.476892,5.949387
2,2315103,8.433638,8.323522,7.993002,7.889308,8.224840,7.927173,7.892362,8.087230,8.059346,...,7.749851,7.803671,6.364305,7.855100,7.951641,7.667128,7.555915,7.504006,6.766796,7.610124
3,2315104,6.148810,5.483675,6.665121,5.343324,5.932540,4.681516,6.519437,6.129862,6.707060,...,6.092333,5.572528,6.168060,5.520205,6.058601,6.001817,6.325297,6.968980,5.147197,5.301156
4,2315105,7.381292,6.465270,6.511992,5.891926,6.507693,6.459950,6.776188,6.365672,6.038755,...,7.363165,7.521392,8.411135,8.580859,8.623780,7.452340,6.881595,9.023890,8.783975,6.622390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1411394,4135187,7.498537,7.509668,7.492282,7.282646,7.644209,7.751041,7.811115,7.884420,7.718668,...,7.737179,7.795290,7.962716,7.660743,7.759105,7.648148,7.726384,8.113531,7.835282,7.706760
1411395,4135585,9.114021,9.226341,9.222537,9.141647,9.674143,9.169977,8.699547,9.474238,9.250033,...,8.439270,8.587656,7.980659,7.894323,8.459755,8.803058,9.019191,7.651471,8.238000,9.137886
1411396,4135691,7.406348,7.548499,7.577216,7.389647,7.804050,7.818695,8.355885,8.308864,8.148935,...,8.033587,7.939535,8.059828,7.888616,7.985959,7.950388,7.999752,7.875064,7.894748,8.144433
1411397,4135877,9.319602,9.535168,9.311850,9.457244,9.469158,9.639704,9.641120,9.701159,9.353553,...,9.144047,9.558720,9.070225,9.199262,9.238426,9.350412,9.516056,8.302594,9.202988,9.422989


### Clinical Data

In [None]:
# Load the per-sample annotation table (one column per sample) for GSE46691.
dir_path = 'D:/VSCodes (Saeed)/Prostate Cancer Project (Metastasis Status)(mRNA)/Data/NCBI_GEO/GSE46691/'
file_name = 'GSE46691_series_matrix.txt'
# Tab-separated text; anything from a '!' character onward is treated as a
# comment and not parsed.
# NOTE(review): GEO series-matrix metadata lines normally start with '!', yet
# the table shown below contains 'Sample_*' rows - presumably the file on disk
# had those prefixes removed beforehand; confirm.
clinical_data = pd.read_csv(os.path.join(dir_path, file_name), sep='\t', comment='!')
# Last expression of the cell: rendered as a (truncated) table preview.
clinical_data

Unnamed: 0,Sample_title,prostate_cancer_primary_1,prostate_cancer_primary_2,prostate_cancer_primary_3,prostate_cancer_primary_4,prostate_cancer_primary_5,prostate_cancer_primary_6,prostate_cancer_primary_7,prostate_cancer_primary_8,prostate_cancer_primary_9,...,prostate_cancer_primary_536,prostate_cancer_primary_537,prostate_cancer_primary_538,prostate_cancer_primary_539,prostate_cancer_primary_540,prostate_cancer_primary_541,prostate_cancer_primary_542,prostate_cancer_primary_543,prostate_cancer_primary_544,prostate_cancer_primary_545
0,Sample_geo_accession,GSM1134064,GSM1134065,GSM1134066,GSM1134067,GSM1134068,GSM1134069,GSM1134070,GSM1134071,GSM1134072,...,GSM1134599,GSM1134600,GSM1134601,GSM1134602,GSM1134603,GSM1134604,GSM1134605,GSM1134606,GSM1134607,GSM1134608
1,Sample_status,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,...,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013,Public on Jul 01 2013
2,Sample_submission_date,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013,...,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013,May 07 2013
3,Sample_last_update_date,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,...,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013,Jul 01 2013
4,Sample_type,RNA,RNA,RNA,RNA,RNA,RNA,RNA,RNA,RNA,...,RNA,RNA,RNA,RNA,RNA,RNA,RNA,RNA,RNA,RNA
5,Sample_channel_count,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
6,Sample_source_name_ch1,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,...,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample,primary prostate cancer FFPE tissue sample
7,Sample_organism_ch1,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,...,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens,Homo sapiens
8,Sample_characteristics_ch1,gleason score: 7,gleason score: 6,gleason score: 7,gleason score: 7,gleason score: 7,gleason score: 7,gleason score: 7,gleason score: 8,gleason score: 7,...,gleason score: 7,gleason score: 9,gleason score: 9,gleason score: 9,gleason score: 7,gleason score: 9,gleason score: 7,gleason score: 8,gleason score: 9,gleason score: 6
9,Sample_characteristics_ch1,metastatic event: 1,metastatic event: 0,metastatic event: 0,metastatic event: 1,metastatic event: 1,metastatic event: 0,metastatic event: 0,metastatic event: 1,metastatic event: 0,...,metastatic event: 0,metastatic event: 1,metastatic event: 1,metastatic event: 1,metastatic event: 1,metastatic event: 1,metastatic event: 1,metastatic event: 0,metastatic event: 1,metastatic event: 0


In [5]:
# Inspect the column labels: 'Sample_title' first, then one column per sample.
clinical_data.columns

Index(['Sample_title', 'prostate_cancer_primary_1',
       'prostate_cancer_primary_2', 'prostate_cancer_primary_3',
       'prostate_cancer_primary_4', 'prostate_cancer_primary_5',
       'prostate_cancer_primary_6', 'prostate_cancer_primary_7',
       'prostate_cancer_primary_8', 'prostate_cancer_primary_9',
       ...
       'prostate_cancer_primary_536', 'prostate_cancer_primary_537',
       'prostate_cancer_primary_538', 'prostate_cancer_primary_539',
       'prostate_cancer_primary_540', 'prostate_cancer_primary_541',
       'prostate_cancer_primary_542', 'prostate_cancer_primary_543',
       'prostate_cancer_primary_544', 'prostate_cancer_primary_545'],
      dtype='object', length=546)

In [6]:
# Row positions (default integer row labels) of the fields read from each
# sample column of clinical_data.
GSM_ROW = 0          # GEO accession, e.g. 'GSM1134064'
GLEASON_ROW = 8      # text of the form 'gleason score: 7'
METASTASIS_ROW = 9   # text of the form 'metastatic event: 1'
CEL_NAME_ROW = 19    # array name; with '.CEL' appended it names an mRNA_data column


def _characteristic_value(text):
    """Return the integer that follows the last ':' in a 'label: value' string.

    The whole value is parsed (not just the final character), so multi-digit
    values such as a Gleason score of 10 are read correctly.
    """
    return int(text.rsplit(':', 1)[-1])


samples = []
samples_num = clinical_data.shape[1]-1
clinical_data_columns = clinical_data.columns
# The first column ('Sample_title') holds the row labels; every remaining
# column describes one sample.
for sample_title in clinical_data_columns[1:]:
    sample_column = clinical_data[sample_title]

    gsm_id = sample_column.loc[GSM_ROW]
    second_id = sample_column.loc[CEL_NAME_ROW]+'.CEL'
    # Stored as 0-d int8 arrays, matching the format of previously saved records.
    metastasis_status = np.array(
        _characteristic_value(sample_column.loc[METASTASIS_ROW]), dtype=np.int8)
    gleason_score = np.array(
        _characteristic_value(sample_column.loc[GLEASON_ROW]), dtype=np.int8)
    # Expression values of this sample, in the row order of mRNA_data.
    mRNA_values = np.array(mRNA_data[second_id])

    # Note: the key 'first_id' holds the GSM accession, not the column title.
    sample = {
        'first_id': gsm_id,
        'second_id': second_id,
        'metastasis status': metastasis_status,
        'gleason score': gleason_score,
        'mRNA values': mRNA_values,
    }
    samples.append(sample)



In [7]:
# Keep the 'ID_REF' column as a NumPy array; its order matches the rows of
# mRNA_data, and therefore each sample's 'mRNA values' array.
mRNA_id = np.array(mRNA_data['ID_REF'])

In [8]:
# Display the ID_REF array as a quick check of its contents and length.
mRNA_id

array([2315101, 2315102, 2315103, ..., 4135691, 4135877, 4136112],
      shape=(1411399,))

In [9]:
# Number of per-sample records collected in `samples`.
len(samples)

545

In [10]:
# Show the first record to check its keys and value types.
samples[0]

{'first_id': 'GSM1134064',
 'second_id': 'GBX.DISC.PCA1.CEL',
 'metastasis status': array(1, dtype=int8),
 'gleason score': array(7, dtype=int8),
 'mRNA values': array([6.44406927, 4.75344158, 7.94219843, ..., 8.10055912, 9.48165995,
        1.70632135], shape=(1411399,))}

## Saving Results

In [None]:
# Persist the list of per-sample records as '<dir_path>/Samples.pkl'.
dir_path = 'D:/VSCodes (Saeed)/Prostate Cancer Project (Metastasis Status)(mRNA)/Reading_Step/'
file_name = 'Samples'
# The call returns the written path, which is shown as the cell output.
save_object(samples, dir_path, file_name)

'D:\\VSCodes_Saeed\\Prostate Cancer Project (Metastasis Status)(mRNA)\\Reading_Step\\Samples.pkl'

In [None]:
# Persist the ID_REF array as '<dir_path>/mRNA ID.pkl'.
dir_path = 'D:/VSCodes (Saeed)/Prostate Cancer Project (Metastasis Status)(mRNA)/Reading_Step/'
file_name = 'mRNA ID'
# The call returns the written path, which is shown as the cell output.
save_object(mRNA_id, dir_path, file_name)

'D:\\VSCodes_Saeed\\Prostate Cancer Project (Metastasis Status)(mRNA)\\Reading_Step\\mRNA ID.pkl'