# Deep denoising auto-encoder and MLP based multi-output regression on TCGA multi-omics data
# Data Pre-processing

Note: You may skip this notebook if you already have the pre-processed data.

Importing libraries

In [0]:
import pandas as pd
import numpy as np

In [0]:
def intersection(list1, list2, list3):
    """Return the items that occur in all three input sequences.

    Duplicates are collapsed. The result is built from a set, so the order
    of the returned list is not guaranteed.
    """
    shared = set(list1)
    shared.intersection_update(list2, list3)
    return list(shared)

In [0]:
def extractMatchedIndices(list1, list2):
    """Return the positions in ``list1`` whose entry contains an item of ``list2``.

    Parameters
    ----------
    list1 : sequence of str
        Entries to search in (e.g. sample names).
    list2 : iterable of str
        Items to look for; a match is a containment test (``item in entry``).

    Returns
    -------
    list of int
        One position per (item, entry) match, grouped by item in ``list2``
        order and, within an item, in ``list1`` order.

    Notes
    -----
    Positions come from ``enumerate`` rather than ``list1.index(entry)``:
    ``index`` always reports the first occurrence, so repeated entries in
    ``list1`` would all map to the same position.
    """
    return [
        pos
        for item in list2
        for pos, entry in enumerate(list1)
        if item in entry
    ]

In [0]:
def remrows(data):
    """Drop the rows of ``data`` that have too few non-missing values.

    A row is kept when it holds at least ``int(0.8 * number_of_columns)``
    non-NA entries. A new DataFrame is returned.
    """
    min_present = int(data.shape[1] * 0.8)
    kept = data.dropna(axis=0, thresh=min_present)
    # Disabled alternative: additionally drop rows that are zero everywhere.
    # kept = kept[(kept.T != 0).any()]
    return kept

In [0]:
def remcolumns(data):
    """Drop the columns of ``data`` that have too few non-missing values.

    A column is kept when it holds at least ``int(0.8 * number_of_rows)``
    non-NA entries, i.e. columns with more than roughly 20% missing values
    are removed.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the retained columns.
    """
    # With axis=1, ``thresh`` counts the non-NA values inside each column,
    # so it has to be derived from the number of rows (shape[0]); using the
    # number of columns makes the threshold far too small on tall matrices.
    min_present = int(0.8 * data.shape[0])
    data = data.dropna(thresh=min_present, axis=1)
    # Disabled alternative: additionally drop columns that are zero everywhere.
    #data = data.loc[:, (data != 0).any(axis=0)]
    return data

In [0]:
def tumor_normal_labels(list):
    """Turn sample names into 0/1 labels based on characters 13-14.

    The two-character code at positions 13 and 14 of each name is compared
    against '01' ... '09'; a match yields 1, anything else yields 0.

    Returns a NumPy array with one entry per input name.
    """
    positive_codes = ('01', '02', '03', '04', '05', '06', '07', '08', '09')
    flags = np.array([name[13:15] in positive_codes for name in list])
    # Multiplying by 1 turns the boolean mask into integers.
    return 1 * flags

In [0]:
def myNormalize(data):
    """Min-max scale every column of a 2-D array to the range [0, 1].

    Parameters
    ----------
    data : array-like, shape (rows, cols)
        Numeric matrix; each column is scaled independently.

    Returns
    -------
    numpy.ndarray
        New float64 array of the same shape with
        ``(x - column_min) / (column_max - column_min)`` applied per column.
        Columns whose values are all equal are mapped to 0 instead of
        producing NaN from a 0/0 division. The input is left untouched.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional, or has no rows.
    """
    # Work in float64 so integer inputs are not truncated and the column
    # extrema are not rounded (float32 extrema can push results outside [0, 1]).
    values = np.asarray(data, dtype=np.float64)
    if values.ndim != 2:
        raise ValueError("myNormalize expects a 2-D array")

    col_min = values.min(axis=0)
    col_span = values.max(axis=0) - col_min
    # A zero span means a constant column; divide by 1 so it becomes all zeros.
    safe_span = np.where(col_span == 0, 1.0, col_span)
    return (values - col_min) / safe_span

# Loading data

In [0]:
# Colab-only: make Google Drive available under /content/drive.
# The data paths used in this notebook start with '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
#ls "/content/drive/My Drive"

In [0]:
# Tab-separated LIHC input tables, all stored in the same Drive folder.
LIHC_DIR = '/content/drive/My Drive/TCGA Data/LIHC/'

dnaMeth = pd.read_table(
    LIHC_DIR + 'LIHC_Methylation450__SingleValue__TSS1500__Both.txt',
    sep='\t',
    index_col=0,
)
cna = pd.read_table(
    LIHC_DIR + 'LIHC__genome_wide_snp_6__GeneLevelCNA.txt',
    sep='\t',
    index_col=0,
)
# The second column is used as the row identifier (Entrez ID).
rnaSeq = pd.read_table(
    LIHC_DIR + 'LIHC_RNASeq__illuminahiseq_rnaseqv2__GeneExp.txt',
    sep='\t',
    index_col=1,
)

Dropping redundant columns

In [0]:
# Drop the leading non-data columns: one for methylation and RNA-seq, two for CNA.
# Note: re-running this cell drops further columns each time.
dnaMeth = dnaMeth.drop(columns=dnaMeth.columns[:1])
rnaSeq = rnaSeq.drop(columns=rnaSeq.columns[:1])
cna = cna.drop(columns=cna.columns[:2])

# Pre-processing

Extracting sample names using TCGA barcode

In [0]:
# Column labels of each table (the sample names).
dnaMethSamples = list(dnaMeth.columns)
rnaSamples = list(rnaSeq.columns)
cnaSamples = list(cna.columns)

# Sample ID = characters 8-15 of each column label.
methID = [name[8:16] for name in dnaMethSamples]
rnaID = [name[8:16] for name in rnaSamples]
cnaID = [name[8:16] for name in cnaSamples]

Removing duplicates

In [0]:
# Collapse repeated IDs; sets do not keep the original order.
methID, rnaID, cnaID = (set(ids) for ids in (methID, rnaID, cnaID))

Reconverting into lists

In [0]:
# Back to lists so the IDs can be indexed by position.
methID, rnaID, cnaID = (list(ids) for ids in (methID, rnaID, cnaID))

Finding out common samples

In [0]:
# IDs present in all three omics tables.
commonSamples = intersection(methID, rnaID, cnaID)

# For each table, the positions of the shared IDs.
# NOTE: these are positions within methID / rnaID / cnaID (the de-duplicated
# ID lists), not positions of columns in the DataFrames.
commonMeth, commonRNA, commonCNA = (
    extractMatchedIndices(ids, commonSamples)
    for ids in (methID, rnaID, cnaID)
)

Removing rows (genes) having more than 20% missing values across all samples (patients)

In [0]:
# Apply the row filter to every omics table.
dnaMeth, rnaSeq, cna = (remrows(table) for table in (dnaMeth, rnaSeq, cna))

Removing columns (samples) having more than 20% missing values across all rows (genes)

In [0]:
# Apply the column filter to every omics table.
dnaMeth, rnaSeq, cna = (remcolumns(table) for table in (dnaMeth, rnaSeq, cna))

Reducing each omics data to common samples only

In [0]:
# commonMeth / commonRNA / commonCNA are positions inside the de-duplicated,
# set-ordered ID lists, not DataFrame column positions, and the column filter
# may have removed columns since they were computed. Using them with iloc
# would select arbitrary columns that do not line up across the three tables.
# Instead, look up every sample ID (characters 8-15 of the column label) in
# the *current* columns and select the shared IDs in one common, sorted order.
def _first_column_by_sample(frame):
    """Map sample ID -> position of the first column carrying that ID."""
    positions = {}
    for pos, barcode in enumerate(frame.columns):
        positions.setdefault(barcode[8:16], pos)
    return positions

methCols = _first_column_by_sample(dnaMeth)
rnaCols = _first_column_by_sample(rnaSeq)
cnaCols = _first_column_by_sample(cna)

# Sorted so that column j refers to the same sample in all three tables and
# the order is reproducible between runs.
alignedSamples = sorted(set(methCols) & set(rnaCols) & set(cnaCols))

dnaMeth = dnaMeth.iloc[:, [methCols[s] for s in alignedSamples]]
rnaSeq = rnaSeq.iloc[:, [rnaCols[s] for s in alignedSamples]]
cna = cna.iloc[:, [cnaCols[s] for s in alignedSamples]]

Removing rnaSeq genes whose total expression across samples lies in the lowest quartile (bottom 25%)

In [0]:
# Total expression per gene (row) across all samples.
rnaSeq_rowsum = rnaSeq.sum(axis=1)
# Keep only the genes whose total is strictly above the 25th percentile.
# Note: re-running this cell removes another quarter of the remaining genes.
lowerQuartile = rnaSeq_rowsum.quantile(0.25)
ind = (rnaSeq_rowsum > lowerQuartile).to_frame()
rnaSeq = rnaSeq.loc[ind.to_numpy().ravel()]

Finding tumor and normal samples

In [0]:
# One 0/1 label per methylation column, derived from the column names.
labels = tumor_normal_labels(dnaMeth.columns.tolist())

Imputing remaining missing values

In [0]:
from sklearn.impute import SimpleImputer

In [0]:
# np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
imp = SimpleImputer(missing_values=np.nan, strategy='mean', copy=True)

# SimpleImputer always fills a missing value with the mean of its *column*.
# The tables are indexed by rows = genes and columns = samples, so they are
# transposed for the call: each gap is then filled with that gene's mean over
# the samples (what the old `Imputer(..., axis=1)` API computed) instead of a
# sample's mean over unrelated genes. Transposing back keeps the
# genes-by-samples layout of the results.
imputedDNAMeth = imp.fit_transform(dnaMeth.T).T
imputedRNASeq = imp.fit_transform(rnaSeq.T).T
imputedCNA = imp.fit_transform(cna.T).T

In [0]:
#imp = Imputer(missing_values='NaN', strategy='mean', axis=1, copy=True)
#imputedDNAMeth = imp.fit_transform(dnaMeth)
#imputedRNASeq = imp.fit_transform(rnaSeq)
#imputedCNA = imp.fit_transform(cna)

In [0]:
# Swap the axes of every imputed matrix.
# Note: re-running this cell flips them back.
imputedDNAMeth, imputedRNASeq, imputedCNA = (
    matrix.T for matrix in (imputedDNAMeth, imputedRNASeq, imputedCNA)
)

Normalizing datasets using min-max normalization

In [0]:
# Column-wise min-max scaling of each imputed matrix.
normalized_DNAMeth, normalized_RNASeq, normalized_CNA = (
    myNormalize(matrix) for matrix in (imputedDNAMeth, imputedRNASeq, imputedCNA)
)

# Exporting data

Saving pre-processed files

In [0]:
# Wrap the normalized arrays and the label vector in DataFrames for export.
preprocessed_DNAMeth, preprocessed_RNASeq, preprocessed_CNA = (
    pd.DataFrame(matrix)
    for matrix in (normalized_DNAMeth, normalized_RNASeq, normalized_CNA)
)
labels = pd.DataFrame(labels)

Exporting pre-processed data to csv files

In [0]:
# Write every pre-processed table to the shared output folder, without the row index.
OUTPUT_DIR = '/content/drive/My Drive/TCGA Data/Preprocessed_Data/'

for frame, filename in (
    (preprocessed_DNAMeth, 'LIHC_preprocessed_DNAMeth.csv'),
    (preprocessed_RNASeq, 'LIHC_preprocessed_RNASeq.csv'),
    (preprocessed_CNA, 'LIHC_preprocessed_CNA.csv'),
    (labels, 'LIHC_labels.csv'),
):
    frame.to_csv(OUTPUT_DIR + filename, index=False)