# (non-ComBat) cohort bias corrections

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import decomposition
from sklearn.preprocessing import imputation

from tqdm import tqdm
import gc
from numba import jit
from matplotlib import pyplot as plt

from joblib import Parallel, delayed
import multiprocessing

from time import time
import seaborn as sns

from scipy.stats import ks_2samp as ks2
from scipy.stats import mannwhitneyu as mwu
from scipy.stats import wasserstein_distance as w1_dist
from scipy.stats import energy_distance as w2_dist

In [2]:
def _clean(x, default='float'):   
    non_default = 'int' if default=='float' else 'float'
    try:
        x.replace([np.inf, -np.inf], np.nan, inplace=True)
        x.dropna(how='all', axis=1, inplace=True)
        if default=='float':
            x = x * 1.0
        else:
            x = x * 1
    except Exception as e:
        print(e)
        for col in x.columns:
            if 'object' in str(x[col].dtypes):
                try:
                    x[col] = x[col].astype(default)
                except:
                    try:
                        x[col] = x[col].astype(non_default)
                    except:
                        print(col)
                        x[col] = x[col].astype('category')
    return x

def get_transposed(df, NameRow='GenX', prefix='GenX'):
    transposed  = df.T
    new_index = transposed.loc[[NameRow]].values.tolist()[0]
    transposed.columns = new_index
    if prefix is not None:
        transposed.columns = [prefix+'_'+_col for _col in transposed.columns.values.tolist()]
    return transposed.drop(NameRow, axis=0, inplace=False)
       
                     
def _outliers_modified_z_score(ys, threshold = 3.5):
    median_y = np.median(ys)
    median_absolute_deviation_y = np.median([np.abs(y - median_y) for y in ys])
    modified_z_scores = [0.6745 * (y - median_y) / median_absolute_deviation_y
                         for y in ys]
    return modified_z_scores, np.where(np.abs(modified_z_scores) > threshold)
    

In [3]:
def timeit(method):
    def timed(*args, **kw):
        ts = time()
        result = method(*args, **kw)
        te = time()

        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int((te - ts) * 1000)
        else:
            print('%r  %2.2f ms' % \
                  (method.__name__, (te - ts) * 1000))
        return result

    return timed

# L/S parallelised
@timeit
def _preprocess_par(df, cohorts = [], 
                scaler = "standard", 
                bias_removal = False, 
                col_range = None, 
                min_cohort_size=10, 
                qrange=(0.25,0.75),
                debug=False,
                n_jobs=10,
                imputer=None):
        
        def _norm(cohort, partition=None):
            ch = df[partition]==cohort
            if sum(ch)<min_cohort_size:
                print("Skipping cohort {}, because of low sample count: {}".format(cohort, sum(ch)))
            else:
                if debug==False:
                    try:
                        if imputer is not None:                                
                            imp = imputation.Imputer(strategy=imputer, axis=0)
                            res = scaler.fit_transform(imp.fit_transform(df.loc[ch,gene_columns].values))
                        else:
                            res = scaler.fit_transform(df.loc[ch,gene_columns].values)
                        df.loc[ch,gene_columns] = pd.DataFrame(data=res, index=ch[ch].index, columns=gene_columns)
                        print("Corrected cohort {}, with {} samples".format(cohort, sum(ch))) 
                    except Exception as e:
                        print("ERROR", e, "cohort:"+cohort)
                        print("index:",ch)
                        print("target:", df.loc[ch,gene_columns].shape) 
                        print("replacement:", res.shape)
                else:
                    for _col in gene_columns: # for debugging
                        df_temp = df.loc[ch, _col].copy() 
                        try:
                            df.loc[ch, _col] = (df_temp-df_temp.mean())/df_temp.std()
                        except Exception as e:
                            print("ERROR", e, "gene:"+_col, "cohort:"+cohort)
            return df[ch]
        def _quantile(cohort, partition=None):
                ch = df[partition]==cohort
                tor = df.loc[ch, gene_columns]
                t = tor.T
                tqn = _qn(t.values)
                df[gene_columns][ch] = tqn.T # pd.DataFrame(data=tqn.T, index=tor.index, columns=gene_columns)
                return df[ch]
    
        if col_range is None:
            gene_columns = [_col for _col in df.columns if 'GenX' in _col]  
        else:                      
            cr = range(col_range[0], col_range[1])
            gene_columns = df.columns[cr]
        
        if len(cohorts)==0:
            cohorts = df.batch_number.unique().tolist()
        
        if scaler == "standard":
            scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
        elif scaler == "minmax":
            scaler = preprocessing.MinMaxScaler()
        elif scaler == "maxabs":
            scaler = preprocessing.MaxAbsScaler()
        elif scaler == "robust":
            scaler = preprocessing.RobustScaler(quantile_range=qrange, 
                                                    with_scaling=True, with_centering=True)
        elif scaler in ["normalizer", "normaliser"]:
            scaler = preprocessing.Normalizer()  
            
        if bias_removal == True:  
            print("- "*30, 'Removing cohort biases')    
            if scaler != 'quantile':           
                results = Parallel(n_jobs=n_jobs)(delayed(_norm)(cohort) for cohort in cohorts)
            else:
                results = Parallel(n_jobs=n_jobs)(delayed(_quantile)(cohort) for cohort in cohorts)
                
            print("- "*30, 'Concatenating results')    
            df = pd.concat(results)
            
        else:
            ch = df[partition].isin(cohorts)
            df.loc[ch,gene_columns] = scaler.fit_transform(df.loc[ch,gene_columns])
            
        #df = df[df.batch_number.isin(cohorts)]
        return df  
    

In [4]:
# load data

####
####
pheno_small = pd.read_csv("../_docs/Lung_Phenotype_Metadata.txt", sep="\t")
pheno_large = pd.read_csv("../_docs/Lung_Table_Phenotypes.txt", sep="\t")
pheno_large.set_index('submitter_id.samples', inplace=True)
pheno_large['sample_id'] = pheno_large.index
pheno_large['gender.demographic'] = pheno_large['gender.demographic'].apply(lambda x: 0 
                                                                            if x=='male' else 1 
                                                                                if pd.isna(x)==False 
                                                                                else np.nan)
pheno_large = pheno_large.merge(pd.DataFrame(pheno_large.groupby(by='batch_number').size(), 
                              columns=['batch_size']),
                  left_on='batch_number', right_on='batch_number')

pheno_large = pheno_large.merge(
    pd.DataFrame(pheno_large.groupby(by='batch_number')['gender.demographic']\
                     .mean().reset_index().rename(index=str, columns={'gender.demographic':'gender_mean'})),
                  how='left', left_on='batch_number', right_on='batch_number')

####
####
t = pheno_small.groupby(by='PatientID')['Sample Type'].count()
double_patients = t[t>1].index.tolist()
ref_normal_tumor = pheno_small.loc[pheno_small.PatientID.isin(double_patients)][['SampleID', 'Sample Type', 'PatientID']]\
                                            .sort_values(by='SampleID')
pat_cols_primary = list(set(pheno_small.loc[pheno_small['Sample Type'].isin(['Primary Tumor', 
                                                                             'Recurrent Tumor'])]['SampleID']))

####
####
root_dir = '/media/koekiemonster/DATA-FAST/genetic_expression/hackathon_2/Lung'
type_data = pd.read_csv(root_dir+'/HumanMethylation450_meta.csv', engine='c', sep=',',header=7)
type_data['RefGene'] = type_data.UCSC_RefGene_Group.str.split(';').apply(lambda x: 
                                                                         ",".join(sorted(set(x))) 
                                                                         if type(x)==list else np.nan)
rare_refgene = type_data.RefGene.value_counts().index[type_data.RefGene.value_counts()<100].tolist()
type_data['RefGene'][type_data.RefGene.isin(rare_refgene)] = 'uncommon' 
type_data = type_data[['IlmnID', 'Name', 'Infinium_Design_Type', 
                       'Color_Channel', 'Relation_to_UCSC_CpG_Island', 'CHR', 'RefGene']]

####
####
gc.collect()
sourceDir = "/media/koekiemonster/DATA-FAST/genetic_expression/hackathon_2" # "/media/bramvanes/Extra/DATA/RexR/2018" #  #"/media/bramvanes/Extra/DATA/RexR/2018" #
methylation = pd.read_table(sourceDir+"/Lung/Lung_Methylation.txt", sep="\t")


  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyboardInterrupt: 

In [None]:
methylation.set_index('probeID', inplace=True)
methylation= _clean(get_transposed(methylation.copy(), axis=1))