In [None]:
%load_ext autoreload

In [2]:
import pandas as pd
import numpy as np
import os
import feather
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error 
import random

In [70]:
import sys
if "/proj/yunligrp/users/minzhi/custom_lib" not in sys.path:
    sys.path.insert(0, "/proj/yunligrp/users/minzhi/custom_lib")

In [None]:
from function_process_data_eqtl import *
from function_asso import *
from function_mesa_cca import *
%autoreload 2

In [9]:
def cn_map(cn, map_df, common_col):
    """Inner-join the mapping table onto the copy-number table.

    Parameters
    ----------
    cn : DataFrame with the copy-number rows (right side of the join).
    map_df : DataFrame with the id mapping (left side; its columns come first).
    common_col : column name (or list of names) shared by both frames.

    Returns
    -------
    DataFrame holding only the rows whose key is present in both inputs.
    """
    return map_df.merge(cn, how = "inner", on = common_col)

In [10]:
def cn_map_list(cohort_list, cohort_dir_list, map_raw_df, common_col, save_dir):
    """Map each cohort's NWDID list to subject ids, save the result, and count the matches.

    Parameters
    ----------
    cohort_list : list of cohort names; used for output file names and the summary index.
    cohort_dir_list : list of paths, parallel to ``cohort_list``; each file is a
        header-less, tab-separated table whose first column holds the NWDID.
    map_raw_df : DataFrame with at least the columns ``NWDID``,
        ``unique_subject_key`` and ``subject_id``.
    common_col : kept for backward compatibility only. The join key is always
        ``"NWDID"`` because that is the only column both tables are guaranteed
        to share (the original code overwrote this argument as well).
    save_dir : directory receiving one ``<cohort>_subject_id_cram.tsv`` per cohort.

    Returns
    -------
    DataFrame indexed by cohort (sorted) with one column ``annotation`` holding
    the number of mapped rows for that cohort.
    """
    common_col = "NWDID"
    # The mapping table does not depend on the cohort, so build it once.
    # A non-inplace dropna avoids writing into a slice of the caller's frame.
    map_df = map_raw_df[["NWDID", "unique_subject_key", "subject_id"]].dropna(
        axis = 0, subset = ["unique_subject_key", "subject_id"], how = "any")
    cohort_cn_map_summary = pd.DataFrame(columns = ["annotation"], index = cohort_list)
    for cohort_dir_filename, cohort_i in zip(cohort_dir_list, cohort_list):
        cohort = pd.read_csv(cohort_dir_filename, sep = "\t", header = None, index_col = None)
        cohort = cohort.rename(columns = {0: "NWDID"})
        # Same join as cn_map(cohort, map_df, common_col): mapping columns first.
        tmp_cn_mapped = map_df.merge(cohort, on = common_col, how = "inner")
        tmp_cn_mapped_filename = "%s_subject_id_cram.tsv"%cohort_i
        tmp_cn_mapped_dir_filename = os.path.join(save_dir, tmp_cn_mapped_filename)
        tmp_cn_mapped.to_csv(tmp_cn_mapped_dir_filename, sep = "\t", header = True, index = False)
        cohort_cn_map_summary.loc[cohort_i, "annotation"] = tmp_cn_mapped.shape[0]
    return cohort_cn_map_summary.sort_index(axis = 0)

In [11]:
def split_df(df, cat_col, category_list, file_prefix, save_dir):
    """Write one tab-separated file per requested category of ``cat_col``.

    For every value in ``category_list`` the rows of ``df`` whose ``cat_col``
    equals that value are saved to ``<save_dir>/<file_prefix>_<category>.tsv``
    (header included, index omitted). Always returns 1.
    """
    for category in category_list:
        out_path = os.path.join(save_dir, "%s_%s.tsv"%(file_prefix, category))
        matching_rows = df[df[cat_col] == category]
        matching_rows.to_csv(out_path, sep = "\t", header = True, index = False)
    return 1

In [12]:
def compare_df_pair(df1, df2, common_col, df1_col, df2_col):
    """Compare two tables on a shared key and, for numeric columns, correlate them.

    Parameters
    ----------
    df1, df2 : DataFrames to compare.
    common_col : key column(s) present in both frames, used for an inner join.
    df1_col, df2_col : the value column of ``df1`` / ``df2`` to correlate.

    Returns
    -------
    ``(df1_only, df2_only, overlap, df1_rows, df2_rows)`` and, when both value
    columns have dtype float64 or int32, a sixth element with the Pearson
    correlation of the two columns over the joined rows.

    Notes
    -----
    The counts are row counts of the join, so duplicated keys inflate
    ``overlap``.
    NOTE(review): int64 columns (the usual integer dtype) do not trigger the
    correlation branch; widening the check would change the tuple length
    callers receive, so it is left as is -- confirm the intent.
    """
    df1_df2 = df1.merge(df2, on = common_col, how = "inner")
    overlap_num = df1_df2.shape[0]
    df1_num = df1.shape[0]
    df2_num = df2.shape[0]
    df1_nodf2_num = df1_num - overlap_num
    df2_nodf1_num = df2_num - overlap_num

    def _is_supported(dtype):
        return dtype == "float64" or dtype == "int32"

    if _is_supported(df1[df1_col].dtypes) and _is_supported(df2[df2_col].dtypes):
        # merge() only appends "_x"/"_y" when a non-key column name exists in
        # both frames. The original bound the suffixed names only when
        # df1_col == df2_col and raised NameError otherwise.
        df1_col_x = df1_col if df1_col in df1_df2.columns else "%s_x"%df1_col
        df2_col_y = df2_col if df2_col in df1_df2.columns else "%s_y"%df2_col
        cor = pearsonr(df1_df2[df1_col_x], df1_df2[df2_col_y])[0]
        return df1_nodf2_num, df2_nodf1_num, overlap_num, df1_num, df2_num, cor
    else:
        return df1_nodf2_num, df2_nodf1_num, overlap_num, df1_num, df2_num

In [13]:
def df_extraction_duplicates(df, col):
    """Return every row of ``df`` whose value in ``col`` occurs more than once.

    All members of a duplicated group are returned (``keep=False``), in their
    original order and with their original index labels.
    """
    is_repeated = df[col].duplicated(keep = False)
    return df.loc[is_repeated, :]

In [14]:
def concat_egfr(egfr_dir, cohort_list, header_selection):
    """Load the per-cohort eGFR tables and stack them into one frame.

    Parameters
    ----------
    egfr_dir : directory containing ``egfr_calculated_<cohort>.tsv`` files.
    cohort_list : cohort names; ``-`` in a name is replaced by ``_`` for the
        ``cohort`` column and the subject key.
    header_selection : list of columns to keep from each file; it must contain
        ``id`` (used to build the key). ``id``, ``age`` and ``EGFR`` are renamed
        to ``SUBJECT_ID``, ``age_at_EGFRCKDEPI`` and ``EGFRCKDEPI`` if present.

    Returns
    -------
    DataFrame with the selected (renamed) columns plus ``cohort`` and
    ``unique_subject_key`` (``<cohort>_<id>``), with a fresh 0..n-1 index.
    """
    egfr_list = []
    for cohort in cohort_list:
        egfr_dir_filename = os.path.join(egfr_dir, "egfr_calculated_%s.tsv"%cohort)
        egfr_raw = pd.read_csv(egfr_dir_filename, sep = "\t", header = 0, index_col = None)
        cohort_pheno_table = "_".join(cohort.split("-"))
        # Explicit copy: the new columns are written to our own frame rather
        # than to a slice of egfr_raw (which raised SettingWithCopyWarning).
        egfr = egfr_raw[header_selection].copy()
        egfr["cohort"] = cohort_pheno_table
        # One pass over the ids instead of a per-row .loc read and write.
        egfr["unique_subject_key"] = ["%s_%s"%(cohort_pheno_table, subject_id)
                                      for subject_id in egfr["id"]]
        egfr = egfr.rename(columns = {"id":"SUBJECT_ID", "age":"age_at_EGFRCKDEPI", "EGFR":"EGFRCKDEPI"})
        egfr_list.append(egfr)
        print("%s completed."%cohort)
    egfr_full = pd.concat(egfr_list, axis = 0)
    return egfr_full.reset_index(drop = True)

In [15]:
def check_invar(df, col):
    """Return True when column ``col`` of ``df`` holds exactly one distinct value.

    Distinctness is decided by a Python ``set`` over the column values, so an
    empty column yields False.
    """
    distinct_values = set(df[col].values.tolist())
    return len(distinct_values) == 1

def check_list_invar(df, col_list):
    """Apply ``check_invar`` to each column in ``col_list``.

    Returns a list of booleans in the same order as ``col_list``.
    """
    return [check_invar(df, col) for col in col_list]

In [16]:
def remove_dup_anno(annotation, invar_subsets, rm_header = None):
    """Resolve rows that share a ``unique_subject_key`` so each key appears at most once.

    For every duplicated key:
    * if any column in ``invar_subsets`` differs within the group, the whole
      group is removed (the records contradict each other);
    * otherwise one randomly chosen row is kept and the rest are removed.

    Parameters
    ----------
    annotation : DataFrame with a ``unique_subject_key`` column. It is not modified.
    invar_subsets : list of columns that must be constant within a duplicate group.
    rm_header : column whose values identify the rows to remove. When None, a
        temporary row counter ``aux_rm`` is used and dropped from the result.

    Returns
    -------
    DataFrame without the removed rows.

    Notes
    -----
    The choice uses the ``random`` module; call ``random.seed`` beforehand for
    a reproducible result. Rows are removed by matching ``rm_header`` values,
    so that column should identify rows uniquely.
    """
    if rm_header is None:
        # Work on a copy so the helper column never leaks into the caller's frame.
        annotation = annotation.copy()
        annotation["aux_rm"] = np.arange(annotation.shape[0])
        rm_header = "aux_rm"
    is_duplicated = annotation["unique_subject_key"].duplicated(keep = False)
    annotation_dup = annotation.loc[is_duplicated, :]
    remove_header_list = []
    for _, tmp_annotation in annotation_dup.groupby("unique_subject_key", sort = False):
        tmp_rm_list = tmp_annotation[rm_header].values.tolist()
        # Same test as check_list_invar: a column is invariant when the set of
        # its values has exactly one element.
        is_invariant = all(len(set(tmp_annotation[col].values.tolist())) == 1
                           for col in invar_subsets)
        if not is_invariant:
            remove_header_list.extend(tmp_rm_list)
        else:
            tmp_rm_num = len(tmp_rm_list)
            selection = random.randint(0, tmp_rm_num - 1)
            # Keep exactly one row. The original removed only tmp_rm_list[selection],
            # which left groups of three or more still duplicated. Keeping the
            # neighbour of the drawn position removes the same row as before for
            # pairs and remains a uniform choice for larger groups.
            keep_pos = (selection + 1) % tmp_rm_num
            remove_header_list.extend(tmp_rm_list[:keep_pos] + tmp_rm_list[keep_pos + 1:])
    annotation_unique = annotation[~annotation[rm_header].isin(remove_header_list)]
    if rm_header == "aux_rm":
        annotation_unique = annotation_unique.drop(columns = ["aux_rm"])
    return annotation_unique

In [18]:
def merge_replace_nan(df0_col, df1_col, df0, df1):
    """Fill the missing cells of ``df0`` with the matching cells of ``df1``.

    Both frames are indexed by their key column (``df0_col`` / ``df1_col``);
    missing values in ``df0`` are then taken from ``df1`` where the key and the
    column name match. The key is returned as an ordinary column again.
    """
    donor = df1.set_index(df1_col)
    return df0.set_index(df0_col).fillna(donor).reset_index()

In [64]:
def func(x):
    """Return the first non-missing entry of Series ``x``, or None if all are missing."""
    first_label = x.first_valid_index()
    return None if first_label is None else x[first_label]

In [65]:
def map_annotation(pheno, annotation, pheno_col, annotation_col, pivot_col, how_merge, pheno_prefix, anno_prefix, mapped_save_dir, diff_save_dir):
    """Join a phenotype table onto the annotation table and reconcile shared columns.

    Parameters
    ----------
    pheno, annotation : DataFrames to join (annotation is the left side).
    pheno_col, annotation_col : parallel lists of column names. Element 0 is the
        join key in each table; the remaining elements are pairs of columns that
        describe the same attribute in both tables.
    pivot_col : column identifying a sample in the joined table.
    how_merge : join type passed to ``DataFrame.merge`` (e.g. ``"inner"``).
    pheno_prefix, anno_prefix : used to build output file names.
    mapped_save_dir : directory for the joined table
        (``<anno_prefix>_<pheno_prefix>.tsv``).
    diff_save_dir : directory passed to ``map_deprecation_list`` for its
        per-column difference files.

    Returns
    -------
    The joined DataFrame. When paired columns are given, both raw copies are
    dropped, samples reported by ``map_deprecation_list`` are removed, and the
    reconciled columns it returns are merged back on ``pivot_col``.
    """
    # The original passed an undefined name (merge_how) here, so every call
    # raised NameError; the parameter is called how_merge.
    pheno_mapped = annotation.merge(pheno, left_on = annotation_col[0], right_on = pheno_col[0], how = how_merge)
    for pheno_col_i, annotation_col_i in zip(pheno_col[1:], annotation_col[1:]):
        if pheno_col_i != annotation_col_i:
            redundant_cols = [pheno_col_i, annotation_col_i]
        else:
            # Identical names were disambiguated by merge(): "_x" = annotation, "_y" = pheno.
            redundant_cols = ["%s_y"%pheno_col_i, "%s_x"%annotation_col_i]
        pheno_mapped = pheno_mapped.drop(columns = redundant_cols)
    if len(annotation_col) > 1:
        depre_sample_list, pivot_col_df = map_deprecation_list(pheno, annotation, pheno_col, annotation_col,
                                                               pivot_col, pheno_prefix, anno_prefix, diff_save_dir)
        if depre_sample_list != []:
            pheno_mapped = pheno_mapped[~pheno_mapped[pivot_col].isin(depre_sample_list)]
        pheno_mapped = pheno_mapped.merge(pivot_col_df, on = pivot_col, how = "inner")
    pheno_mapped_filename = "%s_%s.tsv"%(anno_prefix, pheno_prefix)
    pheno_mapped_dir_filename = os.path.join(mapped_save_dir, pheno_mapped_filename)
    pheno_mapped.to_csv(pheno_mapped_dir_filename, sep = "\t", header = True, index = False)
    return pheno_mapped

In [66]:
def map_deprecation_list(pheno, annotation, pheno_col, annotation_col, pivot_col, pheno_prefix, anno_prefix, save_dir):
    """Reconcile every paired column between the phenotype and annotation tables.

    Parameters
    ----------
    pheno, annotation : DataFrames; annotation is the left side of an outer join.
    pheno_col, annotation_col : parallel lists; element 0 is the join key of the
        respective table, the remaining elements are the column pairs to reconcile.
    pivot_col : sample identifier column in the joined table.
    pheno_prefix, anno_prefix, save_dir : forwarded to ``map_deprecation``.

    Returns
    -------
    (depre_sample_list, pivot_df) : the concatenated lists of samples reported
    by ``map_deprecation`` for each pair, and a frame holding ``pivot_col`` plus
    one reconciled column per pair (inner-joined, so only samples consistent in
    every pair remain).
    """
    # annotation is the left frame, so its key goes to left_on. The original
    # had the two keys swapped, which raised KeyError whenever they differed.
    pheno_annotation_dif = annotation.merge(pheno, left_on = annotation_col[0], right_on = pheno_col[0], how = "outer")
    depre_sample_list = []
    pivot_df = pheno_annotation_dif[[pivot_col]]
    for pheno_col_i, annotation_col_i in zip(pheno_col[1:], annotation_col[1:]):
        if pheno_col_i == annotation_col_i:
            # merge() suffixes clashing names: "_x" = annotation (left), "_y" = pheno (right).
            annotation_col_i = "%s_x"%annotation_col_i
            pheno_col_i = "%s_y"%pheno_col_i
        col_dif = pheno_annotation_dif[[pivot_col, annotation_col_i, pheno_col_i]]
        tmp_depre_sample_list, df_col_i = map_deprecation(col_dif, pheno_col_i, annotation_col_i,
                                                          pivot_col, pheno_prefix, anno_prefix, save_dir)
        pivot_df = pivot_df.merge(df_col_i, on = pivot_col, how = "inner")
        depre_sample_list = depre_sample_list + tmp_depre_sample_list
    return depre_sample_list, pivot_df

In [67]:
def map_deprecation(df, pheno_col, annotation_col, pivot_col, pheno_prefix, anno_prefix, save_dir):
    """Reconcile one attribute that is recorded in both the phenotype and annotation tables.

    Parameters
    ----------
    df : DataFrame containing ``pivot_col``, ``pheno_col`` and ``annotation_col``.
    pheno_col, annotation_col : the two columns describing the same attribute.
    pivot_col : sample identifier column.
    pheno_prefix, anno_prefix : used in the name of the difference file.
    save_dir : directory receiving
        ``<anno_prefix>_<pheno_prefix>_<annotation_col>.tsv`` when disagreements exist.

    Returns
    -------
    (depre_sample_list, df_cons_propagate)
        ``depre_sample_list`` : pivot values of rows with a missing pivot plus
        pivot values of rows where the two sources disagree.
        ``df_cons_propagate`` : ``pivot_col`` and the agreed value for every
        consistent row; a trailing ``_x`` is stripped from the column name.

    Notes
    -----
    Each column is filled from the other where it is missing, then the two are
    compared. The original filled the second column from the already-filled
    first one, which made the columns identical and hid every disagreement.
    """
    pheno_values = df[pheno_col]
    annotation_values = df[annotation_col]
    # Build a new frame from the ORIGINAL columns; nothing is written back into
    # a slice of the caller's data.
    df = pd.DataFrame({
        pivot_col: df[pivot_col],
        pheno_col: pheno_values.where(pheno_values.notnull(), annotation_values),
        annotation_col: annotation_values.where(annotation_values.notnull(), pheno_values),
    })
    # Rows without a sample identifier cannot be propagated (original behaviour).
    depre_sample_list = df.loc[df[pivot_col].isnull(), pivot_col].tolist()
    df = df.dropna(axis = 0, how = "any")
    is_consistent = df[pheno_col].eq(df[annotation_col])
    df_dif = df[~is_consistent]
    df_cons = df[is_consistent]
    if annotation_col.endswith("_x"):
        # Strip only the merge suffix; split("_x")[0] would cut at the first "_x".
        annotation_col_propagate = annotation_col[:-len("_x")]
        df_cons = df_cons.rename(columns = {annotation_col: annotation_col_propagate})
    else:
        annotation_col_propagate = annotation_col
    df_cons_propagate = df_cons.loc[:, [pivot_col, annotation_col_propagate]]
    if df_dif.shape[0] != 0:
        print("%s is inconsistent."%pheno_col)
        df_dif_filename = "%s_%s_%s.tsv"%(anno_prefix, pheno_prefix, annotation_col)
        df_dif_dir_filename = os.path.join(save_dir, df_dif_filename)
        df_dif.to_csv(df_dif_dir_filename, sep = "\t", header = True, index = False)
        # Extend rather than overwrite so the missing-pivot entries are kept.
        depre_sample_list = depre_sample_list + df_dif[pivot_col].tolist()
    return depre_sample_list, df_cons_propagate

# I. Alpha Globin

## 1. cohort and 2. # samples with calls

### 2.1 Preprocess CN

In [429]:
# Load the raw alpha-globin copy-number call table (tab-separated, first row is the header).
cn_dir = os.path.join("..", "raw_data", "cn")
cn_filename = "alpha_globin_calls.txt"
cn_dir_filename = os.path.join(cn_dir, cn_filename)
cn = pd.read_csv(cn_dir_filename, sep = "\t", header = 0, index_col = None)

In [430]:
# Keep only calls whose QC_FAIL and QC_FLAGGED columns both read "QC_PASS".
# The shape is printed before and after each filter to show how many rows it removes.
print(cn.shape)
cn_pass = cn.loc[cn.loc[:, "QC_FAIL"] == "QC_PASS", :]
print(cn_pass.shape)
cn_pass = cn_pass.loc[cn_pass.loc[:, "QC_FLAGGED"] == "QC_PASS", :]
print(cn_pass.shape)

(131823, 8)
(131003, 8)
(130032, 8)


In [431]:
# Check the SAMPLE column of the QC-passing calls for duplicates.
# duplicates_num comes from the star-imported project helpers; presumably it
# returns the number of duplicated values -- verify against its definition.
duplicates_num(cn_pass, "SAMPLE")

0

In [432]:
# Keep only the sample id and copy-number columns, renamed to the names used
# downstream (NWDID, cn), and save them as the preprocessed CN table.
# rename() is applied non-inplace: the column selection returns a slice of
# cn_pass, and renaming it in place triggers SettingWithCopyWarning.
cn_pass_useful = cn_pass[["SAMPLE", "CN"]].rename(columns = {"SAMPLE":"NWDID", "CN":"cn"})
cn_pass_dir = os.path.join("..", "prepro_data", "cn")
cn_pass_filename = "alpha_globin_calls_pass_useful.tsv"
cn_pass_dir_filename = os.path.join(cn_pass_dir, cn_pass_filename)
cn_pass_useful.to_csv(cn_pass_dir_filename, sep = "\t", header = True, index = False)

### 2.2 Summary of Annotation

In [44]:
# Load the de-duplicated freeze8 annotation table (tab-separated, header in the first row).
annotation_dir = os.path.join("..", "raw_data", "annotation")
annotation_filename = "freeze8_sample_annot_2019-10-08_useful_unique02.tsv"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)

In [49]:
# Inner-join the QC-passing calls to the annotation on the sample id
# (SAMPLE in the calls, NWDID in the annotation), then summarise by study.
cn_annotation = cn_pass.merge(annotation, left_on = "SAMPLE", right_on = "NWDID", how = "inner")
# categorize_df is a star-imported project helper; it is unpacked here into a
# summary table and a list of category values -- confirm its exact contract.
cn_pass_anno_summary, cn_pass_anno_list = categorize_df(cn_annotation, "study")
print(cn_annotation.shape)
print(cn_pass_anno_list)
# Save the per-study summary.
cn_pass_anno_summary_dir = os.path.join("..", "data_summary")
cn_pass_anno_summary_filename = "cn-nathan_pass_anno_cohort_summary.tsv"
cn_pass_anno_summary_dir_filename = os.path.join(cn_pass_anno_summary_dir, cn_pass_anno_summary_filename)
cn_pass_anno_summary.to_csv(cn_pass_anno_summary_dir_filename, sep = "\t", header = True, index = False)

(118934, 15)
['GOLDN', 'INSPIRE_AF', 'CAMP', 'GALAI', 'VU_AF', 'HCHS_SOL', 'CARDIA', 'EGCUT', 'THRV', 'PUSH_SCD', 'DECAF', 'REDS-III_Brazil', 'MGH_AF', 'AustralianFamilialAF', 'IPF', 'OMG_SCD', 'Partners', 'CHIRAH', 'BioMe', 'SAS', 'BioVU_AF', 'GENOA', 'CARE_PACT', 'CARE_BADGER', 'PharmHU', 'PCGC_CHD', 'CFS', 'GenSalt', 'PIMA', 'Sarcoidosis', 'HyperGEN', 'BAGS', 'PMBB_AF', 'WHI', 'walk_PHaSST', 'EOCOPD', 'GGAF', 'LTRC', 'WGHS', 'SARP', 'MPP', 'CARE_CLIC', 'MLOF', 'FHS', 'ARIC', 'ChildrensHS_IGERA', 'HVH', 'GeneSTAR', 'COPDGene', 'CCAF', 'SAFS', 'GENAF', 'SAGE', 'GALAII', 'DHS', 'SAPPHIRE_asthma', 'AFLMU', 'CATHGEN', 'MESA', 'VAFAR', 'miRhythm', 'JHS', 'CRA', 'JHU_AF', 'Mayo_VTE', 'CHS', 'Amish', 'ChildrensHS_MetaAir', 'CARE_TREXA', 'ChildrensHS_GAP', 'ECLIPSE']


### 2.3 CN of Each Cohort Based on Phenotype

In [454]:
# Load the preprocessed phenotype table and collect the list of studies in it.
pheno_dir = os.path.join("..", "prepro_data", "phenotype")
pheno_filename = "freeze8_anno04_af02_btc03-coh03_egfr03-ckd_adad01.tsv"
pheno_dir_filename = os.path.join(pheno_dir, pheno_filename)
pheno = pd.read_csv(pheno_dir_filename, sep = "\t", header = 0, index_col = None)
# Only the second value returned by categorize_df (the category list) is used here.
_, pheno_cohort_list = categorize_df(pheno, "study")

# Reload the preprocessed CN table written in section 2.1 (columns NWDID, cn).
full_cn_dir = os.path.join("..", "prepro_data", "cn")
full_cn_filename = "alpha_globin_calls_pass_useful.tsv"
full_cn_dir_filename = os.path.join(full_cn_dir, full_cn_filename)
full_cn = pd.read_csv(full_cn_dir_filename, sep = "\t", header = 0, index_col = None)

In [455]:
# For every study in the phenotype table, write two files under ../cohort/<study>/:
#   cn/<study>_cn.tsv          -- CN calls of the samples that have phenotype rows
#   pre_data/<pheno>_<study>.tsv -- phenotype rows of the samples that have CN calls
# NOTE: this loop rebinds cn, cn_dir, cn_filename and cn_dir_filename, which
# section 2.1 used for the raw call table; re-run 2.1 before reusing them.

# The id-only view of the CN table does not depend on the cohort.
full_cn_id = full_cn[["NWDID"]]
for cohort in pheno_cohort_list:
    pheno_cohort_raw = pheno.loc[pheno.loc[:, "study"] == cohort, :]
    pheno_id = pheno_cohort_raw[["NWDID"]]
    cn = full_cn.merge(pheno_id, on = "NWDID", how = "inner")

    pheno_cohort = full_cn_id.merge(pheno_cohort_raw, on = "NWDID", how = "inner")
    print(cohort, cn.shape)
    cn_filename = "%s_cn.tsv"%cohort
    cn_dir = os.path.join("..", "cohort", cohort, "cn")
    cn_dir_filename = os.path.join(cn_dir, cn_filename)
    # exist_ok makes a separate isdir() check unnecessary.
    os.makedirs(cn_dir, exist_ok = True)
    # sep must be passed by keyword: positional arguments after the path are
    # deprecated in DataFrame.to_csv.
    cn.to_csv(cn_dir_filename, sep = "\t", header = True, index = False)

    pheno_cohort_dir = os.path.join("..", "cohort", cohort, "pre_data")
    pheno_cohort_filename = "freeze8_anno04_af02_btc03-coh03_egfr03-ckd_adad01_%s.tsv"%cohort
    pheno_cohort_dir_filename = os.path.join(pheno_cohort_dir, pheno_cohort_filename)
    os.makedirs(pheno_cohort_dir, exist_ok = True)
    pheno_cohort.to_csv(pheno_cohort_dir_filename, sep = "\t", header = True, index = False)

GenSalt (1773, 2)
MESA (4549, 2)
HCHS_SOL (3847, 2)
JHS (3099, 2)
HyperGEN (1786, 2)
CARDIA (3038, 2)
FHS (3329, 2)
ARIC (3919, 2)
CHS (3480, 2)
WHI (10918, 2)
GeneSTAR (1535, 2)
COPDGene (5713, 2)
DHS (372, 2)
BioMe (9171, 2)
GENOA (1058, 2)


## 3. # samples with phenotype data

DDIMER

In [18]:
# Load the harmonized D-dimer phenotype file (comma-separated) and keep the
# five columns used below; rows with any missing value among them are dropped.
ddimer_filename = "TOPMED_HarmonizedPhenotypes_DDIMER_21MAY2019.csv"
ddimer_dir = os.path.join("..", "raw_data", "phenotype")
ddimer_dir_filename = os.path.join(ddimer_dir, ddimer_filename)
ddimer_raw = pd.read_csv(ddimer_dir_filename, sep = ",", header = 0, index_col = None)
ddimer = ddimer_raw.loc[:, ["sample.id", "STUDY", "DDIMER", "AGE_DDIMER", "sample_remove_DDIMER"]]
ddimer.dropna(axis = "index", how = "any", inplace = True)
# Summarise by STUDY. The summary is sorted by a column named "cohort", so
# categorize_df presumably names its category column that way -- verify.
ddimer_category_summary, _ = categorize_df(ddimer, "STUDY")
ddimer_category_summary.sort_values(by = "cohort", axis = 0, inplace = True)
ddimer_category_summary_dir = os.path.join("..", "data_summary")
ddimer_category_summary_filename = "TOPMED_HarmonizedPhenotypes_DDIMER_21MAY2019_study_summary.tsv"
ddimer_category_summary_dir_filename = os.path.join(ddimer_category_summary_dir, ddimer_category_summary_filename)
ddimer_category_summary.to_csv(ddimer_category_summary_dir_filename, sep = "\t", header = True, index = False)

  interactivity=interactivity, compiler=compiler, result=result)


In [19]:
# Same per-study summary as above, restricted to rows whose
# sample_remove_DDIMER flag is not 1.
ddimer_qc = ddimer[ddimer["sample_remove_DDIMER"] != 1]
ddimer_qc_category_summary, _ = categorize_df(ddimer_qc, "STUDY")
ddimer_qc_category_summary.sort_values(by = "cohort", axis = 0, inplace = True)
ddimer_qc_category_summary_dir = os.path.join("..", "data_summary")
ddimer_qc_category_summary_filename = "TOPMED_HarmonizedPhenotypes_DDIMER_21MAY2019_qc_study_summary.tsv"
ddimer_qc_category_summary_dir_filename = os.path.join(ddimer_qc_category_summary_dir, ddimer_qc_category_summary_filename)
ddimer_qc_category_summary.to_csv(ddimer_qc_category_summary_dir_filename, sep = "\t", header = True, index = False)

eGFR

In [10]:
# List every entry of the raw eGFR directory; the next cell iterates over this list.
egfr_dir = os.path.join("..", "raw_data", "egfr")
filename_list = os.listdir(egfr_dir)

In [20]:
# Count the rows of each raw eGFR file and save a cohort -> sample_size table.
cohort_list = []
sample_size_list = []
# NOTE(review): the first entry of filename_list is skipped. os.listdir returns
# entries in arbitrary order, so which file is skipped is not guaranteed --
# confirm what is meant to be excluded and filter on its name instead.
for filename in filename_list[1:]:
    # The cohort name is the 4th "_"-separated token of the file name;
    # the token "PHEN" is relabelled as "SOL".
    cohort_name_raw = filename.split("_")[3]
    if cohort_name_raw == "PHEN":
        cohort_name = "SOL"
    else:
        cohort_name = cohort_name_raw
    dir_filename = os.path.join(egfr_dir, filename)
    tmp_df = pd.read_csv(dir_filename, sep = "\t", header = 0, index_col = None)
    tmp_sample_size = tmp_df.shape[0]
    cohort_list.append(cohort_name)
    sample_size_list.append(tmp_sample_size)
cohort_sample_size_tuple = list(zip(cohort_list, sample_size_list))
egfr_sample_size_df = pd.DataFrame(data = cohort_sample_size_tuple, columns = ["cohort", "sample_size"])
egfr_sample_size_df.sort_values(by = "cohort", axis = 0, inplace = True)
egfr_sample_size_dir = os.path.join("..", "data_summary")
egfr_sample_size_filename = "egfr_cohort_summary.tsv"
egfr_sample_size_dir_filename = os.path.join(egfr_sample_size_dir, egfr_sample_size_filename)
egfr_sample_size_df.to_csv(egfr_sample_size_dir_filename, sep = "\t", header = True, index = False)

gengrp6

In [24]:
# Load the gengrp6 adjustment table; "." marks a missing value in this file
# and is converted to NaN.
gengrp6_filename = "page-harmonized-phenotypes-pca-freeze2-candidate2-2016-12-14.GWASid_fid_22May2018internalPCs.SOLv2consent.txt"
gengrp6_dir = os.path.join("..", "raw_data", "adjustment", "gengrp6")
gengrp6_dir_filename = os.path.join(gengrp6_dir, gengrp6_filename)
gengrp6 = pd.read_csv(gengrp6_dir_filename, sep = "\t", header = 0, index_col = None)
gengrp6.replace(".", np.nan, inplace=True)
# Keep the id/consent/group columns and require analysis_id and gengrp6 to be present.
gengrp6_select = gengrp6[["z_sol_id", "analysis_id", "CONSENT_text", "INTERNAL_USE_ONLY",
                          "gengrp6"]].dropna(axis=0, subset=["analysis_id","gengrp6"],how="any")
gengrp6_select.rename(columns = {"analysis_id":"SUBJECT_ID"}, inplace=True)
print(gengrp6_select.shape)
# Remove rows whose consent text is "DROP"; the shape is printed before and after.
gengrp6_select = gengrp6_select[gengrp6_select["CONSENT_text"] != "DROP"]
print(gengrp6_select.shape)

  interactivity=interactivity, compiler=compiler, result=result)


(11829, 5)
(11678, 5)


weight and center

In [27]:
# Load the weight/center adjustment table (comma-separated), keep complete rows
# of the three columns needed, and rename ID so it matches gengrp6_select.
weight_center_dir = os.path.join("..", "raw_data", "adjustment")
weight_center_filename = "bloodcell_output.csv"
weight_center_dir_filename = os.path.join(weight_center_dir, weight_center_filename)
weight_center = pd.read_csv(weight_center_dir_filename, sep = ",", header = 0, index_col = None)
weight_center_select = weight_center[["ID", "WEIGHT_FINAL_NORM_OVERALL", "CENTER"]].dropna(axis = 0, how = "any")
weight_center_select.rename(columns = {"ID":"z_sol_id"}, inplace = True)
print(weight_center_select.shape)
# Inner join on z_sol_id: only samples present in both tables remain.
gengrp6_weight_center = gengrp6_select.merge(weight_center_select, on = "z_sol_id", how = "inner")
print(gengrp6_weight_center.shape)

(16415, 3)
(9974, 7)


blood cell traits

In [None]:
# Summarise the blood-cell trait columns per cohort and save the table.
# This cell rebinds pheno, pheno_dir and cohort_list used by earlier sections.
pheno_dir = os.path.join("..", "raw_data", "phenotype")
pheno_filename = "coh02_pre.tsv"
pheno_dir_filename = os.path.join(pheno_dir, pheno_filename)
pheno = pd.read_csv(pheno_dir_filename, sep = "\t", header = 0, index_col = None)
cat_col = "cohort"
pheno_cat_summary, cohort_list = categorize_df(pheno, cat_col)
cat_list = cohort_list
# Trait columns to summarise.
col_list = ["hemoglobin_mcnc_bld_1", "hematocrit_vfr_bld_1", "rbc_ncnc_bld_1", "mcv_entvol_rbc_1", "mch_entmass_rbc_1", "mchc_mcnc_rbc_1",
            "rdw_ratio_rbc_1", "neutrophil_ncnc_bld_1", "lymphocyte_ncnc_bld_1", "basophil_ncnc_bld_1", "eosinophil_ncnc_bld_1",
            "monocyte_ncnc_bld_1", "wbc_ncnc_bld_1", "pmv_entvol_bld_1", "platelet_ncnc_bld_1"]
fixed_col_list = ["SUBJECT_ID", "unique_subject_key", "cohort"]
# cat_col_summary is a star-imported project helper; presumably it tabulates
# each trait per cohort -- verify against its definition.
cohort_pheno_df = cat_col_summary(pheno, cat_col, cat_list, col_list, fixed_col_list)
cohort_pheno_dir = os.path.join("..", "data_summary")
cohort_pheno_filename = "cohort_pheno_summary.tsv"
cohort_pheno_dir_filename = os.path.join(cohort_pheno_dir, cohort_pheno_filename)
# The index is written as well (index = True), unlike the other summaries in this notebook.
cohort_pheno_df.to_csv(cohort_pheno_dir_filename, sep = "\t", header = True, index = True)

### 3.2 Appending JHS ln(WBC) traits to phenotype

In [50]:
# Load the JHS basic phenotype table (tab-separated, header in the first row).
pheno_dir = os.path.join("..", "raw_data", "phenotype")
jhs_pheno_filename = "jhs_basic_phenotypes_05072019.txt"
jhs_pheno_dir_filename = os.path.join(pheno_dir, jhs_pheno_filename)
jhs_pheno = pd.read_csv(jhs_pheno_dir_filename, sep = "\t", header = 0, index_col = None)

In [51]:
# The JHS file stores these traits as ln-columns; exponentiate each one into
# the trait column name used elsewhere and record the "age" column as the age
# at measurement. The pairs are processed in this order, which also fixes the
# order in which the new columns are appended.
jhs_ln_to_trait = [
    ("lnneu", "neutrophil_ncnc_bld_1"),
    ("lnlym", "lymphocyte_ncnc_bld_1"),
    ("lnbaso", "basophil_ncnc_bld_1"),
    ("lneos", "eosinophil_ncnc_bld_1"),
    ("lnmono", "monocyte_ncnc_bld_1"),
    ("lnwbc", "wbc_ncnc_bld_1"),
]
for ln_col, trait_col in jhs_ln_to_trait:
    jhs_pheno[trait_col] = np.exp(jhs_pheno[ln_col])
    jhs_pheno["age_at_%s"%trait_col] = jhs_pheno["age"]
# jhs_pheno["DDIMER"] = np.exp(jhs_pheno["lnddimer"])
# jhs_pheno["age_at_DDIMER"] = jhs_pheno["age"]

In [53]:
# Keep the sample id plus the six derived traits and their age columns.
useful_pheno_list = ["NWDID", "neutrophil_ncnc_bld_1", "age_at_neutrophil_ncnc_bld_1","lymphocyte_ncnc_bld_1",
                     "age_at_lymphocyte_ncnc_bld_1", "basophil_ncnc_bld_1", "age_at_basophil_ncnc_bld_1","eosinophil_ncnc_bld_1",
                     "age_at_eosinophil_ncnc_bld_1", "monocyte_ncnc_bld_1", "age_at_monocyte_ncnc_bld_1","wbc_ncnc_bld_1", "age_at_wbc_ncnc_bld_1"]
jhs_pheno_useful = jhs_pheno[useful_pheno_list]

In [55]:
# Save the reduced JHS table next to the raw phenotype files.
# NOTE: the file name is spelled "usefule"; any reader must use the same spelling.
jhs_useful_filename = "jhs_usefule_phenotype_05072019.tsv"
jhs_useful_dir_filename = os.path.join(pheno_dir, jhs_useful_filename)
jhs_pheno_useful.to_csv(jhs_useful_dir_filename, sep = "\t", header = True, index = False)

## 4. # sample in annotation file

### 4.1 Cohort Summary

In [94]:
# Load the 2019-07-30 annotation file, drop rows without a study, and save
# the per-study summary produced by categorize_df.
annotation_dir = os.path.join("..", "raw_data", "annotation")
annotation_filename = "freeze8_sample_annot_2019-07-30.txt"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)
annotation.dropna(axis = 0, subset = ["study"], how = "any", inplace = True)
annotation_cat_summary, _ = categorize_df(annotation, "study")
annotation_cat_summary_dir = os.path.join("..", "data_summary")
annotation_cat_summary_filename = "annotation_cohort_summary_2019-07-30.tsv"
annotation_cat_summary_dir_filename = os.path.join(annotation_cat_summary_dir, annotation_cat_summary_filename)
annotation_cat_summary.to_csv(annotation_cat_summary_dir_filename, sep = "\t", header = True, index = False)

### 4.2 Consent Summary

In [104]:
# Summarise the 2019-07-30 annotation file by consent group and append one
# extra row ("NAN") holding the number of samples without a consent value.
annotation_dir = os.path.join("..", "raw_data", "annotation")
annotation_filename = "freeze8_sample_annot_2019-07-30.txt"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)
sample_size = annotation.shape[0]
annotation.dropna(axis = 0, subset = ["consent"], how = "any", inplace = True)
sample_size_nona = annotation.shape[0]
na_df = pd.DataFrame(data = [["NAN", sample_size - sample_size_nona]], columns = ["consent", "sample_size"])
annotation_cat_summary, _ = categorize_df(annotation, "consent")
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
annotation_cat_summary = pd.concat([annotation_cat_summary, na_df], ignore_index = True)
annotation_cat_summary_dir = os.path.join("..", "data_summary")
annotation_cat_summary_filename = "annotation_consent_summary_2019-07-30.tsv"
annotation_cat_summary_dir_filename = os.path.join(annotation_cat_summary_dir, annotation_cat_summary_filename)
annotation_cat_summary.to_csv(annotation_cat_summary_dir_filename, sep = "\t", header = True, index = False)

### 4.3 Consent Summary by Cohorts

In [17]:
# Load the 2019-10-08 annotation file, drop rows without a study, and collect
# the list of studies for the per-cohort consent summary below.
annotation_dir = os.path.join("..", "raw_data", "annotation")
annotation_filename = "freeze8_sample_annot_2019-10-08.txt"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)
annotation.dropna(axis = 0, subset = ["study"], how = "any", inplace = True)
_, cohort_list = categorize_df(annotation, "study")

In [19]:
# Build one consent summary per cohort and stack them into a single table
# with the columns cohort / consent / sample_size.
# The frames are collected in a list and concatenated once; concatenating
# inside the loop copies the growing table on every iteration.
# The empty frame comes first so the column order stays cohort, consent, sample_size.
consent_summary_frames = [pd.DataFrame(columns=['cohort', 'consent', 'sample_size'])]
for cohort in cohort_list:
    tmp_annotation = annotation.loc[annotation.loc[:, "study"] == cohort, :]
    tmp_summary, _ = categorize_df(tmp_annotation, "consent")
    row_num = tmp_summary.shape[0]
    # A plain list is inserted positionally. A Series would be aligned on the
    # index and yield NaN labels if tmp_summary is not indexed 0..n-1.
    tmp_summary.insert(loc = 0, column = "cohort", value = [cohort] * row_num)
    consent_summary_frames.append(tmp_summary)
consent_summary_cohort = pd.concat(consent_summary_frames, axis = 0, ignore_index = True)

In [20]:
# Save the per-cohort consent summary.
consent_summary_cohort_dir = os.path.join("..", "data_summary")
consent_summary_cohort_filename = "annotation_consent_summary_cohort_2019-10-08.tsv"
consent_summary_cohort_dir_filename = os.path.join(consent_summary_cohort_dir, consent_summary_cohort_filename)
consent_summary_cohort.to_csv(consent_summary_cohort_dir_filename, sep = "\t", header = True, index = False)

### 4.4 Clean Annotation File

In [32]:
# Load the 2020-03-03 annotation file, keep the seven columns used downstream,
# drop rows with any missing value among them, rename sample.id to NWDID and
# recode sex as M -> 1, F -> 0.
annotation_dir = os.path.join("..", "raw_data", "annotation")
annotation_filename = "freeze8_sample_annot_2020-03-03.txt"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation_raw = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)
annotation_useful_header_list = ["sample.id", "unique_subject_key", "subject_id", "consent", "study", "sex", "exclude"]
# Chained, non-inplace calls produce an independent frame. The in-place
# dropna/rename on a column slice of annotation_raw raised the
# SettingWithCopyWarning shown in this cell's saved output.
annotation = (
    annotation_raw[annotation_useful_header_list]
    .dropna(axis = 0, how = "any")
    .rename(columns = {"sample.id":"NWDID"})
)
annotation.loc[annotation.loc[:, "sex"] == "M", "sex"] = 1
annotation.loc[annotation.loc[:, "sex"] == "F", "sex"] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [33]:
# Keep the samples that are not flagged for exclusion, then remove samples of
# ARIC / COPDGene / CHS whose consent code starts with the token "DS"
# (the part of the consent string before the first "-").
# NOTE: cohort_list is rebound here to the three restricted cohorts.
annotation = annotation.loc[annotation.loc[:, "exclude"] == False, :].reset_index(drop = True)
sample_num = annotation.shape[0]
cohort_list = ["ARIC", "COPDGene", "CHS"]
# Vectorised form of the former per-row loop: same rows, no per-sample .loc lookups.
is_ds_consent = annotation["consent"].str.split("-").str[0] == "DS"
in_restricted_cohort = annotation["study"].isin(cohort_list)
consent_remove_idx = annotation.index[in_restricted_cohort & is_ds_consent].tolist()
annotation_remove_consent = annotation.drop(axis = 0, labels = consent_remove_idx)

In [34]:
# Save the consent-filtered annotation table into the annotation directory
# (annotation_dir is set by the cell that loads the raw annotation file).
annotation_remove_consent_filename = "freeze8_anno05_af02.tsv"
annotation_remove_consent_dir_filename = os.path.join(annotation_dir, annotation_remove_consent_filename)
annotation_remove_consent.to_csv(annotation_remove_consent_dir_filename, sep = "\t", header = True, index = False)

### 4.5 Remove Duplicates from TOPMed Duplicates List

In [35]:
# Load the freeze8 duplicate-pair list (sorted by study1) and the
# consent-filtered annotation table written in section 4.4.
annotation_dir = os.path.join("..", "raw_data", "annotation")
dup_list_filename = "freeze8_duplicates_2019-04-19.txt"
dup_list_dir_filename = os.path.join(annotation_dir, dup_list_filename)
dup_list = pd.read_csv(dup_list_dir_filename, sep = "\t", header = 0, index_col = None)
dup_list.sort_values(by = ["study1"], inplace = True)

annotation_filename = "freeze8_anno05_af02.tsv"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)

In [36]:
# For pairs whose study1 is "Control", the ID2 member is the one marked for removal.
dup_list_control = dup_list.loc[dup_list.loc[:, "study1"] == "Control", :]
dup_list_control_list = dup_list_control["ID2"].values.tolist()

In [37]:
# For every non-Control pair, mark one member for removal at random.
# The seed is fixed, so the 0/1 draws (and therefore the selection) repeat on
# every run as long as dup_list has the same rows in the same order.
# The draw is used as a column POSITION: 0 or 1 picks the first or second
# column of dup_list -- presumably the two sample-id columns; verify.
dup_list_norm = dup_list[dup_list["study1"] != "Control"]
norm_num = len(dup_list_norm)
np.random.seed(43329)
norm_idx_list = np.random.randint(2, size=norm_num)
dup_list_norm_list = []
for sample_i, norm_idx in enumerate(norm_idx_list):
    tmp_sample_id = dup_list_norm.iloc[sample_i, norm_idx]
    dup_list_norm_list.append(tmp_sample_id)

In [38]:
# Remove every sample id selected above (Control pairs + random picks) from
# the annotation table; the shape is printed before and after, then the result is saved.
dup_id_list = dup_list_control_list + dup_list_norm_list
print(annotation.shape)
annotation_unique = annotation[~annotation['NWDID'].isin(dup_id_list)]
print(annotation_unique.shape)
annotation_unique_filename = "freeze8_anno05_af02_unique01.tsv"
annotation_unique_dir_filename = os.path.join(annotation_dir, annotation_unique_filename)
annotation_unique.to_csv(annotation_unique_dir_filename, sep = "\t", header = True, index = False)

(138092, 7)
(136047, 7)


### 4.6 Duplicates of unique_subject_key in freeze8 annotation file

In [39]:
# Reload the table written in section 4.5 and list every row whose
# unique_subject_key occurs more than once.
annotation_dir = os.path.join("..", "raw_data", "annotation")
annotation_filename = "freeze8_anno05_af02_unique01.tsv"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)

annotation_dup_df = df_extraction_duplicates(annotation, "unique_subject_key")
print(annotation_dup_df.shape)
# Sort so members of the same duplicate group sit next to each other.
annotation_dup_df = annotation_dup_df.sort_values(by=['unique_subject_key'])
display(annotation_dup_df)

# Save the duplicated rows for inspection (the saved output shows an empty table).
annotation_dup_filename = "freeze8_anno05_af02_unique01_dup.tsv"
annotation_dup_dir_filename = os.path.join(annotation_dir, annotation_dup_filename)
annotation_dup_df.to_csv(annotation_dup_dir_filename, sep = "\t", header = True, index = False)

(0, 7)


Unnamed: 0,NWDID,unique_subject_key,subject_id,consent,study,sex,exclude


In [40]:
# Resolve duplicated unique_subject_key values with remove_dup_anno: consent
# and sex must agree within a duplicate group, and rows are identified by NWDID.
# NOTE(review): remove_dup_anno picks rows with random.randint and no
# random.seed call is visible in this part of the notebook, so the selection
# may differ between runs whenever duplicates exist.
invar_subsets = ["consent", "sex"]
rm_header = "NWDID"
annotation_unique = remove_dup_anno(annotation, invar_subsets, rm_header)
annotation_unique_filename = "freeze8_anno05_af02_unique02.tsv"
annotation_unique_dir_filename = os.path.join(annotation_dir, annotation_unique_filename)
annotation_unique.to_csv(annotation_unique_dir_filename, sep = "\t", header = True, index = False)

### 4.7 Duplicates of NWDID in freeze8 annotation file

In [41]:
# Reload the table written in section 4.6 and print the result of
# duplicates_num for the NWDID column (the saved output shows 0).
annotation_dir = os.path.join("..", "raw_data", "annotation")
annotation_filename = "freeze8_anno05_af02_unique02.tsv"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)
print(duplicates_num(annotation, "NWDID"))

0


### 4.8 Process the overlap between 3 & 4

In [96]:
# Load the 2019-05-30 annotation table (stored under raw_data/phenotype) and
# report how many rows survive when each id column is required to be present.
annotation_dir = os.path.join("..", "raw_data", "phenotype")
annotation_filename = "freeze8_sample_annot_2019-05-30.tsv"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)
annotation_drop_subject_key = annotation.dropna(axis = 0, subset = ["unique_subject_key"], how = "any")
print(annotation_drop_subject_key.shape)
annotation_drop_subject_id = annotation.dropna(axis = 0, subset = ["subject_id"], how = "any")
print(annotation_drop_subject_id.shape)
annotation_drop_study = annotation.dropna(axis = 0, subset = ["study"], how = "any")
print(annotation_drop_study.shape)
# Require all three identifiers for the table that is carried forward.
annotation.dropna(axis = 0, subset = ["unique_subject_key", "subject_id", "NWDID"], how = "any", inplace = True)
print(annotation.shape)
# rename() is applied non-inplace: renaming a column slice of annotation in
# place triggers SettingWithCopyWarning.
annotation_select = annotation[["NWDID", "subject_id", "unique_subject_key", "consent", "study", "sex", "exclude"]].rename(
    columns = {"subject_id":"SUBJECT_ID", "study":"cohort"})
annotation_select_dir = os.path.join("..", "raw_data", "phenotype")
annotation_select_filename = "freeze8_sample_annot_2019-05-30_useful.tsv"
annotation_select_dir_filename = os.path.join(annotation_select_dir, annotation_select_filename)
# NOTE(review): the write is disabled, yet section 5 reads this file; on a
# fresh checkout it must already exist on disk -- confirm whether to re-enable.
#annotation_select.to_csv(annotation_select_dir_filename, sep = "\t", header = True, index = False)

(133280, 18)
(133280, 18)
(138934, 18)
(133280, 18)


## 5. Overlap b/w 3 & 4

In [22]:
# Merge the phenotype table with the trimmed freeze8 annotation on
# unique_subject_key and save the result.
pheno_filename = "coh03_pheno02_pre.tsv"
pheno_dir = os.path.join("..", "raw_data", "phenotype")
pheno_dir_filename = os.path.join(pheno_dir, pheno_filename)
pheno = pd.read_csv(pheno_dir_filename, sep = "\t", header = 0, index_col = None)

annotation_dir = os.path.join("..", "raw_data", "phenotype")
annotation_filename = "freeze8_sample_annot_2019-05-30_useful.tsv"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)
# Drop the annotation's SUBJECT_ID / cohort so the merged table keeps a single,
# un-suffixed copy of each (the phenotype one).
annotation = annotation.drop(axis = 1, labels = ["SUBJECT_ID", "cohort"])

pheno_annotation = pheno.merge(annotation, on = ["unique_subject_key"], how = "inner")
print(pheno_annotation.shape)
pheno_annotation = pheno_annotation.dropna(axis = 0, subset = ["cohort"])
print(pheno_annotation.shape)

# Report the duplicate check and only claim "no duplicates" when that is what
# the check returned (the message used to be printed unconditionally).
nwdid_duplicate_num = duplicates_num(pheno_annotation, "NWDID")
print(nwdid_duplicate_num)
if nwdid_duplicate_num == 0:
    print("There is no duplicates in NWDID in the merged table of phenotype and annotation.")
else:
    print("WARNING: duplicated NWDID found in the merged table of phenotype and annotation.")

pheno_annotation_dir = os.path.join("..", "raw_data", "phenotype")
pheno_annotation_filename = "coh03_pheno02_freeze8_anno_pre.tsv"
pheno_annotation_dir_filename = os.path.join(pheno_annotation_dir, pheno_annotation_filename)
pheno_annotation.to_csv(pheno_annotation_dir_filename, sep = "\t", header = True, index = False)

(66696, 47)
(66696, 47)
0
There is no duplicates in NWDID in the merged table of phenotype and annotation.


In [28]:
# Exploratory check: how many phenotype rows map to the annotation under
# different combinations of join keys. Nothing is written to disk here.
pheno_filename = "coh03_pheno02_pre.tsv"
pheno_dir = os.path.join("..", "raw_data", "phenotype")
pheno_dir_filename = os.path.join(pheno_dir, pheno_filename)
pheno = pd.read_csv(pheno_dir_filename, sep = "\t", header = 0, index_col = None)

annotation_dir = os.path.join("..", "raw_data", "phenotype")
annotation_filename = "freeze8_sample_annot_2019-05-30_useful.tsv"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep = "\t", header = 0, index_col = None)

# Inner merges from the strictest key set to the loosest; only the shapes are compared.
pheno_annotation_subjectid_uni = pheno.merge(annotation, on = ["SUBJECT_ID", "unique_subject_key"], how = "inner")
print(pheno_annotation_subjectid_uni.shape)
pheno_annotation_subjectid_cohort_uni = pheno.merge(annotation, on = ["SUBJECT_ID", "unique_subject_key", "cohort"], how = "inner")
print(pheno_annotation_subjectid_cohort_uni.shape)
pheno_annotation_subjectid_cohort = pheno.merge(annotation, on = ["SUBJECT_ID", "cohort"], how = "inner")
print(pheno_annotation_subjectid_cohort.shape)
pheno_annotation_subjectid = pheno.merge(annotation, on = ["SUBJECT_ID"], how = "inner")
print(pheno_annotation_subjectid.shape)
pheno_annotation_uni = pheno.merge(annotation, on = ["unique_subject_key"], how = "inner")
print(pheno_annotation_uni.shape)

# Final choice: join on unique_subject_key only, keeping the phenotype's own
# SUBJECT_ID / cohort columns (the annotation copies are dropped first).
annotation = annotation.drop(axis = 1, labels = ["SUBJECT_ID", "cohort"])

pheno_annotation = pheno.merge(annotation, on = ["unique_subject_key"], how = "inner")
print(pheno_annotation.shape)
pheno_annotation = pheno_annotation.dropna(axis = 0, subset = ["cohort"])
print(pheno_annotation.shape)

# Only claim "no duplicates" when the check actually returned 0
# (the message used to be printed unconditionally).
nwdid_duplicate_num = duplicates_num(pheno_annotation, "NWDID")
print(nwdid_duplicate_num)
if nwdid_duplicate_num == 0:
    print("There is no duplicates in NWDID in the merged table of phenotype and annotation.")
else:
    print("WARNING: duplicated NWDID found in the merged table of phenotype and annotation.")

# The output path is prepared but this cell does not save the table.
pheno_annotation_dir = os.path.join("..", "raw_data", "phenotype")
pheno_annotation_filename = "coh03_pheno02_freeze8_anno_pre.tsv"
pheno_annotation_dir_filename = os.path.join(pheno_annotation_dir, pheno_annotation_filename)

(46105, 48)
(46105, 47)
(46105, 48)
(55366, 49)
(66696, 49)
(66696, 47)
(66696, 47)
0
There is no duplicates in NWDID in the merged table of phenotype and annotation.


In [120]:
# Outer-merge the D-dimer table into the phenotype/annotation table and list the
# NWDIDs that end up on two rows because the two sources disagree on sex.
load_dir = os.path.join("..", "raw_data", "phenotype")
ddimer_filename = "DDIMER_21MAY2019_complete_useful_col.tsv"
ddimer_dir_filename = os.path.join(load_dir, ddimer_filename)
ddimer = pd.read_csv(ddimer_dir_filename, sep = "\t", header = 0, index_col = None)

pheno_annotation_filename = "coh03_pheno02_freeze8_anno_pre.tsv"
pheno_annotation_dir_filename = os.path.join(load_dir, pheno_annotation_filename)
pheno_annotation = pd.read_csv(pheno_annotation_dir_filename, sep = "\t", header = 0, index_col = None)

# With sex as a join key, a sample whose sex differs between the two sources
# yields two rows; without it the rows collapse into one.
pheno_annotation_ddimer_comparesex = pheno_annotation.merge(ddimer, on = ["NWDID", "cohort", "sex"], how = "outer")
print(duplicates_num(pheno_annotation_ddimer_comparesex, "NWDID"))
pheno_annotation_ddimer_nocomparesex = pheno_annotation.merge(ddimer, on = ["NWDID", "cohort"], how = "outer")
print(duplicates_num(pheno_annotation_ddimer_nocomparesex, "NWDID"))

# Every row whose NWDID occurs more than once, ordered by NWDID. sort_values
# returns a new frame, so nothing is modified in place on a .loc slice (the
# in-place sort was the source of the SettingWithCopyWarning). Row labels are
# those of the merged table.
boolean_series = pheno_annotation_ddimer_comparesex[["NWDID"]].duplicated(keep=False)
pheno_annotation_ddimer_comparesex_dup = (
    pheno_annotation_ddimer_comparesex
    .loc[boolean_series, :]
    .sort_values(axis = 0, by = "NWDID")
)

ddimer_anno_sex_diff_dir = os.path.join("..", "data_summary")
ddimer_anno_sex_diff_filename = "ddimer_anno_sex_diff.tsv"
ddimer_anno_sex_diff_dir_filename = os.path.join(ddimer_anno_sex_diff_dir, ddimer_anno_sex_diff_filename)
pheno_annotation_ddimer_comparesex_dup.to_csv(ddimer_anno_sex_diff_dir_filename, sep = "\t", header = True, index = False)

display(pheno_annotation_ddimer_comparesex_dup)

4
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,SUBJECT_ID,unique_subject_key,cohort,hemoglobin_mcnc_bld_1,age_at_hemoglobin_mcnc_bld_1,hematocrit_vfr_bld_1,age_at_hematocrit_vfr_bld_1,rbc_ncnc_bld_1,age_at_rbc_ncnc_bld_1,mcv_entvol_rbc_1,...,gengrp6,WEIGHT_FINAL_NORM_OVERALL,CENTER,NWDID,consent,sex,exclude,age_at_DDIMER,DDIMER,sample_remove_DDIMER
53932,131150792.0,ARIC_131150792,ARIC,,,,,,,,...,,,,NWD355569,HMB-IRB,M,True,,,
68851,,,ARIC,,,,,,,,...,,,,NWD355569,,F,,61.0,48000.0,0.0
52892,131186968.0,ARIC_131186968,ARIC,,,,,,,,...,,,,NWD355606,HMB-IRB,M,True,,,
69022,,,ARIC,,,,,,,,...,,,,NWD355606,,F,,57.0,25000.0,0.0
59496,9229.0,FHS_9229,FHS,,,,,,,,...,,,,NWD611521,HMB-IRB-NPU-MDS,F,False,,,
68671,,,FHS,,,,,,,,...,,,,NWD611521,,M,,61.0,23500.0,0.0
31155,823218.0,WHI_823218,WHI,13.2,78.25,39.9,78.25,,,,...,,,,NWD632820,HMB-IRB-NPU,M,False,,,
69026,,,WHI,,,,,,,,...,,,,NWD632820,,F,,78.0,25220.0,0.0


For each NWDID that appears twice with conflicting sex, manually remove the annotation-side row (the row without a DDIMER value), keeping the DDIMER record.

In [123]:
# Each NWDID with conflicting sex occupies two rows of the outer merge: one from
# the phenotype/annotation table (unique_subject_key present, no DDIMER) and one
# from the D-dimer table (no unique_subject_key). Drop the former, keep the latter.
# The labels were previously hard-coded as [53932, 52892, 59496, 31155], which is
# only valid for one particular input and merge ordering; deriving them selects
# the same four rows for the data shown above and stays correct if inputs change.
nwdid_is_duplicated = pheno_annotation_ddimer_comparesex.duplicated(subset = ["NWDID"], keep = False)
from_pheno_annotation = pheno_annotation_ddimer_comparesex["unique_subject_key"].notna()
sex_conflict_labels = pheno_annotation_ddimer_comparesex.index[nwdid_is_duplicated & from_pheno_annotation]
pheno_annotation_ddimer = pheno_annotation_ddimer_comparesex.drop(labels = sex_conflict_labels, axis = 0)
print(pheno_annotation_ddimer.shape)
print(duplicates_num(pheno_annotation_ddimer, "NWDID"))
pheno_annotation_ddimer_dir = os.path.join("..", "raw_data", "phenotype")
pheno_annotation_ddimer_filename = "coh03_pheno03_freeze8_anno_pre.tsv"
pheno_annotation_ddimer_dir_filename = os.path.join(pheno_annotation_ddimer_dir, pheno_annotation_ddimer_filename)
pheno_annotation_ddimer.to_csv(pheno_annotation_ddimer_dir_filename, sep = "\t", header = True, index = False)

(69394, 50)
0


### Summary of phenotypes that can be mapped to the annotation file, by cohort

In [124]:
# Load the merged phenotype / annotation / D-dimer table.
pheno_annotation_dir = os.path.join("..", "raw_data", "phenotype")
pheno_annotation_filename = "coh03_pheno03_freeze8_anno_pre.tsv"
# Build the path from the directory defined in this cell. It used to rely on
# `pheno_annotation_ddimer_dir` left over from another cell (same location),
# which fails with a NameError on a fresh kernel.
pheno_annotation_dir_filename = os.path.join(pheno_annotation_dir, pheno_annotation_filename)
pheno_annotation = pd.read_csv(pheno_annotation_dir_filename, sep = "\t", header = 0, index_col = None)

  interactivity=interactivity, compiler=compiler, result=result)


In [138]:
# For every cohort, count the samples that have a non-missing value for each
# phenotype column, and save the cohort x phenotype table.
col_list = ["gengrp6", "WEIGHT_FINAL_NORM_OVERALL", "CENTER", "DDIMER", "EGFRCKDEPI", "hemoglobin_mcnc_bld_1", "hematocrit_vfr_bld_1",
            "rbc_ncnc_bld_1", "mcv_entvol_rbc_1", "mch_entmass_rbc_1", "mchc_mcnc_rbc_1", "rdw_ratio_rbc_1", "neutrophil_ncnc_bld_1",
            "lymphocyte_ncnc_bld_1", "basophil_ncnc_bld_1", "eosinophil_ncnc_bld_1", "monocyte_ncnc_bld_1", "wbc_ncnc_bld_1",
            "pmv_entvol_bld_1", "platelet_ncnc_bld_1", "lnHBA1C"]
# Rows without a cohort cannot be attributed to any category.
pheno_annotation = pheno_annotation.dropna(axis = 0, subset = ["cohort"], how = "any")
cat_col = "cohort"
# categorize_df is a project helper; only the category list it returns is used below.
pheno_annotation_summary, category_list = categorize_df(pheno_annotation, cat_col)
pheno_annotation_category_df = pd.DataFrame(columns = col_list, index = category_list)
for category in category_list:
    in_category = pheno_annotation[cat_col] == category
    for col in col_list:
        # Series.count() is the number of non-missing entries. It replaces an
        # in-place dropna on a .loc slice, which mutates a temporary and can
        # raise SettingWithCopyWarning.
        pheno_annotation_category_df.loc[category, col] = int(pheno_annotation.loc[in_category, col].count())
pheno_annotation_category_df.sort_index(axis = 0, inplace = True)
pheno_annotation_category_dir = os.path.join("..", "data_summary")
pheno_annotation_category_filename = "overlap_pheno_annotation_noexclude_summary.tsv"
pheno_annotation_category_dir_filename = os.path.join(pheno_annotation_category_dir, pheno_annotation_category_filename)
pheno_annotation_category_df.to_csv(pheno_annotation_category_dir_filename, sep = "\t", header = True, index = True)

## 6. Overlap b/w 2 & 3

Preprocess CN sample list to get list of cram of each cohort.

In [56]:
# For every cohort, turn the list of sample file paths into a one-column file of
# sample IDs (file name up to its first ".").
cohort_list = ["freeze6-AA", "JHS", "OMG-SCD", "PAGE", "PharmHU", "REDS-III-Brazil", "SAS", "SCD", "SOL", "walk-PHaSST"]
cohort_dir = os.path.join("..", "..", "cnv", "sample_list")
for cohort in cohort_list:
    tmp_cohort_dir_filename = os.path.join(cohort_dir, "%s_chr16_full_sample.list"%cohort)
    # Take the last path component of every non-blank line. The list used to be
    # parsed as a "/"-separated CSV, which only works when every path has the
    # same number of components and fails on an empty file.
    with open(tmp_cohort_dir_filename) as tmp_cohort_file:
        tmp_sample_list = [os.path.basename(line.strip()) for line in tmp_cohort_file if line.strip()]
    tmp_sample_id_list = [tmp_sample.split(".")[0] for tmp_sample in tmp_sample_list]
    tmp_sample_id_df = pd.DataFrame(data = tmp_sample_id_list)
    tmp_sample_id_dir = cohort_dir
    tmp_sample_id_filename = "%s_cram.tsv"%cohort
    tmp_sample_id_dir_filename = os.path.join(tmp_sample_id_dir, tmp_sample_id_filename)
    tmp_sample_id_df.to_csv(tmp_sample_id_dir_filename, sep = "\t", header = False, index = False)

### Overlap b/w samples with CN data and the pheno_anno table from Section 5

In [155]:
# For every cohort with CN data, count the CN samples that have a non-missing
# value for each phenotype, and save the cohort x phenotype table.
pheno_annotation_dir = os.path.join("..", "raw_data", "phenotype")
pheno_annotation_filename = "coh03_pheno03_freeze8_anno_pre.tsv"
# Use the directory defined in this cell. It used to rely on
# `pheno_annotation_ddimer_dir` from another cell (same location), which fails
# with a NameError on a fresh kernel.
pheno_annotation_dir_filename = os.path.join(pheno_annotation_dir, pheno_annotation_filename)
pheno_annotation = pd.read_csv(pheno_annotation_dir_filename, sep = "\t", header = 0, index_col = None)

col_list = ["gengrp6", "WEIGHT_FINAL_NORM_OVERALL", "CENTER", "DDIMER", "EGFRCKDEPI", "hemoglobin_mcnc_bld_1", "hematocrit_vfr_bld_1",
            "rbc_ncnc_bld_1", "mcv_entvol_rbc_1", "mch_entmass_rbc_1", "mchc_mcnc_rbc_1", "rdw_ratio_rbc_1", "neutrophil_ncnc_bld_1",
            "lymphocyte_ncnc_bld_1", "basophil_ncnc_bld_1", "eosinophil_ncnc_bld_1", "monocyte_ncnc_bld_1", "wbc_ncnc_bld_1",
            "pmv_entvol_bld_1", "platelet_ncnc_bld_1", "lnHBA1C"]

cohort_list = ["JHS", "OMG-SCD", "PharmHU", "REDS-III-Brazil", "SAS", "HCHS_SOL", "walk-PHaSST", "WHI",
               "MESA", "GeneSTAR", "COPDGene", "CHS", "CARDIA", "BioMe", "ARIC"]
load_dir = os.path.join("..", "..", "cnv", "sample_list")
cn_pheno_anno_summary = pd.DataFrame(columns = col_list, index = cohort_list)
for cohort in cohort_list:
    # The phenotype table names this cohort "HCHS_SOL" while its CN sample list
    # file is named with "SOL".
    if cohort == "HCHS_SOL":
        cohort_in_filename = "SOL"
    else:
        cohort_in_filename = cohort
    cohort_filename = "%s_cram.tsv"%cohort_in_filename
    cohort_dir_filename = os.path.join(load_dir, cohort_filename)
    cohort_df = pd.read_csv(cohort_dir_filename, sep = "\t", header = None, index_col = None)
    cohort_df.rename(columns = {0:"NWDID"}, inplace = True)
    cohort_pheno = cohort_df.merge(pheno_annotation, on = "NWDID", how = "inner")
    in_cohort = cohort_pheno["cohort"] == cohort
    for col in col_list:
        # Series.count() is the number of non-missing entries. It replaces an
        # in-place dropna on a .loc slice, which mutates a temporary and can
        # raise SettingWithCopyWarning.
        cn_pheno_anno_summary.loc[cohort, col] = int(cohort_pheno.loc[in_cohort, col].count())
cn_pheno_anno_summary.sort_index(axis = 0, inplace = True)
cn_pheno_anno_summary_dir = os.path.join("..", "data_summary")
cn_pheno_anno_summary_filename = "overlap_cn_pheno_anno.tsv"
cn_pheno_anno_summary_dir_filename = os.path.join(cn_pheno_anno_summary_dir, cn_pheno_anno_summary_filename)
cn_pheno_anno_summary.to_csv(cn_pheno_anno_summary_dir_filename, sep = "\t", header = True, index = True)

Overlap between CN samples and annotation files

In [79]:
# Load the raw freeze8 sample annotation, keeping only rows that name a study.
annotation_filename = "freeze8_sample_annot_2019-05-30.tsv"
annotation_dir = os.path.join("..", "raw_data", "phenotype")
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(
    annotation_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
).dropna(axis=0, subset=["study"], how="any")

In [88]:
# Map each cohort's CN sample list onto the annotation table and save the
# per-cohort number of mapped samples returned by cn_map_list.
cohort_list = ["JHS", "OMG-SCD", "PharmHU", "REDS-III-Brazil", "SAS", "SCD", "SOL", "walk-PHaSST", "WHI",
               "MESA", "GeneSTAR", "COPDGene", "CHS", "CARDIA", "BioMe", "ARIC"]
load_dir = os.path.join("..", "..", "cnv", "sample_list")
cohort_dir_list = [os.path.join(load_dir, "%s_cram.tsv"%cohort) for cohort in cohort_list]
# Join key shared by the CN sample lists and the annotation table. This was
# misspelled as `commom_col = "col"`, leaving `common_col` undefined on a fresh
# kernel; cn_map_list merges on "NWDID".
common_col = "NWDID"
map_raw_df = annotation
save_dir = load_dir
cohort_cn_map_summary = cn_map_list(cohort_list, cohort_dir_list, map_raw_df, common_col, save_dir)
cohort_cn_map_summary_dir = os.path.join("..", "data_summary")
cohort_cn_map_summary_filename = "cohort_cn_annotation_overlap_summary.tsv"
cohort_cn_map_summary_dir_filename = os.path.join(cohort_cn_map_summary_dir, cohort_cn_map_summary_filename)
cohort_cn_map_summary.to_csv(cohort_cn_map_summary_dir_filename, sep = "\t", header = True, index = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Overlap between (Overlap between CN samples and annotation files) and blood cell traits

In [94]:
# Join the annotated CN sample list of a cohort with the blood-cell-trait table
# on unique_subject_key and preview the result.
cohort_list = ["JHS", "OMG-SCD", "PharmHU", "REDS-III-Brazil", "SAS", "SOL", "walk-PHaSST", "WHI",
               "MESA", "GeneSTAR", "COPDGene", "CHS", "CARDIA", "BioMe", "ARIC"]
load_dir = os.path.join("..", "..", "cnv", "sample_list")
col_list = ["hemoglobin_mcnc_bld_1", "hematocrit_vfr_bld_1", "rbc_ncnc_bld_1", "mcv_entvol_rbc_1", "mch_entmass_rbc_1", "mchc_mcnc_rbc_1",
            "rdw_ratio_rbc_1", "neutrophil_ncnc_bld_1", "lymphocyte_ncnc_bld_1", "basophil_ncnc_bld_1", "eosinophil_ncnc_bld_1",
            "monocyte_ncnc_bld_1", "wbc_ncnc_bld_1", "pmv_entvol_bld_1", "platelet_ncnc_bld_1"]
pheno_dir = os.path.join("..", "raw_data", "phenotype")
pheno_filename = "coh03_pre.tsv"
pheno_dir_filename = os.path.join(pheno_dir, pheno_filename)
pheno = pd.read_csv(pheno_dir_filename, sep = "\t", header = 0, index_col = None)
# NOTE(review): only the first cohort (JHS) is processed; the [:1] slice looks
# like a leftover from exploration -- confirm before extending to all cohorts.
for cohort in cohort_list[:1]:
    cohort_filename = "%s_subject_id_cram.tsv"%cohort
    cohort_dir_filename = os.path.join(load_dir, cohort_filename)
    cohort_df = pd.read_csv(cohort_dir_filename, sep = "\t", header = 0, index_col = None)
    cohort_pheno = cohort_df.merge(pheno, on = "unique_subject_key", how = "inner")
# display() was called with no argument and therefore showed nothing; preview
# the merged table of the last processed cohort instead.
display(cohort_pheno.head())

Compare JHS phenotype and Annotated JHS

In [112]:
# Re-shape the JHS phenotype file into the column layout of the harmonized
# blood-cell-trait table (coh02_pre.tsv) and save it.
jhs_pheno_filename = "jhs_basic_phenotypes_05072019.txt"
jhs_pheno_dir = os.path.join("..", "raw_data", "phenotype")
jhs_pheno_dir_filename = os.path.join(jhs_pheno_dir, jhs_pheno_filename)
jhs_pheno = pd.read_csv(jhs_pheno_dir_filename, sep = "\t", header = 0, index_col = None)
print(jhs_pheno.shape)

jhs_anno_filename = "JHS_subject_id_cram.tsv"
jhs_anno_dir = os.path.join("..", "..", "cnv", "sample_list")
jhs_anno_dir_filename = os.path.join(jhs_anno_dir, jhs_anno_filename)
jhs_anno = pd.read_csv(jhs_anno_dir_filename, sep = "\t", header = 0, index_col = None)
print(jhs_anno.shape)

jhs_pheno_anno_raw = jhs_pheno.merge(jhs_anno, on = "NWDID", how = "inner")
print(jhs_pheno_anno_raw.shape)
print("So all the annotated JHS samples having phenotypes listed in the JHS phenotype file I used at the beginning.")

# These traits are written as all-missing columns for JHS so the output has the
# full set of trait columns.
for empty_col in ["wbc_ncnc_bld_1", "neutrophil_ncnc_bld_1", "lymphocyte_ncnc_bld_1",
                  "basophil_ncnc_bld_1", "eosinophil_ncnc_bld_1", "monocyte_ncnc_bld_1"]:
    jhs_pheno_anno_raw[empty_col] = jhs_pheno_anno_raw.shape[0] * [None]
jhs_pheno_anno_raw["cohort"] = jhs_pheno_anno_raw.shape[0] * ["JHS"]
col_list = ["hemoglobin_mcnc_bld_1", "hematocrit_vfr_bld_1", "rbc_ncnc_bld_1", "mcv_entvol_rbc_1", "mch_entmass_rbc_1", "mchc_mcnc_rbc_1",
            "rdw_ratio_rbc_1", "neutrophil_ncnc_bld_1", "lymphocyte_ncnc_bld_1", "basophil_ncnc_bld_1", "eosinophil_ncnc_bld_1",
            "monocyte_ncnc_bld_1", "wbc_ncnc_bld_1", "pmv_entvol_bld_1", "platelet_ncnc_bld_1", "lnHBA1C"]
# Work on an explicit copy of the column subset: renaming, adding and dropping
# columns in place on a slice is what raised SettingWithCopyWarning.
jhs_pheno_anno = (
    jhs_pheno_anno_raw[["subject_id", "unique_subject_key", "cohort", "age"] + col_list]
    .rename(columns = {"subject_id":"SUBJECT_ID"})
    .copy()
)
# The single JHS "age" column is used as the age at measurement of every trait.
for col in col_list:
    jhs_pheno_anno["age_at_%s"%col] = jhs_pheno_anno["age"]
jhs_pheno_anno = jhs_pheno_anno.drop(axis = 1, labels = "age")

# The harmonized table supplies the target column order; lnHBA1C and its age
# column are appended to it so both are part of that header.
pheno_dir = os.path.join("..", "raw_data", "phenotype")
pheno_filename = "coh02_pre.tsv"
pheno_dir_filename = os.path.join(pheno_dir, pheno_filename)
pheno = pd.read_csv(pheno_dir_filename, sep = "\t", header = 0, index_col = None)
pheno["lnHBA1C"] = pheno.shape[0] * [None]
pheno["age_at_lnHBA1C"] = pheno.shape[0] * [None]
pheno_header = list(pheno)
jhs_pheno_anno_copy = jhs_pheno_anno[pheno_header].copy()

jhs_pheno_anno_dir = os.path.join("..", "raw_data", "phenotype")
jhs_pheno_anno_filename = "jhs_basic_phenotypes_05072019_blood_cell_traits.tsv"
jhs_pheno_anno_dir_filename = os.path.join(jhs_pheno_anno_dir, jhs_pheno_anno_filename)
jhs_pheno_anno_copy.to_csv(jhs_pheno_anno_dir_filename, sep = "\t", header = True, index = False)

(3406, 36)
(3406, 3)
(3406, 38)
So all the annotated JHS samples having phenotypes listed in the JHS phenotype file I used at the beginning.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## 7. Concatenating Annotation (Proper Phenotype), Blood Cell Traits (Proper Adjustments), and eGFR

### 7.1 Concatenating Blood Cell Traits and eGFR

In [439]:
# Load the eGFR and blood-cell-trait tables, cast SUBJECT_ID to str in both, and
# write each table back to the file it was read from.
# NOTE(review): astype(str) presumably turns any missing SUBJECT_ID into the
# literal string "nan" in memory -- confirm there are none before merging on it.
egfr_filename = "egfr03_unique.tsv"
egfr_dir = os.path.join("..", "raw_data", "phenotype", "egfr_calculated")
egfr_dir_filename = os.path.join(egfr_dir, egfr_filename)
egfr = pd.read_csv(egfr_dir_filename, sep="\t", header=0, index_col=None)
egfr = egfr.assign(SUBJECT_ID=egfr["SUBJECT_ID"].astype(str))
egfr.to_csv(egfr_dir_filename, sep="\t", header=True, index=False)

btc_adad_filename = "btc01-coh03_adad01_noex.tsv"
btc_adad_dir = os.path.join("..", "raw_data", "phenotype")
btc_adad_dir_filename = os.path.join(btc_adad_dir, btc_adad_filename)
btc_adad = pd.read_csv(btc_adad_dir_filename, sep="\t", header=0, index_col=None)
btc_adad = btc_adad.assign(SUBJECT_ID=btc_adad["SUBJECT_ID"].astype(str))
btc_adad.to_csv(btc_adad_dir_filename, sep="\t", header=True, index=False)

In [440]:
# Combine eGFR with the blood-cell-trait table via the project helper
# map_annotation (outer merge pivoting on unique_subject_key). The two column
# lists name the identifier columns handed to the helper for each table --
# presumably to detect conflicts between them; TODO confirm against map_annotation.
pivot_col = "unique_subject_key"
merge_how = "outer"
pheno_col = ["unique_subject_key", "SUBJECT_ID", "cohort"]
annotation_col = ["unique_subject_key", "SUBJECT_ID", "cohort"]
pheno_prefix = "egfr03"
anno_prefix = "btc01-coh03_adad01-noex"
diff_save_dir = os.path.join("..", "data_summary")
mapped_save_dir = os.path.join("..", "raw_data", "phenotype")
btc_adad_egfr = map_annotation(
    egfr,
    btc_adad,
    pheno_col,
    annotation_col,
    pivot_col,
    merge_how,
    pheno_prefix,
    anno_prefix,
    mapped_save_dir,
    diff_save_dir,
)

### 7.2 Adding CKD, microcytosis, and anemia

In [443]:
# Derive three binary phenotypes with the project helpers
# one_condition_conversion / two_condition_conversion; each call returns a table
# that is fed into the next one.
# NOTE(review): the helpers are defined outside this notebook -- confirm the
# threshold direction and how missing values are coded.
# CKD from eGFR (EGFRCKDEPI) with threshold 60.
btc_adad_egfr_ckd = one_condition_conversion(btc_adad_egfr, "EGFRCKDEPI", 60, "CKD")
# Microcytosis from MCV with threshold 80.
btc_microcytosis_adad_egfr_ckd = one_condition_conversion(btc_adad_egfr_ckd, "mcv_entvol_rbc_1", 80, "microcytosis")
# Anemia from hemoglobin with two thresholds (13 and 12), presumably chosen by the "male" column -- TODO confirm.
btc02_adad_egfr_ckd = two_condition_conversion(btc_microcytosis_adad_egfr_ckd, "hemoglobin_mcnc_bld_1", 13, 12, "male", "anemia")

In [445]:
# Save the table that now carries the CKD, microcytosis and anemia columns.
btc02_adad_egfr_ckd_filename = "btc02-coh03_adad01-noex_egfr03-ckd.tsv"
btc02_adad_egfr_ckd_dir = os.path.join("..", "raw_data", "phenotype")
btc02_adad_egfr_ckd_dir_filename = os.path.join(btc02_adad_egfr_ckd_dir, btc02_adad_egfr_ckd_filename)
btc02_adad_egfr_ckd.to_csv(
    btc02_adad_egfr_ckd_dir_filename,
    sep="\t",
    header=True,
    index=False,
)

### 7.3 Concatenate Annotation (w DDIMER) and BCT-ADAD01_eGFR

In [446]:
# Load the two inputs of Section 7.3: the blood-cell-trait / eGFR / CKD table
# and the annotation table that carries DDIMER.
btc_adad_egfr_filename = "btc02-coh03_adad01-noex_egfr03-ckd.tsv"
btc_adad_egfr_dir = os.path.join("..", "raw_data", "phenotype")
btc_adad_egfr_dir_filename = os.path.join(btc_adad_egfr_dir, btc_adad_egfr_filename)
btc_adad_egfr = pd.read_csv(
    btc_adad_egfr_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
)

anno_ddimer_filename = "freeze8_anno04_af02_unique02_ddimer.tsv"
anno_ddimer_dir = os.path.join("..", "raw_data", "annotation")
anno_ddimer_dir_filename = os.path.join(anno_ddimer_dir, anno_ddimer_filename)
anno_ddimer = pd.read_csv(
    anno_ddimer_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
)

In [447]:
# Map the blood-cell-trait / eGFR table onto the annotation (with DDIMER) table
# via the project helper map_annotation (inner merge pivoting on
# unique_subject_key). pheno_col and annotation_col list, position by position,
# the phenotype column and the annotation column that are paired up --
# presumably so the helper can report disagreements; TODO confirm.
pivot_col = "unique_subject_key"
merge_how = "inner"
pheno_col = ["unique_subject_key", "SUBJECT_ID", "cohort", "male"]
annotation_col = ["unique_subject_key", "subject_id", "study", "sex"]
pheno_prefix = "btc02-coh03_ddimer-noex_egfr03-ckd_adad01-noex"
anno_prefix = "freeze8_anno04_af02"
diff_save_dir = os.path.join("..", "data_summary")
mapped_save_dir = os.path.join("..", "prepro_data", "phenotype")
anno_ddimer_btc_adad_egfr = map_annotation(
    btc_adad_egfr,
    anno_ddimer,
    pheno_col,
    annotation_col,
    pivot_col,
    merge_how,
    pheno_prefix,
    anno_prefix,
    mapped_save_dir,
    diff_save_dir,
)

### 7.4 Merging JHS blood cell traits and ddimer with processed results above

In [56]:
# Load the mapped phenotype table and the JHS phenotype table used to fill it.
pheno_noex_adad_noex_filename = "freeze8_anno04_af02_btc02-coh03_ddimer-noex_egfr03-ckd_adad01-noex.tsv"
pheno_noex_adad_noex_dir = os.path.join("..", "prepro_data", "phenotype")
pheno_noex_adad_noex_dir_filename = os.path.join(pheno_noex_adad_noex_dir, pheno_noex_adad_noex_filename)
pheno_noex_adad_noex = pd.read_csv(
    pheno_noex_adad_noex_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
)

jhs_useful_filename = "jhs_usefule_phenotype_05072019.tsv"
pheno_dir = os.path.join("..", "raw_data", "phenotype")
jhs_useful_dir_filename = os.path.join(pheno_dir, jhs_useful_filename)
jhs_pheno_useful = pd.read_csv(
    jhs_useful_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
)

In [57]:
# Combine the mapped phenotype table with the JHS phenotypes through the project
# helper merge_replace_nan, keyed on NWDID in both tables, and save the result.
# (Presumably the helper fills missing values from the JHS table -- TODO confirm.)
df0_col = df1_col = "NWDID"
pheno_noex_adad_noex_filljhs = merge_replace_nan(
    df0_col,
    df1_col,
    pheno_noex_adad_noex,
    jhs_pheno_useful,
)
pheno_noex_adad_noex_filljhs_filename = "freeze8_anno04_af02_btc02-coh03_ddimer-noex_egfr03-ckd_adad01-noex_filljhs.tsv"
pheno_noex_adad_noex_filljhs_dir = os.path.join("..", "prepro_data", "phenotype")
pheno_noex_adad_noex_filljhs_dir_filename = os.path.join(pheno_noex_adad_noex_filljhs_dir, pheno_noex_adad_noex_filljhs_filename)
pheno_noex_adad_noex_filljhs.to_csv(
    pheno_noex_adad_noex_filljhs_dir_filename,
    sep="\t",
    header=True,
    index=False,
)

### 7.5 Apply Exclusion Strategy on Mapped Phenotypes

In [448]:
# Load the mapped phenotype table on which the exclusion rules are applied.
pheno_noex_adad_noex_filename = "freeze8_anno04_af02_btc02-coh03_ddimer-noex_egfr03-ckd_adad01-noex.tsv"
pheno_noex_adad_noex_dir = os.path.join("..", "prepro_data", "phenotype")
pheno_noex_adad_noex_dir_filename = os.path.join(pheno_noex_adad_noex_dir, pheno_noex_adad_noex_filename)
pheno_noex_adad_noex = pd.read_csv(
    pheno_noex_adad_noex_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
)

#### 7.5.1 Gengrp6: excluding samples with CONSENT_text == DROP, regardless of the value of INTERNAL_USE_ONLY

In [450]:
# Keep every row whose CONSENT_text is not "DROP"; print the shape before and after.
print(pheno_noex_adad_noex.shape)
consent_not_dropped = pheno_noex_adad_noex["CONSENT_text"] != "DROP"
pheno_noex_adad = pheno_noex_adad_noex.loc[consent_not_dropped, :]
print(pheno_noex_adad.shape)

(67999, 58)
(67999, 58)


#### 7.5.2 DDIMER: excluding samples with sample_remove_DDIMER == TRUE

In [451]:
# Keep every row whose sample_remove_DDIMER flag is not 1 (rows with a missing
# flag are kept as well, since a missing value compares as "not equal").
ddimer_not_removed = pheno_noex_adad["sample_remove_DDIMER"] != 1
pheno_adad = pheno_noex_adad.loc[ddimer_not_removed, :]
print(pheno_adad.shape)

(67999, 58)


#### 7.5.3 Save Result

In [452]:
# Save the phenotype table left after both exclusion steps.
pheno_adad_filename = "freeze8_anno04_af02_btc03-coh03_egfr03-ckd_adad01.tsv"
pheno_adad_dir = os.path.join("..", "prepro_data", "phenotype")
pheno_adad_dir_filename = os.path.join(pheno_adad_dir, pheno_adad_filename)
pheno_adad.to_csv(
    pheno_adad_dir_filename,
    sep="\t",
    header=True,
    index=False,
)

# II. uPAR

uPAR is the name of the genes we are interested in, and APOL1 is the name of the diseases.

## 1. Convert eGFR ID to Annotation File

### 1.1 Keep AA and Hispanics for eGFR

In [54]:
# Load the eGFR table ("egfr03-01_unique") that is filtered in the next step.
egfr_unique_filename = "egfr03-01_unique.tsv"
egfr_dir = os.path.join("..", "raw_data", "phenotype", "egfr_calculated")
egfr_unique_dir_filename = os.path.join(egfr_dir, egfr_unique_filename)
egfr_unique = pd.read_csv(
    egfr_unique_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
)

In [55]:
# Keep only the eGFR samples that are AA or Hispanic, then save them.
# AA is 1 where race == 2 and 0 otherwise (presumably race code 2 = African
# American -- TODO confirm against the eGFR data dictionary).
egfr_unique["AA"] = (egfr_unique["race"] == 2).astype("int64")
egfr_sample_num = egfr_unique.shape[0]
# A sample is removed when both AA and ethnicity equal 0. The comparison is done
# on whole columns instead of looping over .loc[i] for i in range(n), which was
# slow and only correct when the index is exactly 0..n-1. Missing ethnicity
# compares as "not 0", so such rows are kept, as before.
not_aa_not_hs = (egfr_unique["AA"] == 0) & (egfr_unique["ethnicity"] == 0)
rm_idx_list = egfr_unique.index[not_aa_not_hs].tolist()
egfr = egfr_unique.drop(labels = rm_idx_list, axis = 0)
egfr_filename = "egfr03-01_unique_AA_HS.tsv"
egfr_dir_filename = os.path.join(egfr_dir, egfr_filename)
egfr.to_csv(egfr_dir_filename, sep = "\t", header = True, index = False)

### 1.2 Map eGFR to Annotation

In [56]:
# Load the AA/Hispanic eGFR table and the "unique02" freeze8 annotation table.
egfr_filename = "egfr03-01_unique_AA_HS.tsv"
egfr_dir = os.path.join("..", "raw_data", "phenotype", "egfr_calculated")
egfr_dir_filename = os.path.join(egfr_dir, egfr_filename)
egfr = pd.read_csv(
    egfr_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
)

annotation_filename = "freeze8_anno05_af02_unique02.tsv"
annotation_dir = os.path.join("..", "raw_data", "annotation")
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(
    annotation_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
)

In [None]:
# NOTE(review): this cell has never been executed (In [None]) and repeats the
# map_annotation call of Section 7.3 verbatim. It works on btc_adad_egfr and
# anno_ddimer rather than on the `egfr` / `annotation` tables loaded just above,
# so it looks like a copy-paste leftover -- confirm and remove. The mapping that
# belongs to this section is done in the next cell.
pheno_col = ["unique_subject_key", "SUBJECT_ID", "cohort", "male"]
annotation_col = ["unique_subject_key", "subject_id", "study", "sex"]
pheno_prefix = "btc02-coh03_ddimer-noex_egfr03-ckd_adad01-noex"
anno_prefix = "freeze8_anno04_af02"
mapped_save_dir = os.path.join("..", "prepro_data", "phenotype")
diff_save_dir = os.path.join("..", "data_summary")
pivot_col = "unique_subject_key"
merge_how = "inner"
anno_ddimer_btc_adad_egfr = map_annotation(btc_adad_egfr, anno_ddimer, pheno_col, annotation_col, pivot_col, merge_how,
                                           pheno_prefix, anno_prefix, mapped_save_dir, diff_save_dir)

In [69]:
# Map the AA/Hispanic eGFR table onto the freeze8 annotation via the project
# helper map_annotation (inner merge pivoting on unique_subject_key).
# pheno_col / annotation_col pair up, position by position, the eGFR column and
# the annotation column handed to the helper. An earlier variant of this cell
# paired only unique_subject_key and male/sex, without SUBJECT_ID/subject_id.
pivot_col = "unique_subject_key"
merge_how = "inner"
pheno_col = ["unique_subject_key", "male", "SUBJECT_ID"]
annotation_col = ["unique_subject_key", "sex", "subject_id"]
pheno_prefix = "egfr"
anno_prefix = "freeze8_anno05_af02_unique02"
diff_save_dir = os.path.join("..", "data_summary")
mapped_save_dir = os.path.join("..", "prepro_data", "phenotype")

egfr_mapped = map_annotation(
    egfr,
    annotation,
    pheno_col,
    annotation_col,
    pivot_col,
    merge_how,
    pheno_prefix,
    anno_prefix,
    mapped_save_dir,
    diff_save_dir,
)

## 2. Coding APOL1 Status

In [20]:
# Read the two SNP .raw files used for APOL1 coding into one table.
snp_id_list = ["rs71785313", "rs73885319"]
snp_ver = "freeze8"
snp_dir = os.path.join("..", "raw_data", "snp")
snp_filename_list = ["%s_%s.raw"%(snp_ver, snp_id) for snp_id in snp_id_list]
snp_dir_filename_list = [os.path.join(snp_dir, snp_filename) for snp_filename in snp_filename_list]
apol1_snp = read_snp_list_together(snp_dir_filename_list, snp_id_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snp_df.dropna(axis = 0, how = "any", inplace = True)


In [21]:
# Code APOL1 status: 1 when the two SNP columns sum to exactly 2, otherwise 0.
sample_num = apol1_snp.shape[0]
apol1_allele_sum = apol1_snp["rs71785313"] + apol1_snp["rs73885319"]
apol1 = pd.DataFrame({"NWDID": apol1_snp["NWDID"], "APOL1": apol1_allele_sum})
apol1_is_two = apol1["APOL1"] == 2
apol1.loc[~apol1_is_two, "APOL1"] = 0
apol1.loc[apol1_is_two, "APOL1"] = 1

In [23]:
# Write the APOL1 status table as a tab-separated file with a header row and no index column.
apol1_dir = os.path.join("..", "cohort", "APOL1")
apol1_filename = "APOL1_status.tsv"
apol1_dir_filename = os.path.join(apol1_dir, apol1_filename)
apol1.to_csv(apol1_dir_filename, sep="\t", index=False)

## 3. Correctness of rs334 freeze8 version

In [27]:
# Load the freeze6a and freeze8 rs334 tables from the same directory.
rs334_dir = os.path.join("..", "prepro_data", "snp")

rs334_freeze6a_filename = "snp_rs334_hetero.tsv"
rs334_freeze6a_dir_filename = os.path.join(rs334_dir, rs334_freeze6a_filename)
rs334_freeze6a = pd.read_csv(rs334_freeze6a_dir_filename, sep="\t")

rs334_freeze8_filename = "freeze8_rs334_hetero.tsv"
rs334_freeze8_dir_filename = os.path.join(rs334_dir, rs334_freeze8_filename)
rs334_freeze8 = pd.read_csv(rs334_freeze8_dir_filename, sep="\t")

# Samples present in both freezes (joined on NWDID only); saved for inspection.
rs334_base_NWDID = pd.merge(rs334_freeze6a, rs334_freeze8, on="NWDID", how="inner")
print(rs334_base_NWDID.shape)
rs334_base_NWDID_filename = "rs334_base_NWDID.tsv"
rs334_base_NWDID_dir_filename = os.path.join(rs334_dir, rs334_base_NWDID_filename)
rs334_base_NWDID.to_csv(rs334_base_NWDID_dir_filename, sep="\t", index=False)

# Samples whose NWDID *and* geno agree in both freezes; compare this shape with the one above.
rs334_base_NWDID_geno = pd.merge(rs334_freeze6a, rs334_freeze8, on=["NWDID", "geno"], how="inner")
print(rs334_base_NWDID_geno.shape)

(103646, 3)
(103645, 2)


## 4. rs399145

### 4.1 Preprocess Raw rs399145 Data

In [87]:
# Read the raw rs399145 genotype file through the shared SNP reader.
snp_id_list = ["rs399145"]
snp_ver = "freeze8"
snp_dir = os.path.join("..", "raw_data", "snp")
snp_filename_list = ["%s_%s.raw"%(snp_ver, snp_id) for snp_id in snp_id_list]
snp_dir_filename_list = [os.path.join(snp_dir, snp_filename) for snp_filename in snp_filename_list]
rs399145 = read_snp_list_together(snp_dir_filename_list, snp_id_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snp_df.dropna(axis = 0, how = "any", inplace = True)


In [89]:
# Save the rs399145 table; the file name is prefixed with the SNP version string.
rs399145_filename = "%s_rs399145_hetero.tsv"%snp_ver
rs399145_dir = os.path.join("..", "prepro_data", "snp")
rs399145_dir_filename = os.path.join(rs399145_dir, rs399145_filename)
rs399145.to_csv(rs399145_dir_filename, sep="\t", index=False)

### 4.2 (All Populations) Find Individuals with Two Copies of the rs399145 Minor Allele among rs334 Heterozygotes (rs334 = 1)

In [24]:
# Both preprocessed SNP tables live in the same directory.
snp_dir = os.path.join("..", "prepro_data", "snp")

rs399145_filename = "freeze8_rs399145_hetero.tsv"
rs334_filename = "freeze8_rs334_hetero.tsv"
rs399145_dir_filename = os.path.join(snp_dir, rs399145_filename)
rs334_dir_filename = os.path.join(snp_dir, rs334_filename)

rs399145 = pd.read_csv(rs399145_dir_filename, sep="\t")
rs334 = pd.read_csv(rs334_dir_filename, sep="\t")

In [35]:
# Join the two SNP tables per sample, then keep rows with rs334 == 1 and rs399145 == 2.
rs334_rs399145 = pd.merge(rs334, rs399145, on="NWDID", how="inner")
rs334_heter_rs399145 = rs334_rs399145[rs334_rs399145["rs334"] == 1]
rs334_heter_rs399145_homo = rs334_heter_rs399145[rs334_heter_rs399145["rs399145"] == 2]
print(rs334_heter_rs399145_homo.shape)

# Save the selected samples to the summary directory.
rs334_heter_rs399145_homo_filename = "freeze8_rs334-heter_rs399145-2.tsv"
rs334_heter_rs399145_homo_dir = os.path.join("..", "data_summary")
rs334_heter_rs399145_homo_dir_filename = os.path.join(rs334_heter_rs399145_homo_dir, rs334_heter_rs399145_homo_filename)
rs334_heter_rs399145_homo.to_csv(rs334_heter_rs399145_homo_dir_filename, sep="\t", index=False)

(52, 3)


In [36]:
# Repeat the rs334 == 1 / rs399145 == 2 selection on the already-merged table stored under ready_data.
rs334_rs399145_filename = "common_rs399145-rs334.tsv"
rs334_rs399145_dir = os.path.join("..", "cohort", "APOL1", "ready_data")
rs334_rs399145_dir_filename = os.path.join(rs334_rs399145_dir, rs334_rs399145_filename)
rs334_rs399145 = pd.read_csv(rs334_rs399145_dir_filename, sep="\t")

rs334_heter_rs399145 = rs334_rs399145[rs334_rs399145["rs334"] == 1]
rs334_heter_rs399145_homo = rs334_heter_rs399145[rs334_heter_rs399145["rs399145"] == 2]
print(rs334_heter_rs399145_homo.shape)

# Save under a separate "_used" file name so the all-population result is not overwritten.
rs334_heter_rs399145_homo_filename = "freeze8_rs334-heter_rs399145-2_used.tsv"
rs334_heter_rs399145_homo_dir = os.path.join("..", "data_summary")
rs334_heter_rs399145_homo_dir_filename = os.path.join(rs334_heter_rs399145_homo_dir, rs334_heter_rs399145_homo_filename)
rs334_heter_rs399145_homo.to_csv(rs334_heter_rs399145_homo_dir_filename, sep="\t", index=False)

(15, 4)


In [45]:
# Read five SNP .raw files with the per-file reader (one path per SNP id, same order).
snp_id_list = ["rs2302524", "rs2633317", "rs4251805", "rs4760", "rs73935023"]
snp_ver = "freeze8"
snp_dir = os.path.join("..", "raw_data", "snp")
snp_filename_list = ["%s_%s.raw"%(snp_ver, snp_id) for snp_id in snp_id_list]
snp_dir_filename_list = [os.path.join(snp_dir, snp_filename) for snp_filename in snp_filename_list]
snp_list = read_snp_list_each(snp_dir_filename_list, snp_id_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snp_df.dropna(axis = 0, how = "any", inplace = True)


In [59]:
# Load the raw harmonized DDIMER phenotype file (comma-separated).
ddimer_filename = "TOPMED_HarmonizedPhenotypes_DDIMER_21MAY2019.csv"
ddimer_dir = os.path.join("..", "raw_data", "phenotype")
ddimer_dir_filename = os.path.join(ddimer_dir, ddimer_filename)
ddimer_raw = pd.read_csv(ddimer_dir_filename, sep = ",", header = 0, index_col = None)

# Keep the needed columns, drop rows with any missing value among them, rename to the
# names used elsewhere in this notebook, and recode sex as M -> 1, F -> 0.
# Built as one non-inplace chain so no in-place operation is applied to a slice of ddimer_raw.
sex_dict = {"M":1, "F":0}
ddimer = (
    ddimer_raw.loc[:, ["sample.id", "STUDY", "sex", "AGE_DDIMER", "DDIMER", "sample_remove_DDIMER"]]
    .dropna(axis = "index", how = "any")
    .rename(columns = {"sample.id":"NWDID", "STUDY":"cohort", "AGE_DDIMER":"age_at_DDIMER"})
    .replace({"sex": sex_dict})
)

# Save the cleaned table next to the raw file (ddimer_filename now names the output file).
save_dir = os.path.join("..", "raw_data", "phenotype")
ddimer_filename = "DDIMER_21MAY2019_complete_useful_col.tsv"
ddimer_dir_filename = os.path.join(save_dir, ddimer_filename)
ddimer.to_csv(ddimer_dir_filename, sep = "\t", header = True, index = False)

# Report the duplicate count and only claim "no duplicates" when the count really is 0.
ddimer_dup_num = duplicates_num(ddimer, "NWDID")
print(ddimer_dup_num)
if ddimer_dup_num == 0:
    print("There is no duplicates in NWDID in DDIMER I processed.")
else:
    print("WARNING: duplicated NWDID values found in the processed DDIMER table: %s"%ddimer_dup_num)

0
There is no duplicates in NWDID in DDIMER I processed.


  interactivity=interactivity, compiler=compiler, result=result)


### Concatenate DDIMER to Annotation File

In [323]:
# Reload the cleaned DDIMER table.
ddimer_filename = "DDIMER_21MAY2019_complete_useful_col.tsv"
ddimer_dir = os.path.join("..", "raw_data", "phenotype")
ddimer_dir_filename = os.path.join(ddimer_dir, ddimer_filename)
ddimer = pd.read_csv(ddimer_dir_filename, sep="\t")

# Load the anno04 annotation table.
annotation_filename = "freeze8_anno04_af02_unique02.tsv"
annotation_dir = os.path.join("..", "raw_data", "annotation")
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep="\t")

In [338]:
# Settings for mapping DDIMER onto the annotation table, keyed on NWDID.
pivot_col = "NWDID"
merge_how = "left"

# Parallel column lists: entry i of pheno_col is paired with entry i of annotation_col.
pheno_col = ["NWDID", "sex", "cohort"]
annotation_col = ["NWDID", "sex", "study"]

pheno_prefix = "ddimer"
anno_prefix = "freeze8_anno04_af02_unique02"
diff_save_dir = os.path.join("..", "data_summary")
mapped_save_dir = os.path.join("..", "raw_data", "annotation")

ddimer_mapped = map_annotation(
    ddimer, annotation, pheno_col, annotation_col, pivot_col, merge_how,
    pheno_prefix, anno_prefix, mapped_save_dir, diff_save_dir,
)

sex_y is inconsistent.


## Appendix B. Preprocess weight, center and gengrp6

In [63]:
# Load the gengrp6 table; "." entries are treated as missing values.
gengrp6_filename = "page-harmonized-phenotypes-pca-freeze2-candidate2-2016-12-14.GWASid_fid_22May2018internalPCs.SOLv2consent.txt"
gengrp6_dir = os.path.join("..", "raw_data", "adjustment", "gengrp6")
gengrp6_dir_filename = os.path.join(gengrp6_dir, gengrp6_filename)
gengrp6 = pd.read_csv(gengrp6_dir_filename, sep="\t")
gengrp6 = gengrp6.replace(".", np.nan)

gengrp6_keep_col = ["z_sol_id", "analysis_id", "CONSENT_text", "INTERNAL_USE_ONLY", "gengrp6", "study"]

# Variant 1: require only analysis_id and gengrp6 to be present.
gengrp6_select = gengrp6[gengrp6_keep_col].dropna(axis=0, subset=["analysis_id","gengrp6"], how="any")
print(gengrp6_select.shape)

# Variant 2: require every kept column to be present, then rename to the phenotype-table names.
gengrp6_select_dropna = (
    gengrp6[gengrp6_keep_col]
    .dropna(axis=0, how="any")
    .rename(columns={"analysis_id":"SUBJECT_ID", "study":"cohort"})
)
print(gengrp6_select_dropna.shape)

# Summarise the cohort labels before overwriting them all with "HCHS_SOL".
cat_col = "cohort"
gengrp6_category_summary, category_list = categorize_df(gengrp6_select_dropna, cat_col)
display(gengrp6_category_summary)
gengrp6_select_dropna["cohort"] = "HCHS_SOL"
gengrp6_select_dropna = gengrp6_select_dropna.reset_index(drop=True)

(11829, 6)
(11829, 6)


Unnamed: 0,cohort,sample_size
0,SOL,11829


In [71]:
# Build unique_subject_key as "HCHS_SOL_<SUBJECT_ID>" for every row.
# Done with a single vectorised map instead of a row-by-row .loc loop: the loop was slow and,
# because .loc with an unknown label appends a row, it silently corrupted the table whenever
# the index was not exactly 0..n-1. The same "%s_%s" formatting is kept so keys are unchanged.
gengrp6_select_dropna_num = gengrp6_select_dropna.shape[0]
gengrp6_select_dropna["unique_subject_key"] = gengrp6_select_dropna["SUBJECT_ID"].map(
    lambda subject_id: "%s_%s"%("HCHS_SOL", subject_id)
)

# Save the keyed table as a tab-separated file.
gengrp6_select_dropna_dir = os.path.join("..", "raw_data", "phenotype")
gengrp6_select_dropna_filename = "gengrp6_sol_noexclude.tsv"
gengrp6_select_dropna_dir_filename = os.path.join(gengrp6_select_dropna_dir, gengrp6_select_dropna_filename)
gengrp6_select_dropna.to_csv(gengrp6_select_dropna_dir_filename, sep = "\t", header = True, index = False)

weight and center

In [72]:
# Load the weight/center table (comma-separated).
weight_center_filename = "bloodcell_output.csv"
weight_center_dir = os.path.join("..", "raw_data", "adjustment")
weight_center_dir_filename = os.path.join(weight_center_dir, weight_center_filename)
weight_center = pd.read_csv(weight_center_dir_filename, sep=",")

# Keep complete rows of the three needed columns; "ID" becomes "z_sol_id" so it can be joined later.
weight_center_select = (
    weight_center[["ID", "WEIGHT_FINAL_NORM_OVERALL", "CENTER"]]
    .dropna(axis=0, how="any")
    .rename(columns={"ID":"z_sol_id"})
)
print(weight_center_select.shape)

(16415, 3)


In [74]:
# Cast the join key to str in both tables (in place) so the merge compares like with like.
for z_sol_id_table in (gengrp6_select_dropna, weight_center_select):
    z_sol_id_table["z_sol_id"] = z_sol_id_table["z_sol_id"].astype(str)

# Outer-join on z_sol_id, then discard rows that have no SUBJECT_ID.
gengrp6_weight_center = pd.merge(gengrp6_select_dropna, weight_center_select, on="z_sol_id", how="outer")
gengrp6_weight_center = gengrp6_weight_center.dropna(axis=0, subset=["SUBJECT_ID"], how="any")
print(gengrp6_weight_center.shape)

# Save the combined table.
gengrp6_weight_center_filename = "gengrp6_center_weight_noex.tsv"
gengrp6_weight_center_dir = os.path.join("..", "raw_data", "phenotype")
gengrp6_weight_center_dir_filename = os.path.join(gengrp6_weight_center_dir, gengrp6_weight_center_filename)
gengrp6_weight_center.to_csv(gengrp6_weight_center_dir_filename, sep="\t", index=False)

(11829, 9)


### Concatenating gengrp6_weight_center table to phenotype table

In [347]:
# Load the phenotype table; SUBJECT_ID is forced to str.
btc_filename = "btc01-coh03.tsv"
btc_dir = os.path.join("..", "raw_data", "phenotype")
btc_dir_filename = os.path.join(btc_dir, btc_filename)
btc = pd.read_csv(btc_dir_filename, sep="\t")
btc["SUBJECT_ID"] = btc["SUBJECT_ID"].astype(str)
print(btc.shape)
print("duplicated unique_subject_key", duplicates_num(btc, "unique_subject_key"))

# Load the gengrp6/weight/center table with the same SUBJECT_ID treatment.
gengrp6_weight_center_filename = "gengrp6_center_weight_noex.tsv"
gengrp6_weight_center_dir = os.path.join("..", "raw_data", "phenotype")
gengrp6_weight_center_dir_filename = os.path.join(gengrp6_weight_center_dir, gengrp6_weight_center_filename)
gengrp6_weight_center = pd.read_csv(gengrp6_weight_center_dir_filename, sep="\t")
gengrp6_weight_center["SUBJECT_ID"] = gengrp6_weight_center["SUBJECT_ID"].astype(str)
print("duplicated unique_subject_key", duplicates_num(gengrp6_weight_center, "unique_subject_key"))

(192282, 35)
duplicated unique_subject_key 0
duplicated unique_subject_key 0


In [349]:
# Map gengrp6_weight_center onto btc, comparing "unique_subject_key", "SUBJECT_ID" and "cohort"
# between the two tables (the column names are identical on both sides).
pivot_col = "unique_subject_key"
merge_how = "left"
pheno_col = ["unique_subject_key", "SUBJECT_ID", "cohort"]
annotation_col = ["unique_subject_key", "SUBJECT_ID", "cohort"]

pheno_prefix = "adad01_noex"
anno_prefix = "btc01-coh03"
diff_save_dir = os.path.join("..", "data_summary")
mapped_save_dir = os.path.join("..", "raw_data", "phenotype")

btc_adad = map_annotation(
    gengrp6_weight_center, btc, pheno_col, annotation_col, pivot_col, merge_how,
    pheno_prefix, anno_prefix, mapped_save_dir, diff_save_dir,
)

## Appendix C. Preprocess eGFR

In [129]:
# Load the Jan2019 eGFR table (comma-separated).
cat_col = "Study"
egfr_old_filename = "data_all_2_eGFR_Jan2019.csv"
egfr_old_dir = os.path.join("..", "raw_data", "kidney_pheno")
egfr_old_dir_filename = os.path.join(egfr_old_dir, egfr_old_filename)
egfr_old = pd.read_csv(egfr_old_dir_filename, sep=",")

# Per-study summary: saved to data_summary and shown inline.
egfr_old_summary, category_list = categorize_df(egfr_old, cat_col)
egfr_old_summary_filename = "data_all_2_eGFR_Jan2019_category_summary.tsv"
egfr_old_summary_dir = os.path.join("..", "data_summary")
egfr_old_summary_dir_filename = os.path.join(egfr_old_summary_dir, egfr_old_summary_filename)
egfr_old_summary.to_csv(egfr_old_summary_dir_filename, sep="\t", index=False)
display(egfr_old_summary)

# Write one "<file_prefix>_<study>.tsv" file per study into the source directory.
file_prefix = "eGFR_Jan2019"
save_dir = egfr_old_dir
split_df(egfr_old, cat_col, category_list, file_prefix, save_dir)

Unnamed: 0,cohort,sample_size
8,ARIC,3555
9,Amish,1023
0,FHS,3087
5,GENOA,1023
3,GenSalt,1666
7,GeneSTAR,190
2,HyperGEN,1759
4,JHS,3115
6,MESA,4090
1,WHI,4224


1

In [29]:
# Compare, cohort by cohort, the Jan2019 ("old") eGFR files with the newly calculated ones.
egfr_old_dir = os.path.join("..", "raw_data", "kidney_pheno", "egfr_freeze5")
egfr_new_dir = os.path.join("..", "raw_data", "phenotype", "egfr_calculated")
cohort_list = ["AMISH", "ARIC", "FHS", "GENOA", "GenSalt", "GeneSTAR", "HyperGEN", "JHS", "MESA", "WHI"]
compare_egfr_col_list = ["# old not in new", "# new not in old", "# overlap", "# old", "# new", "cor", "# dup old", "# dup new"]
compare_egfr_df = pd.DataFrame(columns=compare_egfr_col_list, index=cohort_list)

for cohort in cohort_list:
    # Same key column and value columns for every cohort.
    common_col = "id"
    egfr_old_col = "EGFR"
    egfr_new_col = "EGFR"

    egfr_old_dir_filename = os.path.join(egfr_old_dir, "eGFR_Jan2019_%s.tsv"%cohort)
    egfr_new_dir_filename = os.path.join(egfr_new_dir, "egfr_calculated_%s.tsv"%cohort)
    egfr_old = pd.read_csv(egfr_old_dir_filename, sep="\t")
    egfr_new = pd.read_csv(egfr_new_dir_filename, sep="\t")
    egfr_old[egfr_old_col] = egfr_old[egfr_old_col].astype(float)
    egfr_new[egfr_new_col] = egfr_new[egfr_new_col].astype(float)

    df1_nodf2_num, df2_nodf1_num, overlap_num, df1_num, df2_num, cor = compare_df_pair(
        egfr_old, egfr_new, common_col, egfr_old_col, egfr_new_col)

    # One summary row per cohort, filled cell by cell.
    cohort_stats = {
        "# old not in new": df1_nodf2_num,
        "# new not in old": df2_nodf1_num,
        "# overlap": overlap_num,
        "# old": df1_num,
        "# new": df2_num,
        "cor": cor,
        "# dup old": duplicates_num(egfr_old, common_col),
        "# dup new": duplicates_num(egfr_new, common_col),
    }
    for stat_name, stat_value in cohort_stats.items():
        compare_egfr_df.loc[cohort, stat_name] = stat_value
    print("Cohort %s completed."%cohort)

display(compare_egfr_df)

# Save the summary (cohort names are kept as the index column).
compare_egfr_filename = "compare_egft_calculated_Jan2019_summary_nodup.tsv"
compare_egfr_dir = os.path.join("..", "data_summary")
compare_egfr_dir_filename = os.path.join(compare_egfr_dir, compare_egfr_filename)
compare_egfr_df.to_csv(compare_egfr_dir_filename, sep="\t", header=True, index=True)

Cohort AMISH completed.
Cohort ARIC completed.
Cohort FHS completed.
Cohort GENOA completed.
Cohort GenSalt completed.
Cohort GeneSTAR completed.
Cohort HyperGEN completed.
Cohort JHS completed.
Cohort MESA completed.
Cohort WHI completed.


Unnamed: 0,# old not in new,# new not in old,# overlap,# old,# new,cor,# dup old,# dup new
AMISH,0,95,1023,1023,1118,1,0,0
ARIC,0,11782,3555,3555,15337,1,0,0
FHS,-52,348,3139,3087,3487,1,0,58
GENOA,0,560,1023,1023,1583,1,0,0
GenSalt,0,180,1666,1666,1846,1,0,0
GeneSTAR,0,16,190,190,206,1,0,0
HyperGEN,0,125,1759,1759,1884,1,0,0
JHS,0,275,3115,3115,3390,1,0,0
MESA,0,2323,4090,4090,6413,1,0,0
WHI,0,468,4224,4224,4692,1,0,0


### **(Deprecated)** Replace whitespace-only entries in the MESA eGFR table with NaN and remove the affected rows

In [170]:
# Input (raw) and output (cleaned) MESA files share one directory.
load_dir = os.path.join("..", "raw_data", "phenotype", "egfr_useful")
egfr_new_dir_filename = os.path.join(load_dir, "egfr_calculated_MESA_raw.tsv")
egfr_new_mod_dir_filename = os.path.join(load_dir, "egfr_calculated_MESA.tsv")

# Turn empty / whitespace-only cells into NaN, then drop every row containing a NaN;
# the shape is displayed before and after the drop.
egfr_new = pd.read_csv(egfr_new_dir_filename, sep="\t")
egfr_new = egfr_new.replace(r'^\s*$', np.nan, regex=True)
display(egfr_new.shape)
egfr_new = egfr_new.dropna(axis=0, how="any")
display(egfr_new.shape)

egfr_new.to_csv(egfr_new_mod_dir_filename, sep="\t", index=False)

(6429, 4)

(6413, 4)

### MESA: Correlation between cepgfr1c and eGFR I calculated

In [87]:
# Load four MESA eGFR tables and pull out their "EGFR" columns as arrays.
freeze5_egfr_filename = "eGFR_Jan2019_MESA.tsv"
freeze5_egfr_dir = os.path.join("..", "raw_data", "kidney_pheno", "egfr_freeze5")
freeze5_egfr_dir_filename = os.path.join(freeze5_egfr_dir, freeze5_egfr_filename)
freeze5_egfr_df = pd.read_csv(freeze5_egfr_dir_filename, sep="\t")
freeze5_egfr = freeze5_egfr_df.loc[:, "EGFR"].values

cepgfr1c_filename = "TOPMed_kidney_phenotype_MESA_exam1.tsv"
cepgfr1c_dir = os.path.join("..", "raw_data", "egfr")
cepgfr1c_dir_filename = os.path.join(cepgfr1c_dir, cepgfr1c_filename)
cepgfr1c_df = pd.read_csv(cepgfr1c_dir_filename, sep="\t")
cepgfr1c = cepgfr1c_df.loc[:, "EGFR"].values

calegfr_filename = "TOPMed_MESA_RenalPhenotypes_16March2018_egfr.tsv"
calegfr_dir = os.path.join("..", "raw_data", "egfr")
calegfr_dir_filename = os.path.join(calegfr_dir, calegfr_filename)
calegfr_df = pd.read_csv(calegfr_dir_filename, sep="\t")
calegfr = calegfr_df.loc[:, "EGFR"].values

calegfr_mod_filename = "TOPMed_MESA_RenalPhenotypes_16March2018_egfr_mod.tsv"
calegfr_mod_dir = os.path.join("..", "raw_data", "egfr")
calegfr_mod_dir_filename = os.path.join(calegfr_mod_dir, calegfr_mod_filename)
calegfr_mod_df = pd.read_csv(calegfr_mod_dir_filename, sep="\t")
calegfr_mod = calegfr_mod_df.loc[:, "EGFR"].values

# Pearson correlation of cepgfr1c against each calculated version.
# NOTE(review): the arrays are compared position by position, so this presumably relies on the
# files listing the same samples in the same order -- confirm.
cor_cepgfr1c_calegfr = pearsonr(cepgfr1c, calegfr)[0]
cor_cepgfr1c_calegfr_mod = pearsonr(cepgfr1c, calegfr_mod)[0]
print(cor_cepgfr1c_calegfr, cor_cepgfr1c_calegfr_mod)

# Mean squared error for the same two pairs.
mse_cepgfr1c_calegfr = mean_squared_error(cepgfr1c, calegfr)
mse_cepgfr1c_calegfr_mod = mean_squared_error(cepgfr1c, calegfr_mod)
print(mse_cepgfr1c_calegfr)
print(mse_cepgfr1c_calegfr_mod)

0.9993978212689216 0.9952502226206945
3.394391478720293
33.29065562220411


In [86]:
# Give each table's EGFR column a distinct name (renamed in place) so they can sit side by side.
for egfr_table, egfr_label in ((freeze5_egfr_df, "EGFR_freeze5"),
                               (calegfr_df, "EGFR_calegfr"),
                               (calegfr_mod_df, "EGFR_calegfr_mod")):
    egfr_table.rename(columns={"EGFR": egfr_label}, inplace=True)

# Inner-merge the three tables on "id" and extract the aligned value arrays.
egfr_list = [freeze5_egfr_df, calegfr_df, calegfr_mod_df]
merge_egfr = merge_df_list(egfr_list, "id", merge_method='merge', how='inner')
common_freeze5_egfr = merge_egfr.loc[:, "EGFR_freeze5"].values
common_calegfr = merge_egfr.loc[:, "EGFR_calegfr"].values
common_calegfr_mod = merge_egfr.loc[:, "EGFR_calegfr_mod"].values

# Correlation of freeze5 against each calculated version.
cor_common_freeze5_calegfr = pearsonr(common_freeze5_egfr, common_calegfr)[0]
cor_common_freeze5_calegfr_mod = pearsonr(common_freeze5_egfr, common_calegfr_mod)[0]
print(cor_common_freeze5_calegfr, cor_common_freeze5_calegfr_mod)

# Mean squared error for the same two pairs.
mse_common_freeze5_calegfr = mean_squared_error(common_freeze5_egfr, common_calegfr)
mse_common_freeze5_calegfr_mod = mean_squared_error(common_freeze5_egfr, common_calegfr_mod)
print(mse_common_freeze5_calegfr)
print(mse_common_freeze5_calegfr_mod)

0.9959972669338772 0.9999999999999826
17.445292821842674
8.525105559709774e-12


In [85]:
# Load the freeze5 and cepgfr1c MESA tables, renaming each EGFR column on the way in.
freeze5_egfr_filename = "eGFR_Jan2019_MESA.tsv"
freeze5_egfr_dir = os.path.join("..", "raw_data", "kidney_pheno", "egfr_freeze5")
freeze5_egfr_dir_filename = os.path.join(freeze5_egfr_dir, freeze5_egfr_filename)
freeze5_egfr_df = pd.read_csv(freeze5_egfr_dir_filename, sep="\t").rename(columns={"EGFR":"EGFR_freeze5"})

cepgfr1c_filename = "TOPMed_kidney_phenotype_MESA_exam1.tsv"
cepgfr1c_dir = os.path.join("..", "raw_data", "egfr")
cepgfr1c_dir_filename = os.path.join(cepgfr1c_dir, cepgfr1c_filename)
cepgfr1c_df = pd.read_csv(cepgfr1c_dir_filename, sep="\t").rename(columns={"EGFR":"EGFR_cepgfr1c"})

# The key is called "id" in the freeze5 table and "sidno" in the cepgfr1c table.
freeze5_cepgfr1c = pd.merge(freeze5_egfr_df, cepgfr1c_df, left_on="id", right_on="sidno", how="inner")
common_freeze5_egfr = freeze5_cepgfr1c.loc[:, "EGFR_freeze5"].values
common_cepgfr1c = freeze5_cepgfr1c.loc[:, "EGFR_cepgfr1c"].values

# Agreement between the two on the shared samples.
cor_common_freeze5_cepgfr1c = pearsonr(common_freeze5_egfr, common_cepgfr1c)[0]
print(cor_common_freeze5_cepgfr1c)

mse_common_freeze5_cepgfr1c = mean_squared_error(common_freeze5_egfr, common_cepgfr1c)
print(mse_common_freeze5_cepgfr1c)

0.9949210599819995
33.930496703190656


### Process DHS Existing eGFR Table

In [114]:
# Load the DHS table and keep only rows where CKD_eGFR, Subject_ID and Study are all present.
dhs_filename = "phs001412_AF_20190705_nda.csv"
dhs_dir = os.path.join("..", "raw_data", "kidney_pheno", "dhs")
dhs_dir_filename = os.path.join(dhs_dir, dhs_filename)
dhs = pd.read_csv(dhs_dir_filename, sep=",")
dhs = dhs.dropna(axis=0, subset=["CKD_eGFR", "Subject_ID", "Study"], how="any")

check race composition

In [117]:
# Distinct values of the "Race" column, printed with their count.
race_value_list = dhs["Race"].values.tolist()
race_set = set(race_value_list)
print(len(race_set), race_set)

1 {'Black'}


In [129]:
# Rename DHS columns to the common eGFR-table names, then recode sex (F -> 0, M -> 1)
# and race ("Black" -> 2).
male_dict = {"F":0, "M":1}
race_dict = {"Black":2}
dhs = (
    dhs
    .rename(columns={"Subject_ID":"id", "Sex":"male", "Age":"age", "Race":"race", "CKD_eGFR":"EGFR"})
    .replace({"male":male_dict, "race":race_dict})
)

In [132]:
# Build the DHS table in the common eGFR column layout; columns not available in dhs
# ("scr", "case.control") stay at their 0.0 placeholder.
dhs_egfr_sample_num = dhs.shape[0]
dhs_egfr_col_list = ["id", "scr", "race", "age", "male", "EGFR", "case.control"]
dhs_egfr = pd.DataFrame(data = np.zeros((dhs_egfr_sample_num, len(dhs_egfr_col_list))), columns = dhs_egfr_col_list)

# Copy by position (.values), not by index label: dhs had rows removed by dropna without its
# index being reset, while dhs_egfr has a fresh 0..n-1 index. Assigning the Series directly would
# align on labels, leaving NaN for dropped labels and losing rows whose label is >= n.
for dhs_col in ["id", "race", "age", "male", "EGFR"]:
    dhs_egfr[dhs_col] = dhs[dhs_col].values

# Save as a tab-separated file.
dhs_egfr_dir = os.path.join("..", "raw_data", "egfr")
dhs_egfr_filename = "TopMed_Kidney_Phenotype_DHS_egfr.tsv"
dhs_egfr_dir_filename = os.path.join(dhs_egfr_dir, dhs_egfr_filename)
dhs_egfr.to_csv(dhs_egfr_dir_filename, sep = "\t", header = True, index = False)

### Resolving the negative "# old not in new" count for FHS

In [185]:
# Load the raw FHS eGFR table and report how many duplicates duplicates_num finds on "id".
load_dir = os.path.join("..", "raw_data", "phenotype", "egfr_useful")
egfr_new_dir_filename = os.path.join(load_dir, "egfr_calculated_FHS_raw.tsv")
egfr_new = pd.read_csv(egfr_new_dir_filename, sep = "\t", header = 0, index_col = None)
dup_num = duplicates_num(egfr_new, "id")
print(dup_num)

# Every row whose id occurs more than once (keep=False marks all occurrences), sorted by id so
# rows sharing an id are adjacent. sort_values is not applied in place, which avoids the
# SettingWithCopyWarning raised when sorting a slice of egfr_new in place.
boolean_series = egfr_new[["id"]].duplicated(keep=False)
egfr_new_dup = egfr_new.loc[boolean_series, :].sort_values(axis = 0, by = "id")
#display(egfr_new_dup)
duplicates_list = egfr_new_dup["id"].values.tolist()
print(len(duplicates_list))

# Only state that duplicates come in pairs after checking that each duplicated id occurs exactly twice.
if egfr_new_dup["id"].value_counts().eq(2).all():
    print("Total number of all duplicates are two times of number of samples with duplicates, so each sample with duplicate having only one duplicate.")
else:
    print("WARNING: at least one id occurs more than twice, so duplicated rows do not form simple pairs.")

58
116
Total number of all duplicates are two times of number of samples with duplicates, so each sample with duplicate having only one duplicate.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Now I need to check whether the eGFR values within each pair of duplicates are identical.

In [187]:
# Compare the EGFR values of consecutive rows (0-1, 2-3, ...) of egfr_new_dup.
# NOTE(review): this assumes egfr_new_dup is sorted by id and every duplicated id has exactly
# two rows, so that each consecutive pair belongs to one sample -- confirm upstream.
egfr_list = egfr_new_dup["EGFR"].values.tolist()
egfr_num = len(egfr_list)
if egfr_num % 2 != 0:
    # An unpaired trailing row would otherwise raise IndexError in the pairing below.
    print("WARNING: odd number of duplicated rows (%d); the last row has no partner and is not compared."%egfr_num)
duplicate_egfr_diff_list = [egfr_list[egfr_i + 1] - egfr_list[egfr_i] for egfr_i in range(0, egfr_num - 1, 2)]

# Sum absolute differences: a plain sum of signed differences can cancel to 0 even when
# individual pairs disagree.
print(sum(abs(duplicate_egfr_diff) for duplicate_egfr_diff in duplicate_egfr_diff_list))
if all(duplicate_egfr_diff == 0 for duplicate_egfr_diff in duplicate_egfr_diff_list):
    print("All egfr values of pairs of duplicates are identical, so I will drop any of them in each pair.")
else:
    print("WARNING: some pairs of duplicates have different egfr values; do not drop duplicates blindly.")

0.0
All egfr values of pairs of duplicates are identical, so I will drop any of them in each pair.


In [188]:
# Reload the raw FHS table, keep the first row of every id, and save the de-duplicated table.
load_dir = os.path.join("..", "raw_data", "phenotype", "egfr_useful")
egfr_new_dir_filename = os.path.join(load_dir, "egfr_calculated_FHS_raw.tsv")
egfr_new_nodup_dir_filename = os.path.join(load_dir, "egfr_calculated_FHS.tsv")

egfr_new = pd.read_csv(egfr_new_dir_filename, sep="\t")
egfr_new = egfr_new.drop_duplicates(subset="id", keep="first")
egfr_new.to_csv(egfr_new_nodup_dir_filename, sep="\t", index=False)

### Preprocess CHS

The CHS SCr file does not include sex information, which is needed to calculate CKD-eGFR, so I map the individual IDs to the annotation file to obtain it.

In [47]:
# Load the annotation table.
annotation_dir = os.path.join("..", "raw_data", "annotation")
annotation_filename = "freeze8_anno05_af02_unique02.tsv"
annotation_dir_filename = os.path.join(annotation_dir, annotation_filename)
annotation = pd.read_csv(annotation_dir_filename, sep="\t")

# Load the CHS kidney phenotype table; ID is forced to str.
chs_filename = "TOPMed_kidney_phenotype_CHS.txt"
chs_dir = os.path.join("..", "raw_data", "kidney_pheno", "chs")
chs_dir_filename = os.path.join(chs_dir, chs_filename)
chs = pd.read_csv(chs_dir_filename, sep="\t")
chs['ID'] = chs['ID'].astype(str)

In [49]:
# Two-column lookup (ID, MALE) taken from the annotation table.
# rename returns a new frame here; renaming a column slice of annotation in place is what
# triggered the SettingWithCopyWarning.
subject_id_sex = annotation[["subject_id", "sex"]].rename(columns = {"subject_id":"ID", "sex":"MALE"})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [50]:
# Attach MALE to the CHS rows by inner-joining on ID, then save beside the CHS input file.
# NOTE(review): subject_id_sex is built from the whole annotation table; if subject ids repeat
# across studies this join could pick up rows from other studies -- confirm.
chs_sex = pd.merge(chs, subject_id_sex, on="ID", how="inner")
chs_sex_dir = chs_dir
chs_sex_filename = "TOPMed_kidney_phenotype_CHS_sex.txt"
chs_sex_dir_filename = os.path.join(chs_sex_dir, chs_sex_filename)
chs_sex.to_csv(chs_sex_dir_filename, sep="\t", index=False)

### Preprocess CARDIA

In [31]:
# The three CARDIA input tables live in one directory; each is tab-separated with a header row.
cardia_dir = os.path.join("..", "raw_data", "kidney_pheno", "cardia")

cardia_dcc_filename = "cardia_dcc_demographic_v3.txt"
cardia_dcc_dir_filename = os.path.join(cardia_dir, cardia_dcc_filename)
cardia_dcc = pd.read_csv(cardia_dcc_dir_filename, sep="\t")

cardia_scr_filename = "cardia_scr_exam6.txt"
cardia_scr_dir_filename = os.path.join(cardia_dir, cardia_scr_filename)
cardia_scr = pd.read_csv(cardia_scr_dir_filename, sep="\t")

cardia_race_age_filename = "cardia_race_age_exam6.txt"
cardia_race_age_dir_filename = os.path.join(cardia_dir, cardia_race_age_filename)
cardia_race_age = pd.read_csv(cardia_race_age_dir_filename, sep="\t")

In [32]:
# Inner-merge the three CARDIA tables on "ID".
common_col = "ID"
cardia_dcc_scr_race_age_list = [cardia_dcc, cardia_scr, cardia_race_age]
cardia = merge_df_list(cardia_dcc_scr_race_age_list, common_col, merge_method='merge', how='inner')

# Recode MALE in place: "female" -> 0, then "male" -> 1.
for sex_label, sex_code in (("female", 0), ("male", 1)):
    cardia.loc[cardia["MALE"] == sex_label, "MALE"] = sex_code

In [33]:
# Save the merged CARDIA table into the CARDIA input directory.
cardia_filename = "cardia_exam6.txt"
cardia_dir_filename = os.path.join(cardia_dir, cardia_filename)
cardia.to_csv(cardia_dir_filename, sep="\t", index=False)

### Concatenating All eGFR Tables

In [51]:
# Concatenate the per-cohort eGFR tables, keeping the listed columns, and save the result.
header_selection = ["id", "age", "race", "male", "ethnicity", "EGFR"]
cohort_list = ["AMISH", "ARIC", "CARDIA", "CHS", "DHS", "FHS", "GENOA", "GenSalt", "GeneSTAR", "JHS", "MESA", "WHI", "HCHS-SOL", "HyperGEN"]
egfr_dir = os.path.join("..", "raw_data", "phenotype", "egfr_calculated")
egfr_full = concat_egfr(egfr_dir, cohort_list, header_selection)

egfr_full_filename = "egfr03-01.tsv"
egfr_full_dir_filename = os.path.join(egfr_dir, egfr_full_filename)
egfr_full.to_csv(egfr_full_dir_filename, sep="\t", index=False)

AMISH completed.
ARIC completed.
CARDIA completed.
CHS completed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


DHS completed.
FHS completed.
GENOA completed.
GenSalt completed.
GeneSTAR completed.
JHS completed.
MESA completed.
WHI completed.
HCHS-SOL completed.
HyperGEN completed.


### Removing and Recording Duplicates in Comprehensive eGFR Table

In [52]:
# Extract the duplicated rows (by unique_subject_key) and report how many there are.
egfr_dup = df_extraction_duplicates(egfr_full, "unique_subject_key")
print(egfr_dup.shape)

# Sort by key so repeated subjects sit together, then record them in data_summary.
egfr_dup = egfr_dup.sort_values(by=['unique_subject_key'])
egfr_dup_filename = "egfr03-01_dup.tsv"
egfr_dup_dir = os.path.join("..", "data_summary")
egfr_dup_dir_filename = os.path.join(egfr_dup_dir, egfr_dup_filename)
egfr_dup.to_csv(egfr_dup_dir_filename, sep="\t", index=False)

(214, 8)


In [53]:
# De-duplicate the full eGFR table with the project helper and save the result.
# invar_subsets is passed straight to remove_dup_anno; presumably these are columns expected to
# agree between duplicate rows -- verify against the helper.
invar_subsets = ["race", "male", "cohort"]
egfr_unique = remove_dup_anno(egfr_full, invar_subsets)

egfr_unique_filename = "egfr03-01_unique.tsv"
egfr_unique_dir_filename = os.path.join(egfr_dir, egfr_unique_filename)
egfr_unique.to_csv(egfr_unique_dir_filename, sep="\t", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Concatenating eGFR to phenotype table

In [30]:
# Load the phenotype table; SUBJECT_ID is forced to str.
pheno_filename = "coh03_pre.tsv"
pheno_dir = os.path.join("..", "raw_data", "phenotype")
pheno_dir_filename = os.path.join(pheno_dir, pheno_filename)
pheno = pd.read_csv(pheno_dir_filename, sep="\t")
pheno["SUBJECT_ID"] = pheno["SUBJECT_ID"].astype(str)

# Load the eGFR table with the same SUBJECT_ID treatment.
egfr_filename = "egfr01.tsv"
egfr_dir = os.path.join("..", "raw_data", "phenotype", "egfr_useful")
egfr_dir_filename = os.path.join(egfr_dir, egfr_filename)
egfr = pd.read_csv(egfr_dir_filename, sep="\t")
egfr["SUBJECT_ID"] = egfr["SUBJECT_ID"].astype(str)

# Outer-join on the three shared identifier columns, then print the merged shape and the
# duplicate count on unique_subject_key.
pheno_egfr = pd.merge(pheno, egfr, on=["SUBJECT_ID", "unique_subject_key", "cohort"], how="outer")
print(pheno_egfr.shape)
print(duplicates_num(pheno_egfr, "unique_subject_key"))

# Save the combined table.
pheno_egfr_dir = os.path.join("..", "raw_data", "phenotype")
pheno_egfr_filename = "coh03_pheno01_pre.tsv"
pheno_egfr_dir_filename = os.path.join(pheno_egfr_dir, pheno_egfr_filename)
pheno_egfr.to_csv(pheno_egfr_dir_filename, sep="\t", index=False)

(223838, 37)
0


In [59]:
def map_deprecation_list(pheno, annotation, pheno_col, annotation_col, pivot_col, pheno_prefix, anno_prefix, save_dir):
    """Compare every paired phenotype/annotation column and collect the samples to drop.

    `annotation` is outer-merged with `pheno` on the first entry of each column
    list. Every remaining (pheno_col[i], annotation_col[i]) pair is passed to
    `map_deprecation` together with the pivot column.

    Parameters
    ----------
    pheno, annotation : pandas.DataFrame
        Tables to compare.
    pheno_col, annotation_col : list of str
        Element 0 is the join key of the respective table; elements 1.. are
        the columns compared pairwise (paired positionally).
    pivot_col : str
        Column of the merged table that identifies a sample.
    pheno_prefix, anno_prefix, save_dir
        Forwarded unchanged to `map_deprecation`.

    Returns
    -------
    (list, pandas.DataFrame)
        The concatenation of the sample lists returned by `map_deprecation`,
        and the pivot column inner-merged with the per-column frames it returned.
    """
    # `annotation` is the left frame, so its key must be passed as left_on and the
    # phenotype key as right_on (same orientation as the merge in map_annotation).
    pheno_annotation_dif = annotation.merge(pheno, left_on = annotation_col[0], right_on = pheno_col[0], how = "outer")
    depre_sample_list = []
    pivot_df = pheno_annotation_dif[[pivot_col]]
    for pheno_col_i, annotation_col_i in zip(pheno_col[1:], annotation_col[1:]):
        if pheno_col_i == annotation_col_i:
            # Identically named non-key columns receive pandas' default merge suffixes:
            # "_x" for the left (annotation) frame, "_y" for the right (pheno) frame.
            annotation_col_i = "%s_x"%annotation_col_i
            pheno_col_i = "%s_y"%pheno_col_i
        col_dif = pheno_annotation_dif[[pivot_col, annotation_col_i, pheno_col_i]]
        tmp_depre_sample_list, df_col_i = map_deprecation(col_dif, pheno_col_i, annotation_col_i,
                                                          pivot_col, pheno_prefix, anno_prefix, save_dir)
        # Inner merge keeps only the samples returned for every compared column so far.
        pivot_df = pivot_df.merge(df_col_i, on = pivot_col, how = "inner")
        depre_sample_list = depre_sample_list + tmp_depre_sample_list
    return depre_sample_list, pivot_df

In [282]:
# Scratch check: apply the row-wise helper `func` to df1 and show the result as a frame with column 'A'.
# `func` and `df1` are not defined in this cell; both must already exist in the kernel.
df3 = pd.DataFrame(df1.apply(func, axis = 1), columns = ['A'])
display(df3)

Unnamed: 0,A
0,0.0
1,0.0
2,1.0
3,1.0
4,0.0
5,0.0
6,1.0
7,0.0
8,1.0
9,0.0


In [297]:
# Scratch check: toy frame with one missing value in column 'A' (row 10), displayed before and
# after column 0 is overwritten with the row-wise result of `func` (defined outside this cell).
df4 = pd.DataFrame({'A': [0, 0, 1, 1, 0, 0, 1, 1, 1, 0, None, 0], 'B': [0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0]})
display(df4)
df4.iloc[:, 0] = df4.apply(func, axis = 1)
display(df4)

Unnamed: 0,A,B
0,0.0,0
1,0.0,1
2,1.0,1
3,1.0,1
4,0.0,0
5,0.0,0
6,1.0,1
7,1.0,0
8,1.0,1
9,0.0,0


Unnamed: 0,A,B
0,0.0,0
1,0.0,1
2,1.0,1
3,1.0,1
4,0.0,0
5,0.0,0
6,1.0,1
7,1.0,0
8,1.0,1
9,0.0,0


In [298]:
# Index labels of the rows whose first column is still null; relies on df4 from the kernel state.
sample_lost = df4[df4.iloc[:, 0].isnull()].index.tolist()
print(sample_lost)

[]


In [46]:
# Scratch comparison of DataFrame.fillna(other) and DataFrame.combine_first(other) on toy data.
# df0 carries values for 'B'; df1 has an all-NaN 'B' column plus an extra column 'C'.
df0 = pd.DataFrame({'A': ["a", "g", "c", "d", "k"],
                    'B': [3, 4, 5, 6, 7]})
df1 = pd.DataFrame({'A': ["a", "b", "c", "d", "e", "f", "g", "h", "i"],
                    'B': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
                    'C': ["nice", "good", "exce", "ama", "awe", np.nan, "perf", "terri", "well"]})
display(df0)
display(df1)
# Index both frames by 'A' so that values are aligned on that key.
df1 = df1.set_index('A')
df0 = df0.set_index('A')
# In the displayed output fillna keeps only df1's index, while combine_first also
# adds the label that exists only in df0 ("k").
df3 = df1.fillna(df0)
df4 = df1.combine_first(df0)
display(df3)
display(df4)
# Turn 'A' back into a regular column.
df3 = df3.reset_index()
display(df3)

Unnamed: 0,A,B
0,a,3
1,g,4
2,c,5
3,d,6
4,k,7


Unnamed: 0,A,B,C
0,a,,nice
1,b,,good
2,c,,exce
3,d,,ama
4,e,,awe
5,f,,
6,g,,perf
7,h,,terri
8,i,,well


Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.0,nice
b,,good
c,5.0,exce
d,6.0,ama
e,,awe
f,,
g,4.0,perf
h,,terri
i,,well


Unnamed: 0_level_0,B,C
A,Unnamed: 1_level_1,Unnamed: 2_level_1
a,3.0,nice
b,,good
c,5.0,exce
d,6.0,ama
e,,awe
f,,
g,4.0,perf
h,,terri
i,,well
k,7.0,


Unnamed: 0,A,B,C
0,a,3.0,nice
1,b,,good
2,c,5.0,exce
3,d,6.0,ama
4,e,,awe
5,f,,
6,g,4.0,perf
7,h,,terri
8,i,,well


In [None]:
# Scratch cell with no execution count: inner merge of df0 and df1 on df0['B'] and df1['C'].
# NOTE(review): depends on whatever df0/df1 currently are in the kernel; with the toy frames
# defined in the cell above, 'B' is numeric and 'C' holds strings, so this merge probably fails
# or matches nothing -- confirm, or delete the cell.
df2 = df0.merge(df1, left_on = ['B'], right_on = ['C'], how = "inner")
display(df2)

In [58]:
def map_annotation(pheno, annotation, pheno_col, annotation_col, pivot_col, how_merge, pheno_prefix, anno_prefix, mapped_save_dir, diff_save_dir):
    """Merge `pheno` onto `annotation`, drop deprecated samples, and save the result.

    The two tables are merged on `annotation_col[0]` / `pheno_col[0]`. The remaining
    entries of the two lists are paired positionally; the paired columns are removed
    from the merged table. When `annotation_col` has more than one entry,
    `map_deprecation_list` is called: rows whose `pivot_col` value is in the sample
    list it returns are removed, and the table it returns is inner-merged back on
    `pivot_col`.

    Parameters
    ----------
    pheno, annotation : pandas.DataFrame
        Phenotype table (right side of the merge) and annotation table (left side).
    pheno_col, annotation_col : list of str
        Element 0 is the join key of the respective table; elements 1.. are the
        columns compared between the two tables.
    pivot_col : str
        Column identifying a sample in the merged table.
    how_merge : str
        Passed to `DataFrame.merge` as `how`.
    pheno_prefix, anno_prefix : str
        Used to build the output file name "<anno_prefix>_<pheno_prefix>.tsv";
        also forwarded to `map_deprecation_list`.
    mapped_save_dir : str
        Directory the mapped table is written to.
    diff_save_dir : str
        Forwarded to `map_deprecation_list` as its save directory.

    Returns
    -------
    pandas.DataFrame
        The mapped table (also written to disk as a tab-separated file).
    """
    # The parameter is `how_merge`; the previous body referenced an undefined `merge_how`.
    pheno_mapped = annotation.merge(pheno, left_on = annotation_col[0], right_on = pheno_col[0], how = how_merge)
    for col_i in range(1, len(pheno_col)):
        if pheno_col[col_i] != annotation_col[col_i]:
            paired_cols = [pheno_col[col_i], annotation_col[col_i]]
        else:
            # Identically named columns carry pandas' merge suffixes: "_x" (annotation), "_y" (pheno).
            paired_cols = ["%s_y"%pheno_col[col_i], "%s_x"%annotation_col[col_i]]
        pheno_mapped = pheno_mapped.drop(columns = paired_cols)
    if len(annotation_col) > 1:
        depre_sample_list, pivot_col_df = map_deprecation_list(pheno, annotation, pheno_col, annotation_col,
                                                               pivot_col, pheno_prefix, anno_prefix, diff_save_dir)
        if depre_sample_list:
            pheno_mapped = pheno_mapped[~pheno_mapped[pivot_col].isin(depre_sample_list)]
        # Re-attach the compared columns as returned by map_deprecation_list.
        pheno_mapped = pheno_mapped.merge(pivot_col_df, on = pivot_col, how = "inner")
    pheno_mapped_filename = "%s_%s.tsv"%(anno_prefix, pheno_prefix)
    pheno_mapped_dir_filename = os.path.join(mapped_save_dir, pheno_mapped_filename)
    pheno_mapped.to_csv(pheno_mapped_dir_filename, sep = "\t", header = True, index = False)
    return pheno_mapped

In [60]:
def map_deprecation(df, pheno_col, annotation_col, pivot_col, pheno_prefix, anno_prefix, save_dir):
    """Compare one phenotype column with its annotation counterpart.

    Both compared columns are first overwritten with the row-wise result of `func`
    (defined elsewhere in this notebook). Rows that still contain a missing value
    are dropped, and the remaining rows are split into consistent rows (the two
    columns are equal) and inconsistent rows. Inconsistent rows, if any, are
    written to "<anno_prefix>_<pheno_prefix>_<annotation_col>.tsv" in `save_dir`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `pivot_col`, `pheno_col` and `annotation_col`.
    pheno_col, annotation_col : str
        Names of the two columns to compare.
    pivot_col : str
        Column identifying a sample.
    pheno_prefix, anno_prefix : str
        Used to build the name of the inconsistency report.
    save_dir : str
        Directory the inconsistency report is written to.

    Returns
    -------
    (list, pandas.DataFrame)
        Pivot values of the samples to deprecate, and a two-column frame
        (`pivot_col` plus the annotation column, with a trailing "_x" removed
        from its name) restricted to the consistent rows.
    """
    df_pivot = df[[pivot_col]]
    # Work on an explicit copy: assigning into a selection of `df` raises
    # SettingWithCopyWarning.
    df_col = df[[pheno_col, annotation_col]].copy()
    # Order matters: the second call sees the already-updated first column.
    df_col.iloc[:, 0] = df_col.apply(func, axis = 1)
    df_col.iloc[:, 1] = df_col.apply(func, axis = 1)
    df = pd.concat([df_pivot, df_col], axis = 1)
    # NOTE(review): after the concat, column 0 is the pivot column, so this selects rows
    # with a missing pivot value rather than a missing compared value -- confirm intent.
    null_mask = df.iloc[:, 0].isnull()
    depre_sample_list = df.loc[null_mask, pivot_col].values.tolist()
    df = df.dropna(axis = 0, how = "any")
    is_consistent = df[pheno_col].eq(df[annotation_col], axis = 0)
    df_dif = df[~is_consistent]
    df_cons = df[is_consistent]
    if annotation_col.endswith("_x"):
        # Strip only the trailing merge suffix; splitting on "_x" would truncate a
        # name that contains "_x" earlier in the string.
        annotation_col_propagate = annotation_col[:-len("_x")]
        df_cons = df_cons.rename(columns = {annotation_col:annotation_col_propagate})
    else:
        annotation_col_propagate = annotation_col
    df_cons_propagate = df_cons.loc[:, [pivot_col, annotation_col_propagate]]
    if df_dif.shape[0] != 0:
        print("%s is inconsistent."%pheno_col)
        df_dif_filename = "%s_%s_%s.tsv"%(anno_prefix, pheno_prefix, annotation_col)
        df_dif_dir_filename = os.path.join(save_dir, df_dif_filename)
        df_dif.to_csv(df_dif_dir_filename, sep = "\t", header = True, index = False)
        # NOTE(review): this replaces (does not extend) the list built from the null
        # check above; behaviour kept as-is -- confirm whether both should be returned.
        depre_sample_list = df_dif[pivot_col].values.tolist()
    return depre_sample_list, df_cons_propagate

In [59]:
def map_deprecation_list(pheno, annotation, pheno_col, annotation_col, pivot_col, pheno_prefix, anno_prefix, save_dir):
    """Compare every paired phenotype/annotation column and collect the samples to drop.

    `annotation` is outer-merged with `pheno` on the first entry of each column
    list. Every remaining (pheno_col[i], annotation_col[i]) pair is passed to
    `map_deprecation` together with the pivot column.

    Parameters
    ----------
    pheno, annotation : pandas.DataFrame
        Tables to compare.
    pheno_col, annotation_col : list of str
        Element 0 is the join key of the respective table; elements 1.. are
        the columns compared pairwise (paired positionally).
    pivot_col : str
        Column of the merged table that identifies a sample.
    pheno_prefix, anno_prefix, save_dir
        Forwarded unchanged to `map_deprecation`.

    Returns
    -------
    (list, pandas.DataFrame)
        The concatenation of the sample lists returned by `map_deprecation`,
        and the pivot column inner-merged with the per-column frames it returned.
    """
    # `annotation` is the left frame, so its key must be passed as left_on and the
    # phenotype key as right_on (same orientation as the merge in map_annotation).
    pheno_annotation_dif = annotation.merge(pheno, left_on = annotation_col[0], right_on = pheno_col[0], how = "outer")
    depre_sample_list = []
    pivot_df = pheno_annotation_dif[[pivot_col]]
    for pheno_col_i, annotation_col_i in zip(pheno_col[1:], annotation_col[1:]):
        if pheno_col_i == annotation_col_i:
            # Identically named non-key columns receive pandas' default merge suffixes:
            # "_x" for the left (annotation) frame, "_y" for the right (pheno) frame.
            annotation_col_i = "%s_x"%annotation_col_i
            pheno_col_i = "%s_y"%pheno_col_i
        col_dif = pheno_annotation_dif[[pivot_col, annotation_col_i, pheno_col_i]]
        tmp_depre_sample_list, df_col_i = map_deprecation(col_dif, pheno_col_i, annotation_col_i,
                                                          pivot_col, pheno_prefix, anno_prefix, save_dir)
        # Inner merge keeps only the samples returned for every compared column so far.
        pivot_df = pivot_df.merge(df_col_i, on = pivot_col, how = "inner")
        depre_sample_list = depre_sample_list + tmp_depre_sample_list
    return depre_sample_list, pivot_df