In [1]:
%load_ext autoreload

In [12]:
import pandas as pd
import numpy as np
import os
import feather
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error 
import random
import copy

In [3]:
import sys
# Prepend the custom library directory to the module search path (only once),
# so it is searched before any other entry on sys.path.
# NOTE(review): presumably this is where the function_* helper modules imported
# in the next cell live; the path is absolute and machine-specific.
if "/proj/yunligrp/users/minzhi/custom_lib" not in sys.path:
    sys.path.insert(0, "/proj/yunligrp/users/minzhi/custom_lib")

In [4]:
from function_process_data_eqtl import *
from function_asso import *
from function_mesa_cca import *
%autoreload 2

### Loading Kinship Matrix for Later Usage

In [10]:
# Read the freeze8 kinship matrix from its feather file (timed with %time
# because the read is slow) and prepend a column holding the sample IDs.
predata_dir = os.path.join("..", "prepro_data", "kinship")
freeze8_kinship_filename = "freeze8_kinship.feather"
freeze8_kinship_dir_filename = os.path.join(predata_dir, freeze8_kinship_filename)
%time freeze8_kinship_df = pd.read_feather(freeze8_kinship_dir_filename, use_threads = True)
# list(DataFrame) yields the column labels, which are used as the sample IDs.
freeze8_sample_list = list(freeze8_kinship_df)
freeze8_sample_df = pd.DataFrame(data=freeze8_sample_list, columns=["NWDID"])
# Column-wise concat: "NWDID" becomes the first column, followed by the kinship columns.
# NOTE(review): assumes the matrix is square with rows in the same order as its
# columns, so row i is labelled with column i's ID — TODO confirm.
freeze8_kinship_df_ID = pd.concat(objs=[freeze8_sample_df, freeze8_kinship_df], axis=1)

CPU times: user 1min 56s, sys: 5min 6s, total: 7min 2s
Wall time: 6min 40s


## Load rs2302524, rs2633317, rs4251805, rs4760, rs73935023

In [53]:
# Pre-process the raw freeze8 genotype files for the five SNPs below; every
# SNP is tagged "whole" and uses column index 6 of its .raw file.
snp_ver = "freeze8"
snp_dir = os.path.join("..", "raw_data", "snp")
save_dir = os.path.join("..", "prepro_data", "snp")
snp_id_dict = dict.fromkeys(
    ["rs2302524", "rs2633317", "rs4251805", "rs4760", "rs73935023"],
    "whole",
)
# One column index per SNP, kept in step with the dict size.
snp_col_list = [6] * len(snp_id_dict)
# One raw file per SNP, named "<version>_<rsid>.raw", in dict order.
snp_dir_filename_list = [
    os.path.join(snp_dir, "%s_%s.raw" % (snp_ver, rsid))
    for rsid in snp_id_dict.keys()
]
snp_df_list = save_snp_dict_each(
    snp_dir_filename_list, snp_id_dict, snp_ver, save_dir, snp_col_list
)

## Load rs334, rs399145, rs11248850

In [11]:
# Pre-process the raw freeze8 genotype files for rs334 (tagged "hetero"),
# rs399145 and rs11248850 (both tagged "whole"); each uses column index 6.
snp_ver = "freeze8"
snp_dir = os.path.join("..", "raw_data", "snp")
save_dir = os.path.join("..", "prepro_data", "snp")
snp_id_dict = {
    "rs334": "hetero",
    "rs399145": "whole",
    "rs11248850": "whole",
}
# One column index per SNP, kept in step with the dict size.
snp_col_list = [6] * len(snp_id_dict)
# One raw file per SNP, named "<version>_<rsid>.raw", in dict order.
snp_dir_filename_list = [
    os.path.join(snp_dir, "%s_%s.raw" % (snp_ver, rsid))
    for rsid in snp_id_dict.keys()
]
snp_df_list = save_snp_dict_each(
    snp_dir_filename_list, snp_id_dict, snp_ver, save_dir, snp_col_list
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snp_df.dropna(axis = 0, how = "any", inplace = True)


## eGFR and APOL1 Status x Gen

### Preprocess

#### Adding CKD

In [7]:
# Load the eGFR phenotype table (tab-separated, first row is the header).
pheno_dir = os.path.join("..", "prepro_data", "phenotype")
pheno_filename = "freeze8_anno05_af02_unique02_egfr.tsv"
pheno_dir_filename = os.path.join(pheno_dir, pheno_filename)
pheno = pd.read_csv(pheno_dir_filename, sep = "\t", header = 0, index_col = None)
# Derive the "CKD" column from "EGFRCKDEPI" using the value 60.
# NOTE(review): presumably 60 is a threshold on eGFR; the comparison direction
# is defined inside one_condition_conversion — confirm there.
pheno_ckd = one_condition_conversion(pheno, "EGFRCKDEPI", 60, "CKD")

In [9]:
# Persist the phenotype table with the added CKD column as a tab-separated
# file (header row written, index omitted) next to the input phenotype file.
pheno_ckd_dir = os.path.join("..", "prepro_data", "phenotype")
pheno_ckd_filename = "freeze8_anno05_af02_unique02_egfr-ckd.tsv"
pheno_ckd_dir_filename = os.path.join(pheno_ckd_dir, pheno_ckd_filename)
pheno_ckd.to_csv(pheno_ckd_dir_filename, sep = "\t", header = True, index = False)

#### Splitting into Each Cohort

##### egfr-ckd

In [46]:
# Load the eGFR/CKD phenotype table from disk (tab-separated, first row is the header).
egfr_dir = os.path.join("..", "prepro_data", "phenotype")
egfr_filename = "freeze8_anno05_af02_unique02_egfr-ckd.tsv"
egfr_dir_filename = os.path.join(egfr_dir, egfr_filename)
egfr = pd.read_csv(egfr_dir_filename, sep = "\t", header = 0, index_col = None)

In [47]:
# Split the eGFR/CKD phenotype table by the "study" column via df_splitter.
# The annotation table and the table being split are the same object here
# (both names are bound to egfr), merged on "NWDID" on both sides.
anno = egfr
df = egfr
cat_col = "study"
anno_merge_col_list = ["NWDID"]
df_merge_col_list = ["NWDID"]
# NOTE(review): presumably also produces a combined "all_cohorts" output; later
# cells read files named "<prefix>_all_cohorts.tsv" — confirm in df_splitter.
all_cohorts = True
save_filename_prefix = "freeze8_anno05_af02_unique02_egfr-ckd"
save_dir_root = os.path.join("..", "apol1")
save_dir_base = "pre_data"
# The return value is kept as cohort_list (later cells overwrite this name).
cohort_list = df_splitter(anno, df, cat_col, anno_merge_col_list, df_merge_col_list,
                          all_cohorts, save_filename_prefix, save_dir_root, save_dir_base)

For "all_cohorts", the two files should contain the same dataframe, but they have different sizes and load as two different dataframes (`df1.equals(df2) == False`).

In [25]:
# Load the two "all_cohorts" phenotype files from pre_data (one without and one
# with the "_all_cohorts" suffix) into df1 and df2 so they can be compared.
# NOTE(review): both paths are absolute and machine-specific, unlike the
# relative paths used in the other cells.
df1_dir_filename = "/proj/yunligrp/users/minzhi/asso/apol1/all_cohorts/pre_data/freeze8_anno05_af02_unique02_egfr-ckd.tsv"
df2_dir_filename = "/proj/yunligrp/users/minzhi/asso/apol1/all_cohorts/pre_data/freeze8_anno05_af02_unique02_egfr-ckd_all_cohorts.tsv"
df1 = pd.read_csv(df1_dir_filename, sep = "\t", header = 0, index_col = None)
df2 = pd.read_csv(df2_dir_filename, sep = "\t", header = 0, index_col = None)

##### APOL1 Status

In [43]:
# Load the APOL1 status table (tab-separated, first row is the header).
apol1_dir = os.path.join("..", "prepro_data", "apol1")
apol1_filename = "APOL1_status.tsv"
apol1_dir_filename = os.path.join(apol1_dir, apol1_filename)
apol1 = pd.read_csv(apol1_dir_filename, sep = "\t", header = 0, index_col = None)

In [44]:
# Load the eGFR/CKD phenotype table as the annotation table; the next cell
# passes it to df_splitter together with the APOL1 status table.
anno_dir = os.path.join("..", "prepro_data", "phenotype")
anno_filename = "freeze8_anno05_af02_unique02_egfr-ckd.tsv"
anno_dir_filename = os.path.join(anno_dir, anno_filename)
anno = pd.read_csv(anno_dir_filename, sep = "\t", header = 0, index_col = None)

In [45]:
# Split the APOL1 status table by the "study" column via df_splitter, merging
# with the annotation table on "NWDID" on both sides.
# `anno` is not assigned in this cell: it must already exist in the kernel
# (the cell that loads the egfr-ckd phenotype file into `anno` has to run first).
df = apol1
cat_col = "study"
anno_merge_col_list = ["NWDID"]
df_merge_col_list = ["NWDID"]
# NOTE(review): presumably also produces a combined "all_cohorts" output; later
# cells read "apol1_all_cohorts.tsv"-style names — confirm in df_splitter.
all_cohorts = True
save_filename_prefix = "apol1"
save_dir_root = os.path.join("..", "apol1")
save_dir_base = "pre_data"
cohort_list = df_splitter(anno, df, cat_col, anno_merge_col_list, df_merge_col_list,
                          all_cohorts, save_filename_prefix, save_dir_root, save_dir_base)

## All Cohorts Together vs. Multiple SNPs

### Common Samples

In [90]:
# For the pooled "all_cohorts" set, run the common-sample step once per SNP.
# Each SNP gets its own output directory "ready_data_<rsid>".
status_name = "apol1"
cohort_list = ["all_cohorts"]
freeze_ver = "freeze8"
pc_num = 11
common_col = "NWDID"
# One single-entry dict per SNP, so every SNP is processed on its own.
snp_id_dict_list = [
    {rsid: "whole"}
    for rsid in ("rs2302524", "rs2633317", "rs4251805", "rs4760", "rs73935023")
]
pheno_prefix = "freeze8_anno05_af02_unique02_egfr-ckd"

for cohort in cohort_list:
    # Per-cohort input files share the "<prefix>_<cohort>.tsv" naming scheme.
    pheno_filename = "%s_%s.tsv" % (pheno_prefix, cohort)
    status_filename = "%s_%s.tsv" % (status_name, cohort)
    for snp_id_dict in snp_id_dict_list:
        snp_list = list(snp_id_dict)
        snp_id_string = "_".join(snp_list)
        save_dir = os.path.join("..", status_name, cohort, "ready_data_%s" % snp_id_string)
        # Genotype files are named "<freeze>_<rsid>_<type>.tsv", in dict order.
        snp_filename_list = [
            "%s_%s_%s.tsv" % (freeze_ver, rsid, genotype_kind)
            for rsid, genotype_kind in snp_id_dict.items()
        ]
        common_gene_status_pc_pheno_kinship(
            status_name, cohort, freeze_ver, pc_num, common_col, freeze8_kinship_df_ID,
            snp_filename_list, snp_list, pheno_filename, status_filename, save_dir,
        )
        print("%s common samples found."%cohort)

(23885, 23886)
all_cohorts common samples found.
(23885, 23886)
all_cohorts common samples found.
(23885, 23886)
all_cohorts common samples found.
(23885, 23886)
all_cohorts common samples found.
(23885, 23886)
all_cohorts common samples found.


In [15]:
def table_univar_interact(cohort, gen_name, status_name, load_dir, common_col="NWDID"):
    """Build the status-by-genotype table for a single genotype file.

    Reads ``common_<status_name>.tsv`` and ``common_<gen_name>.tsv`` from
    ``load_dir`` (tab-separated, first row as header) and passes both tables
    to ``genxgen``, with ``load_dir`` also used as the save directory.

    Parameters
    ----------
    cohort : str
        Cohort label. Kept for interface compatibility; not used in the body.
    gen_name : str
        Name of the genotype table; selects ``common_<gen_name>.tsv``.
    status_name : str
        Name of the status table; selects ``common_<status_name>.tsv``.
    load_dir : str
        Directory that holds the input files and receives ``genxgen`` output.
    common_col : str, optional
        Column name forwarded to ``genxgen`` as its common-column argument.
        Defaults to ``"NWDID"``, the value used throughout this notebook.
        The function used to read this from a notebook-level global, which
        fails on a fresh kernel unless another cell has defined it first.

    Returns
    -------
    object
        Whatever ``genxgen`` returns (previously computed and discarded).
    """
    save_dir = load_dir

    status_dir_filename = os.path.join(load_dir, "common_%s.tsv" % status_name)
    status = pd.read_csv(status_dir_filename, sep="\t", header=0, index_col=None)

    gen_dir_filename = os.path.join(load_dir, "common_%s.tsv" % gen_name)
    gen = pd.read_csv(gen_dir_filename, sep="\t", header=0, index_col=None)

    return genxgen(status, gen, status_name, gen_name, common_col, save_dir)

In [122]:
# For the pooled "all_cohorts" set, build the APOL1-by-SNP table for each SNP
# and then prepare the matrices for both phenotypes.
status_name = "apol1"
cohort_list = ["all_cohorts"]
common_col = "NWDID"
# One single-entry dict per SNP; must stay in the same order as table_dict_list
# because the two lists are paired with zip() below.
snp_id_dict_list = [{"rs2302524":"whole"}, {"rs2633317":"whole"}, {"rs4251805":"whole"}, {"rs4760":"whole"}, {"rs73935023":"whole"}]

phenotype_list = ["EGFRCKDEPI", "CKD"]
# Per SNP: a table name mapped to the terms it contains (the SNP itself and
# the "apol1-<rsid>" term).
table_dict_list = [{"rs2302524":["rs2302524", "apol1-rs2302524"]}, {"rs2633317":["rs2633317", "apol1-rs2633317"]},
                   {"rs4251805":["rs4251805", "apol1-rs4251805"]}, {"rs4760":["rs4760", "apol1-rs4760"]},
                   {"rs73935023":["rs73935023", "apol1-rs73935023"]}]
# NOTE(review): presumably "quan"/"cati" list quantitative and categorical
# adjustment columns — confirm in wrap_prepare_matrix_pheno_adad_in_pheno.
adad_dict = {"quan":[], "cati":["AA", "ethnicity", "study"]}

for cohort in cohort_list:
    for table_dict, snp_id_dict in zip(table_dict_list, snp_id_dict_list):
        snp_id_list = list(snp_id_dict.keys())
        snp_id_string = "_".join(snp_id_list)
        
        # Same "ready_data_<rsid>" directory that the common-sample step wrote to.
        load_dir = os.path.join("..", status_name, cohort, "ready_data_%s"%snp_id_string)
        gen_name = snp_id_string
        table_univar_interact(cohort, gen_name, status_name, load_dir)
        wrap_prepare_matrix_pheno_adad_in_pheno(status_name, phenotype_list, table_dict, load_dir, adad_dict)
    # Printed once per cohort, after all SNPs of that cohort are done.
    print("%s completed."%cohort)

rs2302524 completed.
rs2633317 completed.
rs4251805 completed.
rs4760 completed.
rs73935023 completed.
all_cohorts completed.


## Each Cohort vs. rs334, rs399145, and rs11248850

### For each cohort

In [125]:
# Take the cohort names from the "study" column of the eGFR/CKD phenotype
# table, then run the common-sample step per cohort for the three SNPs
# together (a single output directory per cohort).
egfr_dir = os.path.join("..", "prepro_data", "phenotype")
egfr_filename = "freeze8_anno05_af02_unique02_egfr-ckd.tsv"
egfr_dir_filename = os.path.join(egfr_dir, egfr_filename)
egfr = pd.read_csv(egfr_dir_filename, sep="\t", header=0, index_col=None)
_, cohort_list = categorize_df(egfr, "study")

status_name = "apol1"
freeze_ver = "freeze8"
pc_num = 11
common_col = "NWDID"
# A single dict holding all three SNPs, so they are processed as one group.
snp_id_dict_list = [
    {"rs334": "hetero", "rs399145": "whole", "rs11248850": "whole"},
]
pheno_prefix = "freeze8_anno05_af02_unique02_egfr-ckd"

for cohort in cohort_list:
    # Per-cohort input files share the "<prefix>_<cohort>.tsv" naming scheme.
    pheno_filename = "%s_%s.tsv" % (pheno_prefix, cohort)
    status_filename = "%s_%s.tsv" % (status_name, cohort)
    for snp_id_dict in snp_id_dict_list:
        snp_list = list(snp_id_dict)
        snp_id_string = "_".join(snp_list)
        save_dir = os.path.join("..", status_name, cohort, "ready_data_%s" % snp_id_string)
        # Genotype files are named "<freeze>_<rsid>_<type>.tsv", in dict order.
        snp_filename_list = [
            "%s_%s_%s.tsv" % (freeze_ver, rsid, genotype_kind)
            for rsid, genotype_kind in snp_id_dict.items()
        ]
        common_gene_status_pc_pheno_kinship(
            status_name, cohort, freeze_ver, pc_num, common_col, freeze8_kinship_df_ID,
            snp_filename_list, snp_list, pheno_filename, status_filename, save_dir,
        )
        print("%s common samples found."%cohort)

(375, 376)
DHS common samples found.
(1612, 1613)
WHI common samples found.
(709, 710)
CHS common samples found.
(1682, 1683)
ARIC common samples found.
(3132, 3133)
JHS common samples found.
(1090, 1091)
MESA common samples found.
(7708, 7709)
HCHS_SOL common samples found.
(205, 206)
GeneSTAR common samples found.
(1844, 1845)
HyperGEN common samples found.
(1091, 1092)
GENOA common samples found.
(3413, 3414)
FHS common samples found.
(1014, 1015)
CARDIA common samples found.


In [19]:
def table_apol1(cohort, gen_name_list, status_name, common_col, load_dir):
    """Build status-by-genotype and genotype-by-genotype tables.

    Reads ``common_<status_name>.tsv`` and one ``common_<gen_name>.tsv`` per
    entry of ``gen_name_list`` from ``load_dir`` (tab-separated, first row as
    header). It then calls ``genxgen`` for:

    1. the status table against every genotype table, in list order;
    2. every unordered pair of genotype tables, in the order
       (0, 1), (0, 2), ..., (1, 2), ...

    The pairwise step works for any number of genotype tables. The previous
    version hard-coded indices 0, 1 and 2, so it raised ``IndexError`` with
    fewer than three names and silently skipped pairs with more than three.
    For exactly three names the calls and their order are unchanged.

    Parameters
    ----------
    cohort : str
        Cohort label. Kept for interface compatibility; not used in the body.
    gen_name_list : list of str
        Names of the genotype tables; each selects ``common_<name>.tsv``.
    status_name : str
        Name of the status table; selects ``common_<status_name>.tsv``.
    common_col : str
        Column name forwarded to ``genxgen`` as its common-column argument.
    load_dir : str
        Directory that holds the input files and receives ``genxgen`` output.

    Returns
    -------
    None
    """
    # Local import so the function does not depend on a wildcard import
    # happening to expose `combinations`.
    from itertools import combinations

    save_dir = load_dir
    status_dir_filename = os.path.join(load_dir, "common_%s.tsv" % status_name)
    status = pd.read_csv(status_dir_filename, sep="\t", header=0, index_col=None)

    gen_list = []
    for gen_name in gen_name_list:
        gen_dir_filename = os.path.join(load_dir, "common_%s.tsv" % gen_name)
        gen = pd.read_csv(gen_dir_filename, sep="\t", header=0, index_col=None)
        genxgen(status, gen, status_name, gen_name, common_col, save_dir)
        gen_list.append(gen)

    # combinations() yields pairs in lexicographic index order, which matches
    # the original (0,1), (0,2), (1,2) sequence for three genotype tables.
    named_gens = list(zip(gen_name_list, gen_list))
    for (name_a, gen_a), (name_b, gen_b) in combinations(named_gens, 2):
        genxgen(gen_a, gen_b, name_a, name_b, common_col, save_dir)

In [22]:
# Take the cohort names from the "study" column of the eGFR/CKD phenotype
# table, then build the APOL1/SNP tables and prepare the matrices per cohort.
egfr_dir = os.path.join("..", "prepro_data", "phenotype")
egfr_filename = "freeze8_anno05_af02_unique02_egfr-ckd.tsv"
egfr_dir_filename = os.path.join(egfr_dir, egfr_filename)
egfr = pd.read_csv(egfr_dir_filename, sep = "\t", header = 0, index_col = None)
_, cohort_list = categorize_df(egfr, "study")

status_name = "apol1"
common_col = "NWDID"
# A single dict holding all three SNPs; paired with table_dict_list via zip() below.
snp_id_dict_list = [{"rs334":"hetero", "rs399145":"whole", "rs11248850":"whole"}]

phenotype_list = ["EGFRCKDEPI", "CKD"]
# One table, "table_apol1", listing the three SNPs, the three "apol1-<rsid>"
# terms and the three SNP-pair terms.
table_dict_list = [{"table_apol1":["rs334", "rs399145", "rs11248850", "apol1-rs334", "apol1-rs399145", "apol1-rs11248850",
                                   "rs334-rs399145", "rs334-rs11248850", "rs399145-rs11248850"]}]
# NOTE(review): presumably "quan"/"cati" list quantitative and categorical
# adjustment columns — confirm in wrap_prepare_matrix_pheno_adad_in_pheno.
# Unlike the all-cohorts run, "study" is not listed here.
adad_dict = {"quan":[], "cati":["AA", "ethnicity"]}

for cohort in cohort_list:
    for table_dict, snp_id_dict in zip(table_dict_list, snp_id_dict_list):
        snp_id_list = list(snp_id_dict.keys())
        snp_id_string = "_".join(snp_id_list)
        
        # Same "ready_data_<rsid>_<rsid>_<rsid>" directory the common-sample step wrote to.
        load_dir = os.path.join("..", status_name, cohort, "ready_data_%s"%snp_id_string)
        # gen_name is assigned but not used again in this cell; table_apol1
        # receives the list of SNP IDs instead.
        gen_name = snp_id_string
        table_apol1(cohort, snp_id_list, status_name, common_col, load_dir)
        wrap_prepare_matrix_pheno_adad_in_pheno(status_name, phenotype_list, table_dict, load_dir, adad_dict)
    # Printed once per cohort, after its table has been processed.
    print("%s completed."%cohort)

table_apol1 completed.
GeneSTAR completed.
table_apol1 completed.
GENOA completed.
table_apol1 completed.
HyperGEN completed.
table_apol1 completed.
WHI completed.
table_apol1 completed.
ARIC completed.
table_apol1 completed.
CARDIA completed.
table_apol1 completed.
JHS completed.
table_apol1 completed.
CHS completed.
table_apol1 completed.
HCHS_SOL completed.
table_apol1 completed.
FHS completed.
table_apol1 completed.
MESA completed.
table_apol1 completed.
DHS completed.


### Summary of Each Type of APOL1 Status, rs334, rs399145

In [49]:
# Load the four tab-separated tables used by the overlap summary, all from
# ../cohort/APOL1/ready_data: APOL1 status, rs334, rs399145 and the phenotype table.
# NOTE(review): this directory layout ("../cohort/<cohort>/ready_data") and the
# file name "common_status.tsv" differ from the "../apol1/<cohort>/ready_data_<snps>"
# outputs written by the cells above — confirm these files exist.
cohort = "APOL1"
load_dir = os.path.join("..", "cohort", cohort, "ready_data")
apol1_filename = "common_status.tsv"
apol1_dir_filename = os.path.join(load_dir, apol1_filename)
apol1 = pd.read_csv(apol1_dir_filename, sep="\t")

rs334_filename = "common_rs334.tsv"
rs334_dir_filename = os.path.join(load_dir, rs334_filename)
rs334 = pd.read_csv(rs334_dir_filename, sep="\t")

rs399145_filename = "common_rs399145.tsv"
rs399145_dir_filename = os.path.join(load_dir, rs399145_filename)
rs399145 = pd.read_csv(rs399145_dir_filename, sep="\t")

# Rebinds `egfr`, which earlier cells use for the full phenotype table.
egfr_filename = "common_pheno_adad_dummy.tsv"
egfr_dir_filename = os.path.join(load_dir, egfr_filename)
egfr = pd.read_csv(egfr_dir_filename, sep = "\t", header = 0, index_col = None)

In [64]:
# Explicit import: `combinations` is not imported anywhere else in this
# notebook and would otherwise only resolve if a wildcard import exposed it.
from itertools import combinations

# Summarise the overlap for every unordered pair of {APOL1, rs334, rs399145},
# with the "EGFRCKDEPI" column of the phenotype table passed as the value.
df_dict = {"APOL1":apol1, "rs334":rs334, "rs399145":rs399145}
common_col = "NWDID"
save_dir = os.path.join("..", "data_summary")
value_tuple = (egfr, "EGFRCKDEPI")
df_name_list = ["APOL1", "rs334", "rs399145"]
# Results for every pair, keyed by the (name_a, name_b) tuple. Without this,
# only the last pair's tables survive the loop.
overlap_results = {}
for case_col_list in combinations(df_name_list, 2):
    # Tables in the same order as the names in the pair.
    df_list = [df_dict[case_col] for case_col in case_col_list]
    overlap_num_df, overlap_value_df = overlap_num_all_cases(df_list, case_col_list, common_col, save_dir, value_tuple)
    overlap_results[case_col_list] = (overlap_num_df, overlap_value_df)