In [1]:
%load_ext autoreload

In [11]:
import copy
import os
import random
from itertools import combinations

import feather
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

In [3]:
import sys

# Put the custom library directory at the front of the import search path.
# The membership test keeps the cell idempotent: re-running it never adds a
# second copy of the same entry.
custom_lib_dir = "/proj/yunligrp/users/minzhi/custom_lib"
if sys.path.count(custom_lib_dir) == 0:
    sys.path.insert(0, custom_lib_dir)

In [12]:
from function_process_data_eqtl import *
from function_asso import *
from function_mesa_cca import *
%autoreload 2

In [22]:
def df_splitter(anno, df, cat_col, anno_merge_col_list, df_merge_col_list, all_cohorts, save_filename_prefix, save_dir_root, save_dir_base):
    """Split ``df`` into per-cohort subsets and write each subset to a TSV file.

    The cohort label of every row comes from ``cat_col``. When ``anno`` and
    ``df`` hold the same data the label is read from ``df`` itself; otherwise
    ``anno`` is inner-merged with ``df`` first so that only samples present in
    both are considered.

    Parameters
    ----------
    anno : pandas.DataFrame
        Table carrying ``cat_col`` (the cohort label).
    df : pandas.DataFrame
        Table to split; the written files contain rows of ``df`` only.
    cat_col : str
        Name of the cohort column in the (merged) annotation table.
    anno_merge_col_list, df_merge_col_list : list of str
        Key columns of ``anno`` and ``df`` used for the merge.
        ``df_merge_col_list`` is also used to select the rows of ``df`` that
        belong to a cohort.
    all_cohorts : bool
        If truthy, an extra pseudo-cohort ``"all_cohorts"`` holding every
        matched row is written as well.
    save_filename_prefix : str
        Output files are named ``<prefix>_<cohort>.tsv``.
    save_dir_root, save_dir_base : str
        Output files go to ``<save_dir_root>/<cohort>/<save_dir_base>/``; the
        directory is created when missing.

    Returns
    -------
    list
        The cohort labels that were written, with ``"all_cohorts"`` last when
        requested.
    """
    if anno.equals(df):
        anno_df = anno.copy()
    else:
        anno_df = anno.merge(df, left_on=anno_merge_col_list, right_on=df_merge_col_list, how="inner")

    # categorize_df is a project helper; only its second return value (the
    # cohort labels) is used here. Copy it so the helper's list is not mutated.
    _, cohort_list = categorize_df(anno_df, cat_col)
    cohort_list = list(cohort_list)
    if all_cohorts:
        cohort_list.append("all_cohorts")

    for cohort in cohort_list:
        if cohort == "all_cohorts":
            df_cohort_raw = anno_df
        else:
            df_cohort_raw = anno_df.loc[anno_df[cat_col] == cohort, :]

        # The ID frame acts as a row filter for df. Without de-duplication a
        # key occurring k times would come back k*k times from the merge, so
        # the written subset would be larger than the matching rows of df.
        df_id = df_cohort_raw[df_merge_col_list].drop_duplicates()
        df_cohort = df.merge(df_id, on=df_merge_col_list, how="inner")

        df_cohort_filename = "%s_%s.tsv" % (save_filename_prefix, cohort)
        save_dir = os.path.join(save_dir_root, str(cohort), save_dir_base)
        df_cohort_dir_filename = os.path.join(save_dir, df_cohort_filename)
        os.makedirs(save_dir, exist_ok=True)
        df_cohort.to_csv(df_cohort_dir_filename, sep="\t", header=True, index=False)
    return cohort_list

In [None]:
def common_gene_status_pc_pheno_kinship(status_name, cohort, freeze_ver, gpc_num, common_col, snp_dict, pheno_prefix, kinship_df_ID):
    """Restrict SNP, PC, status, phenotype and kinship tables to their common samples.

    Reads the per-SNP tables, the PC table, the status table, the phenotype
    table and the kinship sample table, merges them on ``common_col`` and
    writes the ``common_*`` files to ``../<status_name>/<cohort>/ready_data``.

    Parameters
    ----------
    status_name : str
        Top-level directory name under ``..`` that holds the cohort folders.
    cohort : str
        Cohort folder name; also part of the status and phenotype file names.
    freeze_ver : str
        Prefix of the SNP, PC and kinship file names.
    gpc_num : int
        Number of PCs encoded in the PC file name (``<freeze>_pc<N>_pcair.tsv``).
    common_col : str
        Column the tables are merged on.
    snp_dict
        NOTE(review): accepted but not read anywhere in the body.
    pheno_prefix : str
        Prefix of the phenotype file name (``<prefix>_<cohort>.tsv``).
    kinship_df_ID : pandas.DataFrame
        Kinship table passed to ``kinship_select_sample``.

    Notes
    -----
    NOTE(review): ``snp_list``, ``snp_selection`` and ``cn_var`` are neither
    parameters nor locals, so they are resolved from the notebook's global
    namespace at call time. ``snp_selection`` is not assigned anywhere in the
    visible notebook. Presumably these were meant to be derived from
    ``snp_dict``/``status_name`` -- confirm before relying on this function.
    """
    root_dir = os.path.join("..", status_name, cohort)
    save_dir = os.path.join(root_dir, "ready_data")
    os.makedirs(save_dir, exist_ok=True)

    snp_dir_filename_list = [os.path.join("..", "prepro_data", "snp", "%s_%s_%s.tsv"%(freeze_ver, snp_i, snp_selection)) for snp_i in snp_list]
    # Use the gpc_num argument; the body previously read a global pc_num and
    # left this parameter unused.
    pc_dir_filename = os.path.join("..", "prepro_data", "pc", "%s_pc%d_pcair.tsv"%(freeze_ver, gpc_num))
    status_dir_filename = os.path.join(root_dir, "pre_data", "%s_%s.tsv"%(cohort, cn_var))
    pheno_dir_filename = os.path.join("..", "cohort", cohort, "pre_data", "%s_%s.tsv"%(pheno_prefix, cohort))
    kinship_dir_filename = os.path.join("..", "prepro_data", "kinship", "%s_kinship_sample.tsv"%freeze_ver)
    # Order matters: the positional look-ups below assume SNP tables first,
    # then PC, status, phenotype, kinship. The status path was previously
    # referenced through an undefined name (cn_dir_filename).
    snp_pc_cn_dir_filename_list = snp_dir_filename_list + [pc_dir_filename, status_dir_filename, pheno_dir_filename, kinship_dir_filename]

    snp_pc_cn_df = read2df_list(snp_pc_cn_dir_filename_list)
    common_snp_0 = merge_df_list(snp_pc_cn_df, common_col, merge_method='first', how = 'inner')

    # The merged result is saved under the name of the first SNP.
    common_snp_0_filename = "common_%s.tsv"%snp_list[0]
    common_snp_0_dir_filename = os.path.join(save_dir, common_snp_0_filename)
    common_snp_0.to_csv(common_snp_0_dir_filename, sep="\t", index=False)

    # Sample IDs shared by all inputs. The column name is hard-coded here
    # rather than taken from common_col.
    sample_df = common_snp_0[["NWDID"]]
    sample_filename = "common_sample.tsv"
    sample_dir_filename = os.path.join(save_dir, sample_filename)
    sample_df.to_csv(sample_dir_filename, sep="\t", index=False)

    # Remaining SNP tables (index 0 was handled above).
    snp_num = len(snp_list)
    for snp_idx in range(1, snp_num):
        common_0 = snp_pc_cn_df[snp_idx]
        common_1 = common_snp_0
        file_type = snp_list[snp_idx]
        save_common_df(common_0, common_1, common_col, save_dir, file_type)

    # Non-SNP tables follow the SNP tables in snp_pc_cn_df: PC, status, phenotype.
    save_common_df(snp_pc_cn_df[snp_num], common_snp_0, common_col, save_dir, "pc")
    save_common_df(snp_pc_cn_df[snp_num + 1], common_snp_0, common_col, save_dir, cn_var)
    save_common_df(snp_pc_cn_df[snp_num + 2], common_snp_0, common_col, save_dir, "pheno")
    if cn_var == "cn":
        # cn2del is a project helper; its result is saved as the "del" table.
        common_del = cn2del(snp_pc_cn_df[snp_num + 1])
        save_common_df(common_del, common_snp_0, common_col, save_dir, "del")

    kinship_sample_selected = kinship_select_sample(kinship_df_ID, sample_df)
    print(kinship_sample_selected.shape)
    common_kinship_filename_prefix = "common_kinship"
    save_kinship_simple(save_dir, common_kinship_filename_prefix, kinship_sample_selected, sample_col = True)

### Loading Kinship Matrix for Later Usage

In [5]:
predata_dir = os.path.join("..", "prepro_data", "kinship")
freeze8_kinship_filename = "freeze8_kinship.feather"
freeze8_kinship_dir_filename = os.path.join(predata_dir, freeze8_kinship_filename)
%time freeze8_kinship_df = pd.read_feather(freeze8_kinship_dir_filename, use_threads = True)
freeze8_sample_list = list(freeze8_kinship_df)
freeze8_sample_df = pd.DataFrame(data=freeze8_sample_list, columns=["NWDID"])
freeze8_kinship_df_ID = pd.concat(objs=[freeze8_sample_df, freeze8_kinship_df], axis=1)

CPU times: user 1min 36s, sys: 5min 48s, total: 7min 25s
Wall time: 10min 54s


## 1. Load rs2302524, rs2633317, rs4251805, rs4760, rs73935023

In [45]:
# Build one "<snp_ver>_<rsid>.raw" path per SNP under ../raw_data/snp and hand
# the paths, together with the matching IDs, to the project loader.
snp_ver = "freeze8"
snp_dir = os.path.join("..", "raw_data", "snp")
snp_id_list = ["rs2302524", "rs2633317", "rs4251805", "rs4760", "rs73935023"]
snp_dir_filename_list = [
    os.path.join(snp_dir, "%s_%s.raw" % (snp_ver, rsid))
    for rsid in snp_id_list
]
snp_list = read_snp_list_each(snp_dir_filename_list, snp_id_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snp_df.dropna(axis = 0, how = "any", inplace = True)


## eGFR and APOL1 Status x Gen

### Preprocess

#### Adding CKD

In [7]:
# Load the freeze8 eGFR phenotype table, then pass it through the project
# helper one_condition_conversion.
pheno_dir = os.path.join("..", "prepro_data", "phenotype")
pheno_filename = "freeze8_anno05_af02_unique02_egfr.tsv"
pheno_dir_filename = os.path.join(pheno_dir, pheno_filename)
pheno = pd.read_csv(pheno_dir_filename, sep = "\t", header = 0, index_col = None)
# NOTE(review): presumably this compares EGFRCKDEPI against the threshold 60
# and stores the result in a new "CKD" column -- confirm the direction of the
# comparison in one_condition_conversion.
pheno_ckd = one_condition_conversion(pheno, "EGFRCKDEPI", 60, "CKD")

In [9]:
# Write pheno_ckd as a TSV (header kept, index dropped) into the same
# ../prepro_data/phenotype directory the eGFR table was read from.
pheno_ckd_dir = os.path.join("..", "prepro_data", "phenotype")
pheno_ckd_filename = "freeze8_anno05_af02_unique02_egfr-ckd.tsv"
pheno_ckd_dir_filename = os.path.join(pheno_ckd_dir, pheno_ckd_filename)
pheno_ckd.to_csv(pheno_ckd_dir_filename, sep = "\t", header = True, index = False)

#### Splitting into Each Cohort

##### egfr-ckd

In [23]:
# Load the eGFR/CKD phenotype table from ../prepro_data/phenotype.
egfr_dir = os.path.join("..", "prepro_data", "phenotype")
egfr_filename = "freeze8_anno05_af02_unique02_egfr-ckd.tsv"
egfr_dir_filename = os.path.join(egfr_dir, egfr_filename)
# Read from the path built in this cell. The cell used to read
# pheno_ckd_dir_filename, which resolves to the same file but only exists if
# the cell that writes the CKD table has been run in the current kernel.
egfr = pd.read_csv(egfr_dir_filename, sep = "\t", header = 0, index_col = None)

In [24]:
# Split the eGFR/CKD table by the "study" column. anno and df are the same
# frame here, so df_splitter takes its anno.equals(df) branch and skips the
# merge. Files are written to ../apol1/<cohort>/pre_data/, including an extra
# "all_cohorts" file because all_cohorts is True.
anno = egfr
df = egfr
cat_col = "study"
anno_merge_col_list = ["NWDID"]
df_merge_col_list = ["NWDID"]
all_cohorts = True
save_filename_prefix = "freeze8_anno05_af02_unique02_egfr-ckd"
save_dir_root = os.path.join("..", "apol1")
save_dir_base = "pre_data"
cohort_list = df_splitter(anno, df, cat_col, anno_merge_col_list, df_merge_col_list, all_cohorts, save_filename_prefix, save_dir_root, save_dir_base)

For "all_cohorts", the split output should be the same dataframe as the input, but the two files differ in size and load as two different dataframes (df1.equals(df2) == False).

In [25]:
# Load two saved TSVs from the all_cohorts/pre_data directory so they can be
# compared: one without and one with the "_all_cohorts" file-name suffix.
# NOTE(review): these are absolute cluster paths, unlike the relative paths
# used elsewhere in this notebook; this cell only loads the two frames and
# does not itself run a comparison.
df1_dir_filename = "/proj/yunligrp/users/minzhi/asso/apol1/all_cohorts/pre_data/freeze8_anno05_af02_unique02_egfr-ckd.tsv"
df2_dir_filename = "/proj/yunligrp/users/minzhi/asso/apol1/all_cohorts/pre_data/freeze8_anno05_af02_unique02_egfr-ckd_all_cohorts.tsv"
df1 = pd.read_csv(df1_dir_filename, sep = "\t", header = 0, index_col = None)
df2 = pd.read_csv(df2_dir_filename, sep = "\t", header = 0, index_col = None)

##### APOL1 Status

In [32]:
# Load the APOL1 status table from ../prepro_data/apol1.
apol1_dir = os.path.join("..", "prepro_data", "apol1")
apol1_filename = "APOL1_status.tsv"
apol1_dir_filename = os.path.join(apol1_dir, apol1_filename)
apol1 = pd.read_csv(apol1_dir_filename, sep = "\t", header = 0, index_col = None)

In [34]:
# Load the freeze8 annotation table. This rebinds `anno`, which an earlier
# cell had pointed at the eGFR/CKD frame.
anno_dir = os.path.join("..", "raw_data", "annotation")
anno_filename = "freeze8_anno05_af02_unique02.tsv"
anno_dir_filename = os.path.join(anno_dir, anno_filename)
anno = pd.read_csv(anno_dir_filename, sep = "\t", header = 0, index_col = None)

In [37]:
# Split the APOL1 status table by the "study" column. Unless `anno` and `df`
# compare equal, df_splitter inner-merges them on NWDID before splitting.
# Output goes to ../apol1/<cohort>/pre_data/apol1_<cohort>.tsv, plus an
# "all_cohorts" file.
# NOTE(review): relies on `anno` having been loaded by an earlier cell.
df = apol1
cat_col = "study"
anno_merge_col_list = ["NWDID"]
df_merge_col_list = ["NWDID"]
all_cohorts = True
save_filename_prefix = "apol1"
save_dir_root = os.path.join("..", "apol1")
save_dir_base = "pre_data"
cohort_list = df_splitter(anno, df, cat_col, anno_merge_col_list, df_merge_col_list, all_cohorts, save_filename_prefix, save_dir_root, save_dir_base)

#### Common Samples

In [None]:
# Rebinds snp_list to a plain list of rs IDs (an earlier cell assigned it the
# result of read_snp_list_each).
# NOTE(review): the next cell assigns snp_list again with a different ID list.
snp_list = ["rs2302524", "rs2633317", "rs4251805", "rs4760", "rs73935023"]

In [37]:
# Parameters for building the common-sample files of the APOL1 cohort.
cohort = "APOL1"
freeze_ver = "freeze8"
pc_num = 11
common_col = "NWDID"
snp_list = ["rs334", "rs399145"]
cn_var = "status"
pheno_prefix = "freeze8_2019-10-08_useful_unique02_egfr-ckd.tsv"
# NOTE(review): snp_id is not assigned at notebook scope in the visible cells
# (a comprehension variable of the same name does not leak in Python 3), so
# this line raises NameError on a fresh kernel -- confirm the intended path.
load_dir = os.path.join("..", "APOL1", "all_cohorts", snp_id)
# The `if` header was missing its colon and the call was not indented, which
# made the whole cell a SyntaxError. The call is now the body of the guard.
# common_snp_pc_cn_pheno_kinship is not defined in this notebook; presumably
# it comes from one of the star-imported project modules.
if os.path.exists(load_dir):
    common_snp_pc_cn_pheno_kinship(cohort, freeze_ver, pc_num, common_col, snp_list, freeze8_kinship_df_ID, cn_var, pheno_prefix)

(23877, 23878)


### APOL1 Status x rs334

In [16]:
cohort = "APOL1"
# Read the three common_* TSVs (status, rs334, rs399145) from
# ../cohort/<cohort>/ready_data. `cohort` is set on the line above this block.
load_dir = os.path.join("..", "cohort", cohort, "ready_data")
apol1_filename = "common_status.tsv"
apol1_dir_filename = os.path.join(load_dir, apol1_filename)
apol1 = pd.read_csv(apol1_dir_filename, sep="\t")

rs334_filename = "common_rs334.tsv"
rs334_dir_filename = os.path.join(load_dir, rs334_filename)
rs334 = pd.read_csv(rs334_dir_filename, sep="\t")

rs399145_filename = "common_rs399145.tsv"
rs399145_dir_filename = os.path.join(load_dir, rs399145_filename)
rs399145 = pd.read_csv(rs399145_dir_filename, sep="\t")

In [31]:
# Call the project helper genxgen for each of the three pairs among
# APOL1 status, rs334 and rs399145, keyed on NWDID.
# NOTE(review): load_dir is passed as the last argument; presumably genxgen
# writes its result there -- confirm in function_asso / function_mesa_cca.
rs399145_rs334_x = genxgen(rs399145, rs334, "rs399145", "rs334", "NWDID", load_dir)
APOL1_rs334_x = genxgen(apol1, rs334, "APOL1", "rs334", "NWDID", load_dir)
APOL1_rs399145_x = genxgen(apol1, rs399145, "APOL1", "rs399145", "NWDID", load_dir)

In [32]:
# Run the project helper wrap_prepare_matrix_pheno_adad_in_pheno on the files
# in ../cohort/APOL1/ready_data for two phenotypes and one table definition.
cohort = "APOL1"
cn_var = "status"

phenotype_list = ["EGFRCKDEPI", "CKD"]
# Earlier table definition, kept for reference; only the dict below is used.
# table_dict = {"table1":["rs399145", "rs399145-rs334", "APOL1-rs334"]}
table_dict = {"table1":["APOL1-rs399145"]}
# NOTE(review): "quan"/"cati" presumably mean quantitative/categorical
# adjustment variables -- confirm against the helper's implementation.
adad_dict = {"quan":[], "cati":["AA", "ethnicity", "study"]}

load_dir = os.path.join("..", "cohort", cohort, "ready_data")
wrap_prepare_matrix_pheno_adad_in_pheno(phenotype_list, table_dict, load_dir, adad_dict)
print("%s completed."%cohort)

table1 completed.
APOL1 completed.


### Summary of Each Type of APOL1 Status, rs334, rs399145

In [49]:
# Read the inputs for the summary tables from ../cohort/APOL1/ready_data:
# the same three common_* TSVs an earlier cell loads (status, rs334,
# rs399145), plus the phenotype table common_pheno_adad_dummy.tsv.
cohort = "APOL1"
load_dir = os.path.join("..", "cohort", cohort, "ready_data")
apol1_filename = "common_status.tsv"
apol1_dir_filename = os.path.join(load_dir, apol1_filename)
apol1 = pd.read_csv(apol1_dir_filename, sep="\t")

rs334_filename = "common_rs334.tsv"
rs334_dir_filename = os.path.join(load_dir, rs334_filename)
rs334 = pd.read_csv(rs334_dir_filename, sep="\t")

rs399145_filename = "common_rs399145.tsv"
rs399145_dir_filename = os.path.join(load_dir, rs399145_filename)
rs399145 = pd.read_csv(rs399145_dir_filename, sep="\t")

# Rebinds `egfr`, which an earlier cell used for the eGFR/CKD table.
egfr_filename = "common_pheno_adad_dummy.tsv"
egfr_dir_filename = os.path.join(load_dir, egfr_filename)
egfr = pd.read_csv(egfr_dir_filename, sep = "\t", header = 0, index_col = None)

In [64]:
# Summarise every unordered pair among APOL1 status, rs334 and rs399145 with
# the project helper overlap_num_all_cases, using EGFRCKDEPI from `egfr` as
# the value column. save_dir is passed to the helper.
df_dict = {"APOL1":apol1, "rs334":rs334, "rs399145":rs399145}
common_col = "NWDID"
save_dir = os.path.join("..", "data_summary")
value_tuple = (egfr, "EGFRCKDEPI")
df_name_list = ["APOL1", "rs334", "rs399145"]
# `combinations` comes from itertools via the notebook's import cell; it
# yields 2-tuples of names and is iterated directly, with no list copy.
for case_col_list in combinations(df_name_list, 2):
    df_list = [df_dict[df_name] for df_name in case_col_list]
    # NOTE(review): both result names are rebound on every iteration, so only
    # the last pair's frames remain available after the loop.
    overlap_num_df, overlap_value_df = overlap_num_all_cases(df_list, case_col_list, common_col, save_dir, value_tuple)