In [1]:
# Load IPython's autoreload extension. The reload mode itself is selected by
# the `%autoreload 2` line that follows the helper-module imports further down.
%load_ext autoreload

In [2]:
import os
import random
from itertools import combinations

import feather
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error

In [11]:
import sys

# Put the shared helper-library directory at the front of the module search
# path, but only if it is not already listed, so re-running this cell does not
# add duplicate entries.
# NOTE(review): presumably this is where the `function_*` modules imported in
# the next cell live -- confirm. The path is absolute and cluster-specific.
if sys.path.count("/proj/yunligrp/users/minzhi/custom_lib") == 0:
    sys.path.insert(0, "/proj/yunligrp/users/minzhi/custom_lib")

In [12]:
# Project-local helper modules.
# NOTE(review): these are star imports, so the notebook does not show which
# module provides which helper (e.g. read_snp_list_each,
# one_condition_conversion, common_snp_pc_cn_pheno_kinship, genxgen,
# wrap_prepare_matrix_pheno_adad_in_pheno, overlap_num_all_cases). Consider
# importing the needed names explicitly.
from function_process_data_eqtl import *
from function_asso import *
from function_mesa_cca import *
# autoreload mode 2: reload modules before each cell is executed, so edits to
# the helper modules take effect without restarting the kernel.
%autoreload 2

### Loading Kinship Matrix for Later Usage

In [10]:
predata_dir = os.path.join("..", "prepro_data", "kinship")
freeze8_kinship_filename = "freeze8_kinship.feather"
freeze8_kinship_dir_filename = os.path.join(predata_dir, freeze8_kinship_filename)
%time freeze8_kinship_df = pd.read_feather(freeze8_kinship_dir_filename, use_threads = True)
freeze8_sample_list = list(freeze8_kinship_df)
freeze8_sample_df = pd.DataFrame(data=freeze8_sample_list, columns=["NWDID"])
freeze8_kinship_df_ID = pd.concat(objs=[freeze8_sample_df, freeze8_kinship_df], axis=1)

CPU times: user 1min 37s, sys: 5min 42s, total: 7min 20s
Wall time: 10min 34s


## 1. Load rs2302524, rs2633317, rs4251805, rs4760, rs73935023

In [45]:
snp_ver = "freeze8"
snp_dir = os.path.join("..", "raw_data", "snp")
snp_id_list = [
    "rs2302524",
    "rs2633317",
    "rs4251805",
    "rs4760",
    "rs73935023",
]

# One input file per SNP, named "<snp_ver>_<snp_id>.raw" inside snp_dir.
snp_dir_filename_list = [
    os.path.join(snp_dir, "%s_%s.raw" % (snp_ver, rsid))
    for rsid in snp_id_list
]

# Project helper: receives the file paths together with the matching SNP IDs.
# In the recorded run it emits a pandas SettingWithCopyWarning that originates
# from an in-place `dropna` inside the helper.
snp_list = read_snp_list_each(snp_dir_filename_list, snp_id_list)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  snp_df.dropna(axis = 0, how = "any", inplace = True)


## eGFR and APOL1 Status x Gen

### Preprocess

#### Adding CKD

In [7]:
# Tab-separated phenotype table containing the eGFR measurements.
pheno_dir = os.path.join("..", "prepro_data", "phenotype")
pheno_filename = "freeze8_anno05_af02_unique02_egfr.tsv"
pheno_dir_filename = os.path.join(pheno_dir, pheno_filename)
pheno = pd.read_csv(
    pheno_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
)

# NOTE(review): presumably derives a "CKD" column from "EGFRCKDEPI" using 60
# as the threshold -- confirm the direction of the comparison in the helper.
pheno_ckd = one_condition_conversion(pheno, "EGFRCKDEPI", 60, "CKD")

In [9]:
# Write the CKD-annotated phenotype table as a tab-separated file with a
# header row and without the DataFrame index.
pheno_ckd_dir = os.path.join("..", "APOL1", "all_cohorts", "pre_data")
pheno_ckd_filename = "freeze8_anno05_af02_unique02_egfr-ckd.tsv"
pheno_ckd_dir_filename = os.path.join(pheno_ckd_dir, pheno_ckd_filename)
pheno_ckd.to_csv(
    pheno_ckd_dir_filename,
    sep="\t",
    header=True,
    index=False,
)

#### Common Samples

In [None]:
# SNP IDs of interest.
# NOTE(review): the next cell assigns `snp_list` again with a different set of
# IDs, so this value is only in effect if that cell is skipped.
snp_list = [
    "rs2302524",
    "rs2633317",
    "rs4251805",
    "rs4760",
    "rs73935023",
]

In [37]:
# Parameters for extracting the samples shared by the SNP, PC, APOL1 status,
# phenotype and kinship inputs.
cohort = "APOL1"
freeze_ver = "freeze8"
pc_num = 11
common_col = "NWDID"  # sample-ID column
snp_list = ["rs334", "rs399145"]
cn_var = "status"
# NOTE(review): this file name differs from the one written in the
# "Adding CKD" section ("freeze8_anno05_af02_unique02_egfr-ckd.tsv") --
# confirm which phenotype file is intended here.
pheno_prefix = "freeze8_2019-10-08_useful_unique02_egfr-ckd.tsv"

# The cell previously built a `load_dir` from `snp_id` (a name that is not
# defined at notebook scope) and ended with an unfinished
# `if os.path.exists(load_dir)` line, which made the whole cell a SyntaxError.
# `load_dir` was never passed to the helper, so both lines are dropped and
# the cell runs again.
common_snp_pc_cn_pheno_kinship(
    cohort,
    freeze_ver,
    pc_num,
    common_col,
    snp_list,
    freeze8_kinship_df_ID,
    cn_var,
    pheno_prefix,
)

(23877, 23878)


### APOL1 Status x rs334

In [16]:
cohort = "APOL1"
load_dir = os.path.join("..", "cohort", cohort, "ready_data")

# File names of the three tab-separated input tables.
apol1_filename = "common_status.tsv"
rs334_filename = "common_rs334.tsv"
rs399145_filename = "common_rs399145.tsv"

# Full paths inside the cohort's ready_data directory.
apol1_dir_filename = os.path.join(load_dir, apol1_filename)
rs334_dir_filename = os.path.join(load_dir, rs334_filename)
rs399145_dir_filename = os.path.join(load_dir, rs399145_filename)

# Read the tables in the same order as before: APOL1 status, rs334, rs399145.
apol1 = pd.read_csv(apol1_dir_filename, sep="\t")
rs334 = pd.read_csv(rs334_dir_filename, sep="\t")
rs399145 = pd.read_csv(rs399145_dir_filename, sep="\t")

In [31]:
# Build the three pairwise terms with the project helper `genxgen`:
# rs399145 x rs334, APOL1 status x rs334 and APOL1 status x rs399145.
# Each call receives the two tables, a label for each, the shared sample-ID
# column "NWDID" and `load_dir`.
# NOTE(review): presumably `genxgen` also writes its result into `load_dir`
# (a later cell refers to tables such as "APOL1-rs399145") -- confirm.
# `load_dir` is whatever the most recently executed cell set it to.
rs399145_rs334_x = genxgen(rs399145, rs334, "rs399145", "rs334", "NWDID", load_dir)
APOL1_rs334_x = genxgen(apol1, rs334, "APOL1", "rs334", "NWDID", load_dir)
APOL1_rs399145_x = genxgen(apol1, rs399145, "APOL1", "rs399145", "NWDID", load_dir)

In [32]:
# Run configuration.
cohort = "APOL1"
cn_var = "status"
load_dir = os.path.join("..", "cohort", cohort, "ready_data")

# Phenotype columns to process.
phenotype_list = ["EGFRCKDEPI", "CKD"]

# Tables to prepare, keyed by table name. An alternative definition is kept
# here for reference:
# table_dict = {"table1":["rs399145", "rs399145-rs334", "APOL1-rs334"]}
table_dict = {
    "table1": ["APOL1-rs399145"],
}

# Additional adjustment columns.
# NOTE(review): "quan" / "cati" presumably mean quantitative / categorical --
# confirm against the helper.
adad_dict = {
    "quan": [],
    "cati": ["AA", "ethnicity", "study"],
}

wrap_prepare_matrix_pheno_adad_in_pheno(phenotype_list, table_dict, load_dir, adad_dict)
print("%s completed." % cohort)

table1 completed.
APOL1 completed.


### Summary of Each Type of APOL1 Status, rs334, and rs399145

In [49]:
cohort = "APOL1"
load_dir = os.path.join("..", "cohort", cohort, "ready_data")

# File names of the tab-separated inputs for the summary.
apol1_filename = "common_status.tsv"
rs334_filename = "common_rs334.tsv"
rs399145_filename = "common_rs399145.tsv"
egfr_filename = "common_pheno_adad_dummy.tsv"

# Full paths inside the cohort's ready_data directory.
apol1_dir_filename = os.path.join(load_dir, apol1_filename)
rs334_dir_filename = os.path.join(load_dir, rs334_filename)
rs399145_dir_filename = os.path.join(load_dir, rs399145_filename)
egfr_dir_filename = os.path.join(load_dir, egfr_filename)

# Read order is unchanged: APOL1 status, rs334, rs399145, then the phenotype
# table (read with an explicit header row and no index column).
apol1 = pd.read_csv(apol1_dir_filename, sep="\t")
rs334 = pd.read_csv(rs334_dir_filename, sep="\t")
rs399145 = pd.read_csv(rs399145_dir_filename, sep="\t")
egfr = pd.read_csv(
    egfr_dir_filename,
    sep="\t",
    header=0,
    index_col=None,
)

In [64]:
# Tables to compare, keyed by the label used for each one.
df_dict = {"APOL1": apol1, "rs334": rs334, "rs399145": rs399145}
common_col = "NWDID"  # sample-ID column shared by the tables
save_dir = os.path.join("..", "data_summary")
# Phenotype table and the column in it to summarise.
value_tuple = (egfr, "EGFRCKDEPI")
df_name_list = ["APOL1", "rs334", "rs399145"]

# Results for every pair, keyed by the (name_a, name_b) tuple. Previously each
# iteration overwrote `overlap_num_df` / `overlap_value_df`, so only the last
# pair's frames were still available after the loop.
overlap_num_by_pair = {}
overlap_value_by_pair = {}

# `combinations` is itertools.combinations (imported explicitly in the import
# cell); it yields each unordered pair of table names exactly once.
for case_col_list in combinations(df_name_list, 2):
    df_list = [df_dict[name] for name in case_col_list]
    overlap_num_df, overlap_value_df = overlap_num_all_cases(
        df_list, case_col_list, common_col, save_dir, value_tuple
    )
    overlap_num_by_pair[case_col_list] = overlap_num_df
    overlap_value_by_pair[case_col_list] = overlap_value_df