## This notebook checks the number of samples available for each phenotype

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [6]:
# Load the AWI-Gen phenotype table and strip space padding from the column
# headers and from the identifier columns.
pheno_folder = "/expanse/protected/gymreklab-dbgap/mount/H3Africa/DS_I_Africa_project/from_ilifu/awigen_dataset/phenotypes"
pheno_df = pd.read_csv(pheno_folder+"/EGAF00004691444/EGA_dataset_v0_1.csv")
pheno_df.columns = [i.strip(" ") for i in pheno_df.columns.tolist()]


def _strip_spaces(value):
    """Strip leading/trailing spaces from string cells; return any other value unchanged."""
    return value.strip(" ") if isinstance(value, str) else value


# Each column is cleaned from its OWN values. The previous version assigned the
# stripped study_id values to country_qc, overwriting the country information.
for padded_col in ("study_id", "country_qc"):
    pheno_df[padded_col] = pheno_df[padded_col].apply(_strip_spaces)

In [7]:
# Open the AWI-Gen code book workbook and list the sheets it contains.
codebook_path = f"{pheno_folder}/EGAF00004691445/V2.0_AWI-Gen_Code_Book-EGA_Variables.xlsx"
phnoe_excel = pd.ExcelFile(codebook_path)
phnoe_excel.sheet_names

['Main',
 '1. Demographic Information',
 '2. Behaviour_Lifestyle',
 '3. Health History',
 '4. Family History',
 '5. Exposure',
 '6. Infection History',
 '7. Cardiovascular Disease Risk',
 '8. Measurements & Calculations']

In [8]:
# Read the measurements sheet of the code book, skipping its first two rows
# (presumably title/banner rows above the real header - TODO confirm).
Measue_df = phnoe_excel.parse(
    sheet_name="8. Measurements & Calculations",
    skiprows=2,
)

In [10]:
# Count, for every variable listed in the measurements code book, how many
# samples carry a usable value (neither null nor a sentinel code).

# The code book lists -999 as "Missing"; -111 and -222 are likewise excluded
# here (presumably other non-response codes - TODO confirm against the code book).
MISSING_CODES = [-999, -111, -222]

trait_num_dict = {}
trait_list = Measue_df["Variable Name"].tolist()
for trait in trait_list:
    trait_values = pheno_df[trait]
    is_missing = trait_values.isnull() | trait_values.isin(MISSING_CODES)
    # Vectorized count; int() keeps a plain Python integer in the dictionary.
    valid_data = int((~is_missing).sum())
    trait_num_dict[trait] = valid_data
    print(f"{trait}: {valid_data:,}")

standing_height_qc: 11,948
weight_qc: 11,956
bmi_c_qc: 11,945
bmi_cat_c_qc: 11,945
waist_circumference_qc: 11,472
hip_circumference_qc: 11,479
waist_hip_r_c_qc: 11,468
bp_sys_average_qc: 11,944
bp_dia_average_qc: 11,945
htn_jnc7_qc: 12,032
pulse_average_qc: 11,968
visceral_fat_qc: 11,059
subcutaneous_fat_qc: 11,165
mean_cimt_right_qc: 9,945
mean_cimt_left_qc: 9,952
fasting_confirmation_qc: 11,407
glucose_qc: 11,655
insulin_qc: 8,119
diabetes_status_c_qc: 12,032
s_creatinine_qc: 11,701
hdl_qc: 11,719
ldl_qc: 10,619
friedewald_ldl_c_c_qc: 11,687
cholesterol_1_qc: 11,717
non_hdl_c_c_qc: 11,717
triglycerides_qc: 11,701
ur_creatinine_qc: 9,023
ur_albumin_qc: 4,491
ur_protein_qc: 9,863
acr_qc: 4,288
egfr_c_qc: 11,701
ckd_c_qc: 9,836


In [14]:
# Attach the per-variable valid-sample counts to the code-book table.
sample_counts = Measue_df["Variable Name"].map(trait_num_dict)
Measue_df["sample_with_values"] = sample_counts

In [18]:
# Save the annotated code-book table inside the phenotype folder. The path is
# built from pheno_folder instead of repeating the absolute location.
phenotypes_table_path = pheno_folder + "/phenotypes_table.csv"
Measue_df.to_csv(phenotypes_table_path, sep=",", index=False)

In [28]:
# Reload the saved code-book table (with sample counts) from the phenotype
# folder; the path is derived from pheno_folder rather than hard-coded again.
Measue_df = pd.read_csv(pheno_folder + "/phenotypes_table.csv", sep=",")

In [29]:
# Display the code-book table, including the sample_with_values column.
Measue_df

Unnamed: 0,Variable Name,Data Type,Unit,Description,Categorical Description,Notes & Exceptions,sample_with_values
0,standing_height_qc,Continuous,mm,Standing height\n,-999 - Missing,,11948
1,weight_qc,Continuous,kg,Weight\n,-999 - Missing,,11956
2,bmi_c_qc,Continuous,kg/m2,BMI (calculated),-999 - Missing,,11945
3,bmi_cat_c_qc,Continuous,weight in \nkg/height in m2,BMI (categorical) (calculated),0 - underweight is a BMI<18.5\n1 - normal weig...,,11945
4,waist_circumference_qc,Continuous,mm,Waist circumference\n,-999 - Missing,Exception: Missing data for Agincourt over 70 ...,11472
5,hip_circumference_qc,Continuous,mm,Hip circumference\n,-999 - Missing,Exception: Missing data for Agincourt over 70 ...,11479
6,waist_hip_r_c_qc,Continuous,,Waist-to-hip ratio (calculated),-999 - Missing\n,Exception: Agincourt data missing for over 70 ...,11468
7,bp_sys_average_qc,Continuous,mmHg,Average systolic blood pressure\n,-999 - Missing,,11944
8,bp_dia_average_qc,Continuous,mmHg,Average diastolic blood pressure\n,-999 - Missing,,11945
9,htn_jnc7_qc,Categorical,,Hypertension status (calculated)\n\n,0 - no hypertension\n1 - hypertension present\...,"Note self report HT, or SBP >=140 or DBP >= 90...",12032


In [2]:
# Phenotype manifest published in the CAST-genomics cast-workflows repository.
aou_manifest_url = "https://raw.githubusercontent.com/CAST-genomics/cast-workflows/main/gwas/aou/phenotypes_manifest.csv"
aou_df = pd.read_csv(aou_manifest_url, sep=",")

In [6]:
# Pan-UKB phenotype manifest: a tab-separated file read as gzip. The codec is
# passed explicitly rather than inferred from the ".bgz" suffix.
pan_ukb_manifest_url = "https://pan-ukb-us-east-1.s3.amazonaws.com/sumstats_release/phenotype_manifest.tsv.bgz"
ukb_df = pd.read_csv(
    pan_ukb_manifest_url,
    sep="\t",
    compression="gzip",
)

In [31]:
# Keep a local CSV copy of the Pan-UKB manifest inside the phenotype folder.
# The path is built from pheno_folder instead of repeating the absolute location.
ukb_df.to_csv(pheno_folder + "/panUKB_phenotype_manifest.csv", sep=",", index=False)

In [27]:
# Pan-UKB phenotypes whose description mentions "protein" (case-sensitive).
# na=False makes rows without a description count as non-matches, which is what
# the former `== True` comparison achieved; .loc selects rows and columns in a
# single step instead of chained indexing.
protein_mask = ukb_df["description"].str.contains("protein", na=False)
ukb_df.loc[protein_mask, ["trait_type", "description"]]

Unnamed: 0,trait_type,description
3,biomarkers,Apolipoprotein A
4,biomarkers,Apolipoprotein B
11,biomarkers,C-reactive protein
19,biomarkers,Lipoprotein A
26,biomarkers,Total protein
4501,continuous,Non-albumin protein
4675,icd10,E78 Disorders of lipoprotein metabolism and ot...
5094,icd10,O14 Gestational [pregnancy-induced] hypertensi...
5187,icd10,R77 Other abnormalities of plasma proteins
5675,phecode,Disorders of protein plasma/amino-acid transpo...


In [None]:
# Preview the Pan-UKB phenotype descriptions. The cell previously ended in a
# dangling "." (a syntax error left over from an unfinished attribute lookup).
ukb_df.description.head()

In [10]:
# Show every field of the first manifest row. The stray empty subscript "[]"
# was a syntax error and has been removed.
ukb_df.loc[0]

trait_type                                                    biomarkers
phenocode                                                          30600
pheno_sex                                                     both_sexes
coding                                                               NaN
modifier                                                            irnt
                                             ...                        
aws_path_tabix         s3://pan-ukb-us-east-1/sumstats_flat_files_tab...
md5_hex                                 b656fcf77aea5019197489593bcc2954
size_in_bytes                                                 2282017915
md5_hex_tabix                           daa278e77542009a259c2382da9c0e1e
size_in_bytes_tabix                                              2242763
Name: 0, Length: 82, dtype: object

In [38]:
# Case-sensitive keyword search over the code-book descriptions and over the
# phenotype table's column names.
check_phe = "cholesterol"
matching_descriptions = [desc for desc in Measue_df["Description"].tolist() if check_phe in desc]
matching_columns = [col for col in pheno_df.columns.tolist() if check_phe in col]
display(matching_descriptions)
display(matching_columns)


['HDL cholesterol (measured - fasting)',
 'LDL cholesterol result (measured - fasting)',
 'Low-density lipoprotein (LDL)-cholesterol (Friedwald calculated)\n\n',
 'Total cholesterol (measured - fasting)\n',
 'Non-HDL calculated variable (Total cholesterol minus HDL-C) (calculated)']

['h_cholesterol_mom_qc',
 'h_cholesterol_dad_qc',
 'h_cholesterol_qc',
 'cholesterol_treatment_qc',
 'cholesterol_1_qc']

In [20]:
# Human-readable phenotype names: underscores become spaces. The vectorized
# .str accessor replaces the per-row lambda; regex=False treats "_" literally.
aou_df["phenotype"].str.replace("_", " ", regex=False)

0                                   ALT
1                        platelet count
2                  alkaline phosphatase
3            aspartate aminotransferase
4                               calcium
5                           cholesterol
6                            creatinine
7                    eosinophil percent
8                               glucose
9                  glycated haemoglobin
10                          haematocrit
11                      hdl cholesterol
12                      ldl cholesterol
13                   lymphocyte percent
14                                 egfr
15                 mean platelet volume
16                     neutrophil count
17                   neutrophil percent
18                            phosphate
19                 red blood cell count
20          mean corpuscular hemoglobin
21              mean corpuscular volume
22    red blood cell distribution width
23                              protein
24                        triglycerides


In [26]:
# Map AWI-Gen variable names to phenotype labels (presumably the labels used
# when matching against external phenotype lists - TODO confirm).
# BMI is keyed on bmi_c_qc, the code book's "BMI (calculated)" variable; it was
# previously keyed on weight_qc, which is body weight in kg.
awigen_phe_dict = {
    "standing_height_qc": "height",
    "bmi_c_qc": "Body mass index (BMI)",
    "bp_sys_average_qc": "Systolic blood pressure (SBP)",
    "bp_dia_average_qc": "Diastolic blood pressure (DBP)",
    "diabetes_status_c_qc": "Type 2 diabetes",
}

Unnamed: 0,Variable Name,Data Type,Unit,Description,Categorical Description,Notes & Exceptions,sample_with_values
0,standing_height_qc,Continuous,mm,Standing height\n,-999 - Missing,,11948
1,weight_qc,Continuous,kg,Weight\n,-999 - Missing,,11956
2,bmi_c_qc,Continuous,kg/m2,BMI (calculated),-999 - Missing,,11945
3,bmi_cat_c_qc,Continuous,weight in \nkg/height in m2,BMI (categorical) (calculated),0 - underweight is a BMI<18.5\n1 - normal weig...,,11945
4,waist_circumference_qc,Continuous,mm,Waist circumference\n,-999 - Missing,Exception: Missing data for Agincourt over 70 ...,11472
5,hip_circumference_qc,Continuous,mm,Hip circumference\n,-999 - Missing,Exception: Missing data for Agincourt over 70 ...,11479
6,waist_hip_r_c_qc,Continuous,,Waist-to-hip ratio (calculated),-999 - Missing\n,Exception: Agincourt data missing for over 70 ...,11468
7,bp_sys_average_qc,Continuous,mmHg,Average systolic blood pressure\n,-999 - Missing,,11944
8,bp_dia_average_qc,Continuous,mmHg,Average diastolic blood pressure\n,-999 - Missing,,11945
9,htn_jnc7_qc,Categorical,,Hypertension status (calculated)\n\n,0 - no hypertension\n1 - hypertension present\...,"Note self report HT, or SBP >=140 or DBP >= 90...",12032
