#### This note book will check common features and demography

In [81]:
import pandas as pd
import numpy as np
from os import listdir
import os

In [82]:
# to make this notebook's output stable across runs
np.random.seed(42)

# To Set limit for display of columns
pd.set_option('display.max_columns', 200)

# Supress scientific notation
pd.set_option('display.float_format', lambda x: '%.6f' % x)

In [83]:
def ReadFiles(path, file_names):
    
    """
    Function to read the csv files
    
    Parameters:
    -----------
    
    path: Path the directory you want to read from
    file_names: Names of the files you want to read
    
    Output: Dictionary of data files
    
    """
    
    files = {}
    names = [os.path.splitext(file)[0] for file in file_names]
    for i in range(len(file_names)):
        key = names[i]
        files[key] = pd.read_csv(path+"/"+file_names[i], dtype=object)
    return files



def CompareData(data, key1, key2, col1, col2 ):
    
    """
    Function to compare the information of two files, for e.g, features
    
    Parameters:
    ----------
    data: Data dictionary containing files
    key_indx1: Key index for file1
    key_indx2: Key index of file2
    col1: index column from file1 to merge the information
    col2: index column from file2 to merge the infromation
     
    """
    df1 = data[key1]
    df2 = data[key2] 
    common = pd.merge(df1, df2, left_on=col1, right_on=col2, how='inner')
    return common
    

In [84]:
# Load the Epic samples
files = listdir("E:/Machine Learning/Output Data")
pattern = (") 150.csv", ') 150 .csv', 'Epic Samples Used in ML.csv')
csv_files = [file for file in files if file.endswith(pattern)] # pattern to read

In [85]:
csv_files

['Epic Important features(cross-sectional) 150.csv',
 'Epic Important features(longitudinal) 150 .csv',
 'Epic Samples Used in ML.csv']

In [86]:
# call the function
dir_path = "E:/Machine Learning/Output Data"
data = ReadFiles(path=dir_path, file_names=csv_files)


In [87]:
data

{'Epic Important features(cross-sectional) 150':                                                0                       1
 0                                      w3_PTSDpm  1.8502415924331358e-06
 1                                    w1c1_PTSDpy  1.8749525472941326e-06
 2                       w2c1_L1_21_drugs_alcohol  1.9248092496355553e-06
 3                            w3_L1_28_Unemployed  2.5959110294921796e-06
 4                                  w2c1_PTSDlife  3.2783463299201313e-06
 ..                                           ...                     ...
 145                       w2c1_r_worst_intrusion    0.003358044607050212
 146                                   cg25368824    0.003563357557119314
 147                    w1c1_life_worst_intrusion    0.006026408050380713
 148  w2c1_life_sumptsdworst_PTS_symptom_severity    0.008550157801208418
 149    w3_life_sumptsdworst_PTS_symptom_severity      0.8901026529770454
 
 [150 rows x 2 columns],
 'Epic Important features(longitudina

In [88]:
# Load the data having more information
pheno_files = ["Combined DF without Imputation_Including_Smoking.csv", 
             "MorePhenoTpeInfo_500_withRemmited.csv"]
files_path = "E:/DNHS_EWAS_DATA/DNHSEWAS492_RProject/Data"
pheno_files = ReadFiles(path=files_path, file_names=pheno_files)


In [89]:
pheno_keys = list(pheno_files.keys())
print(pheno_files[pheno_keys[0]].head())

  Unnamed: 0 RESP w1c1_bloodid w2_bloodid  w4_bloodid  w5_bloodid  \
0          1    1     23000250        NaN  2011010251         NaN   
1          2    2     23000825        NaN         NaN  2013010091   
2          3    4          NaN        NaN         NaN         NaN   
3          4    6     23116375        NaN         NaN         NaN   
4          5   13          NaN        NaN         NaN         NaN   

  w1c1_L1_21_drugs_alcohol w1c1_L1_23_Divorce w1c1_L1_24_Lost_Job  \
0                        1                  2                   1   
1                        1                  1                   1   
2                        1                  1                   2   
3                        1                  2                   1   
4                        1                  1                   2   

  w1c1_L1_25_Emotionally_mistreated w1c1_L1_27_Legal_problems  \
0                                 1                         2   
1                                 1     

In [90]:
pheno_more = pheno_files[pheno_keys[1]]

In [91]:
# Now lets find the demography
samples = data['Epic Samples Used in ML']
print("Epic samples :",samples.shape)
samples.head()

Epic samples : (210, 171)


Unnamed: 0,Row.names,CD8T,CD4T,NK,Bcell,Mono,Neu,Gender,Wave,RESP,Sample_Name,w1c1_L1_21_drugs_alcohol,w1c1_L1_23_Divorce,w1c1_L1_24_Lost_Job,w1c1_L1_25_Emotionally_mistreated,w1c1_L1_27_Legal_problems,w1c1_L1_28_Unemployed,w1c1_L1_29_Financial_probs,w2c1_L1_21_drugs_alcohol,w2c1_L1_23_Divorce,w2c1_L1_24_Lost_Job,w2c1_L1_25_Emotionally_mistreated,w2c1_L1_27_Legal_problems,w2c1_L1_28_Unemployed,w2c1_L1_29_Financial_probs,w3_L1_21_drugs_alcohol,w3_L1_23_Divorce,w3_L1_24_Lost_Job,w3_L1_25_Emotionally_mistreated,w3_L1_27_Legal_problems,w3_L1_28_Unemployed,w3_L1_29_Financial_probs,w4_L1_21_drugs_alcohol,w4_L1_23_Divorce,w4_L1_24_Lost_Job,w4_L1_25_Emotionally_mistreated,w4_L1_27_Legal_problems,w4_L1_28_Unemployed,w4_L1_29_Financial_probs,w5_l1_21_drugs_alcohol,w5_l1_23_Divorce,w5_l1_24_Lost_Job,w5_l1_25_Emotionally_mistreated,w5_l1_27_Legal_problems,w5_l1_28_Unemployed,w5_l1_29_Financial_probs,w3_NN1_Loneliness_Scale1,w3_NN2_Loneliness_Scale2,w3_NN3_Loneliness_Scale3,w3_J1_Perceived_discrimination1,w3_J2_Perceived_discrimination2,w3_J3_Perceived_discrimination3,w3_J4_Perceived_discrimination4,w3_J5_Perceived_discrimination5,w3_J6_Perceived_discrimination6,w3_J7_Perceived_discrimination7,w3_J8_Perceived_discrimination8,w3_J9_Perceived_discrimination9,w1_educ_Participant,w2_educ_Participant,w3_educ_Participant,w4_educ_Participant,w5_educ_Participant,w3_U8B_Mothers_edu,w3_U8C_Fathers_edu,w1c1_stress2_drugs_alcohol,w1c1_stress4_Divorce,w1c1_stress5_Lost_Job,w1c1_stress6_Emotionally_mistreated,w1c1_stress8_Legal_problems,w1c1_stress9_Unemployed,w1c1_stress10_Financial_probs,w1c1_traumanum,w1c1_PTSDlife,w1c1_PTSDpy,w1c1_PTSDpm,w1c1_phq9cat_Depression_severity,w1c1_gad7cat_generalized_anx_symp_severity,w2c1_stress2_drugs_alcohol,w2c1_stress4_Divorce,w2c1_stress5_Lost_Job,w2c1_stress6_Emotionally_mistreated,w2c1_stress8_Legal_problems,w2c1_stress9_Unemployed,w2c1_stress10_Financial_probs,w2c1_traumanum,w2c1_pyphq9cat_Depression_severity,w2c1_gad7cat_py_generalized_anx_symp_severity,w2c1_PTSDlife,w2c1_PTSDpy,w2c1_PTSDpm,w3_stress2_drugs_alcohol,w3_stress4_Divorce,w3_stress5_Lost_Job,w3_stress6_Emotionally_mistreated,w3_stress8_Legal_problems,w3_stress9_Unemployed,w3_stress10_Financial_probs,w3_traumanum,w3_slphq9cat_Depression_severity,w3_gad7cat_sl_generalized_anx_symp_severity,w3_PTSDlife,w3_PTSDsl,w3_PTSDpm,w1c1_life_sumptsdworst_PTS_symptom_severity,w1c1_life_worst_intrusion,w1c1_life_worst_avoidance,w1c1_life_worst_hyperarousal,w2c1_inc_sumptsdworst_PTS_symptom_severity,w2c1_inc_worst_intrusion,w2c1_inc_worst_avoidance,w2c1_inc_worst_hyperarousal,w2c1_r_sumptsdworst_PTS_symptom_severity,w2c1_r_worst_intrusion,w2c1_r_worst_avoidance,w2c1_r_worst_hyperarousal,w3_inc_sumptsdworst_PTS_symptom_severity,w3_inc_worst_intrusion,w3_inc_worst_avoidance,w3_inc_worst_hyperarousal,w3_r_sumptsdworst_PTS_symptom_severity,w3_r_worst_intrusion,w3_r_worst_avoidance,w3_r_worst_hyperarousal,w4_stress2_drugs_alcohol,w4_stress4_Divorce,w4_stress5_Lost_Job,w4_stress6_Emotionally_mistreated,w4_stress8_Legal_problems,w4_stress9_Unemployed,w4_stress10_Financial_probs,w4_traumanum,w4_slphq9cat_Depression_severity,w4_gad7cat_sl_generalized_anx_symp_severity,w4_PTSDlife,w4_PTSDsl,w4_PTSDpm,w4_inc_sumptsdworst_PTS_symptom_severity,w4_inc_worst_intrusion,w4_inc_worst_avoidance,w4_inc_worst_hyperarousal,w4_r_sumptsdworst_PTS_symptom_severity,w4_r_worst_intrusion,w4_r_worst_avoidance,w4_r_worst_hyperarousal,w2c1_life_sumptsdworst_PTS_symptom_severity,w3_life_sumptsdworst_PTS_symptom_severity,w4_life_sumptsdworst_PTS_symptom_severity,w5_stress2_drugs_alcohol,w5_stress4_Divorce,w5_stress5_Lost_Job,w5_stress6_Emotionally_mistreated,w5_stress8_Legal_problems,w5_stress9_Unemployed,w5_stress10_Financial_probs,w5_traumanum,w5_slphq9cat_Depression_severity,w5_gad7cat_sl_generalized_anx_symp_severity,w5_PTSDlife,w5_PTSDsl,w5_PTSDpm,w5_inc_sumptsdworst_PTS_symptom_severity,w5_inc_worst_intrusion,w5_inc_worst_avoidance,w5_inc_worst_hyperarousal,w5_r_sumptsdworst_PTS_symptom_severity,w5_r_worst_intrusion,w5_r_worst_avoidance,w5_r_worst_hyperarousal,w5_life_sumptsdworst_PTS_symptom_severity,TraumaNum
0,203257030261_R02C01,0.1312968782357869,0.249403596696435,0.0775285400419206,0.105092128987754,0.0920457255192543,0.344633130518849,1,W2,65,2009010455,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,8,2,2,2,1,2,1,2,4,2,4,4,4,4,3,4,4,5,5,5,5,5,1,1,1,0,0,0,0,0,1,6,0,0,0,3,3,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,2,3,0,0,0,35,13,17,5,40,5,18,14,27,10,11,6,33,17,15,10,32,5,14,10,1,0,0,0,0,0,0,1,2,2,0,0,0,40,13,16,11,41,12,18,11,35,35,41,0,0,0,0,0,0,1,0,2,2,0,0,0,26,6,9,5,18,7,7,5,41,6
1,203257030261_R05C01,0.0948840251364565,0.180723854163022,0.0547057377511308,0.0724766690427491,0.075012267962608,0.522197445944034,2,W2,12629,2009010236,2,2,2,2,2,2,1,2,2,2,2,2,1,1,2,2,2,2,1,1,1,2,2,2,2,2,1,1,2,2,8,2,2,2,1,2,1,2,3,3,4,2,2,2,2,2,2,7,8,8,8,8,2,2,0,0,0,0,0,0,1,0,0,0,0,2,2,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,1,1,2,1,1,1,44,10,18,15,31,11,11,9,34,7,10,10,43,15,17,11,17,5,7,5,0,0,0,0,0,1,1,2,1,1,1,0,0,21,5,9,7,17,5,7,5,31,43,43,0,0,0,0,0,0,1,0,2,2,1,0,0,42,6,17,16,63,7,26,19,43,1
2,203497730130_R01C01,0.0472621766058546,0.248464667898753,0.124656007266761,0.11159532710435,0.0888983025184511,0.3791235186058289,2,W1,12218,23112625,2,2,1,2,2,2,1,2,2,2,1,2,2,1,2,2,2,2,2,1,1,2,2,2,2,2,2,1,2,2,2,2,2,1,1,3,2,1,4,4,2,4,4,4,4,4,4,3,3,3,3,3,1,1,0,0,1,0,0,0,1,4,0,0,0,3,1,0,0,0,1,0,0,1,3,3,1,0,0,0,0,0,0,0,0,1,1,0,3,1,0,0,0,28,10,9,8,31,14,10,7,25,7,9,9,35,10,15,10,27,9,11,7,0,0,0,0,0,0,1,1,3,1,0,0,0,29,11,10,8,33,15,13,7,31,31,31,0,0,0,0,0,1,1,2,5,3,0,0,0,50,15,20,15,38,7,14,17,50,4
3,203497730130_R02C01,0.1279506366269469,0.157801717029453,0.0551120979066981,0.0199784748402415,0.0984055447655586,0.540751528831102,2,W2,10792,2009010075,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,4,4,4,4,4,4,4,4,4,3,3,3,3,3,98,98,0,0,0,0,0,0,0,1,0,0,0,2,2,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,1,3,0,0,0,21,9,7,5,18,5,7,5,29,8,15,6,47,16,18,19,17,5,7,10,0,0,0,0,0,0,0,1,2,1,0,0,0,20,6,8,6,37,15,16,16,29,29,29,0,0,0,0,0,0,1,0,2,2,0,0,0,20,6,7,16,27,7,10,6,29,1
4,203497730130_R08C01,0.1929587531929889,0.220010224001117,0.1047432490997969,0.0817510015519277,0.0841601634065853,0.316376608747584,1,W2,11963,2009010346,2,1,2,2,2,2,1,2,2,2,2,2,2,1,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,1,1,4,4,4,4,4,4,4,4,4,7,7,7,7,7,2,2,0,1,0,0,0,0,1,3,0,0,0,1,3,0,0,0,0,0,0,1,0,2,1,0,0,0,0,0,0,0,0,0,1,1,2,3,0,0,0,18,5,7,6,27,5,8,6,18,5,7,6,19,6,8,5,17,16,7,10,0,0,0,0,0,0,0,2,2,2,0,0,0,23,6,10,7,17,5,7,5,18,19,23,0,0,0,0,0,0,1,1,1,2,0,0,0,19,7,7,5,17,5,7,5,23,3


In [92]:
# Now combine the phenotye information
comb_pheno = pd.merge(samples, pheno_more, on='Row.names', how='inner')
print(comb_pheno.shape)
comb_pheno.head()

(210, 213)


Unnamed: 0.1,Row.names,CD8T,CD4T,NK,Bcell,Mono,Neu,Gender,Wave_x,RESP_x,Sample_Name_x,w1c1_L1_21_drugs_alcohol,w1c1_L1_23_Divorce,w1c1_L1_24_Lost_Job,w1c1_L1_25_Emotionally_mistreated,w1c1_L1_27_Legal_problems,w1c1_L1_28_Unemployed,w1c1_L1_29_Financial_probs,w2c1_L1_21_drugs_alcohol,w2c1_L1_23_Divorce,w2c1_L1_24_Lost_Job,w2c1_L1_25_Emotionally_mistreated,w2c1_L1_27_Legal_problems,w2c1_L1_28_Unemployed,w2c1_L1_29_Financial_probs,w3_L1_21_drugs_alcohol,w3_L1_23_Divorce,w3_L1_24_Lost_Job,w3_L1_25_Emotionally_mistreated,w3_L1_27_Legal_problems,w3_L1_28_Unemployed,w3_L1_29_Financial_probs,w4_L1_21_drugs_alcohol,w4_L1_23_Divorce,w4_L1_24_Lost_Job,w4_L1_25_Emotionally_mistreated,w4_L1_27_Legal_problems,w4_L1_28_Unemployed,w4_L1_29_Financial_probs,w5_l1_21_drugs_alcohol,w5_l1_23_Divorce,w5_l1_24_Lost_Job,w5_l1_25_Emotionally_mistreated,w5_l1_27_Legal_problems,w5_l1_28_Unemployed,w5_l1_29_Financial_probs,w3_NN1_Loneliness_Scale1,w3_NN2_Loneliness_Scale2,w3_NN3_Loneliness_Scale3,w3_J1_Perceived_discrimination1,w3_J2_Perceived_discrimination2,w3_J3_Perceived_discrimination3,w3_J4_Perceived_discrimination4,w3_J5_Perceived_discrimination5,w3_J6_Perceived_discrimination6,w3_J7_Perceived_discrimination7,w3_J8_Perceived_discrimination8,w3_J9_Perceived_discrimination9,w1_educ_Participant,w2_educ_Participant,w3_educ_Participant,w4_educ_Participant,w5_educ_Participant,w3_U8B_Mothers_edu,w3_U8C_Fathers_edu,w1c1_stress2_drugs_alcohol,w1c1_stress4_Divorce,w1c1_stress5_Lost_Job,w1c1_stress6_Emotionally_mistreated,w1c1_stress8_Legal_problems,w1c1_stress9_Unemployed,w1c1_stress10_Financial_probs,w1c1_traumanum_x,w1c1_PTSDlife_x,w1c1_PTSDpy_x,w1c1_PTSDpm_x,w1c1_phq9cat_Depression_severity,w1c1_gad7cat_generalized_anx_symp_severity,w2c1_stress2_drugs_alcohol,w2c1_stress4_Divorce,w2c1_stress5_Lost_Job,w2c1_stress6_Emotionally_mistreated,w2c1_stress8_Legal_problems,w2c1_stress9_Unemployed,w2c1_stress10_Financial_probs,w2c1_traumanum_x,w2c1_pyphq9cat_Depression_severity,w2c1_gad7cat_py_generalized_anx_symp_severity,w2c1_PTSDlife_x,w2c1_PTSDpy_x,w2c1_PTSDpm_x,w3_stress2_drugs_alcohol,w3_stress4_Divorce,w3_stress5_Lost_Job,w3_stress6_Emotionally_mistreated,w3_stress8_Legal_problems,w3_stress9_Unemployed,w3_stress10_Financial_probs,w3_traumanum_x,w3_slphq9cat_Depression_severity,...,w2c1_r_worst_intrusion,w2c1_r_worst_avoidance,w2c1_r_worst_hyperarousal,w3_inc_sumptsdworst_PTS_symptom_severity,w3_inc_worst_intrusion,w3_inc_worst_avoidance,w3_inc_worst_hyperarousal,w3_r_sumptsdworst_PTS_symptom_severity,w3_r_worst_intrusion,w3_r_worst_avoidance,w3_r_worst_hyperarousal,w4_stress2_drugs_alcohol,w4_stress4_Divorce,w4_stress5_Lost_Job,w4_stress6_Emotionally_mistreated,w4_stress8_Legal_problems,w4_stress9_Unemployed,w4_stress10_Financial_probs,w4_traumanum_x,w4_slphq9cat_Depression_severity,w4_gad7cat_sl_generalized_anx_symp_severity,w4_PTSDlife_x,w4_PTSDsl_x,w4_PTSDpm_x,w4_inc_sumptsdworst_PTS_symptom_severity,w4_inc_worst_intrusion,w4_inc_worst_avoidance,w4_inc_worst_hyperarousal,w4_r_sumptsdworst_PTS_symptom_severity,w4_r_worst_intrusion,w4_r_worst_avoidance,w4_r_worst_hyperarousal,w2c1_life_sumptsdworst_PTS_symptom_severity,w3_life_sumptsdworst_PTS_symptom_severity,w4_life_sumptsdworst_PTS_symptom_severity,w5_stress2_drugs_alcohol,w5_stress4_Divorce,w5_stress5_Lost_Job,w5_stress6_Emotionally_mistreated,w5_stress8_Legal_problems,w5_stress9_Unemployed,w5_stress10_Financial_probs,w5_traumanum_x,w5_slphq9cat_Depression_severity,w5_gad7cat_sl_generalized_anx_symp_severity,w5_PTSDlife_x,w5_PTSDsl_x,w5_PTSDpm_x,w5_inc_sumptsdworst_PTS_symptom_severity,w5_inc_worst_intrusion,w5_inc_worst_avoidance,w5_inc_worst_hyperarousal,w5_r_sumptsdworst_PTS_symptom_severity,w5_r_worst_intrusion,w5_r_worst_avoidance,w5_r_worst_hyperarousal,w5_life_sumptsdworst_PTS_symptom_severity,TraumaNum_x,Unnamed: 0,RESP_y,Sample_Name_y,w1c1_bloodid,w2_bloodid,w4_bloodid,w5_bloodid,w1c1_CP1C_Days_smoking_past_30days,w2c1_CP1C_Days_smoking_past_30days,w3_CP1C_Days_smoking_past_30days,w4_CP1C_Days_smoking_past_30days,w5_cp1c_Days_smoking_past_30days,w1c1_traumanum_y,w1c1_PTSDlife_y,w1c1_PTSDpy_y,w1c1_PTSDpm_y,w2c1_traumanum_y,w2c1_PTSDlife_y,w2c1_PTSDpy_y,w2c1_PTSDpm_y,w3_traumanum_y,w3_PTSDlife_y,w3_PTSDsl_y,w3_PTSDpm_y,w4_traumanum_y,w4_PTSDlife_y,w4_PTSDsl_y,w4_PTSDpm_y,w5_traumanum_y,w5_PTSDlife_y,w5_PTSDsl_y,w5_PTSDpm_y,Wave_y,race6cat,race3cat,Age,PTSDLife,PTSDpy,PTSDpm,Days_Smoking_Past_30_days,TraumaNum_y,Remitted
0,203257030261_R02C01,0.1312968782357869,0.249403596696435,0.0775285400419206,0.105092128987754,0.0920457255192543,0.344633130518849,1,W2,65,2009010455,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,8,2,2,2,1,2,1,2,4,2,4,4,4,4,3,4,4,5,5,5,5,5,1,1,1,0,0,0,0,0,1,6,0,0,0,3,3,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,2,...,10,11,6,33,17,15,10,32,5,14,10,1,0,0,0,0,0,0,1,2,2,0,0,0,40,13,16,11,41,12,18,11,35,35,41,0,0,0,0,0,0,1,0,2,2,0,0,0,26,6,9,5,18,7,7,5,41,6,9,65,2009010455,,2009010455,2011010264,,0.0,,,,,6,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,,,,W2,2,2,72,0,0,0,,6,0
1,203257030261_R05C01,0.0948840251364565,0.180723854163022,0.0547057377511308,0.0724766690427491,0.075012267962608,0.522197445944034,2,W2,12629,2009010236,2,2,2,2,2,2,1,2,2,2,2,2,1,1,2,2,2,2,1,1,1,2,2,2,2,2,1,1,2,2,8,2,2,2,1,2,1,2,3,3,4,2,2,2,2,2,2,7,8,8,8,8,2,2,0,0,0,0,0,0,1,0,0,0,0,2,2,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,1,1,...,7,10,10,43,15,17,11,17,5,7,5,0,0,0,0,0,1,1,2,1,1,1,0,0,21,5,9,7,17,5,7,5,31,43,43,0,0,0,0,0,0,1,0,2,2,1,0,0,42,6,17,16,63,7,26,19,43,1,395,12629,2009010236,23115600.0,2009010236,2011010017,,1.0,5.0,30.0,25.0,,0,0,0,0,1,0,0,0,1,1,1,1,2,1,0,0,0,,,,W2,2,2,34,0,0,0,5.0,1,0
2,203497730130_R01C01,0.0472621766058546,0.248464667898753,0.124656007266761,0.11159532710435,0.0888983025184511,0.3791235186058289,2,W1,12218,23112625,2,2,1,2,2,2,1,2,2,2,1,2,2,1,2,2,2,2,2,1,1,2,2,2,2,2,2,1,2,2,2,2,2,1,1,3,2,1,4,4,2,4,4,4,4,4,4,3,3,3,3,3,1,1,0,0,1,0,0,0,1,4,0,0,0,3,1,0,0,0,1,0,0,1,3,3,1,0,0,0,0,0,0,0,0,1,1,0,3,...,7,9,9,35,10,15,10,27,9,11,7,0,0,0,0,0,0,1,1,3,1,0,0,0,29,11,10,8,33,15,13,7,31,31,31,0,0,0,0,0,1,1,2,5,3,0,0,0,50,15,20,15,38,7,14,17,50,4,328,12218,23112625,23112625.0,2009010296,2011010148,2013010030.0,,,,,,4,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,2,0.0,0.0,0.0,W1,2,2,50,0,0,0,,4,0
3,203497730130_R02C01,0.1279506366269469,0.157801717029453,0.0551120979066981,0.0199784748402415,0.0984055447655586,0.540751528831102,2,W2,10792,2009010075,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,1,4,4,4,4,4,4,4,4,4,3,3,3,3,3,98,98,0,0,0,0,0,0,0,1,0,0,0,2,2,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,1,...,8,15,6,47,16,18,19,17,5,7,10,0,0,0,0,0,0,0,1,2,1,0,0,0,20,6,8,6,37,15,16,16,29,29,29,0,0,0,0,0,0,1,0,2,2,0,0,0,20,6,7,16,27,7,10,6,29,1,142,10792,2009010075,23005400.0,2009010075,2011010293,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,,,,W2,2,2,61,0,0,0,,1,0
4,203497730130_R08C01,0.1929587531929889,0.220010224001117,0.1047432490997969,0.0817510015519277,0.0841601634065853,0.316376608747584,1,W2,11963,2009010346,2,1,2,2,2,2,1,2,2,2,2,2,2,1,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,1,1,4,4,4,4,4,4,4,4,4,7,7,7,7,7,2,2,0,1,0,0,0,0,1,3,0,0,0,1,3,0,0,0,0,0,0,1,0,2,1,0,0,0,0,0,0,0,0,0,1,1,2,...,5,7,6,19,6,8,5,17,16,7,10,0,0,0,0,0,0,0,2,2,2,0,0,0,23,6,10,7,17,5,7,5,18,19,23,0,0,0,0,0,0,1,1,1,2,0,0,0,19,7,7,5,17,5,7,5,23,3,303,11963,2009010346,,2009010346,2011010129,2013010215.0,0.0,,,,,3,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,1,0.0,0.0,0.0,W2,2,2,62,0,0,0,,3,0


In [93]:
gender = np.round(samples['Gender'].value_counts(normalize = True)*100, 3)
print("% Females:", gender[0], ",", "Males:", gender[1])

% Females: 60.0 , Males: 40.0


In [94]:
race = np.round(comb_pheno['race3cat'].value_counts(normalize = True)*100,3)
print("% Race ", "AA:", race[0], ",", "Others:", race[1]+race[2])

% Race  AA: 93.81 , Others: 6.191


In [95]:
# Get mean and std of age
def MeanSD(df, col):
    val = np.asarray(df[col], dtype=int)
    mean, std = np.round([np.mean(val), np.std(val)], 3)
    return(mean,std)


# Function to count the number and percentage
def CountFrequency(df, col):
    num = np.array(df[col].value_counts())
    percentate = np.round(df[col].value_counts(normalize = True)*100, 3)
    return([num, percentate])

In [96]:
# Get cases and controls
# cases_df = comb_pheno[comb_pheno["PTSDpm"] == '1']
# controls_df = comb_pheno[comb_pheno["PTSDpm"] == '0']

cols = ['Age', 'TraumaNum_y', 'w4_life_sumptsdworst_PTS_symptom_severity']
# dfs = [cases_df, controls_df]

for i in range(len(cols)):
    mean_sd = MeanSD(comb_pheno, cols[i])
    print("Mean and SD case:"+ cols[i], mean_sd)
    


Mean and SD case:Age (54.567, 12.793)
Mean and SD case:TraumaNum_y (6.757, 4.555)
Mean and SD case:w4_life_sumptsdworst_PTS_symptom_severity (43.91, 16.258)


In [97]:
# get number and percentage now
cols1 = ['Gender', 'race3cat']
for i in range(len(cols1)):
    freq = CountFrequency(df=comb_pheno,col= cols1[i])
    print("Number and percentage:"+ cols1[i], freq)


Number and percentage:Gender [array([126,  84], dtype=int64), 1   60.000000
2   40.000000
Name: Gender, dtype: float64]
Number and percentage:race3cat [array([197,   9,   4], dtype=int64), 2   93.810000
1    4.286000
3    1.905000
Name: race3cat, dtype: float64]


In [98]:
current_ptsd = np.array(comb_pheno["PTSDpm"].value_counts())
print("Controls: ", current_ptsd[0], ",", "Cases: ", current_ptsd[1] )

Controls:  194 , Cases:  16


In [273]:
comb_pheno[comb_pheno["PTSDpm"] == '0']

Unnamed: 0.1,Row.names,CD8T,CD4T,NK,Bcell,Mono,Neu,Gender,Wave_x,RESP_x,Sample_Name_x,w1c1_L1_21_drugs_alcohol,w1c1_L1_23_Divorce,w1c1_L1_24_Lost_Job,w1c1_L1_25_Emotionally_mistreated,w1c1_L1_27_Legal_problems,w1c1_L1_28_Unemployed,w1c1_L1_29_Financial_probs,w2c1_L1_21_drugs_alcohol,w2c1_L1_23_Divorce,w2c1_L1_24_Lost_Job,w2c1_L1_25_Emotionally_mistreated,w2c1_L1_27_Legal_problems,w2c1_L1_28_Unemployed,w2c1_L1_29_Financial_probs,w3_L1_21_drugs_alcohol,w3_L1_23_Divorce,w3_L1_24_Lost_Job,w3_L1_25_Emotionally_mistreated,w3_L1_27_Legal_problems,w3_L1_28_Unemployed,w3_L1_29_Financial_probs,w4_L1_21_drugs_alcohol,w4_L1_23_Divorce,w4_L1_24_Lost_Job,w4_L1_25_Emotionally_mistreated,w4_L1_27_Legal_problems,w4_L1_28_Unemployed,w4_L1_29_Financial_probs,w5_l1_21_drugs_alcohol,w5_l1_23_Divorce,w5_l1_24_Lost_Job,w5_l1_25_Emotionally_mistreated,w5_l1_27_Legal_problems,w5_l1_28_Unemployed,w5_l1_29_Financial_probs,w3_NN1_Loneliness_Scale1,w3_NN2_Loneliness_Scale2,w3_NN3_Loneliness_Scale3,w3_J1_Perceived_discrimination1,w3_J2_Perceived_discrimination2,w3_J3_Perceived_discrimination3,w3_J4_Perceived_discrimination4,w3_J5_Perceived_discrimination5,w3_J6_Perceived_discrimination6,w3_J7_Perceived_discrimination7,w3_J8_Perceived_discrimination8,w3_J9_Perceived_discrimination9,w1_educ_Participant,w2_educ_Participant,w3_educ_Participant,w4_educ_Participant,w5_educ_Participant,w3_U8B_Mothers_edu,w3_U8C_Fathers_edu,w1c1_stress2_drugs_alcohol,w1c1_stress4_Divorce,w1c1_stress5_Lost_Job,w1c1_stress6_Emotionally_mistreated,w1c1_stress8_Legal_problems,w1c1_stress9_Unemployed,w1c1_stress10_Financial_probs,w1c1_traumanum_x,w1c1_PTSDlife_x,w1c1_PTSDpy_x,w1c1_PTSDpm_x,w1c1_phq9cat_Depression_severity,w1c1_gad7cat_generalized_anx_symp_severity,w2c1_stress2_drugs_alcohol,w2c1_stress4_Divorce,w2c1_stress5_Lost_Job,w2c1_stress6_Emotionally_mistreated,w2c1_stress8_Legal_problems,w2c1_stress9_Unemployed,w2c1_stress10_Financial_probs,w2c1_traumanum_x,w2c1_pyphq9cat_Depression_severity,w2c1_gad7cat_py_generalized_anx_symp_severity,w2c1_PTSDlife_x,w2c1_PTSDpy_x,w2c1_PTSDpm_x,w3_stress2_drugs_alcohol,w3_stress4_Divorce,w3_stress5_Lost_Job,w3_stress6_Emotionally_mistreated,w3_stress8_Legal_problems,w3_stress9_Unemployed,w3_stress10_Financial_probs,w3_traumanum_x,w3_slphq9cat_Depression_severity,...,w2c1_r_worst_intrusion,w2c1_r_worst_avoidance,w2c1_r_worst_hyperarousal,w3_inc_sumptsdworst_PTS_symptom_severity,w3_inc_worst_intrusion,w3_inc_worst_avoidance,w3_inc_worst_hyperarousal,w3_r_sumptsdworst_PTS_symptom_severity,w3_r_worst_intrusion,w3_r_worst_avoidance,w3_r_worst_hyperarousal,w4_stress2_drugs_alcohol,w4_stress4_Divorce,w4_stress5_Lost_Job,w4_stress6_Emotionally_mistreated,w4_stress8_Legal_problems,w4_stress9_Unemployed,w4_stress10_Financial_probs,w4_traumanum_x,w4_slphq9cat_Depression_severity,w4_gad7cat_sl_generalized_anx_symp_severity,w4_PTSDlife_x,w4_PTSDsl_x,w4_PTSDpm_x,w4_inc_sumptsdworst_PTS_symptom_severity,w4_inc_worst_intrusion,w4_inc_worst_avoidance,w4_inc_worst_hyperarousal,w4_r_sumptsdworst_PTS_symptom_severity,w4_r_worst_intrusion,w4_r_worst_avoidance,w4_r_worst_hyperarousal,w2c1_life_sumptsdworst_PTS_symptom_severity,w3_life_sumptsdworst_PTS_symptom_severity,w4_life_sumptsdworst_PTS_symptom_severity,w5_stress2_drugs_alcohol,w5_stress4_Divorce,w5_stress5_Lost_Job,w5_stress6_Emotionally_mistreated,w5_stress8_Legal_problems,w5_stress9_Unemployed,w5_stress10_Financial_probs,w5_traumanum_x,w5_slphq9cat_Depression_severity,w5_gad7cat_sl_generalized_anx_symp_severity,w5_PTSDlife_x,w5_PTSDsl_x,w5_PTSDpm_x,w5_inc_sumptsdworst_PTS_symptom_severity,w5_inc_worst_intrusion,w5_inc_worst_avoidance,w5_inc_worst_hyperarousal,w5_r_sumptsdworst_PTS_symptom_severity,w5_r_worst_intrusion,w5_r_worst_avoidance,w5_r_worst_hyperarousal,w5_life_sumptsdworst_PTS_symptom_severity,TraumaNum_x,Unnamed: 0,RESP_y,Sample_Name_y,w1c1_bloodid,w2_bloodid,w4_bloodid,w5_bloodid,w1c1_CP1C_Days_smoking_past_30days,w2c1_CP1C_Days_smoking_past_30days,w3_CP1C_Days_smoking_past_30days,w4_CP1C_Days_smoking_past_30days,w5_cp1c_Days_smoking_past_30days,w1c1_traumanum_y,w1c1_PTSDlife_y,w1c1_PTSDpy_y,w1c1_PTSDpm_y,w2c1_traumanum_y,w2c1_PTSDlife_y,w2c1_PTSDpy_y,w2c1_PTSDpm_y,w3_traumanum_y,w3_PTSDlife_y,w3_PTSDsl_y,w3_PTSDpm_y,w4_traumanum_y,w4_PTSDlife_y,w4_PTSDsl_y,w4_PTSDpm_y,w5_traumanum_y,w5_PTSDlife_y,w5_PTSDsl_y,w5_PTSDpm_y,Wave_y,race6cat,race3cat,Age,PTSDLife,PTSDpy,PTSDpm,Days_Smoking_Past_30_days,TraumaNum_y,Remitted
0,203257030261_R02C01,0.13129687823578698,0.249403596696435,0.0775285400419206,0.10509212898775401,0.0920457255192543,0.344633130518849,1,W2,65,2009010455,1,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,8,2,2,2,1,2,1,2,4,2,4,4,4,4,3,4,4,5,5,5,5,6,1,1,1,0,0,0,0,0,1,6,0,0,0,3,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,2,...,10,11,6,42,14,15,10,17,16,7,10,1,0,0,0,0,0,0,1,2,1,0,0,0,40,13,16,11,41,12,18,11,35,35,41,0,0,0,0,0,0,1,0,2,2,0,0,0,22,6,9,5,37,7,16,14,41,6,9,65,2009010455,,2009010455,2011010264,,0,,,,,6,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,,,,W2,2,2,72,0,0,0,,6,0
1,203257030261_R05C01,0.0948840251364565,0.180723854163022,0.0547057377511308,0.0724766690427491,0.075012267962608,0.522197445944034,2,W2,12629,2009010236,2,2,2,2,2,2,1,2,2,2,2,2,1,1,2,2,2,2,1,1,1,2,2,2,2,2,1,1,2,2,8,2,2,2,1,2,1,2,3,3,4,2,2,2,2,2,2,7,8,8,8,8,2,2,0,0,0,0,0,0,1,0,0,0,0,2,3,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,1,1,1,1,1,...,7,30,10,43,15,17,11,17,5,7,5,0,0,0,0,0,1,1,2,1,1,1,0,0,21,5,9,7,17,5,7,5,31,43,43,0,0,0,0,0,0,1,0,2,3,1,0,0,18,6,7,5,22,6,8,9,43,1,395,12629,2009010236,23115600,2009010236,2011010017,,1,5,30,25,,0,0,0,0,1,0,0,0,1,1,1,1,2,1,0,0,0,,,,W2,2,2,34,0,0,0,5,1,0
2,203497730130_R01C01,0.0472621766058546,0.24846466789875302,0.12465600726676102,0.11159532710435001,0.0888983025184511,0.37912351860582894,2,W1,12218,23112625,2,2,1,2,2,2,1,2,2,2,1,2,2,1,2,2,2,2,2,1,1,2,2,2,2,2,2,1,2,2,2,2,2,1,1,3,2,1,4,4,2,4,4,4,4,4,4,3,3,3,3,3,1,1,0,0,1,0,0,0,1,4,0,0,0,3,1,0,0,0,1,0,0,1,3,3,1,0,0,0,0,0,0,0,0,1,1,0,3,...,7,9,9,22,7,7,5,27,9,11,7,0,0,0,0,0,0,1,1,3,1,0,0,0,29,11,10,8,39,15,11,12,31,31,31,0,0,0,0,0,1,1,2,5,3,0,0,0,50,15,20,15,38,7,14,17,50,4,328,12218,23112625,23112625,2009010296,2011010148,2013010030,,,,,,4,0,0,0,3,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,W1,2,2,50,0,0,0,,4,0
3,203497730130_R02C01,0.12795063662694697,0.157801717029453,0.0551120979066981,0.0199784748402415,0.0984055447655586,0.540751528831102,2,W2,10792,2009010075,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,8,2,2,2,1,1,1,1,4,4,4,4,4,4,4,4,4,3,3,3,3,5,98,98,0,0,0,0,0,0,0,1,0,0,0,2,2,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0,1,...,8,15,6,70,22,28,19,25,16,7,10,0,0,0,0,0,0,0,1,2,1,0,0,0,20,6,8,6,41,12,16,11,29,29,29,0,0,0,0,0,0,1,0,2,2,0,0,0,39,6,13,5,35,7,17,11,29,1,142,10792,2009010075,23005400,2009010075,2011010293,,,,,,,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,,,,W2,2,2,61,0,0,0,,1,0
4,203497730130_R08C01,0.19295875319298897,0.220010224001117,0.10474324909979699,0.0817510015519277,0.0841601634065853,0.316376608747584,1,W2,11963,2009010346,2,1,2,2,2,2,1,2,2,2,2,2,2,1,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,1,1,4,4,4,4,4,4,4,4,4,7,7,7,7,7,2,2,0,1,0,0,0,0,1,3,0,0,0,1,3,0,0,0,0,0,0,1,0,2,1,0,0,0,0,0,0,0,0,0,1,1,2,...,5,7,6,19,6,8,5,58,16,26,6,0,0,0,0,0,0,0,2,2,2,0,0,0,23,6,10,7,17,5,7,5,18,19,23,0,0,0,0,0,0,1,1,1,3,0,0,0,19,7,7,5,17,5,7,5,23,3,303,11963,2009010346,,2009010346,2011010129,2013010215,0,,,,,3,0,0,0,0,0,0,0,1,0,0,0,2,0,0,0,1,0,0,0,W2,2,2,62,0,0,0,,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205,203516050050_R07C01,0.06741230778356,0.162028358805307,0.0330584215595807,0.0424825448199496,0.0946198886902369,0.6003984783413651,2,W2,11324,2009010124,2,2,1,2,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,1,1,1,2,4,2,4,4,1,4,4,4,6,6,6,6,6,2,2,0,0,1,0,1,1,1,10,0,0,0,2,2,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,2,1,...,5,7,5,21,7,9,5,21,16,7,6,0,0,0,0,0,0,0,0,1,1,0,0,0,22,13,8,9,22,10,7,5,32,32,32,0,0,0,0,0,0,0,1,1,2,0,0,0,20,8,7,5,21,6,7,5,32,10,201,11324,2009010124,23005675,2009010124,2011010136,2013010221,30,30,30,30,6,10,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,1,0,0,0,W2,2,2,48,0,0,0,30,10,0
206,203516050050_R08C01,0.10977840566482701,0.14427525342235,0.0737649714307238,0.14359624776665803,0.158119688336845,0.370465433378596,2,W1,451,23003675,1,2,2,2,2,1,2,2,2,2,2,2,1,2,1,2,2,2,2,2,2,2,2,2,1,2,1,1,2,2,2,2,2,2,1,2,2,2,4,2,4,1,2,4,2,4,4,5,5,7,7,8,2,2,1,0,0,0,0,1,0,4,0,0,0,3,3,0,0,0,0,0,1,0,0,3,1,1,1,0,1,0,0,0,0,0,0,2,5,...,8,23,13,79,23,31,25,41,16,16,6,0,0,0,1,0,1,1,0,5,3,1,0,0,25,6,8,25,62,8,29,25,44,79,79,0,0,0,0,0,0,1,0,2,3,1,0,0,48,7,23,16,18,7,8,5,79,4,78,451,23003675,23003675,2009010051,2011010216,,30,30,30,30,,4,0,0,0,0,1,1,0,2,1,1,1,0,1,0,0,0,,,,W1,2,2,47,0,0,0,30,4,0
207,203516050051_R01C01,0.0815181132099262,0.227112597958351,0.0145049270311488,0.0325731362759206,0.0923486524628137,0.5519425730618389,1,W1,11655,23112375,2,1,1,1,1,1,2,2,2,2,2,2,1,2,2,2,2,2,2,2,2,2,2,2,2,2,1,2,2,2,1,1,2,2,2,1,1,1,4,3,4,3,3,3,4,4,4,7,8,8,8,8,5,4,0,1,1,1,1,1,0,6,0,0,0,4,2,0,0,0,0,0,1,0,2,4,1,0,0,0,0,0,0,0,0,0,0,0,2,...,11,31,10,20,6,7,6,47,7,16,5,0,0,0,0,0,1,0,2,1,2,0,0,0,20,6,9,5,17,15,7,5,23,23,23,0,0,1,1,0,0,0,1,1,2,0,0,0,22,6,9,7,22,5,8,9,23,6,251,11655,23112375,23112375,2009010175,2011010443,2013010229,,,,,,6,0,0,0,2,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,W1,5,1,52,0,0,0,,6,0
208,203516050051_R03C01,0.11428384611521801,0.17011105760512601,0.0934730260625396,0.11932898558876598,0.0495768311706184,0.45322625345773204,1,W2,11842,2009010213,1,2,2,2,2,2,2,1,2,2,2,2,2,2,1,2,2,2,2,2,1,1,2,2,2,2,1,1,2,2,8,2,2,2,1,1,1,1,4,4,4,4,4,4,2,4,4,5,5,5,5,5,2,98,1,0,0,0,0,0,0,3,0,0,0,3,1,1,0,0,0,0,0,0,0,3,1,0,0,0,1,0,0,0,0,0,1,1,2,...,9,11,10,34,12,9,13,18,5,7,10,1,0,0,0,0,1,1,0,2,1,0,0,0,28,9,7,25,22,8,9,5,50,50,50,0,0,0,0,0,0,1,0,2,2,0,0,0,23,7,9,5,35,7,13,13,50,3,289,11842,2009010213,,2009010213,2011010297,,30,30,30,30,,3,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,,,,W2,2,2,32,0,0,0,30,3,0


In [100]:
# Now find the number of participants with more than one sample
len(comb_pheno['RESP_x'].unique())

148

In [101]:
comb_pheno.shape

(210, 213)

In [104]:
numb = comb_pheno['RESP_x'].value_counts()

In [105]:
numb.value_counts()

1    86
2    62
Name: RESP_x, dtype: int64

In [113]:
# percentage of participants with one time point
numb.value_counts()[1]/210

0.4095238095238095

In [112]:
numb.value_counts()[2]/210

0.29523809523809524