In [1]:
import pandas as pd
import pickle
import yaml
import os
import sys

In [2]:
# Local packages
sys.path.append('../src')
from ninetynine import ninetynine

In [3]:
# Load the raw export.
# The data root directory comes from the 'PATH' key of the shared paths config.
with open('../config/paths.yaml','r') as config_handle:
    paths_list = yaml.safe_load(config_handle)
PATH = os.path.abspath(paths_list['PATH'])

# NOTE: despite its name, data_path is a DataFrame (the parsed CSV), not a filesystem path.
raw_csv_file = os.path.join(PATH,'raw','LW_Data_64e_wBL.csv')
data_path = pd.read_csv(raw_csv_file)
data_path.head()

Unnamed: 0,SAMPLE_NUMBER,STUDY_NO,CLASS_CODE,CLASS_DESCRIPTOR,SUBJECT_ID,SEX,DOB,SPECIES_STRAIN,COUNTRY_ORIGIN,STUDY_DAY,SAMPLED_DATE,ANALYSIS,NAME,ENTRY
0,737066,SARS-COV-2-NHP-064E-2,0,Mock,B03942,M,06/15/2016 12:00:00 AM,CYNO,Cambodia,-7,06/09/2020 12:00:00 AM,CBC_PROCYTEDX,White Blood Cell,12.28
1,737066,SARS-COV-2-NHP-064E-2,0,Mock,B03942,M,06/15/2016 12:00:00 AM,CYNO,Cambodia,-7,06/09/2020 12:00:00 AM,CBC_PROCYTEDX,Red Blood Cell,5.1
2,737066,SARS-COV-2-NHP-064E-2,0,Mock,B03942,M,06/15/2016 12:00:00 AM,CYNO,Cambodia,-7,06/09/2020 12:00:00 AM,CBC_PROCYTEDX,Hemoglobin,12.2
3,737066,SARS-COV-2-NHP-064E-2,0,Mock,B03942,M,06/15/2016 12:00:00 AM,CYNO,Cambodia,-7,06/09/2020 12:00:00 AM,CBC_PROCYTEDX,Hematocrit,37.7
4,737066,SARS-COV-2-NHP-064E-2,0,Mock,B03942,M,06/15/2016 12:00:00 AM,CYNO,Cambodia,-7,06/09/2020 12:00:00 AM,CBC_PROCYTEDX,Mean Corpuscular Volume,73.9


In [4]:
# Views of the raw data
#data_path.astype('object').describe()
#len(data_path[data_path.NAME == 'C-Reactive Protein'])
#data_path.loc[data_path.NAME == 'C-Reactive Protein',['SAMPLE_NUMBER','SUBJECT_ID','STUDY_DAY','STUDY_NO']]
#data_path.loc[data_path.NAME == 'ICT',['SAMPLE_NUMBER','SUBJECT_ID','STUDY_DAY','STUDY_NO','ENTRY']].astype(object).describe()
#data_path.info()
#data_path.loc[data_path['SUBJECT_ID'] == 'B03757','NAME']
#data_path.loc[(data_path['SUBJECT_ID'] == 'G21E') & (data_path['NAME'] == 'Hemoglobin')]

Some subjects have errors in their HOLDING-protocol entries (StudyDay does not match SAMPLED_DATE).\
However, some subjects need this data (e.g. B03757).\
The solution is to change STUDY_NO to something other than HOLDING for the specific entries we need to keep.

### Rename columns and values

In [5]:
# Bring column names and class labels in line with the previous convention
column_renames = {'SUBJECT_ID':'Subject','CLASS_DESCRIPTOR':'Class','STUDY_DAY':'StudyDay'}
data_path = data_path.rename(columns = column_renames)
is_virus_class = data_path['Class'] == 'Virus'
data_path.loc[is_virus_class,'Class'] = 'Infected'

# Analyte names: every space becomes '_' and every slash becomes 'v',
# because spaces and slashes are awkward to work with later on
data_path['NAME'] = (
    data_path['NAME']
    .replace(' ','_',regex=True)
    .replace('/','v',regex=True)
)

# Age in whole days on the day the sample was taken
sampled_on = pd.to_datetime(data_path.SAMPLED_DATE)
born_on = pd.to_datetime(data_path.DOB)
data_path['Age_d'] = (sampled_on - born_on).dt.days

### Remove error values

In [6]:
# Some readings are stored as text flags instead of numbers.
# Confirmed with SB that replacing the flags below with 0 is reasonable.
below_limit_flags = {
    'C-Reactive_Protein': '<0.1',
    'Total_Bilirubin': '<0.1',
    'Alanine_Aminotransferase': '<10',
    'Triglyceride': '<10',
}
for analyte, flag in below_limit_flags.items():
    data_path.loc[(data_path['NAME'] == analyte) & (data_path['ENTRY'] == flag),'ENTRY'] = 0
# No value could be measured for these samples, so record them as missing
data_path.loc[(data_path['NAME'] == 'Creatinine_Kinase') & (data_path['ENTRY'] == 'Not enough sample'),'ENTRY'] = None

# Remove all values from the HOLDING protocol
    # Some subjects have errors on these entries (StudyDay does not match SAMPLED_DATE)
    # However some subjects need this data (ex. B03757)
    # Solution is to change STUDY_NO to something other than HOLDING in the specific entries we need to keep
# For specific subjects, save them by assigning a new study protocol name
data_path.loc[(data_path['Subject'] == 'B03757') & (data_path['STUDY_NO'] == 'HOLDING'),'STUDY_NO'] = 'SAVE_WC'
# Remove remaining values on HOLDING protocol
data_path = data_path.loc[data_path['STUDY_NO'] != 'HOLDING']

# Remove additional timepoints, confirmed that these three subjects have other baseline values to pull from
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the unfiltered data (avoids SettingWithCopyWarning)
data_path = data_path.loc[~data_path['StudyDay'].isin([-48,-49,10,12,19,30])].copy()

# Convert days to string (for delta timepoints)
data_path['StudyDay'] = data_path['StudyDay'].astype(str)

# Convert data values to numeric, will error if there are non-numeric data values
data_path['ENTRY'] = pd.to_numeric(data_path['ENTRY'])

### Reshape table

In [7]:
# Identifier columns that define one row of the reshaped table.
# ANALYSIS and SAMPLE_NUMBER are left out on purpose so that their rows merge;
# DOB, SAMPLED_DATE, CLASS_CODE and SPECIES_STRAIN are left out for simplicity.
id_cols_path = [
    'STUDY_NO',
    'Class',
    'Subject',
    'SEX',
    'COUNTRY_ORIGIN',
    'StudyDay',
    'Age_d',
]

In [8]:
def _distinct_count(values):
    """Return how many distinct values a group of entries contains."""
    return len(values.unique())

# Dry run of the pivot: aggregate with the number of distinct entries per cell
# instead of the mean. Any cell above 1 means the real pivot would average
# different values together (all values should be 1).
test = data_path.pivot_table(index=id_cols_path,columns='NAME',values='ENTRY',aggfunc=_distinct_count)
var_cols_path = list(test.columns) # save for later
# Flag any cell holding more than one distinct value
has_merged_cells = (test > 1).any().any()
ninetynine(has_merged_cells,'merging values across rows')

FALSE: I've got 99 problems, but merging values across rows is not one


In [9]:
# Reshape from long to wide: one row per identifier combination and one column
# per analyte (pivot_table aggregates with its default function), then turn the
# identifier index levels back into ordinary columns
data_path = (
    data_path
    .pivot_table(index=id_cols_path,columns='NAME',values='ENTRY')
    .reset_index()
)
data_path.head()

NAME,STUDY_NO,Class,Subject,SEX,COUNTRY_ORIGIN,StudyDay,Age_d,Abolute_Neutrophil,Absolute_Basophil,Absolute_Eosinophil,...,Percent_Reticulocyte,Platelet,Platelet_Distribution_Width,Platelet_Large_Cell_-_Ratio,Plateletcrit,Red_Blood_Cell,Red_Cell_Distrubtion_Width_-_Coefficient,Red_Cell_Distrubtion_Width_-_Standard_De,Total_Protein,White_Blood_Cell_
0,SARS-COV-2-NHP-064E-1,Infected,B03757,F,Cambodia,2,1602,3.24,0.0,0.0,...,0.52,320.0,12.6,27.8,0.44,4.82,14.1,35.2,6.8,4.95
1,SARS-COV-2-NHP-064E-1,Infected,B03757,F,Cambodia,4,1604,3.21,0.0,0.03,...,0.43,358.0,14.0,34.9,0.5,5.32,14.3,35.6,6.4,6.11
2,SARS-COV-2-NHP-064E-1,Infected,B03757,F,Cambodia,6,1606,3.96,0.01,0.04,...,0.16,373.0,13.7,32.2,0.51,5.19,14.6,36.0,6.6,6.81
3,SARS-COV-2-NHP-064E-1,Infected,B03955,M,Cambodia,-11,1561,2.75,0.01,0.09,...,0.81,344.0,9.2,11.8,0.41,5.48,12.8,33.8,7.0,5.96
4,SARS-COV-2-NHP-064E-1,Infected,B03955,M,Cambodia,2,1574,1.23,0.0,0.03,...,0.42,280.0,9.8,15.3,0.33,5.26,12.4,33.3,6.7,3.37


In [10]:
# After reshaping, each (Subject, StudyDay) pair should appear in only one row
is_repeated_row = data_path.duplicated(subset=['Subject','StudyDay'])
ninetynine(bool(is_repeated_row.any()),'duplicate values')

FALSE: I've got 99 problems, but duplicate values is not one


### Calculate delta terms

In [11]:
# Move StudyDay into the columns: rows are subjects, columns are (variable, StudyDay)
data_path = data_path.set_index(['Subject','StudyDay']).unstack()

pre_exposure_days = ['-11','-7']
post_exposure_days = ['2','4','6','8']

for var in var_cols_path:
    # Collapse pre-exposure values (should have one per subject)
    baseline = data_path.loc[:,(var,pre_exposure_days)].mean(axis = 1, skipna = True)
    data_path.loc[:,(var,'pre')] = baseline

    # Change from pre-exposure; the 'pre' delta itself should be all 0
    data_path.loc[:,(var,'pre_delta')] = baseline - baseline
    for day in post_exposure_days:
        data_path.loc[:,(var,day + '_delta')] = data_path.loc[:,(var,day)] - baseline

In [12]:
# Copy the class and age from the original time points to the derived
# ('pre' and '*_delta') time points.
# NOTE(review): only Class and Age_d are propagated in this cell; the other
# identifier columns are not filled for the derived time points - confirm intended.

# stack and unstack so that missing columns fill in with null
data_path = data_path.stack().unstack()

# Copy class to other time points
time_points = data_path.loc[:,'Class'].columns.to_list()
# back-fill and forward-fill in case the column order changes
# (.ffill()/.bfill() replace the deprecated fillna(method=...) form)
class_by_day = data_path.loc[:,('Class',time_points)]
data_path.loc[:,('Class',time_points)] = class_by_day.ffill(axis=1).bfill(axis=1)

# Copy age to delta time point
# limit=1 carries each measured day's age into the single column that follows it
# NOTE(review): assumes each '<day>_delta' column sits directly after its '<day>' column - confirm
tmp_days = ['2','2_delta','4','4_delta','6','6_delta','8','8_delta']
data_path.loc[:,('Age_d',tmp_days)] = data_path.loc[:,('Age_d',tmp_days)].ffill(axis=1,limit=1)

# The collapsed pre-exposure point and its delta both use the mean pre-exposure age
pre_exposure_age = data_path.loc[:,('Age_d',['-7','-11'])].mean(axis=1)
data_path.loc[:,('Age_d','pre')] = pre_exposure_age
data_path.loc[:,('Age_d','pre_delta')] = pre_exposure_age

In [13]:
# Back to long-by-day layout for readability: the innermost column level
# (StudyDay) moves into the row index
data_path = data_path.stack(level=-1)

### Save tables

In [14]:
# Save full table (no error values)
full_table_file = os.path.join(PATH,'tables','data_path.csv')
data_path.to_csv(full_table_file)

# Save simplified table (only delta timepoints): drop the measured days,
# the collapsed 'pre' point and the 'pre_delta' point
non_delta_days = ['-11','-7','2','4','6','8','pre','pre_delta']
flat_table = data_path.reset_index()
is_non_delta = flat_table['StudyDay'].isin(non_delta_days)
data_path_delta = flat_table.drop(flat_table[is_non_delta].index)
delta_table_file = os.path.join(PATH,'tables','data_path_delta.csv')
data_path_delta.set_index(['Subject','StudyDay']).to_csv(delta_table_file)

### Exclude variables
Variables are excluded either because they have missing values or as a result of domain-specific feature screening.

In [15]:
# Uncomment to view variables to be excluded
#data_path[['C-Reactive_Protein','Class']].groupby(['StudyDay','Class']).describe()
#data_path[['Creatinine_Kinase','Class']].groupby(['StudyDay','Class']).describe()
#data_path[['ICT','Class']].groupby(['StudyDay','Class']).describe()

In [16]:
# Variables with missing values are taken out of the variable list
for var in (
    'C-Reactive_Protein',   # Missing from most subjects
    'Creatinine_Kinase',    # Confirmed missing G57L day 6 from raw
    'ICT',
):
    var_cols_path.remove(var)

In [17]:
# Variables removed by recommendation of TC
var_cols_exclude = [
    'Percent_Basophil', 'Percent_Eosinophil', 'Percent_Lymphocyte',
    'Percent_Monocyte', 'Percent_Neutrophil', 'Percent_Reticulocyte',
]

var_cols_warning = [
    'Mean_Platelet_Volume_', 'Platelet_Distribution_Width',
    'Platelet_Large_Cell_-_Ratio', 'Globulin', 'HEM', 'LIP',
]

# Both groups are removed from the variable list, the excluded group first
for var in var_cols_exclude + var_cols_warning:
    var_cols_path.remove(var)

In [18]:
# Write the status message; change it whenever this part of the analysis changes
status='CoopExcl'
status_file = os.path.join('..','config','analysis_status','q_var_path.txt')
with open(status_file,'w') as status_handle:
    status_handle.write(status)

In [19]:
# Persist the column lists as one pickled two-item list:
# [identifier columns, remaining variable columns]
saved_lists = [id_cols_path,var_cols_path]
with open('../config/lists_path.pkl', 'wb') as pickle_handle:
    pickle.dump(saved_lists, pickle_handle)

In [20]:
# Number of variables kept after the exclusions, followed by the list itself
n_vars_kept = len(var_cols_path)
print(n_vars_kept)
var_cols_path

25


['Abolute_Neutrophil',
 'Absolute_Basophil',
 'Absolute_Eosinophil',
 'Absolute_Lymphocyte',
 'Absolute_Monocyte',
 'Absolute_Reticulocyte',
 'Alanine_Aminotransferase',
 'Albumin',
 'AlbuminvGlobulin_Ratio',
 'Aspartate_Aminotransferase',
 'Blood_Urea_Nitrogen',
 'Creatinine_',
 'Hematocrit',
 'Hemoglobin',
 'Lactate_Dehydrogenase',
 'Mean_Corpuscular_Hemoglobin',
 'Mean_Corpuscular_Hemoglobin_Concentraion',
 'Mean_Corpuscular_Volume',
 'Platelet',
 'Plateletcrit',
 'Red_Blood_Cell',
 'Red_Cell_Distrubtion_Width_-_Coefficient',
 'Red_Cell_Distrubtion_Width_-_Standard_De',
 'Total_Protein',
 'White_Blood_Cell_']

### Useful views of the data

In [21]:
# Useful views of the data for troubleshooting error values

#id_cols_path = ['SAMPLE_NUMBER','STUDY_NO','CLASS_CODE','Class','Subject','SEX','DOB','SPECIES_STRAIN','COUNTRY_ORIGIN','StudyDay','SAMPLED_DATE'
#data_path.iloc[5354]
#data_path.NAME.unique()
#data_path.loc[(data_path.NAME == 'C-Reactive Protein') & (data_path.ENTRY == '<0.1'),'ENTRY'] = 0
#data_path.loc[data_path.NAME == 'Creatinine Kinase','ENTRY'].unique()
#data_path.describe()
#data_path.loc[data_path.Subject == 'B03757',['STUDY_NO','CLASS_CODE','Class','Subject','SEX','DOB','SPECIES_STRAIN','COUNTRY_ORIGIN','StudyDay','SAMPLED_DATE','ANALYSIS']]
#data_path.loc[data_path.Subject == 'H54R',id_cols_path]
#data_path.StudyDay.unique()

#len(data_path.loc[(data_path.Subject == 'B03757') & (data_path.ANALYSIS == 'CHEM_CATALYSTONE')].dropna(how='all',axis=1).columns)
#len(data_path.loc[(data_path.Subject == 'B03757') & (data_path.ANALYSIS == 'CBC_PROCYTEDX')].dropna(how='all',axis=1).columns)
#len(data_path.loc[data_path.Subject == 'B03757'].dropna(how='all',axis=1).columns)
#len(data_path.loc[data_path.Subject == 'B03757'].columns)
#all_col = data_path.columns.to_list().remove(id_cols_path)
#all_sanNA = data_path.loc[data_path.Subject == 'B03757'].dropna(how='all',axis=1).columns

#data_path.describe().loc['count',:]
#data_path.info()

#data_path.groupby(['StudyDay','Class']).agg(['count','mean','std','min','max'])

#data_path.describe()

#data_path.reset_index().loc[data_path.reset_index().Subject == 'G57L',['StudyDay','Creatinine_Kinase']]

#data_path.loc[(data_path.Class == 'Mock') & (data_path.StudyDay == '2')]

#data_path.reset_index().loc[data_path.reset_index().Class == 'Mock','Subject'].unique()
# data_path[['Creatinine_']].loc['H89K']

# Summary statistics for each StudyDay / Class / SEX / COUNTRY_ORIGIN group
summary_keys = ['StudyDay','Class','SEX','COUNTRY_ORIGIN']
summary_stats = ['count','mean','std','min','max']
data_path.groupby(summary_keys).agg(summary_stats)

# data_path.reset_index().loc[~data_path.reset_index().Subject.isin(['H56N', 'H89K','B03942','G21D'])].groupby(['StudyDay','Class','SEX','COUNTRY_ORIGIN']).agg(['count','mean','std','min','max'])

#data_path.reset_index().loc[(data_path.reset_index().SEX == 'F') & (data_path.reset_index().Class == 'Mock'),'Subject'].unique()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Abolute_Neutrophil,Abolute_Neutrophil,Abolute_Neutrophil,Abolute_Neutrophil,Abolute_Neutrophil,Absolute_Basophil,Absolute_Basophil,Absolute_Basophil,Absolute_Basophil,Absolute_Basophil,...,Total_Protein,Total_Protein,Total_Protein,Total_Protein,Total_Protein,White_Blood_Cell_,White_Blood_Cell_,White_Blood_Cell_,White_Blood_Cell_,White_Blood_Cell_
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,count,mean,std,min,max,count,mean,std,min,max,...,count,mean,std,min,max,count,mean,std,min,max
StudyDay,Class,SEX,COUNTRY_ORIGIN,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
-11,Infected,F,Cambodia,1,6.14,,6.14,6.14,1,0.02,,0.02,0.02,...,1,7.1,,7.1,7.1,1,8.76,,8.76,8.76
-11,Infected,M,Cambodia,2,3.295,0.770746,2.75,3.84,2,0.005,0.007071,0.0,0.01,...,2,6.95,0.070711,6.9,7.0,2,8.005,2.892067,5.96,10.05
-11,Mock,F,Cambodia,2,2.315,0.968736,1.63,3.0,2,0.0,0.0,0.0,0.0,...,2,6.65,0.212132,6.5,6.8,2,9.35,2.206173,7.79,10.91
-11,Mock,M,Cambodia,1,5.43,,5.43,5.43,1,0.0,,0.0,0.0,...,1,6.7,,6.7,6.7,1,10.7,,10.7,10.7
-7,Infected,F,Cambodia,2,6.105,3.203194,3.84,8.37,2,0.02,0.014142,0.01,0.03,...,2,6.6,0.0,6.6,6.6,2,10.265,6.526596,5.65,14.88
-7,Infected,F,nil,3,3.476667,1.259616,2.7,4.93,3,0.003333,0.005774,0.0,0.01,...,3,7.0,0.173205,6.8,7.1,3,8.443333,0.170978,8.33,8.64
-7,Infected,M,Cambodia,1,2.22,,2.22,2.22,1,0.01,,0.01,0.01,...,1,6.5,,6.5,6.5,1,9.16,,9.16,9.16
-7,Infected,M,nil,3,4.403333,1.659558,2.91,6.19,3,0.003333,0.005774,0.0,0.01,...,3,7.133333,0.251661,6.9,7.4,3,8.176667,1.622385,6.35,9.45
-7,Mock,F,Cambodia,2,4.745,2.948635,2.66,6.83,2,0.0,0.0,0.0,0.0,...,2,6.45,0.212132,6.3,6.6,2,10.045,2.679935,8.15,11.94
-7,Mock,F,nil,4,3.9925,0.650762,3.15,4.73,4,0.005,0.01,0.0,0.02,...,4,7.35,0.208167,7.1,7.6,4,6.1675,1.14468,5.02,7.71


### Notes
* Columns removed because they only contain data from >20 days pre-infection (and for only 3 subjects): 'Adjusted Ca', 'Calcium', 'Gamma-glutamyltransferase', 'Potassium', 'Sodium', 'Total Bilirubin', 'Triglyceride'
* C-Reactive Protein was only collected for 64E-1, which is why there are only 47 entries for this modality
