# Data Parsing and Collection

In [1]:
import pandas as pd
import numpy as np

### Flow Data Processing

In [2]:
# Read the tab-separated flow-cytometry workspace metadata and keep only the
# columns used by the rest of the notebook.
wsp_metadata_path = 'Metadata/SDY404-DR54_Subject_2_Flow_cytometry_workspace.txt'
wsp_columns = [
    'Subject Accession',
    'Subject Phenotype',
    'Planned Visit Accession',
    'File Detail',
    'File Name',
]
WSP = pd.read_csv(wsp_metadata_path, sep="\t")[wsp_columns]
WSP.head()

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,File Detail,File Name
0,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry workspace,Panel_L1_Run_1.804538.wsp
1,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry workspace,Panel_P2_Run_1-regated.804546.wsp
2,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry workspace,Panel_P13_Run_1-regated.804545.wsp
3,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry workspace,Panel_P9-P10_Run_1.804544.wsp
4,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry workspace,Panel P1.804543.wsp


In [3]:
# Read the tab-separated flow-cytometry result (FCS) metadata and keep only the
# columns used by the rest of the notebook.
fcs_metadata_path = 'Metadata/SDY404-DR54_Subject_2_Flow_cytometry_result.txt'
fcs_columns = [
    'Subject Accession',
    'Subject Phenotype',
    'Planned Visit Accession',
    'File Detail',
    'File Name',
]
FCS = pd.read_csv(fcs_metadata_path, sep="\t")[fcs_columns]
FCS.head()

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,File Detail,File Name
0,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry result,Panel_P13_A10_P13_110191_PBMC_10112011_A10.578...
1,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry result,Panel_P9_A1_P9_110191_PBMC_10112011_A01.578824...
2,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry result,Panel_P2_A7_P2_110191_PBMC_10112011_A07.579100...
3,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry result,Panel_L2_A4_L2_110191_PBMC_10112011_A04.579270...
4,SUB120417,"Older adult, not-frail",PV3222,Flow cytometry result,Panel_L1_A1_L1_110191_PBMC_10112011_A01.579376...


#### DESIRABLE PARAMETERS:
**Subject Phenotype:** Younger Adult \
**Planned Visit Accession:** PV3222 (Pre-Vaccination Point) \
**Panel:** L1

In [4]:
# Keep workspace rows for younger adults at visit PV3222 (the pre-vaccination
# point per the parameters listed above).
wsp_young_baseline = (
    (WSP['Subject Phenotype'] == 'Younger adult')
    & (WSP['Planned Visit Accession'] == 'PV3222')
)
WSP = WSP[wsp_young_baseline]
# Applied after the phenotype/visit filter, so only the remaining rows are
# pattern-matched against 'L1'.
WSP = WSP[WSP['File Name'].str.contains('L1')]
display(WSP)

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,File Detail,File Name
111,SUB120420,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_2.804556.wsp
219,SUB120423,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_1.804538.wsp
777,SUB120445,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_4.804574.wsp
813,SUB120446,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_5.804583.wsp
849,SUB120449,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_4.804574.wsp
876,SUB120450,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_5.804583.wsp
912,SUB120451,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_5.804583.wsp
939,SUB120452,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_5.804583.wsp
974,SUB120457,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_5.804583.wsp
1010,SUB120458,Younger adult,PV3222,Flow cytometry workspace,Panel_L1_Run_5.804583.wsp


In [5]:
# Keep FCS rows for younger adults at visit PV3222 (the pre-vaccination point
# per the parameters listed above).
fcs_young_baseline = (
    FCS['Subject Phenotype'].eq('Younger adult')
    & FCS['Planned Visit Accession'].eq('PV3222')
)
FCS = FCS[fcs_young_baseline]
# Applied after the phenotype/visit filter, so only the remaining rows are
# pattern-matched against 'L1'.
FCS = FCS[FCS['File Name'].str.contains('L1')]
display(FCS)
FCS.shape

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,File Detail,File Name
129,SUB120420,Younger adult,PV3222,Flow cytometry result,Panel_L1_A1_L1_110194_PBMC_10112011_A01.580227...
245,SUB120423,Younger adult,PV3222,Flow cytometry result,Panel_L1_A2_L1_110197_PBMC_10112011_A02.579447...
863,SUB120445,Younger adult,PV3222,Flow cytometry result,Panel_L1_A3_L1_110243_PBMC_10172011_A03.578999...
905,SUB120446,Younger adult,PV3222,Flow cytometry result,Panel_L1_A1_L1_110244_PBMC_10172011_A01.579604...
950,SUB120449,Younger adult,PV3222,Flow cytometry result,Panel_L1_E3_L1_110247_PBMC_10172011_E03.580335...
973,SUB120450,Younger adult,PV3222,Flow cytometry result,Panel_L1_E1_L1_110248_PBMC_10172011_E01.578572...
1017,SUB120451,Younger adult,PV3222,Flow cytometry result,Panel_L1_E3_L1_110249_PBMC_10172011_E03.579792...
1046,SUB120452,Younger adult,PV3222,Flow cytometry result,Panel_L1_A2_L1_110250_PBMC_10172011_A02.579307...
1094,SUB120457,Younger adult,PV3222,Flow cytometry result,Panel_L1_E2_L1_110255_PBMC_10172011_E02.580138...
1133,SUB120458,Younger adult,PV3222,Flow cytometry result,Panel_L1_A3_L1_110256_PBMC_10172011_A03.579670...


(21, 5)

### Getting Files to Download

In [6]:
# The panel we are looking for is Panel L1: list each distinct workspace file
# name left in the filtered WSP table.
wsp_file_names = WSP['File Name']
wsps = wsp_file_names.unique()
display(wsps)

array(['Panel_L1_Run_2.804556.wsp', 'Panel_L1_Run_1.804538.wsp',
       'Panel_L1_Run_4.804574.wsp', 'Panel_L1_Run_5.804583.wsp',
       'Panel_L1_Run_6.804592.wsp', 'Panel_L1_Run_7-regated.804601.wsp',
       'Panel_L1_Run_8-regated.804610.wsp'], dtype=object)

In [7]:
# FCS files: list each distinct file name left in the filtered FCS table.
fcs_file_names = FCS['File Name']
fcss = fcs_file_names.unique()
display(fcss)

array(['Panel_L1_A1_L1_110194_PBMC_10112011_A01.580227.fcs',
       'Panel_L1_A2_L1_110197_PBMC_10112011_A02.579447.fcs',
       'Panel_L1_A3_L1_110243_PBMC_10172011_A03.578999.fcs',
       'Panel_L1_A1_L1_110244_PBMC_10172011_A01.579604.fcs',
       'Panel_L1_E3_L1_110247_PBMC_10172011_E03.580335.fcs',
       'Panel_L1_E1_L1_110248_PBMC_10172011_E01.578572.fcs',
       'Panel_L1_E3_L1_110249_PBMC_10172011_E03.579792.fcs',
       'Panel_L1_A2_L1_110250_PBMC_10172011_A02.579307.fcs',
       'Panel_L1_E2_L1_110255_PBMC_10172011_E02.580138.fcs',
       'Panel_L1_A3_L1_110256_PBMC_10172011_A03.579670.fcs',
       'Panel_L1_A1_L1_110257_PBMC_10172011_A01.580299.fcs',
       'Panel_L1_E1_L1_110258_PBMC_10172011_E01.579967.fcs',
       'Panel_L1_E3_L1_110259_PBMC_10172011_E03.578407.fcs',
       'Panel_L1_A2_L1_110260_PBMC_10172011_A02.578398.fcs',
       'Panel_L1_A3_L1_110265_PBMC_10172011_A03.578294.fcs',
       'Panel_L1_A1_L1_110267_PBMC_10172011_A01.579602.fcs',
       'Panel_L1_E1_L1_1

### File Collection
Files were collected manually and uploaded to GitHub (this would have been done in code, but there were compatibility issues :( ). \
**RAW DATA:** Uploaded to GitHub as the original FCS files. (Note that these are already compensated.) \
**PREPROCESSED DATA:** Opened each raw file in its corresponding WSP. Used the existing CD3 Viability/CD3 T Cell gate (CD3 x Red Viability Dye) to draw the viability gate. Exported the gated population to a new FCS file.


## TODO: Add HAI Titer 
(and day, if possible -- check whether it's all day 28) to the FCS file dataframe. Change the FCS file names to .csv and add a preprocessing column if necessary.

In [8]:
# Load both copies of the year-2 HAI titer table (tab-separated).
hai_titer_paths = ("HAI/ALL_HAI_year2.388515.txt", "HAI/ALL_HAI_year2.388540.txt")
HAI1, HAI2 = (pd.read_csv(titer_path, sep="\t") for titer_path in hai_titer_paths)
# Show each shape, then whether the two tables are equal. When they are equal
# it does not matter which file is used; HAI1 is the one used from here on.
display(HAI1.shape, HAI2.shape, HAI1.equals(HAI2))
HAI1.head(10)

(138, 6)

(138, 6)

True

Unnamed: 0,Subject,Day,H1,H3,B,Age Group
0,110191,0,64,8,32,Old
1,110191,28,128,16,32,Old
2,110192,0,32,16,64,Old
3,110192,28,32,16,64,Old
4,110193,0,8,8,16,Old
5,110193,28,16,8,16,Old
6,110194,0,32,16,16,Young
7,110194,28,64,16,16,Young
8,110195,0,16,16,8,Old
9,110195,28,64,32,32,Old


In [9]:
# Read the tab-separated HAI result metadata and keep the subject, phenotype,
# visit and collection-time columns.
hai_metadata_path = "Metadata/SDY404-DR54_Subject_2_HAI_result.txt"
hai_columns = [
    'Subject Accession',
    'Subject Phenotype',
    'Planned Visit Accession',
    'Study Time Collected',
]
HAI = pd.read_csv(hai_metadata_path, sep="\t")[hai_columns]
display(HAI.head())
HAI.shape

Unnamed: 0,Subject Accession,Subject Phenotype,Planned Visit Accession,Study Time Collected
0,SUB120417,"Older adult, not-frail",PV3222,0
1,SUB120417,"Older adult, not-frail",PV3225,28
2,SUB120418,"Older adult, frail",PV3222,0
3,SUB120418,"Older adult, frail",PV3225,28
4,SUB120419,"Older adult, pre-frail",PV3222,0


(138, 4)

In [10]:
def oldyoung(x):
    """Collapse a subject phenotype label into a two-level age group.

    Returns "Young" only when ``x`` equals "Younger adult"; every other value
    (including any non-string) returns "Old".
    """
    return "Young" if x == "Younger adult" else "Old"

# Replace each phenotype label with its Young/Old group, then rename the
# columns to the names used by the HAI titer table ('Age Group', 'Day').
age_groups = HAI['Subject Phenotype'].apply(oldyoung)
HAI['Subject Phenotype'] = age_groups
hai_column_names = {'Subject Phenotype': 'Age Group', 'Study Time Collected': 'Day'}
HAI.rename(columns=hai_column_names, inplace=True)
HAI.head()

Unnamed: 0,Subject Accession,Age Group,Planned Visit Accession,Day
0,SUB120417,Old,PV3222,0
1,SUB120417,Old,PV3225,28
2,SUB120418,Old,PV3222,0
3,SUB120418,Old,PV3225,28
4,SUB120419,Old,PV3222,0


In [11]:
# Element-wise comparison of the metadata table (HAI) against the titer table
# (HAI1) on the two columns they share.
compared_columns = ['Age Group', 'Day']
df = HAI[compared_columns].eq(HAI1[compared_columns])
# Age Group: only True in the recorded output, so the column matches and one copy can be dropped.
# Day: True and False in the recorded output, so some rows disagree (inspected in the next cells).
for compared_column in compared_columns:
    display(df[compared_column].unique())

array([ True])

array([ True, False])

In [12]:
# Metadata rows whose Day disagrees with the titer table. The mask is matched to
# HAI by index label (.loc) rather than by position, so the selection does not
# depend on the index being the default 0..n-1 range.
HAI.loc[~df['Day']]

Unnamed: 0,Subject Accession,Age Group,Planned Visit Accession,Day
61,SUB120449,Young,PV3225,24
83,SUB120461,Young,PV3225,36
101,SUB120470,Young,PV3225,24
105,SUB120472,Young,PV3225,32
115,SUB120477,Old,PV3225,35
117,SUB120478,Old,PV3225,35


In [13]:
# Titer-table rows whose Day disagrees with the metadata table. The mask is
# matched to HAI1 by index label (.loc) rather than by position, so the selection
# does not depend on the index being the default 0..n-1 range.
HAI1.loc[~df['Day']]

Unnamed: 0,Subject,Day,H1,H3,B,Age Group
61,110247,28,1024,128,64,Young
83,110259,28,128,128,64,Young
101,110268,28,512,32,64,Young
105,110270,28,128,16,128,Young
115,110275,28,32,8,128,Old
117,110276,28,256,16,32,Old


In [14]:
# The collection day recorded in the metadata differs slightly from the titer
# file for a few subjects, but the post-vaccination titer is still taken later,
# so this is probably fine. We go by the HAI titer document (HAI1), which holds
# the actual HAI data, and therefore keep its Day column.
#
# join() matches rows on the index only (the two tables share no subject key),
# so make sure both tables carry the same index before combining them.
assert HAI.index.equals(HAI1.index), "HAI metadata and HAI titer tables are not row-aligned"
# drop() without inplace leaves HAI and HAI1 untouched, so this cell and the
# comparison cells above can be re-run without KeyErrors.
hai_metadata = HAI.drop(columns='Day')
hai_titers = HAI1.drop(columns=['Subject', 'Age Group'])
HAIall = hai_metadata.join(hai_titers)
HAIall.tail()

Unnamed: 0,Subject Accession,Age Group,Planned Visit Accession,Day,H1,H3,B
133,SUB120486,Old,PV3225,28,32,16,16
134,SUB120487,Old,PV3222,0,8,8,64
135,SUB120487,Old,PV3225,28,8,8,64
136,SUB120488,Old,PV3222,0,64,8,16
137,SUB120488,Old,PV3225,28,64,8,32


In [15]:
# Keep only the day-28 rows of the 'Young' age group.
young_day28 = (HAIall['Age Group'] == 'Young') & (HAIall['Day'] == 28)
HAIall = HAIall[young_day28]
display(HAIall)
HAIall.shape

Unnamed: 0,Subject Accession,Age Group,Planned Visit Accession,Day,H1,H3,B
7,SUB120420,Young,PV3225,28,64,16,16
13,SUB120423,Young,PV3225,28,256,16,16
53,SUB120444,Young,PV3225,28,512,16,32
55,SUB120445,Young,PV3225,28,64,8,32
57,SUB120446,Young,PV3225,28,1024,256,64
59,SUB120447,Young,PV3225,28,128,64,32
61,SUB120449,Young,PV3225,28,1024,128,64
63,SUB120450,Young,PV3225,28,64,32,16
65,SUB120452,Young,PV3225,28,128,128,128
67,SUB120453,Young,PV3225,28,128,128,64


(32, 7)

In [16]:
# Drop the columns that are constant after filtering and renumber the rows from 0.
constant_columns = ['Age Group', 'Planned Visit Accession', 'Day']
HAIall = HAIall.drop(columns=constant_columns).reset_index(drop=True)
HAIall.head()
# Y values: H1, H3, B

Unnamed: 0,Subject Accession,H1,H3,B
0,SUB120420,64,16,16
1,SUB120423,256,16,16
2,SUB120444,512,16,32
3,SUB120445,64,8,32
4,SUB120446,1024,256,64


In [17]:
# Attach each subject's raw FCS file name to the titer table, matching on
# 'Subject Accession' (merge's default join type is used).
fcs_raw_files = FCS[['Subject Accession', 'File Name']].rename(columns={'File Name': 'FCS Raw'})
All = HAIall.merge(fcs_raw_files, on='Subject Accession')
display(All)

Unnamed: 0,Subject Accession,H1,H3,B,FCS Raw
0,SUB120420,64,16,16,Panel_L1_A1_L1_110194_PBMC_10112011_A01.580227...
1,SUB120423,256,16,16,Panel_L1_A2_L1_110197_PBMC_10112011_A02.579447...
2,SUB120445,64,8,32,Panel_L1_A3_L1_110243_PBMC_10172011_A03.578999...
3,SUB120446,1024,256,64,Panel_L1_A1_L1_110244_PBMC_10172011_A01.579604...
4,SUB120449,1024,128,64,Panel_L1_E3_L1_110247_PBMC_10172011_E03.580335...
5,SUB120450,64,32,16,Panel_L1_E1_L1_110248_PBMC_10172011_E01.578572...
6,SUB120452,128,128,128,Panel_L1_A2_L1_110250_PBMC_10172011_A02.579307...
7,SUB120457,128,128,32,Panel_L1_E2_L1_110255_PBMC_10172011_E02.580138...
8,SUB120458,256,64,32,Panel_L1_A3_L1_110256_PBMC_10172011_A03.579670...
9,SUB120459,128,32,64,Panel_L1_A1_L1_110257_PBMC_10172011_A01.580299...


In [18]:
# For verification of the FCS file mapping -- done manually.
# Attach each subject's workspace file name, matching on 'Subject Accession'.
wsp_files = WSP[['Subject Accession', 'File Name']].rename(columns={'File Name': 'WSP File'})
All = All.merge(wsp_files, on='Subject Accession')
display(All)

Unnamed: 0,Subject Accession,H1,H3,B,FCS Raw,WSP File
0,SUB120420,64,16,16,Panel_L1_A1_L1_110194_PBMC_10112011_A01.580227...,Panel_L1_Run_2.804556.wsp
1,SUB120423,256,16,16,Panel_L1_A2_L1_110197_PBMC_10112011_A02.579447...,Panel_L1_Run_1.804538.wsp
2,SUB120445,64,8,32,Panel_L1_A3_L1_110243_PBMC_10172011_A03.578999...,Panel_L1_Run_4.804574.wsp
3,SUB120446,1024,256,64,Panel_L1_A1_L1_110244_PBMC_10172011_A01.579604...,Panel_L1_Run_5.804583.wsp
4,SUB120449,1024,128,64,Panel_L1_E3_L1_110247_PBMC_10172011_E03.580335...,Panel_L1_Run_4.804574.wsp
5,SUB120450,64,32,16,Panel_L1_E1_L1_110248_PBMC_10172011_E01.578572...,Panel_L1_Run_5.804583.wsp
6,SUB120452,128,128,128,Panel_L1_A2_L1_110250_PBMC_10172011_A02.579307...,Panel_L1_Run_5.804583.wsp
7,SUB120457,128,128,32,Panel_L1_E2_L1_110255_PBMC_10172011_E02.580138...,Panel_L1_Run_5.804583.wsp
8,SUB120458,256,64,32,Panel_L1_A3_L1_110256_PBMC_10172011_A03.579670...,Panel_L1_Run_5.804583.wsp
9,SUB120459,128,32,64,Panel_L1_A1_L1_110257_PBMC_10172011_A01.580299...,Panel_L1_Run_6.804592.wsp


In [19]:
# Add Preprocessing File Names 
def preprocessfile(name):
    """Build the file name of the viability-gated export for a raw FCS file name.

    The raw name is split on '.'. The text before the first dot is used as the
    stem and the text after the last dot as the extension, so
    ``"<stem>.<id>.<ext>"`` becomes ``"export_<stem>_Viability.<ext>"``.
    Names without the middle id component (``"<stem>.<ext>"``) are handled the
    same way.

    Parameters
    ----------
    name : str
        Raw file name, e.g. ``"Panel_L1_A1_..._A01.580227.fcs"``.

    Returns
    -------
    str
        The derived export file name.

    Raises
    ------
    ValueError
        If ``name`` contains no '.', i.e. no extension can be determined.
    """
    parts = name.split('.')
    if len(parts) < 2:
        raise ValueError("file name has no extension: " + repr(name))
    # Take the extension from the last component instead of a fixed position,
    # so the result does not depend on the name having exactly two dots.
    return "export_" + parts[0] + "_Viability." + parts[-1]

# Derive the preprocessed (viability-gated export) file name for every raw file.
All['FCS Preprocessed'] = All['FCS Raw'].apply(preprocessfile)
# Order the columns by name so the preprocessed file sits right after the raw
# file. Selecting by name gives the same order every time the cell is run,
# whereas a positional swap of the last two columns flips them on each re-run.
final_columns = ['Subject Accession', 'H1', 'H3', 'B', 'FCS Raw', 'FCS Preprocessed', 'WSP File']
All = All[final_columns]
display(All)

Unnamed: 0,Subject Accession,H1,H3,B,FCS Raw,FCS Preprocessed,WSP File
0,SUB120420,64,16,16,Panel_L1_A1_L1_110194_PBMC_10112011_A01.580227...,export_Panel_L1_A1_L1_110194_PBMC_10112011_A01...,Panel_L1_Run_2.804556.wsp
1,SUB120423,256,16,16,Panel_L1_A2_L1_110197_PBMC_10112011_A02.579447...,export_Panel_L1_A2_L1_110197_PBMC_10112011_A02...,Panel_L1_Run_1.804538.wsp
2,SUB120445,64,8,32,Panel_L1_A3_L1_110243_PBMC_10172011_A03.578999...,export_Panel_L1_A3_L1_110243_PBMC_10172011_A03...,Panel_L1_Run_4.804574.wsp
3,SUB120446,1024,256,64,Panel_L1_A1_L1_110244_PBMC_10172011_A01.579604...,export_Panel_L1_A1_L1_110244_PBMC_10172011_A01...,Panel_L1_Run_5.804583.wsp
4,SUB120449,1024,128,64,Panel_L1_E3_L1_110247_PBMC_10172011_E03.580335...,export_Panel_L1_E3_L1_110247_PBMC_10172011_E03...,Panel_L1_Run_4.804574.wsp
5,SUB120450,64,32,16,Panel_L1_E1_L1_110248_PBMC_10172011_E01.578572...,export_Panel_L1_E1_L1_110248_PBMC_10172011_E01...,Panel_L1_Run_5.804583.wsp
6,SUB120452,128,128,128,Panel_L1_A2_L1_110250_PBMC_10172011_A02.579307...,export_Panel_L1_A2_L1_110250_PBMC_10172011_A02...,Panel_L1_Run_5.804583.wsp
7,SUB120457,128,128,32,Panel_L1_E2_L1_110255_PBMC_10172011_E02.580138...,export_Panel_L1_E2_L1_110255_PBMC_10172011_E02...,Panel_L1_Run_5.804583.wsp
8,SUB120458,256,64,32,Panel_L1_A3_L1_110256_PBMC_10172011_A03.579670...,export_Panel_L1_A3_L1_110256_PBMC_10172011_A03...,Panel_L1_Run_5.804583.wsp
9,SUB120459,128,32,64,Panel_L1_A1_L1_110257_PBMC_10172011_A01.580299...,export_Panel_L1_A1_L1_110257_PBMC_10172011_A01...,Panel_L1_Run_6.804592.wsp


In [20]:
# Export the combined table as CSV, without writing the row index.
patient_data_csv = "patient_data.csv"
All.to_csv(patient_data_csv, index=False)