# Notebook to get better overview of experiments
- In this notebook, I build a rudimentary dataframe that helps us select the right datasets, e.g. by T cell number, WT status, or protocol (old/new).

- I just realized it is better to concatenate all data in a single dataframe as this has exactly the indexing I am trying to reproduce

- For a single dataframe, the only thing lacking is the 8 tests that are defined here, but I could easily add these too. There should then also be a WT/non-WT level.

- In this file we exclude antagonism levels, as we have a separate notebook to visualize them (too messy otherwise).

In [None]:
import os
import pandas as pd
import numpy as np

In [None]:
# Directory that is listed to discover the experiment files.
folder = "../data/final/"

# Names of the test categories; each one becomes a column of the overview frame.
tests = [
    "CD25Mutant",
    "ITAMDeficient",
    "Tumor",
    "TCellNumber",
    "Activation",
    "CAR",
    "TCellType",
    "Macrophages",
]

In [None]:
# Collect one column per experiment (rows: Date, Name, Protocol and one row per
# test); the frame is transposed afterwards so that experiments become rows.
df = pd.DataFrame([], columns=["Date", "Name", "Protocol"] + tests).T
unique_levels = []  # every condition level name encountered, in first-seen order
peptides = {}  # experiment name -> list of peptides (NaN without a "Peptide" level)
concentrations = {}  # experiment name -> list of concentrations (NaN without that level)

# Experiments dated before this day are labelled with the "Old" protocol.
protocol_cutoff = pd.to_datetime("20191022")

for idx, file in enumerate(os.listdir(folder)):
    if not file.endswith(".pkl"):
        continue
    # Date and experiment name are read from fixed character offsets of the
    # file name (assumes a fixed-width prefix -- TODO confirm naming scheme).
    date = file[32:40]
    name = file[41:-10]
    # NOTE(review): the file list comes from `folder`, but the dataframe is
    # read from a different directory -- confirm this is intended.
    tmp = pd.read_hdf("../output/dataframes/" + name + ".hdf")

    # All index levels except the last three are kept as condition levels.
    level_names = dict(zip(tmp.index.names[:-3], tmp.index.levels[:-3]))
    for level in level_names:
        if level not in unique_levels:
            unique_levels.append(level)

    if "Peptide" in tmp.index.names:
        peptides[name] = list(tmp.index.levels[tmp.index.names.index("Peptide")])
    else:
        peptides[name] = np.nan

    if "Concentration" in tmp.index.names:
        concentrations[name] = list(
            tmp.index.levels[tmp.index.names.index("Concentration")]
        )
    else:
        concentrations[name] = np.nan

    protocol = "Old" if pd.to_datetime(date) < protocol_cutoff else "New"
    # The level dictionary is stored under every test whose name occurs in the
    # experiment name; all other tests stay NaN.
    exp_test = [level_names if test in name else np.nan for test in tests]

    df[idx] = [date, name, protocol] + exp_test
df.head()

In [None]:
# Transpose so that every experiment becomes a row, order the rows
# chronologically and index them by (Date, Name, Protocol).
df = df.T
# Parse the date strings explicitly: a unit-less astype("datetime64") is
# rejected by pandas >= 2.0, whereas pd.to_datetime handles these strings
# (it is already used on them when the protocol is determined).
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date").reset_index(drop=True)
df = df.set_index(["Date", "Name", "Protocol"])
df.head()

In [None]:
# Move the tests into the row index, append one empty object-typed column per
# condition level (filled per experiment later on), then move the tests back
# into the columns.
df.columns.names = ["Test"]
stacked_tests = df.stack("Test")
df = pd.DataFrame(stacked_tests)
for level_name in unique_levels:
    empty_column = pd.Series([np.nan] * len(df), index=df.index)
    df[level_name] = empty_column.astype("object")
df = df.unstack("Test")

In [None]:
# For every experiment row, take the cells that are not NaN (the dictionaries
# mapping level name -> level values) and write each list of values into the
# column addressed by (level name, test).
for exp_key in df.index:
    filled_cells = df.loc[exp_key].dropna()  # cells without a dictionary are dropped
    for col_key, level_dict in filled_cells.items():
        test_name = col_key[1]  # second column level is the test
        for level_name, level_values in level_dict.items():
            df.loc[exp_key, (level_name, test_name)] = list(level_values)
df.head()

In [None]:
# Drop columns without entries as well as the helper column group 0, put the
# test on the outer column level and move it into the row index.
df = df.dropna(axis=1, how="all").drop(0, axis=1).swaplevel(axis=1)
df = df.stack("Test")
# Empty object-typed columns for the peptide and concentration lists. They are
# built on the frame's own index: a Series with a default RangeIndex would
# first have to be aligned against the frame's MultiIndex on assignment.
df["Peptide"] = pd.Series(np.nan, index=df.index, dtype="object")
df["Concentration"] = pd.Series(np.nan, index=df.index, dtype="object")

In [None]:
# Look up each row's experiment and store that experiment's peptides and
# concentrations in the two placeholder columns.
for row_key in df.index:
    experiment = row_key[1]  # second index level is the experiment name
    df.loc[row_key, "Peptide"] = peptides[experiment]
    df.loc[row_key, "Concentration"] = concentrations[experiment]
df

In [None]:
# Reference ("naive") value for each column; the values are paired with
# df.columns purely by position, so their order must follow the column order.
naives = [
    "B6",
    "Splenocyte",
    "Naive",
    "None",
    "None",
    "WT",
    "None",
    "100k",
    "OT1",
    "None",
    "0k",
    "N4",
    "1uM",
]
naive_dict = dict(zip(df.columns, naives))

In [None]:
def custom_unique(group, defaults=None):
    """Collect the unique list entries of every column of *group*.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame whose cells hold lists of values; cells that are not lists
        (e.g. NaN) are skipped.
    defaults : mapping, optional
        Maps each column label to the value placed first in that column's
        result. Falls back to the module-level ``naive_dict`` when omitted.

    Returns
    -------
    list of list
        One list per column, in column order: the column's default value
        followed by every other value in first-seen order, without
        duplicates.

    Raises
    ------
    KeyError
        If a column of *group* has no entry in the defaults mapping.
    """
    if defaults is None:
        defaults = naive_dict

    unique_matrix = []
    for col in group.columns:
        # The default always comes first, so it is never appended a second time.
        unique_list = [defaults[col]]
        for cell in group[col]:
            if not isinstance(cell, list):
                continue
            for elem in cell:
                if elem not in unique_list:
                    unique_list.append(elem)
        unique_matrix.append(unique_list)

    return unique_matrix


In [None]:
# Gather the unique values per test (one list per column) and arrange them in
# a table with one row per test and one column per condition.
B = df.groupby("Test").apply(custom_unique)
A = pd.DataFrame([], columns=B.index, index=df.columns)
# B is labelled by test, so its entries are read by position through
# iteration; an integer key in B[...] relies on the deprecated positional
# fallback of Series.__getitem__.
for pos, unique_matrix in enumerate(B):
    A.iloc[:, pos] = unique_matrix
A.T