# Dataset preparation

In [1]:
import numpy as np
from scipy import io
import pandas as pd

## Determine the label

Determine the label (low/optimal/high) and $\Delta$CHIR concentration for each well. Under each CHIR duration level (24h, 36h, and 48h), the CHIR concentrations whose mean percentage of cTnT+ cells is $\ge$ `thres` are identified as the optimal concentration range $[c_1, c_2]$, where $c_1$ and $c_2$ are the lowest and highest such concentrations. Then, for a concentration level $c$:
* If $c < c_1$: label($c$) = "low", $\Delta$CHIR concentration($c$) $ = c - c_1$;
* If $c_1 \le c \le c_2$: label($c$) = "optimal", $\Delta$CHIR concentration($c$) $ = 0$;
* If $c > c_2$: label($c$) = "high", $\Delta$CHIR concentration($c$) $ = c - c_2$.

In [2]:
# Rows of the summary table, one tuple per (batch, well); turned into a DataFrame later.
data = []

# Threshold applied to the per-concentration mean of "percentage_of_cTNT+_cells":
# concentrations whose mean is >= thres are treated as "optimal".
thres = 0.2

# CHIR durations (hours) for which a label and a delta-concentration are derived.
chir_durations = (24, 36, 48)

for batch_name in ("CD01-1", "CD01-3", "CD01-2", "CD01-4"):

    # Load the experimental configuration of this batch (indexed by well name "S<id>").
    exp_config = pd.read_csv("%s/%s_exp_config.csv" % (batch_name, batch_name), index_col=0)

    # Identify the optimal CHIR concentration range (c_1, c_2) under each CHIR duration.
    optim_conc_range = {}
    for dur in chir_durations:

        mean_perc_cTNT_pos = exp_config[exp_config["chir_hour"] == dur].groupby("chir")["percentage_of_cTNT+_cells"].mean()

        optim_concs = mean_perc_cTNT_pos[mean_perc_cTNT_pos >= thres].index
        if len(optim_concs) == 0:
            # Without this check the indexing below fails with an uninformative IndexError.
            raise ValueError(
                "Batch %s: no CHIR concentration reaches thres=%s for a CHIR duration of %dh"
                % (batch_name, thres, dur)
            )
        # groupby() sorts its keys in ascending order by default, so the first and
        # last entries are the lowest and highest optimal concentrations.
        optim_conc_range[dur] = (optim_concs[0], optim_concs[-1])

    # Obtain the label for each well.
    for S_id in range(1, 97):

        # CHIR concentration applied to this well.
        chir_conc = exp_config.loc["S%d" % S_id, "chir"]

        # Label (low/optimal/high) and signed distance to the optimal range, per duration.
        label = {}
        delta_CHIR_conc = {}
        for dur in chir_durations:
            lowest_optim, highest_optim = optim_conc_range[dur]
            if chir_conc < lowest_optim:
                label[dur] = "low"
                delta_CHIR_conc[dur] = chir_conc - lowest_optim
            elif chir_conc > highest_optim:
                label[dur] = "high"
                delta_CHIR_conc[dur] = chir_conc - highest_optim
            else:
                label[dur] = "optimal"
                delta_CHIR_conc[dur] = 0

        # Add a row to the summary table; the third entry ("feature") is a
        # placeholder that stays None in this cell.
        data.append((batch_name, S_id, None, chir_conc,
                     label[24], delta_CHIR_conc[24],
                     label[36], delta_CHIR_conc[36],
                     label[48], delta_CHIR_conc[48],
                     ))

In [3]:
# Assemble the per-well rows into the summary table. The four fixed columns are
# followed by a (label, delta) column pair for each CHIR duration, in the same
# order as the entries of each tuple in `data`.
summary_df = pd.DataFrame(
    data,
    columns=["batch_name", "S_id", "feature", "CHIR_conc"]
    + [template % hours
       for hours in (24, 36, 48)
       for template in ("label_%d", "delta_CHIR_conc_%d")],
)

In [4]:
# Save the label-only summary table (CSV and pickle copies).
# At this point the "feature" column is still None for every row.
summary_df.to_csv("dataset.csv")
summary_df.to_pickle("dataset.pkl")

In [5]:
summary_df

Unnamed: 0,batch_name,S_id,feature,CHIR_conc,label_24,delta_CHIR_conc_24,label_36,delta_CHIR_conc_36,label_48,delta_CHIR_conc_48
0,CD01-1,1,,4,low,-2,optimal,0,optimal,0
1,CD01-1,2,,4,low,-2,optimal,0,optimal,0
2,CD01-1,3,,4,low,-2,optimal,0,optimal,0
3,CD01-1,4,,4,low,-2,optimal,0,optimal,0
4,CD01-1,5,,4,low,-2,optimal,0,optimal,0
...,...,...,...,...,...,...,...,...,...,...
379,CD01-4,92,,6,optimal,0,optimal,0,optimal,0
380,CD01-4,93,,4,optimal,0,optimal,0,optimal,0
381,CD01-4,94,,4,optimal,0,optimal,0,optimal,0
382,CD01-4,95,,2,low,-2,low,-2,low,-2


## Load computed features

Extract the features of each well from its 0-12h image stream. You should first run the Python script `compute_features.py` in a command line window. This will generate the DataFrame files `./data/CD01-*/CD01-*_features.pkl`. The computed features will then be merged into the summary DataFrame `summary_df`. Note that the cell below loads these files from `CD01-*/CD01-*_features.pkl` relative to the notebook's working directory.

In [6]:
# Human-readable names of the 21 features, in the same order as the entries of
# each feature vector built below (7 variables x 3 phases).
feature_name = ["%s (%s)" % (variable, phase)
                for variable in ("Local Entropy", "Cell Brightness", "Fractal Dimension",
                                 "Area", "Circumference", "A-C Ratio", "Optical Flow")
                for phase in ("pre-phase", "mid-phase", "post-phase")]


def _variable_series(row, variable_name, length):
    """Return columns `<variable_name>_0 .. <variable_name>_<length-1>` of `row` as a 1-D array.

    `row` is the one-row slice of a features table belonging to a single well;
    the first matching row is used for every column.
    """
    return np.array([row["%s_%d" % (variable_name, i)].to_numpy()[0] for i in range(length)])


# Row label in `summary_df` of the well currently being processed. This relies on
# `summary_df` having been built with the same batch/well iteration order as below.
ind = 0

for batch_name in ("CD01-1", "CD01-3", "CD01-2", "CD01-4"):

    # Load the computed features once per batch (the file does not depend on the well).
    features = pd.read_pickle("%s/%s_features.pkl" % (batch_name, batch_name))

    # Obtain the feature vector for each well.
    for S_id in range(1, 97):

        # Guard the positional bookkeeping: writing into the wrong row would
        # silently pair features with another well's labels.
        if (summary_df.at[ind, "batch_name"], summary_df.at[ind, "S_id"]) != (batch_name, S_id):
            raise ValueError(
                "summary_df row %d does not correspond to batch %s, well S%d"
                % (ind, batch_name, S_id)
            )

        row = features[features.S_id == S_id]
        if row.empty:
            raise ValueError("No computed features for batch %s, well S%d" % (batch_name, S_id))

        # Obtain the 21-D feature vector.
        feat = []
        # Type-I absolute features: plain mean over each phase.
        for variable_name in ["local_entropy", "cell_brightness", "fractal_dimension"]:
            seq = _variable_series(row, variable_name, 10)
            for (start_id, end_id) in [(0, 4), (4, 7), (7, 10)]:  # pre-phase, mid-phase, post-phase
                feat.append(seq[start_id:end_id].mean())
        # Type-I relative features: phase mean divided by the first value, which is
        # therefore excluded from the pre-phase window.
        for variable_name in ["area", "circumference", "A_C_ratio"]:
            seq = _variable_series(row, variable_name, 10)
            for (start_id, end_id) in [(1, 4), (4, 7), (7, 10)]:  # pre-phase, mid-phase, post-phase
                feat.append(seq[start_id:end_id].mean() / seq[0])
        # Type-II relative features: same normalisation, but the series has only 9 entries.
        for variable_name in ["optical_flow"]:
            seq = _variable_series(row, variable_name, 9)
            for (start_id, end_id) in [(1, 4), (4, 7), (7, 9)]:  # pre-phase, mid-phase, post-phase
                feat.append(seq[start_id:end_id].mean() / seq[0])

        # Store the whole list in the single "feature" cell of this well's row.
        summary_df.at[ind, "feature"] = feat
        ind += 1

In [7]:
# Save the summary table again, now with the "feature" column filled in.
# The file names are the same as for the label-only save, so those files are replaced.
summary_df.to_csv("dataset.csv")
summary_df.to_pickle("dataset.pkl")

In [8]:
summary_df

Unnamed: 0,batch_name,S_id,feature,CHIR_conc,label_24,delta_CHIR_conc_24,label_36,delta_CHIR_conc_36,label_48,delta_CHIR_conc_48
0,CD01-1,1,"[5.455338259759867, 5.7414706257300425, 5.8044...",4,low,-2,optimal,0,optimal,0
1,CD01-1,2,"[5.237066656750007, 5.487482317753688, 5.50061...",4,low,-2,optimal,0,optimal,0
2,CD01-1,3,"[5.254863889958337, 5.508179838042186, 5.55930...",4,low,-2,optimal,0,optimal,0
3,CD01-1,4,"[5.082029808865771, 5.423085463008182, 5.67174...",4,low,-2,optimal,0,optimal,0
4,CD01-1,5,"[4.910609109618155, 5.303015952970745, 5.61214...",4,low,-2,optimal,0,optimal,0
...,...,...,...,...,...,...,...,...,...,...
379,CD01-4,92,"[4.1183550214657565, 4.416986956262265, 4.6213...",6,optimal,0,optimal,0,optimal,0
380,CD01-4,93,"[4.158540541832133, 4.433769827927805, 4.66982...",4,optimal,0,optimal,0,optimal,0
381,CD01-4,94,"[4.211973225590269, 4.441425968888667, 4.64296...",4,optimal,0,optimal,0,optimal,0
382,CD01-4,95,"[4.137073257000284, 4.368685672167193, 4.58494...",2,low,-2,low,-2,low,-2


## Split the Dataset

Divide the dataset into a training set and a test set.

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Hold out 30% of the wells as the test set; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(summary_df, test_size=0.3, random_state=123)

In [11]:
# Save the training split (CSV and pickle copies).
train_df.to_csv("dataset_train.csv")
train_df.to_pickle("dataset_train.pkl")

# Save the test split (CSV and pickle copies).
test_df.to_csv("dataset_test.csv")
test_df.to_pickle("dataset_test.pkl")

In [12]:
train_df

Unnamed: 0,batch_name,S_id,feature,CHIR_conc,label_24,delta_CHIR_conc_24,label_36,delta_CHIR_conc_36,label_48,delta_CHIR_conc_48
105,CD01-3,10,"[4.745773110762957, 5.011450635868556, 5.06936...",8,optimal,0,optimal,0,optimal,0
5,CD01-1,6,"[4.862419025915603, 5.291853964338873, 5.58093...",4,low,-2,optimal,0,optimal,0
171,CD01-3,76,"[4.768596097966914, 4.984262302882079, 5.01737...",8,optimal,0,optimal,0,optimal,0
190,CD01-3,95,"[4.811991832527234, 5.131142828764627, 5.20718...",4,low,-2,low,-2,low,-2
19,CD01-1,20,"[4.850729926231816, 5.264420744842791, 5.61798...",4,low,-2,optimal,0,optimal,0
...,...,...,...,...,...,...,...,...,...,...
230,CD01-2,39,"[4.353344028403218, 4.63093879896084, 4.797606...",6,optimal,0,optimal,0,high,2
98,CD01-3,3,"[4.751908721132576, 5.033764426581706, 5.11984...",6,optimal,0,optimal,0,optimal,0
322,CD01-4,35,"[4.077717425949446, 4.4099447250277715, 4.6694...",12,optimal,0,optimal,0,high,2
382,CD01-4,95,"[4.137073257000284, 4.368685672167193, 4.58494...",2,low,-2,low,-2,low,-2


In [13]:
test_df

Unnamed: 0,batch_name,S_id,feature,CHIR_conc,label_24,delta_CHIR_conc_24,label_36,delta_CHIR_conc_36,label_48,delta_CHIR_conc_48
184,CD01-3,89,"[4.715319470296373, 5.040546713341095, 5.11014...",4,low,-2,low,-2,low,-2
239,CD01-2,48,"[4.368473528733308, 4.53504550233312, 4.675459...",6,optimal,0,optimal,0,high,2
170,CD01-3,75,"[4.77758245350142, 5.047552470505511, 5.111018...",6,optimal,0,optimal,0,optimal,0
261,CD01-2,70,"[4.5076923543600795, 4.751261600583177, 4.8568...",8,optimal,0,optimal,0,high,4
374,CD01-4,87,"[4.1113133758788365, 4.430241128375676, 4.6109...",10,optimal,0,optimal,0,optimal,0
...,...,...,...,...,...,...,...,...,...,...
263,CD01-2,72,"[4.592105371188776, 4.815631022431958, 4.90947...",8,optimal,0,optimal,0,high,4
181,CD01-3,86,"[4.830049461698792, 4.959374869350324, 4.92102...",10,optimal,0,high,2,optimal,0
274,CD01-2,83,"[4.496706002483118, 4.774334736702716, 4.88138...",10,optimal,0,high,2,high,6
210,CD01-2,19,"[4.387521666858259, 4.629605652998734, 4.79659...",4,optimal,0,optimal,0,optimal,0
