# Dataset preparation



In [1]:
import numpy as np
from scipy import io
import pandas as pd

Merge the computed features and experiment configurations into a single table.

In [2]:
# One tuple per well, filled by the batch loop at the bottom of this cell:
# (batch_name, S_id, feature vector, CHIR concentration,
#  label/delta for 24h, label/delta for 36h, label/delta for 48h).
data = []

# A CHIR concentration counts as "optimal" for a duration when the mean
# "percentage_of_cTNT+_cells" over its wells is >= thres.
# NOTE(review): 0.2 presumably assumes the column is a fraction in [0, 1]
# rather than a 0-100 percentage -- confirm against the exp_config files.
thres = 0.2

# Human-readable names of the 21 entries of each feature vector, in the same
# order in which _build_feature_vector() emits them.
feature_name = ["Local Entropy (pre-phase)",
                "Local Entropy (mid-phase)",
                "Local Entropy (post-phase)",
                "Cell Brightness (pre-phase)",
                "Cell Brightness (mid-phase)",
                "Cell Brightness (post-phase)",
                "Fractal Dimension (pre-phase)",
                "Fractal Dimension (mid-phase)",
                "Fractal Dimension (post-phase)",
                "Area (pre-phase)",
                "Area (mid-phase)",
                "Area (post-phase)",
                "Circumference (pre-phase)",
                "Circumference (mid-phase)",
                "Circumference (post-phase)",
                "A-C Ratio (pre-phase)",
                "A-C Ratio (mid-phase)",
                "A-C Ratio (post-phase)",
                "Optical Flow (pre-phase)",
                "Optical Flow (mid-phase)",
                "Optical Flow (post-phase)",
                ]

# CHIR durations (hours) for which a label is produced.
_CHIR_DURATIONS = (24, 36, 48)


def _well_series(row, variable_name, n_points):
    """Return columns `<variable_name>_0 .. <variable_name>_<n_points-1>` of a well as a 1-D array.

    `row` is the slice of the feature table selected for one well; the first
    matching record is used.
    """
    return np.array([row["%s_%d" % (variable_name, i)].to_numpy()[0]
                     for i in range(n_points)])


def _phase_means(seq, phases, baseline=None):
    """Average `seq` over each half-open `(start, end)` index window in `phases`.

    When `baseline` is given, every mean is divided by it (relative feature).
    """
    means = [seq[start_id:end_id].mean() for (start_id, end_id) in phases]
    if baseline is not None:
        means = [m / baseline for m in means]
    return means


def _build_feature_vector(row):
    """Build the 21-D feature vector (7 variables x pre/mid/post phase) of one well."""
    feat = []
    # Absolute features: plain phase means over the 10 points of each series.
    for variable_name in ("local_entropy", "cell_brightness", "fractal_dimension"):
        seq = _well_series(row, variable_name, 10)
        feat.extend(_phase_means(seq, [(0, 4), (4, 7), (7, 10)]))
    # Relative features: phase means divided by the first point, which is
    # therefore excluded from the pre-phase window.
    for variable_name in ("area", "circumference", "A_C_ratio"):
        seq = _well_series(row, variable_name, 10)
        feat.extend(_phase_means(seq, [(1, 4), (4, 7), (7, 10)], baseline=seq[0]))
    # Optical flow is also relative but only has 9 points (indices 0..8).
    for variable_name in ("optical_flow",):
        seq = _well_series(row, variable_name, 9)
        feat.extend(_phase_means(seq, [(1, 4), (4, 7), (7, 9)], baseline=seq[0]))
    return feat


def _optimal_conc_range(exp_config, dur, threshold, batch_name):
    """Return `(lowest, highest)` CHIR concentration whose mean cTNT+ value is >= `threshold`.

    Only wells with `chir_hour == dur` are considered.  Raises ValueError when
    no concentration qualifies, instead of failing on an empty index lookup.
    """
    mean_perc_cTNT_pos = (exp_config[exp_config["chir_hour"] == dur]
                          .groupby("chir")["percentage_of_cTNT+_cells"]
                          .mean())
    # groupby sorts its keys by default, so the first/last qualifying entries
    # are the smallest/largest qualifying concentrations.
    passing = mean_perc_cTNT_pos[mean_perc_cTNT_pos >= threshold].index
    if len(passing) == 0:
        raise ValueError(
            "batch %s: no CHIR concentration reaches the threshold %s for a %dh duration"
            % (batch_name, threshold, dur))
    return (passing[0], passing[-1])


def _classify_conc(chir_conc, conc_range):
    """Return `(label, delta)` of `chir_conc` relative to the optimal `(low, high)` range.

    `delta` is the signed distance to the nearest bound of the range, and 0
    when the concentration lies inside it.
    """
    lower, upper = conc_range
    if chir_conc < lower:
        return "low", chir_conc - lower
    if chir_conc > upper:
        return "high", chir_conc - upper
    return "optimal", 0


for batch_name in ("CD01-1", "CD01-3", "CD01-2", "CD01-4"):

    # load the experimental configuration
    exp_config = pd.read_csv("%s/%s_exp_config.csv" % (batch_name, batch_name), index_col=0)

    # load the computed features
    # CAUTION: unpickling can execute arbitrary code -- only load feature
    # files that were produced by this project.
    features = pd.read_pickle("%s/%s_features.pkl" % (batch_name, batch_name))

    # identify the optimal CHIR concentration range under each CHIR duration (24h, 36h, 48h)
    optim_conc_range = {dur: _optimal_conc_range(exp_config, dur, thres, batch_name)
                        for dur in _CHIR_DURATIONS}

    # obtain the feature vector and the label for each well (S_id 1..96)
    for S_id in range(1, 97):

        row = features[features.S_id == S_id]
        if row.empty:
            raise ValueError("batch %s: no feature row for well S_id=%d" % (batch_name, S_id))

        # Obtain the 21-D feature vector
        feat = _build_feature_vector(row)

        # Obtain the CHIR concentration label (low/optimal/high) and "ΔCHIR concentration"
        chir_conc = exp_config.loc["S%d" % S_id, "chir"]
        label = {}
        delta_CHIR_conc = {}
        for dur in _CHIR_DURATIONS:
            label[dur], delta_CHIR_conc[dur] = _classify_conc(chir_conc, optim_conc_range[dur])

        # Add a row to the summary table
        data.append((batch_name, S_id, feat, chir_conc,
                     label[24], delta_CHIR_conc[24],
                     label[36], delta_CHIR_conc[36],
                     label[48], delta_CHIR_conc[48],
                     ))

In [3]:
# Turn the per-well tuples collected in `data` into one table; the column
# order below mirrors the order of the fields in each tuple.
summary_df = pd.DataFrame(
    data=data,
    columns=[
        "batch_name",
        "S_id",
        "feature",
        "CHIR_conc",
        "label_24",
        "delta_CHIR_conc_24",
        "label_36",
        "delta_CHIR_conc_36",
        "label_48",
        "delta_CHIR_conc_48",
    ],
)

In [4]:
# Persist the merged table twice: a CSV copy and a pickle copy.
# The "feature" column holds Python lists, which the CSV stores as text.
summary_df.to_csv(path_or_buf="dataset.csv")
summary_df.to_pickle(path="dataset.pkl")

In [5]:
# Display the merged table as the cell output (one row per well).
summary_df

Unnamed: 0,batch_name,S_id,feature,CHIR_conc,label_24,delta_CHIR_conc_24,label_36,delta_CHIR_conc_36,label_48,delta_CHIR_conc_48
0,CD01-1,1,"[5.455338259759867, 5.7414706257300425, 5.8044...",4,low,-2,optimal,0,optimal,0
1,CD01-1,2,"[5.237066656750007, 5.487482317753688, 5.50061...",4,low,-2,optimal,0,optimal,0
2,CD01-1,3,"[5.254863889958337, 5.508179838042186, 5.55930...",4,low,-2,optimal,0,optimal,0
3,CD01-1,4,"[5.082029808865771, 5.423085463008182, 5.67174...",4,low,-2,optimal,0,optimal,0
4,CD01-1,5,"[4.910609109618155, 5.303015952970745, 5.61214...",4,low,-2,optimal,0,optimal,0
...,...,...,...,...,...,...,...,...,...,...
379,CD01-4,92,"[4.1183550214657565, 4.416986956262265, 4.6213...",6,optimal,0,optimal,0,optimal,0
380,CD01-4,93,"[4.158540541832133, 4.433769827927805, 4.66982...",4,optimal,0,optimal,0,optimal,0
381,CD01-4,94,"[4.211973225590269, 4.441425968888667, 4.64296...",4,optimal,0,optimal,0,optimal,0
382,CD01-4,95,"[4.137073257000284, 4.368685672167193, 4.58494...",2,low,-2,low,-2,low,-2


Divide the dataset into a training set and a test set.

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Split the wells 70/30 into training and test sets; the fixed random_state
# makes the split repeatable across runs.
train_df, test_df = train_test_split(
    summary_df,
    random_state=123,
    test_size=0.3,
)

In [8]:
# Write each split to disk as both CSV and pickle (training set first).
for split_df, csv_path, pkl_path in (
    (train_df, "dataset_train.csv", "dataset_train.pkl"),
    (test_df, "dataset_test.csv", "dataset_test.pkl"),
):
    split_df.to_csv(csv_path)
    split_df.to_pickle(pkl_path)