## 0. Load datasets

In [1]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sb

# General utils
from tqdm import tqdm
from os import path

from measurements import get_prev_or_next_frame
from sklearn import preprocessing, model_selection, feature_selection, metrics, inspection, pipeline
from sklearn import ensemble, linear_model

def get_balanced_df_by_category(df,category):
    # assert(logical.dtype == bool)
    
    # Find the category with least members
    categories = {cat:mem for cat, mem in df.groupby(category)}
    num_per_category = np.array([len(mem) for mem in categories.values()])
    smallest_category = list(categories.keys())[num_per_category.argmin()]
    
    output = []
    for cat, member in categories.items():
        if cat == smallest_category:
            output.append(member)
        else:
            output.append(member.sample(num_per_category.min()))
    output = pd.concat(output,ignore_index=True)

    return output
    
def predict_feature(df:pd.DataFrame,
                    classifier,
                    feature2predict,
                    Niter:int=100,
                    rebalance:bool=True,
                    subsample=None,
                    measurement_level_name:str='Measurement'):
    if subsample == None:
        subsample = len(df)
    Y_test = []
    Y_pred = []
    models = []
    for i in tqdm(range(Niter)):

        # rng = np.random.RandomState(0)
        df_ = get_balanced_df_by_category(df, df[feature2predict,'Meta'].values)
        
        pipe = pipeline.make_pipeline(preprocessing.StandardScaler(),
                                      classifier() )
        
        y = df_[feature2predict,'Meta'].astype(float)
        X = df_.xs(measurement_level_name,level=1,axis=1).astype(float)
        X[np.isinf(X)] = np.nan
        X_train,X_test,y_train,y_test = model_selection.train_test_split(X,y)
        
        model = pipe.fit(X_train,y_train)
        models.append(model)
        
        Y_test.append(y_test)
        Y_pred.append(model.predict(X_test))
        
    return [np.stack((np.array(Y_test),np.array(Y_pred))),models]
    

### Isolate specific time points

In [2]:

dataset_dir = '/Users/xies/Library/CloudStorage/OneDrive-Stanford/Skin/Mesa et al/Lineage models/Dataset pickles'
all_df = pd.read_pickle(path.join(dataset_dir,f'all_df.pkl'))
divisions = pd.read_pickle(path.join(dataset_dir,f'divisions.pkl'))

# censor the 'answers'
features2drop = [(f,a) for f,a in divisions.columns if 'Time' in f]
features2drop += [(f,a) for f,a in divisions.columns if 'smoothed' in f and not 'rate' in f]
features2drop += [(f,a) for f,a in divisions.columns if 'standard' in f and not 'rate' in f]
features2drop += [(f,a) for f,a in divisions.columns if 'cell coords' in f and not 'rate' in f]
features2drop += ['Z','Z-cyto']
divisions = divisions.drop(columns=features2drop)
print(len(divisions))


668


  divisions = divisions.drop(columns=features2drop)


In [8]:
divisions

Unnamed: 0_level_0,Name,Nuclear volume,Nuclear solidity,Y,X,Nuclear height,Cell volume,Y-cyto,X-cyto,Axial component,Planar component 1,...,Relative Nuclear volume at 1 frame prior,Num neighbor division 1 frame prior,Num neighbor delamination 1 frame prior,Num daughter differentiated,At least one daughter differentiated,Both daughters differentiated,Region,Cell cycle duration,G1 duration,SG2 duration
Unnamed: 0_level_1,Metadata,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,Measurement,...,Measurement,Measurement,Measurement,Meta,Meta,Meta,Unnamed: 19_level_1,Measurement,Measurement,Measurement
Frame,TrackID,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
0,R1_2,161.5000,0.879211,106.386223,39.835913,-13.0,477.0625,106.461254,40.170935,5.422715,22.096147,...,,,,2.0,True,True,R1,,0.0,
0,R1_28,327.5000,0.894656,18.882968,81.480105,-15.0,648.1250,18.734884,81.618756,8.179093,20.625474,...,,,,2.0,True,True,R1,,0.0,
0,R1_38,166.1875,0.874383,43.553404,90.039300,-11.0,536.6250,43.781592,89.928081,7.143922,17.733668,...,,,,1.0,True,False,R1,,0.0,
0,R1_53,163.1875,0.893566,92.486212,38.868633,-11.0,588.7500,92.722320,38.981768,5.573642,28.083831,...,,,,1.0,True,False,R1,,0.0,
0,R1_90,248.6250,0.914903,91.877325,28.223542,-13.0,612.7500,92.214224,28.403458,7.055729,20.925242,...,,,,2.0,True,True,R1,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,R2_388,199.1875,0.910311,23.771023,16.280515,-10.0,596.3750,23.849691,16.584285,7.833814,16.787922,...,1.188983,0.0,0.0,,,,R2,60.0,,
13,R2_529,234.8125,0.859529,16.878161,25.113388,-11.0,741.7500,16.607137,25.583776,11.700021,17.721571,...,1.172629,0.0,0.0,,,,R2,96.0,84.0,12.0
13,R2_680,249.5625,0.885562,17.805284,89.724017,-10.0,601.8125,17.752337,89.885736,9.245483,15.434613,...,1.055838,1.0,0.0,,,,R2,60.0,48.0,12.0
13,R2_817,215.1250,0.677826,104.740122,35.565369,-12.0,730.2500,104.987847,35.621191,8.398562,26.422073,...,1.543759,0.0,0.0,,,,R2,60.0,,


## Recursive feature elimination CV: division

In [None]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

list_of_feature_list = []
for Niters in tqdm(range(10)):
    min_features_to_select = 1  # Minimum number of features to consider
    clf = ensemble.RandomForestClassifier()
    cv = StratifiedKFold(5)
    
    feature2predict = 'At least one daughter differentiated'
    
    X = divisions.xs('Measurement',axis=1,level=1)
    y = divisions['At least one daughter differentiated','Meta'].astype(bool)
    
    rfecv_div = RFECV(
        estimator=clf,
        step=1,
        cv=cv,
        scoring="accuracy",
        min_features_to_select=min_features_to_select,
        n_jobs=2,
    )
    rfecv_div.fit(X, y)
    
    print(f"Optimal number of features: {rfecv_div.n_features_}")
    list_of_feature_list.append(rfecv_div.get_feature_names_out())


 10%|████▊                                           | 1/10 [56:37<8:29:36, 3397.39s/it]

Optimal number of features: 24


 20%|█████████▏                                    | 2/10 [1:49:48<7:16:47, 3275.88s/it]

Optimal number of features: 18


In [None]:
list_of_feature_list

AttributeError: 'RFECV' object has no attribute 'coef_'

## RFECV: Frame prev to division

In [13]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

min_features_to_select = 1  # Minimum number of features to consider
clf = ensemble.RandomForestClassifier()
cv = StratifiedKFold(5)

feature2predict = 'At least one daughter differentiated'
cols2keep = [(f,'Measurement') for f in conservative_feature_set]
cols2keep += [(feature2predict,'Meta')]

X = prev_div_frame[cols2keep].xs('Measurement',axis=1,level=1)
y = prev_div_frame['At least one daughter differentiated','Meta'].astype(bool)

rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)
rfecv.fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 133


In [14]:
rfecv.get_feature_names_out()

array(['Axial component', 'Basal orientation', 'Middle area',
       'cyto_shcoeffs_L1M0C', 'cyto_shcoeffs_L2M0C',
       'cyto_shcoeffs_L3M0C', 'cyto_shcoeffs_L5M4C',
       'cyto_shcoeffs_L5M5C', 'cyto_shcoeffs_L3M1S',
       'cyto_shcoeffs_L3M2S', 'cyto_shcoeffs_L4M2S',
       'cyto_shcoeffs_L5M1S', 'nuc_shcoeffs_L3M0C', 'nuc_shcoeffs_L4M2C',
       'nuc_shcoeffs_L4M3C', 'nuc_shcoeffs_L5M0C', 'nuc_shcoeffs_L5M1C',
       'nuc_shcoeffs_L3M1S', 'nuc_shcoeffs_L3M3S', 'nuc_shcoeffs_L4M2S',
       'nuc_shcoeffs_L4M3S', 'nuc_shcoeffs_L4M4S', 'nuc_shcoeffs_L5M2S',
       'nuc_shcoeffs_L5M3S', 'Distance to closest macrophage',
       'Basal area smoothed', 'Apical area smoothed',
       'Mean adjac Apical area', 'Mean adjac Height to BM',
       'Mean adjac Gaussian curvature - cell coords',
       'Mean adjac cyto_shcoeffs_L3M0C', 'Mean adjac cyto_shcoeffs_L4M2C',
       'Mean adjac cyto_shcoeffs_L4M4C', 'Mean adjac cyto_shcoeffs_L5M1C',
       'Mean adjac cyto_shcoeffs_L3M3S', 'Mean adjac

In [15]:
'Mean curvature' in rfecv.get_feature_names_out()

False