# Content
Random forest classifier models of dividing vs. differentiating cells
1. Model of birth frame
2. Model of mother division frame
3. Model of mother division frame - grouping 1diff and 2diff daughters together
4. Model of the frame 12h before mother division

## 0. Load datasets

In [67]:
# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sb

# General utils
from tqdm import tqdm
from os import path

from measurements import get_prev_or_next_frame
from sklearn import preprocessing, model_selection
from sklearn import ensemble, metrics, inspection

def get_balanced_df_by_category(df,category):
    """Return a class-balanced version of ``df`` by downsampling larger groups.

    Rows are grouped with ``df.groupby(category)``. The first group that has
    the fewest rows is kept whole; every other group is randomly downsampled
    with ``DataFrame.sample`` to that same row count.

    Parameters
    ----------
    df : pd.DataFrame
        Table to balance.
    category
        Anything accepted by ``DataFrame.groupby`` (e.g. a column label, or an
        array of group labels aligned with the rows of ``df``).

    Returns
    -------
    pd.DataFrame
        The per-group pieces concatenated in groupby iteration order, with a
        fresh 0..n-1 index.
    """
    groups = dict(iter(df.groupby(category)))
    group_sizes = [len(rows) for rows in groups.values()]

    # The target size is that of the smallest group; ties resolve to the first one
    target_size = min(group_sizes)
    smallest_position = group_sizes.index(target_size)

    balanced_parts = [
        rows if position == smallest_position else rows.sample(target_size)
        for position, rows in enumerate(groups.values())
    ]
    return pd.concat(balanced_parts,ignore_index=True)
    
def predict_feature(df:pd.DataFrame,
                    classifier,
                    feature2predict,
                    Niter:int=100,
                    rebalance:bool=True,
                    subsample=None):
    """Fit ``classifier`` on ``Niter`` random train/test splits and collect predictions.

    On every iteration the table is (optionally) re-balanced on the target
    label, the 'Measurement' columns are standardized, the rows are split
    with ``model_selection.train_test_split`` and a fresh copy of
    ``classifier`` is fitted on the training part.

    Parameters
    ----------
    df : pd.DataFrame
        Table with two-level columns; ``(feature2predict, 'Meta')`` is the
        label and every ``(*, 'Measurement')`` column is a predictor.
    classifier
        Unfitted scikit-learn estimator used as a template. It is cloned for
        every iteration, so the object passed in is left unfitted.
    feature2predict
        First-level name of the 'Meta' column to predict.
    Niter : int
        Number of resample / split / fit rounds.
    rebalance : bool
        If True (default), downsample with ``get_balanced_df_by_category`` on
        every iteration; if False, use ``df`` as given.
    subsample
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    list
        ``[results, forests]`` where ``results`` is
        ``np.stack((Y_test, Y_pred))`` (true labels first, predictions second,
        one entry per iteration) and ``forests`` is the list of the ``Niter``
        independently fitted models.
    """
    # Function-local import so the block does not depend on edits elsewhere in the notebook
    from sklearn.base import clone

    Y_test = []
    Y_pred = []
    forests = []
    for _ in tqdm(range(Niter)):

        # Honour `rebalance` (previously the flag was ignored and the data always rebalanced)
        if rebalance:
            df_ = get_balanced_df_by_category(df, df[feature2predict,'Meta'].values)
        else:
            df_ = df

        y = df_[feature2predict,'Meta'].astype(float)
        X = df_.xs('Measurement',level=1,axis=1).astype(float)
        # NOTE(review): standardization uses all rows, i.e. it happens before the
        # train/test split below -- kept as in the original analysis
        X = preprocessing.scale(X)

        X_train,X_test,y_train,y_test = model_selection.train_test_split(X,y)

        # `fit` returns the estimator itself, so fitting `classifier` directly would
        # leave `forests` holding Niter references to one (last-fitted) model.
        model = clone(classifier).fit(X_train,y_train)
        forests.append(model)

        Y_test.append(y_test)
        Y_pred.append(model.predict(X_test))

    return [np.stack((np.array(Y_test),np.array(Y_pred))),forests]
    

In [68]:
# Region label -> directory that holds that region's data
dirnames = {'R1':'/Users/xies/OneDrive - Stanford/Skin/Mesa et al/W-R1/',
           'R2':'/Users/xies/OneDrive - Stanford/Skin/Mesa et al/W-R2/'}
all_df = []
for name,dirname in dirnames.items():
    # Load the per-region table, de-duplicate it and flatten its index into columns
    _df = (
        pd.read_pickle(path.join(dirname,'Mastodon/single_timepoints_dynamics_aggregated_lookback_history.pkl'))
        .drop_duplicates()
        .sort_index()
        .reset_index()
    )
    # Prefix track IDs with the region label so they stay distinct after concatenation
    _df['TrackID'] = name + '_' + _df['TrackID'].astype(str)
    _df = _df.set_index(['Frame','TrackID'])
    _df['Region'] = name
    all_df.append(_df)

# Stack both regions, then build a TrackID -> per-track table lookup
all_df = pd.concat(all_df)
all_tracks = dict(iter(all_df.reset_index().groupby('TrackID')))


In [69]:
len(all_tracks)

2323

### Isolate specific time points

In [70]:
# --- Birth frames ---
# Keep rows whose fate is known, that are not flagged as border, and whose cell type is 'Basal'
df = all_df[all_df['Fate known','Meta']]
df = df[ ~df['Border','Meta'].astype(bool)]
df = df[ df['Cell type','Meta'] == 'Basal']

births = df[df['Birth frame','Meta']]
# Snapshot of the birth frames taken before later cells drop columns / rows from `births`
births_raw = births.copy()
print(f'Number of births: {len(births)}')

# --- Mother division frames ---
# Restart from all_df: keep rows with a non-NaN 'Both daughters differentiated' value,
# then apply the same border / basal filters as above
df = all_df[~np.isnan(all_df['Both daughters differentiated','Meta'].astype(float))]
df = df[ ~df['Border','Meta'].astype(bool)]
df = df[ df['Cell type','Meta'] == 'Basal']

# Rows flagged as dividing in the next frame
divisions = df[df[('Divide next frame','Meta')]].copy()
# NOTE(review): `df` was already filtered on 'Border' just above, so this filter removes nothing
divisions = divisions[~divisions['Border','Meta'].astype(bool)]
divisions = divisions.reset_index()
print(f'Number of mother divisions: {len(divisions)}')

# --- Frames preceding each division ---
# get_prev_or_next_frame comes from the `measurements` module; presumably it returns, for each
# division row, the same track's row `increment` frames earlier -- TODO confirm against its source.
# NOTE(review): this first lookup searches the filtered `df`, whereas the 2/3/4-frame lookups
# below search the unfiltered `all_df`; confirm the difference is intended.
prev_div_frame = [get_prev_or_next_frame(df,f,direction='prev') for _,f in divisions.iterrows()]
# Concatenate the lookup results side by side, then transpose (assuming each result is a
# Series, this yields one row per result)
prev_div_frame = pd.concat(prev_div_frame,axis=1,ignore_index=False).T
#@todo: investigate efficiency of force-casting at different points
# Cast every column back to the dtype it has in `df`
for col in df.columns:
    prev_div_frame[col] = prev_div_frame[col].astype(df[col].dtype)
prev_div_frame = prev_div_frame[~prev_div_frame['Border','Meta'].astype(bool)]
# NOTE(review): only this table is re-indexed by TrackID; the 2/3/4-frame tables below keep
# the index produced by the concat, and they filter on 'Border' before casting dtypes
prev_div_frame = prev_div_frame.reset_index().rename(columns={'level_0':'Frame','level_1':'TrackID'}).set_index('TrackID')
print(f'Number of 12h prior to divisions: {len(prev_div_frame)}')

# Same lookup two frames back (reported as 24h)
prev2_div_frame = [get_prev_or_next_frame(all_df,f,direction='prev', increment=2) for _,f in divisions.iterrows()]
prev2_div_frame = pd.concat(prev2_div_frame,axis=1).T
prev2_div_frame = prev2_div_frame[~prev2_div_frame['Border','Meta'].astype(bool)]
for col in df.columns:
    prev2_div_frame[col] = prev2_div_frame[col].astype(df[col].dtype)
print(f'Number of 24h prior to divisions: {len(prev2_div_frame)}')

# Same lookup three frames back (reported as 36h)
prev3_div_frame = [get_prev_or_next_frame(all_df,f,direction='prev', increment=3) for _,f in divisions.iterrows()]
prev3_div_frame = pd.concat(prev3_div_frame,axis=1).T
prev3_div_frame = prev3_div_frame[~prev3_div_frame['Border','Meta'].astype(bool)]
for col in df.columns:
    prev3_div_frame[col] = prev3_div_frame[col].astype(df[col].dtype)
print(f'Number of 36h prior to divisions: {len(prev3_div_frame)}')

# Same lookup four frames back (reported as 48h)
prev4_div_frame = [get_prev_or_next_frame(all_df,f,direction='prev', increment=4) for _,f in divisions.iterrows()]
prev4_div_frame = pd.concat(prev4_div_frame,axis=1).T
prev4_div_frame = prev4_div_frame[~prev4_div_frame['Border','Meta'].astype(bool)]
for col in df.columns:
    prev4_div_frame[col] = prev4_div_frame[col].astype(df[col].dtype)
print(f'Number of 48h prior to divisions: {len(prev4_div_frame)}')


Number of births: 764
Number of mother divisions: 307
Number of 12h prior to divisions: 243
Number of 24h prior to divisions: 206
Number of 36h prior to divisions: 157
Number of 48h prior to divisions: 115


In [71]:
# Censor age / time til diff
features2drop = [f for f in births.columns.get_level_values(0) if 'Time to differentiation' in f]
features2drop += [f for f in births.columns.get_level_values(0) if 'Age' in f]
# Censor height information
features2drop = features2drop + ['Z','Mean curvature - cell coords','Z-cyto','Height to BM',
                                 'Basal area','Apical area','Relative Height to BM','Relative Height to BM at 1 frame prior']
# Censor all exponential rates (but not mother's)
features2drop = features2drop + [f for f in births.columns.get_level_values(0)
                                 if ('exponential' in f and 'frame prior' not in f \
                                         and 'G1' not in f)]

np.isnan(births['Cell volume G1 only exponential growth rate']).sum()

Metadata
Measurement    348
dtype: int64

In [72]:
# Names of the metadata columns (second column level == 'Meta') of the birth table
metas = pd.DataFrame()
metas['name'] = births.xs('Meta',level=1,axis=1).columns
features = pd.DataFrame()

# One row per measurement feature, indexed by feature name, with its NA count among the births
measurements = births.xs('Measurement',level=1,axis=1)
features['name'] = measurements.columns
features =features.set_index('name')
features['Num NA'] = measurements.isna().sum(axis=0)
feature_names = features.index
# Number of measurement features that have at least one NA
num_nas = (features['Num NA'] > 0).sum()

# print(f'Number of features with NAs: {num_nas}')
# print(features.sort_values('Num NA').tail(50))

# Measurement features with more than `na_thresh` NAs among the births are added to the drop list
na_thresh = 350
# NOTE(review): `+=` extends the existing `features2drop` list in place and the drops below
# reassign the same names, so this cell is not idempotent -- re-running it without first
# re-running the upstream cells is expected to fail because the columns are already gone.
features2drop += features[features['Num NA'] > na_thresh].index.values.tolist()

# Remove the censored / high-NA features from each table.
# NOTE(review): prev3_div_frame and prev4_div_frame are not trimmed here.
births = births.drop(columns=features2drop)
divisions = divisions.drop(columns=features2drop)
prev_div_frame = prev_div_frame.drop(columns=features2drop)
prev2_div_frame = prev2_div_frame.drop(columns=features2drop)

  births = births.drop(columns=features2drop)
  divisions = divisions.drop(columns=features2drop)
  prev_div_frame = prev_div_frame.drop(columns=features2drop)
  prev2_div_frame = prev2_div_frame.drop(columns=features2drop)


In [66]:
# NOTE(review): within this notebook `conservative_feature_set` is first assigned in a later
# cell, so on a fresh kernel (Restart & Run All) this display cell raises NameError unless it
# is moved below that assignment.
conservative_feature_set

Index(['Nuclear volume', 'Nuclear solidity', 'Y', 'X', 'Nuclear height',
       'Cell volume', 'Y-cyto', 'X-cyto', 'Axial component',
       'Planar component 1',
       ...
       'Relative nuc_shcoeffs_L4M4S', 'Relative nuc_shcoeffs_L5M1S',
       'Relative nuc_shcoeffs_L5M2S', 'Relative nuc_shcoeffs_L5M3S',
       'Relative nuc_shcoeffs_L5M4S', 'Relative nuc_shcoeffs_L5M5S',
       'Relative nuc_shcoeffs_surface_area',
       'Relative Distance to closest macrophage',
       'Relative Nuclear volume standard', 'Relative Cell volume standard'],
      dtype='object', name='Name', length=753)

In [73]:
# Births: drop the samples (rows) that have an NA in any remaining 'Measurement' column.
# Division-derived tables: keep every sample and instead drop the columns that contain an NA,
# which is what shrinks the leftover features list.

births = births.dropna(axis=0,subset=[(a,b) for (a,b) in births.columns if b == 'Measurement'], how='any')
divisions = divisions.dropna(axis=1)
prev_div_frame = prev_div_frame.dropna(axis=1)
prev2_div_frame = prev2_div_frame.dropna(axis=1)

# Report how many samples remain in each table ...
print('---')
print(f'Number of births: {len(births)}')
print(f'Number of mother divisions: {len(divisions)}')
print(f'Number of 12h prior to divisions: {len(prev_div_frame)}')
print(f'Number of 24h prior to divisions: {len(prev2_div_frame)}')

# ... and how many columns (Meta and Measurement together) each one keeps
print('---')
print(f'Births: {len(births.columns)} features')
print(f'Mother divisions: {len(divisions.columns)} features')
print(f'12h prior to divisions: {len(prev_div_frame.columns)} features')
print(f'24h prior to divisions: {len(prev2_div_frame.columns)} features')

# Measurement feature names per table; the 24h-prior table defines the "conservative" set
birth_feature_set = births.xs('Measurement',level=1,axis=1).columns
conservative_feature_set = prev2_div_frame.xs('Measurement',level=1,axis=1).columns
# Sanity check: the conservative features contain no NaN in any of the four tables
assert(np.isnan(births[conservative_feature_set]).values.sum() == 0)
assert(np.isnan(divisions[conservative_feature_set]).values.sum() == 0)
assert(np.isnan(prev_div_frame[conservative_feature_set]).values.sum() == 0)
assert(np.isnan(prev2_div_frame[conservative_feature_set]).values.sum() == 0)

---
Number of births: 377
Number of mother divisions: 307
Number of 12h prior to divisions: 243
Number of 24h prior to divisions: 206
---
Births: 1366 features
Mother divisions: 781 features
12h prior to divisions: 779 features
24h prior to divisions: 778 features


## Recursive feature elimination CV: birth

In [74]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Target label plus the conservative measurement features, taken from the birth frames
feature2predict = 'Will differentiate'
cols2keep = [(f,'Measurement') for f in conservative_feature_set] + [(feature2predict,'Meta')]

X = births[cols2keep].xs('Measurement',axis=1,level=1)
y = births[feature2predict,'Meta']

# Recursive feature elimination: random forest, 5-fold stratified CV scored on accuracy,
# removing one feature per step
min_features_to_select = 1  # Minimum number of features to consider
clf = ensemble.RandomForestClassifier()
cv = StratifiedKFold(5)
rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 443


In [75]:
rfecv.get_feature_names_out()

array(['Nuclear volume', 'Nuclear height', 'Cell volume', 'X-cyto',
       'Axial component', 'Planar component 1', 'Planar component 2',
       'Mean H2B intensity', 'Total H2B intensity',
       'Mean FUCCI intensity', 'Total FUCCI intensity',
       'Basal orientation', 'Basal eccentricity', 'Cell height',
       'Collagen coherence', 'Collagen intensity',
       'Gaussian curvature - cell coords', 'Mean curvature',
       'cyto_shcoeffs_L1M0C', 'cyto_shcoeffs_L1M1C',
       'cyto_shcoeffs_L2M0C', 'cyto_shcoeffs_L3M0C',
       'cyto_shcoeffs_L3M2C', 'cyto_shcoeffs_L4M0C',
       'cyto_shcoeffs_L4M1C', 'cyto_shcoeffs_L4M2C',
       'cyto_shcoeffs_L4M3C', 'cyto_shcoeffs_L4M4C',
       'cyto_shcoeffs_L5M0C', 'cyto_shcoeffs_L5M1C',
       'cyto_shcoeffs_L5M2C', 'cyto_shcoeffs_L5M5C',
       'cyto_shcoeffs_L3M1S', 'cyto_shcoeffs_L4M1S',
       'cyto_shcoeffs_L4M2S', 'cyto_shcoeffs_L4M3S',
       'cyto_shcoeffs_L4M4S', 'cyto_shcoeffs_L5M1S',
       'cyto_shcoeffs_L5M2S', 'cyto_shcoeffs_L5

## Recursive feature elimination CV: division

In [76]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Target label plus the conservative measurement features, taken from the mother division frames
feature2predict = 'At least one daughter differentiated'
cols2keep = [(f,'Measurement') for f in conservative_feature_set] + [(feature2predict,'Meta')]

X = divisions[cols2keep].xs('Measurement',axis=1,level=1)
y = divisions[feature2predict,'Meta'].astype(bool)

# Recursive feature elimination: random forest, 5-fold stratified CV scored on accuracy,
# removing one feature per step
min_features_to_select = 1  # Minimum number of features to consider
clf = ensemble.RandomForestClassifier()
cv = StratifiedKFold(5)
rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 6


In [77]:
rfecv.get_feature_names_out()

array(['Mean curvature', 'cyto_shcoeffs_L3M0C', 'Basal area smoothed',
       'Mean adjac Cell volume standard',
       'Median adjac Cell volume standard', 'Relative Basal area'],
      dtype=object)

In [60]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Target label plus the conservative measurement features, taken from the frames one step
# before each mother division
feature2predict = 'At least one daughter differentiated'
cols2keep = [(f,'Measurement') for f in conservative_feature_set] + [(feature2predict,'Meta')]

X = prev_div_frame[cols2keep].xs('Measurement',axis=1,level=1)
y = prev_div_frame[feature2predict,'Meta'].astype(bool)

# Recursive feature elimination: random forest, 5-fold stratified CV scored on accuracy,
# removing one feature per step
min_features_to_select = 1  # Minimum number of features to consider
clf = ensemble.RandomForestClassifier()
cv = StratifiedKFold(5)
rfecv = RFECV(
    estimator=clf,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

Optimal number of features: 114


In [61]:
rfecv.get_feature_names_out()

array(['Axial component', 'Axial angle', 'Middle area',
       'Collagen coherence', 'Mean curvature', 'cyto_shcoeffs_L1M0C',
       'cyto_shcoeffs_L2M0C', 'cyto_shcoeffs_L3M0C',
       'cyto_shcoeffs_L4M0C', 'cyto_shcoeffs_L5M4C',
       'cyto_shcoeffs_L3M1S', 'cyto_shcoeffs_L3M2S',
       'cyto_shcoeffs_L5M1S', 'cyto_shcoeffs_L5M2S',
       'cyto_shcoeffs_surface_area', 'nuc_shcoeffs_L3M0C',
       'nuc_shcoeffs_L4M0C', 'nuc_shcoeffs_L4M2C', 'nuc_shcoeffs_L5M0C',
       'nuc_shcoeffs_L3M3S', 'nuc_shcoeffs_L4M3S', 'nuc_shcoeffs_L5M2S',
       'nuc_shcoeffs_L5M3S', 'Nuclear volume smoothed',
       'Basal area smoothed', 'Mean adjac Apical area',
       'Mean adjac cyto_shcoeffs_L4M4C', 'Mean adjac cyto_shcoeffs_L5M1C',
       'Mean adjac cyto_shcoeffs_L3M3S', 'Mean adjac cyto_shcoeffs_L5M3S',
       'Mean adjac cyto_shcoeffs_L5M5S', 'Mean adjac nuc_shcoeffs_L2M1S',
       'Mean adjac nuc_shcoeffs_L4M1S', 'Mean adjac nuc_shcoeffs_L4M4S',
       'Mean adjac nuc_shcoeffs_L5M1S', 'Mean ad