In [41]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr

These are all available features provided by the mappings_with_conductance.csv dataset.

In [42]:
ELECPHYS_FEATURES = ['decay_CaDynamics_E2_axonal', 'decay_CaDynamics_E2_somatic', 'e_pas_axonal', 'e_pas_somatic', 'gCa_LVAstbar_Ca_LVAst_axonal', 
                    'gCa_LVAstbar_Ca_LVAst_somatic', 'gCabar_Ca_axonal', 'gCabar_Ca_somatic', 'gIhbar_Ih_dend', 'gImbar_Im_axonal', 'gImbar_Im_dend', 
                    'gImbar_Im_somatic', 'gK_Pstbar_K_Pst_axonal', 'gK_Pstbar_K_Pst_somatic', 'gK_Tstbar_K_Tst_axonal', 'gK_Tstbar_K_Tst_dend', 
                    'gK_Tstbar_K_Tst_somatic', 'gNaTa_tbar_NaTa_t_axonal', 'gNaTs2_tbar_NaTs2_t_dend', 'gNaTs2_tbar_NaTs2_t_somatic', 'gNap_Et2bar_Nap_Et2_axonal', 
                    'gNap_Et2bar_Nap_Et2_dend',  'gSK_E2bar_SK_E2_axonal', 'gSK_E2bar_SK_E2_somatic', 'gSKv3_1bar_SKv3_1_axonal', 'gSKv3_1bar_SKv3_1_dend', 
                    'gSKv3_1bar_SKv3_1_somatic', 'g_pas_axonal', 'g_pas_somatic', 'gamma_CaDynamics_E2_axonal', 'gamma_CaDynamics_E2_somatic', 'g_pas_dend', 
                    'gNap_Et2bar_Nap_Et2_somatic']

MORPH_FEATURES = ['length_soma', 'avg_length_dendrite', 'avg_length_axon', 'total_length_dendrite', 'total_length_axon', 
                    'length_dendrite_var', 'length_axon_var', 'count_axon', 'count_dendrite', 'avg_diameter_dendrite',
                    'avg_diameter_axon', 'var_diameter_dendrite', 'var_diameter_axon', 'median_length_dendrite', 'median_length_axon',
                    'median_diameter_dendrite', 'median_diameter_axon', 'var_children_axon', 'var_children_dendrite', 'median_children_axon',
                    'median_children_dendrite', 'mean_children_axon', 'mean_children_dendrite']

ALL_FEATURES = ELECPHYS_FEATURES + MORPH_FEATURES

ISOLATED_MORPH_FEATURES = ['total_length_axon', 'length_soma', 'var_children_dendrite']

FEATURES_TO_RANK = ELECPHYS_FEATURES + MORPH_FEATURES

In [43]:
DCA_THRESHOLD_PER_CELL_UNNORM_95 = pd.read_csv('data/dca_threshold_per_cell_unnorm_950.0.csv')
DCA_THRESHOLD_PER_CELL_UNNORM_68 = pd.read_csv('data/dca_threshold_per_cell_unnorm_680.0.csv')

DCA_THRESHOLD_PER_CELL_UNNORM = DCA_THRESHOLD_PER_CELL_UNNORM_95

MAPPINGS_WITH_CONDUCTANCE = pd.read_csv('data/mappings_with_conductance.csv')

# 1. Random Forest Predictor

Objective is to predict the number of DCA components required to cross $n\%$ of the threshold where $n=68,95$, as well as the predictive information at that number of dynamical components. 

In [44]:
def features_with_pi_from_dataframe(features, dataframe=MAPPINGS_WITH_CONDUCTANCE, threshold_percent=0.95):
    dataframe_with_features = dataframe[['bbp_name'] + FEATURES_TO_RANK].rename(columns={'bbp_name': 'cell_name'})
    threshold_info = None
    if threshold_percent == 0.95:
        threshold_info = DCA_THRESHOLD_PER_CELL_UNNORM_95
    elif threshold_percent == 0.68:
        threshold_info = DCA_THRESHOLD_PER_CELL_UNNORM_68
    
    return dataframe_with_features.merge(threshold_info[['cell_name', 'pi']], on='cell_name', how='inner')

def features_with_dca_from_dataframe(features, dataframe=MAPPINGS_WITH_CONDUCTANCE, threshold_percent=0.95):
    dataframe_with_features = dataframe[['bbp_name'] + FEATURES_TO_RANK].rename(columns={'bbp_name': 'cell_name'})
    threshold_info = None
    if threshold_percent == 0.95:
        threshold_info = DCA_THRESHOLD_PER_CELL_UNNORM_95
    elif threshold_percent == 0.68:
        threshold_info = DCA_THRESHOLD_PER_CELL_UNNORM_68
    
    return dataframe_with_features.merge(threshold_info[['cell_name', 'dca_level']], on='cell_name', how='inner')

FEATURES_WITH_PI = features_with_pi_from_dataframe(ALL_FEATURES)
FEATURES_WITH_DCA = features_with_dca_from_dataframe(ALL_FEATURES)

## 1.1 Grid Search for optimal parameters

In [45]:
def random_forest_predictor(features_with_predicted_val, val_to_predict='pi', random_state=1, num_estimators=100, split=0.2):
    X = features_with_predicted_val.drop([val_to_predict], axis=1)  
    y = features_with_predicted_val[val_to_predict]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=random_state)
    X_train = X_train.drop(['cell_name'], axis=1)
    X_test = X_test.drop(['cell_name'], axis=1)
    
    rf_model = RandomForestRegressor(n_estimators=num_estimators, random_state=random_state)
    rf_model.fit(X_train, y_train)   
    
    y_pred = rf_model.predict(X_test)
    
    return {'MSE' : mean_squared_error(y_test, y_pred), 'R2' : r2_score(y_test, y_pred), 'model' : rf_model,
           'y_pred' : y_pred, 'y_test' : np.array(y_test)}

def grid_search_random_forest_predictor(features_with_predicted_val, val_to_predict='pi', random_state=1,
                                        estimators=[25, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500], num_folds=5, split=0.2):
    param_grid = {
        'n_estimators': estimators
    }
    
    X = features_with_predicted_val.drop([val_to_predict], axis=1)  
    y = features_with_predicted_val[val_to_predict]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=random_state)
    X_train = X_train.drop(['cell_name'], axis=1)
    X_test = X_test.drop(['cell_name'], axis=1)
    X = X.drop(['cell_name'], axis=1)
    
    rf_model = grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=random_state), param_grid=param_grid, cv=num_folds)
    rf_model.fit(X, y)   
    
    best_params = rf_model.best_params_
    final_rf_model = RandomForestRegressor(**best_params, random_state=random_state)
    final_rf_model.fit(X_train, y_train)
    
    y_pred = final_rf_model.predict(X_test)
    
    return {'MSE' : mean_squared_error(y_test, y_pred), 'R2' : r2_score(y_test, y_pred), 'model' : final_rf_model,
           'y_pred' : y_pred, 'y_test' : np.array(y_test)}

In [51]:
MODEL_PARAMS = grid_search_random_forest_predictor(FEATURES_WITH_PI)
RF_MODEL_PI = MODEL_PARAMS['model']

print(MODEL_PARAMS)

{'MSE': 290.76946771349833, 'R2': 0.29358516910319055, 'model': RandomForestRegressor(n_estimators=25, random_state=1), 'y_pred': array([ 65.29756098,  68.0196679 ,  43.53135584,  55.41102508,
        45.95468666,  66.15395689,  49.48634629,  40.79350602,
        52.63128363,  65.29756098, 155.69007711,  45.95468666,
        45.95468666,  60.51012537,  65.29756098,  55.41102508,
        60.51012537,  54.65564092,  60.51012537,  54.65564092,
        36.49053377,  54.65564092,  36.49053377,  40.79350602,
        36.49053377,  68.0196679 ,  49.48634629,  45.1131849 ,
        60.51012537,  65.29756098,  40.79350602,  63.20772878,
        36.49053377,  40.79350602]), 'y_test': array([ 59.84206766,  74.03384269,  47.07146614,  85.43888415,
        39.74704804,  51.43794247,  25.13691713,  30.09060007,
        32.42495284,  30.62176143, 131.21212296,  72.60549004,
        46.2709677 ,  48.0459711 ,  55.41654192,  51.05681338,
        56.79624655,  46.33505967,  36.5437342 ,  34.46967823,
    

In [52]:
grid_search_random_forest_predictor(features_with_pi_from_dataframe(ELECPHYS_FEATURES + ISOLATED_MORPH_FEATURES))

{'MSE': 290.76946771349833,
 'R2': 0.29358516910319055,
 'model': RandomForestRegressor(n_estimators=25, random_state=1),
 'y_pred': array([ 65.29756098,  68.0196679 ,  43.53135584,  55.41102508,
         45.95468666,  66.15395689,  49.48634629,  40.79350602,
         52.63128363,  65.29756098, 155.69007711,  45.95468666,
         45.95468666,  60.51012537,  65.29756098,  55.41102508,
         60.51012537,  54.65564092,  60.51012537,  54.65564092,
         36.49053377,  54.65564092,  36.49053377,  40.79350602,
         36.49053377,  68.0196679 ,  49.48634629,  45.1131849 ,
         60.51012537,  65.29756098,  40.79350602,  63.20772878,
         36.49053377,  40.79350602]),
 'y_test': array([ 59.84206766,  74.03384269,  47.07146614,  85.43888415,
         39.74704804,  51.43794247,  25.13691713,  30.09060007,
         32.42495284,  30.62176143, 131.21212296,  72.60549004,
         46.2709677 ,  48.0459711 ,  55.41654192,  51.05681338,
         56.79624655,  46.33505967,  36.5437342 ,  3

## 1.2 Feature Importance with Spearman

In [53]:
def get_feature_spearman_corr(dataframe, model, val_to_predict='pi'):
    X = dataframe.drop([val_to_predict], axis=1)  
    X = X.drop(['cell_name'], axis=1)
    y = dataframe[val_to_predict]
    spearman_rank_correlation = {}
    
    y_pred = model.predict(X)
    
    for column in X.columns:
        correlation, _ = spearmanr(X[column], y_pred)
        spearman_rank_correlation[column] = correlation
    
    ranked_features_df = pd.DataFrame(spearman_rank_correlation.items(), columns=['Feature', 'Spearman_Correlation'])
    ranked_features_df = ranked_features_df.sort_values(by='Spearman_Correlation', ascending=False)
    return ranked_features_df

In [54]:
get_feature_spearman_corr(FEATURES_WITH_PI, RF_MODEL)



Unnamed: 0,Feature,Spearman_Correlation
12,gK_Pstbar_K_Pst_axonal,0.787144
22,gSK_E2bar_SK_E2_axonal,0.785518
19,gNaTs2_tbar_NaTs2_t_somatic,0.783436
30,gamma_CaDynamics_E2_somatic,0.777461
1,decay_CaDynamics_E2_somatic,0.645859
24,gSKv3_1bar_SKv3_1_axonal,0.642062
0,decay_CaDynamics_E2_axonal,0.505672
5,gCa_LVAstbar_Ca_LVAst_somatic,0.486422
51,var_children_dendrite,0.463573
33,length_soma,0.439083
