# Cortical layer prediction  
Training models to predict cortical layers. The layer labels are based either on the actual section (human SS data) or on information in the cell type labels (all datasets).

Load useful libraries

In [2]:
# data management
import pandas as pd
from scipy.io import mmread
import scanpy as sc
import pickle
import numpy as np

# scaler
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# addition to models
from sklearn.multioutput import MultiOutputClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multioutput import ClassifierChain

# data splitting
from sklearn.model_selection import train_test_split

# scoring
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

# time execution
import time

# number of parallel workers, passed as n_jobs to the scikit-learn estimators below
ncores = 32

## Human 10x

Read in data

In [39]:
# Expression matrix: read the Matrix Market file, convert it to CSR and transpose it
# (presumably so that rows are cells and columns are genes -- TODO confirm).
exp_human = (
    mmread("../data/expression/indiv_human/human10x_SCTdata.mtx")
    .tocsr()
    .transpose()
)

# Metadata: the first CSV column becomes the index; only positional columns 4-10 are kept.
meta_human = (
    pd.read_csv(
        "../data/expression/indiv_human/human10x_metadata_layers.csv",
        index_col=0,
    )
    .iloc[:, 4:11]
)

Split data into train and test fractions

In [40]:
# Hold out 20% of the cells as the test fraction, stratifying on the "layer" column.
# The targets are all retained metadata columns except the first one.
X_train, X_test, y_train, y_test = train_test_split(
    exp_human,
    meta_human.iloc[:, 1:],
    test_size=0.2,
    random_state=42,
    stratify=meta_human["layer"].values,
)

Scale data (based on training data)

In [41]:
# Fit the scaler on the training fraction only. with_mean=False disables centering,
# so only variance scaling is applied (centering is not supported on sparse input).
scaler = StandardScaler(with_mean=False).fit(X_train)

# apply the training-derived scaling to both fractions
X_train_sc, X_test_sc = (scaler.transform(part) for part in (X_train, X_test))

Create classifier

In [5]:
# Base estimator: random forest with 300 trees and a fixed seed, using `ncores` workers.
rfc = RandomForestClassifier(
    n_estimators=300,
    random_state=1,
    n_jobs=ncores,
)

Train classifier

In [6]:
# Wrap the forest in 5-fold sigmoid calibration and fit one calibrated model per
# column of y_train; the wall-clock fitting time is printed.
start_time = time.time()
calibrated_rf = CalibratedClassifierCV(rfc, method="sigmoid", cv=5, n_jobs=ncores)
cal_rfc = MultiOutputClassifier(calibrated_rf, n_jobs=ncores)
cal_rfc.fit(X_train_sc, y_train)
elapsed = round(time.time() - start_time, 2)
print("RF 1vRest: %s seconds" % elapsed)

# persist the fitted model so it can be reloaded without retraining
with open("../results/cross_sp_predictions/cal_rfc_human_layersMulti_model.pkl", "wb") as model_file:
    pickle.dump(cal_rfc, file=model_file)

RF 1vRest: 3060.78 seconds


Use models to predict the test data

In [7]:
# Predict the layer labels of the held-out cells and report how long it took.
start_time = time.time()
pred_rfc = cal_rfc.predict(X_test_sc)
elapsed = round(time.time() - start_time, 2)
print("RF 1vRest: %s seconds" % elapsed)

RF 1vRest: 93.33 seconds


Get F1 score for predictions

In [8]:
# Macro-averaged F1 (unweighted mean over the label columns) on the test fraction.
f1_rfc = f1_score(
    y_true=y_test,
    y_pred=pred_rfc,
    average="macro",
)
f1_rfc

0.9380289633746748

In [9]:
# Per-label (precision, recall, F-score, support) tuple; undefined ratios are reported as 0.
all_rfc = precision_recall_fscore_support(
    y_true=y_test,
    y_pred=pred_rfc,
    zero_division=0,
)

Get probabilities

In [10]:
# Calibrated class probabilities for the held-out cells, with timing.
start_time = time.time()
proba_rfc = cal_rfc.predict_proba(X_test_sc)
elapsed = round(time.time() - start_time, 2)
print("RF 1vRest: %s seconds" % elapsed)

RF 1vRest: 118.39 seconds


In [3]:
# Reload the fitted multi-output model saved by the training cell.
# NOTE: pickle.load executes arbitrary code -- only load files you produced yourself.
with open("../results/cross_sp_predictions/cal_rfc_human_layersMulti_model.pkl", "rb") as model_file:
    cal_rfc = pickle.load(model_file)

In [10]:
# Load a previously saved model into `cal_lr`.
# NOTE(review): this file is not written by any cell visible here, and `cal_lr`
# is not used in the visible cells -- confirm it is still needed.
# NOTE: pickle.load executes arbitrary code -- only load files you produced yourself.
with open("../results/cross_sp_predictions/cal_lr_human_model.pkl", "rb") as model_file:
    cal_lr = pickle.load(model_file)

In [42]:
# Recompute the test-set probabilities with the (re)loaded model.
proba_rfc = cal_rfc.predict_proba(
    X_test_sc
)

In [52]:
# Feature importances of the random forest inside the first calibrated classifier
# of the first per-target model.
# The fitted forest is exposed as `.estimator` in scikit-learn >= 1.2 and as
# `.base_estimator` in older releases (the old name was removed in 1.4), so look
# up whichever attribute this installation provides.
first_calibrated = cal_rfc.estimators_[0].calibrated_classifiers_[0]
forest_attr = "estimator" if hasattr(first_calibrated, "estimator") else "base_estimator"
getattr(first_calibrated, forest_attr).feature_importances_

array([9.14055339e-07, 0.00000000e+00, 1.20497052e-06, ...,
       3.22673511e-05, 6.87246292e-06, 4.65000386e-05])

## Human SS

Read in data

In [13]:
# Expression matrix: read the Matrix Market file, convert it to CSR and transpose it
# (presumably so that rows are cells and columns are genes -- TODO confirm).
exp_human_SS = (
    mmread("../data/expression/indiv_human/humanSS_SCTdata.mtx")
    .tocsr()
    .transpose()
)

# Metadata: the first CSV column becomes the index; only positional columns 4-11 are kept.
meta_human_SS = (
    pd.read_csv(
        "../data/expression/indiv_human/humanSS_metadata_layers.csv",
        index_col=0,
    )
    .iloc[:, 4:12]
)

### Using the cell type label

Split data into train and test fractions

In [26]:
# Hold out 20% of the cells as the test fraction, stratifying on the "layer" column.
# The targets are positional metadata columns 1-6.
XSS_train, XSS_test, ySS_train, ySS_test = train_test_split(
    exp_human_SS,
    meta_human_SS.iloc[:, 1:7],
    test_size=0.2,
    random_state=42,
    stratify=meta_human_SS["layer"].values,
)

Scale data (based on training data)

In [27]:
# Fit the scaler on the training fraction only. with_mean=False disables centering,
# so only variance scaling is applied (centering is not supported on sparse input).
scalerSS = StandardScaler(with_mean=False).fit(XSS_train)

# apply the training-derived scaling to both fractions
XSS_train_sc, XSS_test_sc = (scalerSS.transform(part) for part in (XSS_train, XSS_test))

Create classifier

In [28]:
# Fresh, unfitted random forest (300 trees, fixed seed) for the human SS models.
rfc = RandomForestClassifier(
    n_estimators=300,
    random_state=1,
    n_jobs=ncores,
)

Train classifier

In [29]:
# Wrap the forest in 5-fold sigmoid calibration and fit one calibrated model per
# column of ySS_train; the wall-clock fitting time is printed.
start_time = time.time()
calibrated_rf_SS = CalibratedClassifierCV(rfc, method="sigmoid", cv=5, n_jobs=ncores)
cal_rfc_SS = MultiOutputClassifier(calibrated_rf_SS, n_jobs=ncores)
cal_rfc_SS.fit(XSS_train_sc, ySS_train)
elapsed = round(time.time() - start_time, 2)
print("RF 1vRest: %s seconds" % elapsed)

# persist the fitted model so it can be reloaded without retraining
with open("../results/cross_sp_predictions/cal_rfc_humanSS_layersMulti_model.pkl", "wb") as model_file:
    pickle.dump(cal_rfc_SS, file=model_file)

RF 1vRest: 2030.81 seconds


Use models to predict the test data

In [30]:
# Predict the layer labels of the held-out SS cells and report how long it took.
start_time = time.time()
pred_rfc_SS = cal_rfc_SS.predict(XSS_test_sc)
elapsed = round(time.time() - start_time, 2)
print("RF 1vRest: %s seconds" % elapsed)

RF 1vRest: 89.71 seconds


Get F1 score for predictions

In [31]:
# Macro-averaged F1 on the SS test fraction.
# NOTE(review): this reuses the name `f1_rfc`, replacing the human 10x score.
f1_rfc = f1_score(
    y_true=ySS_test,
    y_pred=pred_rfc_SS,
    average="macro",
)
f1_rfc

0.9311425767045359

In [32]:
# Per-label (precision, recall, F-score, support) tuple; undefined ratios are reported as 0.
all_rfc_SS = precision_recall_fscore_support(
    y_true=ySS_test,
    y_pred=pred_rfc_SS,
    zero_division=0,
)

In [36]:
# element 2 of the (precision, recall, F-score, support) tuple: the per-label F-scores
all_rfc_SS[2]

array([0.89266797, 0.91185682, 0.9462516 , 0.9339308 , 0.94365871,
       0.95848956])

Get probabilities

In [34]:
# Calibrated class probabilities for the held-out SS cells, with timing.
start_time = time.time()
proba_rfc_SS = cal_rfc_SS.predict_proba(XSS_test_sc)
elapsed = round(time.time() - start_time, 2)
print("RF 1vRest: %s seconds" % elapsed)

RF 1vRest: 114.79 seconds


310

### Using section labels

Subset data to remove WM

In [35]:
# Boolean mask over cells: True wherever the "layer2" label is anything other than "WM".
cond = meta_human_SS["layer2"].ne("WM")

Simplify layers

In [66]:
# Simplified label: keep only the first two characters of each "layer2" value.
# Adds the "layerS" column to meta_human_SS in place.
meta_human_SS["layerS"] = [
    section_label[:2] for section_label in meta_human_SS["layer2"]
]

Split data into train and test fractions

In [67]:
# Restrict both the expression matrix and the simplified labels to the cells kept
# by `cond`, then hold out 20% for testing, stratified on those same labels.
keep_rows = cond.values
section_labels = meta_human_SS.iloc[keep_rows, :]["layerS"].values
XSSl2_train, XSSl2_test, ySSl2_train, ySSl2_test = train_test_split(
    exp_human_SS[keep_rows, :],
    section_labels,
    test_size=0.2,
    random_state=42,
    stratify=section_labels,
)

Scale data (based on training data)

In [68]:
# Fit the scaler on the training fraction only. with_mean=False disables centering,
# so only variance scaling is applied (centering is not supported on sparse input).
scalerSSl2 = StandardScaler(with_mean=False).fit(XSSl2_train)

# apply the training-derived scaling to both fractions
XSSl2_train_sc, XSSl2_test_sc = (
    scalerSSl2.transform(part) for part in (XSSl2_train, XSSl2_test)
)

Create classifier

In [69]:
# Fresh, unfitted random forest (300 trees, fixed seed) for the section-label model.
rfc = RandomForestClassifier(
    n_estimators=300,
    random_state=1,
    n_jobs=ncores,
)

Train classifier

In [70]:
# Single-target model: the labels are one vector, so the 5-fold sigmoid-calibrated
# forest is fitted directly, without a multi-output wrapper.
start_time = time.time()
cal_rfc_SSl2 = CalibratedClassifierCV(
    rfc,
    method="sigmoid",
    cv=5,
    n_jobs=ncores,
)
cal_rfc_SSl2.fit(XSSl2_train_sc, ySSl2_train)
elapsed = round(time.time() - start_time, 2)
print("RF 1vRest: %s seconds" % elapsed)

# persist the fitted model so it can be reloaded without retraining
with open("../results/cross_sp_predictions/cal_rfc_humanSSl2_layersMulti_model.pkl", "wb") as model_file:
    pickle.dump(cal_rfc_SSl2, file=model_file)

RF 1vRest: 297.92 seconds


Use models to predict the test data

In [71]:
# Predict the simplified section labels of the held-out cells and report the time taken.
start_time = time.time()
pred_rfc_SSl2 = cal_rfc_SSl2.predict(XSSl2_test_sc)
elapsed = round(time.time() - start_time, 2)
print("RF 1vRest: %s seconds" % elapsed)

RF 1vRest: 18.22 seconds


Get F1 score for predictions

In [72]:
# Macro-averaged F1 over the simplified section labels.
# NOTE(review): this reuses the name `f1_rfc`, replacing the score computed earlier.
f1_rfc = f1_score(
    y_true=ySSl2_test,
    y_pred=pred_rfc_SSl2,
    average="macro",
)
f1_rfc

0.5739986800741198

In [73]:
# Per-class (precision, recall, F-score, support) tuple; undefined ratios are reported as 0.
all_rfc_SSl2 = precision_recall_fscore_support(
    y_true=ySSl2_test,
    y_pred=pred_rfc_SSl2,
    zero_division=0,
)

In [74]:
# element 2 of the (precision, recall, F-score, support) tuple: the per-class F-scores
all_rfc_SSl2[2]

array([0.56826057, 0.52329975, 0.55238095, 0.59970194, 0.48379761,
       0.71655126])

Get probabilities

In [75]:
# Calibrated class probabilities for the held-out cells, with timing.
start_time = time.time()
proba_rfc_SSl2 = cal_rfc_SSl2.predict_proba(XSSl2_test_sc)
elapsed = round(time.time() - start_time, 2)
print("RF 1vRest: %s seconds" % elapsed)

RF 1vRest: 16.75 seconds


## Try chained models

Setup base model

In [43]:
# Unfitted base model for the chains: the current `rfc` wrapped in 5-fold sigmoid calibration.
cal_rfc_chain = CalibratedClassifierCV(
    rfc,
    method="sigmoid",
    cv=5,
    n_jobs=ncores,
)

Try random chains

In [45]:
# Build and fit ten classifier chains on the human 10x training data, each with a
# random label order drawn from its own seed (0-9); the total time is printed.
start_time = time.time()
chains = []
for seed in range(10):
    chains.append(ClassifierChain(cal_rfc_chain, order='random', random_state=seed))
for chain in chains:
    chain.fit(X_train_sc, y_train)
elapsed = round(time.time() - start_time, 2)
print("10 random chains: %s seconds" % elapsed)

a
a
a
a
a




a




a
a
a




a
10 random chains: 13189.93 seconds


Try two more chains: L1-L6 and L6-L1

In [46]:
# Two chains with a fixed label order: columns 0..5 and the reverse, 5..0.
start_time = time.time()
forward_order = [0, 1, 2, 3, 4, 5]
reverse_order = [5, 4, 3, 2, 1, 0]
chains_ord = [
    ClassifierChain(cal_rfc_chain, order=label_order, random_state=1)
    for label_order in (forward_order, reverse_order)
]
for chain in chains_ord:
    chain.fit(X_train_sc, y_train)
elapsed = round(time.time() - start_time, 2)
print("2 ordered chains: %s seconds" % elapsed)

a




a




2 ordered chains: 2925.81 seconds


Join both lists

In [51]:
# Append the two ordered chains to the list of random chains, in place.
# Chains that are already present (compared by identity) are skipped, so re-running
# this cell does not add the ordered chains a second time and over-weight them
# in the ensemble built from `chains`.
missing_chains = [
    candidate for candidate in chains_ord
    if not any(candidate is known for known in chains)
]
chains.extend(missing_chains)

Test models

In [53]:
# Evaluate every fitted chain on the held-out cells.
# preds_l collects the hard label predictions and proba_l the predicted
# probabilities. The original appended `predict` output to both lists, so
# proba_l never held probabilities; it now uses `predict_proba`.
preds_l = []
proba_l = []
for chain in chains:
    preds_l.append(chain.predict(X_test_sc))
    proba_l.append(chain.predict_proba(X_test_sc))

In [54]:
# Macro-averaged F1 for each chain, in the same order as `preds_l`.
f1_list = [
    f1_score(y_test, chain_preds, average="macro")
    for chain_preds in preds_l
]
print(f1_list)

[0.9380191579836086, 0.9382388595291444, 0.9386338428719577, 0.939394041261254, 0.9386636148561704, 0.9381346289955567, 0.9387063271736391, 0.93889462868579, 0.9378003619419877, 0.9387150989096473, 0.9388738623455158, 0.9386688545346414]


In [62]:
# Ensemble of the chains: stack the hard predictions, average them across chains
# (axis 0) and call a label positive when the mean is at least 0.5.
y_pred_chains = np.array(preds_l)
mean_over_chains = y_pred_chains.mean(axis=0)
y_pred_ensemble = mean_over_chains >= .5
f1_score(y_test, y_pred_ensemble, average="macro")

0.9387704469691833

In [67]:
# Print the per-label F-scores (element 2 of the result tuple) for every chain,
# followed by the same scores for the ensemble prediction.
for label_preds in preds_l + [y_pred_ensemble]:
    per_label_scores = precision_recall_fscore_support(y_test, label_preds, zero_division=0)
    print(per_label_scores[2])

[0.93457509 0.94713807 0.94131171 0.90797141 0.95870881 0.93840985]
[0.93553642 0.94777854 0.94245912 0.90670171 0.95686064 0.94009674]
[0.93566434 0.94752685 0.94432639 0.90584737 0.95629958 0.94213852]
[0.93813953 0.94856753 0.94470046 0.90848659 0.957517   0.93895313]
[0.93553642 0.94853242 0.94432665 0.90713265 0.95629958 0.94015396]
[0.93594223 0.94800847 0.94131171 0.90807617 0.95705934 0.93840985]
[0.93671475 0.94877566 0.94486185 0.90543215 0.95629958 0.94015396]
[0.9360911  0.94817344 0.943936   0.90848659 0.95772752 0.93895313]
[0.93507098 0.94831123 0.94178165 0.90614745 0.95614212 0.93934874]
[0.93619003 0.94753403 0.94202372 0.90981082 0.95832213 0.93840985]
[0.93507098 0.94771421 0.9427238  0.90861044 0.95828287 0.94084088]
[0.93777674 0.94905326 0.94377802 0.90633911 0.95665614 0.93840985]
[0.93633829 0.94867595 0.94371142 0.90733633 0.95706511 0.93949559]
