# Axolotl pallium region prediction

Training models to predict pallium regions. This is based on microdissected pallial regions, and predicting on whole pallium. A caveat is that the striatum was not microdissected.

## Load necessary packages

In [2]:
# data management
import pandas as pd
from scipy.io import mmread
import scanpy as sc
import pickle
import numpy as np

# scaler
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# addition to models
from sklearn.calibration import CalibratedClassifierCV

# data splitting
from sklearn.model_selection import train_test_split

# scoring
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

# time execution
import time

ncores = 32

## Load and prepare the data

Read in data (doublets already filtered out)

In [3]:
exp_ax = mmread("../data/processed/axolotl_parts/ax_regions_data.mtx").tocsr().transpose()
meta_ax = pd.read_csv("../data/processed/axolotl_parts/ax_regions_meta.csv", index_col = 0)

Split whole pallium vs regionalised data

In [4]:
meta_reg = meta_ax.loc[meta_ax.regions!="whole pallium",:]
exp_reg = exp_ax[:,meta_ax.regions!="whole pallium"]
# make genes uniform
g_use = np.asarray(exp_reg.sum(axis = 1)>0)[:,0]
exp_reg = exp_reg[g_use,:].transpose()
exp_wp = exp_ax[:,meta_ax.regions=="whole pallium"]
exp_wp = exp_wp[g_use,:].transpose()
# variable for startification
meta_reg["cc_reg"] = meta_reg.cellclusters+".."+meta_reg.regions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_reg["cc_reg"] = meta_reg.cellclusters+".."+meta_reg.regions


For cc_reg with fewer than 5 occurences, will default to the region (regardless of cell type)

In [5]:
ct_regs = [x.split("..")[0] for x in meta_reg["cc_reg"].value_counts().index.values[meta_reg["cc_reg"].value_counts()<5]]
meta_reg.loc[[x in ct_regs for x in meta_reg.cellclusters],"cc_reg"] = meta_reg.loc[[x in ct_regs for x in meta_reg.cellclusters],"regions"].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


## Train model for all cells

Split data into train and test fractions

In [6]:
X_train, X_test, y_train, y_test = train_test_split(exp_reg, meta_reg["regions"], test_size=0.2, 
                                                    random_state=42, stratify = meta_reg["cc_reg"].values)

Scale data (based on training data)

In [7]:
scaler = StandardScaler(with_mean = False)
scaler.fit(X_train)

# scale
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# save scaler
with open("../results/multiome/scaler_axolotlRegions_all.pkl", "wb") as f:
    pickle.dump(scaler, file=f)

Create classifiers

In [8]:
rfc = RandomForestClassifier(random_state = 1, n_estimators = 1000, n_jobs = ncores)
lr = LogisticRegression(random_state = 1, max_iter = 250, n_jobs = ncores)

Train classifiers

In [9]:
start_time = time.time()
cal_rfc = CalibratedClassifierCV(rfc, method="sigmoid", cv=5, n_jobs = ncores)
cal_rfc.fit(X_train_sc, y_train)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
with open("../results/multiome/cal_rfc_axolotlRegions_all_model.pkl", "wb") as f:
    pickle.dump(cal_rfc, file=f)
    
start_time = time.time()
cal_lr = CalibratedClassifierCV(lr, method="isotonic", cv=5, n_jobs = ncores)
cal_lr.fit(X_train_sc, y_train)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
with open("../results/multiome/cal_lr_axolotlRegions_all_model.pkl", "wb") as f:
    pickle.dump(cal_lr, file=f)

RF 1vRest: 168.16 seconds
LR 1vRest: 164.3 seconds


Use models to predict the test data

In [10]:
start_time = time.time()
pred_rfc = cal_rfc.predict(X_test_sc)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

start_time = time.time()
pred_lr = cal_lr.predict(X_test_sc)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

RF 1vRest: 8.74 seconds
LR 1vRest: 0.59 seconds


Get F1 score for predictions

In [11]:
f1_rfc = f1_score(y_test, pred_rfc, average = "macro")
print(f1_rfc)

f1_lr = f1_score(y_test, pred_lr, average = "macro")
print(f1_lr)

0.9710627005065581
0.9973711481654144


In [12]:
all_rfc = precision_recall_fscore_support(y_test, pred_rfc, zero_division = 0)
with open("../results/multiome/PRFSup_rfc_allax.pkl", "wb") as f:
    pickle.dump(all_rfc, file=f)

all_lr = precision_recall_fscore_support(y_test, pred_lr, zero_division = 0)
with open("../results/multiome/PRFSup_lr_allax.pkl", "wb") as f:
    pickle.dump(all_lr, file=f)

Get probabilities

In [13]:
start_time = time.time()
proba_rfc = cal_rfc.predict_proba(X_test_sc)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_rfc, index=y_test.index.values, columns=cal_rfc.classes_)
pd.concat([pd.DataFrame({"y_test": y_test, "pred_rfc": pred_rfc}), probs_df], axis = 1).to_csv("../results/multiome/preds_rfc_allax.csv")

start_time = time.time()
proba_lr = cal_lr.predict_proba(X_test_sc)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_lr, index=y_test.index.values, columns=cal_lr.classes_)
pd.concat([pd.DataFrame({"y_test": y_test, "pred_lr": pred_lr}), probs_df], axis = 1).to_csv("../results/multiome/preds_lr_allax.csv")


RF 1vRest: 8.28 seconds
LR 1vRest: 0.58 seconds


## Train model for neurons

Subset the data

In [14]:
cond_neu = np.logical_or(meta_reg["subclasses"]=="Glutamatergic", meta_reg["subclasses"]=="GABAergic")
meta_neu = meta_reg.loc[cond_neu,:]
exp_neu = exp_reg[cond_neu,:]
# make genes uniform
g_use = np.asarray(exp_neu.transpose().sum(axis = 1)>0)[:,0]
exp_neu = exp_neu[:,g_use]
cond_wp_neu = np.logical_or(meta_ax["subclasses"]=="Glutamatergic", meta_ax["subclasses"]=="GABAergic")
exp_wp_neu = exp_ax[:,np.logical_and(meta_ax.regions=="whole pallium",
                                     cond_wp_neu.ravel())]
exp_wp_neu = exp_wp_neu[g_use,:].transpose()

Split data into train and test fractions

In [15]:
X_train, X_test, y_train, y_test = train_test_split(exp_neu, meta_neu["regions"], test_size=0.2, 
                                                    random_state=42, stratify = meta_neu["cc_reg"].values)

Scale data (based on training data)

In [16]:
scaler = StandardScaler(with_mean = False)
scaler.fit(X_train)

# scale
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# save scaler
with open("../results/multiome/scaler_axolotlRegions_neu.pkl", "wb") as f:
    pickle.dump(scaler, file=f)

Create classifiers

In [17]:
rfc = RandomForestClassifier(random_state = 1, n_estimators = 1000, n_jobs = ncores)
lr = LogisticRegression(random_state = 1, max_iter = 250, n_jobs = ncores)

Train classifiers

In [18]:
start_time = time.time()
cal_rfc = CalibratedClassifierCV(rfc, method="sigmoid", cv=5, n_jobs = ncores)
cal_rfc.fit(X_train_sc, y_train)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
with open("../results/multiome/cal_rfc_axolotlRegions_neu_model.pkl", "wb") as f:
    pickle.dump(cal_rfc, file=f)
    
start_time = time.time()
cal_lr = CalibratedClassifierCV(lr, method="isotonic", cv=5, n_jobs = ncores)
cal_lr.fit(X_train_sc, y_train)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
with open("../results/multiome/cal_lr_axolotlRegions_neu_model.pkl", "wb") as f:
    pickle.dump(cal_lr, file=f)

RF 1vRest: 114.38 seconds
LR 1vRest: 142.92 seconds


Use models to predict the test data

In [19]:
start_time = time.time()
pred_rfc = cal_rfc.predict(X_test_sc)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

start_time = time.time()
pred_lr = cal_lr.predict(X_test_sc)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

RF 1vRest: 7.38 seconds
LR 1vRest: 0.49 seconds


Get F1 score for predictions

In [20]:
f1_rfc = f1_score(y_test, pred_rfc, average = "macro")
print(f1_rfc)

f1_lr = f1_score(y_test, pred_lr, average = "macro")
print(f1_lr)

0.9690835341075289
0.9931599702272251


In [21]:
all_rfc = precision_recall_fscore_support(y_test, pred_rfc, zero_division = 0)
with open("../results/multiome/PRFSup_rfc_neuax.pkl", "wb") as f:
    pickle.dump(all_rfc, file=f)

all_lr = precision_recall_fscore_support(y_test, pred_lr, zero_division = 0)
with open("../results/multiome/PRFSup_lr_neuax.pkl", "wb") as f:
    pickle.dump(all_lr, file=f)

Get probabilities

In [22]:
start_time = time.time()
proba_rfc = cal_rfc.predict_proba(X_test_sc)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_rfc, index=y_test.index.values, columns=cal_rfc.classes_)
pd.concat([pd.DataFrame({"y_test": y_test, "pred_rfc": pred_rfc}), probs_df], axis = 1).to_csv("../results/multiome/preds_rfc_neuax.csv")

start_time = time.time()
proba_lr = cal_lr.predict_proba(X_test_sc)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_lr, index=y_test.index.values, columns=cal_lr.classes_)
pd.concat([pd.DataFrame({"y_test": y_test, "pred_lr": pred_lr}), probs_df], axis = 1).to_csv("../results/multiome/preds_lr_neuax.csv")


RF 1vRest: 7.44 seconds
LR 1vRest: 0.5 seconds


## Train model for ependymal cells

Subset the data

In [23]:
meta_ep = meta_reg.loc[meta_reg["subclasses"]=="Ependymal",:]
exp_ep = exp_reg[meta_reg["subclasses"]=="Ependymal",:]
# make genes uniform
g_use = np.asarray(exp_neu.transpose().sum(axis = 1)>0)[:,0]
exp_ep = exp_ep[:,g_use]
exp_wp_ep = exp_ax[:,np.logical_and(meta_ax.regions=="whole pallium",
                                    meta_ax.subclasses=="Ependymal")]
exp_wp_ep = exp_wp_ep[g_use,:].transpose()

Split data into train and test fractions

In [24]:
X_train, X_test, y_train, y_test = train_test_split(exp_ep, meta_ep["regions"], test_size=0.2, 
                                                    random_state=42, stratify = meta_ep["cc_reg"].values)

Scale data (based on training data)

In [25]:
scaler = StandardScaler(with_mean = False)
scaler.fit(X_train)

# scale
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# save scaler
with open("../results/multiome/scaler_axolotlRegions_ep.pkl", "wb") as f:
    pickle.dump(scaler, file=f)

Create classifiers

In [26]:
rfc = RandomForestClassifier(random_state = 1, n_estimators = 1000, n_jobs = ncores)
lr = LogisticRegression(random_state = 1, max_iter = 250, n_jobs = ncores)

Train classifiers

In [27]:
start_time = time.time()
cal_rfc = CalibratedClassifierCV(rfc, method="sigmoid", cv=5, n_jobs = ncores)
cal_rfc.fit(X_train_sc, y_train)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
with open("../results/multiome/cal_rfc_axolotlRegions_ep_model.pkl", "wb") as f:
    pickle.dump(cal_rfc, file=f)
    
start_time = time.time()
cal_lr = CalibratedClassifierCV(lr, method="isotonic", cv=5, n_jobs = ncores)
cal_lr.fit(X_train_sc, y_train)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
with open("../results/multiome/cal_lr_axolotlRegions_ep_model.pkl", "wb") as f:
    pickle.dump(cal_lr, file=f)

RF 1vRest: 9.89 seconds
LR 1vRest: 20.88 seconds


Use models to predict the test data

In [28]:
start_time = time.time()
pred_rfc = cal_rfc.predict(X_test_sc)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

start_time = time.time()
pred_lr = cal_lr.predict(X_test_sc)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

RF 1vRest: 2.9 seconds
LR 1vRest: 0.06 seconds


Get F1 score for predictions

In [29]:
f1_rfc = f1_score(y_test, pred_rfc, average = "macro")
print(f1_rfc)

f1_lr = f1_score(y_test, pred_lr, average = "macro")
print(f1_lr)

0.9788329362797447
0.9893180331669192


In [30]:
all_rfc = precision_recall_fscore_support(y_test, pred_rfc, zero_division = 0)
with open("../results/multiome/PRFSup_rfc_epax.pkl", "wb") as f:
    pickle.dump(all_rfc, file=f)

all_lr = precision_recall_fscore_support(y_test, pred_lr, zero_division = 0)
with open("../results/multiome/PRFSup_lr_epax.pkl", "wb") as f:
    pickle.dump(all_lr, file=f)

Get probabilities

In [31]:
start_time = time.time()
proba_rfc = cal_rfc.predict_proba(X_test_sc)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_rfc, index=y_test.index.values, columns=cal_rfc.classes_)
pd.concat([pd.DataFrame({"y_test": y_test, "pred_rfc": pred_rfc}), probs_df], axis = 1).to_csv("../results/multiome/preds_rfc_epax.csv")

start_time = time.time()
proba_lr = cal_lr.predict_proba(X_test_sc)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_lr, index=y_test.index.values, columns=cal_lr.classes_)
pd.concat([pd.DataFrame({"y_test": y_test, "pred_lr": pred_lr}), probs_df], axis = 1).to_csv("../results/multiome/preds_lr_epax.csv")


RF 1vRest: 2.53 seconds
LR 1vRest: 0.06 seconds


## Predictions for the whole pallium segments

WP metadata

In [32]:
meta_wp = meta_ax.loc[meta_ax.regions=="whole pallium",:]
cond_wp_neu = np.logical_or(meta_ax["subclasses"]=="Glutamatergic", meta_ax["subclasses"]=="GABAergic")
meta_wp_neu = meta_ax.loc[np.logical_and(meta_ax.regions=="whole pallium",
                                         cond_wp_neu),:]
meta_wp_ep = meta_ax.loc[np.logical_and(meta_ax.regions=="whole pallium",
                                        meta_ax.subclasses=="Ependymal"),:]

Load trained models

In [33]:
with open("../results/multiome/cal_rfc_axolotlRegions_all_model.pkl", "rb") as f:
    rfc_all = pickle.load(f)
with open("../results/multiome/cal_rfc_axolotlRegions_neu_model.pkl", "rb") as f:
    rfc_neu = pickle.load(f)
with open("../results/multiome/cal_rfc_axolotlRegions_ep_model.pkl", "rb") as f:
    rfc_ep = pickle.load(f)

with open("../results/multiome/cal_lr_axolotlRegions_all_model.pkl", "rb") as f:
    lr_all = pickle.load(f)
with open("../results/multiome/cal_lr_axolotlRegions_neu_model.pkl", "rb") as f:
    lr_neu = pickle.load(f)
with open("../results/multiome/cal_lr_axolotlRegions_ep_model.pkl", "rb") as f:
    lr_ep = pickle.load(f)

Load scalers

In [34]:
with open("../results/multiome/scaler_axolotlRegions_all.pkl", "rb") as f:
    scaler_all = pickle.load(f)
with open("../results/multiome/scaler_axolotlRegions_neu.pkl", "rb") as f:
    scaler_neu = pickle.load(f)
with open("../results/multiome/scaler_axolotlRegions_ep.pkl", "rb") as f:
    scaler_ep = pickle.load(f)

Get WP predictions

In [35]:
# RFC
start_time = time.time()
pred_rfc_all = rfc_all.predict(scaler_all.transform(exp_wp))
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
start_time = time.time()
pred_rfc_neu = rfc_neu.predict(scaler_neu.transform(exp_wp_neu))
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
start_time = time.time()
pred_rfc_ep = rfc_ep.predict(scaler_ep.transform(exp_wp_ep))
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

# LR
start_time = time.time()
pred_lr_all = lr_all.predict(scaler_all.transform(exp_wp))
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
start_time = time.time()
pred_lr_neu = lr_neu.predict(scaler_neu.transform(exp_wp_neu))
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
start_time = time.time()
pred_lr_ep = lr_ep.predict(scaler_ep.transform(exp_wp_ep))
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

RF 1vRest: 35.09 seconds
RF 1vRest: 25.59 seconds
RF 1vRest: 3.75 seconds
LR 1vRest: 3.58 seconds
LR 1vRest: 2.94 seconds
LR 1vRest: 0.28 seconds


Get WP probabilities

In [36]:
# RFC
start_time = time.time()
proba_rfc_all = rfc_all.predict_proba(scaler_all.transform(exp_wp))
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_rfc_all, columns=rfc_all.classes_, index=meta_wp.index.values)
pd.concat([pd.DataFrame({"pred_rfc": pred_rfc_all}, index=meta_wp.index.values), probs_df], axis = 1).to_csv("../results/multiome/preds_rfc_wp_all.csv")
start_time = time.time()
proba_rfc_neu = rfc_neu.predict_proba(scaler_neu.transform(exp_wp_neu))
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_rfc_neu, columns=rfc_neu.classes_, index=meta_wp_neu.index.values)
pd.concat([pd.DataFrame({"pred_rfc": pred_rfc_neu}, index=meta_wp_neu.index.values), probs_df], axis = 1).to_csv("../results/multiome/preds_rfc_wp_neu.csv")
start_time = time.time()
proba_rfc_ep = rfc_ep.predict_proba(scaler_ep.transform(exp_wp_ep))
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_rfc_ep, columns=rfc_ep.classes_, index=meta_wp_ep.index.values)
pd.concat([pd.DataFrame({"pred_rfc": pred_rfc_ep}, index=meta_wp_ep.index.values), probs_df], axis = 1).to_csv("../results/multiome/preds_rfc_wp_ep.csv")

# LR
start_time = time.time()
proba_lr_all = lr_all.predict_proba(scaler_all.transform(exp_wp))
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_lr_all, columns=lr_all.classes_, index=meta_wp.index.values)
pd.concat([pd.DataFrame({"pred_lr": pred_lr_all}, index=meta_wp.index.values), probs_df], axis = 1).to_csv("../results/multiome/preds_lr_wp_all.csv")
start_time = time.time()
proba_lr_neu = lr_neu.predict_proba(scaler_neu.transform(exp_wp_neu))
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_lr_neu, columns=lr_neu.classes_, index=meta_wp_neu.index.values)
pd.concat([pd.DataFrame({"pred_lr": pred_lr_neu}, index=meta_wp_neu.index.values), probs_df], axis = 1).to_csv("../results/multiome/preds_lr_wp_neu.csv")
start_time = time.time()
proba_lr_ep = lr_ep.predict_proba(scaler_ep.transform(exp_wp_ep))
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_lr_ep, columns=lr_ep.classes_, index=meta_wp_ep.index.values)
pd.concat([pd.DataFrame({"pred_lr": pred_lr_ep}, index=meta_wp_ep.index.values), probs_df], axis = 1).to_csv("../results/multiome/preds_lr_wp_ep.csv")

RF 1vRest: 35.2 seconds
RF 1vRest: 25.94 seconds
RF 1vRest: 3.87 seconds
LR 1vRest: 3.57 seconds
LR 1vRest: 2.85 seconds
LR 1vRest: 0.24 seconds


In [59]:
print(pd.value_counts(pred_rfc_all))
print(pd.value_counts(pred_rfc_neu))
print(pd.value_counts(pred_rfc_ep))

print(pd.value_counts(pred_lr_all))
print(pd.value_counts(pred_lr_neu))
print(pd.value_counts(pred_lr_ep))

lateral    9869
dorsal     8606
medial     7816
dtype: int64
dorsal     23343
lateral        3
dtype: int64
dorsal     1372
lateral     103
dtype: int64
lateral    9915
medial     9637
dorsal     6739
dtype: int64
lateral    11685
dorsal     11494
medial       167
dtype: int64
lateral    1384
dorsal       54
medial       37
dtype: int64
