# Cross-species predictions

Training models for predictions across species, based on available annotations. Data comes already with matched orthologs, obtained before saving it in R.

## Load necessary packages

In [1]:
# data management
import pandas as pd
from scipy.io import mmread
import scanpy as sc
import pickle
import numpy as np

# scaler
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression

# addition to models
from sklearn.calibration import CalibratedClassifierCV

# data splitting
from sklearn.model_selection import train_test_split

# scoring
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

# time execution
import time

ncores = 50

## Load and prepare the data

Read in data

In [None]:
species = ["zebrafish", "axolotl", "paintedturtle", "lizard", "bengalesefinch", "zebrafinch", "mouse", "human"]
sourcedir = "/local1/USERS/tomasgomes/tmp/tmptables/"
speciesexpr = {sp: mmread(sourcedir + sp + ".mtx").tocsr().transpose() for sp in species}
speciesmeta = {sp: pd.read_csv(sourcedir + sp + "_meta.csv", index_col = 0) for sp in species}

In [None]:
ccl_train_dic = {}
ccl_test_dic = {}
scl_train_dic = {}
scl_test_dic = {}
for sp in species:
    print(sp)
    X_train, X_test, y_train, y_test = train_test_split(speciesexpr[sp], speciesmeta[sp]["cellclusters"], 
                                                        test_size=0.2, random_state=42, 
                                                        stratify = speciesmeta[sp]["cellclusters"].values)
    ccl_train_dic[sp] = {"X": X_train, "y" = y_train}
    ccl_test_dic[sp] = {"X": X_test, "y" = y_test}
    
    X_train, X_test, y_train, y_test = train_test_split(speciesexpr[sp], speciesmeta[sp]["subclasses"], 
                                                        test_size=0.2, random_state=42, 
                                                        stratify = speciesmeta[sp]["subclasses"].values)
    scl_train_dic[sp] = {"X": X_train, "y" = y_train}
    scl_test_dic[sp] = {"X": X_test, "y" = y_test}

Make scalers for data

In [None]:
ccl_scaler_dic = {}
scl_scaler_dic = {}
for sp in species:
    ccl_scaler_dic[sp] = StandardScaler(with_mean = False)
    ccl_scaler_dic[sp].fit(ccl_train_dic[sp]["X"])
    
    scl_scaler_dic[sp] = StandardScaler(with_mean = False)
    scl_scaler_dic[sp].fit(scl_train_dic[sp]["X"])

In [None]:
lr = LogisticRegression(random_state = 1, max_iter = 1000, n_jobs = ncores)
ccl_model_dic = {}
scl_model_dic = {}
for sp in species:
    ccl_model_dic[sp] = CalibratedClassifierCV(lr, method="isotonic", cv=5, n_jobs = ncores)
    ccl_model_dic[sp].fit(ccl_scaler_dic[sp].transform(ccl_train_dic[sp]["X"]), ccl_train_dic[sp]["y"])
    
    scl_model_dic[sp] = CalibratedClassifierCV(lr, method="isotonic", cv=5, n_jobs = ncores)
    scl_model_dic[sp].fit(scl_scaler_dic[sp].transform(scl_train_dic[sp]["X"]), scl_train_dic[sp]["y"])

In [None]:
pred_lr = cal_lr.predict(X_test_sc)
f1_lr = f1_score(y_test, pred_lr, average = "macro")
all_lr = precision_recall_fscore_support(y_test, pred_lr, zero_division = 0)
proba_lr = cal_lr.predict_proba(X_test_sc)
probs_df = pd.DataFrame(proba_lr, index=y_test.index.values, columns=cal_lr.classes_)
pd.concat([pd.DataFrame({"y_test": y_test, "pred_lr": pred_lr}), probs_df], axis = 1).to_csv("../results/multiome/preds_lr_allax.csv")

In [2]:
exp_ax = mmread("../data/processed/ax_regions_data.mtx").tocsr().transpose()
meta_ax = pd.read_csv("../data/processed/ax_regions_meta.csv", index_col = 0)

Split whole pallium vs regionalised data

In [3]:
meta_reg = meta_ax.loc[meta_ax.regions!="whole pallium",:]
exp_reg = exp_ax[:,meta_ax.regions!="whole pallium"]
# make genes uniform
g_use = np.asarray(exp_reg.sum(axis = 1)>0)[:,0]
exp_reg = exp_reg[g_use,:].transpose()
exp_wp = exp_ax[:,meta_ax.regions=="whole pallium"]
exp_wp = exp_wp[g_use,:].transpose()
# variable for startification
meta_reg["cc_reg"] = meta_reg.cellclusters+".."+meta_reg.regions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_reg["cc_reg"] = meta_reg.cellclusters+".."+meta_reg.regions


For cc_reg with fewer than 5 occurences, will default to the region (regardless of cell type)

In [4]:
ct_regs = [x.split("..")[0] for x in meta_reg["cc_reg"].value_counts().index.values[meta_reg["cc_reg"].value_counts()<5]]
meta_reg.loc[[x in ct_regs for x in meta_reg.cellclusters],"cc_reg"] = meta_reg.loc[[x in ct_regs for x in meta_reg.cellclusters],"regions"].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


## Train model for all cells

Split data into train and test fractions

In [5]:
X_train, X_test, y_train, y_test = train_test_split(exp_reg, meta_reg["regions"], test_size=0.2, 
                                                    random_state=42, stratify = meta_reg["cc_reg"].values)

Scale data (based on training data)

In [6]:
scaler = StandardScaler(with_mean = False)
scaler.fit(X_train)

# scale
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

# save scaler
with open("../results/multiome/scaler_axolotlRegions_all.pkl", "wb") as f:
    pickle.dump(scaler, file=f)

Create classifiers

In [7]:
lr = LogisticRegression(random_state = 1, max_iter = 1000, n_jobs = ncores)

Train classifiers

In [8]:
start_time = time.time()
cal_rfc = CalibratedClassifierCV(rfc, method="sigmoid", cv=5, n_jobs = ncores)
cal_rfc.fit(X_train_sc, y_train)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
with open("../results/multiome/cal_rfc_axolotlRegions_all_model.pkl", "wb") as f:
    pickle.dump(cal_rfc, file=f)
    
start_time = time.time()
cal_lr = CalibratedClassifierCV(lr, method="isotonic", cv=5, n_jobs = ncores)
cal_lr.fit(X_train_sc, y_train)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
with open("../results/multiome/cal_lr_axolotlRegions_all_model.pkl", "wb") as f:
    pickle.dump(cal_lr, file=f)

RF 1vRest: 108.86 seconds
LR 1vRest: 263.83 seconds


Use models to predict the test data

In [9]:
start_time = time.time()
pred_rfc = cal_rfc.predict(X_test_sc)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

start_time = time.time()
pred_lr = cal_lr.predict(X_test_sc)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

RF 1vRest: 9.7 seconds
LR 1vRest: 0.64 seconds


Get F1 score for predictions

In [10]:
f1_rfc = f1_score(y_test, pred_rfc, average = "macro")
print(f1_rfc)

f1_lr = f1_score(y_test, pred_lr, average = "macro")
print(f1_lr)

0.9692502376813734
0.99564678359155


In [11]:
all_rfc = precision_recall_fscore_support(y_test, pred_rfc, zero_division = 0)
with open("../results/multiome/PRFSup_rfc_allax.pkl", "wb") as f:
    pickle.dump(all_rfc, file=f)

all_lr = precision_recall_fscore_support(y_test, pred_lr, zero_division = 0)
with open("../results/multiome/PRFSup_lr_allax.pkl", "wb") as f:
    pickle.dump(all_lr, file=f)

Get probabilities

In [12]:
start_time = time.time()
proba_rfc = cal_rfc.predict_proba(X_test_sc)
print("RF 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_rfc, index=y_test.index.values, columns=cal_rfc.classes_)
pd.concat([pd.DataFrame({"y_test": y_test, "pred_rfc": pred_rfc}), probs_df], axis = 1).to_csv("../results/multiome/preds_rfc_allax.csv")

start_time = time.time()
proba_lr = cal_lr.predict_proba(X_test_sc)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))
probs_df = pd.DataFrame(proba_lr, index=y_test.index.values, columns=cal_lr.classes_)
pd.concat([pd.DataFrame({"y_test": y_test, "pred_lr": pred_lr}), probs_df], axis = 1).to_csv("../results/multiome/preds_lr_allax.csv")


RF 1vRest: 9.81 seconds
LR 1vRest: 0.67 seconds
