In [1]:
# data management
import pandas as pd
from scipy.io import mmread
import scanpy as sc
import pickle
import numpy as np

# scaler
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# addition to models
from sklearn.calibration import CalibratedClassifierCV

# data splitting
from sklearn.model_selection import train_test_split

# scoring
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

# time execution
import time

ncores = 32

In [2]:
exp_ax = mmread("../data/processed/ax_regions_data.mtx").tocsr().transpose()
meta_ax = pd.read_csv("../data/processed/ax_regions_meta.csv", index_col = 0)

In [3]:
meta_reg = meta_ax.loc[meta_ax.regions!="whole pallium",:]
exp_reg = exp_ax[:,meta_ax.regions!="whole pallium"]
# make genes uniform
g_use = np.asarray(exp_reg.sum(axis = 1)>0)[:,0]
exp_reg = exp_reg[g_use,:].transpose()
exp_wp = exp_ax[:,meta_ax.regions=="whole pallium"]
exp_wp = exp_wp[g_use,:].transpose()
# variable for startification
meta_reg["cc_reg"] = meta_reg.cellclusters+".."+meta_reg.regions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_reg["cc_reg"] = meta_reg.cellclusters+".."+meta_reg.regions


In [4]:
ct_regs = [x.split("..")[0] for x in meta_reg["cc_reg"].value_counts().index.values[meta_reg["cc_reg"].value_counts()<5]]
meta_reg.loc[[x in ct_regs for x in meta_reg.cellclusters],"cc_reg"] = meta_reg.loc[[x in ct_regs for x in meta_reg.cellclusters],"regions"].values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [24]:
meta_reg["regions_ord"] = pd.Categorical(meta_reg.regions, categories=["medial", "dorsal", "lateral"]).codes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_reg["regions_ord"] = pd.Categorical(meta_reg.regions, categories=["medial", "dorsal", "lateral"]).codes


In [26]:
X_train, X_test, y_train, y_test = train_test_split(exp_reg, meta_reg["regions_ord"], test_size=0.2, 
                                                    random_state=42, stratify = meta_reg["cc_reg"].values)

In [27]:
scaler = StandardScaler(with_mean = False)
scaler.fit(X_train)

# scale
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

In [28]:
from sklearn.base import clone


class OrdinalClassifier():
    
    def __init__(self, clf):
        self.clf = clf
        self.clfs = {}
    
    def fit(self, X, y):
        self.unique_class = np.sort(np.unique(y))
        if self.unique_class.shape[0] > 2:
            for i in range(self.unique_class.shape[0]-1):
                # for each k - 1 ordinal value we fit a binary classification problem
                binary_y = (y > self.unique_class[i]).astype(np.uint8)
                clf = clone(self.clf)
                clf.fit(X, binary_y)
                self.clfs[i] = clf
    
    def predict_proba(self, X):
        clfs_predict = {k:self.clfs[k].predict_proba(X) for k in self.clfs}
        predicted = []
        for i,y in enumerate(self.unique_class):
            if i == 0:
                # V1 = 1 - Pr(y > V1)
                predicted.append(1 - clfs_predict[y][:,1])
            elif y in clfs_predict:
                # Vi = Pr(y > Vi-1) - Pr(y > Vi)
                 predicted.append(clfs_predict[y-1][:,1] - clfs_predict[y][:,1])
            else:
                # Vk = Pr(y > Vk-1)
                predicted.append(clfs_predict[y-1][:,1])
        return np.vstack(predicted).T
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

In [29]:
lr = LogisticRegression(random_state = 1, max_iter = 1000, n_jobs = ncores)

In [30]:
start_time = time.time()
cal_lr = CalibratedClassifierCV(lr, method="isotonic", cv=5, n_jobs = ncores)
cal_lr.fit(X_train_sc, y_train)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

start_time = time.time()
ord_lr = OrdinalClassifier(CalibratedClassifierCV(lr, method="isotonic", cv=5, n_jobs = ncores))
ord_lr.fit(X_train_sc, y_train)
print("LR Ord 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

LR 1vRest: 510.82 seconds
LR Ord 1vRest: 256.39 seconds


In [31]:
start_time = time.time()
pred_lr = cal_lr.predict(X_test_sc)
print("LR 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

start_time = time.time()
pred_ord = ord_lr.predict(X_test_sc)
print("LR Ord 1vRest: %s seconds" % (round(time.time() - start_time, 2)))

LR 1vRest: 0.78 seconds
LR Ord 1vRest: 0.99 seconds


In [32]:
f1_lr = f1_score(y_test, pred_lr, average = "macro")
print(f1_lr)

f1_ord = f1_score(y_test, pred_ord, average = "macro")
print(f1_ord)

0.9959553029439522
0.9951486281675854


In [45]:
all_lr = precision_recall_fscore_support(y_test, pred_lr, zero_division = 0)
print(all_lr)
all_ord = precision_recall_fscore_support(y_test, pred_ord, zero_division = 0)
print(all_ord)

(array([0.99863014, 0.99284985, 0.99666667]), array([0.99954296, 0.99386503, 0.99418121]), array([0.99908634, 0.99335718, 0.99542239]), array([2188,  978, 1203]))
(array([0.99771898, 0.99383984, 0.99501247]), array([0.99954296, 0.98977505, 0.99501247]), array([0.99863014, 0.99180328, 0.99501247]), array([2188,  978, 1203]))


In [48]:
proba_lr = cal_lr.predict_proba(X_test_sc)
probs_df = pd.DataFrame(proba_lr, index=y_test.index.values, columns=cal_lr.classes_)
print(pd.concat([pd.DataFrame({"y_test": y_test, "pred_lr": pred_lr}), probs_df], axis = 1))

proba_ord = ord_lr.predict_proba(X_test_sc)
probs_df = pd.DataFrame(proba_ord, index=y_test.index.values, columns=cal_lr.classes_)
print(pd.concat([pd.DataFrame({"y_test": y_test, "pred_lr": pred_lr}), probs_df], axis = 1))

                        y_test  pred_lr         0         1         2
CCGTGAGAGCCGATTT-1_2_5       1        1  0.000000  1.000000  0.000000
CAAGGGAAGCAACAAT-1_6_5       0        0  1.000000  0.000000  0.000000
CTCAAGACAACGGCTC-1_5_5       0        0  1.000000  0.000000  0.000000
ATCCTATGTGGTCCCA-1_2_5       1        1  0.000000  1.000000  0.000000
TACGGTAAGAGGCTGT-1_5_5       0        0  0.999734  0.000000  0.000266
...                        ...      ...       ...       ...       ...
ATGATCGTCTCTTCAA-1_4_5       2        2  0.000000  0.000000  1.000000
GCGAGAAAGCAAGTCG-1_6_5       0        0  1.000000  0.000000  0.000000
ATCTCTATCAGACTGT-1_1_5       1        1  0.000000  0.999734  0.000266
CGGAGAATCCCAACTC-1_5_5       0        0  1.000000  0.000000  0.000000
CTTCCGATCTGTGTGA-1_5_5       0        0  0.998786  0.000000  0.001214

[4369 rows x 5 columns]
                        y_test  pred_lr    0         1         2
CCGTGAGAGCCGATTT-1_2_5       1        1  0.0  1.000000  0.000000
CAAGG

In [49]:
pred_lr_all = cal_lr.predict(scaler.transform(exp_wp))
pred_lr_ord = ord_lr.predict(scaler.transform(exp_wp))

  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]
  proba /= np.sum(proba, axis=1)[:, np.newaxis]


In [50]:
pd.crosstab(pred_lr_all, pred_lr_ord)

col_0,0,1,2
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,7135,2083,24
1,33,7852,16
2,3,2145,7000


In [82]:
likelyreg = pd.read_csv("../singlereg.csv", index_col = 0, header=0)
likelyreg = likelyreg.loc[likelyreg.regions=="whole pallium",:]
likelyreg["likelyregion"] = pd.Categorical(likelyreg["likelyregion"], categories = ["medial", "dorsal", "lateral"])
meta_wp = meta_ax.loc[meta_ax.regions=="whole pallium",:]

In [83]:
pred_df = pd.DataFrame({"pred_all": pred_lr_all, "pred_ord": pred_lr_ord}, index = meta_wp.index.values)

In [84]:
pred_df = pred_df.loc[likelyreg.index.values,:]

In [85]:
pred_df.head()

Unnamed: 0,pred_all,pred_ord
AAACAGCCAACCTGGT-1_1,1,1
AAACAGCCAACTGGCT-1_1,1,1
AAACAGCCAAGGGTTG-1_1,1,1
AAACAGCCAAGGTCCT-1_1,0,0
AAACAGCCATGACTAT-1_1,2,2


In [86]:
likelyreg.head()

Unnamed: 0,cellclusters,regions,likelyregion
AAACAGCCAACCTGGT-1_1,glut_SUBSET_1,whole pallium,dorsal
AAACAGCCAACTGGCT-1_1,glut_SUBSET_1,whole pallium,dorsal
AAACAGCCAAGGGTTG-1_1,glut_SUBSET_1,whole pallium,dorsal
AAACAGCCAAGGTCCT-1_1,glut_SUBSET_0,whole pallium,medial
AAACAGCCATGACTAT-1_1,GABA_SUBSET_0,whole pallium,lateral


In [87]:
pd.crosstab(pred_df.pred_all, likelyreg.likelyregion)

likelyregion,medial,dorsal,lateral
pred_all,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3145,242,319
1,227,2152,397
2,73,24,6683


In [88]:
pd.crosstab(pred_df.pred_ord, likelyreg.likelyregion)

likelyregion,medial,dorsal,lateral
pred_ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2945,120,109
1,477,2295,1434
2,23,3,5856
