In [29]:
# data management
import pandas as pd
from scipy.io import mmread
import scanpy as sc
import pickle

# scaler
from sklearn.preprocessing import StandardScaler

# models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# addition to models
from sklearn.calibration import CalibratedClassifierCV
from sklearn.multiclass import OneVsOneClassifier

# data splitting
from sklearn.model_selection import train_test_split

# scoring
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support

# time execution
import time

import os

# Upper bound on the parallel workers handed to scikit-learn through n_jobs.
# The original value (30) is kept as the ceiling, but it is capped at the number
# of CPUs the interpreter reports so that smaller machines are not asked for
# more workers than they have. `or 1` covers os.cpu_count() returning None.
ncores = min(30, os.cpu_count() or 1)

Read in data

In [2]:
# Input locations for the expression matrix (Matrix Market format) and the
# metadata table.
expr_path = "../data/expression/tmp/human10x_SCTdata_hvg.mtx"
meta_path = "../data/expression/tmp/human10x_metadata.csv"

# The matrix is converted to CSR and then transposed.
# NOTE(review): presumably the file is stored genes x cells, so the transpose
# yields one row per cell matching the rows of the metadata — confirm.
exp_human = mmread(expr_path).tocsr().transpose()

# The first CSV column is used as the row index.
meta_human = pd.read_csv(meta_path, index_col=0)

Split data into train and test fractions

In [3]:
# Labels to predict: the "subclasses" column of the metadata.
subclass_labels = meta_human["subclasses"].values

# Hold out 20% of the rows for testing. The split is seeded for reproducibility
# and stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    exp_human,
    subclass_labels,
    test_size=0.2,
    random_state=42,
    stratify=subclass_labels,
)

Scale data (based on training data)

In [4]:
# Fit the scaler on the training fraction only, so that no information from the
# test fraction leaks into the scaling parameters. with_mean=False disables
# centering.
scaler = StandardScaler(with_mean=False).fit(X_train)

# Apply the same fitted scaling to both fractions.
X_train_sc, X_test_sc = scaler.transform(X_train), scaler.transform(X_test)

Create classifiers

In [5]:
# Shared hyper-parameters, so that the plain and the one-vs-one variant of each
# model family are configured identically.
lr_params = dict(random_state=1, max_iter=250, n_jobs=ncores)
svc_params = dict(random_state=1)
rfc_params = dict(random_state=1, n_estimators=250, n_jobs=ncores)

# Plain estimators, used directly on the multiclass problem.
lr = LogisticRegression(**lr_params)
svc = LinearSVC(**svc_params)
rfc = RandomForestClassifier(**rfc_params)

# Separate instances of the same estimators, wrapped in OneVsOneClassifier.
oo_lr = OneVsOneClassifier(LogisticRegression(**lr_params), n_jobs=ncores)
oo_svc = OneVsOneClassifier(LinearSVC(**svc_params), n_jobs=ncores)
oo_rfc = OneVsOneClassifier(RandomForestClassifier(**rfc_params), n_jobs=ncores)

Train classifiers with probability calibration (5-fold CV due to class size)

In [6]:
def fit_calibrated(estimator, method, label, X, y):
    """Fit a probability-calibrated wrapper around `estimator` and report the fit time.

    Parameters
    ----------
    estimator : unfitted classifier
        Passed as the first argument to CalibratedClassifierCV.
    method : str
        Calibration method forwarded to CalibratedClassifierCV
        ("isotonic" or "sigmoid" in this notebook).
    label : str
        Prefix of the timing line that is printed.
    X, y
        Training features and labels passed to `fit`.

    Returns
    -------
    CalibratedClassifierCV
        The fitted calibrated classifier (cv=5, n_jobs=ncores).
    """
    # perf_counter is monotonic, so the measured duration is not affected by
    # system clock adjustments during these long-running fits.
    started = time.perf_counter()
    calibrated = CalibratedClassifierCV(estimator, method=method, cv=5, n_jobs=ncores)
    calibrated.fit(X, y)
    print("%s: %s seconds" % (label, round(time.perf_counter() - started, 2)))
    return calibrated


# NOTE(review): "1vRest" labels the estimators that are NOT wrapped in
# OneVsOneClassifier; whether each of them really uses a one-vs-rest scheme
# depends on its scikit-learn defaults — confirm before relying on the label.
cal_lr = fit_calibrated(lr, "isotonic", "LR 1vRest", X_train_sc, y_train)
cal_svc = fit_calibrated(svc, "sigmoid", "SVC 1vRest", X_train_sc, y_train)
cal_rfc = fit_calibrated(rfc, "sigmoid", "RF 1vRest", X_train_sc, y_train)

# Explicit one-vs-one variants.
cal_oo_lr = fit_calibrated(oo_lr, "isotonic", "LR 1v1", X_train_sc, y_train)
cal_oo_svc = fit_calibrated(oo_svc, "sigmoid", "SVC 1v1", X_train_sc, y_train)
cal_oo_rfc = fit_calibrated(oo_rfc, "sigmoid", "RF 1v1", X_train_sc, y_train)

LR 1vRest: 2881.6 seconds
SVC 1vRest: 200.76 seconds
RF 1vRest: 196.65 seconds
LR 1v1: 1021.16 seconds
SVC 1v1: 434.68 seconds
RF 1v1: 7425.34 seconds


Use models to predict the test data

In [7]:
# Predict labels for the scaled test fraction with every calibrated model.
# NOTE(review): the saved output of this cell shows a fragment of a warning
# raised while class probabilities were being normalised; it may be worth
# checking for test rows whose probabilities sum to zero — confirm.

# One-vs-one variants.
pred_oo_lr, pred_oo_svc, pred_oo_rfc = (
    model.predict(X_test_sc) for model in (cal_oo_lr, cal_oo_svc, cal_oo_rfc)
)

# Plain variants.
pred_lr, pred_svc, pred_rfc = (
    model.predict(X_test_sc) for model in (cal_lr, cal_svc, cal_rfc)
)

  proba /= np.sum(proba, axis=1)[:, np.newaxis]


Get macro-averaged F1 scores for the predictions

In [25]:
# Macro-averaged F1 of every model on the held-out test labels.

# One-vs-one variants.
f1_oo_lr, f1_oo_svc, f1_oo_rfc = (
    f1_score(y_test, pred, average="macro")
    for pred in (pred_oo_lr, pred_oo_svc, pred_oo_rfc)
)

# Plain variants.
f1_lr, f1_svc, f1_rfc = (
    f1_score(y_test, pred, average="macro")
    for pred in (pred_lr, pred_svc, pred_rfc)
)

In [18]:
# Show the macro F1 scores: one-vs-one LR / SVC / RF first, then the plain
# LR / SVC / RF.
[
    f1_oo_lr, f1_oo_svc, f1_oo_rfc,
    f1_lr, f1_svc, f1_rfc,
]

[0.8765584196163484,
 0.8032046769921667,
 0.7867565116254122,
 0.8698998233698949,
 0.8803307395343059,
 0.8882004066543089]

In [27]:
# Precision / recall / F-score / support for every model on the test labels.
# No `average` is given; zero_division=0 is passed for cases where a metric
# would otherwise be undefined.

# One-vs-one variants.
all_oo_lr, all_oo_svc, all_oo_rfc = (
    precision_recall_fscore_support(y_test, pred, zero_division=0)
    for pred in (pred_oo_lr, pred_oo_svc, pred_oo_rfc)
)

# Plain variants.
all_lr, all_svc, all_rfc = (
    precision_recall_fscore_support(y_test, pred, zero_division=0)
    for pred in (pred_lr, pred_svc, pred_rfc)
)

In [32]:
# Destination file for every fitted calibrated model, in the order they are
# written (plain variants first, then the one-vs-one variants).
models_by_path = {
    "../results/cross_sp_predictions/cal_lr_human_hvg_model.pkl": cal_lr,
    "../results/cross_sp_predictions/cal_svc_human_hvg_model.pkl": cal_svc,
    "../results/cross_sp_predictions/cal_rfc_human_hvg_model.pkl": cal_rfc,
    "../results/cross_sp_predictions/cal_oo_lr_human_hvg_model.pkl": cal_oo_lr,
    "../results/cross_sp_predictions/cal_oo_svc_human_hvg_model.pkl": cal_oo_svc,
    "../results/cross_sp_predictions/cal_oo_rfc_human_hvg_model.pkl": cal_oo_rfc,
}

# Serialise each model to its own pickle file; the `with` block closes the file
# handle even if dumping fails.
for pkl_path, fitted_model in models_by_path.items():
    with open(pkl_path, "wb") as f:
        pickle.dump(fitted_model, file=f)