In [1]:
import numpy as np
import pandas as pd
import scanpy as sc

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler  # to standardize
from sklearn.metrics import f1_score



# Load data

In [2]:
path_to_dataset = "../../DATA/scRNAseq_Benchmark_datasets/Inter-dataset/Brain/MouseV1_MouseALM_HumanMTG/"
adata = sc.read_csv(path_to_dataset + "MouseV1_MouseALM_HumanMTG.csv")

In [3]:
def labels_to_int(labels):
    labels_to_keys, keys_to_labels = {}, {}
    lbls = np.unique(labels)

    for j, l in enumerate(lbls):
        # establish dictionary conversion
        labels_to_keys[l], keys_to_labels[j] = j, l
        # convert labels to keys
        labels[labels == l] = j

    # convert labels to int64 type
    return labels.astype("int64")

In [4]:
labels = pd.read_csv(path_to_dataset + "MouseV1_MouseALM_HumanMTG_Labels34.csv").to_numpy()[:, 0]
labels = labels_to_int(labels)

In [5]:
idx = {
    'ALM': (12552, 20680), 
    'MTG': (20680, 34735), 
    'VISp': (0, 12552)
}

# Data Preprocessing

In [54]:
dataset_ref = 'VISp'
dataset_q = 'ALM'

In [55]:
adata_ref = adata[idx[dataset_ref][0]: idx[dataset_ref][1]].copy() 
adata_q = adata[idx[dataset_q][0]: idx[dataset_q][1]].copy()
adata_ref.X = np.log2(1 + adata_ref.X)
adata_q.X = np.log2(1 + adata_q.X)

# select the top 2k varying genes
sc.pp.highly_variable_genes(adata_ref, n_top_genes=2000, subset=True)
adata_q = adata_q[:, adata_ref.var_names].copy()

# normalize
sc.pp.normalize_total(adata_ref, target_sum=100)
sc.pp.normalize_total(adata_q, target_sum=100)

# SVM

In [56]:
X_train, X_test = adata_ref.X, adata_q.X

In [57]:
Y_train, Y_test = labels[idx[dataset_ref][0]: idx[dataset_ref][1]], labels[idx[dataset_q][0]: idx[dataset_q][1]]

In [58]:
# create and train the SVM
classifier = SVC(kernel='linear')
classifier.fit(X_train, Y_train)

In [59]:
preds = classifier.predict(X_test)

In [60]:
np.mean(preds == Y_test)

0.7904773622047244

In [61]:
f1_score(Y_test, preds, average="weighted")

0.7908004445528123

In [62]:
np.save('results/svm/%s-%s.npy' % (dataset_ref, dataset_q), preds)

# Get class level accuracy

In [42]:
class_labels = pd.read_csv(path_to_dataset + "MouseV1_MouseALM_HumanMTG_Labels3.csv").to_numpy()[:, 0]
# convert labels to values and back
labels_to_keys, keys_to_labels = {}, {}
lbls = np.unique(class_labels)

for j, l in enumerate(lbls):
    # establish dictionary conversion
    labels_to_keys[l], keys_to_labels[j] = j, l
    # convert labels to keys
    class_labels[class_labels == l] = j

# convert labels to int64 type
class_labels = class_labels.astype("int64")

In [43]:
# stores how to convert from subclass to class 
subclass_to_class = np.empty(34)

In [44]:
for i in range(len(labels)):
    subclass_to_class[labels[i]] = class_labels[i]

In [41]:
class_preds = preds
for i in range(preds.size):
    class_preds[i] = subclass_to_class[preds[i]]

In [44]:
# return class level accuracy
(class_preds == class_labels[12553: 20681]).mean()

0.9160925196850394