/
metrics.py
75 lines (50 loc) · 2.43 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import numpy as np
from scipy.stats import itemfreq, entropy
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score, normalized_mutual_info_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from trvae.utils import remove_sparsity
def __entropy_from_indices(indices):
    """Shannon entropy of the label distribution in ``indices``.

    Parameters
    ----------
    indices : array-like
        1-D array of (batch) labels, e.g. the labels of one cell's
        k nearest neighbours.

    Returns
    -------
    float
        Entropy of the label frequency counts (natural log, via
        ``scipy.stats.entropy``, which normalizes counts internally).
    """
    # `scipy.stats.itemfreq` was removed in SciPy 1.3; `np.unique` with
    # return_counts gives the same per-label counts.
    # NOTE(review): the top-of-file `itemfreq` import should be dropped as
    # well once no other code depends on it.
    _, counts = np.unique(indices, return_counts=True)
    return entropy(counts)
def entropy_batch_mixing(adata, label_key='batch',
                         n_neighbors=50, n_pools=50, n_samples_per_pool=100, subsample_frac=1.0):
    """Score how well batches are mixed in the neighbourhood graph.

    For every cell, the entropy of the `label_key` values among its
    `n_neighbors` nearest neighbours is computed; higher entropy means
    better batch mixing. The final score averages `n_pools` bootstrap
    pools of `n_samples_per_pool` per-cell entropies (or all cells when
    `n_pools == 1`).

    Parameters
    ----------
    adata
        AnnData object; densified via ``remove_sparsity``.
    label_key : str
        Column in ``adata.obs`` holding the batch labels.
    n_neighbors : int
        Number of neighbours per cell (the cell itself is excluded).
    n_pools, n_samples_per_pool : int
        Bootstrap pooling parameters for the final average.
    subsample_frac : float
        Fraction of cells to keep (sampled without replacement).

    Returns
    -------
    float
        Mean neighbourhood entropy.
    """
    adata = remove_sparsity(adata)
    total = adata.shape[0]
    n_keep = min(total, int(subsample_frac * total))
    chosen = np.random.choice(np.arange(total), size=n_keep, replace=False)
    adata = adata[chosen, :]

    # +1 neighbour because the query point itself comes back first and is dropped.
    knn = NearestNeighbors(n_neighbors=n_neighbors + 1).fit(adata.X)
    neighbor_idx = knn.kneighbors(adata.X, return_distance=False)[:, 1:]

    labels = adata.obs[label_key].values
    neighbor_labels = np.vectorize(lambda j: labels[j])(neighbor_idx)
    entropies = np.apply_along_axis(__entropy_from_indices, axis=1, arr=neighbor_labels)

    if n_pools == 1:
        return np.mean(entropies)

    # Average over n_pools bootstrap pools, each of n_samples_per_pool
    # entropies drawn with replacement.
    pool_means = []
    for _ in range(n_pools):
        pool = np.random.choice(len(entropies), size=n_samples_per_pool)
        pool_means.append(np.mean(entropies[pool]))
    return np.mean(pool_means)
def asw(adata, label_key):
    """Average silhouette width of ``adata.X`` grouped by ``label_key``.

    Labels from ``adata.obs[label_key]`` are integer-encoded and the
    silhouette score is computed on the (densified) data matrix.

    Returns
    -------
    float
        ``sklearn.metrics.silhouette_score`` in [-1, 1].
    """
    adata = remove_sparsity(adata)
    encoded = LabelEncoder().fit_transform(adata.obs[label_key].values)
    return silhouette_score(adata.X, encoded)
def ari(adata, label_key):
    """Adjusted Rand index between ``label_key`` and a KMeans clustering.

    KMeans is run on ``adata.X`` with as many clusters as there are
    distinct values in ``adata.obs[label_key]``, then the clustering is
    compared to the true labels.

    Returns
    -------
    float
        ``sklearn.metrics.adjusted_rand_score``; 1.0 is perfect agreement.
    """
    adata = remove_sparsity(adata)
    true_labels = LabelEncoder().fit_transform(adata.obs[label_key].values)
    n_clusters = len(adata.obs[label_key].unique())
    predicted = KMeans(n_clusters, n_init=200).fit_predict(adata.X)
    return adjusted_rand_score(true_labels, predicted)
def nmi(adata, label_key):
    """Normalized mutual information between ``label_key`` and KMeans clusters.

    Mirrors :func:`ari`: cluster ``adata.X`` with KMeans (one cluster per
    distinct label value) and compare against the true labels.

    Returns
    -------
    float
        ``sklearn.metrics.normalized_mutual_info_score`` in [0, 1].
    """
    adata = remove_sparsity(adata)
    true_labels = LabelEncoder().fit_transform(adata.obs[label_key].values)
    n_clusters = len(adata.obs[label_key].unique())
    predicted = KMeans(n_clusters, n_init=200).fit_predict(adata.X)
    return normalized_mutual_info_score(true_labels, predicted)