In [15]:

from collections import Counter
import numpy as np
import pandas as pd 
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
# Load the nsimplices implementation into this notebook's namespace by executing
# its source file. The file is opened in a context manager so the handle is
# closed as soon as the source has been read.
with open(r"../nsimplices.py", encoding="utf8") as _nsimplices_src:
    exec(compile(_nsimplices_src.read(), "nsimplices.py", 'exec'))

In [16]:
# Main data table: first CSV column becomes the row index, first CSV row the column names.
# NOTE(review): presumably a cells x genes count matrix -- confirm against the data source.
df_Baron = pd.read_csv("~/nSimplices/data/sce_full_sce_Baron_scScope.csv", index_col=0, header=0)
# NOTE(review): bare expression in the middle of the cell -- it is evaluated but not displayed.
df_Baron.shape

# Metadata table, read with the same index/header convention.
# NOTE(review): presumably one row per row of df_Baron, in the same order -- confirm,
# because labels taken from it are later matched to df_Baron rows by position.
df_meta = pd.read_csv("~/nSimplices/data/mouse_metadata.csv", index_col=0, header=0)

In [17]:

### Run nSimplices method
# Number of columns of df_Baron; passed to nsimplices as its second argument.
feature_num = df_Baron.shape[1]
# Bounds handed to nsimplices as dim_start / dim_end.
# NOTE(review): presumably the range of candidate subspace dimensions -- confirm in nsimplices.py.
dim_start = 1
# Disabled alternative upper bound: the full number of columns.
# dim_end = df_Baron.shape[1]
dim_end = 2
# Condensed pairwise distances between the rows of df_Baron (pdist's default metric
# is Euclidean), then expanded into a square matrix.
out_dis = pdist(df_Baron)
out_dis_sq = squareform(out_dis)

# nsimplices comes from the exec'd ../nsimplices.py. It receives the square distance
# matrix, the bounds above and an array copy of the original table as euc_coord.
# NOTE(review): the meaning of the four returned values is inferred from their names only.
outlier_indices, subspace_dim, corr_dis_sq, corr_coord = nsimplices(out_dis_sq, feature_num, dim_start, dim_end, euc_coord=np.array(df_Baron.copy()))

dim in find_subspace_dim is: 1
dim in find_subspace_dim is: 2
med_height is: [395.9178389  276.40135712]
subspace_dim is: 2
thres is: 870.6649820270803 mean is: 485.7244853827613 std is: 128.31349888143967
outlier indices are: [   2    4   13   14   23   32   42   50   83   90  110  279  280  281
  285  290  340  362  389  569  571  572  587  593  594  605  611  690
  826  830  833  834  835  837  845  854  855  856  858  860  861  863
  864  865  866  869  874  877  880  881  883  885  888  889  891  892
  893  897  903  904  905  906  913  918  920  921  925  928  930  931
  933  937  939  940  948  949  959  965  966  967  969  970  974  977
  978  979  983  986  990  992  996 1002 1003 1007 1008 1010 1015 1017
 1025 1027 1030 1037 1038 1045 1050 1051 1052 1068 1070 1085 1110 1142
 1143 1145 1150 1151 1153 1154 1155 1156 1158 1161 1163 1164 1166 1167
 1168 1169 1171 1172 1175 1176 1177 1181 1182 1187 1189 1191 1192 1193
 1195 1196 1203 1206 1208 1209 1211 1217 1220 1222 1223 1224 12

In [24]:
# Table shape and number of flagged outliers (the outlier percentage itself is not computed here)
print(df_Baron.shape)
print(len(outlier_indices))

# Print the row of the first flagged outlier only
print(df_Baron.iloc[outlier_indices[0]])

(1886, 13357)
294
X0610007P14Rik    0
X0610009B22Rik    0
X0610009E02Rik    1
X0610009L18Rik    0
X0610009O20Rik    0
                 ..
Zyg11b            0
Zyx               0
Zzef1             1
Zzz3              0
l7Rn6             2
Name: mouse1_lib1.final_cell_0003, Length: 13357, dtype: int64


In [25]:
def reduce_dim(coords, n_components=2, random_state=0):
    """
    Project coordinates onto their leading principal components.

    @param coords np.array: samples in rows, features in columns
    @param n_components int: number of principal components to keep; the default
        of 2 assumes that the optimal dimension is 2 from DeCOr-MDS
    @param random_state: seed forwarded to PCA. scikit-learn's automatic solver
        selection can use a randomized SVD on large inputs, in which case an
        unseeded call is not repeatable between runs; pass None to leave it unseeded
    @return np.array: the transformed coordinates returned by PCA.fit_transform
    """
    pca = PCA(n_components=n_components, random_state=random_state)
    return pca.fit_transform(coords)

In [60]:
def cluster(coords, true_labels, random_state=6):
    """ 
    Apply K-means, decide cluster IDs

    The number of clusters is set to the number of distinct values in
    true_labels; the labels are not used for anything else here.

    @param coords np.array: coordinates passed to KMeans.fit
    @param true_labels: iterable of hashable ground-truth labels
    @param random_state: seed forwarded to KMeans. Defaults to 6, the value this
        notebook was tuned with; pass other seeds to check how sensitive the
        downstream scores are to the initialisation
    @return np.array: kmeans.labels_, the cluster id assigned to each sample
    """
    n_clusters = len(set(true_labels))
    # random state = 3 or 6 works better 
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(coords)
    return kmeans.labels_


In [65]:
def infer_cluster_labels(cluster_ids, true_labels):
    """ 
    Infer one label for each cluster 

    Every sample receives the most frequent true label among the samples that
    share its cluster id (majority voting). When several labels are tied, the
    one that appears first within the cluster wins. Distinct clusters may end
    up with the same label.

    @param cluster_ids array-like: cluster id of each sample
    @param true_labels array-like: ground-truth label of each sample, same
        length as cluster_ids; matched to cluster_ids by position
    @return np.array (dtype=object): for each sample, the majority label of its cluster
    @raise ValueError: if cluster_ids and true_labels differ in length
    """
    # Convert up front so plain lists and pandas Series are indexed by position,
    # exactly like NumPy arrays.
    cluster_ids = np.asarray(cluster_ids)
    true_labels = np.asarray(true_labels)
    if len(cluster_ids) != len(true_labels):
        raise ValueError(
            "cluster_ids and true_labels must have the same length, got "
            f"{len(cluster_ids)} and {len(true_labels)}"
        )

    pred_labels = np.empty(len(true_labels), dtype=object)
    for cluster_id in np.unique(cluster_ids):
        # Boolean mask of the samples in this cluster (one vectorised comparison
        # instead of a Python-level scan of every sample).
        members = cluster_ids == cluster_id
        # Counter keeps first-seen order among equal counts, so ties resolve to
        # the label encountered first in the cluster.
        majority, _ = Counter(true_labels[members]).most_common(1)[0]
        pred_labels[members] = majority
    return pred_labels


In [66]:
def cluster_proc(coords, true_labels):
    """ 
    Cluster the coordinates and score the clustering against the true labels.

    @param coords np.array: raw coordinates or corrected coordinates from DeCOr-MDS
    @param true_labels: ground-truth label of each row of coords
    @return tuple: (NMI, ARI) computed between true_labels and the per-cluster
        majority labels
    """
    # K-means cluster ids, each replaced by the majority true label of its cluster.
    majority_labels = infer_cluster_labels(cluster(coords, true_labels), true_labels)
    # Evaluate both agreement scores on the same pair of labelings.
    scorers = (normalized_mutual_info_score, adjusted_rand_score)
    nmi, ari = (score(true_labels, majority_labels) for score in scorers)
    return nmi, ari

In [67]:
# Ground-truth labels: the "cell_type" column of the metadata table as a NumPy array.
true_labels = np.array(df_meta["cell_type"])
# Cluster and score the coordinates returned by nsimplices; the cell displays (NMI, ARI).
cluster_proc(corr_coord, true_labels)

(0.563911435816478, 0.6024625057608046)

In [68]:
# Baseline: the same clustering and scoring on the original table after reduce_dim
# (PCA down to 2 components); the cell displays (NMI, ARI).
coords = reduce_dim(np.array(df_Baron))
cluster_proc(coords, true_labels)

(0.5392773389390121, 0.5229809454095573)