In [3]:
import numpy as np
import pandas as pd
import scipy,sys,math
from sklearn.feature_selection import VarianceThreshold
from skfeature.function.sparse_learning_based import MCFS
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.neighbors import KNeighborsClassifier
import sklearn.cluster
from skfeature.utility.construct_W import construct_W

# this function returns MCC score for true_labels and pred_labels vectors
def get_mcc(true_labels, pred_labels):
    """Return the Matthews correlation coefficient of two binary label vectors.

    Parameters
    ----------
    true_labels, pred_labels : array-like
        Label vectors of equal length holding 0/1 values.  Anything that
        ``np.asarray`` can convert is accepted.  Entries that are neither 0
        nor 1 fall into none of the four confusion-matrix cells.

    Returns
    -------
    float
        ``(TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))``, or ``0``
        when that denominator is zero.
    """
    # Convert up front: comparing a plain Python list with ``== 1`` yields a
    # single ``False`` instead of an element-wise mask, which would silently
    # produce a score of 0.
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)

    # Counts are taken as floats so the four-factor product in the denominator
    # cannot wrap around a fixed-width numpy integer for long vectors.
    TP = float(np.sum(np.logical_and(pred_labels == 1, true_labels == 1)))
    TN = float(np.sum(np.logical_and(pred_labels == 0, true_labels == 0)))
    FP = float(np.sum(np.logical_and(pred_labels == 1, true_labels == 0)))
    FN = float(np.sum(np.logical_and(pred_labels == 0, true_labels == 1)))

    numerator = (TP * TN) - (FP * FN)
    denom = np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    if denom == 0:
        return 0
    return numerator / denom

# turn the probability to the clusters of the cell to the weighted coordinates of the cluster centers
def get_xyz(x, centers=None):
    """Turn per-cell cluster probabilities into weighted-average coordinates.

    Each row of ``x`` is used as the weight vector of an average taken over
    the rows of ``centers``.

    Parameters
    ----------
    x : 2-D array-like
        One row per cell, one column per cluster center.
    centers : array-like, optional
        Cluster-center coordinates, one row per cluster.  When omitted, the
        module-level ``cluster_centers`` is used, which is how the function
        behaved before this parameter existed.

    Returns
    -------
    np.ndarray
        One averaged coordinate row per row of ``x``.

    Raises
    ------
    ZeroDivisionError
        Raised by ``np.average`` when a row's weights sum to zero.
    """
    if centers is None:
        # fall back to the global set by the clustering step of the script
        centers = cluster_centers
    return np.array([np.average(centers, axis=0, weights=row_weights)
                     for row_weights in np.asarray(x)])

# ndfs feature selection 
# Li, Zechao, et al. "Unsupervised Feature Selection Using Nonnegative Spectral Analysis." AAAI. 2012.
def ndfs(X, n_clusters):
    """Compute the NDFS feature-weight matrix for ``X``.

    Follows the alternating updates of Nonnegative Discriminative Feature
    Selection (Li et al., AAAI 2012): a scaled cluster-indicator matrix ``F``
    is initialised from k-means, then the weight matrix ``W``, the diagonal
    reweighting matrix ``D`` and ``F`` are updated in turn until the
    objective changes by less than 1e-3 or 1000 iterations are reached.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
        Data matrix, one row per sample.
    n_clusters : int
        Number of clusters used for the k-means initialisation and as the
        number of columns of ``F`` and ``W``.

    Returns
    -------
    np.ndarray, shape (n_features, n_clusters)
        Feature weight matrix of the last iteration.
    """
    # weight of the orthogonality penalty on F (note: 10e8 is 1e9)
    gamma = 10e8
    # sample affinity matrix from skfeature, built with its default options
    W = construct_W(X)
    alpha = 1
    beta = 1
    n_samples, n_features = X.shape

    # `precompute_distances` and `n_jobs` are deliberately not passed: both
    # were removed from KMeans in scikit-learn 1.0 and passing them raises
    # TypeError there.  They only controlled how the computation was carried
    # out, not which partition is sought.
    kmeans = sklearn.cluster.KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300,
                                    tol=0.0001, verbose=0, random_state=42, copy_x=True)
    kmeans.fit(X)
    labels = kmeans.labels_

    # one-hot cluster membership matrix
    Y = np.zeros((n_samples, n_clusters))
    Y[np.arange(n_samples), labels] = 1
    # Y^T Y is diagonal (cluster sizes), so the element-wise sqrt of its
    # inverse scales every column of Y by 1/sqrt(cluster size)
    T = np.dot(Y.transpose(), Y)
    F = np.dot(Y, np.sqrt(np.linalg.inv(T)))
    # small offset on every entry; presumably so the multiplicative update
    # below does not start from exact zeros -- TODO confirm
    F = F + 0.02*np.ones((n_samples, n_clusters))

    # initialize D as identity matrix
    D = np.identity(n_features)
    I = np.identity(n_samples)
    # build laplacian matrix
    # NOTE(review): this subtracts W from a 1-D vector of row sums, which
    # broadcasts instead of forming diag(row sums) - W.  Kept exactly as in
    # the original so results do not change -- confirm whether a true graph
    # Laplacian was intended.
    L = np.array(W.sum(1))[:, 0] - W

    max_iter = 1000
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # update W (from here on W is the feature weight matrix, not the affinity matrix)
        T = np.linalg.inv(np.dot(X.transpose(), X) + beta * D + 1e-6*np.eye(n_features))
        W = np.dot(np.dot(T, X.transpose()), F)
        # update D: 0.5 / row norm of W, with the norm floored to avoid division by zero
        temp = np.sqrt((W*W).sum(1))
        temp[temp < 1e-16] = 1e-16
        temp = 0.5 / temp
        D = np.diag(temp)
        # update M and symmetrise it
        M = L + alpha * (I - np.dot(np.dot(X, T), X.transpose()))
        M = (M + M.transpose())/2
        # update F multiplicatively, then rescale its columns
        denominator = np.dot(M, F) + gamma*np.dot(np.dot(F, F.transpose()), F)
        temp = np.divide(gamma*F, denominator)
        F = F*np.array(temp)
        temp = np.diag(np.sqrt(np.diag(1 / (np.dot(F.transpose(), F) + 1e-16))))
        F = np.dot(F, temp)
        # calculate objective function
        obj[iter_step] = np.trace(np.dot(np.dot(F.transpose(), M), F)) + gamma/4*np.linalg.norm(np.dot(F.transpose(), F)-np.identity(n_clusters), 'fro')
        # stop once the objective has stabilised
        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3:
            break
    return W

# this function returns a ranking of features for a feature selection method  
def feat_ranking(fs):
    """Rank the marker genes with the feature-selection method ``fs``.

    Reads 'data/dge_normalized.txt' on every call, transposes it and keeps
    only the columns listed in the module-level ``marker_genes``.

    Parameters
    ----------
    fs : str
        'variance' ranks by the per-column variance reported by
        ``VarianceThreshold``; 'ndfs' ranks by the squared row norms of the
        matrix returned by ``ndfs`` (10 clusters); 'mcfs' ranks by whatever
        ``MCFS.mcfs`` returns for the expression matrix.

    Returns
    -------
    Indices into the marker-gene columns in descending score order, or
    ``None`` when ``fs`` is not one of the three names above.
    """
    # expression table restricted to the marker genes, one row per column of the file
    expr = pd.read_csv('data/dge_normalized.txt', sep='\t')
    gene_names = list(expr.index)
    expr = expr.T
    expr.columns = gene_names
    expr = expr[marker_genes]

    if fs == 'variance':
        selector = VarianceThreshold()
        selector.fit(expr.values)
        return np.argsort(selector.variances_, 0)[::-1]

    if fs == 'ndfs':
        weights = ndfs(expr.values, n_clusters=10)
        return np.argsort((weights*weights).sum(1))[::-1]

    if fs == 'mcfs':
        return np.argsort(MCFS.mcfs(expr.values))[::-1]

    # unknown method name
    return None
    
# feat_selections = ['variance','ndfs','mcfs']
# loc_predictions = ['direct','indirect','indirect']

# number of selected genes, paired position by position with the number
# used in the 'sc<N>' part of the result file names
num_feats = [60, 40, 20]
scs = [1, 2, 3]
nfeat_sc = {n_genes: sc_no for n_genes, sc_no in zip(num_feats, scs)}



# In-situ reference table; its column names are used as the marker genes
insitu_df = pd.read_csv('data/bdtnp.txt', sep='\t')
marker_genes = insitu_df.columns.tolist()
# binarized in-situ expression
insitu_bin = pd.read_csv('data/binarized_bdtnp.csv')
# in-situ coordinates (space-separated file)
insitu_coords = pd.read_csv('data/geometry.txt', sep=' ')

# Binarized single-cell expression: transposed so that the file's row index
# becomes the column labels
cell_bin = pd.read_csv('data/dge_binarized_distMap.csv')
all_genes = cell_bin.index.tolist()
cell_bin = cell_bin.transpose()
cell_bin.columns = all_genes

# Driver: for every (number of genes, feature-selection method, location
# prediction strategy) combination, predict 10 candidate in-situ locations per
# cell and write them to 'result/sc<N>_<strategy>_<method>.csv'.
import os

# `import scipy` alone does not guarantee that the spatial submodule is loaded
import scipy.spatial.distance

# number of candidate locations reported per cell
TOP = 10
COORD_COLS = ['xcoord', 'ycoord', 'zcoord']


def _nearest_locations(best_idx, top=TOP):
    """Return the 1-based ids of the `top` locations closest to location `best_idx`."""
    # coordinates of the row(s) whose index label equals best_idx
    coord = insitu_coords[insitu_coords.index == best_idx][COORD_COLS].values
    # Euclidean distance from that location to every in-situ location
    dist = scipy.spatial.distance.cdist(coord, insitu_coords[COORD_COLS].values)[0]
    # +1 as the location ids start from 1
    return [e + 1 for e in dist.argsort()[:top]]


def _write_submission(path, genes, indices):
    """Write the gene header rows (10 genes per row) followed by one row per cell."""
    with open(path, 'w') as file_result:
        # every header row starts with an empty field; a final shorter row is
        # written too, so no gene is dropped when len(genes) % 10 != 0
        for start in range(0, len(genes), 10):
            file_result.write(','.join([''] + list(genes[start:start + 10])) + '\n')
        # cell number (1-based) followed by its candidate location ids
        for cell_no, top_idx in enumerate(indices, start=1):
            file_result.write(','.join(map(str, [cell_no] + top_idx)) + '\n')


# Step 1 of the 'indirect' strategy: cluster the in-situ locations by their
# coordinates.  This does not depend on the selected genes, so it is done once.
y = insitu_coords[COORD_COLS].values
bw = estimate_bandwidth(y, quantile=.01)
ms = MeanShift(bandwidth=bw, bin_seeding=True, min_bin_freq=5)
ms.fit(y)
# coordinates of the cluster centers (read by get_xyz as a module-level global)
cluster_centers = ms.cluster_centers_
# cluster id of every in-situ location
labels = ms.labels_

# A ranking depends only on the feature-selection method, so each one is
# computed once and reused for every gene count and prediction strategy.
rankings = {}

os.makedirs('result', exist_ok=True)

for num_feat in num_feats:
    for feat_selection in ['variance', 'ndfs', 'mcfs']:
        for loc_prediction in ['direct', 'indirect']:
            dout = 'result/sc{}_{}_{}.csv'.format(nfeat_sc[num_feat], loc_prediction, feat_selection)
            print('running', dout)

            if feat_selection not in rankings:
                rankings[feat_selection] = feat_ranking(feat_selection)
            idx = rankings[feat_selection]
            # use best num_feat genes
            genes = [marker_genes[e] for e in idx][:num_feat]

            if loc_prediction == 'direct':
                # compute MCC for all cells with all locations
                Pr = cell_bin[genes].values
                Gt = insitu_bin[genes].values
                mcc = np.asarray([get_mcc(p, g) for p in Pr for g in Gt])
                mcc = mcc.reshape(len(Pr), -1)
                # per cell: the location with the largest MCC and its nearest neighbours
                indices = [_nearest_locations(cell_mcc.argmax()) for cell_mcc in mcc]
            else:  # clustering first then classification
                # Step 2: learn a multi-class classifier from binarized
                # in-situ expression to location cluster
                clf = KNeighborsClassifier()
                clf.fit(insitu_bin[genes].values, labels)
                # predict probability of cells to the clusters
                preds = clf.predict_proba(cell_bin[genes].values)
                # turn the probabilities to average weighted coordinates
                preds = get_xyz(preds)
                # distance of the predicted coordinates to every in-situ location
                preds_to_insitu_coords = scipy.spatial.distance.cdist(
                    preds, insitu_coords[COORD_COLS].values)
                # per cell: the closest in-situ location and its nearest neighbours
                indices = [_nearest_locations(cell_dist.argmin())
                           for cell_dist in preds_to_insitu_coords]

            # submission file
            _write_submission(dout, genes, indices)


running result/sc1_direct_variance.csv
running result/sc1_indirect_variance.csv
running result/sc1_direct_ndfs.csv




running result/sc1_indirect_ndfs.csv




running result/sc1_direct_mcfs.csv
running result/sc1_indirect_mcfs.csv
running result/sc2_direct_variance.csv
running result/sc2_indirect_variance.csv
running result/sc2_direct_ndfs.csv




running result/sc2_indirect_ndfs.csv




running result/sc2_direct_mcfs.csv
running result/sc2_indirect_mcfs.csv
running result/sc3_direct_variance.csv
running result/sc3_indirect_variance.csv
running result/sc3_direct_ndfs.csv




running result/sc3_indirect_ndfs.csv




running result/sc3_direct_mcfs.csv
running result/sc3_indirect_mcfs.csv
