### Feature selection

In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from sklearn.feature_selection import VarianceThreshold

# Number of highest-variance marker genes to keep.
N_TOP_GENES = 60

# Get marker genes from in situ data (the column names of the in situ table)
insitu_df = pd.read_csv('data/bdtnp.txt', sep='\t')
marker_genes = list(insitu_df.columns)

# Get expression of marker genes for cells
cell_df = pd.read_csv('data/dge_normalized.txt', sep='\t')
all_genes = list(cell_df.index)
# Transpose so that the file's row labels (genes) become the columns,
# then keep only the marker-gene columns.
cell_df = cell_df.T
cell_df.columns = all_genes
cell_df = cell_df[marker_genes]

# Per-gene variance across cells, as reported by VarianceThreshold.
# NOTE(review): with the default threshold, recent scikit-learn releases may
# report min(variance, peak-to-peak) in `variances_` -- confirm this matches
# the intended ranking.
sel = VarianceThreshold()
sel.fit(cell_df.values)
score = sel.variances_

# Rank genes by descending variance score and keep the N_TOP_GENES best.
# Sorting the negated scores with a stable sort keeps tied genes in their
# original order and leaves NaN scores at the end; reversing an ascending
# argsort would instead put NaN scores first.
idx = np.argsort(-score, kind='stable')
genes = [marker_genes[e] for e in idx[:N_TOP_GENES]]

### Location prediction for cells

In [2]:
import scipy
# this function returns MCC score for true_labels and pred_labels vectors
def get_mcc(true_labels, pred_labels):
    """Return the Matthews correlation coefficient of two binary label vectors.

    Parameters
    ----------
    true_labels, pred_labels : array-like
        Equal-length sequences of labels. Only entries equal to 0 or 1 are
        counted; any other value contributes to none of the four counts.

    Returns
    -------
    float
        (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)), or 0.0 when the
        denominator is zero (e.g. one of the vectors is constant).
    """
    # Accept plain Python sequences as well as arrays; `list == 1` would be a
    # scalar False and silently yield a score of 0.
    true_labels = np.asarray(true_labels)
    pred_labels = np.asarray(pred_labels)

    # Counts are converted to float so that the four-fold product in the
    # denominator cannot overflow a fixed-width integer on long vectors.
    TP = float(np.sum(np.logical_and(pred_labels == 1, true_labels == 1)))
    TN = float(np.sum(np.logical_and(pred_labels == 0, true_labels == 0)))
    FP = float(np.sum(np.logical_and(pred_labels == 1, true_labels == 0)))
    FN = float(np.sum(np.logical_and(pred_labels == 0, true_labels == 1)))

    mcc = (TP * TN) - (FP * FN)
    denom = np.sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    if denom == 0:
        return 0.0
    return mcc / denom

def _pairwise_mcc(cells, locations):
    """Return the MCC of every row of `cells` against every row of `locations`.

    Produces the same values as calling ``get_mcc(p, g)`` for each pair of rows
    (``p`` from `cells`, ``g`` from `locations`), but obtains the four confusion
    counts for all pairs at once with matrix products instead of one
    Python-level call per pair.

    Parameters
    ----------
    cells : ndarray, shape (n_cells, n_genes)
    locations : ndarray, shape (n_locations, n_genes)
        Binary matrices; only entries equal to 0 or 1 are counted.

    Returns
    -------
    ndarray, shape (n_cells, n_locations)
        MCC scores, with 0 wherever the denominator is zero.
    """
    # Float indicator matrices: the counts stay exact and cannot overflow.
    cell_on = (cells == 1).astype(np.float64)
    cell_off = (cells == 0).astype(np.float64)
    loc_on = (locations == 1).astype(np.float64)
    loc_off = (locations == 0).astype(np.float64)

    # Entry [i, j] counts the genes in each confusion category for cell i
    # versus location j (location treated as the prediction, as in get_mcc(p, g)).
    tp = cell_on @ loc_on.T
    tn = cell_off @ loc_off.T
    fp = cell_off @ loc_on.T
    fn = cell_on @ loc_off.T

    numerator = tp * tn - fp * fn
    denom = np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    # Leave 0 where the denominator vanishes instead of dividing by zero.
    return np.divide(numerator, denom, out=np.zeros_like(numerator), where=denom != 0)


# in-situ binary expression (one row per location, one column per gene)
insitu_bin = pd.read_csv('data/binarized_bdtnp.csv')
# in-situ coordinates
insitu_coords = pd.read_csv('data/geometry.txt', sep=' ')

# cell binary expression; transposed so that the file's row labels (genes)
# become the columns
cell_bin = pd.read_csv('data/dge_binarized_distMap.csv')
all_genes = list(cell_bin.index)
cell_bin = cell_bin.T
cell_bin.columns = all_genes

# compute MCC for all cells with all locations, restricted to the selected genes
Pr = cell_bin[genes].values
Gt = insitu_bin[genes].values
mcc = _pairwise_mcc(Pr, Gt)

# choose 10 best locations
TOP = 10
# Coordinates of every location, extracted once because they do not change
# from one cell to the next.
location_xyz = insitu_coords[['xcoord', 'ycoord', 'zcoord']].values
# store indices of locations for all cells
indices = []
# loop over all cells
for i in range(len(mcc)):
    # position of the location with the largest MCC for this cell
    best_idx = mcc[i].argmax()
    # best_idx is a row position, so take the coordinates by position rather
    # than by matching index labels; the slice keeps the 2-D shape cdist needs
    coord = location_xyz[best_idx:best_idx + 1]
    # Euclidean distance from the best location to every location
    dist = cdist(coord, location_xyz)[0]
    # the TOP locations closest to the best one (the best one itself has distance 0)
    top_idx = dist.argsort()[:TOP]
    # +1 as the location numbering starts from 1
    top_idx = [int(e) + 1 for e in top_idx]
    indices.append(top_idx)

# submission file
dout = 'sc_60genes.csv'
# number of gene names written on each header line
GENES_PER_ROW = 10
with open(dout, 'w') as file_result:
    # Header: the selected genes, GENES_PER_ROW per line, each line starting
    # with an empty first field. Stepping through slices also writes a final
    # shorter line when len(genes) is not a multiple of GENES_PER_ROW, so no
    # gene is dropped.
    for start in range(0, len(genes), GENES_PER_ROW):
        header_row = [''] + list(genes[start:start + GENES_PER_ROW])
        file_result.write(','.join(header_row) + '\n')
    # Body: one line per cell -- its 1-based number followed by the selected
    # location indices.
    for cell_no, cell_locations in enumerate(indices, start=1):
        row = [cell_no] + list(cell_locations)
        file_result.write(','.join(map(str, row)) + '\n')