In [1]:
from sklearn.neighbors import NearestNeighbors as KNN
from skimage.filters import gaussian, threshold_otsu
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tifffile
from math import sqrt


In [2]:
def lowercase(sample):
    """Return the lowercase form of a sample name used in file names.

    The name "Well01OB" is special-cased and maps to "well01OB" (the
    trailing "OB" keeps its capitals); every other name is passed through
    ``str.lower()``.

    Note: the mapping is not idempotent. Feeding the result "well01OB" back
    in does not hit the special case and yields "well01ob".
    """
    return "well01OB" if sample == "Well01OB" else sample.lower()

def sagittal(sample):
    """Return True when the sample name begins with the prefix "Sa".

    The check is case-sensitive, so it must be given the original-case
    sample name (e.g. "Sagittal2"), not an already lowercased one.
    """
    return sample.startswith("Sa")


In [3]:
def valid(x, y):
    """Look up the coordinate pair (x, y) in the module-level ``valid_backend``.

    Returns the value stored under the key ``(x, y)`` when present, and
    False otherwise. ``valid_backend`` is not defined in this cell; it must
    already exist at module level when this function is called.
    """
    coordinate = (x, y)
    if coordinate in valid_backend:
        return valid_backend[coordinate]
    return False


In [50]:
# Per-sample pipeline. For every sample name in the list below this loop:
#   1. reads the racRNA spot table and the reference spot table,
#   2. gives each racRNA spot the `cellid_idx` of its nearest reference spot
#      (or -1 when the spot's coordinates are not in the 2D-mask table),
#   3. counts racRNA spots per cell (per cell and gene for sagittal samples),
#   4. writes the classified spot table and the per-cell table to CSV.
for sample in ['Sagittal2','Sagittal3','SpinalCord','Well01brain','Well01OB','Well1_5',
               'Well2_5','Well03','Well3_5','Well04','Well05','Well06','Well07','Well7_5',
               'Well08','Well09','Well10','Well10_5','Well11']:
    # Must be evaluated on the original-case name: sagittal() is case-sensitive
    # and `sample` is rebound to its lowercase form further down.
    is_sagittal = sagittal(sample)
    # Input and output paths. Directory/output names use the original-case
    # sample name; the spot-meta file name uses the lowercase() form.
    racrna_file = f'racRNA_data/{sample}/racRNA_spot_meta_all.csv'
    spot_file = f'racRNA_data/{sample}/{lowercase(sample)}_spot_meta.csv'
    racrna_output_file = f'output/classified_racrna_dapi/{sample}_classified_racrna.csv'
    cell_output_file = f'output/cell_info_dapi/{sample}_info_by_cell.csv'


    racrna_data = pd.read_csv(racrna_file)
    spot_data = pd.read_csv(spot_file)

    # Rebinds the loop variable: from here on `sample` is the lowercase() form.
    # It is used in the progress print and to filter pd_tissue.csv below.
    sample = lowercase(sample)
    
    # X: coordinates of the reference spots; Y: the cell index of each one.
    # Xpred: coordinates of the racRNA spots to be assigned.
    X = spot_data[['spot_merged_1', 'spot_merged_2']].to_numpy()
    Y = spot_data['cellid_idx']
    Xpred = racrna_data[['spot_merged_1', 'spot_merged_2']].to_numpy()

    
    
    # Lookup table read by valid(). This statement runs at cell top level, so
    # it rebinds the module-level name that valid() refers to, once per sample.
    valid_backend = {}

    # The condition is constant, so the mask table is always loaded.
    # NOTE(review): looks like a leftover on/off toggle — confirm.
    if True:
        # NOTE(review): `sample` was already passed through lowercase() above,
        # so it is applied a second time here. For 'Well01OB' this gives
        # 'well01ob', whereas spot_file above used 'well01OB' — confirm which
        # spelling the mask file on disk actually has.
        df_filtered = pd.read_csv(f'racRNA_data/2D_mask/{lowercase(sample)}_racRNA_spot_meta_all.csv')
        # Register every (x, y) coordinate pair from the mask table as valid.
        for x, y in zip(df_filtered['spot_merged_1'], df_filtered['spot_merged_2']):
            valid_backend[(x, y)] = True
                     

    # Nearest-neighbour search: for each racRNA spot, find the single closest
    # reference spot. `distances` is not used anywhere else in this loop body.
    knn = KNN(n_neighbors=1).fit(X)
    distances, indices = knn.kneighbors(Xpred)
    
    
    # Build the per-spot cell assignment: the nearest reference spot's
    # cellid_idx when the spot's coordinates are in valid_backend, else -1.
    n_pred = Xpred.shape[0]
    cell_type = []
    for i in range(n_pred):
        if valid(Xpred[i,0], Xpred[i,1]):
            # Y is a pandas Series, so this is a lookup by index label.
            # Presumably spot_data has the default 0..n-1 index, which makes the
            # label equal to the row position returned by kneighbors — verify.
            cell_type.append(Y[indices[i][0]])
        else:
            cell_type.append(-1)
            
    # Progress line: total spots, and how many did not receive the -1 sentinel.
    print(sample, f"Of {len(cell_type)} racRNA spots, {len(cell_type) - int(np.sum(np.array(cell_type) == -1))} were placed into a cell.")

    # Store the assignment on the racRNA table (this is what gets written out).
    racrna_data.loc[:,'cellid_idx'] = cell_type
    # The path does not depend on the sample, so this file is re-read on every
    # iteration of the loop.
    ctype_file = './pd_tissue.csv'

    
    # Keep only this sample's rows (matched against the lowercase sample name)
    # and only the 'orginindex' column, cast to int.
    ctype_data = pd.read_csv(ctype_file, low_memory=False)[['orginindex', 'sample']]
    ctype_data = ctype_data.loc[ctype_data['sample'] == sample].reset_index(drop=True)[['orginindex']]
    ctype_data['orginindex'] = ctype_data['orginindex'].astype(int)

    

    # When False, the "Found unknown cell" message below is never printed.
    verbosity = False

    # Counter size is the largest 'orginindex' for this sample plus one.
    n_cells = np.max(ctype_data['orginindex']) + 1
    # Sagittal samples: one counter column per gene (4 columns).
    # Other samples: a single total per cell.
    if is_sagittal:
        racrna_counter = np.zeros((n_cells, 4))
    else:
        racrna_counter = np.zeros(n_cells)
    cellids = racrna_data['cellid_idx']
    geneids = racrna_data['geneid']
    # Tally each assigned spot into its cell's counter.
    for i in range(racrna_data.shape[0]):
        cellid_idx = cellids[i]
        geneid = geneids[i]
        # Skip unassigned spots (-1) and cell indices beyond the counter size.
        if cellid_idx != -1 and cellid_idx < n_cells:
            if is_sagittal:
                # Column is geneid-1 — assumes geneid takes values 1..4; TODO confirm.
                racrna_counter[cellid_idx][geneid-1] += 1
            else:
                racrna_counter[cellid_idx] += 1
        elif cellid_idx >= n_cells and verbosity:
            print("Found unknown cell: ", cellid_idx, f"(gene ID {geneid})") # some (usually <= 5) cells were not in cell type data

            
            

    # Column-wise accumulator for the per-cell output table.
    to_df = {
        'cellid_idx': [],
        'racRNA uptake': []
    }

    if is_sagittal:
        to_df['geneid'] = []

    # Emit rows only for cells with at least one counted spot. For sagittal
    # samples such a cell gets one row for each of the 4 gene columns
    # (including columns whose count is zero); otherwise one row per cell.
    for idx in range(n_cells):
        if np.sum(racrna_counter[idx]) > 0:
            if is_sagittal:
                for i in range(4):
                    to_df['cellid_idx'].append(idx)
                    to_df['racRNA uptake'].append(racrna_counter[idx,i])
                    to_df['geneid'].append(i+1)
            else:
                to_df['cellid_idx'].append(idx)
                to_df['racRNA uptake'].append(racrna_counter[idx])


    cellinfo_df = pd.DataFrame(to_df)

    # Both tables are written with to_csv's default arguments.
    racrna_data.to_csv(racrna_output_file)
    cellinfo_df.to_csv(cell_output_file)


well08 Of 14589887 racRNA spots, 1849095 were placed into a cell.
