In [None]:
import defopt
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from tqdm import tqdm
import time

import numpy as np
import pandas as pd
import re
import scipy.sparse as ss
import h5py


In [None]:
def filter_genes(gene_names):
    """
    Flag the genes to keep, dropping gene models and similar entries.

    A gene is dropped when its name matches any of these patterns:
        - ends in "Rik"
        - contains "Gm" immediately followed by a digit
        - contains "LOC" immediately followed by a digit
        - consists of exactly two capital letters followed only by digits
          (e.g. "AA123", or just "AI")

    Args:
        gene_names: iterable of gene name strings (e.g. a list or a
            pandas.Series)

    Returns:
        keep_genes: boolean numpy.ndarray with one entry per gene name,
            True where the gene should be kept

    """
    # Raw strings: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    exclude_patterns = [
        re.compile(r'Rik$'),
        re.compile(r'Gm\d'),
        re.compile(r'LOC\d'),
        re.compile(r'^[A-Z]{2}\d*$'),
    ]
    # TODO: optionally also drop genes whose maximum cluster-mean read count
    # is below a threshold (needs the cluster means, which are not available
    # at this point).
    keep_genes = np.array(
        [
            not any(pattern.search(name) for pattern in exclude_patterns)
            for name in gene_names
        ],
        dtype=bool,
    )
    return keep_genes

def load_data_tasic_2018(datapath, filter_neurons=True):
    """
    Load the scRNAseq data from Tasic et al., "Shared and distinct
    transcriptomic cell types across neocortical areas", Nature, 2018.

    Three CSV files are read from `datapath`:
    mouse_VISp_2018-06-14_samples-columns.csv (sample metadata),
    mouse_VISp_2018-06-14_exon-matrix.csv (read counts, one row per gene) and
    mouse_VISp_2018-06-14_genes-rows.csv (gene annotations).

    Args:
        datapath: directory containing the data files; a trailing path
            separator is optional
        filter_neurons: if True, only keep samples whose 'class' is
            'GABAergic' or 'Glutamatergic'

    Returns:
        exons_subset: pandas.DataFrame indexed by sample name, holding the
            metadata columns followed by one read-count column per retained
            gene. Gene columns are labelled with the integer row position of
            the gene in the exon matrix file.
        gene_names: pandas.Series of the gene symbols that passed
            filter_genes, in the same order as the gene columns

    """
    # Local import: `os` is not among the notebook-level imports.
    import os

    # os.path.join keeps working whether or not datapath ends in a separator;
    # plain string concatenation silently produced a wrong file name without it.
    fname_metadata = os.path.join(datapath, 'mouse_VISp_2018-06-14_samples-columns.csv')
    fname = os.path.join(datapath, 'mouse_VISp_2018-06-14_exon-matrix.csv')
    fname_genes = os.path.join(datapath, 'mouse_VISp_2018-06-14_genes-rows.csv')

    metadata = pd.read_csv(fname_metadata, low_memory=False).set_index('sample_name')
    exons = pd.read_csv(fname, low_memory=False)
    genes = pd.read_csv(fname_genes, low_memory=False)
    gene_names = genes['gene_symbol']

    # Drop gene models etc.; the same positional mask is applied to the exon
    # matrix rows and to the gene names so the two stay aligned.
    keep_genes = filter_genes(gene_names)
    exons = exons.iloc[keep_genes]
    gene_names = gene_names.iloc[keep_genes]

    # Transpose so that samples become rows, then attach to the metadata.
    exons_df = metadata.join(exons.T, on='sample_name')

    # only include neurons
    include_classes = [
        'GABAergic',
        'Glutamatergic'
    ]
    if filter_neurons:
        exons_subset = exons_df[exons_df['class'].isin(include_classes)]
    else:
        exons_subset = exons_df

    # get rid of low quality cells etc
    # NOTE(review): 'ALM' presumably excludes clusters specific to another
    # brain area - confirm against the dataset documentation.
    for unwanted in ('ALM', 'Doublet', 'Batch', 'Low Quality'):
        exons_subset = exons_subset[~exons_subset['cluster'].str.contains(unwanted)]
    exons_subset = exons_subset[~exons_subset['subclass'].str.contains('High Intronic')]

    return exons_subset, gene_names

def compute_means(exons_df, classify_by, gene_filter=r'\d'):
    """
    Compute mean expression per group and the group index of every cell.

    Args:
        exons_df: pandas.DataFrame with one row per cell, containing the
            grouping column and the expression columns
        classify_by: name of the column of exons_df used to group the cells
        gene_filter: regular expression selecting the expression columns by
            name. The default matches any name containing a digit (names of
            columns containing expression data are integer numbers).

    Returns:
        exons_matrix: n cells x n genes matrix (numpy.ndarray) of the
            selected expression columns
        cluster_ids: integer numpy.ndarray giving, for each cell, the row of
            cluster_means that its group occupies
        cluster_means: n clusters x n genes matrix (numpy.ndarray) of the
            mean of each expression column within each group
        cluster_labels: pandas.Index of group names, in the row order of
            cluster_means

    Raises:
        ValueError: if a cell's group label is not among the computed groups
            (e.g. the label is missing)

    """
    expression = exons_df.filter(regex=gene_filter)
    exons_matrix = expression.to_numpy()  # n cells x n genes matrix

    # Select the expression columns *before* averaging: taking the mean over
    # the whole frame fails on pandas >= 2.0 when it contains non-numeric
    # (metadata) columns.
    expression_by_cluster = expression.groupby(exons_df[classify_by]).mean()
    cluster_means = expression_by_cluster.to_numpy()  # n clusters x n genes matrix
    cluster_labels = expression_by_cluster.index

    # Position of each cell's label within the group index (-1 if absent).
    cluster_ids = cluster_labels.get_indexer(exons_df[classify_by]).astype(int)
    if (cluster_ids < 0).any():
        raise ValueError(
            f"some rows have a '{classify_by}' value that does not belong to any group"
        )
    return exons_matrix, cluster_ids, cluster_means, cluster_labels


In [None]:
# Load the Tasic et al. 2018 data (neurons only), then derive the per-cluster
# mean expression; cluster_means is an n clusters x n genes matrix.
# NOTE: the data directory is an absolute, machine-specific path.
exons_subset, gene_names = load_data_tasic_2018(
    'C:/Users/Alex/PhD_data/allen2018/',
    filter_neurons=True,
)
(
    exons_matrix,
    cluster_ids,
    cluster_means,
    cluster_labels,
) = compute_means(exons_subset, classify_by='cluster')

In [None]:

# Highest mean read count of every gene across all clusters.
# cluster_means is n clusters x n genes, so the maximum is taken over the
# cluster axis. (The previous nested loop only stopped correctly when the
# number of genes exceeded a hard-coded 23198; otherwise it ran past the last
# column and raised IndexError.)
max_cluster_means = list(cluster_means.max(axis=0))

genes = gene_names.tolist()
# Stacking names with numbers yields a two-column *string* array:
# column 0 is the gene name, column 1 the maximum cluster mean as text.
gene_counts = np.column_stack((genes, max_cluster_means))

np.savetxt("C:/Users/Alex/PhD_data/allen2018/cluster_means.csv", gene_counts, delimiter=",", fmt = '%s')

# NOTE(review): the meaning of 38748 is not stated anywhere in this notebook -
# confirm what this numerator represents.
PADLOCK_NUMERATOR = 38748
# Genes needing more than this many padlocks are discarded below.
MAX_PADLOCKS_REQUIRED = 100

# Drop genes that are not expressed in any cluster (also avoids dividing by zero).
max_means = gene_counts[:, 1].astype('float64')
is_expressed = max_means != 0
filtered_gene_counts = gene_counts[is_expressed]
padlocks_required = np.column_stack(
    (filtered_gene_counts[:, 0], PADLOCK_NUMERATOR / max_means[is_expressed])
)
np.savetxt("C:/Users/Alex/PhD_data/allen2018/padlocks_required.csv", padlocks_required, delimiter=",", fmt = '%s')

# Keep only the genes whose requirement does not exceed the limit.
padlock_values = padlocks_required[:, 1].astype('float64')
filtered_padlocks_required = padlocks_required[~(padlock_values > MAX_PADLOCKS_REQUIRED)]
np.savetxt("C:/Users/Alex/PhD_data/allen2018/filtered_padlocks_required.csv", filtered_padlocks_required, delimiter=",", fmt = '%s')

In [None]:
picked_genes = ['Aqp4', 'Arpp21', 'Brinp3', 'Calb1', 'Calb2', 'Cartpt', 'Cck', 'Cd24a',
       'Cdh13', 'Chn2', 'Chodl', 'Chrm2', 'Cnr1', 'Cplx3', 'Cpne6', 'Crh', 'Cryab',
       'Cxcl12', 'Cxcl14', 'Dkk3', 'Enpp2', 'Etv1', 'Fst', 'Gabra1',
       'Gad1', 'Gap43', 'Gpx3', 'Hpcal4', 'Id2', 'Igfbp4', 'Itm2c',
       'Kcnab1', 'Lamp5', 'Lypd6', 'Lypd6b', 'Marcksl1', 'Mdh1', 'Myl4',
       'Ncald', 'Nefl', 'Nnat', 'Nov', 'Npy', 'Nr4a2', 'Nrep', 'Nrsn1',
       'Olfm3', 'Pantr1', 'Pcdh8', 'Pcp4', 'Pcp4l1', 'Pde1a', 'Pdyn',
       'Penk', 'Prss23', 'Ptn', 'Pvalb', 'Rab3b', 'Rbp4', 'Rcan2', 'Reln',
       'Rgs10', 'Rgs4', 'Rspo1', 'Scg2', 'Serpine2', 'Serpini1','Slc1a3', 'Snca',
       'Sncg', 'Sparcl1', 'Spock3', 'Spon1', 'Sst', 'Stmn2', 'Stxbp6', 'Synpr',
       'Tac2', 'Thsd7a', 'Vip']

# Look up each picked gene in gene_counts (column 0: gene name, column 1: its
# maximum cluster mean as text). A dictionary keyed on the name column avoids
# scanning both columns for every gene and makes absent genes explicit instead
# of failing on an empty index array.
count_by_gene = dict(zip(gene_counts[:, 0], gene_counts[:, 1]))
missing_genes = [name for name in picked_genes if name not in count_by_gene]
if missing_genes:
    raise KeyError(f'picked genes not found in gene_counts: {missing_genes}')

picked_gene_counts = np.column_stack(
    (picked_genes, [count_by_gene[name] for name in picked_genes])
)
np.savetxt("C:/Users/Alex/PhD_data/allen2018/picked_gene_counts.csv", picked_gene_counts, delimiter=",", fmt = '%s')

In [None]:
barseq_genes = ['Calb1','Car3', 'Ccn2','Cdh10','Cdh11','Cdh12','Cdh13','Cdh15','Cdh18','Cdh20','Cdh22','Cdh24','Cdh4','Cdh6','Cdh8','Cdh9','Col11a1',
    'Col19a1','Coro6','Cplx3','Enpp2','Fbxl7','Fezf2','Foxp2','Gad1','Galnt14','Galntl6','Gfra1','Grik1','Hgf','Hs3st4','Il1rapl2','Inpp4b','Kcnip1','Kcnn2','Lpp','Nr4a2','Nrp1','Nxph1','Oprk1','Otof','Pcdh11x','Pcdh17',
    'Pcdh19','Pcdh20','Pcdh7','Pcdh8','Pcdh9','Rab3c','Rasgrf2','Rasl10a','Rorb','Sdk1','Slc17a7','Slc24a3','Slc30a3','Sorcs3','Svil','Synpr','Tafa1','Tafa2','Tle4','Tshz2','Vwc2l','Zfpm2']

# Map gene name -> maximum cluster mean (as text) from the two-column
# gene_counts array, then report every requested gene that is absent in one
# clear error rather than failing on the first empty lookup.
barseq_count_lookup = {name: value for name, value in zip(gene_counts[:, 0], gene_counts[:, 1])}
barseq_missing = [name for name in barseq_genes if name not in barseq_count_lookup]
if barseq_missing:
    raise KeyError(f'barseq genes not found in gene_counts: {barseq_missing}')

barseq_picked_gene_counts = np.column_stack(
    (barseq_genes, [barseq_count_lookup[name] for name in barseq_genes])
)
np.savetxt("C:/Users/Alex/PhD_data/allen2018/barseq_picked_gene_counts.csv", barseq_picked_gene_counts, delimiter=",", fmt = '%s')