In [1]:
%matplotlib notebook

from collections import Counter
import dill
import glob
import igraph as ig
import itertools
import leidenalg
#import magic
import matplotlib
from matplotlib import pyplot
import numba
import numpy
import os
import pickle
from plumbum import local
import random
import re
import scipy
from scipy.cluster import hierarchy
import scipy.sparse as sps
from scipy.spatial import distance
import scipy.stats as stats
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn import neighbors
from sklearn import metrics
import sys
import umap

#from plotly import tools
#import plotly.offline as py
#import plotly.graph_objs as go
#py.init_notebook_mode(connected=True)



In [2]:
def find_nearest_genes(peak_files, out_subdir, refseq_exon_bed):
    """Write one ``*.nearest_genes.txt`` file per peak BED file.

    Every path in ``peak_files`` (visited in sorted order) is passed through
    ``bedtools closest -D b -io -id`` against ``refseq_exon_bed``.  Hits whose
    distance column is not greater than -1200 are discarded, the remaining
    hits are sorted by gene name and then by descending peak score, and only
    the first (i.e. highest scoring) line of each gene is written.

    Parameters
    ----------
    peak_files : iterable of str
        Paths of the peak BED files to annotate.
    out_subdir : str
        Existing directory that receives the output files.  Each output is
        named after its input with ``.bed`` replaced by
        ``.nearest_genes.txt``.
    refseq_exon_bed : str
        BED file handed to ``bedtools closest`` as the ``-b`` argument.

    Returns
    -------
    list of str
        Paths of the files written, in the order they were processed.  Each
        line of a file is ``<gene name>\\t<peak score>``.
    """
    #get unix utilities
    bedtools, sort, cut, awk = local['bedtools'], local['sort'], local['cut'], local['awk']

    #process the peak files to find nearest genes
    nearest_genes = []
    for path in sorted(peak_files):
        out_path = os.path.join(out_subdir, os.path.basename(path).replace('.bed', '.nearest_genes.txt'))
        cmd = (bedtools['closest', '-D', 'b', '-io', '-id', '-a', path, '-b', refseq_exon_bed] |
         cut['-f1,2,3,5,9,12'] | #fields are chrom, start, stop, peak sum, gene name, distance
         awk['BEGIN{OFS="\t"}{if($6 > -1200){print($1, $2, $3, $6, $5, $4);}}'] |
         sort['-k5,5', '-k6,6nr'] |
         cut['-f5,6'])()

        # Lines arrive grouped by gene with the best score first, so keeping
        # the first line of every group keeps the best hit per gene.
        kept_lines = []
        prev_gene = None
        for line in str(cmd).strip().split('\n'):
            fields = line.split()
            if not fields:
                # bedtools produced no usable hit (empty output / blank line)
                continue
            # Compare whole gene names: a prefix test would wrongly treat
            # e.g. 'abc-10' as a repeat of a preceding 'abc-1'.
            if fields[0] != prev_gene:
                prev_gene = fields[0]
                kept_lines.append(line + '\n')

        with open(out_path, 'w') as out:
            out.writelines(kept_lines)
        nearest_genes.append(out_path)
    return nearest_genes

def load_expr_db(db_path):
    """Read an expression table and return ``(headers, values)``.

    The first line of the file is the header row; all following lines are
    loaded with ``numpy.loadtxt`` as an object array.  A file named exactly
    ``RepAvgGeneTPM.csv`` is parsed as comma separated and has its first
    column removed from both the headers and the values; any other file is
    parsed as tab separated and kept whole.

    The shape of the loaded value array is printed before returning.
    """
    is_rep_avg_csv = os.path.basename(db_path) == 'RepAvgGeneTPM.csv'
    delimiter = ',' if is_rep_avg_csv else '\t'

    with open(db_path) as handle:
        db_headers = handle.readline().strip().split(delimiter)
    db_vals = numpy.loadtxt(db_path, delimiter=delimiter, skiprows=1, dtype=object)

    if is_rep_avg_csv:
        # the csv carries one extra leading column that is not wanted
        db_headers = db_headers[1:]
        db_vals = db_vals[:, 1:]

    print('Loaded DB shape: {!s}'.format(db_vals.shape))
    return (db_headers, db_vals)

TOPN=500
def get_gene_data(genes_path, gene_expr_db, topn=TOPN):
    """Look up the expression rows of the highest weighted genes of a list.

    Parameters
    ----------
    genes_path : str or list
        Either a path to a whitespace separated file whose first two columns
        are gene name and weight, or an already loaded list of
        ``[name, weight, ...]`` entries.  Blank lines in a file are ignored.
    gene_expr_db : numpy.ndarray
        2-D array whose first column holds gene names and whose remaining
        columns hold values convertible to float.
    topn : int
        Maximum number of genes (after ranking by descending weight and
        dropping genes absent from the DB) to keep.

    Returns
    -------
    tuple
        ``(gene_idx, gene_data, gene_norm, n_listed, gene_weights)`` where
        ``gene_idx`` is the list of DB row indices of the kept genes,
        ``gene_data`` their expression values as floats, ``gene_norm`` the
        same rows divided by their row sum (plus 1e-8), ``n_listed`` the
        number of genes in the input list and ``gene_weights`` the weights of
        the kept genes as a float array.  When no listed gene is present in
        the DB the arrays are empty (zero rows) rather than an error.
    """
    if isinstance(genes_path, list):
        genes_list = genes_path
    else:
        with open(genes_path) as lines_in:
            split_lines = (line.split() for line in lines_in)
            # blank lines would otherwise produce entries without a name
            genes_list = [fields[:2] for fields in split_lines if fields]

    # Row of the first occurrence of every gene name, built once so that each
    # lookup does not have to scan the whole name column.
    first_row_of = {}
    for row, name in enumerate(gene_expr_db[:,0]):
        first_row_of.setdefault(name, row)

    # sorted() is stable, so equally weighted genes keep their input order
    ranked = sorted(genes_list, key=lambda entry: float(entry[1]), reverse=True)
    matched = [(first_row_of[entry[0]], entry[1]) for entry in ranked
               if entry[0] in first_row_of][:topn]

    gene_idx = [row for row, _ in matched]
    gene_weights = numpy.array([weight for _, weight in matched], dtype=float)
    # Convert only the selected rows; an explicit int dtype keeps the index
    # valid when nothing matched.
    gene_data = gene_expr_db[numpy.array(gene_idx, dtype=int), 1:].astype(float)
    denom = numpy.sum(gene_data, axis=1)[:,None] + 1e-8
    gene_norm = gene_data/denom
    return gene_idx, gene_data, gene_norm, len(genes_list), gene_weights

def sample_db(data_norm, expr_db, data_weights=None, nsamples=1000, seed=15321):
    """Compare a gene set's expression profile against random gene sets.

    ``nsamples`` random sets of ``data_norm.shape[0]`` rows are drawn from
    ``expr_db``.  Each set is normalised per row (row divided by its sum plus
    1e-8) and averaged over rows; the log2 ratio between the (optionally
    weighted) column average of ``data_norm`` and that random profile is one
    sample.

    Parameters
    ----------
    data_norm : numpy.ndarray
        2-D array of row-normalised expression values for the genes of
        interest (rows are genes).
    expr_db : numpy.ndarray
        2-D array; the first column is skipped and the remaining columns must
        be convertible to float.
    data_weights : array-like or None
        Per-gene weights for averaging ``data_norm``; ``None`` gives a plain
        mean.
    nsamples : int
        Number of random gene sets to draw.
    seed : int
        Seed of the ``numpy.random.RandomState`` used for drawing.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The per-column mean of the samples and a ``(2, n_columns)`` array
        holding the distances from that mean to the lower and upper bound of
        the 95% t-interval of the mean.
    """
    rs = numpy.random.RandomState(seed)
    random_subset = numpy.arange(expr_db.shape[0])
    num_to_select = data_norm.shape[0]

    # Neither of these depends on the random draw, so compute them once.
    # The weights come from the ``data_weights`` argument.
    target_profile = numpy.average(data_norm, axis=0, weights=data_weights)
    db_values = expr_db[:,1:].astype(float)

    samples = []
    for _ in range(nsamples):
        rs.shuffle(random_subset)
        db_subset = db_values[random_subset[:num_to_select]]
        denom = numpy.sum(db_subset, axis=1)[:,None] + 1e-8
        db_subset_norm = numpy.mean(db_subset/denom, axis=0)
        samples.append(numpy.log2(target_profile/db_subset_norm))
    samples = numpy.vstack(samples)

    samples_mean = numpy.mean(samples, axis=0)
    samples_sem = stats.sem(samples, axis=0)
    lower, upper = stats.t.interval(0.95, samples.shape[0]-1,
                                    loc=samples_mean, scale=samples_sem)
    # express the interval as offsets from the mean (the form used for yerr)
    conf_int = numpy.vstack([samples_mean - lower, upper - samples_mean])
    return samples_mean, conf_int

def plot_l2_tissues(nearest_genes_glob, refdata, expr_db=None, expr_db_headers=None, ncols=3, 
                    topn=TOPN, weights=False, nsamples=100, savefile=None, display_in_notebook=True):
    """Bar-plot L2 tissue enrichment for every gene list matching a glob.

    One panel is drawn per gene list.  Each bar is the mean log2 ratio,
    returned by ``sample_db``, between the gene list's expression profile and
    randomly drawn gene sets, with the interval returned by ``sample_db`` as
    error bars.

    Parameters
    ----------
    nearest_genes_glob : str
        Glob pattern of the gene list files.  Files named ``peaks<N>...`` or
        ``topic.rank<N>...`` are ordered numerically by ``N``; anything else
        is ordered by the part of the file name before the first dot.
    refdata : str
        Directory containing ``gexplore_l2_tissue_expr.txt``; only used when
        ``expr_db`` is None.
    expr_db, expr_db_headers :
        Pre-loaded expression table and its headers, as returned by
        ``load_expr_db``.
    ncols : int
        Number of panels per row.
    topn : int
        Passed to ``get_gene_data``.
    weights : bool
        When True the per-gene weights from the gene list are used for the
        average; otherwise a plain mean is used.
    nsamples : int
        Number of random gene sets drawn by ``sample_db``.
    savefile : str or None
        When given, the figure is saved to this path.
    display_in_notebook : bool
        When True, ``fig.tight_layout()`` is applied.

    Notes
    -----
    Updates ``pyplot.rcParams`` (tick label sizes and padding) as a side
    effect.  The bar colours assume the DB columns follow the order of the
    tissue list below -- TODO confirm against the DB header.
    """
    if expr_db is None:
        #Get all L2 tissue expression data to normalize the distribution of genes from peaks
        l2_tissue_db_path = os.path.join(refdata,'gexplore_l2_tissue_expr.txt')
        expr_db_headers, expr_db = load_expr_db(l2_tissue_db_path)

    gene_lists = glob.glob(nearest_genes_glob)
    first_name = os.path.basename(gene_lists[0])
    if first_name.startswith('peaks'):
        gene_lists.sort(key=lambda x:int(os.path.basename(x).split('.')[0].replace('peaks', '')))
    elif first_name.startswith('topic'):
        gene_lists.sort(key=lambda x:int(os.path.basename(x).split('.')[1].replace('rank', '')))
    else:
        gene_lists.sort(key=lambda x:os.path.basename(x).split('.')[0])
    gene_list_data = [(os.path.basename(path).split('.')[0], get_gene_data(path, expr_db, topn=topn)) for path in gene_lists]
    print('\n'.join(['{!s} nearest genes: found {!s} out of {!s} total'.format(fname, data.shape[0], gene_list_len)
                    for (fname, (data_idx, data, data_norm, gene_list_len, gene_weights)) in gene_list_data]))

    # (tissue name, bar colour); tissues of one group share a colour
    l2_tissue_colors = [('Body wall muscle', '#e51a1e'),
                        ('Intestinal/rectal muscle', '#e51a1e'),
                        ('Pharyngeal muscle', '#377db8'),
                        ('Pharyngeal epithelia', '#377db8'),
                        ('Pharyngeal gland', '#377db8'),
                        ('Seam cells', '#4eae4a'),
                        ('Non-seam hypodermis', '#4eae4a'),
                        ('Rectum', '#4eae4a'),
                        ('Ciliated sensory neurons', '#984ea3'),
                        ('Oxygen sensory neurons', '#984ea3'),
                        ('Touch receptor neurons', '#984ea3'),
                        ('Cholinergic neurons', '#984ea3'),
                        ('GABAergic neurons', '#984ea3'),
                        ('Pharyngeal neurons', '#984ea3'),
                        ('flp-1(+) interneurons', '#984ea3'),
                        ('Other interneurons', '#984ea3'),
                        ('Canal associated neurons', '#984ea3'),
                        ('Am/PH sheath cells', '#ff8000'),
                        ('Socket cells', '#ff8000'),
                        ('Excretory cells', '#ff8000'),
                        ('Intestine', '#fcd800'),
                        ('Germline', '#f97fc0'),
                        ('Somatic gonad precursors', '#f97fc0'),
                        ('Distal tip cells', '#f97fc0'),
                        ('Vulval precursors', '#f97fc0'),
                        ('Sex myoblasts', '#f97fc0'),
                        ('Coelomocytes', '#a75629')]

    print('Tissue DB norm shape: {!s}'.format(expr_db.shape))

    pyplot.rcParams.update({'xtick.labelsize':14,
                            'ytick.labelsize':14,
                            'xtick.major.pad':8})

    ind = numpy.arange(len(expr_db_headers) - 1)
    width = 0.66
    axis_fontsize = 18
    title_fontsize = 19
    nrows = int(numpy.ceil(len(gene_list_data)/float(ncols)))
    # squeeze=False keeps ``axes`` two-dimensional for every nrows/ncols
    # combination, so a single row (or column) is indexed like any other grid.
    fig, axes = pyplot.subplots(nrows=nrows, ncols=ncols, figsize=(7 * ncols, 7 * nrows),
                                sharey=True, squeeze=False)
    for panel_idx, (fname, (data_idx, data, data_norm, gene_list_len, gene_weights)) in enumerate(gene_list_data):
        ax = axes[panel_idx//ncols, panel_idx%ncols]
        sample_weights = gene_weights if weights is True else None
        to_plot, errs = sample_db(data_norm, expr_db, data_weights=sample_weights, nsamples=nsamples)
        for tissue_idx, (name, color) in enumerate(l2_tissue_colors):
            ax.bar(ind[tissue_idx], to_plot[tissue_idx], width,
                   yerr=errs[:,tissue_idx][:,None], color=color, label=name)
        ax.axhline(0, color='k')
        ax.set_xlim((-1, len(expr_db_headers)))
        ax.set_title('{!s}\n({!s} genes)\n'.format(fname, data.shape[0]), fontsize=title_fontsize)
        ax.set_ylabel('Log2 ratio of mean expr proportion\n(ATAC targets:Random genes)', fontsize=axis_fontsize)
        ax.set_xlabel('L2 tissues', fontsize=axis_fontsize)
        ax.set_xticks(ind + width/2)
        ax.set_xticklabels([])

    # legend goes on the top-right panel
    axes[0,ncols-1].legend(bbox_to_anchor=[1.0,1.0])

    if display_in_notebook is True:
        fig.tight_layout()
    if savefile is not None:
        fig.savefig(savefile, bbox_inches='tight')

def plot_stages(nearest_genes_glob, refdata, expr_db=None, expr_db_headers=None, ncols=3, topn=TOPN, weights=False):
    """Bar-plot developmental-stage enrichment for every gene list matching a glob.

    One panel is drawn per gene list.  Each bar is the log2 ratio between the
    (optionally weighted) mean row-normalised expression of the listed genes
    and the mean row-normalised expression of all genes in the DB.  Bars are
    coloured by stage group, decided from the column header: embryo (header
    ends with ``m`` or equals ``4-cell``), larva (starts with ``L``), adult
    (contains ``adult``) and dauer (contains ``dauer``).

    Parameters
    ----------
    nearest_genes_glob : str
        Glob pattern of the gene list files.  Files named ``peaks<N>...`` or
        ``topic.rank<N>...`` are ordered numerically by ``N``; anything else
        is ordered by the part of the file name before the first dot.
    refdata : str
        Directory containing ``gexplore_stage_expr.txt``; only used when
        ``expr_db`` is None.
    expr_db, expr_db_headers :
        Pre-loaded expression table and its headers, as returned by
        ``load_expr_db``.
    ncols : int
        Number of panels per row.
    topn : int
        Passed to ``get_gene_data``.
    weights : bool
        When True the per-gene weights from the gene list are used for the
        average; otherwise a plain mean is used.

    Notes
    -----
    Updates ``pyplot.rcParams`` (tick label sizes and padding) as a side
    effect.
    """
    if expr_db is None:
        #Get all stages expression data to normalize the distribution of genes from peaks
        stage_db_path = os.path.join(refdata,'gexplore_stage_expr.txt')
        expr_db_headers, expr_db = load_expr_db(stage_db_path)

    gene_lists = glob.glob(nearest_genes_glob)
    first_name = os.path.basename(gene_lists[0])
    if first_name.startswith('peaks'):
        gene_lists.sort(key=lambda x:int(os.path.basename(x).split('.')[0].replace('peaks', '')))
    elif first_name.startswith('topic'):
        gene_lists.sort(key=lambda x:int(os.path.basename(x).split('.')[1].replace('rank', '')))
    else:
        gene_lists.sort(key=lambda x:os.path.basename(x).split('.')[0])
    gene_list_data = [(os.path.basename(path).split('.')[0], get_gene_data(path, expr_db, topn=topn)) for path in gene_lists]
    print('\n'.join(['{!s} nearest genes: found {!s} out of {!s} total'.format(fname, data.shape[0], gene_list_len)
                    for (fname, (data_idx, data, data_norm, gene_list_len, gene_weights)) in gene_list_data]))

    # Background profile over every gene in the DB.  The mean over all rows
    # does not depend on row order (beyond floating-point rounding), so no
    # shuffling is needed.
    stage_db_all = expr_db[:,1:].astype(float)
    denom = numpy.sum(stage_db_all, axis=1)[:,None] + 1e-8
    stage_db_norm = numpy.mean(stage_db_all/denom, axis=0)
    print('Stage DB norm shape: {!s}'.format(stage_db_norm.shape))

    # Column positions per stage group.  enumerate() gives the true position
    # of every column, also when a header name occurs more than once.
    stage_names = expr_db_headers[1:]
    emb_idx = [pos for pos, name in enumerate(stage_names)
               if name.endswith('m') or name == '4-cell']
    larva_idx = [pos for pos, name in enumerate(stage_names) if name.startswith('L')]
    adult_idx = [pos for pos, name in enumerate(stage_names) if 'adult' in name]
    dauer_idx = [pos for pos, name in enumerate(stage_names) if 'dauer' in name]
    stage_groups = [(emb_idx, 'orange', 'Embryo'),
                    (larva_idx, 'blue', 'Larva'),
                    (adult_idx, 'red', 'Adult'),
                    (dauer_idx, 'green', 'Dauer')]

    pyplot.rcParams.update({'xtick.labelsize':20,
                            'ytick.labelsize':20,
                            'xtick.major.pad':8})

    ind = numpy.arange(len(expr_db_headers) - 1)
    width = 0.66
    axis_fontsize = 25
    title_fontsize = 27
    nrows = int(numpy.ceil(len(gene_list_data)/float(ncols)))
    # squeeze=False keeps ``axes`` two-dimensional for every nrows/ncols
    # combination (including a single column or a single panel).
    fig, axes = pyplot.subplots(nrows=nrows, ncols=ncols, figsize=(7 * ncols, 7 * nrows),
                                sharey=True, squeeze=False)
    for panel_idx, (fname, (data_idx, data, data_norm, gene_list_len, gene_weights)) in enumerate(gene_list_data):
        ax = axes[panel_idx//ncols, panel_idx%ncols]
        average_weights = gene_weights if weights is True else None
        to_plot = numpy.log2(numpy.average(data_norm, axis=0, weights=average_weights)/stage_db_norm)
        for group_idx, color, label in stage_groups:
            ax.bar(ind[group_idx], to_plot[group_idx], width, color=color, label=label)
        ax.axhline(0, color='k')
        ax.set_xlim((-1, len(expr_db_headers)))
        ax.set_title('{!s}\n({!s} genes)\n'.format(fname, data.shape[0]), fontsize=title_fontsize)
        ax.set_ylabel('Log2 Ratio of Mean Expr Proportion\n(ATAC Targets:All Genes)', fontsize=axis_fontsize)
        ax.set_xlabel('Developmental Stage', fontsize=axis_fontsize)
        ax.set_xticks(ind + width/2)
        ax.set_xticklabels([])

    fig.tight_layout()

def leiden_clustering(umap_res, resolution_range=(0,1), random_state=2, kdtree_dist='euclidean',
                      n_neighbors=25):
    """Compute a Leiden resolution profile on a nearest-neighbour graph.

    A KD-tree is built over the rows of ``umap_res`` and every point is
    linked to the ``n_neighbors`` points returned by a k-nearest-neighbour
    query.  Pairs at distance zero (which includes each point's match with
    itself) do not become edges, because the edge list is read from the
    non-zero entries of the sparse distance matrix.  The resulting graph is
    handed to ``leidenalg.Optimiser.resolution_profile`` with
    ``CPMVertexPartition``.

    Parameters
    ----------
    umap_res : numpy.ndarray
        2-D array with one point per row.
    resolution_range : tuple
        Passed through to ``resolution_profile``.
    random_state : int
        Seed given to the optimiser.
    kdtree_dist : str
        Distance metric of the KD-tree.
    n_neighbors : int
        Number of neighbours requested per point (the query point itself is
        among the results).

    Returns
    -------
    The value returned by ``resolution_profile``.  The number of neighbour
    entries and the length of every element of the profile are printed on
    the way.
    """
    n_points = umap_res.shape[0]
    tree = neighbors.KDTree(umap_res, metric=kdtree_dist)
    # one batched query instead of one Python-level query per point
    dist, ind = tree.query(umap_res, k=n_neighbors)
    vals = dist.ravel()
    j = ind.ravel()
    i = numpy.repeat(numpy.arange(n_points), ind.shape[1])
    print(len(vals))
    ginput = sps.csc_matrix((vals, (i, j)), shape=(n_points, n_points))
    sources, targets = ginput.nonzero()
    edgelist = zip(sources.tolist(), targets.tolist())
    G = ig.Graph(edges=list(edgelist))
    optimiser = leidenalg.Optimiser()
    optimiser.set_rng_seed(random_state)
    profile = optimiser.resolution_profile(G, leidenalg.CPMVertexPartition, resolution_range=resolution_range, number_iterations=0)
    print([len(elt) for elt in profile])
    return profile

def _reset_output_dir(out_dir):
    """Leave ``out_dir`` as an existing, empty directory, removing old contents."""
    if os.path.isdir(out_dir):
        local['rm']('-r', out_dir)
    os.makedirs(out_dir)


def _write_sorted_peak_bed(rows, out_dir, cluster_name):
    """Write ``rows`` to ``<out_dir>/peaks<cluster_name>.bed`` sorted by chrom, then start.

    ``rows`` is an iterable of sequences of strings; each sequence becomes
    one tab separated line.  Returns the path of the sorted file.
    """
    # Both names are built from out_dir directly, so a '.tmp' elsewhere in
    # the path cannot change the output location.
    out_tmp = os.path.join(out_dir, 'peaks{!s}.tmp.bed'.format(cluster_name))
    out_path = os.path.join(out_dir, 'peaks{!s}.bed'.format(cluster_name))
    with open(out_tmp, 'w') as out:
        out.write('\n'.join('\t'.join(row) for row in rows) + '\n')
    (local['sort']['-k1,1', '-k2,2n', out_tmp] > out_path)()
    os.remove(out_tmp)
    return out_path


def _map_peaks_and_split_genes(peak_files, out_dir, refseq_exon_bed, uniqueness_threshold):
    """Annotate peak files with nearest genes and split genes into unique/shared.

    Nearest-gene files are produced by ``find_nearest_genes`` inside
    ``<out_dir>/nearest_genes``.  A gene is counted once per line it appears
    on across those files.  For every cluster the genes counted fewer than
    ``uniqueness_threshold`` times are written to
    ``<name>.nearest_genes_lt_<threshold>.txt`` (and their number printed);
    all remaining genes are written, with one score column per cluster
    (0.0 where absent), to ``non-unique_genes_lt_<threshold>.txt``.
    """
    out_subdir = os.path.join(out_dir, 'nearest_genes')
    if not os.path.isdir(out_subdir):
        os.makedirs(out_subdir)
    nearest_genes = find_nearest_genes(peak_files, out_subdir, refseq_exon_bed)

    # one list of [gene, score] entries per cluster; blank lines are skipped
    all_genes = []
    for nearest_genes_path in nearest_genes:
        with open(nearest_genes_path) as lines_in:
            split_lines = (line.split() for line in lines_in)
            all_genes.append([fields for fields in split_lines if fields])

    count_dict = Counter(entry[0] for entry in itertools.chain(*all_genes))

    #print unique genes
    for idx, nearest_genes_path in enumerate(nearest_genes):
        unique_genes = [entry for entry in all_genes[idx] if count_dict[entry[0]] < uniqueness_threshold]
        print(idx, len(unique_genes))
        unique_genes_path = os.path.splitext(nearest_genes_path)[0] + '_lt_{!s}.txt'.format(uniqueness_threshold)
        with open(unique_genes_path, 'w') as out:
            # writelines leaves the file empty (no stray blank line) when
            # the cluster has no unique genes
            out.writelines('\t'.join(entry) + '\n' for entry in unique_genes)

    #print shared genes
    scores_by_cluster = [dict((gene, float(score)) for gene, score in cluster_genes)
                         for cluster_genes in all_genes]
    shared_genes_by_cluster = [[gene_name] + [scores.get(gene_name, 0.0) for scores in scores_by_cluster]
                               for gene_name in sorted(count_dict.keys())
                               if count_dict[gene_name] >= uniqueness_threshold]
    shared_out = os.path.join(out_subdir, 'non-unique_genes_lt_{!s}.txt'.
                              format(uniqueness_threshold))
    numpy.savetxt(shared_out, shared_genes_by_cluster, fmt='%s')


def write_peaks_and_map_to_genes(data_array, row_headers, c_labels, out_dir, refseq_exon_bed, 
                                 uniqueness_threshold=3, num_peaks=1000):
    """Write the top peaks of each cluster (by mean signal) and map them to genes.

    Parameters
    ----------
    data_array : numpy.ndarray
        2-D array indexed as ``[peak, cell]`` (cells are selected along the
        second axis).
    row_headers : numpy.ndarray
        2-D array of strings with one row of BED fields per peak.
    c_labels : numpy.ndarray
        Cluster label of every cell.
    out_dir : str
        Output directory; it is deleted and recreated if it already exists.
    refseq_exon_bed : str
        BED file passed on to ``find_nearest_genes``.
    uniqueness_threshold : int
        Genes found in fewer nearest-gene lists than this count as unique.
    num_peaks : int
        Number of highest scoring peaks written per cluster.

    For each cluster the peaks are scored by their mean over the cluster's
    cells.  A ``chr`` prefix is added to chromosome names that lack it.
    """
    _reset_output_dir(out_dir)

    #write a file of peaks per cluster in bed format
    peak_files = []
    for cluster_name in sorted(set(c_labels)):
        cell_coords = numpy.where(c_labels == cluster_name)[0]
        peak_sums = numpy.mean(data_array[:,cell_coords], axis=1)
        peak_indices = numpy.argsort(peak_sums)[-num_peaks:]
        table = numpy.hstack([row_headers[peak_indices],
                              peak_sums[peak_indices,None].astype(str)])
        rows = [list(row) if row[0].startswith('chr') else ['chr' + row[0]] + list(row[1:])
                for row in table]
        peak_files.append(_write_sorted_peak_bed(rows, out_dir, cluster_name))

    _map_peaks_and_split_genes(peak_files, out_dir, refseq_exon_bed, uniqueness_threshold)
    return


def write_peaks_and_map_to_genes2(data_array, peak_topic_specificity, row_headers, c_labels, out_dir, 
                                  refseq_exon_bed, uniqueness_threshold=3, num_peaks=1000):
    """Write the top peaks of each cluster (by specificity-weighted count) and map them to genes.

    Parameters
    ----------
    data_array : numpy.ndarray
        2-D array indexed as ``[cell, peak]`` (cells are selected along the
        first axis).
    peak_topic_specificity : numpy.ndarray
        One value per peak; it is squared and multiplied with the per-cluster
        peak counts to give the peak score.
    row_headers, c_labels, out_dir, refseq_exon_bed, uniqueness_threshold, num_peaks :
        As for ``write_peaks_and_map_to_genes``.

    Only peaks with a non-zero count in the cluster are candidates.  The
    chromosome names are written as given.
    """
    _reset_output_dir(out_dir)

    #write a file of peaks per cluster in bed format
    peak_files = []
    for cluster_name in sorted(set(c_labels)):
        cell_coords = numpy.where(c_labels == cluster_name)[0]
        peaks_present = numpy.sum(data_array[cell_coords,:], axis=0)
        present_mask = peaks_present.astype(bool)
        peak_scores = (peak_topic_specificity ** 2) * peaks_present
        # rank within the peaks present in this cluster only
        present_scores = peak_scores[present_mask]
        peak_indices = numpy.argsort(present_scores)[-num_peaks:]
        table = numpy.hstack([row_headers[present_mask][peak_indices],
                              present_scores[peak_indices,None].astype(str)])
        peak_files.append(_write_sorted_peak_bed(table, out_dir, cluster_name))

    _map_peaks_and_split_genes(peak_files, out_dir, refseq_exon_bed, uniqueness_threshold)
    return


def write_peaks_and_map_to_genes3(data_array, row_headers, c_labels, out_dir, 
                                  refseq_exon_bed, uniqueness_threshold=3, num_peaks=1000):
    """Write the top peaks of each cluster (by TF-IDF score) and map them to genes.

    Parameters
    ----------
    data_array : numpy.ndarray
        2-D array indexed as ``[cell, peak]`` (cells are selected along the
        first axis).
    row_headers, c_labels, out_dir, refseq_exon_bed, uniqueness_threshold, num_peaks :
        As for ``write_peaks_and_map_to_genes``.

    The counts of all cells of a cluster are summed per peak, the resulting
    cluster-by-peak table is TF-IDF transformed (L2 normalised, smoothed
    IDF), and each cluster's peaks are ranked by that score.  The chromosome
    names are written as given.
    """
    _reset_output_dir(out_dir)

    cluster_names = sorted(set(c_labels))
    agg_clusters = numpy.vstack([numpy.sum(data_array[numpy.where(c_labels == cluster_name)[0]], axis=0)
                                 for cluster_name in cluster_names])
    tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
    agg_clusters_tfidf = tfidf.fit_transform(agg_clusters).toarray()

    #write a file of peaks per cluster in bed format
    peak_files = []
    for idx, cluster_name in enumerate(cluster_names):
        cluster_scores = agg_clusters_tfidf[idx]
        peak_indices = numpy.argsort(cluster_scores)[-num_peaks:]
        table = numpy.hstack([row_headers[peak_indices],
                              cluster_scores[peak_indices,None].astype(str)])
        peak_files.append(_write_sorted_peak_bed(table, out_dir, cluster_name))

    _map_peaks_and_split_genes(peak_files, out_dir, refseq_exon_bed, uniqueness_threshold)
    return

## Peaks model

In [3]:
def _load_bow_matrix(bow_path):
    """Load a sparse count table stored as ``row col value`` integer triplets.

    The first three lines of the file are skipped.  Row and column numbers
    are shifted down by one before the CSR matrix is built.  Returns the raw
    triplet array and the matrix.
    """
    triplets = numpy.loadtxt(bow_path, dtype=int, skiprows=3)
    matrix = sps.csr_matrix((triplets[:,2], (triplets[:,0] - 1, triplets[:,1] - 1)))
    return triplets, matrix

def _load_bed_with_name_column(bed_path):
    """Load a BED file as an object array and append a constant ``'name'`` column."""
    bed_rows = numpy.loadtxt(bed_path, dtype=object)
    return numpy.hstack([bed_rows, numpy.array(['name'] * bed_rows.shape[0])[:,None]])

#read in sc peak table
peaktable_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/filtered_peaks_iqr4.0_low_cells.bow'
peak_data_sparse, peak_data = _load_bow_matrix(peaktable_path)

cell_names_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/filtered_peaks_iqr4.0_low_cells.indextable.txt'
cell_names = numpy.loadtxt(cell_names_path, dtype=object)[:,0]

peak_names_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/filtered_peaks_iqr4.0_low_cells.extra_cols.bed'
peak_row_headers = _load_bed_with_name_column(peak_names_path)
print(peak_data.shape)

#read in the original (all cells, all merged peaks) table the same way
orig_peaktable_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/../all_peaks.merged.allcells.bow'
orig_peak_data_sparse, orig_peak_data = _load_bow_matrix(orig_peaktable_path)

orig_cell_names_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/../all_peaks.merged.allcells.zeros_filtered.indextable.txt'
orig_cell_names = numpy.loadtxt(orig_cell_names_path, dtype=object)[:,0]

orig_peak_names_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/../all_peaks.merged.allcells.zeros_filtered.bed'
orig_peak_row_headers = _load_bed_with_name_column(orig_peak_names_path)
print(orig_peak_data.shape)

#make a cells x peaks array
#NOTE: the dense arrays are stored as int8, so counts above 127 would not fit
peak_data_array = peak_data.toarray().astype(numpy.int8)
print(peak_data_array.shape)
del(peak_data)

orig_peak_data_array = orig_peak_data.toarray().astype(numpy.int8)
print(orig_peak_data_array.shape)
del(orig_peak_data)

(1504, 8432)
(1571, 10216)
(1504, 8432)
(1571, 10216)


In [4]:
# For each cell in the filtered table, find the row of its first occurrence in
# the original cell list, then reorder/subset the original matrix so that its
# rows line up with cell_names. A missing name raises IndexError.
cell_idx_map = [numpy.flatnonzero(orig_cell_names == name)[0] for name in cell_names]
orig_peak_data_array_filtered = orig_peak_data_array[cell_idx_map, :]
print(orig_peak_data_array_filtered.shape)

(1504, 10216)


In [5]:
# Median of the per-row totals of the filtered cells x peaks array.
numpy.median(peak_data_array.sum(axis=1))

89.0

In [6]:
# Median of the per-column totals of the filtered cells x peaks array.
numpy.median(peak_data_array.sum(axis=0))

13.0

In [7]:
# Largest single entry in the filtered array.
peak_data_array.max()

1

## Analysis functions

In [8]:
import gzip

# Gzipped BED annotation used to place genes relative to peaks.
REFDATA = 'ATAC_sequencing/2018_worm_atac/ref_data/WS235'
refseq_exon_bed = os.path.join(REFDATA, 'c_elegans.WS272.canonical_geneset.genes.common_names.sorted.bed.gz')

# True when the peak table names chromosomes with a 'chr' prefix; annotation
# records are renamed below so the two naming schemes match.
ucsc = peak_row_headers[0][0].startswith('chr')

with gzip.open(refseq_exon_bed, 'rb') as lines_in:
    exon_locs = []
    for raw_line in lines_in:
        # The first three characters of every line are dropped before
        # splitting (presumably a chromosome-name prefix in the annotation
        # file -- TODO confirm against the file itself).
        fields = raw_line.decode()[3:].strip().split()
        if ucsc:
            fields[0] = 'chr{!s}'.format(fields[0])
        fields[1], fields[2] = int(fields[1]), int(fields[2])
        exon_locs.append(fields)

# Group records by the name in column 3 and keep each group ordered by
# (start, end); the peak-lookup functions read the first/last record.
gene_locs = {}
for record in exon_locs:
    gene_locs.setdefault(record[3], []).append(record)
for records in gene_locs.values():
    records.sort(key=lambda rec: (rec[1], rec[2]))

In [9]:
class DistanceException(Exception):
    """Raised by get_closest_peaks when the nearest peak lies beyond dist_threshold and dist_excpt is set."""
class NoPeakException(Exception):
    """Raised by get_closest_peaks when no candidate peak exists for a gene."""

def get_closest_peaks(gene_name, row_headers, verbose=False, dist_threshold=1200, dist_excpt=False):
    """Find the peak nearest a gene's start and the neighbouring peak beyond it.

    The gene record comes from the module-level ``gene_locs`` mapping: the
    first stored record if that record is on the '+' strand, otherwise the
    last stored record.

    * '+' strand: the nearest peak is the last row on the gene's chromosome
      whose start (column 1) is <= the gene start; the alternate peak is the
      row before it. Distance = gene start - peak end.
    * '-' strand: the nearest peak is the first row on the gene's chromosome
      whose end (column 2) is >= the gene end; the alternate peak is the row
      after it. Distance = peak start - gene end.

    The alternate index is clamped to the valid row range, so when the nearest
    peak is the first ('+') or last ('-') row the alternate equals the nearest
    peak instead of wrapping to the other end of the table or indexing past it.

    NOTE(review): the neighbour logic presumably relies on row_headers being
    sorted by chromosome and position, and the alternate peak is not checked
    to be on the same chromosome -- confirm against the callers.

    Args:
        gene_name: key into ``gene_locs``.
        row_headers: 2-D array of peak records; column 0 is the chromosome,
            columns 1 and 2 are start/end (convertible to int).
        verbose: print the gene record, the two peak rows and their distances.
        dist_threshold: distance (bp) above which the nearest peak is reported
            as far away.
        dist_excpt: raise DistanceException instead of printing the warning.

    Returns:
        (nearest_peak, alt_peak) row indices into row_headers.

    Raises:
        KeyError: gene_name is not in ``gene_locs``.
        NoPeakException: no row satisfies the strand-specific condition.
        DistanceException: nearest peak is farther than dist_threshold and
            dist_excpt is true.
    """
    records = gene_locs[gene_name]
    gene_coord = records[0] if records[0][-1] == '+' else records[-1]
    if verbose:
        print(gene_coord)

    same_chrom = row_headers[:,0] == gene_coord[0]
    if gene_coord[-1] == '+':
        candidates = numpy.where(numpy.logical_and(same_chrom,
                                                   row_headers[:,1].astype(int) <= gene_coord[1]))[0]
        if candidates.size == 0:
            raise NoPeakException('No peak found for {!s}'.format(gene_name))
        nearest_peak = candidates[-1]
        # Clamp at the first row: nearest_peak - 1 == -1 would wrap around.
        alt_peak = max(nearest_peak - 1, 0)
        peak_dist = gene_coord[1] - row_headers[[nearest_peak, alt_peak],2].astype(int)
    else:
        candidates = numpy.where(numpy.logical_and(same_chrom,
                                                   row_headers[:,2].astype(int) >= gene_coord[2]))[0]
        if candidates.size == 0:
            raise NoPeakException('No peak found for {!s}'.format(gene_name))
        nearest_peak = candidates[0]
        # Clamp at the last row: nearest_peak + 1 would index past the table.
        alt_peak = min(nearest_peak + 1, row_headers.shape[0] - 1)
        peak_dist = row_headers[[nearest_peak, alt_peak],1].astype(int) - gene_coord[2]

    if verbose:
        print(row_headers[[nearest_peak, alt_peak]])
        print(peak_dist)

    if peak_dist[0] > dist_threshold:
        msg = 'Warning: nearest peak to {!s} is far away! ({!s} bp)'.format(gene_name, peak_dist[0])
        if dist_excpt:
            raise DistanceException(msg)
        print(msg)
    return nearest_peak, alt_peak

def get_closest_peaks2(gene_name, row_headers, verbose=False, dist_threshold=1200, gene_end_threshold=100, dist_excpt=False):
    """Return indices of all peaks overlapping a gene's extended window.

    The gene record comes from the module-level ``gene_locs`` mapping: the
    first stored record if that record is on the '+' strand, otherwise the
    last stored record. A peak is selected when it is on the same chromosome
    and overlaps the record's interval widened by ``dist_threshold`` on the
    side of the gene start and by ``gene_end_threshold`` on the side of the
    gene end:

    * '+' strand: peak start <= record end + gene_end_threshold and
      peak end >= record start - dist_threshold.
    * '-' strand: peak end >= record start - gene_end_threshold and
      peak start <= record end + dist_threshold.

    When nothing matches, an EMPTY index array is returned; no exception is
    raised (callers such as get_gene_cells rely on this).

    Args:
        gene_name: key into ``gene_locs``.
        row_headers: 2-D array of peak records; column 0 is the chromosome,
            columns 1 and 2 are start/end (convertible to int).
        verbose: print the gene record, the selected peak rows and, per peak,
            record start - peak start ('+') or peak end - record end ('-').
        dist_threshold: window extension (bp) beyond the gene-start side.
        gene_end_threshold: window extension (bp) beyond the gene-end side.
        dist_excpt: accepted for signature parity with get_closest_peaks;
            not used here.

    Returns:
        1-D integer array of row indices into row_headers (possibly empty).

    Raises:
        KeyError: gene_name is not in ``gene_locs``.
    """
    records = gene_locs[gene_name]
    gene_coord = records[0] if records[0][-1] == '+' else records[-1]
    if verbose:
        print(gene_coord)

    # Convert the coordinate columns once instead of once per comparison.
    peak_starts = row_headers[:,1].astype(int)
    peak_ends = row_headers[:,2].astype(int)
    same_chrom = row_headers[:,0] == gene_coord[0]
    plus_strand = gene_coord[-1] == '+'

    if plus_strand:
        window_lo = gene_coord[1] - dist_threshold
        window_hi = gene_coord[2] + gene_end_threshold
    else:
        window_lo = gene_coord[1] - gene_end_threshold
        window_hi = gene_coord[2] + dist_threshold

    in_window = numpy.logical_and(peak_starts <= window_hi, peak_ends >= window_lo)
    # numpy.where(...)[0] never raises for "no match"; it yields an empty array.
    nearest_peaks = numpy.where(numpy.logical_and(same_chrom, in_window))[0]

    # Distances are informational only (printed in verbose mode).
    if plus_strand:
        peak_dist = gene_coord[1] - peak_starts[nearest_peaks]
    else:
        peak_dist = peak_ends[nearest_peaks] - gene_coord[2]
    if verbose:
        print(row_headers[nearest_peaks])
        print(peak_dist)
    return nearest_peaks

def get_gene_cells(gene_name, row_headers, peak_data_array, **kwargs):
    """Flag the rows of peak_data_array that are nonzero in any peak selected for a gene.

    Peak columns are chosen by get_closest_peaks2(gene_name, row_headers,
    **kwargs). Returns a boolean array with one entry per row of
    peak_data_array; it is all False when no peak is selected.
    """
    peak_cols = get_closest_peaks2(gene_name, row_headers, **kwargs)
    return peak_data_array[:, peak_cols].any(axis=1)

def get_gene_idx(gene_name, row_headers, peaktopic_frac, topic_prob_threshold=0.5, **kwargs):
    """Return a gene's nearest peak and that peak's top topics by cumulative fraction.

    The nearest peak comes from get_closest_peaks(gene_name, row_headers,
    **kwargs). Its row of ``peaktopic_frac`` is ranked from largest to
    smallest, and topics are taken in that order until their running sum
    first exceeds ``topic_prob_threshold``. If the running sum never exceeds
    the threshold (e.g. threshold >= 1), all topics are returned instead of
    raising IndexError.

    Returns:
        (nearest_peak, topic_indices) where topic_indices is a 1-D array of
        column indices into peaktopic_frac, highest fraction first.

    Raises:
        Whatever get_closest_peaks raises for this gene.
    """
    nearest_peak, _ = get_closest_peaks(gene_name, row_headers, **kwargs)
    peak_fracs = peaktopic_frac[nearest_peak]
    topic_idx = numpy.argsort(peak_fracs)[::-1]
    crossed = numpy.flatnonzero(numpy.cumsum(peak_fracs[topic_idx]) > topic_prob_threshold)
    # Fall back to every topic when the threshold is never crossed.
    num_to_get = crossed[0] + 1 if crossed.size else topic_idx.size
    return nearest_peak, topic_idx[:num_to_get]

def get_gene_topn_topics(gene_name, row_headers, peaktopic_frac, ntopics=1, **kwargs):
    """Return a gene's nearest peak and the ``ntopics`` highest-valued topics for that peak.

    The nearest peak comes from get_closest_peaks(gene_name, row_headers,
    **kwargs); topics are column indices of peaktopic_frac ordered from the
    largest value in that peak's row downwards.
    """
    nearest_peak, _ = get_closest_peaks(gene_name, row_headers, **kwargs)
    ranked_topics = numpy.argsort(peaktopic_frac[nearest_peak])[::-1]
    return nearest_peak, ranked_topics[:ntopics]

## Topic Mode

In [10]:
# Load the comma-separated topic_mode.theta matrix. In the recorded run it has
# shape (1504, 9); presumably one row per cell and one column per topic
# (matches the 1504-cell table and the 'topics9' run name) -- TODO confirm.
doctopic_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/0000_topics9_alpha3.000_beta2000.000/topic_mode.theta'
doctopic_peaks = numpy.loadtxt(doctopic_path, delimiter=',', dtype=float)
print(doctopic_peaks.shape)

# Center and scale the topic values, row by row:
#   1. subtract each row's mean (computed on the transpose, so despite its
#      name `col_means` holds one mean per ROW of doctopic_peaks),
#   2. divide each row by its L2 norm,
#   3. transpose back to the original orientation.
# A row whose values are all equal has zero norm and would produce NaNs.
col_means = numpy.mean(doctopic_peaks.T, axis=0)
doctopic_peaks_norm = doctopic_peaks.T - col_means
l2_for_norm = (doctopic_peaks_norm ** 2).sum(axis=0).flatten() ** 0.5
doctopic_peaks_norm /= l2_for_norm
doctopic_peaks_norm = doctopic_peaks_norm.T
print(doctopic_peaks_norm.shape)

# Row-normalize the raw matrix so each row sums to 1.
doctopic_peaks_frac = (doctopic_peaks.T/doctopic_peaks.sum(axis=1).astype(float)).T
print(doctopic_peaks_frac.shape)

(1504, 9)
(1504, 9)
(1504, 9)


In [11]:
# Load the comma-separated topic_mode.wordTopic matrix. In the recorded run it
# has shape (8432, 9); presumably one row per peak and one column per topic
# (matches the 8432-peak table) -- TODO confirm.
peaktopic_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/0000_topics9_alpha3.000_beta2000.000/topic_mode.wordTopic'
peaktopic = numpy.loadtxt(peaktopic_path, delimiter=',', dtype=float)
print(peaktopic.shape)

# Drop rows whose values sum to zero, and apply the same selection to the
# peak headers and to the columns of the cells x peaks array so all three
# stay aligned.
# NOTE: peak_row_headers and peak_data_array are overwritten with their
# filtered versions. If any row is ever dropped, re-running this cell on its
# own would index the already-filtered objects with positions from the
# freshly loaded matrix -- re-run the data-loading cell first.
nonzero_idx = numpy.where(numpy.sum(peaktopic, axis=1) > 0)[0]
peaktopic = peaktopic[nonzero_idx]
peak_row_headers = peak_row_headers[nonzero_idx]
peak_data_array = peak_data_array[:,nonzero_idx]

# Center and scale the topic values, row by row: subtract each row's mean
# (`col_means` holds one mean per ROW of peaktopic, computed on the
# transpose), divide by the row's L2 norm, then transpose back.
col_means = numpy.mean(peaktopic.T, axis=0)
peaktopic_norm = peaktopic.T - col_means
l2_for_norm = (peaktopic_norm ** 2).sum(axis=0).flatten() ** 0.5
peaktopic_norm /= l2_for_norm
peaktopic_norm = peaktopic_norm.T
print(peaktopic_norm.shape)

# Row-normalize the raw matrix so each row sums to 1.
peaktopic_frac = (peaktopic.T/peaktopic.sum(axis=1).astype(float)).T
print(peaktopic_frac.shape)

(8432, 9)
(8432, 9)
(8432, 9)


In [12]:
# 3-component UMAP embedding of the centered/scaled doc-topic matrix; the
# fixed random_state seeds the embedding. The fitted object is kept
# alongside the embedded coordinates.
doctopic_peaks_umap3_obj = umap.UMAP(n_components=3, random_state=253)
doctopic_peaks_umap3_res = doctopic_peaks_umap3_obj.fit_transform(doctopic_peaks_norm)
print(doctopic_peaks_umap3_res.shape)

(1504, 3)


In [13]:
# 2-component UMAP embedding of the same matrix (n_neighbors=15, seeded with
# random_state=1); these coordinates are the x/y positions used by the
# scatter plots in the following cells.
doctopic_peaks_umap2_obj = umap.UMAP(n_components=2, n_neighbors=15, random_state=1)
doctopic_peaks_umap2_res = doctopic_peaks_umap2_obj.fit_transform(doctopic_peaks_norm)
print(doctopic_peaks_umap2_res.shape)

(1504, 2)


In [15]:
# One UMAP panel per topic, each cell colored by its fraction for that topic.
ncols = 4
n_topics = doctopic_peaks_frac.shape[1]
nrows = int(numpy.ceil(n_topics/ncols))
# squeeze=False always returns a 2-D axes array, so the flattened (row-major)
# view gives one panel per grid slot whatever nrows/ncols turn out to be.
fig, axes = pyplot.subplots(nrows=nrows, ncols=ncols, figsize=(3.5*ncols,3*nrows), squeeze=False)
panels = axes.ravel()
for topic, ax in zip(range(n_topics), panels):
    s = ax.scatter(doctopic_peaks_umap2_res[:,0],
                   doctopic_peaks_umap2_res[:,1],
                   cmap='viridis',
                   c=doctopic_peaks_frac[:,topic],
                   s=2)
    ax.set_ylabel('UMAP2')
    ax.set_xlabel('UMAP1')
    ax.set_title('Topic {!s}'.format(topic))
    fig.colorbar(s, ax=ax)
# Hide the grid slots left over when n_topics is not a multiple of ncols,
# instead of drawing them as empty axes.
for unused_ax in panels[n_topics:]:
    unused_ax.set_visible(False)
fig.tight_layout()

<IPython.core.display.Javascript object>

In [15]:
#Excretory cell
gene_names = ['mca-1', 'pgp-3', 'klo-1', 'ceh-6']

# Mask of cells not flagged for any marker gene. It starts as all-True with an
# explicit bool dtype, so it is a valid index even if gene_names is empty.
not_in_clust_idx = numpy.ones(doctopic_peaks_umap2_res.shape[0], dtype=bool)
fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(9,7))
for gname in gene_names:
    # Boolean mask: cells with a nonzero entry in any peak selected for gname.
    have_gene = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    not_in_clust_idx = numpy.logical_and(not_in_clust_idx, ~have_gene)
    axes.scatter(doctopic_peaks_umap2_res[have_gene,0],
                 doctopic_peaks_umap2_res[have_gene,1],
                 label=gname,
                 zorder=1,
                 s=25)
# Remaining cells as a small grey background layer.
axes.scatter(doctopic_peaks_umap2_res[not_in_clust_idx,0],
             doctopic_peaks_umap2_res[not_in_clust_idx,1],
             c='grey',
             zorder=0,
             s=1)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
axes.legend(loc='best')
fig.tight_layout()

<IPython.core.display.Javascript object>

['chrIV', 9625909, 9634948, 'mca-1', '0', '-']
[['chrIV' '9625692' '9626081' 'name']
 ['chrIV' '9631958' '9632158' 'name']
 ['chrIV' '9632526' '9632757' 'name']
 ['chrIV' '9633049' '9633780' 'name']
 ['chrIV' '9633896' '9634096' 'name']
 ['chrIV' '9634945' '9635250' 'name']
 ['chrIV' '9635374' '9635856' 'name']]
[-8867 -2790 -2191 -1168  -852   302   908]
['chrX', 11352135, 11357094, 'pgp-3', '0', '+']
[]
[]
['chrIV', 7745978, 7748225, 'klo-1', '0', '-']
[]
[]
['chrI', 8498113, 8502412, 'ceh-6', '0', '-']
[]
[]


In [16]:
#Excretory gland
gene_names = ['lim-6', 'ser-2', 'aat-1']

# Mask of cells not flagged for any marker gene. It starts as all-True with an
# explicit bool dtype, so it is a valid index even if gene_names is empty.
not_in_clust_idx = numpy.ones(doctopic_peaks_umap2_res.shape[0], dtype=bool)
fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(9,7))
for gname in gene_names:
    # Boolean mask: cells with a nonzero entry in any peak selected for gname.
    have_gene = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    not_in_clust_idx = numpy.logical_and(not_in_clust_idx, ~have_gene)
    axes.scatter(doctopic_peaks_umap2_res[have_gene,0],
                 doctopic_peaks_umap2_res[have_gene,1],
                 label=gname,
                 zorder=1,
                 s=25)
# Remaining cells as a small grey background layer.
axes.scatter(doctopic_peaks_umap2_res[not_in_clust_idx,0],
             doctopic_peaks_umap2_res[not_in_clust_idx,1],
             c='grey',
             zorder=0,
             s=1)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
axes.legend(loc='best')
fig.tight_layout()

<IPython.core.display.Javascript object>

['chrX', 1073907, 1079451, 'lim-6', '0', '+']
[]
[]
['chrX', 15286578, 15302963, 'ser-2', '0', '-']
[['chrX' '15297393' '15297593' 'name']]
[-5370]
['chrIV', 9598985, 9601695, 'aat-1', '0', '-']
[]
[]


In [17]:
#Excretory duct
gene_names = ['grl-2', 'fmo-4', 'B0303.11', 'lin-48', 'mua-3']

# Mask of cells not flagged for any marker gene. It starts as all-True with an
# explicit bool dtype, so it is a valid index even if gene_names is empty.
not_in_clust_idx = numpy.ones(doctopic_peaks_umap2_res.shape[0], dtype=bool)
fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(9,7))
for gname in gene_names:
    # Boolean mask: cells with a nonzero entry in any peak selected for gname.
    have_gene = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    not_in_clust_idx = numpy.logical_and(not_in_clust_idx, ~have_gene)
    axes.scatter(doctopic_peaks_umap2_res[have_gene,0],
                 doctopic_peaks_umap2_res[have_gene,1],
                 label=gname,
                 zorder=1,
                 s=25)
# Remaining cells as a small grey background layer.
axes.scatter(doctopic_peaks_umap2_res[not_in_clust_idx,0],
             doctopic_peaks_umap2_res[not_in_clust_idx,1],
             c='grey',
             zorder=0,
             s=1)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
axes.legend(loc='best')
fig.tight_layout()

<IPython.core.display.Javascript object>

['chrV', 12944860, 12946853, 'grl-2', '0', '+']
[]
[]
['chrV', 13603275, 13606701, 'fmo-4', '0', '-']
[['chrV' '13601756' '13603361' 'name']]
[-3340]
['chrIII', 8703299, 8710410, 'B0303.11', '0', '-']
[]
[]
['chrIII', 3745460, 3747451, 'lin-48', '0', '+']
[['chrIII' '3744123' '3744481' 'name']
 ['chrIII' '3744490' '3744690' 'name']
 ['chrIII' '3745770' '3747521' 'name']]
[1337  970 -310]
['chrIII', 10160696, 10183529, 'mua-3', '0', '+']
[['chrIII' '10160405' '10160755' 'name']
 ['chrIII' '10161425' '10162406' 'name']
 ['chrIII' '10167673' '10168139' 'name']
 ['chrIII' '10169148' '10169590' 'name']
 ['chrIII' '10171031' '10171629' 'name']
 ['chrIII' '10179450' '10180015' 'name']]
[   291   -729  -6977  -8452 -10335 -18754]


In [18]:
#AMsh
gene_names = ['vap-1', 'fig-1', 'kcc-3', 'nas-31', 'pros-1', 'F52E1.2', 'F16F9.3']

# Mask of cells not flagged for any marker gene. It starts as all-True with an
# explicit bool dtype, so it is a valid index even if gene_names is empty.
not_in_clust_idx = numpy.ones(doctopic_peaks_umap2_res.shape[0], dtype=bool)
fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(9,7))
for gname in gene_names:
    # Boolean mask: cells with a nonzero entry in any peak selected for gname.
    have_gene = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    not_in_clust_idx = numpy.logical_and(not_in_clust_idx, ~have_gene)
    axes.scatter(doctopic_peaks_umap2_res[have_gene,0],
                 doctopic_peaks_umap2_res[have_gene,1],
                 label=gname,
                 zorder=1,
                 s=25)
# Remaining cells as a small grey background layer.
axes.scatter(doctopic_peaks_umap2_res[not_in_clust_idx,0],
             doctopic_peaks_umap2_res[not_in_clust_idx,1],
             c='grey',
             zorder=0,
             s=1)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
axes.legend(loc='best')
fig.tight_layout()

<IPython.core.display.Javascript object>

['chrX', 17393788, 17396556, 'vap-1', '0', '-']
[['chrX' '17393711' '17394486' 'name']
 ['chrX' '17395164' '17397893' 'name']]
[-2070  1337]
['chrV', 10970398, 10981213, 'fig-1', '0', '-']
[['chrV' '10969973' '10970374' 'name']
 ['chrV' '10971430' '10971854' 'name']
 ['chrV' '10974093' '10974427' 'name']
 ['chrV' '10974659' '10975999' 'name']
 ['chrV' '10977347' '10977919' 'name']
 ['chrV' '10978214' '10978811' 'name']
 ['chrV' '10980234' '10980608' 'name']
 ['chrV' '10981128' '10982106' 'name']
 ['chrV' '10982286' '10983332' 'name']]
[-10839  -9359  -6786  -5214  -3294  -2402   -605    893   2119]
['chrII', 7425938, 7434282, 'kcc-3', '0', '-']
[['chrII' '7430330' '7430809' 'name']
 ['chrII' '7433675' '7433875' 'name']
 ['chrII' '7434025' '7434646' 'name']]
[-3473  -407   364]
['chrV', 10918860, 10921847, 'nas-31', '0', '-']
[['chrV' '10921645' '10921951' 'name']
 ['chrV' '10922721' '10923076' 'name']]
[ 104 1229]
['chrIII', 8067288, 8071455, 'pros-1', '0', '+']
[['chrIII' '8064822' '8

In [19]:
#PHsh
gene_names = ['vap-1']

# Mask of cells not flagged for any marker gene. It starts as all-True with an
# explicit bool dtype, so it is a valid index even if gene_names is empty.
not_in_clust_idx = numpy.ones(doctopic_peaks_umap2_res.shape[0], dtype=bool)
fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(9,7))
for gname in gene_names:
    # Boolean mask: cells with a nonzero entry in any peak selected for gname.
    have_gene = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    not_in_clust_idx = numpy.logical_and(not_in_clust_idx, ~have_gene)
    axes.scatter(doctopic_peaks_umap2_res[have_gene,0],
                 doctopic_peaks_umap2_res[have_gene,1],
                 label=gname,
                 zorder=1,
                 s=25)
# Remaining cells as a small grey background layer.
axes.scatter(doctopic_peaks_umap2_res[not_in_clust_idx,0],
             doctopic_peaks_umap2_res[not_in_clust_idx,1],
             c='grey',
             zorder=0,
             s=1)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
axes.legend(loc='best')
fig.tight_layout()

<IPython.core.display.Javascript object>

['chrX', 17393788, 17396556, 'vap-1', '0', '-']
[['chrX' '17393711' '17394486' 'name']
 ['chrX' '17395164' '17397893' 'name']]
[-2070  1337]


In [20]:
#CEPsh
gene_names = ['kcc-3', 'hlh-17', 'pros-1', 'glt-1']

# Mask of cells not flagged for any marker gene. It starts as all-True with an
# explicit bool dtype, so it is a valid index even if gene_names is empty.
not_in_clust_idx = numpy.ones(doctopic_peaks_umap2_res.shape[0], dtype=bool)
fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(9,7))
for gname in gene_names:
    # Boolean mask: cells with a nonzero entry in any peak selected for gname.
    have_gene = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    not_in_clust_idx = numpy.logical_and(not_in_clust_idx, ~have_gene)
    axes.scatter(doctopic_peaks_umap2_res[have_gene,0],
                 doctopic_peaks_umap2_res[have_gene,1],
                 label=gname,
                 zorder=1,
                 s=25)
# Remaining cells as a small grey background layer.
axes.scatter(doctopic_peaks_umap2_res[not_in_clust_idx,0],
             doctopic_peaks_umap2_res[not_in_clust_idx,1],
             c='grey',
             zorder=0,
             s=1)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
axes.legend(loc='best')
fig.tight_layout()

<IPython.core.display.Javascript object>

['chrII', 7425938, 7434282, 'kcc-3', '0', '-']
[['chrII' '7430330' '7430809' 'name']
 ['chrII' '7433675' '7433875' 'name']
 ['chrII' '7434025' '7434646' 'name']]
[-3473  -407   364]
['chrIV', 16254763, 16256274, 'hlh-17', '0', '+']
[]
[]
['chrIII', 8067288, 8071455, 'pros-1', '0', '+']
[['chrIII' '8064822' '8066127' 'name']
 ['chrIII' '8066294' '8066750' 'name']
 ['chrIII' '8070110' '8070310' 'name']
 ['chrIII' '8070568' '8070999' 'name']
 ['chrIII' '8071056' '8071764' 'name']]
[ 2466   994 -2822 -3280 -3768]
['chrX', 3496025, 3499165, 'glt-1', '0', '+']
[['chrX' '3495353' '3495911' 'name']]
[672]


In [36]:
#AMsh/PHsh vs CEPsh
gene_names = ['fig-1'] + ['glt-1']

# Mask of cells not flagged for any marker gene. It starts as all-True with an
# explicit bool dtype, so it is a valid index even if gene_names is empty.
not_in_clust_idx = numpy.ones(doctopic_peaks_umap2_res.shape[0], dtype=bool)
fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(9,7))
for gname in gene_names:
    # Boolean mask: cells with a nonzero entry in any peak selected for gname.
    have_gene = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    not_in_clust_idx = numpy.logical_and(not_in_clust_idx, ~have_gene)
    axes.scatter(doctopic_peaks_umap2_res[have_gene,0],
                 doctopic_peaks_umap2_res[have_gene,1],
                 label=gname,
                 zorder=1,
                 s=25)
# Remaining cells as a small grey background layer.
axes.scatter(doctopic_peaks_umap2_res[not_in_clust_idx,0],
             doctopic_peaks_umap2_res[not_in_clust_idx,1],
             c='grey',
             zorder=0,
             s=1)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
axes.legend(loc='best')
fig.tight_layout()

<IPython.core.display.Javascript object>

['chrV', 10970398, 10981213, 'fig-1', '0', '-']
[['chrV' '10969973' '10970374' 'name']
 ['chrV' '10971430' '10971854' 'name']
 ['chrV' '10974093' '10974427' 'name']
 ['chrV' '10974659' '10975999' 'name']
 ['chrV' '10977347' '10977919' 'name']
 ['chrV' '10978214' '10978811' 'name']
 ['chrV' '10980234' '10980608' 'name']
 ['chrV' '10981128' '10982106' 'name']
 ['chrV' '10982286' '10983332' 'name']]
[-10839  -9359  -6786  -5214  -3294  -2402   -605    893   2119]
['chrX', 3496025, 3499165, 'glt-1', '0', '+']
[['chrX' '3495353' '3495911' 'name']]
[672]


In [21]:
#AMso, PHso
gene_names = ['grd-15', 'grl-12', 'lin-44']

# Mask of cells not flagged for any marker gene. It starts as all-True with an
# explicit bool dtype, so it is a valid index even if gene_names is empty.
not_in_clust_idx = numpy.ones(doctopic_peaks_umap2_res.shape[0], dtype=bool)
fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(9,7))
for gname in gene_names:
    # Boolean mask: cells with a nonzero entry in any peak selected for gname.
    have_gene = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    not_in_clust_idx = numpy.logical_and(not_in_clust_idx, ~have_gene)
    axes.scatter(doctopic_peaks_umap2_res[have_gene,0],
                 doctopic_peaks_umap2_res[have_gene,1],
                 label=gname,
                 zorder=1,
                 s=25)
# Remaining cells as a small grey background layer.
axes.scatter(doctopic_peaks_umap2_res[not_in_clust_idx,0],
             doctopic_peaks_umap2_res[not_in_clust_idx,1],
             c='grey',
             zorder=0,
             s=1)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
axes.legend(loc='best')
fig.tight_layout()

<IPython.core.display.Javascript object>

['chrI', 13564566, 13566362, 'grd-15', '0', '+']
[]
[]
['chrV', 8635819, 8637006, 'grl-12', '0', '-']
[]
[]
['chrI', 4126055, 4129307, 'lin-44', '0', '+']
[]
[]


### Note: if you look at the browser tracks for grd-15, grl-12, and lin-44, there is a small amount of signal for all three genes from Topic 3, suggesting that the small cluster in the southwest (lower-left) corner of the UMAP plot consists of socket cells.

In [22]:
#includes CEPso, ILso
gene_names = ['cutl-8', 'mam-5', 'daf-6']

# Mask of cells not flagged for any marker gene. It starts as all-True with an
# explicit bool dtype, so it is a valid index even if gene_names is empty.
not_in_clust_idx = numpy.ones(doctopic_peaks_umap2_res.shape[0], dtype=bool)
fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(9,7))
for gname in gene_names:
    # Boolean mask: cells with a nonzero entry in any peak selected for gname.
    have_gene = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    not_in_clust_idx = numpy.logical_and(not_in_clust_idx, ~have_gene)
    axes.scatter(doctopic_peaks_umap2_res[have_gene,0],
                 doctopic_peaks_umap2_res[have_gene,1],
                 label=gname,
                 zorder=1,
                 s=25)
# Remaining cells as a small grey background layer.
axes.scatter(doctopic_peaks_umap2_res[not_in_clust_idx,0],
             doctopic_peaks_umap2_res[not_in_clust_idx,1],
             c='grey',
             zorder=0,
             s=1)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
axes.legend(loc='best')
fig.tight_layout()

<IPython.core.display.Javascript object>

['chrI', 8270303, 8275701, 'cutl-8', '0', '+']
[['chrI' '8270131' '8270439' 'name']]
[172]
['chrV', 9952698, 9956883, 'mam-5', '0', '+']
[['chrV' '9952355' '9952852' 'name']]
[343]
['chrX', 14888487, 14894297, 'daf-6', '0', '+']
[]
[]


In [23]:
#Excretory duct
gene_names = ['grl-2', 'fmo-4', 'B0303.11', 'lin-48', 'mua-3']

# Per-cell tally of how many marker genes have a nonzero entry in at least
# one of their selected peaks.
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for gname in gene_names:
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Plot cells in ascending tally order so high-count cells are drawn last.
plot_order = have_gene_sum.argsort()
ordered_coords = doctopic_peaks_umap2_res[plot_order]
ordered_counts = have_gene_sum[plot_order]

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
# Marker color and area both grow with the tally.
s = axes.scatter(ordered_coords[:, 0],
                 ordered_coords[:, 1],
                 c=ordered_counts,
                 cmap='viridis',
                 s=5*(ordered_counts + 1)**2)

axes.set_xlabel('UMAP1', fontsize=14)
axes.set_ylabel('UMAP2', fontsize=14)
cbar = fig.colorbar(s)
fig.tight_layout()

['chrV', 12944860, 12946853, 'grl-2', '0', '+']
[]
[]
['chrV', 13603275, 13606701, 'fmo-4', '0', '-']
[['chrV' '13601756' '13603361' 'name']]
[-3340]
['chrIII', 8703299, 8710410, 'B0303.11', '0', '-']
[]
[]
['chrIII', 3745460, 3747451, 'lin-48', '0', '+']
[['chrIII' '3744123' '3744481' 'name']
 ['chrIII' '3744490' '3744690' 'name']
 ['chrIII' '3745770' '3747521' 'name']]
[1337  970 -310]
['chrIII', 10160696, 10183529, 'mua-3', '0', '+']
[['chrIII' '10160405' '10160755' 'name']
 ['chrIII' '10161425' '10162406' 'name']
 ['chrIII' '10167673' '10168139' 'name']
 ['chrIII' '10169148' '10169590' 'name']
 ['chrIII' '10171031' '10171629' 'name']
 ['chrIII' '10179450' '10180015' 'name']]
[   291   -729  -6977  -8452 -10335 -18754]


<IPython.core.display.Javascript object>

In [37]:
#Excretory and socket cells
gene_names = ['lin-48', 'mua-3']

# Per-cell tally of how many marker genes have a nonzero entry in at least
# one of their selected peaks.
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for gname in gene_names:
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Plot cells in ascending tally order so high-count cells are drawn last.
plot_order = have_gene_sum.argsort()
ordered_coords = doctopic_peaks_umap2_res[plot_order]
ordered_counts = have_gene_sum[plot_order]

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
# Marker color and area both grow with the tally.
s = axes.scatter(ordered_coords[:, 0],
                 ordered_coords[:, 1],
                 c=ordered_counts,
                 cmap='viridis',
                 s=5*(ordered_counts + 1)**2)

axes.set_xlabel('UMAP1', fontsize=14)
axes.set_ylabel('UMAP2', fontsize=14)
cbar = fig.colorbar(s)
fig.tight_layout()

['chrIII', 3745460, 3747451, 'lin-48', '0', '+']
[['chrIII' '3744123' '3744481' 'name']
 ['chrIII' '3744490' '3744690' 'name']
 ['chrIII' '3745770' '3747521' 'name']]
[1337  970 -310]
['chrIII', 10160696, 10183529, 'mua-3', '0', '+']
[['chrIII' '10160405' '10160755' 'name']
 ['chrIII' '10161425' '10162406' 'name']
 ['chrIII' '10167673' '10168139' 'name']
 ['chrIII' '10169148' '10169590' 'name']
 ['chrIII' '10171031' '10171629' 'name']
 ['chrIII' '10179450' '10180015' 'name']]
[   291   -729  -6977  -8452 -10335 -18754]


<IPython.core.display.Javascript object>

In [39]:
#CEP + IL socket cells
gene_names = ['cutl-8', 'mam-5'] + ['col-53', 'col-177']

# Per-cell tally of how many marker genes have a nonzero entry in at least
# one of their selected peaks.
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for gname in gene_names:
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Plot cells in ascending tally order so high-count cells are drawn last.
plot_order = have_gene_sum.argsort()
ordered_coords = doctopic_peaks_umap2_res[plot_order]
ordered_counts = have_gene_sum[plot_order]

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
# Marker color and area both grow with the tally.
s = axes.scatter(ordered_coords[:, 0],
                 ordered_coords[:, 1],
                 c=ordered_counts,
                 cmap='viridis',
                 s=5*(ordered_counts + 1)**2)

axes.set_xlabel('UMAP1', fontsize=14)
axes.set_ylabel('UMAP2', fontsize=14)
cbar = fig.colorbar(s)
fig.tight_layout()

['chrI', 8270303, 8275701, 'cutl-8', '0', '+']
[['chrI' '8270131' '8270439' 'name']]
[172]
['chrV', 9952698, 9956883, 'mam-5', '0', '+']
[['chrV' '9952355' '9952852' 'name']]
[343]
['chrI', 5644728, 5646370, 'col-53', '0', '+']
[['chrI' '5644165' '5644837' 'name']]
[563]
['chrX', 10999741, 11001850, 'col-177', '0', '-']
[['chrX' '11002852' '11003361' 'name']]
[1511]


<IPython.core.display.Javascript object>

In [33]:
#Sheath cells
gene_names = ['kcc-3', 'pros-1']# + ['nas-31', 'fig-1'] + ['glt-1']

# Per-cell tally of how many marker genes have a nonzero entry in at least
# one of their selected peaks.
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for gname in gene_names:
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Plot cells in ascending tally order so high-count cells are drawn last.
plot_order = have_gene_sum.argsort()
ordered_coords = doctopic_peaks_umap2_res[plot_order]
ordered_counts = have_gene_sum[plot_order]

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
# Marker color and area both grow with the tally.
s = axes.scatter(ordered_coords[:, 0],
                 ordered_coords[:, 1],
                 c=ordered_counts,
                 cmap='viridis',
                 s=5*(ordered_counts + 1)**2)

axes.set_xlabel('UMAP1', fontsize=14)
axes.set_ylabel('UMAP2', fontsize=14)
cbar = fig.colorbar(s)
fig.tight_layout()

['chrII', 7425938, 7434282, 'kcc-3', '0', '-']
[['chrII' '7430330' '7430809' 'name']
 ['chrII' '7433675' '7433875' 'name']
 ['chrII' '7434025' '7434646' 'name']]
[-3473  -407   364]
['chrIII', 8067288, 8071455, 'pros-1', '0', '+']
[['chrIII' '8064822' '8066127' 'name']
 ['chrIII' '8066294' '8066750' 'name']
 ['chrIII' '8070110' '8070310' 'name']
 ['chrIII' '8070568' '8070999' 'name']
 ['chrIII' '8071056' '8071764' 'name']]
[ 2466   994 -2822 -3280 -3768]


<IPython.core.display.Javascript object>

In [24]:
#AMsh
gene_names = ['vap-1', 'fig-1', 'kcc-3', 'nas-31', 'pros-1', 'F52E1.2', 'F16F9.3']

# Per-cell tally of how many marker genes have a nonzero entry in at least
# one of their selected peaks.
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for gname in gene_names:
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Plot cells in ascending tally order so high-count cells are drawn last.
plot_order = have_gene_sum.argsort()
ordered_coords = doctopic_peaks_umap2_res[plot_order]
ordered_counts = have_gene_sum[plot_order]

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
# Marker color and area both grow with the tally.
s = axes.scatter(ordered_coords[:, 0],
                 ordered_coords[:, 1],
                 c=ordered_counts,
                 cmap='viridis',
                 s=5*(ordered_counts + 1)**2)

axes.set_xlabel('UMAP1', fontsize=14)
axes.set_ylabel('UMAP2', fontsize=14)
cbar = fig.colorbar(s)
fig.tight_layout()

['chrX', 17393788, 17396556, 'vap-1', '0', '-']
[['chrX' '17393711' '17394486' 'name']
 ['chrX' '17395164' '17397893' 'name']]
[-2070  1337]
['chrV', 10970398, 10981213, 'fig-1', '0', '-']
[['chrV' '10969973' '10970374' 'name']
 ['chrV' '10971430' '10971854' 'name']
 ['chrV' '10974093' '10974427' 'name']
 ['chrV' '10974659' '10975999' 'name']
 ['chrV' '10977347' '10977919' 'name']
 ['chrV' '10978214' '10978811' 'name']
 ['chrV' '10980234' '10980608' 'name']
 ['chrV' '10981128' '10982106' 'name']
 ['chrV' '10982286' '10983332' 'name']]
[-10839  -9359  -6786  -5214  -3294  -2402   -605    893   2119]
['chrII', 7425938, 7434282, 'kcc-3', '0', '-']
[['chrII' '7430330' '7430809' 'name']
 ['chrII' '7433675' '7433875' 'name']
 ['chrII' '7434025' '7434646' 'name']]
[-3473  -407   364]
['chrV', 10918860, 10921847, 'nas-31', '0', '-']
[['chrV' '10921645' '10921951' 'name']
 ['chrV' '10922721' '10923076' 'name']]
[ 104 1229]
['chrIII', 8067288, 8071455, 'pros-1', '0', '+']
[['chrIII' '8064822' '8

<IPython.core.display.Javascript object>

In [25]:
#PHsh
gene_names = ['fig-1', 'kcc-3', 'nas-31', 'pros-1', 'F52E1.2', 'F16F9.3']

# For every cell, count how many of the listed genes returned that cell from
# get_gene_cells (presumably cells with signal in peaks near the gene --
# confirm against get_gene_cells).
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for idx, gname in enumerate(gene_names):
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Draw low-count cells first so cells with higher counts end up on top.
plot_order = numpy.argsort(have_gene_sum)

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
s = axes.scatter(doctopic_peaks_umap2_res[plot_order, 0],
                 doctopic_peaks_umap2_res[plot_order, 1],
                 c=have_gene_sum[plot_order],
                 cmap='viridis',
                 s=5*(have_gene_sum[plot_order] + 1)**2)  # marker area grows with the count

# Title and colorbar label make the figure readable on its own.
axes.set_title('PHsh markers\n' + ', '.join(gene_names), fontsize=12)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
cbar = fig.colorbar(s)
cbar.set_label('Marker genes flagged per cell (count)', fontsize=12)
fig.tight_layout()

['chrV', 10970398, 10981213, 'fig-1', '0', '-']
[['chrV' '10969973' '10970374' 'name']
 ['chrV' '10971430' '10971854' 'name']
 ['chrV' '10974093' '10974427' 'name']
 ['chrV' '10974659' '10975999' 'name']
 ['chrV' '10977347' '10977919' 'name']
 ['chrV' '10978214' '10978811' 'name']
 ['chrV' '10980234' '10980608' 'name']
 ['chrV' '10981128' '10982106' 'name']
 ['chrV' '10982286' '10983332' 'name']]
[-10839  -9359  -6786  -5214  -3294  -2402   -605    893   2119]
['chrII', 7425938, 7434282, 'kcc-3', '0', '-']
[['chrII' '7430330' '7430809' 'name']
 ['chrII' '7433675' '7433875' 'name']
 ['chrII' '7434025' '7434646' 'name']]
[-3473  -407   364]
['chrV', 10918860, 10921847, 'nas-31', '0', '-']
[['chrV' '10921645' '10921951' 'name']
 ['chrV' '10922721' '10923076' 'name']]
[ 104 1229]
['chrIII', 8067288, 8071455, 'pros-1', '0', '+']
[['chrIII' '8064822' '8066127' 'name']
 ['chrIII' '8066294' '8066750' 'name']
 ['chrIII' '8070110' '8070310' 'name']
 ['chrIII' '8070568' '8070999' 'name']
 ['chrII

<IPython.core.display.Javascript object>

In [26]:
#CEPsh
gene_names = ['kcc-3', 'hlh-17', 'pros-1', 'glt-1']

# For every cell, count how many of the listed genes returned that cell from
# get_gene_cells (presumably cells with signal in peaks near the gene --
# confirm against get_gene_cells).
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for idx, gname in enumerate(gene_names):
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Draw low-count cells first so cells with higher counts end up on top.
plot_order = numpy.argsort(have_gene_sum)

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
s = axes.scatter(doctopic_peaks_umap2_res[plot_order, 0],
                 doctopic_peaks_umap2_res[plot_order, 1],
                 c=have_gene_sum[plot_order],
                 cmap='viridis',
                 s=5*(have_gene_sum[plot_order] + 1)**2)  # marker area grows with the count

# Title and colorbar label make the figure readable on its own.
axes.set_title('CEPsh markers\n' + ', '.join(gene_names), fontsize=12)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
cbar = fig.colorbar(s)
cbar.set_label('Marker genes flagged per cell (count)', fontsize=12)
fig.tight_layout()

['chrII', 7425938, 7434282, 'kcc-3', '0', '-']
[['chrII' '7430330' '7430809' 'name']
 ['chrII' '7433675' '7433875' 'name']
 ['chrII' '7434025' '7434646' 'name']]
[-3473  -407   364]
['chrIV', 16254763, 16256274, 'hlh-17', '0', '+']
[]
[]
['chrIII', 8067288, 8071455, 'pros-1', '0', '+']
[['chrIII' '8064822' '8066127' 'name']
 ['chrIII' '8066294' '8066750' 'name']
 ['chrIII' '8070110' '8070310' 'name']
 ['chrIII' '8070568' '8070999' 'name']
 ['chrIII' '8071056' '8071764' 'name']]
[ 2466   994 -2822 -3280 -3768]
['chrX', 3496025, 3499165, 'glt-1', '0', '+']
[['chrX' '3495353' '3495911' 'name']]
[672]


<IPython.core.display.Javascript object>

In [27]:
#includes CEPso, ILso
gene_names = ['cutl-8', 'mam-5', 'daf-6']

# For every cell, count how many of the listed genes returned that cell from
# get_gene_cells (presumably cells with signal in peaks near the gene --
# confirm against get_gene_cells).
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for idx, gname in enumerate(gene_names):
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Draw low-count cells first so cells with higher counts end up on top.
plot_order = numpy.argsort(have_gene_sum)

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
s = axes.scatter(doctopic_peaks_umap2_res[plot_order, 0],
                 doctopic_peaks_umap2_res[plot_order, 1],
                 c=have_gene_sum[plot_order],
                 cmap='viridis',
                 s=5*(have_gene_sum[plot_order] + 1)**2)  # marker area grows with the count

# Title and colorbar label make the figure readable on its own.
axes.set_title('Markers (includes CEPso, ILso)\n' + ', '.join(gene_names), fontsize=12)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
cbar = fig.colorbar(s)
cbar.set_label('Marker genes flagged per cell (count)', fontsize=12)
fig.tight_layout()

['chrI', 8270303, 8275701, 'cutl-8', '0', '+']
[['chrI' '8270131' '8270439' 'name']]
[172]
['chrV', 9952698, 9956883, 'mam-5', '0', '+']
[['chrV' '9952355' '9952852' 'name']]
[343]
['chrX', 14888487, 14894297, 'daf-6', '0', '+']
[]
[]


<IPython.core.display.Javascript object>

## Markers from Fung, et al. 2020 (bioRxiv preprint)

In [28]:
#ILso
gene_names = ['col-53', 'col-177', 'grl-18']

# For every cell, count how many of the listed genes returned that cell from
# get_gene_cells (presumably cells with signal in peaks near the gene --
# confirm against get_gene_cells).
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for idx, gname in enumerate(gene_names):
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Draw low-count cells first so cells with higher counts end up on top.
plot_order = numpy.argsort(have_gene_sum)

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
s = axes.scatter(doctopic_peaks_umap2_res[plot_order, 0],
                 doctopic_peaks_umap2_res[plot_order, 1],
                 c=have_gene_sum[plot_order],
                 cmap='viridis',
                 s=5*(have_gene_sum[plot_order] + 1)**2)  # marker area grows with the count

# Title and colorbar label make the figure readable on its own.
axes.set_title('ILso markers\n' + ', '.join(gene_names), fontsize=12)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
cbar = fig.colorbar(s)
cbar.set_label('Marker genes flagged per cell (count)', fontsize=12)
fig.tight_layout()

['chrI', 5644728, 5646370, 'col-53', '0', '+']
[['chrI' '5644165' '5644837' 'name']]
[563]
['chrX', 10999741, 11001850, 'col-177', '0', '-']
[['chrX' '11002852' '11003361' 'name']]
[1511]
['chrV', 5485696, 5488866, 'grl-18', '0', '-']
[]
[]


<IPython.core.display.Javascript object>

In [29]:
#AMsh, PHsh
gene_names = ['F16F9.3']

# For every cell, count how many of the listed genes returned that cell from
# get_gene_cells (presumably cells with signal in peaks near the gene --
# confirm against get_gene_cells).
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for idx, gname in enumerate(gene_names):
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Draw low-count cells first so cells with higher counts end up on top.
plot_order = numpy.argsort(have_gene_sum)

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
s = axes.scatter(doctopic_peaks_umap2_res[plot_order, 0],
                 doctopic_peaks_umap2_res[plot_order, 1],
                 c=have_gene_sum[plot_order],
                 cmap='viridis',
                 s=5*(have_gene_sum[plot_order] + 1)**2)  # marker area grows with the count

# Title and colorbar label make the figure readable on its own.
axes.set_title('AMsh, PHsh markers\n' + ', '.join(gene_names), fontsize=12)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
cbar = fig.colorbar(s)
cbar.set_label('Marker genes flagged per cell (count)', fontsize=12)
fig.tight_layout()

['chrX', 8457613, 8459734, 'F16F9.3', '0', '+']
[['chrX' '8457494' '8457694' 'name']]
[119]


<IPython.core.display.Javascript object>

In [30]:
#AMsh only
gene_names = ['F11C7.2']

# For every cell, count how many of the listed genes returned that cell from
# get_gene_cells (presumably cells with signal in peaks near the gene --
# confirm against get_gene_cells).
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for idx, gname in enumerate(gene_names):
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Draw low-count cells first so cells with higher counts end up on top.
plot_order = numpy.argsort(have_gene_sum)

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
s = axes.scatter(doctopic_peaks_umap2_res[plot_order, 0],
                 doctopic_peaks_umap2_res[plot_order, 1],
                 c=have_gene_sum[plot_order],
                 cmap='viridis',
                 s=5*(have_gene_sum[plot_order] + 1)**2)  # marker area grows with the count

# Title and colorbar label make the figure readable on its own.
axes.set_title('AMsh-only markers\n' + ', '.join(gene_names), fontsize=12)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
cbar = fig.colorbar(s)
cbar.set_label('Marker genes flagged per cell (count)', fontsize=12)
fig.tight_layout()

['chrX', 17396813, 17397798, 'F11C7.2', '0', '+']
[['chrX' '17395164' '17397893' 'name']]
[1649]


<IPython.core.display.Javascript object>

In [31]:
#AMso and PHso
gene_names = ['grl-2']

# For every cell, count how many of the listed genes returned that cell from
# get_gene_cells (presumably cells with signal in peaks near the gene --
# confirm against get_gene_cells).
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for idx, gname in enumerate(gene_names):
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Draw low-count cells first so cells with higher counts end up on top.
plot_order = numpy.argsort(have_gene_sum)

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
s = axes.scatter(doctopic_peaks_umap2_res[plot_order, 0],
                 doctopic_peaks_umap2_res[plot_order, 1],
                 c=have_gene_sum[plot_order],
                 cmap='viridis',
                 s=5*(have_gene_sum[plot_order] + 1)**2)  # marker area grows with the count

# Title and colorbar label make the figure readable on its own.
axes.set_title('AMso and PHso markers\n' + ', '.join(gene_names), fontsize=12)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
cbar = fig.colorbar(s)
cbar.set_label('Marker genes flagged per cell (count)', fontsize=12)
fig.tight_layout()

['chrV', 12944860, 12946853, 'grl-2', '0', '+']
[]
[]


<IPython.core.display.Javascript object>

In [32]:
#CEPsh
gene_names = ['hlh-17']

# For every cell, count how many of the listed genes returned that cell from
# get_gene_cells (presumably cells with signal in peaks near the gene --
# confirm against get_gene_cells).
have_gene_sum = numpy.zeros(peak_data_array.shape[0])
for idx, gname in enumerate(gene_names):
    have_gene_idx = get_gene_cells(gname, orig_peak_row_headers, orig_peak_data_array_filtered, verbose=True)
    have_gene_sum[have_gene_idx] += 1

# Draw low-count cells first so cells with higher counts end up on top.
plot_order = numpy.argsort(have_gene_sum)

fig, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(8,7))
s = axes.scatter(doctopic_peaks_umap2_res[plot_order, 0],
                 doctopic_peaks_umap2_res[plot_order, 1],
                 c=have_gene_sum[plot_order],
                 cmap='viridis',
                 s=5*(have_gene_sum[plot_order] + 1)**2)  # marker area grows with the count

# Title and colorbar label make the figure readable on its own.
axes.set_title('CEPsh markers\n' + ', '.join(gene_names), fontsize=12)
axes.set_ylabel('UMAP2', fontsize=14)
axes.set_xlabel('UMAP1', fontsize=14)
cbar = fig.colorbar(s)
cbar.set_label('Marker genes flagged per cell (count)', fontsize=12)
fig.tight_layout()

['chrIV', 16254763, 16256274, 'hlh-17', '0', '+']
[]
[]


<IPython.core.display.Javascript object>

## Construct an AnnData object and save it in loom format

In [14]:
def _match_to_filtered(unfiltered_keys, filtered_keys):
    """Align unfiltered entries with their rows in the filtered (LDA) input.

    Returns ``(is_kept, unfiltered_idx, filtered_idx)``: a list of bools (one
    per unfiltered key) plus two parallel integer arrays giving, for every
    kept key, its position in ``unfiltered_keys`` and in ``filtered_keys``.
    When a key occurs more than once in ``filtered_keys`` its first position
    is used. Both index arrays are empty when nothing matches.
    """
    first_pos = {}
    for pos, key in enumerate(filtered_keys):
        first_pos.setdefault(key, pos)
    is_kept = [key in first_pos for key in unfiltered_keys]
    unfiltered_idx = numpy.array([pos for pos, kept in enumerate(is_kept) if kept],
                                 dtype=int)
    filtered_idx = numpy.array([first_pos[key]
                                for key, kept in zip(unfiltered_keys, is_kept) if kept],
                               dtype=int)
    return is_kept, unfiltered_idx, filtered_idx


def _expand_rows(values, num_rows, dest_idx, src_idx):
    """Copy ``values[src_idx]`` into rows ``dest_idx`` of a NaN-filled array with ``num_rows`` rows."""
    expanded = numpy.full((num_rows, values.shape[1]), numpy.nan)
    expanded[dest_idx] = values[src_idx]
    return expanded


def add_lda_result_to_anndata_obj(anndata_obj, lda_base, lda_cellnames, lda_peak_bed):
    """Attach LDA output files to ``anndata_obj`` in place.

    The LDA run only saw a subset of the cells and peaks held by
    ``anndata_obj``; rows for cells/peaks that were not part of the run are
    filled with NaN.

    Parameters
    ----------
    anndata_obj : object exposing ``obs``, ``var``, ``obsm`` and ``varm``
        ``var`` must have ``chr``, ``start`` and ``stop`` columns.
    lda_base : str
        Path prefix of the comma-delimited LDA files ``<lda_base>.theta``,
        ``.docTopic``, ``.phi`` and ``.wordTopic``.
    lda_cellnames : str
        Whitespace-delimited table whose first column holds the names of the
        cells given to LDA, in the row order of ``.theta`` / ``.docTopic``.
    lda_peak_bed : str
        Table whose first three columns (chrom, start, stop) identify the
        peaks given to LDA, in the row order of ``.wordTopic``.

    Adds
    ----
    obs['lda_cell'], var['lda_peak'] : bool flags for membership in the LDA run
    obsm['lda_theta'], obsm['lda_theta_norm'], obsm['lda_doctopic'],
    obsm['lda_doctopic_frac'], varm['lda_phi'], varm['lda_wordtopic'],
    varm['lda_wordtopic_frac']

    Returns
    -------
    None
    """
    num_cells = len(anndata_obj.obs.index)
    num_peaks = anndata_obj.var.shape[0]

    # --- cells -----------------------------------------------------------
    filt_cellnames = numpy.loadtxt(lda_cellnames, dtype=object)[:,0]
    is_lda_cell, unfilt_cellnames_idx, filt_cellnames_idx = _match_to_filtered(
        list(anndata_obj.obs.index.values), list(filt_cellnames))
    anndata_obj.obs['lda_cell'] = is_lda_cell

    theta = numpy.loadtxt(lda_base + '.theta', delimiter=',', dtype=float)
    anndata_obj.obsm['lda_theta'] = _expand_rows(
        theta, num_cells, unfilt_cellnames_idx, filt_cellnames_idx)

    # Per cell: subtract the mean over topics, then scale to unit L2 norm.
    col_means = numpy.mean(theta.T, axis=0)
    theta_norm = theta.T - col_means
    l2_for_norm = (theta_norm ** 2).sum(axis=0).flatten() ** 0.5
    theta_norm /= l2_for_norm
    theta_norm = theta_norm.T
    anndata_obj.obsm['lda_theta_norm'] = _expand_rows(
        theta_norm, num_cells, unfilt_cellnames_idx, filt_cellnames_idx)

    doctopic = numpy.loadtxt(lda_base + '.docTopic', delimiter=',', dtype=float)
    anndata_obj.obsm['lda_doctopic'] = _expand_rows(
        doctopic, num_cells, unfilt_cellnames_idx, filt_cellnames_idx)

    # Row-normalise so each cell's values sum to 1.
    doctopic_frac = (doctopic.T/doctopic.sum(axis=1)).T
    anndata_obj.obsm['lda_doctopic_frac'] = _expand_rows(
        doctopic_frac, num_cells, unfilt_cellnames_idx, filt_cellnames_idx)

    # --- peaks -----------------------------------------------------------
    # Peaks are matched on a "chrom<TAB>start<TAB>stop" key.
    filt_peaks = numpy.loadtxt(lda_peak_bed, dtype=object)
    filt_peaks_str = ['{!s}\t{!s}\t{!s}'.format(*filt_peaks[idx,:3])
                      for idx in range(filt_peaks.shape[0])]
    # Read the columns through .values so the lookup is positional no matter
    # what kind of index the var table carries.
    unfilt_peaks_str = ['{!s}\t{!s}\t{!s}'.format(chrom, start, stop)
                        for chrom, start, stop in zip(anndata_obj.var['chr'].values,
                                                      anndata_obj.var['start'].values,
                                                      anndata_obj.var['stop'].values)]
    is_lda_peak, unfilt_peaks_idx, filt_peaks_idx = _match_to_filtered(
        unfilt_peaks_str, filt_peaks_str)
    anndata_obj.var['lda_peak'] = is_lda_peak

    # The .phi file is transposed after loading so that rows are peaks.
    phi = numpy.loadtxt(lda_base + '.phi', delimiter=',', dtype=float).T
    anndata_obj.varm['lda_phi'] = _expand_rows(
        phi, num_peaks, unfilt_peaks_idx, filt_peaks_idx)

    wordtopic = numpy.loadtxt(lda_base + '.wordTopic', delimiter=',', dtype=float)
    anndata_obj.varm['lda_wordtopic'] = _expand_rows(
        wordtopic, num_peaks, unfilt_peaks_idx, filt_peaks_idx)

    # Row-normalise so each peak's values sum to 1.
    wordtopic_frac = (wordtopic.T/wordtopic.sum(axis=1)).T
    anndata_obj.varm['lda_wordtopic_frac'] = _expand_rows(
        wordtopic_frac, num_peaks, unfilt_peaks_idx, filt_peaks_idx)

    return

def _loadtxt_retry_once(path, retry_delay=10, **loadtxt_kwargs):
    """Call ``numpy.loadtxt``; on StopIteration wait ``retry_delay`` seconds and try once more."""
    # Imported locally: `time` is not part of the notebook's import cell.
    import time
    try:
        return numpy.loadtxt(path, **loadtxt_kwargs)
    except StopIteration:
        #probably NFS lag; just wait a few seconds and try again
        time.sleep(retry_delay)
        return numpy.loadtxt(path, **loadtxt_kwargs)


def read_in_bow(bow_path, cell_names_path, feature_info_path, gene_name_map=None):
    """Load a bag-of-words count matrix and its annotations into an AnnData object.

    Parameters
    ----------
    bow_path : str
        Text file (optionally ``.gz``) with three header lines. The first two
        give the number of cells and the number of features; the third is
        skipped. Each remaining row is ``cell_index feature_index count``
        with 1-based indices.
    cell_names_path : str
        Whitespace-delimited table; its first column becomes the ``obs`` index.
    feature_info_path : str
        BED-like table. Its columns are named ``chr, start, stop, name, score,
        strand`` (truncated to the number of columns present); any columns
        beyond the sixth are named by their 0-based position.
    gene_name_map : str, optional
        Tab-delimited two-column file mapping ``var['name']`` values to common
        names. When given, ``var['common_name']`` is added, falling back to
        the original name for unmapped entries.

    Returns
    -------
    anndata.AnnData
        ``X`` is a CSR matrix of shape (cells, features).
    """
    # Imported locally: `gzip` is not part of the notebook's import cell.
    import gzip

    # loadtxt squeezes a single data row to 1-D and returns an empty 1-D array
    # when there are no rows; normalise both to a 2-D, three-column layout so
    # the column slicing below always works.
    bow_data_sparse = numpy.atleast_2d(
        _loadtxt_retry_once(bow_path, dtype=int, skiprows=3))
    if bow_data_sparse.size == 0:
        bow_data_sparse = numpy.empty((0, 3), dtype=int)

    open_func = gzip.open if bow_path.endswith('.gz') else open
    with open_func(bow_path, 'rb') as lines_in:
        cellnum = int(lines_in.readline().decode().strip())
        featnum = int(lines_in.readline().decode().strip())
    # File indices are 1-based; shift to 0-based for the sparse constructor.
    bow_data = sps.csr_matrix((bow_data_sparse[:,2],
                               (bow_data_sparse[:,0] - 1,
                                bow_data_sparse[:,1] - 1)),
                              shape=(cellnum, featnum))

    cell_names = _loadtxt_retry_once(cell_names_path, dtype=object)[:,0]
    feature_info_bed = _loadtxt_retry_once(feature_info_path, dtype=object)

    col_names = ['chr', 'start', 'stop', 'name', 'score', 'strand']
    num_cols = feature_info_bed.shape[1]
    if num_cols <= 6:
        col_names = col_names[:num_cols]
    else:
        col_names = col_names + list(numpy.arange(6, num_cols).astype(str))

    anndata_obj = anndata.AnnData(X=bow_data,
                                  obs=pandas.DataFrame(index=cell_names),
                                  var=pandas.DataFrame(data=feature_info_bed,
                                                       columns=col_names))
    if gene_name_map is not None:
        gene_name_map = numpy.loadtxt(gene_name_map, delimiter='\t', dtype=object)
        gene_name_map = dict([gene_name_map[idx] for idx in range(gene_name_map.shape[0])])
        # Explicit column lookup instead of attribute access on the var table.
        anndata_obj.var['common_name'] = [gene_name_map.get(elt, elt)
                                          for elt in anndata_obj.var['name']]

    return anndata_obj

In [15]:
#build the anndata object from the unfiltered counts plus the LDA results
import anndata
import pandas

#inputs: the original (unfiltered) BOW matrix with its cell and peak tables
orig_peaktable_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/../all_peaks.merged.allcells.bow'
orig_cell_names_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/../all_peaks.merged.allcells.zeros_filtered.indextable.txt'
orig_peak_names_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/../all_peaks.merged.allcells.zeros_filtered.bed'

#inputs: the LDA run, which only saw the cells/peaks that survived filtering
lda_base = '../tissue_analysis/glia/topic_analysis/rank_peaks/0000_topics9_alpha3.000_beta2000.000/topic_mode'
lda_cellnames = '../tissue_analysis/glia/topic_analysis/rank_peaks/filtered_peaks_iqr4.0_low_cells.indextable.txt'
lda_peak_bed = '../tissue_analysis/glia/topic_analysis/rank_peaks/filtered_peaks_iqr4.0_low_cells.extra_cols.bed'

#load the unfiltered data first, then attach the LDA results to it in place
anndata_obj = read_in_bow(bow_path=orig_peaktable_path,
                          cell_names_path=orig_cell_names_path,
                          feature_info_path=orig_peak_names_path)
add_lda_result_to_anndata_obj(anndata_obj=anndata_obj,
                              lda_base=lda_base,
                              lda_cellnames=lda_cellnames,
                              lda_peak_bed=lda_peak_bed)

print(anndata_obj)

  from pandas.core.index import RangeIndex
Transforming to str index.


AnnData object with n_obs × n_vars = 1571 × 10216 
    obs: 'lda_cell'
    var: 'chr', 'start', 'stop', 'lda_peak'
    obsm: 'lda_theta', 'lda_theta_norm', 'lda_doctopic', 'lda_doctopic_frac'
    varm: 'lda_phi', 'lda_wordtopic', 'lda_wordtopic_frac'


In [16]:
#Copy the UMAP embeddings into obsm. Cells flagged obs.lda_cell get their
#coordinates; all other rows stay NaN.
#NOTE(review): assumes the rows of the doctopic_peaks_umap*_res arrays follow
#the order of cell_names -- confirm where those arrays are computed.

#positions of the LDA cells in the anndata object and in cell_names;
#identical for both embeddings, so compute them once
anndata_idx = numpy.where(anndata_obj.obs.lda_cell)[0]
umap_idx = [numpy.where(cell_names == elt)[0][0] for elt in anndata_obj.obs[anndata_obj.obs.lda_cell].index.values]

for obsm_key, umap_res in (('umap2', doctopic_peaks_umap2_res),
                           ('umap3', doctopic_peaks_umap3_res)):
    umap_to_add = numpy.full((anndata_obj.shape[0], umap_res.shape[1]), numpy.nan)
    umap_to_add[anndata_idx] = umap_res[umap_idx]
    print(umap_to_add.shape)
    anndata_obj.obsm[obsm_key] = umap_to_add

(1571, 2)
(1571, 3)


In [17]:
#export the annotated object to loom, including the obsm/varm arrays
loom_out_path = '../tissue_analysis/glia/topic_analysis/rank_peaks/refinement_lda_results.loom'
anndata_obj.write_loom(loom_out_path, write_obsm_varm=True)