## Spatial Expression of One Gene
Tests whether the spatial expression of a gene is significantly different from the spatial distribution of random genes. The p-value and effect size are calculated using a Kolmogorov–Smirnov test.<br>
7/29/19

In [None]:
#packages
import argparse 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import scipy.stats


In [None]:
#Block creates parser to interpret command line info and make arguments into variables.
# NOTE: every multi-line help text is built from adjacent string literals; each
# fragment ends with a space so that words are not glued together when joined.
parser = argparse.ArgumentParser(
    description=("handle inputs from SOMETHING script to run permutation tests "
                 "on puck data"))
parser.add_argument("-ern", type=int,
                    help=("Enforced Read Number: enter 1 or 0. if 1, enforces that all random samples have the "
                          "same number of positive as the test sample at the cost of some computational time."))
parser.add_argument("-fg", type=int,
                    help=("Filter Genes: input values to determine how genes are filtered: 0, in which case all "
                          "genes are analyzed (not recommended, due to false positives); 1, in which case genes are "
                          "filtered by within-dropseq-cluster expression; 2, in which case it's filtered by "
                          "within-dropseq-cluster variance; or 3, in which case genes either match the expression "
                          "cutoff or variance cutoff, and are labeled according to which they pass (or both)"))
parser.add_argument("-pg", type=int,
                    help=("Plot Genes: enter 1 or 0. If 1, will output a pdf with the significant genes at the "
                          "0.005 level plotted"))
# --ns and --bc are optional; when omitted they parse to None and the
# defaults named in the help text are applied by the code that reads them.
parser.add_argument("--ns", type=int,
                    help=("Number of Samples: Number of samples for the null model. 1000 by default. Note: Runtime "
                          "scales linearly with numsamples"))
parser.add_argument("--bc", type=int,
                    help=("Bead Cutoff: minimum number of beads needed to assess significance of a gene. 15 by "
                          "default"))
parser.add_argument("-bmf", type=str,
                    help=("Bead Mapping File: enter data path for puck gene expression/cluster data file. If no "
                          "extension, assumes csv"))
parser.add_argument("-pn", type=str,
                    help="Puck Number: enter puck identifier here")
# nargs='*' collects zero or more integers into a list (None when the flag is absent).
parser.add_argument('--clust', nargs='*', type=int,
                    help="enter the cluster numbers you wish to analyze. Multiple arguments allowed")

print(parser)

### In the block below, you may change the following for your data:
 -ern    : to force sample distribution to have the same number of beads as the data <br>
 -fg     : how genes are filtered <br>
 -pg     : if you want to plot significant genes<br>
 --ns    : the number of samples for the null model<br>
 --bc    : minimum number of beads expressing the gene to assess it<br>
 -bmf    : data path for puck data<br>
 -pn     : puck number<br>
 --clust : cluster(s) to analyze

In [None]:
#Block utilizes parser
# The notebook has no real command line, so a simulated one is split on
# whitespace and handed to the parser.
simulated_command_line = '-ern 0 -fg 0 -pg 0 -bmf /broad/thechenlab/breanna/permutation_test_data -pn Puck_181206_3'
args = parser.parse_args(simulated_command_line.split())

In [None]:
#Block formalizes variables from parser info
# Values copied from the parsed arguments unchanged.
EnforcedReadNumbers = args.ern
FilterGenes = args.fg
PlotGenes = args.pg
BeadMappingFile = args.bmf
PuckNumber = args.pn

# Optional arguments: fall back to a default when the flag was not given.
NumSamples = 1000 if args.ns is None else args.ns
BeadCutoff = 15 if args.bc is None else args.bc
ClustertoAnalyze = [] if args.clust is None else args.clust

# The bead mapping value is used as a directory and the puck number as the
# csv file name inside it.
DataPath = "{}/{}.csv".format(BeadMappingFile, PuckNumber)

In [None]:
#Read in and save data
# First row is the header, first column is the row index.
AllMappedBeads = pd.read_csv(DataPath, header=0, index_col=0)

# Start from every bead; narrow to the requested clusters only when some were given.
UniqueMappedBeads = AllMappedBeads
if ClustertoAnalyze:
    in_requested_cluster = AllMappedBeads["cluster"].isin(ClustertoAnalyze)
    UniqueMappedBeads = AllMappedBeads[in_requested_cluster]

In [None]:
#Count number of reads per bead
# Drop the last three columns (cluster number and coordinates) to keep gene counts only.
genes_only = UniqueMappedBeads.iloc[:, :-3]
# Row-wise total: one value per bead.
NumReadsPerBead = genes_only.sum(axis=1)


In [None]:
#Calculate pair-wise distances between each bead
# DataFrame.as_matrix was removed in pandas 1.0; selecting with a one-element
# column list and taking .values yields the same (n_beads, 1) array on every
# pandas version.
x = UniqueMappedBeads[['xcoord']].values
BeadXCoordMatrix = x * np.ones((1, UniqueMappedBeads.shape[0]))
y = UniqueMappedBeads[['ycoord']].values
BeadYCoordMatrix = y * np.ones((1, UniqueMappedBeads.shape[0]))

# Entry [i, j] holds coordinate_i - coordinate_j.
BeadPairwiseXValDifferences = BeadXCoordMatrix - np.transpose(BeadXCoordMatrix)
BeadPairwiseYValDifferences = BeadYCoordMatrix - np.transpose(BeadYCoordMatrix)


def dist(x, y):
    """Return the element-wise Euclidean length of the offsets (x, y)."""
    # 0.5 is written out so the exponent never depends on how 1/2 is evaluated.
    return ((x ** 2) + (y ** 2)) ** 0.5


BeadPairwiseDistanceMat = dist(BeadPairwiseXValDifferences, BeadPairwiseYValDifferences)

#set up bins for histograms later
num_bins = 100
# Strict upper triangle: each unordered bead pair once, no self-distances.
Triu = np.triu_indices(BeadPairwiseDistanceMat.shape[0], 1)
hist, bins = np.histogram(BeadPairwiseDistanceMat[Triu], num_bins)


In [None]:
#Determines the probability of picking each bead for the null distribution based on the number of reads per bead
# Per-bead read totals over every column except the last three.
NumReadsPerBead = UniqueMappedBeads.iloc[:, :-3].sum(axis=1)
# Normalise so the per-bead weights sum to one.
ProbabilityPerBead = NumReadsPerBead.div(NumReadsPerBead.sum())

#### Note: Filtering by variance has not yet been implemented

In [None]:
#filtering genes according to "FilterGenes" input

ExpressionGenes = []
VarianceGenes = []

for cluster in ClustertoAnalyze:
    # NOTE(review): the table is not subset by `cluster`, so every pass looks
    # at the same beads (all beads of the selected clusters).
    tmp = genes_only
    #Filter by within-cluster expression
    if FilterGenes in (1, 3):
        # A gene is kept when its summed count over the beads is positive.
        has_expression = np.array(tmp.sum(axis=0) > 0)
        GoodGenes = list(np.array(tmp.columns)[has_expression])
        # Merge with the genes found so far, without duplicates.
        ExpressionGenes = list(np.unique(ExpressionGenes + GoodGenes))
    #Filter by within-cluster variance
    #(not implemented: VarianceGenes stays empty for FilterGenes 2 and 3)



### Selecting your genes
Here, the GeneNames variable is a list of genes that will be analyzed. Replace the selected kidney genes currently in the GeneNames list with your desired genes, or uncomment the lines below to analyze all genes in your sample.

#### Note:
If the genes you select are not in the sample, the method will fail.

In [None]:
#Perform permutation test and append the results to 'kidney_expression_data_fixed.csv'
#in the current working directory.
#
# For each gene in GeneNames:
#   1. skip it (p-value -1) if it fails the gene filter or is expressed in too few beads;
#   2. collect the pairwise distances between the beads expressing it;
#   3. draw NumSamples random bead sets and run a two-sample KS test of the true
#      distances against each random set's distances;
#   4. store the largest p-value and the largest KS statistic over the samples.

#Select kidney genes
GeneNames=['Slc12a1', 'Umod', 'Ctgf', 'Nphs1','Nphs2','Wt1', 'Synpo', 'Itga8','Ptn',  'Plvap', 'Ehd3', 'Ren1','Slc27a2', 'Aqp2', 'Napsa', 'Aqp3', 'Pck1', 'Miox', 'Acsm2', 'Acsm3', 'Rarres2', 'Col4a2', 'C1qc', 'C1qa']

#Uncomment ONE of the following two lines to analyze every gene in the sample
#GeneNames=list(UniqueMappedBeads.columns)[:-3]
#GeneNames=list(genes_only.columns)

pvals = np.zeros(len(GeneNames))
effectsize = np.zeros(len(GeneNames))


def _upper_triangle_distances(bead_positions):
    """Return the distances between all unordered pairs of the given beads.

    bead_positions: integer row positions into BeadPairwiseDistanceMat.
    Returns a 1-D array with one entry per pair (empty for fewer than two beads).
    """
    bead_positions = np.asarray(bead_positions, dtype=int)
    # np.ix_ extracts the k x k sub-matrix directly instead of building an
    # n x n boolean outer product and reshaping the selection.
    sub_matrix = BeadPairwiseDistanceMat[np.ix_(bead_positions, bead_positions)]
    # The matrix is symmetric: the strict upper triangle avoids double counting.
    return sub_matrix[np.triu_indices(sub_matrix.shape[0], 1)]


# Loop invariants, hoisted out of the per-gene / per-sample loops.
bead_probs = np.asarray(ProbabilityPerBead, dtype=float)
num_beads_total = len(bead_probs)
hist_range = (bins[0], bins[num_bins])

# enumerate gives each gene its own slot even if a name appears twice
# (GeneNames.index would always return the first occurrence).
for gene_idx, geneval in enumerate(GeneNames):
    if ClustertoAnalyze:
        #filter genes based on filter genes
        if FilterGenes == 3:
            PassingExpression = int(geneval in ExpressionGenes)
            PassingVariance = int(geneval in VarianceGenes)
            # Skip the gene when it passes neither cutoff. (The previous check
            # tested whether the gene lists were empty, not this gene.)
            if not PassingExpression and not PassingVariance:
                pvals[gene_idx] = -1
                continue
        elif FilterGenes == 1 and geneval not in ExpressionGenes:
            pvals[gene_idx] = -1
            continue
        elif FilterGenes == 2 and geneval not in VarianceGenes:
            pvals[gene_idx] = -1
            continue

    #filter out genes expressed in too few beads
    expressing = np.asarray(UniqueMappedBeads[geneval] > 0)
    NumBeads = int(expressing.sum())
    print('NumBeads')
    print(NumBeads)
    # At least two beads are required to form a single pairwise distance,
    # whatever the configured cutoff is.
    if NumBeads < max(BeadCutoff, 2):
        pvals[gene_idx] = -1
        print('here')
        continue

    #This will give the true distribution
    #Distances between the beads that have the gene expressed
    PairWiseDistances = _upper_triangle_distances(np.flatnonzero(expressing))

    #plot true distribution
    n = plt.hist(PairWiseDistances, bins=num_bins, color='royalblue')
    plt.rcParams.update({'font.size': 20})
    # arguments are passed to np.histogram
    plt.xlabel('Distance', fontsize=24)
    plt.ylabel('Count', fontsize=24)
    plt.title("TAL True Distance {}".format(geneval), fontsize=29)
    plt.show()

    DistanceDist = n[0] / n[0].sum()

    #Generate permuted distribution. There is a ton of duplication here, because this calculation is the
    #same regardless of geneval. It only depends on the NUMBER of beads in which geneval appears.
    AverageDistribution = np.zeros(num_bins)
    RandomDists = []
    # NaN marks samples for which no KS test could be run (see below).
    pvals_tmp = np.full(NumSamples, np.nan)
    effectsize_tmp = np.full(NumSamples, np.nan)
    for p in range(NumSamples):
        if EnforcedReadNumbers:
            # Exactly NumBeads distinct beads, weighted by read count.
            random_beads = np.sort(np.random.choice(num_beads_total, NumBeads, replace=False, p=bead_probs))
        else:
            # Each bead is picked independently when uniform/NumBeads < its probability.
            random_beads = np.flatnonzero(np.random.uniform(size=num_beads_total) / NumBeads < bead_probs)
        RandomDistTmp = _upper_triangle_distances(random_beads)

        #save only the counts in each bin
        counts = np.histogram(RandomDistTmp, num_bins, range=hist_range)[0]
        AverageDistribution += counts
        total = counts.sum()
        # Normalise to a distribution; an all-zero histogram stays all zero
        # instead of dividing by zero.
        RandomDists.append(counts / total if total else np.zeros(num_bins))

        if RandomDistTmp.size == 0:
            # Fewer than two beads were drawn: ks_2samp cannot take an empty sample.
            continue
        # One KS test per sample: element 0 is the statistic, element 1 the p-value.
        ks_result = scipy.stats.ks_2samp(PairWiseDistances, RandomDistTmp)
        effectsize_tmp[p] = ks_result[0]
        pvals_tmp[p] = ks_result[1]

    # Largest value over the random samples, ignoring samples without a test.
    pvals[gene_idx] = np.nanmax(pvals_tmp)
    effectsize[gene_idx] = np.nanmax(effectsize_tmp)

    # Append mode: rows accumulate across genes and across runs.
    # newline='' is what the csv module expects from files it writes to.
    with open('kidney_expression_data_fixed.csv', 'a', newline='') as csvfile:
        my_writer = csv.writer(csvfile, delimiter=',')
        my_writer.writerow((geneval, pvals[gene_idx], effectsize[gene_idx]))

### Selecting your genes
Here, the GeneNames variable is a list of genes that will be analyzed. The lines that are currently commented out will run the method for every gene detected in the sample that meets the filter gene criteria and is expressed in more beads than the bead cutoff number. 

Otherwise, replace the selected kidney genes currently in the GeneNames list with your desired genes. 

#### Note:
If the genes you select are not in the sample, the method will fail.

In [None]:
#p-values of genes analyzed, in GeneNames order
#(-1 marks a gene that was skipped by the gene filter or the bead cutoff)
pvals

In [None]:
#effect size of genes analyzed, in GeneNames order
#(entries keep their initial 0 for genes that were skipped before the KS test)
effectsize

In [None]:
#Find genes that yield significant results
# Keep genes whose p-value lies strictly between 0 and 0.25; skipped genes
# carry -1 and therefore never qualify.
passes_threshold = (pvals > 0) & (pvals < .25)
SignificantGenes = list(np.array(GeneNames)[passes_threshold])
# Look each selected gene up again in GeneNames to get the original list entries.
SignificantGeneNames = [GeneNames[GeneNames.index(gene)] for gene in SignificantGenes]

In [None]:
#genes that yielded significant results (entries taken from GeneNames)
SignificantGeneNames

In [None]:
#function to plot significant genes (19 distinct colors; they repeat for longer gene lists)
def plot_significant_genes(user_genes, beads=None):
    """Scatter-plot the beads expressing each gene, one color per gene.

    Beads that express none of the genes are drawn first in light grey;
    the beads with a positive count for each gene are then overlaid on the
    same axes, and the figure is shown.

    user_genes: iterable of gene column names present in the bead table.
    beads: DataFrame with one column per gene plus 'xcoord' and 'ycoord'.
        Defaults to the notebook's UniqueMappedBeads table.
    """
    # NOTE(review): the original body read an undefined `pcounts_and_coords`
    # table; UniqueMappedBeads is the bead table this notebook builds —
    # confirm it is the intended default.
    if beads is None:
        beads = UniqueMappedBeads
    # Materialise once: the genes are used both as a column selector and in a loop.
    user_genes = list(user_genes)
    color_list=['b','darkorange','green','r','darkviolet','saddlebrown','magenta','orchid','k','olive','teal','cornflowerblue','gold','lawngreen','lightsalmon','plum','peru','y','turquoise']

    # Background layer: beads whose summed count over all requested genes is zero.
    empty_beads = beads[beads[user_genes].sum(axis=1) == 0]
    ax = empty_beads.plot(kind='scatter', x='xcoord', y='ycoord', color='lightgrey', alpha=.2, figsize=(10, 10))

    for c_num, gene in enumerate(user_genes):
        tmp = beads[beads[gene] > 0]
        # Wrap around the palette instead of raising IndexError past its end.
        gene_color = color_list[c_num % len(color_list)]
        ax = tmp.plot(kind="scatter", x="xcoord", y="ycoord", color=gene_color, label=gene, alpha=.5, ax=ax)

    plt.rcParams.update({'font.size': 20})
    # Legend is anchored outside the axes, to the right of the plot.
    plt.legend(bbox_to_anchor=(1.04, 1), loc="upper left", fontsize='x-large')
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    ax.set_title("Location of Gene Expression")
    plt.show()

In [None]:
# Plot the bead locations of every gene selected as significant.
plot_significant_genes(SignificantGenes)