# Chopchop targets across coding genes and population variation
Simplifying down from 'scratch' version
# Data processing

In [1]:
import sys; print(sys.version)
import os
import glob
import subprocess
import multiprocessing
import io
from collections import OrderedDict
import json

import numpy as np; print('numpy', np.__version__)
import pandas as pd; print('pandas',pd.__version__)
import allel; print('scikit-allel', allel.__version__)
import zarr; print('zarr', zarr.__version__)

import matplotlib as mpl

import statsmodels; print('statsmodels', statsmodels.__version__)
import statsmodels.api as sm

from IPython.display import display, HTML

3.6.7 | packaged by conda-forge | (default, Feb 28 2019, 09:07:38) 
[GCC 7.3.0]
numpy 1.16.2
pandas 0.24.1
scikit-allel 1.2.0
zarr 2.2.0
statsmodels 0.9.0


In [2]:
# IPython magic: select the interactive 'notebook' matplotlib backend.
%matplotlib notebook
# Give every figure created below a grey background.
mpl.rcParams['figure.facecolor'] = '#BBBBBB'

### Compute sample list for Regional comparisons (@TCC WIP)

In [3]:
if False:
    # Disabled one-off helper: list the Mali samples flagged 'YL' in the
    # spreadsheet and compare them with Hanno's sample list.
    sheet = pd.read_excel('vcfs/YL-Agam-GF2.xlsx')
    wanted = (sheet['Country']=='Mali') & (sheet['Note']=='YL')
    d = sheet[wanted]['Sample']

    # first (only) column of Hanno's list
    d2 = pd.read_csv('/data/vcfs/hanno_Agam_sample_list.txt', header=None)[0]
    # samples which don't occur in Hanno's list
    display(np.setdiff1d(d,d2))

    # samples present in both lists, then how many there are
    samp_list = np.intersect1d(d,d2)
    for shown in (samp_list, samp_list.shape):
        display(shown)

    print("\n".join(samp_list))

In [4]:
if False:
    # Disabled one-off helper: list the Comoros samples flagged 'YL' in the
    # spreadsheet and compare them with Hanno's sample list.
    sheet = pd.read_excel('/data/vcfs/YL-Agam-GF2.xlsx')
    wanted = (sheet['Country']=='Comoros') & (sheet['Note']=='YL')
    d = sheet[wanted]['Sample']

    # first (only) column of Hanno's list
    d2 = pd.read_csv('/data/vcfs/hanno_Agam_sample_list.txt', header=None)[0]
    # samples which don't occur in Hanno's list
    display(np.setdiff1d(d,d2))

    # samples present in both lists, then how many there are
    samp_list = np.intersect1d(d,d2)
    for shown in (samp_list, samp_list.shape):
        display(shown)

    print("\n".join(samp_list))

## Settings/Constants

In [5]:
SUBSAMPLE_N = None  # randomly select this many samples for testing sample size effects
# When True, a variant is only kept if more than one non-reference allele
# call is seen, and '_ignore-one-call' is appended to SETNAME.
IGNORE_ONE_CALL_VARIANTS = False

## Uncomment the block for the appropriate set of samples
## (and comment out the others)

## An. gambiae
SETNAME = 'VGL-gam'  # prefix of the output files (<SETNAME>_tout.msgpack, <SETNAME>_results.json)
GENOME = "AgamP4.11"  # used to build the transcript-list file name and the chopchop output subdirectory
GENE_PREFIX = "AGAP"  # file-name prefix used to glob the chopchop output files
CHROMS = ['2R','2L','3R','3L','X']  # chromosomes to analyse; None means every group in the callset
GTFFN = "../datafiles/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.11.gtf"  # gene annotation (GTF)
ZVCF = 'vcfs/YL-Agam-GF2_pflit.vcf.gz.zarr'  # variant callset as a zarr group
# Sample subset: None = all samples in ZVCF; a str = plain list file;
# a [m_s value, samples.meta.txt path] list = Ag1000G metadata filtered on 'm_s'.
SAMPLE_LIST_FN = 'vcfs/hanno_Agam_sample_list.txt'

# ## An. coluzzii
# SETNAME = 'VGL-col'
# GENOME = "AgamP4.11"
# GENE_PREFIX = "AGAP"
# CHROMS = ['2R','2L','3R','3L','X']#,'Y_unplaced','UNKN','Mt']
# GTFFN = "../datafiles/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.11.gtf"
# ZVCF = "vcfs/100Acol_pflit.vcf.gz.zarr"
# SAMPLE_LIST_FN = None # use all samples in ZVCF

# ## Ae. aegypti
# SETNAME = 'VGL-Aaeg'
# GENOME = "Aaegypti_L5.1"
# GENE_PREFIX = "AAEL"
# CHROMS = ['1','2','3']
# GTFFN = "../datafiles/Aedes-aegypti-LVP_AGWG_BASEFEATURES_AaegL5.1.gtf"
# ZVCF = "vcfs/YL-Aaeg-03_pflit.vcf.gz.zarr"
# SAMPLE_LIST_FN = None # use all samples in ZVCF

# ## Ag1000G An. gambiae
# SETNAME = 'Ag1000g-gam'
# GENOME = "AgamP4.11"
# GENE_PREFIX = "AGAP"
# CHROMS = ['2R','2L','3R','3L','X']
# GTFFN = "../datafiles/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.11.gtf"
# ZVCF = '/data/ag1000g_p2_ar1/ngs.sanger.ac.uk/production/ag1000g/phase2/AR1/variation/main/vcf/all/ag1000g.phase2.ar1.zarr'
# SAMPLE_LIST_FN = ['S', '/data/ag1000g_p2_ar1/samples/samples.meta.txt'] # list implies ag1000g to be filtered on m_s

## Ag1000G An. coluzzii
# SETNAME = 'Ag1000g-col'
# GENOME = "AgamP4.11"
# GENE_PREFIX = "AGAP"
# CHROMS = ['2R','2L','3R','3L','X']
# GTFFN = "../datafiles/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.11.gtf"
# ZVCF = '/data/ag1000g_p2_ar1/ngs.sanger.ac.uk/production/ag1000g/phase2/AR1/variation/main/vcf/all/ag1000g.phase2.ar1.zarr'
# SAMPLE_LIST_FN = ['M', '/data/ag1000g_p2_ar1/samples/samples.meta.txt'] # list implies ag1000g to be filtered on m_s

## *Regional* An. gambiae (note: this block defines no SETNAME; set one when enabling it)
# GENOME = "AgamP4.11"
# GENE_PREFIX = "AGAP"
# CHROMS = ['2R','2L','3R','3L','X']
# GTFFN = "../datafiles/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.11.gtf"
# ZVCF = 'vcfs/YL-Agam-GF2_pflit.vcf.gz.zarr'
# SUBSAMPLE_N = 40
## Mali
# SAMPLE_LIST_FN = 'vcfs/YL-Agam-GF2_Mali_samples_good.txt' # 40
## or Comoros
# SAMPLE_LIST_FN = 'vcfs/YL-Agam-GF2_Comoros_samples_good.txt' # 54
## or ALL
# SAMPLE_LIST_FN = 'vcfs/hanno_Agam_sample_list.txt'

##########
## Below shouldn't need changing

# Paths derived from the settings above.
# TRANSFN: one transcript id per line for the selected genome.
TRANSFN = "../datafiles/transcript_list_{}".format(GENOME)
# CHOPCHOP_OUT_DIR is used as the working directory for the awk merge; the
# per-genome subdirectory is added there ("<GENOME>/<GENE_PREFIX>*"), so no
# genome substitution belongs in this path.  (The previous `.format(GENOME)`
# call had no placeholder to fill and was a no-op.)
CHOPCHOP_OUT_DIR = "../all_transcripts_run/"

# Keep outputs of the 'ignore one call' variant apart from the default run.
if IGNORE_ONE_CALL_VARIANTS:
    SETNAME = SETNAME+'_ignore-one-call'

MIN_GC = 30 # minimum allowable GC percentage (inclusive)
MAX_GC = 70 # maximum allowable GC percentage (inclusive)
MAX_OFFTARGET_HITS = 0 # maximum allowable total offtarget hits

AWK_EXEC = '/usr/bin/awk'  # awk binary used to merge the chopchop output files

In [6]:
# Ordered so the summary is displayed and written to JSON in insertion order.
results_dict = OrderedDict()  # to collect results for final display

## Open the callset and possibly set `CHROMS` to all
Note: the callset isn't actually used until near the end, where we look at variants.

In [7]:
# Open the zarr callset read-only.
callset = zarr.open_group(ZVCF, mode='r')
# CHROMS = None means: analyse every top-level group of the callset.
if CHROMS is None:
    CHROMS = [*callset.keys()]

## get the (optional) subset of samples to use

In [8]:
# Build `meta`: one row per sample to analyse, indexed by sample name, with
# `callset_idx` giving the sample's column position in the callset genotypes.

# Sample names as stored in the callset (read once, from the first group).
all_callset_samples = list(callset.values())[0]['samples'][:].tolist()

if SAMPLE_LIST_FN is None: # use all samples
    # index by sample name so `meta` has the same layout as in the other branch
    meta = pd.DataFrame({'callset_idx': range(len(all_callset_samples))},
                        index=pd.Index(all_callset_samples))
else:
    # load the sample list and make meta dataframe
    if isinstance(SAMPLE_LIST_FN, str): # assume a simple list file
        meta = pd.read_csv(SAMPLE_LIST_FN, comment='#', header=None, index_col=0)
    else: # parse ag1000g samples.meta.txt with SAMPLE_LIST_FN really being [M|S,filename]
        meta = pd.read_csv(SAMPLE_LIST_FN[1], delimiter='\t', comment='#', index_col=0)
        meta = meta[meta['m_s']==SAMPLE_LIST_FN[0]]

    # name -> position lookup; iterating in reverse keeps the FIRST occurrence
    # of a repeated name, matching what list.index() returned before
    sample_to_idx = {name: idx for idx, name
                     in reversed(list(enumerate(all_callset_samples)))}
    missing = [name for name in meta.index if name not in sample_to_idx]
    if missing:
        raise ValueError("{} requested sample(s) not found in callset {}: {}".format(
            len(missing), ZVCF, missing[:10]))
    meta['callset_idx'] = [sample_to_idx[name] for name in meta.index]
meta.index.name = 'sample'

# Optionally subsample to check sample size effects
# (no random_state is given, so the selection differs between runs)
if SUBSAMPLE_N:
    meta = meta.sample(n=SUBSAMPLE_N)

# ensure it is sorted by callset_idx (otherwise makes later steps terribly inefficient)
meta = meta.sort_values('callset_idx')

print("number of samples",meta.shape[0])
results_dict['number of samples'] = meta.shape[0]
meta

number of samples 111


Unnamed: 0_level_0,callset_idx
sample,Unnamed: 1_level_1
06BANA0008,0
06BANA0010,1
06BANA0012,2
06BANA0013,3
06BANA0016,4
06BANA0035,5
06DONE0022,6
06DONE0044,7
06DONE0045,8
06DONE0046,9


## Load the transcript list

In [9]:
# One transcript id per line, e.g. "AGAP004677-RB".
tlist = pd.read_csv(TRANSFN, header=None, names=['transcript_id'])
# Split on the LAST '-' only: left part is the gene, right part the splice id.
id_parts = tlist['transcript_id'].str.rsplit('-', n=1, expand=True)
id_parts.columns = ['gene', 'splice_id']
t = id_parts
tlist = pd.concat([tlist, id_parts], axis=1)
tlist

Unnamed: 0,transcript_id,gene,splice_id
0,AGAP004677-RB,AGAP004677,RB
1,AGAP004677-RA,AGAP004677,RA
2,AGAP004678-RA,AGAP004678,RA
3,AGAP004679-RB,AGAP004679,RB
4,AGAP004679-RA,AGAP004679,RA
5,AGAP004680-RA,AGAP004680,RA
6,AGAP004681-RA,AGAP004681,RA
7,AGAP004682-RA,AGAP004682,RA
8,AGAP028431-RA,AGAP028431,RA
9,AGAP004683-RA,AGAP004683,RA


### We want "transcribed parts (transcripts) of protein-coding genes"
What we're actually doing is including targets hitting a transcript which contains a CDS

In [10]:
def _gtf_attributes(attributes):
    """Parse a GTF attributes string into a ``{key: value}`` dict.

    The string is split on ';', each segment on its first run of whitespace
    into key and value, and surrounding double quotes are stripped from the
    value.  Empty or whitespace-only segments (e.g. after the final ';') are
    skipped, and values containing spaces are kept whole.  If a key occurs
    more than once the last value wins.  A ';' inside a quoted value is not
    handled.
    """
    parsed = {}
    for segment in attributes.split(';'):
        parts = segment.strip().split(None, 1)
        if not parts:
            continue
        parsed[parts[0]] = parts[1].strip('"') if len(parts) > 1 else ''
    return parsed


# Read the annotation (standard 9-column GTF, no header; '#' lines skipped).
d = pd.read_csv(GTFFN, sep='\t', comment='#', header=None,
            names=['seqid',
                   'source',
                   'type',
                   'start',
                   'end',
                   'score',
                   'strand',
                   'phase',
                   'attributes'],
                dtype={'seqid':str})  # keep chromosome names such as '1' as strings

# total number of genes
t = d.loc[d['type']=='gene' ,:].copy(deep=True)
t['gene_id'] = [_gtf_attributes(a)['gene_id'] for a in t['attributes']]
print("total number of genes in gtf:", t['gene_id'].unique().shape[0])
results_dict['total genes'] = t['gene_id'].unique().shape[0]

# list of CDS (copy so that adding columns does not write into a slice of `d`)
d = d.loc[d['type']=='CDS' ,:].copy()
cds_attrs = [_gtf_attributes(a) for a in d['attributes']]
d['gene_id'] = [a['gene_id'] for a in cds_attrs]
d['transcript_id'] = [a['transcript_id'] for a in cds_attrs]
# filter to only CHROMS
d = d.loc[d['seqid'].isin(CHROMS) ,:]
cdslist = d

# transcripts should be a superset of the transcripts that have a CDS
unknown_cds_transcripts = set(cdslist['transcript_id'])-set(tlist['transcript_id'])
assert len(unknown_cds_transcripts) == 0, (
    "CDS transcripts missing from the transcript list, e.g. {}".format(
        sorted(unknown_cds_transcripts)[:5]))
tcdslist = set(tlist['transcript_id']) & set(cdslist['transcript_id'])
tcdsdf = tlist[tlist['transcript_id'].isin(tcdslist)]
print('# trascripts with CDS:', len(tcdslist))
results_dict['coding transcripts'] = len(tcdslist)

num_coding_genes = tcdsdf['gene'].unique().shape[0]
print('# genes with a transcript with a CDS:', num_coding_genes)
results_dict['coding genes'] = num_coding_genes

total number of genes in gtf: 13822
# trascripts with CDS: 14488
# genes with a transcript with a CDS: 12562


## Load the chopchop targets

In [None]:
# Use awk to merge all the chopchop output files and stream the result into
# pandas.read_csv.  The awk program prints the header of the first file once
# (with an extra "transcript_id" column) and, for every non-header line, the
# line followed by the name of the file it came from.
cmd = (AWK_EXEC+" -F "+r"'\t' "
"'BEGIN{OFS=FS}{if(FNR==1){if(NR==FNR){print $0, "+'"transcript_id"'+"}}else{print $0, FILENAME}}' "+
"{}/{}*".format(GENOME, GENE_PREFIX))

print(cmd)
# shell=True is needed so the shell expands the file glob; cmd is built only
# from the constants defined in the settings cell.
with subprocess.Popen(cmd, cwd=CHOPCHOP_OUT_DIR, shell=True, stdout=subprocess.PIPE) as p:
    d_orig = pd.read_csv(p.stdout, sep='\t', index_col=0)
# Leaving the `with` block waits for awk to finish; a non-zero exit status
# means the merged table may be incomplete, so fail loudly instead of going on.
if p.returncode != 0:
    raise subprocess.CalledProcessError(p.returncode, cmd)

In [None]:
# Work on a copy so the full chopchop table (d_orig) stays untouched;
# the old index becomes an ordinary column again.
d = d_orig.copy(deep=True).reset_index()
# The awk step stored the file path in transcript_id: keep the raw value
# and reduce transcript_id to the bare file name.
raw_ids = d['transcript_id']
d['transcrpit_id_old'] = raw_ids
d['transcript_id'] = raw_ids.map(os.path.basename)

## Filter chopchop targets `d`

In [None]:
# Keep only targets whose transcript contains a CDS.
in_coding_transcript = d['transcript_id'].isin(tcdsdf['transcript_id'])
d = d[in_coding_transcript]
results_dict['initial target sites in coding transcripts'] = d.shape[0]
print(results_dict['initial target sites in coding transcripts'],
      "initial target sites in coding transcripts")

# number of distinct genomic locations; divisor for the filter percentages
nuts = len(d['Genomic location'].unique())
results_dict['unique location target sites'] = nuts

print(nuts, 'unique genomic locations')
print(len(d['Target sequence'].unique()), 'unique target sequences')

In [None]:
# Total off-target hits per target: sum of the MM0..MM3 columns.  Entries
# that are not plain numbers (the '>=555' sort) count as 999.
mm_counts = d[["MM0","MM1","MM2","MM3"]].apply(pd.to_numeric, errors='coerce')
d["sumMM"] = mm_counts.fillna(999).sum(axis=1)

In [None]:
# GC filter: MIN_GC <= GC% <= MAX_GC (both ends inclusive)
gc_flt = d["GC content (%)"].between(MIN_GC, MAX_GC)
cnt = len(d.loc[gc_flt,'Genomic location'].unique())
pct = 100*cnt/nuts
print("unique pass GC filter {} ({}%)".format(cnt, pct))
results_dict['unique targets pass GC filter'] = cnt
results_dict['unique targets pass GC filter %'] = pct

In [None]:
# off-target filter: total mismatch hits must not exceed MAX_OFFTARGET_HITS
offtarget_flt = ~(d["sumMM"] > MAX_OFFTARGET_HITS) & d["sumMM"].notna()
cnt = len(d.loc[offtarget_flt,'Genomic location'].unique())
pct = 100*cnt/nuts
print("unique pass off-target {} ({}%)".format(cnt, pct))
results_dict['unique targets pass off-target filter'] = cnt
results_dict['unique targets pass off-target filter %'] = pct

In [None]:
# unique locations passing the GC and the off-target filter together
passes_both = gc_flt & offtarget_flt
cnt = len(d.loc[passes_both,'Genomic location'].unique())
pct = 100*cnt/nuts
print("unique pass both {} ({}%)".format(cnt, pct))
results_dict['unique targets pass filters'] = cnt
results_dict['unique targets pass filters %'] = pct

In [None]:
# number of distinct transcripts before filtering (for the report below)
tmp = d['transcript_id'].unique().shape[0]

# actually apply filters; take a copy so the new column below is added to an
# independent frame rather than to a slice of the unfiltered table
d = d.loc[(gc_flt & offtarget_flt),:].copy()

# add a gene column: transcript id up to its last '-'
# (`n` is passed by keyword, as in the transcript-list cell)
d['gene'] = d['transcript_id'].str.rsplit('-', n=1).str.get(0)

print("{} passing targets (including duplicate sites)".format(d.shape[0]))
print("hitting {} of {} unique transcript_ids left after flitering {}".format(
        d['transcript_id'].unique().shape[0],
        tmp,
        d['transcript_id'].unique().shape[0]/tmp))
# the filtered target table used by everything below
chopchop_targets = d

In [None]:
# display the results collected so far
results_dict

In [None]:
# number of distinct genomic locations among the filtered targets
chopchop_targets['Genomic location'].unique().shape[0]

### Info on unique target+gene

In [None]:
# Info on unique target+gene
# `t` is indexed by genomic location and keeps only the target sequence and gene.
t = chopchop_targets.set_index('Genomic location').loc[:,('Target sequence','gene')]

# Rows whose location was already seen (index duplicated) but whose
# (Target sequence, gene) values were not: `t.duplicated()` compares the
# column values only, not the index.
single_target_different_genes = t[t.index.duplicated() & ~t.duplicated()]
print(single_target_different_genes.shape[0], "targets hitting more than one gene")
print(single_target_different_genes['gene'].unique().shape[0], "genes affected")

# filter to targets which are unique or hit different genes
# (drops repeated (Target sequence, gene) rows, keeping the first)
t = t[~t.duplicated(keep='first')]
print(t.shape[0], 'unique target+gene combinations')

# number of genes with 0 targets
# NOTE: total_num_genes counts every gene in the transcript list (tlist),
# not only the genes that have a coding transcript.
total_num_genes = tlist['gene'].unique().shape[0]
num_0_target_genes = total_num_genes - t['gene'].unique().shape[0]
print(num_0_target_genes, "genes of", total_num_genes, "with no targets")

# sanity check: after removing the extra rows for targets that hit more than
# one gene, one row per unique genomic location should remain
assert t.shape[0]-single_target_different_genes.shape[0] == chopchop_targets['Genomic location'].unique().shape[0]

# results_dict['coding genes with potential target'] = t['gene'].unique().shape[0]
# results_dict['coding genes with potential target %'] = (100*
#     results_dict['coding genes with potential target']/results_dict['coding genes'])

In [None]:
# quick look at the filtered target table: first rows, then (rows, columns)
display(chopchop_targets.head())
display(chopchop_targets.shape)

## @TCC Exploring efficiency scores

In [None]:
# Distribution of the 'Efficiency' score over the filtered targets:
# proportion of targets at each distinct score value.
x = chopchop_targets
y = x['Efficiency'].value_counts().sort_index()
proportion = y/y.sum()

fig,ax = mpl.pyplot.subplots(1)
ax.plot(y.index, proportion)
ax.set(xlabel='Xu et al. 2015 efficiency',
       ylabel='proportion of targets')

In [None]:
# how many targets score above 0.5, followed by summary statistics
y = x['Efficiency']
above_half = y > .5
print(y[above_half].count())
print(y.describe())


## search vcf for target sites

In [None]:
# All target sequences are expected to have the same length; that length is
# the window searched for variants below.  Verify the assumption instead of
# trusting the first row, since a mixed-length table would silently give
# wrong windows.
target_lengths = chopchop_targets['Target sequence'].str.len().unique()
if len(target_lengths) != 1:
    raise ValueError("expected a single target sequence length, found: {}".format(
        sorted(target_lengths)))
TARGET_LEN = int(target_lengths[0])

### get variant positions from the callset (zvcf)
Note: callset is opened way up top in case we need to get `CHROMS` from it

In [None]:
# Per chromosome: positions of the variants kept (vpos_dict) and the
# reference-allele frequency of each of them (refAF_dict), computed over the
# samples listed in `meta`.
refAF_dict = OrderedDict()
vpos_dict = OrderedDict()
results_dict['vcf variants'] = 0  # all records in the callset, summed over CHROMS
results_dict['variants'] = 0      # records kept after the filters below

for chrom in CHROMS:
    print("chrom:", chrom)
    
    ## Get the positions of variants on this chrom (possible subsetting samples)
    pos = allel.SortedIndex(callset[chrom]['variants/POS'])
    # keep every variant (None) but only the sample columns listed in meta
    g = allel.GenotypeDaskArray(callset[chrom]['calldata/GT']).subset(None, meta['callset_idx'].values)
    n_variants_in = g.shape[0]
    
    # ensure number of samples is same
    assert meta.shape[0] == g.shape[1]
    
    # count the calls of each allele per variant (computed eagerly)
    ac = g.count_alleles().compute()
    # only variant alleles matter, so only take those
    flt_var = ac.is_variant()
    print('variant filter passes:', flt_var.sum())
    
    if IGNORE_ONE_CALL_VARIANTS:
        # additionally require more than one non-reference allele call
        flt_one_call = ac[:,1:].sum(axis=1)>1
        print('ignore-one-call variant filter passes:', flt_one_call.sum())
        flt_var = (flt_var & flt_one_call)
    
    ac = ac.compress(flt_var, axis=0)
    n_variants_var = ac.shape[0]

    # NOTE(review): fill=1 presumably covers rows without any called allele — confirm
    refAF_dict[chrom] = ac.to_frequencies(fill=1)[:,0] # ref should be allele index 0
    vpos_dict[chrom] = pos.compress(flt_var, axis=0)[:] # `[:]` to loads into memory
    
    print("variants : {} of {} = {:.3f}%".format(n_variants_var, n_variants_in,
                                                 100*n_variants_var/n_variants_in))
    results_dict['vcf variants'] += n_variants_in
    results_dict['variants'] += n_variants_var
        
print('total variants :', results_dict['variants'])

In [11]:
# tcdsdf.to_msgpack(GENOME+"_tcdsdf.msgpack")

### Compare targets vs variants
#### grouping by **gene**

In [None]:
# Filtered chopchop targets, one row per (location, gene), sorted by chrom,pos.
d_filt = chopchop_targets.copy(deep=True)

# keep only the first row of each (Genomic location, gene) combination
first_rows = d_filt[['Genomic location', 'gene']].drop_duplicates(keep='first').index
d_filt = d_filt.loc[first_rows]

# 'Genomic location' looks like "<chrom>:<pos>": split it into two columns
d_filt[['chrom','pos']] = d_filt['Genomic location'].str.split(':', n=1, expand=True)
d_filt['pos'] = d_filt['pos'].astype(int)
d_filt.sort_values(by=['chrom', 'pos'], inplace=True)

display(d_filt.head())
n_unique_locations = chopchop_targets['Genomic location'].unique().shape[0]
print("location by gene unique combinations:",d_filt.shape[0])
print("unique target locations:",n_unique_locations)

In [None]:
# Annotate every (location, gene) target with the variants that fall inside
# the window [pos, pos+TARGET_LEN).
tout = d_filt.copy(deep=True)
tout['vpos'] = np.nan # position of each variant
tout['nv'] = np.nan # number of variants
tout['refAF'] = np.nan # frequency of reference allele of each variant
tout['p_ref'] = np.nan # probability of ref sequence for whole target
# rows on chromosomes that are not in CHROMS keep these NaN placeholders

for chrom in CHROMS:
    print("chrom:", chrom)
    vpos = vpos_dict[chrom]
    
    # target positions (target list is filtered above)
    tpos = d_filt[d_filt['chrom']==chrom]['pos'].copy(deep=True)
    print("unique (location,gene) targets:", tpos.shape)

    # find target positions in the list of variant positions
    # p1: first variant at or after the target start
    # p2: first variant at or after start+TARGET_LEN (so p1:p2 is the window)
    p1 = np.searchsorted(vpos, tpos.values, side='left')
    p2 = np.searchsorted(vpos, tpos.values+TARGET_LEN, side='left')
    
    # list of actual variant positions in each target
    tout.loc[tpos.index,'vpos'] = [vpos[_p1:_p2].values for _p1,_p2 in zip(p1,p2)]
    # count of variants in each target
    tout.loc[tpos.index,'nv'] = (p2-p1)
    # refAF of each variant in each target
    tout.loc[tpos.index,'refAF'] = [refAF_dict[chrom][_p1:_p2] for _p1,_p2 in zip(p1,p2)]
    # the probability a target locus will be entirely reference (perfect match)
    # assumes each variant hitting that target is independent
    # (initial=1 makes the product 1 for targets without any variant)
    tout.loc[tpos.index,'p_ref'] = tout.loc[tpos.index,'refAF'].apply(lambda x: np.prod(x, initial=1))

In [None]:
### Save tout to a file so we can use it to generate a nice figure
# The file is named <SETNAME>_tout.msgpack, relative to the working directory.
# NOTE(review): to_msgpack is believed to be unavailable in newer pandas
# releases — confirm before upgrading from the pandas version used here.
tout.to_msgpack(SETNAME+"_tout.msgpack")

## @TCC EVERYTHING BELOW SHOULD REALLY BE DONE BY 'RESULTS' SCRIPT
Need to have saved
* tout
* number of coding genes (currently saved in results_dict below)
* total number of unique targets (currently saved in results_dict below)

### relationship between variant freq and num genes with a good target

In [None]:
# tout = pd.read_msgpack(SETNAME+'_tout.msgpack')

In [None]:
# ---- per target: proportion of targets whose p_ref is at least x ----
key = 'Genomic location'
t = (tout[[key,'chrom','pos','nv','p_ref']]
     .copy(deep=True)
     .set_index(key, drop=True)
     .drop_duplicates()      # compares the columns only, not the index
     .reset_index())

tx = np.sort(t['p_ref'])
tnfixed = (tx>=1).sum()      # targets that are entirely reference
print('target nfixed', tnfixed)
tx = tx[tx<1]
n_targets_total = tnfixed+len(tx)
ty = (n_targets_total-np.arange(len(tx)))/n_targets_total

# ---- per gene: best (highest p_ref) target of each gene ----
t = tout[['gene','chrom','pos','nv','p_ref']].copy(deep=True).set_index('gene', drop=True)
print(t.shape)
t = t.drop_duplicates().reset_index()
print(t.shape)

gb = t[['gene','p_ref']].groupby('gene').apply(lambda grp: list(grp['p_ref']))

good_by_freq = gb.apply(max)
num_coding_genes = tcdsdf['gene'].unique().shape[0]

x = np.sort(good_by_freq)
nfixed = (x>=1).sum()        # genes with at least one entirely-reference target
print('nfixed',nfixed)
x = x[x<1]
n_genes_with_value = nfixed+len(x)
y = (n_genes_with_value-np.arange(len(x)))/num_coding_genes # <= x

# ---- figure ----
fig,ax = mpl.pyplot.subplots(1,1, figsize=(4,3))

point_style = dict(ls='none', marker='.', ms=3)
ax.plot(x,y, c='C0', label='genes', **point_style)
ax.plot(tx,ty, c='C1', label='targets', **point_style)

ax.set(xlim=(.95,1), ylim=(0,1),
       xlabel='required non-variant frequency',
       ylabel='proportion')
ax.legend()
fig.tight_layout()

In [None]:
# scratch series; it is not referenced by the expression below
tmp = pd.Series([2,2,3,4,5,6,6,6,6,7])
# 45% quantile of the per-target p_ref values that are below 1
pd.Series(tx).quantile(.45)

In [None]:
# number of per-target p_ref values that are below 1
tx.shape

In [None]:
# Summary of the (location, gene) annotated targets.  A "good" target has no
# variant inside its window (nv == 0).
t = tout[['gene','transcript_id','chrom','pos','nv','p_ref','Genomic location']]

# ---- by target ----
num_potential_targets = t.shape[0]
print('num potential targets:', num_potential_targets)

good_targets = t[t['nv']==0]
n_good_targets = good_targets.shape[0]
print('num good targets:', n_good_targets)
print("% good targets: {:d}/{:d} = {:0.3f}%".format(
    n_good_targets, num_potential_targets, 100*n_good_targets/num_potential_targets))

# ---- by unique target (genomic location) ----
num_potential_unique_targets = t['Genomic location'].unique().shape[0]
print('num potential unique targets:', num_potential_unique_targets)

num_good_unique_targets = good_targets['Genomic location'].unique().shape[0]
print('num good unique targets:', num_good_unique_targets)
print("% good uniuqe targets: {:d}/{:d} = {:0.3f}%".format(
    num_good_unique_targets, num_potential_unique_targets,
    100*num_good_unique_targets/num_potential_unique_targets))

n_filtered_unique_sites = chopchop_targets['Genomic location'].unique().shape[0]
print("% good uniuqe targets of unique target sites: {:d}/{:d} = {:0.3f}%".format(
    num_good_unique_targets, n_filtered_unique_sites,
    100*num_good_unique_targets/n_filtered_unique_sites))

results_dict['potential unique targets'] = num_potential_unique_targets
results_dict['good unique targets'] = num_good_unique_targets
results_dict['good unique targets % of total unique'] = (
    100*num_good_unique_targets/results_dict['unique location target sites'])
results_dict['good unique targets % of potential'] = (
    100*num_good_unique_targets/num_potential_unique_targets)

# ---- by transcript ----
# Not computed here: duplicates were removed on (genomic location, gene), not
# on transcript_id, so per-transcript counts from this table would be wrong.

# ---- by gene ----
num_coding_genes = tcdsdf['gene'].unique().shape[0]
num_potential_genes = t['gene'].unique().shape[0]
num_good_genes = good_targets['gene'].unique().shape[0]
pct_potential_genes = 100*num_potential_genes/num_coding_genes
pct_good_genes = 100*num_good_genes/num_coding_genes
print("num coding genes", num_coding_genes)
print("% genes w/ potential uniuqe targets: {:d}/{:d} = {:0.3f}%".format(
    num_potential_genes, num_coding_genes, pct_potential_genes))
print("% genes w/ good uniuqe targets: {:d}/{:d} = {:0.3f}%".format(
    num_good_genes, num_coding_genes, pct_good_genes))

results_dict['coding genes w/ potential targets'] = num_potential_genes
results_dict['coding genes w/ potential targets %'] = pct_potential_genes
results_dict['coding genes w/ good targets'] = num_good_genes
results_dict['coding genes w/ good targets %'] = pct_good_genes

In [None]:
# display the results collected so far
results_dict

#### Rerunning with grouping by **transcript** to get good targets per transcript

In [None]:
# sort the filtered list of chopchop targets by chrom,pos (from Genomic location)
d_filt = chopchop_targets.copy(deep=True)
# keep only unique location, transcript combinations...
d_filt = d_filt.loc[d_filt[['Genomic location', 'transcript_id']].drop_duplicates(keep='first').index]
# add chrom and pos columns ('Genomic location' is "<chrom>:<pos>")
d_filt[['chrom','pos']] = d_filt['Genomic location'].str.split(':', n=1, expand=True)
d_filt['pos'] = d_filt['pos'].astype(int)
d_filt.sort_values(['chrom', 'pos'], inplace=True)

# the label now names the key actually used for de-duplication (transcript)
print("location by transcript unique combinations:",d_filt.shape[0])
print("unique target locations:",chopchop_targets['Genomic location'].unique().shape[0])

In [None]:
# Annotate every (location, transcript) target with the variants that fall
# inside the window [pos, pos+TARGET_LEN).
tout = d_filt.copy(deep=True)
tout['vpos'] = np.nan # position of each variant
tout['nv'] = np.nan # number of variants
tout['refAF'] = np.nan # frequency of reference allele of each variant
tout['p_ref'] = np.nan # probability of ref sequence for whole target

for chrom in CHROMS:
    print("chrom:", chrom)
    vpos = vpos_dict[chrom]
    
    # target positions (target list is filtered above)
    tpos = d_filt[d_filt['chrom']==chrom]['pos'].copy(deep=True)
    print("unique (location,transcript) targets:", tpos.shape)

    # find target positions in the list of variant positions
    # (p1:p2 spans the variants with pos <= variant position < pos+TARGET_LEN)
    p1 = np.searchsorted(vpos, tpos.values, side='left')
    p2 = np.searchsorted(vpos, tpos.values+TARGET_LEN, side='left')
    
    # list of actual variant positions in each target
    tout.loc[tpos.index,'vpos'] = [vpos[_p1:_p2] for _p1,_p2 in zip(p1,p2)]
    # count of variants in each target
    tout.loc[tpos.index,'nv'] = (p2-p1)
    # refAF of each variant in each target
    tout.loc[tpos.index,'refAF'] = [refAF_dict[chrom][_p1:_p2] for _p1,_p2 in zip(p1,p2)]
    # the probability a target locus will be entirely reference (perfect match)
    # assumes each variant hitting that target is independent
    tout.loc[tpos.index,'p_ref'] = tout.loc[tpos.index,'refAF'].apply(lambda x: np.prod(x, initial=1))

# summary statistics
display(tout['nv'].value_counts())
display(tout['nv'].describe())
# The value is the share of targets with nv == 0, i.e. targets WITHOUT any
# variant; the label used to claim the opposite.
print('targets containing no variant loci {:0.2f}%'.format(100*(tout['nv']==0).sum()/tout['nv'].count()))

In [None]:
# Summary of the (location, transcript) annotated targets.  A "good" target
# has no variant inside its window (nv == 0).
t = tout[['gene','transcript_id','chrom','pos','nv','Genomic location']]

# ---- by target (should be same as per-gene calculations) ----
num_potential_targets = t.shape[0]
print('num potential targets:', num_potential_targets)

good_targets = t[t['nv']==0]
n_good_targets = good_targets.shape[0]
print('num good targets:', n_good_targets)
print("% good targets: {:d}/{:d} = {:0.3f}%".format(
    n_good_targets, num_potential_targets, 100*n_good_targets/num_potential_targets))

# ---- by unique target (should be same as per-gene calculations) ----
num_potential_unique_targets = t['Genomic location'].unique().shape[0]
print('num potential unique targets:', num_potential_unique_targets)

num_good_unique_targets = good_targets['Genomic location'].unique().shape[0]
print('num good unique targets:', num_good_unique_targets)
print("% good uniuqe targets: {:d}/{:d} = {:0.3f}%".format(
    num_good_unique_targets, num_potential_unique_targets,
    100*num_good_unique_targets/num_potential_unique_targets))

n_filtered_unique_sites = chopchop_targets['Genomic location'].unique().shape[0]
print("% good uniuqe targets of unique target sites: {:d}/{:d} = {:0.3f}%".format(
    num_good_unique_targets, n_filtered_unique_sites,
    100*num_good_unique_targets/n_filtered_unique_sites))

# ---- by transcript ----
num_total_utcds = tcdsdf['transcript_id'].unique().shape[0]
num_potential_utcds = t['transcript_id'].unique().shape[0]
num_good_utcds = good_targets['transcript_id'].unique().shape[0]
pct_potential_utcds = 100*num_potential_utcds/num_total_utcds
pct_good_utcds = 100*num_good_utcds/num_total_utcds
print("num coding transcripts", num_total_utcds)
print("% transcripts w/ potential uniuqe targets: {:d}/{:d} = {:0.2f}%".format(
    num_potential_utcds, num_total_utcds, pct_potential_utcds))
print("% transcripts w/ good uniuqe targets: {:d}/{:d} = {:0.2f}%".format(
    num_good_utcds, num_total_utcds, pct_good_utcds))

results_dict['coding transcripts w/ potential targets'] = num_potential_utcds
results_dict['coding transcripts w/ potential targets %'] = pct_potential_utcds
results_dict['coding transcripts w/ good targets'] = num_good_utcds
results_dict['coding transcripts w/ good targets %'] = pct_good_utcds

# ---- by gene ----
# Printed only, not stored: duplicates were removed on (location, transcript),
# not on gene, so these numbers are not necessarily correct.
num_coding_genes = tcdsdf['gene'].unique().shape[0]
num_potential_genes = t['gene'].unique().shape[0]
num_good_genes = good_targets['gene'].unique().shape[0]
print("num coding genes", num_coding_genes)
print("% genes w/ potential uniuqe targets: {:d}/{:d} = {:0.3f}%".format(
    num_potential_genes, num_coding_genes,
    100*num_potential_genes/num_coding_genes))
print("% genes w/ good uniuqe targets: {:d}/{:d} = {:0.3f}%".format(
    num_good_genes, num_coding_genes,
    100*num_good_genes/num_coding_genes))

In [None]:
# display the final collected results
results_dict

In [None]:
# Write the results as a JSON list of [key, value] pairs so the insertion
# order of results_dict is kept in the file.
results_path = SETNAME+'_results.json'
ordered_pairs = list(results_dict.items())
with open(results_path,'w') as fh:
    json.dump(ordered_pairs, fh)

In [None]:
# with open(SETNAME+'_results.json','r') as fh:
#     foo = OrderedDict(json.load(fh))
# foo