In [1]:
import random
from Bio import SeqUtils
from fish_helpers import *
from scipy.signal import lfilter
import tqdm.notebook as tqdm
from Bio import SeqIO
from scipy import sparse
from multiprocessing import Pool
from functools import partial
import sys

In [2]:
"""
Parse Transcriptome
transcript identifier (tid) (index)
gene identifier (gene)
sequence (seq)
Convert seq to int (intseq)
for each seed length region:
find each identity(hash)
find if it contains N bases(isvalid)
"""
resourcePath = '/bigstore/binfo/mouse/'
rawTranscriptomeFasta = os.path.join(resourcePath,'mer_transcripts.fa')

tids = []
gids = []
seqs = []
intSeqs = []
seed_hashs = []
probe_len = 30
seedLength = 17
nt2int = {'A':0,'C':1,'G':2,'T':3,'N':np.nan}
seedhashBase = [4**i for i in np.linspace(seedLength-1,0,seedLength)]
with open(rawTranscriptomeFasta) as fasta_file:  # Will close handle cleanly
    for seq_record in tqdm.tqdm(SeqIO.parse(fasta_file, 'fasta')):  # (generator)
        tid,gid = seq_record.description.split(' ')
        gid = gid.split('=')[-1]
        seq = str(seq_record.seq)
        if len(seq)>probe_len:
            intSeq = np.array([nt2int[i] for i in seq])
            seed_hash = list(lfilter(seedhashBase,1,intSeq)[seedLength-1:-1].astype(int))
            tids.append(tid)
            gids.append(gid)
            seqs.append(seq)
            intSeqs.append(intSeq)
            seed_hashs.append(seed_hash)
transcriptome = pd.DataFrame(index=tids)
transcriptome['gene'] = gids
transcriptome['seq'] = seqs
transcriptome['intseq'] = intSeqs
transcriptome['hash'] = seed_hashs
transcriptome.head()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Unnamed: 0,gene,seq,intseq,hash
ENSMUST00000193812,4933401J01Rik,AAGGAAAGAGGATAACACTTGAAATGTAAATAAAGAAAATACCTAA...,"[0, 0, 2, 2, 0, 0, 0, 2, 0, 2, 2, 0, 3, 0, 0, ...","[1126727840, 4576649256, 14029064202, 16392167..."
ENSMUST00000082908,Gm26206,GTGCTTGCTTCGGCAACACATACACTAAATTTTGAACGATACAGAG...,"[2, 3, 2, 1, 3, 3, 2, 1, 3, 3, 1, 2, 2, 1, 0, ...","[4406079342, 1101519835, 4570347254, 114258681..."
ENSMUST00000162897,Xkr4,GCACACTACGGTCCATCTCCAACAACCGCAGTGTTGCCAGTGACCG...,"[2, 1, 0, 1, 0, 1, 3, 0, 1, 2, 2, 3, 1, 1, 0, ...","[7615362118, 14788742417, 7992152900, 62930055..."
ENSMUST00000159265,Xkr4,TTAGTTAAGAGCACTGACTGCTCTTGCAAAGGACCCAGGCTTGAGT...,"[3, 3, 0, 2, 3, 3, 0, 0, 2, 0, 2, 1, 0, 1, 3, ...","[3026325391, 5051548643, 14147789048, 12126881..."
ENSMUST00000070533,Xkr4,GCGGCGGCGGGCGAGCGGGCGCTGGAGTAGGAGCTGGGGAGCGGCG...,"[2, 1, 2, 2, 1, 2, 2, 1, 2, 2, 2, 1, 2, 0, 2, ...","[10241075622, 11150203497, 11377485466, 713933..."


In [3]:
"""
Load Annotation Data
"""
annotation = pd.read_csv('/bigstore/binfo/mouse/mart_export_25Nov2019.txt',sep='\t')
annotation.index = list(annotation['Transcript stable ID'])
"""
Filter Transcriptome to tsl<5
"""
tids = list(transcriptome.index)
TSL = []
for tsl in annotation['Transcript support level (TSL)']:
    try:
        TSL.append(tsl.split(' (')[0])
    except:
        TSL.append('tslNA')
keepers = np.unique(TSL)
keepers = [i for i in keepers if not i=='tslNA']
keepers = [i for i in keepers if not i=='tsl5']
mask = []
for tsl in TSL:
    if tsl in keepers:
        mask.append(True)
    else:
        mask.append(False)
TSL_tids = list(annotation[mask].index)
filtered_tids = list(set(tids).intersection(TSL_tids))
transcriptome = transcriptome.loc[filtered_tids]
len(transcriptome)

74535

In [4]:
# """
# calcualte GC and Tm for all possible probes
# find all possible seeds (17 bp regions)
# mask probes based on Tm and GC
# mask seeds based on masked probes
# """
# def calculateProbeMetrics(row,probe_len=30,
#                           monovalentSalt=0.3,
#                           seedLength=17,
#                           probeConc=5e-9,
#                           nt2int = {'A':0,'C':1,'G':2,'T':3,'N':np.nan},
#                             HT = [0,-7.6,-8.4,-7.8,-7.2,
#                                   -8.5,-8.0,-10.6,-7.8,
#                                   -8.2,-9.8,-8.0,-8.4,
#                                   -7.2,-8.2,-8.5,-7.6],
#                             ST = [0,-21.3,-22.4,-21.0,-20.4,
#                                   -22.7,-19.9,-27.2,-21.0,
#                                   -22.2,-24.4,-19.9,-22.4,
#                                   -21.3,-22.2,-22.7,-21.3],
#                           tm_max = 76,
#                           tm_min = 66,
#                           GC_max = 63/100,
#                           GC_min = 43/100):

#     hashBase = [4**i for i in np.linspace(seedLength-1,0,seedLength)]
#     tid = row[0]
#     gene = row[1]['gene']
#     seq = row[1]['seq']
#     length = len(seq)
#     if length>probe_len:
#         # Convert sequence to integers {'A':0,'C':1,'G':2,'T':3,'N':np.nan}
#         intSeq = np.array([nt2int[i] for i in seq])
#         nnID = (4*intSeq[:-1] + intSeq[1:])+1
#         nnID = np.nan_to_num(nnID).astype(int)
#         # Calculate Free energy
#         dG = np.zeros([2,len(seq)-1])
#         dG[0,:] = np.array([HT[i] for i in nnID])
#         dG[1,:] = np.array([ST[i] for i in nnID])
#         # Calculate Entropy and Enthalpy
#         H = lfilter(np.ones([probe_len-1]),1,dG[0,:])[probe_len-1:]
#         S = lfilter(np.ones([probe_len-1]),1,dG[1,:])[probe_len-1:]
#         fivePrimeAT = (1*(intSeq==0)+1*(intSeq==3))[:-(probe_len)]
#         threePrimeAT = (1*(intSeq==0)+1*(intSeq==3))[probe_len:]
#         H = H+0.2+(2.2*fivePrimeAT)+(2.2*threePrimeAT)
#         S = S-5.7+(6.9*fivePrimeAT)+(6.9*threePrimeAT)
#         S = S + 0.368*(probe_len-1)*np.log(monovalentSalt)
#         # Calcuate Melting Temp for 30 bp probes
#         Tm = (H*1000)/(S+1.9872*np.log(probeConc))-273.15
#         # Calculate GC content for 30 bp probes
#         gc = 1*((intSeq==1)|(intSeq==2))
#         gc = lfilter(np.ones([probe_len-1])/probe_len,1,gc)[probe_len:]
#         # convert all possible seedLength seq to unique number (hash)
#         h = lfilter(hashBase,1,intSeq)[seedLength:]
#         # mask seeds that belong to only failed probes
#         p_mask = 1*(((1*(Tm>tm_max))+(1*(Tm<tm_min))+(1*(gc>GC_max))+(1*(gc<GC_min)))==0)
#         h_mask = 1*(np.array([np.sum(p_mask[i:i+seedLength]) for i in range(len(h))])>0)
# #         validBase = np.ones(seedLength)
# #         isvalid = 1*(lfilter(validBase,1,1*(intSeq<4))[seedLength:]==1)
#         values = [gene,seq,length,intSeq,Tm,gc,h,p_mask,h_mask]
#         out = pd.DataFrame(values,index=['gene','seq','length','intseq','Tm','GC','hash','p_mask','h_mask'],columns=[tid]).T
#         return out
#     else:
#         print(tid,'Failed')
#         return False
# def pfunc(partial_transcriptome):
#     Output = []
#     for row in partial_transcriptome.iterrows():
#         if isinstance(row,tuple):
#             Output.append(calculateProbeMetrics(row))
#     return Output
# Input = []
# s = 300
# indexes = list(filtered_transcriptome.index)
# step = int(len(indexes)/s)+1
# for i in range(s):
#     partial_transcriptome = filtered_transcriptome.iloc[i*step:(i+1)*step]
#     Input.append(partial_transcriptome)
# sys.stdout.flush()
# Output = []
# with Pool(30) as p:
#     for out in tqdm.tqdm(p.imap(pfunc,Input),total=len(Input)):
#         Output.extend(out)
# sys.stdout.flush()
# new_transcriptome = pd.concat(Output)

HBox(children=(IntProgress(value=0, max=300), HTML(value='')))




In [4]:
"""
Load Expression Data
"""
Brain = pd.read_csv('/bigstore/binfo/mouse/Hippocampus/medians.csv',index_col=0)
expression = []
empty_genes = []
empty = Brain.iloc[0]
for i,gene in tqdm.tqdm(enumerate(transcriptome.gene),total=len(transcriptome)):
    try:
        expression.append(pd.DataFrame(Brain.loc[gene]).T)
    except:
        empty.name = gene
        empty_genes.append(gene)
        expression.append(pd.DataFrame(empty).T)
        continue
cell_type_expression = pd.concat(expression)
median_expression = cell_type_expression.median(axis=1)
transcriptome['expression'] = list(median_expression)
transcriptome['expression_vector'] = [np.ones(len(h)) for h in transcriptome['hash']]
transcriptome['expression_vector'] = transcriptome['expression_vector']*transcriptome['expression']
Brain.head()
del Brain
del cell_type_expression
del median_expression

HBox(children=(IntProgress(value=0, max=74535), HTML(value='')))




In [7]:
"""
Build Target Vector and mask
Limit target transcriptome to just protien coding
Filter by expression?
Filter by genes without enough probes
"""
count_cutoff = 3
tids = list(transcriptome.index)
mRNA_tids = list(annotation[annotation['Transcript type']=='protein_coding'].index)
filtered_tids = list(set(tids).intersection(mRNA_tids))
target_transcriptome = transcriptome.loc[filtered_tids]
target_transcriptome = target_transcriptome[target_transcriptome['expression']>count_cutoff]
target_vector = np.concatenate(target_transcriptome['hash'])
target_expression_vector = np.concatenate(target_transcriptome['expression_vector'])
len(target_transcriptome)

15976

In [8]:
"""
Probes for each gene
May want to preserve location and isoform
"""
def generate_probe_df(gene_transcriptome,
                    probe_len=30,
                    monovalentSalt=0.3,
                    seedLength=17,
                    probeConc=5e-9,
                    nt2int = {'A':0,'C':1,'G':2,'T':3,'N':np.nan},
                    HT = [0,-7.6,-8.4,-7.8,-7.2,
                          -8.5,-8.0,-10.6,-7.8,
                          -8.2,-9.8,-8.0,-8.4,
                          -7.2,-8.2,-8.5,-7.6],
                    ST = [0,-21.3,-22.4,-21.0,-20.4,
                          -22.7,-19.9,-27.2,-21.0,
                          -22.2,-24.4,-19.9,-22.4,
                          -21.3,-22.2,-22.7,-21.3],
                    tm_max = 76,
                    tm_min = 66,
                    GC_max = 63/100,
                    GC_min = 43/100):
    probes = []
    probes_mask = []
    GC = []
    TM = []
    probe_hash = []
    probes_intseq = []
    seed_hash = []
    hashBase = [4**i for i in np.linspace(probe_len-1,0,probe_len)]
    seedhashBase = [4**i for i in np.linspace(seedLength-1,0,seedLength)]
    for idx,row in gene_transcriptome.iterrows():
        seq = row['seq']
        intSeq = row['intseq']#np.array([nt2int[i] for i in seq])
        h = lfilter(hashBase,1,intSeq)[probe_len-1:-1].astype(int)
        sh = row['hash']#list(lfilter(seedhashBase,1,intSeq)[seedLength-1:-1].astype(int))
        probe_sh = [sh[i:i+30] for i in range(len(h))]
        probe_seq = [seq[i:i+30] for i in range(len(h))]
        probe_intseq = [list(intSeq[i:i+30]) for i in range(len(h))]
        nnID = (4*intSeq[:-1] + intSeq[1:])+1 # convert seq to pairs
        nnID = np.nan_to_num(nnID).astype(int) # N to 0
        # Calculate Free energy
        dG = np.zeros([2,len(seq)-1])
        dG[0,:] = np.array([HT[i] for i in nnID])
        dG[1,:] = np.array([ST[i] for i in nnID])
        # Calculate Entropy and Enthalpy
        H = lfilter(np.ones([probe_len]),1,dG[0,:])[probe_len-1:]
        S = lfilter(np.ones([probe_len]),1,dG[1,:])[probe_len-1:]
        fivePrimeAT = (1*(intSeq==0)+1*(intSeq==3))[:-(probe_len)]
        threePrimeAT = (1*(intSeq==0)+1*(intSeq==3))[probe_len:]
        H = H+0.2+(2.2*fivePrimeAT)+(2.2*threePrimeAT)
        S = S-5.7+(6.9*fivePrimeAT)+(6.9*threePrimeAT)
        S = S + 0.368*(probe_len-1)*np.log(monovalentSalt)
        # Calcuate Melting Temp for 30 bp probes
        tm = (H*1000)/(S+1.9872*np.log(probeConc))-273.15
        # Calculate GC content for 30 bp probes
        gc = 1*((intSeq==1)|(intSeq==2))
        gc = lfilter(np.ones([probe_len])/probe_len,1,gc)[probe_len-1:-1]
        p_mask = 1*(((1*(tm>tm_max))+(1*(tm<tm_min))+(1*(gc>GC_max))+(1*(gc<GC_min)))==0)
        probes.extend(probe_seq)
        probes_intseq.extend(list(probe_intseq))
        probes_mask.extend(list(p_mask))
        GC.extend(list(gc))
        TM.extend(list(tm))
        probe_hash.extend(list(h))
        seed_hash.extend(list(probe_sh))
    gene_probe_df = pd.DataFrame()
    gene_probe_df['seq'] = probes
    gene_probe_df['gene'] = row['gene']
    gene_probe_df['expression'] = row['expression']
    gene_probe_df['intseq'] = probes_intseq
    gene_probe_df['mask'] = probes_mask
    gene_probe_df['GC'] = GC
    gene_probe_df['TM'] = TM
    gene_probe_df['hash'] = probe_hash
    gene_probe_df['seed_hash'] = seed_hash
    iso_vector = np.array(gene_probe_df['hash'])
    iso_score = [len(np.where(iso_vector==h)[0]) for h in iso_vector]
    gene_probe_df['iso_score'] = iso_score
    gene_probe_df = gene_probe_df[gene_probe_df['mask']==1]
    gene_probe_df = gene_probe_df.drop_duplicates('hash')
    return gene_probe_df
Input = []
for gene in tqdm.tqdm(target_transcriptome.gene.unique(),desc='Generating Input',leave=False):
    gene_transcriptome = target_transcriptome[target_transcriptome.gene==gene].copy()
    Input.append(gene_transcriptome)
sys.stdout.flush()
gene_probe_dict = {}
with Pool(30) as p:
    for gene_probe_df in tqdm.tqdm(p.imap(generate_probe_df,Input),total=len(Input),desc='Outer'):
        gene = gene_probe_df.gene.unique()[0]
        gene_probe_dict[gene] = gene_probe_df
sys.stdout.flush()

HBox(children=(IntProgress(value=0, description='Generating Input', max=6592, style=ProgressStyle(description_…



HBox(children=(IntProgress(value=0, description='Outer', max=6592, style=ProgressStyle(description_width='init…




In [21]:
gene_probe_dict[gene]

Unnamed: 0,seq,gene,intseq,mask,GC,TM,hash,seed_hash,iso_score,expression
175,GCCGTGGACCCTGCGGAATTCTTCGTGTTG,Mrps26,"[2, 1, 1, 2, 3, 2, 2, 0, 1, 1, 1, 3, 2, 1, 2, ...",1,0.600000,75.895096,859763051418954624,"[2798988182, 699747045, 13059838649, 161498615...",1,6.638988
176,CCGTGGACCCTGCGGAATTCTTCGTGTTGA,Mrps26,"[1, 1, 2, 3, 2, 2, 0, 1, 1, 1, 3, 2, 1, 2, 2, ...",1,0.566667,74.636759,214940762854738656,"[699747045, 13059838649, 16149861550, 83324326...",1,6.638988
177,CGTGGACCCTGCGGAATTCTTCGTGTTGAC,Mrps26,"[1, 2, 3, 2, 2, 0, 1, 1, 1, 3, 2, 1, 2, 2, 0, ...",1,0.566667,74.636759,341965566865396416,"[13059838649, 16149861550, 8332432683, 1496801...",1,6.638988
178,GTGGACCCTGCGGAATTCTTCGTGTTGACC,Mrps26,"[2, 3, 2, 2, 0, 1, 1, 1, 3, 2, 1, 2, 2, 0, 0, ...",1,0.566667,74.636759,373721767868060864,"[16149861550, 8332432683, 14968010058, 1662690...",1,6.638988
179,TGGACCCTGCGGAATTCTTCGTGTTGACCG,Mrps26,"[3, 2, 2, 0, 1, 1, 1, 3, 2, 1, 2, 2, 0, 0, 3, ...",1,0.566667,75.015956,669891194270438656,"[8332432683, 14968010058, 16626904402, 8451693...",1,6.638988
180,GGACCCTGCGGAATTCTTCGTGTTGACCGA,Mrps26,"[2, 2, 0, 1, 1, 1, 3, 2, 1, 2, 2, 0, 0, 3, 3, ...",1,0.566667,74.305235,167472798567609664,"[14968010058, 16626904402, 8451693396, 1070285...",1,6.638988
181,GACCCTGCGGAATTCTTCGTGTTGACCGAG,Mrps26,"[2, 0, 1, 1, 1, 3, 2, 1, 2, 2, 0, 0, 3, 3, 1, ...",1,0.566667,74.630911,618328951945325952,"[16626904402, 8451693396, 10702857941, 1556061...",1,6.638988
182,ACCCTGCGGAATTCTTCGTGTTGACCGAGC,Mrps26,"[0, 1, 1, 1, 3, 2, 1, 2, 2, 0, 0, 3, 3, 1, 3, ...",1,0.566667,75.819623,442812614138043264,"[8451693396, 10702857941, 15560616373, 1248008...",1,6.638988
184,CCTGCGGAATTCTTCGTGTTGACCGAGCGC,Mrps26,"[1, 1, 3, 2, 1, 2, 2, 0, 0, 3, 3, 1, 3, 3, 1, ...",1,0.600000,75.982149,460021352611195328,"[15560616373, 12480088685, 16004924059, 168861...",1,6.638988
185,CTGCGGAATTCTTCGTGTTGACCGAGCGCT,Mrps26,"[1, 3, 2, 1, 2, 2, 0, 0, 3, 3, 1, 3, 3, 1, 2, ...",1,0.566667,74.213375,979696466607934080,"[12480088685, 16004924059, 16886132902, 128114...",1,6.638988


In [65]:
"""
Build Off Target Vector
Filter by expression?
"""
start = time.time()
count_cutoff = 3
background_transcriptome = transcriptome[transcriptome['expression']>count_cutoff]
background_vector = np.concatenate(background_transcriptome['hash'])
background_expression_vector = np.concatenate(background_transcriptome['expression_vector'])
background_array = np.concatenate((background_vector[:,None],background_expression_vector[:,None]),axis=1)
hash_score_lookup = {}
for h in tqdm.tqdm(np.unique(background_vector),desc='Initializing dict',leave=False):
    hash_score_lookup[h] = 0
for h,c in tqdm.tqdm(background_array,desc='Calculating Scores'):
     hash_score_lookup[h] = hash_score_lookup[h]+c
total = time.time()-start
print(total)

HBox(children=(IntProgress(value=0, max=31836901), HTML(value='')))




HBox(children=(IntProgress(value=0, max=58697188), HTML(value='')))


1326.621963262558


In [6]:
# """
# build gene index lookup
# """
# gene_locations = {}
# for gene in background_transcriptome['gene'].unique():
#     gene_locations[gene] = []
# start = 0
# for idx,row in tqdm.tqdm(background_transcriptome.iterrows(),total=len(background_transcriptome)):
#     gene = row['gene']
#     nh = len(row['hash'])
#     stop = start + nh
#     gene_locations[gene].extend(list(range(start,stop)))
#     start = stop

HBox(children=(IntProgress(value=0, max=31160), HTML(value='')))




In [74]:
def generateHashLookup(partial_transcriptome):
    vector = np.concatenate(list(partial_transcriptome['hash']))
    expression_vector = np.concatenate(list(partial_transcriptome['expression_vector']))
    array = np.concatenate((vector[:,None],expression_vector[:,None]),axis=1)
    lookup = {}
    for h in np.unique(vector):
        lookup[h] = 0
    for h,c in array:
         lookup[h] =  lookup[h]+c
    return partial_transcriptome.gene.iloc[0],lookup
Input = []
for gene in tqdm.tqdm(target_transcriptome.gene.unique(),desc='Input Generator'):
    Input.append(target_transcriptome[target_transcriptome.gene==gene])
sys.stdout.flush()
gene_hash_score_lookup_dict = {}
with Pool(30) as p:
    for gene,lookup in tqdm.tqdm(p.imap(generateHashLookup,Input),total=len(Input),desc='Outer'):
        gene_hash_score_lookup_dict[gene] = lookup
sys.stdout.flush()

HBox(children=(IntProgress(value=0, description='Input Generator', max=6592, style=ProgressStyle(description_w…




HBox(children=(IntProgress(value=0, description='Outer', max=6592, style=ProgressStyle(description_width='init…




In [67]:
hash_score_lookup[h]

13.54768315167306

In [68]:
np.sum(background_array[background_array[:,0]==h,1])

13.54768315167306

In [66]:
sys.getsizeofeof(hash_score_lookup)

1342177376

In [None]:
isoform_vector = np.concatenateatenate(list(gene_probe_df['seed_hash']))

In [84]:
"""
Find Homology Scores
"""
def calcuateOffTargetScores(gene_probe_df,hash_score_lookup,iso_score_lookup):
    gene = gene_probe_df.gene.iloc[0]
    iso_score_lookup = iso_score_lookup[gene]
    background_scores = []
    iso_scores = []
    for seed_hash in gene_probe_df['seed_hash']:
        background  = []
        iso  = []
        for sh in seed_hash:
            Is = iso_score_lookup[sh]
            background.append(hash_score_lookup[sh]-Is)
            iso.append(Is)
        background_scores.append(background)
        iso_scores.append(iso)
    gene_probe_df['background_scores'] = background_scores
    gene_probe_df['iso_scores'] = iso_scores
    return gene_probe_df
# print(gene)
# gene_probe_df = calcuateOffTargetScores(gene_probe_dict[gene],hash_score_lookup,gene_hash_score_lookup_dict[gene])
# gene_probe_df

pfunc = partial(calcuateOffTargetScores,hash_score_lookup=hash_score_lookup,iso_score_lookup=gene_hash_score_lookup_dict)
Input = [gene_probe_df for gene,gene_probe_df in gene_probe_dict.items()]
sys.stdout.flush()
gene_probe_scores_dict = {} 
with Pool(30) as p:
    for gene_probe_df in tqdm.tqdm(p.imap(pfunc,Input),total=len(Input),desc='Outer'):
        gene = gene_probe_df.gene.iloc[0]
        gene_probe_scores_dict[gene] = gene_probe_df
sys.stdout.flush()

Process ForkPoolWorker-106:
Process ForkPoolWorker-104:
Process ForkPoolWorker-100:
Process ForkPoolWorker-114:
Process ForkPoolWorker-117:
Process ForkPoolWorker-97:
Process ForkPoolWorker-118:
Process ForkPoolWorker-105:
Process ForkPoolWorker-95:
Process ForkPoolWorker-109:
Process ForkPoolWorker-107:
Process ForkPoolWorker-112:
Process ForkPoolWorker-111:
Process ForkPoolWorker-102:
Process ForkPoolWorker-99:
Process ForkPoolWorker-108:
Process ForkPoolWorker-92:
Process ForkPoolWorker-101:
Process ForkPoolWorker-115:
Process ForkPoolWorker-93:
Process ForkPoolWorker-116:
Process ForkPoolWorker-94:
Process ForkPoolWorker-91:
Process ForkPoolWorker-98:
Process ForkPoolWorker-96:
Process ForkPoolWorker-110:
Process ForkPoolWorker-113:
Process ForkPoolWorker-119:
Process ForkPoolWorker-120:
Process ForkPoolWorker-103:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call l

  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._

  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/multiprocessing/synchronize.py", line 96, in __enter__
    return self._semlock.__enter__()
  File "/home/zach/miniconda3/envs/pyspots/lib/python3.6/

KeyboardInterrupt: 

In [81]:
BSS = []
for idx,row in gene_probe_df.iterrows():
    Bs = row['background_scores']
    IS = row['iso_scores']
    BSS.append(np.array(Bs)-np.array(IS))
    

In [82]:
BSS

[array([6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778]),
 array([6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778]),
 array([6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 6.63898778, 6.63898778,
        6.63898778, 6.63898778, 6.63898778, 

In [48]:
# hash_score_lookup = {}
# for h in tqdm.tqdm(counts_df[counts_df[0]>1].index):
#     score = np.sum(background_expression_vector[background_vector==h])
#     hash_score_lookup[h] = score

HBox(children=(IntProgress(value=0, max=8888712), HTML(value='')))

KeyboardInterrupt: 

In [51]:
base_path = '/bigstore/binfo/mouse/Hippocampus/'
pickle.dump(gene_probe_dict,open(os.path.join(base_path,'gene_probe_dict.pkl'),'wb'))
pickle.dump(gene_locations,open(os.path.join(base_path,'gene_locations.pkl'),'wb'))
pickle.dump(background_vector,open(os.path.join(base_path,'background_vector.pkl'),'wb'))
pickle.dump(background_expression_vector,open(os.path.join(base_path,'background_expression_vector.pkl'),'wb'))

In [2]:
import pickle
import pandas as pd
import numpy as np
import tqdm.notebook as tqdm
from multiprocessing import Pool
from functools import partial
import os
import sys
base_path = '/bigstore/binfo/mouse/Hippocampus/'
gene_probe_dict = pickle.load(open(os.path.join(base_path,'gene_probe_dict.pkl'),'rb'))
gene_locations = pickle.load(open(os.path.join(base_path,'gene_locations.pkl'),'rb'))
background_vector = pickle.load(open(os.path.join(base_path,'background_vector.pkl'),'rb'))
background_expression_vector = pickle.load(open(os.path.join(base_path,'background_expression_vector.pkl'),'rb'))

In [3]:
"""
Find Homology Scores
"""
def calcuateOffTargetScores(gene_probe_df,gene_locations,background_vector,background_expression_vector):
    background_target_scores = {}
    isoform_target_scores = {}
    gene = gene_probe_df.gene.iloc[0]
    print(gene)
    isoform_locations = gene_locations[gene]
    seed_hash = np.unique(np.concatenate(list(gene_probe_df['seed_hash'])))
    for h in tqdm.tqdm(seed_hash,desc=gene,leave=False):
        homology_location = np.where(background_vector==h)[0]
        isoform_overlap = list(set(isoform_locations).intersection(list(homology_location)))
        isoform_counts = np.sum(background_expression_vector[isoform_overlap])
        background_target_counts = np.sum(background_expression_vector[homology_location])-isoform_counts
        background_target_scores[h] = (background_target_counts)
        isoform_target_scores[h] = (isoform_counts)
    isoform_scores = []
    background_scores = []
    for seed_hash in gene_probe_df['seed_hash']:
        isoform = []
        background  = []
        for sh in seed_hash:
            isoform.append(isoform_target_scores[sh])
            background.append(background_target_scores[sh])
        isoform_scores.append(isoform)
        background_scores.append(background)
    gene_probe_df['isoform_scores'] = isoform_scores
    gene_probe_df['background_scores'] = background_scores
    return gene_probe_df

pfunc = partial(calcuateOffTargetScores,gene_locations=gene_locations,background_vector=background_vector,background_expression_vector=background_expression_vector)
Input = [gene_probe_dict[gene] for gene in gene_probe_dict.keys()]
sys.stdout.flush()
gene_probe_scores_dict = {} 
with Pool(30) as p:
    for gene_probe_df in tqdm.tqdm(p.imap(pfunc,Input),total=len(Input),desc='Outer'):
        gene = gene_probe_df.gene.iloc[0]
        gene_probe_scores_dict[gene] = gene_probe_df
sys.stdout.flush()

HBox(children=(IntProgress(value=0, description='Outer', max=9, style=ProgressStyle(description_width='initial…

AttributeError: 'Series' object has no attribute 'gene'

In [6]:
gene_probe_dict

Unnamed: 0,seq,gene,intseq,mask,GC,TM,hash,seed_hash,iso_score
0,GAGGTGGGACTTCCGGTCTCCAGATACTTC,Tbce,"[2, 0, 2, 2, 3, 2, 2, 2, 0, 1, 3, 3, 1, 1, 2, ...",1,0.566667,72.643375,563831176120544128,"[15669177250, 8212261608, 14937967290, 8029459...",1
1,AGGTGGGACTTCCGGTCTCCAGATACTTCC,Tbce,"[0, 2, 2, 3, 2, 2, 2, 0, 1, 3, 3, 1, 1, 2, 2, ...",1,0.566667,73.866219,429188170181847808,"[8212261608, 14937967290, 8029459118, 63023320...",1
2,GGTGGGACTTCCGGTCTCCAGATACTTCCG,Tbce,"[2, 2, 3, 2, 2, 2, 0, 1, 3, 3, 1, 1, 2, 2, 3, ...",1,0.600000,74.746130,683757794848885504,"[14937967290, 8029459118, 6302332075, 15755830...",1
3,GTGGGACTTCCGGTCTCCAGATACTTCCGC,Tbce,"[2, 3, 2, 2, 2, 0, 1, 3, 3, 1, 1, 2, 2, 3, 1, ...",1,0.600000,74.746130,459169824863933120,"[8029459118, 6302332075, 1575583018, 898383034...",1
4,TGGGACTTCCGGTCTCCAGATACTTCCGCC,Tbce,"[3, 2, 2, 2, 0, 1, 3, 3, 1, 1, 2, 2, 3, 1, 3, ...",1,0.600000,75.159994,403022832367694976,"[6302332075, 1575583018, 8983830346, 224595758...",1
5,GGGACTTCCGGTCTCCAGATACTTCCGCCT,Tbce,"[2, 2, 2, 0, 1, 3, 3, 1, 1, 2, 2, 3, 1, 3, 1, ...",1,0.600000,73.711949,965446836547058944,"[1575583018, 8983830346, 2245957586, 134463912...",1
6,GGACTTCCGGTCTCCAGATACTTCCGCCTA,Tbce,"[2, 2, 0, 1, 3, 3, 1, 1, 2, 2, 3, 1, 3, 1, 1, ...",1,0.566667,72.619673,241361709136764736,"[8983830346, 2245957586, 13446391284, 33615978...",1
7,GACTTCCGGTCTCCAGATACTTCCGCCTAG,Tbce,"[2, 0, 1, 3, 3, 1, 1, 2, 2, 3, 1, 3, 1, 1, 0, ...",1,0.566667,72.038740,636801179587614720,"[2245957586, 13446391284, 3361597821, 51353667...",1
8,ACTTCCGGTCTCCAGATACTTCCGCCTAGA,Tbce,"[0, 1, 3, 3, 1, 1, 2, 2, 3, 1, 3, 1, 1, 0, 2, ...",1,0.533333,72.058959,159200294896903680,"[13446391284, 3361597821, 5135366751, 14168743...",1
9,CTTCCGGTCTCCAGATACTTCCGCCTAGAG,Tbce,"[1, 3, 3, 1, 1, 2, 2, 3, 1, 3, 1, 1, 0, 2, 0, ...",1,0.566667,72.799474,616260826027649408,"[3361597821, 5135366751, 14168743575, 16427087...",1


In [96]:
f = h5py.File('/bigstore/binfo/mouse/Hippocampus/transcriptome.zip', 'r')

OSError: Unable to open file (file signature not found)

In [95]:
list(f.keys())

[]