In [1]:
import pandas as pd
import numpy as np
import csv
import os
from os import path
import weblogo
import seqlogo
%matplotlib inline


In [2]:
# Gene annotation table (tab-separated); later cells read its AccNum, GeneName,
# fpUTR_length and tpUTR_length columns.
filteredGeneList = pd.read_csv('../../Database/filteredGenesDetails_human_240118.txt', sep='\t')
# Lookup table: accession number -> gene name (a later duplicate accession wins).
acc2gene = dict(zip(filteredGeneList['AccNum'], filteredGeneList['GeneName']))

In [3]:
def get_fa(filename, onlyKeys='All'):
    """Read a FASTA-style file into a ``{header: sequence}`` dictionary.

    A record starts at a line beginning with ``>``; the rest of that line is
    the key. All following lines up to the next ``>`` are concatenated (with
    newlines removed) to form the value.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    onlyKeys : 'All' or iterable of str
        ``'All'`` (default) keeps every record; otherwise only records whose
        header is contained in ``onlyKeys`` are kept.

    Returns
    -------
    dict
        Header -> sequence. Records with an empty header and any text before
        the first ``>`` are dropped. If a header occurs more than once, the
        last occurrence wins.
    """
    # The isinstance guard keeps the sentinel comparison safe for array-like
    # key collections, whose ``==`` is element-wise.
    keep_all = isinstance(onlyKeys, str) and onlyKeys == 'All'
    wanted = None if keep_all else set(onlyKeys)

    records = dict()

    def store(header, chunks):
        # An empty header marks "no current record" (file preamble or bare '>').
        if header != '' and (wanted is None or header in wanted):
            records[header] = ''.join(chunks)

    header, chunks = '', []
    # The context manager closes the file even if parsing raises.
    with open(filename) as handle:
        for line in handle:
            if line.startswith('>'):
                store(header, chunks)
                header, chunks = line[1:].rstrip('\n'), []
            else:
                chunks.append(line.rstrip('\n'))
    store(header, chunks)
    return records

In [4]:
# Spliced transcript sequences, keyed by the full FASTA header.
splicedGenes = get_fa('../../Database/splicedGenes_240118.txt')
## truncate the keys to only accNum
# Build a new dict instead of renaming in place: with the in-place
# "assign then pop" approach, a header that has no '_chr' suffix maps to
# itself and the pop() then deletes the record altogether.
splicedGenes = {key.split('_chr')[0]: seq for key, seq in splicedGenes.items()}
# Spliced ORF sequences; keys are used as-is.
splicedORFs = get_fa('../../Database/splicedORFs_240118.txt')

In [5]:
# Per-accession sequence tables derived from the spliced transcripts:
#   fpUTR - leading  `fpUTR_length` bases of the transcript
#   tpUTR - trailing `tpUTR_length` bases of the transcript
#   ORFs  - the spliced ORF sequence
fpUTR = {}
tpUTR = {}
ORFs = {}
for idx, row in filteredGeneList.iterrows():
    accNum = row['AccNum']
    fpLen = row['fpUTR_length']
    tpLen = row['tpUTR_length']
    transcript = splicedGenes[accNum]
    fpUTR[accNum] = transcript[0:fpLen]
    # transcript[-0:] is the *whole* transcript, so a zero-length trailing UTR
    # has to be handled explicitly to get an empty string.
    tpUTR[accNum] = transcript[-tpLen:] if tpLen > 0 else ''
    ORFs[accNum] = splicedORFs[accNum]

In [6]:
class Sequences:
    """A collection of nucleotide windows together with its position matrices.

    Attributes (recomputed by ``__init__`` and ``addseqs``):
        seqs: list of sequence strings (a private copy of the input).
        pfm:  ``(seqLen, 4)`` array of raw A, C, G, T counts per position.
        ppm:  ``pfm`` with every row normalised to sum to 1.
        flat: ``ppm`` flattened to one dimension.
    """

    @staticmethod
    def genes2seq(genes):
        """Extract the 16-base window ``fp[-35:-19]`` from each gene's 5'UTR.

        Reads the module-level ``fpUTR`` dict. Genes whose entry is shorter
        than 40 bases are skipped, so the result can be shorter than
        ``genes``. Prints the number of windows kept and returns them as a
        list of strings.
        """
        seqs = []
        for gene in genes:
            fp = fpUTR[gene]
            if len(fp) < 40:
                continue
            # position -35 to position -20, counted from the end of the UTR
            seqs.append(fp[-35:-19])
        print('# of valid seqs: ', len(seqs))
        return seqs

    def __init__(self, seqs):
        # Copy the input: addseqs() extends self.seqs in place and must not
        # modify the list object owned by the caller.
        self.seqs = list(seqs)
        self.pfm, self.ppm = self.comp_count()
        self.flat = self.ppm2flat()

    def addseqs(self, seqs):
        """Append ``seqs`` to this collection and recompute all matrices."""
        self.seqs.extend(seqs)
        self.pfm, self.ppm = self.comp_count()
        self.flat = self.ppm2flat()

    def __len__(self):
        """Number of sequences in the collection."""
        return len(self.seqs)

    def comp_count(self, seqLen=16):
        """Count A/C/G/T at each of the first ``seqLen`` positions.

        Returns ``(pfm, ppm)``. Positions not covered by the sequences (an
        empty collection, or sequences shorter than ``seqLen``) keep a count
        of zero instead of raising IndexError. Only upper-case A, C, G and T
        are counted.
        """
        raw_count = np.zeros((seqLen, 4))
        # Transpose: one string per position, holding that position's bases.
        # zip() stops at the shortest sequence.
        columns = [''.join(col) for col in zip(*self.seqs)]
        for i, column in enumerate(columns[:seqLen]):
            for j, base in enumerate('ACGT'):
                raw_count[i][j] = column.count(base)
        return raw_count, self.pfm2ppm(raw_count)

    ## Note: ppm here is the same as the so-called pwm in R;
    ## the exact meaning of 'pwm' varies across the literature.
    def pfm2ppm(self, pfm):
        """Normalise each row of ``pfm`` to sum to 1.

        Rows whose total is zero are returned as all zeros rather than NaN.
        """
        sum_of_rows = pfm.sum(axis=1)[:, np.newaxis]
        return np.divide(
            pfm,
            sum_of_rows,
            out=np.zeros_like(pfm, dtype=float),
            where=sum_of_rows != 0,
        )

    def plotlogo(self):
        """Render a sequence logo of ``self.pfm`` with ``seqlogo``.

        Returns whatever ``seqlogo.seqlogo`` returns for ``format='png'``;
        callers in this notebook write its ``.data`` attribute to a file.
        """
        pfm_pd = pd.DataFrame(self.pfm)
        ppm_pd = seqlogo.pfm2ppm(pfm_pd)
        ppm = seqlogo.Ppm(ppm_pd)
        plt = seqlogo.seqlogo(ppm, ic_scale=True, format='png', size='large')
        return plt

    def ppm2flat(self):
        """Return ``self.ppm`` reshaped to one dimension."""
        return self.ppm.reshape(-1)

    def bootstrap(self, num):    # with replacement
        """Draw ``num`` sequences with replacement into a new Sequences."""
        sample_seqs = np.random.choice(self.seqs, size=num)
        return Sequences(sample_seqs.tolist())

    def sample(self, num):  # without replacement
        """Draw ``num`` sequences without replacement into a new Sequences."""
        sample_seqs = np.random.choice(self.seqs, size=num, replace=False)
        return Sequences(sample_seqs.tolist())

    def __gt__(self, seq2):
        # Always True: keeps tuples that contain Sequences objects sortable
        # when their leading elements tie. It defines no meaningful ordering.
        return True

In [None]:
from scipy.stats import chisquare
import seaborn as sns
class TestSeq:
    """Resampling test comparing random subsets of ``test`` with ``target``.

    ``target`` and ``test`` must provide ``__len__``, a ``flat`` array of
    per-position base frequencies and (for ``test``) ``sample(n)``.

    Attributes:
        resample: number of random subsets drawn.
        target, test: the objects passed to the constructor.
        ps:   chi-square p-values of the subsets, sorted ascending.
        seqs: the sampled subsets, in the same order as ``ps``.
    """

    def __init__(self, target, test, resample=10000):
        self.resample = resample
        self.target = target
        self.test = test
        self.ps, self.seqs = self.createTest(target, test, resample)

    @classmethod
    def createTest(cls, target, test, resample=10000):
        """Draw ``resample`` subsets of ``test`` and score each against ``target``.

        Every subset has ``len(target)`` sequences and is drawn with
        ``test.sample``. Returns ``(ps, seqs)`` sorted by ascending p-value.
        """
        n_target = len(target)
        expected = target.flat * n_target

        # Each position contributes 4 frequencies but only 4 - 1 = 3 free ones.
        # scipy computes dof = k - 1 - ddof, so for k = 64 frequencies
        # (16 positions): dof = 3 * 16 = 48 and ddof = 64 - 1 - 48 = 15.
        n_freqs = target.flat.size
        n_positions = n_freqs // 4
        ddof = n_freqs - 1 - 3 * n_positions

        scored = []
        for _ in range(resample):
            seq = test.sample(n_target)
            _, p = chisquare(seq.flat * n_target, f_exp=expected, ddof=ddof)
            scored.append((p, seq))

        # Sort on the p-value only, so the sampled objects never have to be
        # comparable with each other; ties keep their draw order.
        scored.sort(key=lambda pair: pair[0])
        ps = [p for p, _ in scored]
        seqs = [seq for _, seq in scored]
        return ps, seqs

    def dist(self):
        """Plot the distribution of p-values and return the axes."""
        return sns.distplot(self.ps)

    def logdist(self):
        """Plot the distribution of log2 p-values and return the axes."""
        return sns.distplot(np.log2(self.ps))

In [7]:
def pfm2ppm_df(pfm):
    """Convert a position frequency matrix into a seqlogo PPM for export.

    ``pfm`` is wrapped in a DataFrame, converted with ``seqlogo.pfm2ppm``,
    wrapped in ``seqlogo.Ppm``, and the ``.T`` attribute of that object is
    returned.
    """
    return seqlogo.Ppm(seqlogo.pfm2ppm(pd.DataFrame(pfm))).T

In [8]:
# Label used by later cells to build output directory and file names.
batch = 'peak_27'

# Top-gene table; only its 'AccNum' column is used in the visible cells.
top21 = pd.read_csv('../../AUG meta analysis/old/t2normed/topGenesAtUTR.csv')
accNums = top21['AccNum']
# Quick check: show the first accession number.
print(accNums[0])

NM_000518


In [9]:
# DESeq2 result table for one comparison; its first column supplies the background gene set.
deseq = pd.read_csv('../../Deseq2 Analysis/shift15/coding/csv_raw/'+'ER_L24_t1'+'.csv')
# 'Unnamed: 0' is the name pandas gives a column whose header cell is empty.
# Presumably it holds accession numbers, since it is later passed to
# Sequences.genes2seq() — TODO confirm.
background = deseq['Unnamed: 0']
#deseq_filtered = deseq.loc[deseq['padj']<0.05]
#up_num = sum(deseq_filtered['log2FoldChange'] > 1)


In [10]:
# Extract the 5'UTR windows first, then build the count/probability matrices:
# once for the top genes and once for the background genes.
target_windows = Sequences.genes2seq(accNums)
target_seq = Sequences(target_windows)
bg_windows = Sequences.genes2seq(background)
bg_seq = Sequences(bg_windows)

# of valid seqs:  21
# of valid seqs:  7504


In [None]:
# Write the target windows as FASTA, one record per gene, named by gene name.
#
# Sequences.genes2seq() drops genes whose 5'UTR is shorter than 40 bases, so
# target_seq.seqs can be shorter than accNums. Indexing both with the same
# counter would pair names with the wrong windows (or raise IndexError), so
# apply the same filter to the accession list before pairing.
seqs = target_seq.seqs
kept_accs = [acc for acc in accNums if len(fpUTR[acc]) >= 40]
assert len(kept_accs) == len(seqs), "target_seq is out of sync with accNums"
with open('AUG_t2_peak/top21genes.fa','w') as f:
    for acc, seq in zip(kept_accs, seqs):
        f.write(">%s\n"%(acc2gene[acc]))
        f.write("%s\n"%(seq))

In [None]:
# Export the position probability matrices as CSV, one file per sequence set,
# under "deseq_test/pwms/<batch>/".
# NOTE(review): nothing in the visible cells creates that directory — confirm it exists.
ppm_out = pfm2ppm_df(target_seq.pfm)
ppm_out.to_csv("deseq_test/pwms/"+batch+"/"+batch+"_enriched"+".csv")
# NOTE(review): `dep_seq` is not defined in any cell visible here, so on a fresh
# kernel the next line raises NameError after the "enriched" file is written.
# TODO: restore the cell that builds it, or drop the "depleted" export.
ppm_out = pfm2ppm_df(dep_seq.pfm)
ppm_out.to_csv("deseq_test/pwms/"+batch+"/"+batch+"_depleted"+".csv")
ppm_out = pfm2ppm_df(bg_seq.pfm)
ppm_out.to_csv("deseq_test/pwms/"+batch+"/"+"bg"+".csv")

In [None]:
# Display the target set's position probability matrix
# (one row per position; columns are A, C, G, T).
target_seq.ppm

In [None]:
# Seed NumPy's global RNG so the random subsets drawn below are repeatable,
# then resample the background against the target set.
np.random.seed(42)
test = TestSeq(target=target_seq, test=bg_seq)

In [None]:
# NOTE(review): `t0` is not defined in any cell visible here, so this cell raises
# NameError on a fresh kernel — TODO restore its definition or remove the cell.
ax=t0.logdist()
# Save the figure that owns the returned axes.
ax.get_figure().savefig("deseq_wo_replace/"+batch+"/"+batch+"_self_hist_log.png")

In [None]:
# Plot the distribution of log2 p-values from the resampling test.
# `ax` is read again by the next cell to locate the peak of the plotted curve.
ax=test.logdist()
print(type(ax))
#ax.get_figure().savefig("deseq_wo_replace/"+batch+"/"+batch+"_test_hist.png")

In [None]:
from scipy import stats
x = ax.lines[0].get_xdata() # Get the x data of the distribution
y = ax.lines[0].get_ydata() # Get the y data of the distribution
maxid = np.argmax(y) # The id of the peak (maximum of y data)
mode = x[maxid]
# logdist() plots log2(p), so `mode` is in log2 units. Use the same base here;
# with np.log the later comparisons (mode, mode +/- std against logp) would
# mix log2 and natural-log values.
logp = np.log2(test.ps)
median = np.median(logp)
std = np.std(logp)
print(mode,median,std)

In [None]:
# Ranks (insertion points into the ascending logp array) of the mode and of
# one standard deviation to either side of it.
left, mid, right = np.searchsorted(logp, [mode-std, mode, mode+std])
print(left,mid,right)

In [None]:
!pwd
def disp_save_img(idx, pos, num=50):
    """Pool ``num`` resampled sets around rank ``idx`` and export their PPM.

    Parameters
    ----------
    idx : int
        Rank into ``test.seqs`` (sorted by ascending p-value) on which the
        window of pooled sets is centred.
    pos : str
        Label inserted into the output file name.
    num : int, optional
        Number of neighbouring sets to pool (default 50).

    The window normally covers ranks ``idx - num//2`` to ``idx + num//2 - 1``.
    Near either end of ``test.seqs`` it is shifted to stay in range, instead
    of wrapping around through negative indices or raising IndexError.
    Writes
    ``deseq_test/pwms/<batch>/<batch>_<pos>_comb<num>.csv``.
    """
    total = len(test.seqs)
    start = min(max(idx - num // 2, 0), max(total - num, 0))

    # Collect into a fresh list and build a new Sequences object. Calling
    # addseqs() on an element of test.seqs would permanently enlarge that
    # element and contaminate later calls whose windows overlap.
    pooled = []
    for sampled in test.seqs[start:start + num]:
        pooled.extend(sampled.seqs)
    com_seq = Sequences(pooled)

    # The logo is still rendered, as before; it is neither shown nor saved.
    com_seq.plotlogo()

    ppm_out = pfm2ppm_df(com_seq.pfm)
    ppm_out.to_csv("deseq_test/pwms/"+batch+"/"+batch+'_'+pos+'_comb'+str(num)+".csv")

# Export pooled PPMs around the mode and one standard deviation to either side.
for rank, label in ((mid, 'median'), (left, 'leftstd'), (right, 'rightstd')):
    disp_save_img(rank, label)

In [None]:
# Render each sequence set as a logo and write the image bytes (img.data)
# to "deseq_wo_replace/<batch>/".
img=target_seq.plotlogo()
with open("deseq_wo_replace/"+batch+"/enriched"+".png", "wb") as png:
    png.write(img.data)
img=bg_seq.plotlogo()
with open("deseq_wo_replace/"+batch+"/background"+".png", "wb") as png:
    png.write(img.data)
# NOTE(review): `dep_seq` is not defined in any cell visible here, so on a fresh
# kernel the next line raises NameError after the first two images are written.
# TODO: restore the cell that builds it, or drop the "depleted" logo.
img=dep_seq.plotlogo()
with open("deseq_wo_replace/"+batch+"/depleted"+".png", "wb") as png:
    png.write(img.data)

In [None]:
# Render the top-gene and background sets as logos and save the PNG bytes.
for seq_set, png_path in ((target_seq, "AUG_t2_peak/topGenes"+".png"),
                          (bg_seq, "AUG_t2_peak/bg"+".png")):
    img = seq_set.plotlogo()
    with open(png_path, "wb") as png:
        png.write(img.data)

# Normalized by t2 read values

In [11]:
# Sample sheet; the preview below shows the columns Barcodes, Sets, Types,
# Timepoint, Position and FileName.
# NOTE(review): `fid` is not referenced again in the visible cells.
fid = pd.read_csv('../../RBC_fid_table.csv')
fid.head(2)

Unnamed: 0,Barcodes,Sets,Types,Timepoint,Position,FileName
0,ATCACG,8,S24,0,ATCACG,ATCACG-s_5_1
1,TTAGGC,8,S24,1,TTAGGC,TTAGGC-s_5_1


In [13]:
# Per-accession peak counts for one sample file. The file is read without a
# header row, so the two columns are named explicitly.
peak_count = pd.read_csv('../../AUG meta analysis/old/t2normed/peakUTR_geneList_ACTTGA-s_7_1.txt',header=None)
peak_count.columns=['AccNum','Count']
# Preview the first two rows.
peak_count.head(2)

Unnamed: 0,AccNum,Count
0,NM_017582,0.0
1,NM_014372,0.0


In [54]:
# Repeat every top-gene accession once per unit of its peak count (truncated
# to an integer), so each gene is weighted by its count. The count is taken
# from the first peak_count row that matches the accession.
all_accs = [
    acc
    for acc in accNums
    for _ in range(int(peak_count.loc[peak_count['AccNum']==acc,'Count'].values[0]))
]

In [55]:
# Build the count-weighted sequence set: one 5'UTR window per repeated accession.
weighted_windows = Sequences.genes2seq(all_accs)
all_seqs = Sequences(weighted_windows)

# of valid seqs:  67395


In [57]:
# Render the count-weighted sequence set as a logo and write the image bytes to disk.
img=all_seqs.plotlogo()
with open("AUG_t2_peak/weighted_logo.png", "wb") as png:
    png.write(img.data)