# Logos of UR of Hosts (process VUR of hsamp_all_segs)

## Functions 

In [5]:
#%%writefile UR_host_funcs.py 

import pandas as pd
import re
from functools import reduce
from collections import Counter

def intersect_series_values(df, func=None):
    '''Returns the intersection of the lists stored in a Series.

    In the Series input 'df', each value is a list. Every list is converted
    to a set and the function returns the elements common to all of them.

    Parameters:
        df: Series whose values are lists.
        func: unused. It is kept, now as an optional argument, so that
            existing two-argument calls keep working while the natural
            one-argument call is valid as well.

    Returns:
        A set with the elements that appear in every row. An empty set is
        returned when 'df' has no rows.
    '''
    all_sets = [set(row) for row in df]  # convert to a list of sets
    if not all_sets:
        # nothing to intersect (reducing an empty sequence would raise)
        return set()
    return set.intersection(*all_sets)
    
def reduce_series_values(df, func):
    '''Folds the values of a Series into a single result using 'func'.

    In the Series input 'df', each row is a list. Every row is turned into a
    set, and the sets are combined left to right with the two-argument
    callable 'func' (e.g., operator.and_ for the intersection over all rows).
    The result of the fold is returned.'''
    n_rows = df.shape[0]
    # one set per row, produced lazily in row order
    row_sets = (set(df.iloc[pos][:]) for pos in range(n_rows))
    return reduce(func, row_sets)

def most_series_values(df, perc=90.0):
    '''Returns the elements that appear in at least perc% of the lists.

    Parameters:
        df: Series where each value is a list. Missing (NA) values are
            ignored, both when counting the rows and when counting elements.
        perc: threshold, in percentage of the (non-NA) rows.

    Returns:
        A set with every element found in at least perc% of the lists. An
        element is counted at most once per list, even if the list repeats
        it. An empty Series yields an empty set.
    '''
    rows = df.dropna()
    thres = len(rows)*perc/100.0
    # Count, for each element, the number of rows that contain it. Counting
    # directly (instead of summing the lists first) avoids the repeated list
    # concatenation and works for an empty Series as well.
    counts = Counter(elem for row in rows for elem in set(row))
    return {k for k, v in counts.items() if v>=thres}

def rename_hdb(name):
    '''Maps a host DB name to a simpler name: 'Ensembl' becomes 'Vertebrate',
    any other name is reduced to the text after its last underscore.'''
    if name == 'Ensembl':
        return 'Vertebrate'
    # rpartition yields the whole name when there is no underscore
    return name.rpartition('_')[2]
    
    
def cluster_NTs(ur, mlen):
    '''Clusters a set of nucleotide sequences (each of length mlen) to several groups.

    The returned dictionary has the following keys:
    1. 'all': the input set 'ur',
    2. 'eql': All with same NT (e.g., AAAA),
    3. 'gc': All with just G and C,
    4. 'at': All with just A and T,
    5. 'neql': ur minus 'eql',
    6. 'ngc': ur minus 'gc',
    7. 'nat': ur minus 'at'.
    Letter case is ignored when classifying a sequence. A sequence belongs to
    'gc' ('at') only if all of its mlen letters are G/C (A/T).
    Input 'ur' must be a set.'''
    # compile once; fullmatch below requires the whole sequence to qualify
    only_gc = re.compile('[CG]{'+str(mlen)+'}')
    only_at = re.compile('[AT]{'+str(mlen)+'}')
    clst_nt = {}
    clst_nt['all'] = ur
    clst_nt['eql'] = {s for s in ur if len(set(s.upper()))==1}  # URs with the same nucleotides (e.g., AAAA)
    clst_nt['gc'] = {s for s in ur if only_gc.fullmatch(s.upper())}  # URs that contain only G and/or C
    clst_nt['at'] = {s for s in ur if only_at.fullmatch(s.upper())}  # URs that contain only A and/or T
    clst_nt['neql'] = ur - clst_nt['eql']  # all except URs with equal NTs
    clst_nt['ngc'] = ur - clst_nt['gc']    # all except URs with only G and/or C
    clst_nt['nat'] = ur - clst_nt['at']    # all except URs with only A and/or T
    return clst_nt

# Maps each host DB name to the simpler name used in the plots and prints.
nhdb = dict(
    Ensembl='Vertebrate',
    Ensembl_Bacteria='Bacteria',
    Ensembl_Fungi='Fungi',
    Ensembl_Metazoa='Metazoa',
    Ensembl_Plants='Plants',
    Ensembl_Protists='Protists',
)


Overwriting UR_host_funcs.py


In [2]:
import pandas as pd
import os, sys
import re
import myseq_logo as mysl
import pald_funcs as mypal # my palindrome functions
from Bio import motifs, SeqIO
from Bio.Seq import Seq
import matplotlib.pyplot as plt
from collections import Counter
from functools import reduce



# Processing hsamp_all_segs host VURs
# =======================================
# For every motif length in 'mlens' the corresponding results spreadsheet is
# read, and for every host name in 'tst_hnames' the URs that appear in at
# least 'perc'% of the host's rows are clustered with cluster_NTs. The cluster
# selected by 'proc_type' is then drawn as a sequence logo in a grid of axes
# (one row per host, one column per motif length).
import operator

# Users parameters
# ==============================================================================================================
mlens = (3, 4, 5)
#tst_hnames = ('Erinaceus europaeus', 'Mustela putorius furo', 'Escherichia coli')
tst_hnames = ('Bos taurus', 'Gallus gallus', 'Meleagris gallopavo', 'Ovis aries', 'Escherichia coli',
              'Sus scrofa',
              'Drosophila melanogaster', 'Homo sapiens', 'Mus musculus', 'Saccharomyces cerevisiae',
              'Bacillus cereus', 'Enterococcus faecalis', 'Mycobacterium abscessus', 'Propionibacterium acnes',
              'Mycobacterium smegmatis', 'Streptococcus pneumoniae', 'Vibrio cholerae',
              'Streptococcus thermophilus', 'Aedes aegypti',
              'Arabidopsis thaliana', 'Solanum lycopersicum')
backg = {'A':0.25,'C':0.25,'G':0.25,'T':0.25}  # background for PSSM
perc = 80 # in percentage, URs that appear at least this percentage over all rows of host name
          # that is over all samples of hosts corresponding to the different viruses
proc_type = 'neql' # allowed values here are the keys of the return parameter from cluster_NTs function
# code appends '.png' to save the figure and '.txt' to save the prints. Set to '' to disable.
# Examples: 'hst_URs_diff', 'hallsamp_URs_'+proc_type+'_'+str(perc)
base_save_file = ''
# =============================================================================================================
base_path = '/Users/yoramzarai/work/school/Simulation/Viruses/VUR/hsamp_all_segs/'
dtype_rnd = {'vtaxid':int, 'num_cds':int, 'signf_common':str, 'signf_F1':str, 'signf_F2':str, 'signf_F3':str}
mmlens = ['m'+str(m) for m in mlens]
# handle for text file or sys.stdout
ftxt = open(base_save_file+'.txt', 'wt') if base_save_file!='' else sys.stdout

try:
    fig, axs = plt.subplots(len(tst_hnames), len(mlens), figsize=(12*len(mlens), 5*len(tst_hnames)), squeeze=False)
    for j, mlen in enumerate(mlens):
        rnd_model = 'dnt_samp' if mlen==3 else 'syn_perm+dnt_samp'
        # VUR xlsx file
        xlsx_file = os.path.join(base_path, mmlens[j], rnd_model, 'Results', 'UR_allhsamp_'+rnd_model+'_'+mmlens[j]+'.xlsx')
        df_hrnd = pd.read_excel(xlsx_file, header=0, dtype=dtype_rnd)
        hnames = df_hrnd['hst_name']  # all host names
        hdbs = df_hrnd['hst_db']  # all DB names
        for i, hname in enumerate(tst_hnames):
            # all rows corresponding to this host name
            allrows = df_hrnd[hnames==hname]
            if allrows.empty:
                print(hname, 'not found !!. Moving on...')
                continue
            hdb = nhdb[hdbs[allrows.index[0]]]  # all rows have the same DB
            num_rows = allrows.shape[0]
            urs_sr = allrows.loc[:,'signf_common'].apply(lambda x:x.split('|'))  # convert to a Series with lists of URs
            hurs = reduce_series_values(urs_sr, operator.and_) # find the intersection over all rows (lists)
            mst_hurs = most_series_values(urs_sr, perc) # URs that appear perc% over all rows (lists)
            # here we use the mst_hurs as the actual host common-UR sequences
            clst_hurs = cluster_NTs(mst_hurs, mlen)

            # CHOOSE here what is proc_nt (the URs to show logos).
            # 'nan' is generated by pd.read_excel for empty cells; it is removed
            # explicitly so that it never reaches the logo, regardless of which
            # element of the set happens to come first.
            proc_nt = clst_hurs[proc_type] - {'nan'}
            print('m{}:{}:{} ({} corresponding viruses):\n\tintersect={}\n\t@least {}%={}\n\teql={}\n\tgc={}\n\tat={}'\
                  .format(mlen, hdb, hname, num_rows, hurs, perc, mst_hurs, \
                          clst_hurs['eql'], clst_hurs['gc'], clst_hurs['at']), file=ftxt)
            # generate sequence logo
            if proc_nt:
                _, _, _, m, _ = mysl.compute_pssm(proc_nt, backg, backg)
                rel_info = mysl.calc_rel_info(m, 'no')
                maxy = mysl.gen_nt_sequence_logo(axs[i,j], rel_info)
                axs[i,j].set_title(hdb+':'+hname+ ' ('+str(len(proc_nt))+' cURs, '+str(num_rows)+ ' vrs)',\
                                   fontsize=25, color='blue')
                axs[i,j].set_ylabel('Bits')
                axs[i,j].axis([0.5, mlen+0.5, 0, maxy])
                axs[i,j].set_xticks(range(1, mlen+1))
            else:
                axs[i,j].set_title(hdb+':'+hname+ ' ('+str(num_rows)+ ' vrs)', fontsize=15, color='blue')
                axs[i,j].text(0.1, 0.5, 'No Common URs', fontsize=25, color='red')
    plt.tight_layout()
    if base_save_file!='':
        plt.savefig(base_save_file+'.png', dpi=250)
        # os.getcwd() replaces the IPython-only '!pwd' shell escape
        print('Figure saved in {}/{}'.format(os.getcwd(), base_save_file+'.png'))
finally:
    # close the text file even if the processing above failed; never close stdout
    if ftxt is not sys.stdout:
        ftxt.close()


m3:Vertebrate:Bos taurus (84 corresponding viruses):
	intersect=set()
	@least 80%={'CAA', 'TTG'}
	eql=set()
	gc=set()
	at=set()
m3:Vertebrate:Gallus gallus (47 corresponding viruses):
	intersect=set()
	@least 80%={'CAA', 'TTG'}
	eql=set()
	gc=set()
	at=set()
m3:Vertebrate:Meleagris gallopavo (18 corresponding viruses):
	intersect=set()
	@least 80%={'ACT', 'CAA', 'TTG'}
	eql=set()
	gc=set()
	at=set()
m3:Vertebrate:Ovis aries (28 corresponding viruses):
	intersect={'TTG'}
	@least 80%={'CAA', 'TTG'}
	eql=set()
	gc=set()
	at=set()
m3:Bacteria:Escherichia coli (120 corresponding viruses):
	intersect={'GAG', 'CAA'}
	@least 80%={'CCC', 'GAG', 'GGA', 'TAG', 'AAT', 'GAC', 'GGG', 'CAA', 'CTT', 'CTC', 'CGA'}
	eql={'GGG', 'CCC'}
	gc={'GGG', 'CCC'}
	at={'AAT'}
m3:Vertebrate:Sus scrofa (79 corresponding viruses):
	intersect=set()
	@least 80%={'TTG'}
	eql=set()
	gc=set()
	at=set()
m3:Vertebrate:Drosophila melanogaster (6 corresponding viruses):
	intersect=set()
	@least 80%=set()
	eql=set()
	gc=set()


  return [Hg+sum([pwm[b][l]*np.nan_to_num(np.log2(pwm[b][l])) for b in bases])\


m3:Bacteria:Enterococcus faecalis (14 corresponding viruses):
	intersect={'CCC', 'GAG', 'GGT'}
	@least 80%={'CCC', 'GAG', 'GCA', 'GGT', 'TGC'}
	eql={'CCC'}
	gc={'CCC'}
	at=set()
m3:Bacteria:Mycobacterium abscessus (36 corresponding viruses):
	intersect={'TTA', 'CCC', 'GGG'}
	@least 80%={'TTA', 'CCC', 'GGG'}
	eql={'GGG', 'CCC'}
	gc={'GGG', 'CCC'}
	at={'TTA'}
m3:Bacteria:Propionibacterium acnes (53 corresponding viruses):
	intersect={'ACA', 'TCT'}
	@least 80%={'ACA', 'TCT', 'AGA'}
	eql=set()
	gc=set()
	at=set()
m3:Bacteria:Mycobacterium smegmatis (35 corresponding viruses):
	intersect={'GGG', 'CCC', 'AAA', 'GAC'}
	@least 80%={'GGG', 'CCC', 'GAC', 'AAA'}
	eql={'GGG', 'CCC', 'AAA'}
	gc={'GGG', 'CCC'}
	at={'AAA'}
m3:Bacteria:Streptococcus pneumoniae (9 corresponding viruses):
	intersect={'GGG', 'CCC', 'GCA'}
	@least 80%={'CCC', 'GCA', 'GGG', 'TGT', 'CGA'}
	eql={'GGG', 'CCC'}
	gc={'GGG', 'CCC'}
	at=set()
m3:Bacteria:Vibrio cholerae (18 corresponding viruses):
	intersect={'GGA'}
	@least 80%={