# UR Correspondence between Hosts and Viruses

Here we analyze the UR correspondence between hosts and viruses. The analyzed spreadsheet is 
`cross_UR_m[val]_[rnd model].xlsx`.

## Parsing the xlsx file
 

In [1]:
import pandas as pd
import os, sys
import re
import myseq_logo as mysl
import pald_funcs as mypal # my palindrome functions
from Bio import motifs, SeqIO
from Bio.Seq import Seq
import matplotlib.pyplot as plt
import UR_host_funcs as urh
from pprint import pprint
from collections import defaultdict


# MOVE THIS TO UTILITIES FUNCTIONS
def get_seq_count(seqs):
    '''Count how many times each sequence appears in seqs.

    seqs is a list of strings. The returned object is a defaultdict(int)
    mapping every unique sequence to its number of occurrences; keys appear
    in order of first occurrence in seqs.

    To rank the sequences from most to least frequent use
    sorted(d, key=d.get, reverse=True) on the returned dictionary d.'''
    from collections import defaultdict
    counts = defaultdict(int)
    for seq in seqs:
        counts[seq] = counts[seq] + 1
    return counts


def my_filter_df(df, field, val, cols=''):
    '''Return the rows of a Pandas dataframe whose column "field" equals "val".

    df    - the dataframe to filter.
    field - name of the column to compare.
    val   - value that the column must be equal to.
    cols  - optional list of column names (strings) to keep in the result.
            When it is empty (the default), all columns are returned.'''
    row_mask = df[field] == val
    if cols:
        return df.loc[row_mask, cols]
    return df.loc[row_mask, :]




# Values used to build the spreadsheet names (cross_UR_m<val>_<rnd model>.xlsx)
# and the 'm<val>' part of the result keys -- presumably the UR lengths.
mlens = [3, 4, 5]

vtypes = ('ssDNA', 'dsDNA', 'ssRNA', 'dsRNA')
htypes = ('Vertebrate', 'Bacteria', 'Fungi', 'Metazoa', 'Plants', 'Protists')
cross_type = ('v_h', 'v_nh', 'nv_h')
# =====================================================================================================
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
base_path = '/Users/yoramzarai/work/school/Simulation/Viruses/Data_stats/'
dtype_rnd = {'hst_db':str, 'hst_name':str, 'hst_taxid':int, 'vrs_taxid':int, \
             'vrs_type':str, 'UR_v_h':str, 'UR_v_nh':str, 'UR_nv_h':str, 'UR_nv_nh':str}
mmlens = ['m'+str(m) for m in mlens]


# cross_d maps 'm<val>:<host DB>:<virus type>:<cross type>:<number of rows>' to a
# dictionary {UR sequence: number of occurrences} for that group (an empty
# dictionary when the group has no URs). It is consumed by the processing cell.
cross_d = {}
for j, mlen in enumerate(mlens):
    rnd_model = 'dnt_samp' if mlen==3 else 'syn_perm+dnt_samp'
    # VUR xlsx file
    xlsx_file = os.path.join(base_path, 'cross_UR_'+mmlens[j]+'_'+rnd_model+'.xlsx')
    df_cross = pd.read_excel(xlsx_file, header=0, dtype=dtype_rnd)

    # The per-virus-type and per-host-type summaries that used to run here were
    # marked "NOT USED": they only fed commented-out prints, so they were dropped.

    # host-virus types
    for htype in htypes:
        df_type = my_filter_df(df_cross, 'hst_db', htype)  # all rows with specified host DB
        for vtype in vtypes:
            df_type2 = my_filter_df(df_type, 'vrs_type', vtype)  # ... and with specified virus type
            for ctype in cross_type:
                # Each cell holds '|'-separated URs. Depending on the pandas version an
                # empty cell may be read as the string 'nan' or as a real NaN (TODO confirm);
                # normalising to 'nan' lets the filter below handle both cases.
                ur = df_type2.loc[:,'UR_'+ctype].fillna('nan').astype(str).apply(lambda x: x.split('|'))
                allurs = [x for i in ur for x in i if x!='nan']
                d_allurs = get_seq_count(allurs) if allurs else {}
                # ur.size (number of rows in the group) is stored in the key for post processing
                cross_d[':'.join([mmlens[j], htype, vtype, ctype, str(ur.size)])] = d_allurs
    
   

## Processing the correspondence (with filtering)

In [7]:
# MAKE SURE THE CELL ABOVE RUNS FIRST !!
%config InlineBackend.figure_format = 'retina'

# thresholds to show
num_thres = 11 #11    # minimum number of entries in each group (set to 0 to show all)
ratio_thres = 0.0 #0.5 # minimum ratio of seq occurance in group (set to 0 to show all)
corsp_grp = ['nv_h', 'nv_h', 'nv_h'] # correspondance groups to analyze (all is ['v_h', 'v_nh', 'nv_h'])

from pprint import pprint
print('Total of {} entries'.format(len(cross_d)))
for k, v in cross_d.items():
    srt = sorted(v, key=v.get, reverse=True)
    mm, hh, vv, cc, nn = tuple(k.split(':'))
    # printing a subset
    if cc in corsp_grp:
        print(mm, hh, vv, cc, nn, sep=',')
        for x in srt: 
            if v[x]/float(nn) >= ratio_thres and float(nn) >= num_thres:
                print('\t{}: {} ({:2.1f}%)'.format(x, v[x], v[x]/float(nn)*100))



Total of 216 entries
m3,Vertebrate,ssDNA,nv_h,128
	TTG: 93 (72.7%)
	CAA: 62 (48.4%)
	TAG: 30 (23.4%)
	ACT: 26 (20.3%)
	CCC: 18 (14.1%)
	GGG: 15 (11.7%)
m3,Vertebrate,dsDNA,nv_h,207
	TTG: 184 (88.9%)
	CAA: 163 (78.7%)
	TAG: 156 (75.4%)
	CCC: 124 (59.9%)
	GGG: 119 (57.5%)
	ACT: 110 (53.1%)
	GCA: 65 (31.4%)
	CGA: 52 (25.1%)
	TGC: 51 (24.6%)
	GTT: 46 (22.2%)
	AAC: 41 (19.8%)
	TGA: 40 (19.3%)
	CTA: 36 (17.4%)
	GAC: 32 (15.5%)
	GGT: 28 (13.5%)
	GTC: 22 (10.6%)
	AGG: 22 (10.6%)
m3,Vertebrate,ssRNA,nv_h,354
	TTG: 338 (95.5%)
	CAA: 274 (77.4%)
	TAG: 208 (58.8%)
	ACT: 204 (57.6%)
	GGG: 146 (41.2%)
	CCC: 130 (36.7%)
	TGA: 59 (16.7%)
	GCA: 54 (15.3%)
	TGC: 48 (13.6%)
m3,Vertebrate,dsRNA,nv_h,34
	TTG: 28 (82.4%)
	CAA: 25 (73.5%)
	ACT: 20 (58.8%)
	CCC: 18 (52.9%)
	TAG: 17 (50.0%)
	GGG: 11 (32.4%)
	CGA: 8 (23.5%)
	TGA: 6 (17.6%)
	GCA: 5 (14.7%)
	GTT: 5 (14.7%)
	AAC: 4 (11.8%)
m3,Bacteria,ssDNA,nv_h,49
	GGG: 27 (55.1%)
	GGA: 23 (46.9%)
	GAG: 23 (46.9%)
	CCC: 22 (44.9%)
	CTC: 18 (36.7%)
	AAT: 17 (34.7%

# Scratch Pad

In [None]:
 # Rows of df_cross (the last spreadsheet read by the parsing cell) whose virus type is 'Retro-transcribing'
df_type = my_filter_df(df_cross, 'vrs_type', 'Retro-transcribing')
# ... restricted to the 'Plants' host DB (all Plants with Retro- viruses)
df_type1 = my_filter_df(df_type, 'hst_db', 'Plants')

# quick check of get_seq_count on a toy list
xx = ['AAA', 'AAB', 'AAC', 'AAA', 'AAA', 'AAB', 'KKK', 'AAA', 'KKK']
pprint(get_seq_count(xx))

# quick check of the ':' key format
ss = ':'.join(('a', 'b', 'c'))
print(ss)

# df_type2 is whatever the last iteration of the parsing cell left behind
protists_cols = ['hst_name', 'hst_taxid', 'vrs_taxid', 'UR_nv_h']
print(my_filter_df(df_type2, 'hst_db', 'Protists'))
print(my_filter_df(df_type2, 'hst_db', 'Protists', protists_cols))
    