# UR Correspondence between Hosts and Viruses

Here we analyze the UR correspondence between hosts and viruses. The analyzed spreadsheet is
cross_UR_m[val]_[rnd model].xlsx.

## Parsing the xlsx file
 

In [16]:
import pandas as pd
import os, sys
import re
import myseq_logo as mysl
import pald_funcs as mypal # my palindrome functions
from Bio import motifs, SeqIO
from Bio.Seq import Seq
import matplotlib.pyplot as plt
import UR_host_funcs as urh
from pprint import pprint
from collections import defaultdict


# MOVE THIS TO UTILITIES FUNCTIONS
def get_seq_count(seqs):
    """Tally how often each distinct sequence appears in `seqs`.

    Parameters
    ----------
    seqs : iterable of str
        The sequences to count (duplicates expected).

    Returns
    -------
    collections.defaultdict
        Maps each unique sequence to its number of occurrences. Keys appear in
        first-seen order, and looking up an unseen key yields 0.
    """
    # Imported locally so the function stays self-contained when it is moved
    # to a utilities module.
    from collections import defaultdict

    tally = defaultdict(int)
    for seq in seqs:
        tally[seq] = tally[seq] + 1
    # Tip: sorted(tally, key=tally.get, reverse=True) lists the sequences from
    # most to least frequent.
    return tally



# Values of m to analyze; each one selects a 'cross_UR_m<val>_<rnd model>.xlsx' spreadsheet.
mlens = [3, 4, 5]

# Categories matched against the 'vrs_type' and 'hst_db' spreadsheet columns.
vtypes = ('ssDNA', 'dsDNA', 'ssRNA', 'dsRNA')
htypes = ('Vertebrate', 'Bacteria', 'Fungi', 'Metazoa', 'Plants', 'Protists')
# Suffixes of the 'UR_<suffix>' columns that are processed
# (the 'UR_nv_nh' column is typed below but is not processed).
cross_type = ('v_h', 'v_nh', 'nv_h')
# =====================================================================================================
# Directory holding the spreadsheets. Defaults to the original location; set the
# UR_DATA_STATS_DIR environment variable to run the notebook on another machine.
base_path = os.environ.get('UR_DATA_STATS_DIR',
                           '/Users/yoramzarai/work/school/Simulation/Viruses/Data_stats/')
# Column dtypes handed to pd.read_excel.
dtype_rnd = {'hst_db': str, 'hst_name': str, 'hst_taxid': int, 'vrs_taxid': int,
             'vrs_type': str, 'UR_v_h': str, 'UR_v_nh': str, 'UR_nv_h': str, 'UR_nv_nh': str}
# 'm3', 'm4', ... labels used in file names and in the cross_d keys.
mmlens = ['m' + str(m) for m in mlens]


def _count_urs(df, ctype):
    """Collect and count the URs stored in column 'UR_<ctype>' of `df`.

    Each cell holds '|'-separated URs. Entries equal to the string 'nan' are
    dropped, and non-string cells (e.g. a real NaN) are skipped so that
    `.split` is never called on a float.

    Returns
    -------
    (counts, n_rows, n_urs)
        counts : dict mapping UR -> number of occurrences ({} when none found)
        n_rows : number of rows in `df`
        n_urs  : total number of URs collected (duplicates included)
    """
    column = df['UR_' + ctype]
    urs = [ur for cell in column if isinstance(cell, str)
           for ur in cell.split('|') if ur != 'nan']
    counts = get_seq_count(urs) if urs else {}
    return counts, column.size, len(urs)


def _print_ur_summary(label, unit, counts, n_rows, n_urs):
    """Print the average number of URs per row and the three most frequent URs."""
    top = sorted(counts, key=counts.get, reverse=True)[:3]
    rate = n_urs / n_rows if n_rows else float('nan')  # no rows -> no meaningful average
    print('{}: {} URs/{} : (Top) {}'.format(label, rate, unit, top))


# Set to True to print a per-category summary under each section header.
# Off by default, which matches the previous output (headers only).
SHOW_UR_SUMMARY = False

# Maps 'm<val>:<host type>:<virus type>:<cross type>:<number of rows>' -> {UR: count};
# consumed by the processing cell below.
cross_d = {}
for mlen, mmlen in zip(mlens, mmlens):
    rnd_model = 'dnt_samp' if mlen == 3 else 'syn_perm+dnt_samp'
    # VUR xlsx file
    xlsx_file = os.path.join(base_path, 'cross_UR_' + mmlen + '_' + rnd_model + '.xlsx')
    df_cross = pd.read_excel(xlsx_file, header=0, dtype=dtype_rnd)

    # different virus types
    print('\nVirus info:')
    if SHOW_UR_SUMMARY:
        for vtype in vtypes:
            df_virus = df_cross.loc[df_cross['vrs_type'] == vtype, :]
            for ctype in cross_type:
                _print_ur_summary(':'.join([mmlen, vtype, ctype]), 'virus',
                                  *_count_urs(df_virus, ctype))

    # different host types
    print('\nHost info:')
    if SHOW_UR_SUMMARY:
        for htype in htypes:
            df_host = df_cross.loc[df_cross['hst_db'] == htype, :]
            for ctype in cross_type:
                _print_ur_summary(':'.join([mmlen, htype, ctype]), 'host',
                                  *_count_urs(df_host, ctype))

    # host-virus types
    print('\nHost-Virus info:')
    for htype in htypes:
        df_host = df_cross.loc[df_cross['hst_db'] == htype, :]
        for vtype in vtypes:
            df_host_virus = df_host.loc[df_host['vrs_type'] == vtype, :]
            for ctype in cross_type:
                counts, n_rows, n_urs = _count_urs(df_host_virus, ctype)
                key = ':'.join([mmlen, htype, vtype, ctype, str(n_rows)])
                cross_d[key] = counts  # saving for post processing
                if SHOW_UR_SUMMARY and counts:
                    _print_ur_summary(key, 'virus', counts, n_rows, n_urs)
    
   


Virus info:

Host info:

Host-Virus info:

Virus info:

Host info:

Host-Virus info:

Virus info:

Host info:

Host-Virus info:


## Processing the correspondence

In [22]:
# Requires cross_d from the parsing cell above -- RUN THAT CELL FIRST !!

from pprint import pprint
print(len(cross_d))
for key, counts in cross_d.items():
    # key layout: m<val>:<host type>:<virus type>:<cross type>:<number of rows>
    mm, hh, vv, cc, nn = key.split(':')
    print(mm, hh, vv, cc, nn, sep=',')
    # URs from most to least frequent (ties keep their first-seen order)
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    for ur_seq, n_hits in ranked:
        print('\t{}: {}'.format(ur_seq, n_hits))



216
m3,Vertebrate,ssDNA,v_h,128
	CCC: 5
	ACT: 2
m3,Vertebrate,ssDNA,v_nh,128
	CCC: 2
	ACT: 1
	AAA: 1
	GAA: 1
	TTT: 1
m3,Vertebrate,ssDNA,nv_h,128
	TTG: 93
	CAA: 62
	TAG: 30
	ACT: 26
	CCC: 18
	GGG: 15
	TGA: 3
	GCA: 3
	GGT: 2
	TGC: 2
	AAC: 2
	GTT: 2
	GAC: 1
	AAT: 1
m3,Vertebrate,dsDNA,v_h,207
	ACT: 32
	TTG: 18
	CAA: 8
	TGA: 5
	CTA: 3
	AGT: 2
	CGA: 2
	AAT: 2
	GAC: 2
	TAG: 1
	TCG: 1
	AAC: 1
	GGG: 1
m3,Vertebrate,dsDNA,v_nh,207
	CCT: 14
	AGT: 9
	TCG: 8
	ATT: 8
	CTT: 8
	TGA: 6
	AAT: 5
	TGC: 3
	TAC: 3
	CGA: 3
	GGT: 3
	AAG: 3
	ACC: 3
	CGG: 2
	GTA: 2
	TCA: 2
	AGC: 1
	GGC: 1
	GAC: 1
	GCT: 1
	AGG: 1
	CAT: 1
	ACG: 1
m3,Vertebrate,dsDNA,nv_h,207
	TTG: 184
	CAA: 163
	TAG: 156
	CCC: 124
	GGG: 119
	ACT: 110
	GCA: 65
	CGA: 52
	TGC: 51
	GTT: 46
	AAC: 41
	TGA: 40
	CTA: 36
	GAC: 32
	GGT: 28
	GTC: 22
	AGG: 22
	TAA: 17
	AAT: 16
	GCG: 16
	CAC: 13
	TCG: 11
	GAT: 7
	CAT: 7
	TCA: 6
	AGT: 4
	ATC: 3
	AGA: 1
	CGT: 1
	CGC: 1
m3,Vertebrate,ssRNA,v_h,354
	GGG: 9
	CCC: 2
	CGA: 2
m3,Vertebrate,ssRNA,v_nh,354
	AAA: 8
	G

# Scratch Pad

In [None]:
 # Rows of df_cross (the spreadsheet loaded last by the parsing cell) whose virus type is 'Retro-transcribing'
retro_mask = df_cross['vrs_type'] == 'Retro-transcribing'
df_type = df_cross.loc[retro_mask, :]
plants_mask = df_type['hst_db'] == 'Plants'
df_type1 = df_type.loc[plants_mask]  # all Plants with Retro- viruses
# Exploration snippets kept for reference:
#print(df_type1)
#ur = df_type.loc[:,'UR_nv_h'].apply(lambda x:x.split('|'))
#print(ur.iloc[0])
#allurs = [x for i in ur for x in i if x!='nan']
#print(ur.index[0])
#print(allurs, len(allurs), ur.shape[0], len(allurs)/ur.size)

# Quick sanity check of get_seq_count on a toy list
xx = ['AAA', 'AAB', 'AAC', 'AAA', 'AAA', 'AAB', 'KKK', 'AAA', 'KKK']
xx_counts = get_seq_count(xx)
pprint(xx_counts)

# Quick check of the ':'-joined key format
ss = ':'.join(['a', 'b', 'c'])
print(ss)
    