# UR Correspondence between Hosts and Viruses

Here we analyze the UR correspondence between hosts and viruses. The analyzed spreadsheets are named
cross_UR_m[val]_[rnd model].xlsx.

## Parsing the correspondence xlsx files
 

In [None]:
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import os, sys
import re
import myseq_logo as mysl
import pald_funcs as mypal # my palindrome functions
import mysequtils as myut
from Bio import motifs, SeqIO
from Bio.Seq import Seq
import matplotlib.pyplot as plt
import UR_host_funcs as urh
from pprint import pprint
from collections import defaultdict


# MOVE These 2  TO UTILITIES FUNCTIONS





def analyze_correp_URs(df, htypes, vtypes, ctypes, mlen, cross_d, ctype_pre = 'UR_'):
    '''Analyze the UR correspondence between hosts and their related viruses
    for a given UR length.

    For every (host type, virus type, correspondence type) combination the
    rows of df are filtered by 'hst_db' and 'vrs_type', the '|'-separated URs
    in column ctype_pre+ctype are collected, and their occurrence counts are
    stored in cross_d.

    Parameters:
    df        - dataframe with (at least) the columns 'hst_db', 'vrs_type' and
                ctype_pre+c for every c in ctypes.
    htypes    - iterable of host types (values of the 'hst_db' column).
    vtypes    - iterable of virus types (values of the 'vrs_type' column).
    ctypes    - iterable of correspondence types (e.g. v_h, v_nh, or nv_h).
    mlen      - the UR length label; only used (via str()) in the keys.
    cross_d   - dictionary to update (can be empty). It is modified in place.
    ctype_pre - prefix of the correspondence column names.

    Returns the tuple (cross_d, srt_allurs):
    1. cross_d keys are "m:h:v:c:n", where m is the UR length, h is the host
       type, v is the virus type, c is the correspondence type and n is the
       number of rows in the filtered (h, v) group (i.e. ur.size, not the
       number of URs).
    2. cross_d values are whatever myut.get_seq_count returns for the group's
       URs (presumably a dictionary of UR -> number of occurrences), or {}
       when the group has no URs.
    3. srt_allurs holds the URs of the LAST group that had any URs, sorted by
       decreasing number of occurrences. It is an empty list when no group
       had any URs (including when htypes, vtypes or ctypes is empty).'''
    # Bound up front so that the return statement cannot hit an unbound name
    # when no group contains URs.
    srt_allurs = []
    for htype in htypes:
        df_type = myut.my_filter_df(df, 'hst_db', htype)
        for vtype in vtypes:
            df_type2 = myut.my_filter_df(df_type, 'vrs_type', vtype)
            for ctype in ctypes:
                # str() leaves string cells unchanged and turns a missing
                # (float NaN) cell into 'nan', which is dropped below.
                # NOTE(review): whether read_excel yields 'nan' strings or
                # float NaN here depends on the pandas version - confirm.
                ur = df_type2.loc[:,ctype_pre+ctype].apply(lambda x: str(x).split('|'))
                allurs = [x for i in ur for x in i if x!='nan']
                d_allurs = myut.get_seq_count(allurs) if allurs else {}
                cross_d[':'.join([str(mlen), htype, vtype, ctype, str(ur.size)])] = d_allurs
                if d_allurs:
                    srt_allurs = sorted(d_allurs, key=d_allurs.get, reverse=True)
    return cross_d, srt_allurs
    
# ------------------------------------------------------------------------------------------------------

# UR lengths to process; one spreadsheet is read per length
mlens = [3, 4, 5]

# virus types, host types and correspondence types that define the groups
vtypes = ('ssDNA', 'dsDNA', 'ssRNA', 'dsRNA', 'Retro-transcribing')
htypes = ('Vertebrate', 'Bacteria', 'Fungi', 'Metazoa', 'Plants', 'Protists')
ctypes = ('v_h', 'v_nh', 'nv_h')
# =====================================================================================================
base_path = '/Users/yoramzarai/work/school/Simulation/Viruses/Data_stats/'
# column types enforced when reading the spreadsheets
dtype_rnd = {
    'hst_db': str,
    'hst_name': str,
    'hst_taxid': int,
    'vrs_taxid': int,
    'vrs_type': str,
    'UR_v_h': str,
    'UR_v_nh': str,
    'UR_nv_h': str,
    'UR_nv_nh': str,
}
# length labels ('m3', 'm4', ...) used both in the file names and in the cross_d keys
mmlens = ['m{}'.format(m) for m in mlens]


cross_d = defaultdict()
for j, mlen in enumerate(mlens):
    # the spreadsheet for m=3 was generated with a different random model name
    if mlen == 3:
        rnd_model = 'dnt_samp'
    else:
        rnd_model = 'syn_perm+dnt_samp'

    # VUR xlsx file for this length
    xlsx_file = os.path.join(base_path, 'cross_UR_{}_{}.xlsx'.format(mmlens[j], rnd_model))
    df_cross = pd.read_excel(xlsx_file, header=0, dtype=dtype_rnd)

    # host-virus correspondence: accumulate this length's groups into cross_d
    cross_d, srt_allurs = analyze_correp_URs(
        df_cross, htypes, vtypes, ctypes, mmlens[j], cross_d
    )
    
   

## Processing the correspondence

In [None]:
# MAKE SURE THE CELL ABOVE RUNS FIRST !!

# display thresholds
num_thres = 1 #11    # minimum number of entries in each group (set to 0 to show all)
ratio_thres = 0.1 #0.5 # minimum ratio of seq occurrence in group (set to 0 to show all)
corsp_grp = {'nv_h'} # correspondence groups to analyze (all is ['v_h', 'v_nh', 'nv_h'])
# ======================================================================================================
print('Total of {} entries'.format(len(cross_d)))
for k, v in cross_d.items():
    # URs of this group, most frequent first
    srt = sorted(v, key=v.get, reverse=True)
    # key layout is "m:h:v:c:n"
    mm, hh, vv, cc, nn = k.split(':')
    # only the selected correspondence groups are printed
    if cc not in corsp_grp:
        continue
    print(mm, hh, vv, cc, nn, sep=',', end=':\n')
    total = float(nn)
    for x in srt:
        frac = v[x]/total
        # show a UR only if it is frequent enough and the group is large enough
        if frac >= ratio_thres and total >= num_thres:
            print('\t{}: {} ({:2.1f}%)'.format(x, v[x], frac*100))



# Unique Correspondence (unique URs)

In [None]:
# RUN THE FIRST CELL FIRST

qmlens = [4, 5]  # 4 and/or 5 are the only options
# ====================================================================================================
# common prefix of the unique-UR spreadsheets; the length and '.xlsx' are appended per file
base_q_file = '/Users/yoramzarai/work/school/Simulation/Viruses/Data_stats/cross_UR_uq_m'
# column types enforced when reading the unique-UR spreadsheets
dtype_uq_rnd = {
    'hst_db': str,
    'hst_name': str,
    'hst_taxid': int,
    'vrs_taxid': int,
    'vrs_type': str,
    'URq_v_h': str,
    'URq_v_nh': str,
    'URq_nv_h': str,
}

cross_uq_d = defaultdict()
for mlen in qmlens:
    df_uq_cross = pd.read_excel(
        '{}{}.xlsx'.format(base_q_file, mlen), header=0, dtype=dtype_uq_rnd
    )
    # host-virus correspondence, read from the 'URq_'-prefixed columns
    cross_uq_d, srt_uq_allurs = analyze_correp_URs(
        df_uq_cross, htypes, vtypes, ctypes, mlen, cross_uq_d, ctype_pre='URq_'
    )

## Processing the unique correspondence

In [None]:
# MAKE SURE THE CELL ABOVE RUNS FIRST

# display thresholds
num_uq_thres = 11 #11    # minimum number of entries in each group (set to 0 to show all)
ratio_uq_thres = 0.03 #0.5 # minimum ratio of seq occurrence in group (set to 0 to show all)
corsp_uq_grp = {'nv_h'} # correspondence groups to analyze (all is ['v_h', 'v_nh', 'nv_h'])
# =========================================================================================================
print('Total of {} entries'.format(len(cross_uq_d)))
for k, v in cross_uq_d.items():
    # unique URs of this group, most frequent first
    srt = sorted(v, key=v.get, reverse=True)
    # key layout is "m:h:v:c:n"
    qmm, qhh, qvv, qcc, qnn = k.split(':')
    # only the selected correspondence groups are printed
    if qcc not in corsp_uq_grp:
        continue
    print(qmm, qhh, qvv, qcc, qnn, sep=',', end=':\n')
    qtotal = float(qnn)
    for x in srt:
        qfrac = v[x]/qtotal
        # show a UR only if it is frequent enough and the group is large enough
        if qfrac >= ratio_uq_thres and qtotal >= num_uq_thres:
            print('\t{}: {} ({:2.1f}%)'.format(x, v[x], qfrac*100))

# Scratch Pad

In [None]:
from pprint import pprint

# single (host, virus, correspondence) group to inspect by hand
htype_tmp = 'Bacteria'
vtype_tmp = 'dsRNA'
ctype_tmp = 'nv_h'

cross_d_tmp = defaultdict()

# same filtering steps as analyze_correp_URs, applied to the last df_cross that was read
df_tmp1 = myut.my_filter_df(df_cross, 'hst_db', htype_tmp)
df_tmp2 = myut.my_filter_df(df_tmp1, 'vrs_type', vtype_tmp)
#print(df_tmp2)
ur_tmp = df_tmp2.loc[:, 'UR_' + ctype_tmp].apply(lambda cell: cell.split('|'))

# flatten the per-row UR lists, dropping the 'nan' placeholders
allurs_tmp = []
for row_urs in ur_tmp:
    allurs_tmp.extend(u for u in row_urs if u != 'nan')

if allurs_tmp:
    d_allurs_tmp = myut.get_seq_count(allurs_tmp)
else:
    d_allurs_tmp = {}
#print(allurs_tmp)
pprint(d_allurs_tmp)

# key layout is "m:h:v:c:n", with the length label fixed to '4' here
key_tmp = ':'.join(['4', htype_tmp, vtype_tmp, ctype_tmp, str(ur_tmp.size)])
cross_d_tmp[key_tmp] = d_allurs_tmp
pprint(cross_d_tmp)

#ur = df_type.loc[:,'UR_nv_h'].apply(lambda x:x.split('|'))
#print(ur.iloc[0])
#allurs = [x for i in ur for x in i if x!='nan']
#print(ur.index[0])
#print(allurs, len(allurs), ur.shape[0], len(allurs)/ur.size)
    
#xx = ['AAA', 'AAB', 'AAC', 'AAA', 'AAA', 'AAB', 'KKK', 'AAA', 'KKK']
#pprint(myut.get_seq_count(xx))
    
#ss = ':'.join(['a', 'b', 'c'])
#print(ss)

#print(my_filter_df(df_type2, 'hst_db', 'Protists'))
#print(my_filter_df(df_type2, 'hst_db', 'Protists', ['hst_name', 'hst_taxid', 'vrs_taxid', 'UR_nv_h']))
    