# Virus UR cluster logos

In [8]:
import pandas as pd
import os, sys
import re
from Bio import motifs, SeqIO
from Bio.Seq import Seq
import matplotlib.pyplot as plt
from collections import OrderedDict
import subprocess
from pprint import pprint

import mysequtils as myut
import myseqclusters as mycl
import myseq_logo as mysl
import pald_funcs as mypal # my palindrome functions


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
mlens = [3, 4, 5]  # UR lengths to process
num_tst_vtaxids = 10 # the number of vtaxids with the most common UR to process
clst_path = './clust_data'
clst_bfname = 'ur' # file name is clst_bfname_<vtaxid>_<mlen>.seq for UR and .clst for clusters

# starcode command ('~' is expanded by the shell that getstatusoutput spawns)
str_exe = '~/work/mystuff/tools/starcode/starcode'
#str_flags = ' -r 1 --print-clusters'   # message-passing algorithm
str_flags = ' -s --print-clusters'   # sphere algorithm

base_path = '/Users/yoramzarai/work/school/Simulation/Viruses/VUR/viruses/'
dtype_rnd = {'vtaxid':int, 'num_cds':int, 'signf_common':str, 'signf_F1':str, 'signf_F2':str, 'signf_F3':str}
backg = {'A':0.25,'C':0.25,'G':0.25,'T':0.25}  # background for PSSM
base_save_file = '' # 'v_most_URs_diff' code appends '.png' to save the figure and '.txt' to save the prints.
                    #  Set to '' to disable
# =============================================================================================================
mmlens = ['m'+str(m) for m in mlens]

# handle for text file or sys.stdout
ftxt = open(base_save_file+'.txt', 'wt') if base_save_file!='' else sys.stdout

bfdata = os.path.join(clst_path, clst_bfname)
base_command = str_exe + str_flags

# The .seq/.clst files are written under clst_path, so make sure it exists.
os.makedirs(clst_path, exist_ok=True)

# Keys are (vtaxid, mlen), and values are dictionaries containing clusters
# information (where keys are the centroids and values are the corresponding
# cluster's sequences).
ur_clst_info = OrderedDict()
for mlen, mmlen in zip(mlens, mmlens):
    rnd_model = 'dnt_samp' if mlen==3 else 'syn_perm+dnt_samp'
    xlsx_file = os.path.join(base_path, mmlen, rnd_model, 'Results',
                             'vsignf'+'_'+rnd_model+'_'+mmlen+'.xlsx')
    df_vrnd = pd.read_excel(xlsx_file, header=0, dtype=dtype_rnd)
    vtaxids = df_vrnd.loc[:, 'vtaxid']  # vtaxid Series
    # find the vtaxids with the most common URs
    sgf = df_vrnd.loc[:,'signf_common']  # all common URs per vtaxid ('|'-separated string)
    tmp = sgf.apply(lambda x:len(x.split('|'))) # number of URs per vtaxid
    srt_tmp = tmp.sort_values(ascending=False)

    # The patterns depend only on mlen, so build them once per length.
    # NOTE(review): re.search equals a full match only if every UR has exactly
    # mlen nucleotides -- confirm against the spreadsheet contents.
    gc_re = re.compile('[CG]{'+str(mlen)+'}')
    at_re = re.compile('[AT]{'+str(mlen)+'}')

    # Slicing the index (instead of range(num_tst_vtaxids)) copes with sheets
    # that hold fewer rows than requested.  The sorted index carries row
    # *labels*, so rows are looked up with .loc rather than positional .iloc.
    for indx in srt_tmp.index[:num_tst_vtaxids]:
        vtaxid = vtaxids.loc[indx]
        ur = sgf.loc[indx].split('|')
        eql_nt = {s for s in ur if len(set(s))==1}  # URs with the same nucleotides (e.g., AAAA)
        gc_nt = {s for s in ur if gc_re.search(s)}  # URs that contain only G and/or C
        at_nt = {s for s in ur if at_re.search(s)}  # URs that contain only A and/or T

        # CHOOSE here what is proc_nt (the URs to show logos)
        #proc_nt = set(ur)-eql_nt # all UR other than the ones with equal NT
        proc_nt = set(ur)
        #proc_nt = gc_nt
        #proc_nt = at_nt
        print('{}:\n\teql={} \n\tgc={}\n\tat={}\n\tproc={}'.format(vtaxid, eql_nt, gc_nt, at_nt, proc_nt), file=ftxt)

        # 'nan' is generated by pd.read_excel for empty cells; comparing the
        # whole set avoids depending on which element list(set)[0] returns.
        if not proc_nt or proc_nt == {'nan'}:
            continue

        _, _, _, m, _ = mysl.compute_pssm(proc_nt, backg, backg)
        rel_info = mysl.calc_rel_info(m, 'no')

        # clustering
        infile = '_'.join([bfdata, str(vtaxid), str(mlen)]) + '.seq'
        # Swap only the extension.  The previous re.sub('.seq', ...) pattern
        # was unescaped and unanchored, so it could rewrite any "<char>seq"
        # occurring elsewhere in the path.
        outfile = os.path.splitext(infile)[0] + '.clst'
        myut.seq2plain_text_file(proc_nt, infile)
        ret, sout = subprocess.getstatusoutput(base_command+' -i ' + infile  + ' -o ' + outfile)
        #print(sout)
        if ret == 0:
            # parse starcode output
            clusters = mycl.parse_starcode_cls_out(outfile)
            #print('Found {} clusters:'.format(len(clusters)))
            #for k in clusters.keys(): print(clusters[k])
            ur_clst_info[(vtaxid, mlen)]=clusters
        else:
            print('starcode errored and returned: ', sout)

# Push buffered report text to disk.  The handle is intentionally left open
# because code outside this cell may still write to ftxt.
if ftxt is not sys.stdout:
    ftxt.flush()
            

1605721:
	eql={'CCC'} 
	gc={'CCC'}
	at={'ATT'}
	proc={'CAC', 'GAA', 'CTG', 'GCT', 'GTT', 'GGA', 'CCC', 'ATT', 'TTC', 'AGC', 'TCC', 'TGA'}
1273750:
	eql={'AAA'} 
	gc={'CGC', 'CCG'}
	at={'AAA'}
	proc={'GAC', 'TTG', 'CAA', 'CTA', 'CCG', 'CTC', 'GTC', 'AGA', 'CGC', 'GAG', 'AAA', 'CAT'}
1079999:
	eql={'TTT', 'AAA'} 
	gc={'GCC', 'GGC'}
	at={'TTT', 'AAT', 'AAA'}
	proc={'GGC', 'AAT', 'GCT', 'TTT', 'ACA', 'GCC', 'GTC', 'CTC', 'TGT', 'AAA'}
1141526:
	eql={'TTT', 'AAA'} 
	gc=set()
	at={'TTT', 'AAA'}
	proc={'CAC', 'GGA', 'GAT', 'CTA', 'TTT', 'CCA', 'GTG', 'CTC', 'GAG', 'AAA'}
268746:
	eql={'GGG', 'TTT', 'AAA'} 
	gc={'GCC', 'GGG'}
	at={'TTT', 'AAT', 'AAA'}
	proc={'GGG', 'AAT', 'GCT', 'TTT', 'ACA', 'GCC', 'GTC', 'TGT', 'AAA'}
1029988:
	eql={'TTT'} 
	gc={'GCC'}
	at={'TTT'}
	proc={'GAC', 'GGA', 'TTG', 'TAG', 'CAA', 'TTT', 'ACA', 'GCC', 'GAG'}
1195085:
	eql={'TTT'} 
	gc=set()
	at={'TTT'}
	proc={'GAC', 'GGA', 'TTG', 'TAG', 'CAA', 'TTT', 'ACA', 'TGT', 'GAG'}
1283071:
	eql={'TTT', 'AAA'} 
	gc={'GGC'}
	at=

  return [Hg+sum([pwm[b][l]*np.nan_to_num(np.log2(pwm[b][l])) for b in bases])\


1273750:
	eql={'GGGG'} 
	gc={'CGCG', 'GGGG', 'GCGC'}
	at=set()
	proc={'ACTC', 'AGCT', 'CTCC', 'GCTC', 'CTCA', 'CATG', 'GGTC', 'AGAG', 'GTAC', 'GTCC', 'GGAC', 'GGGG', 'GACG', 'GAGT', 'GCAA', 'GACC', 'GTCA', 'GCGC', 'GAGA', 'CAAG', 'CAAC', 'CGCG', 'GAGC', 'GGAG'}
1605721:
	eql={'GGGG', 'CCCC'} 
	gc={'GGGG', 'CCCC'}
	at=set()
	proc={'AAGC', 'AGCT', 'GGGA', 'CTTC', 'CTCC', 'GAAG', 'TCCG', 'TCAG', 'GGAA', 'GTTC', 'CTTA', 'AACG', 'CTGA', 'ACGT', 'CCCC', 'CGGA', 'GCTT', 'GGGG', 'CGTT', 'GTTA', 'AGCC', 'GGAG', 'TTCG'}
929832:
	eql={'AAAA', 'TTTT'} 
	gc={'GCCG', 'GGCC', 'CGGC', 'CGCG', 'CCGG', 'GCGC'}
	at={'AAAA', 'AAAT', 'TTTT'}
	proc={'GCCG', 'AGCT', 'GGCC', 'TTTT', 'CGGC', 'AGCC', 'TTCA', 'GGCT', 'AGGC', 'CGCG', 'TCAA', 'CCGG', 'GCGC', 'AAAA', 'AAAT'}
445700:
	eql={'TTTT'} 
	gc={'CGGC', 'CCGG', 'GGCC', 'GCGC'}
	at={'AAAT', 'TTTT'}
	proc={'TTTC', 'GGCC', 'AGCT', 'TCGA', 'CGGC', 'GCTT', 'AGCC', 'TTTT', 'GGCT', 'GCTC', 'CCGG', 'GTAC', 'GCGC', 'AAAT'}
444878:
	eql={'AAAA', 'TTTT'} 
	gc={'CCGG', 

## Displaying sequence logos of the cluster data

In [2]:
# MAKE SURE THE CELL ABOVE RUNS FIRST !!!

# Walk ur_clst_info (filled by the clustering cell) and list, for every
# (vtaxid, mlen) key, each cluster with its size and member sequences.
for key, cluster_map in ur_clst_info.items():
    print('clusters for {}:'.format(key))
    for idx, members in enumerate(cluster_map.values()):
        n_members = len(members)
        print('  C{} ({} seq): {}'.format(idx, n_members, members))




clusters for (1605721, 3):
  C0 (8 seq): ['AGC', 'ATT', 'CAC', 'CCC', 'GCT', 'TCC', 'TGA', 'TTC']
  C1 (2 seq): ['CTG', 'GTT']
  C2 (2 seq): ['GAA', 'GGA']
clusters for (1273750, 3):
  C0 (10 seq): ['CTA', 'CTC', 'CAA', 'CCG', 'CGC', 'AAA', 'AGA', 'CAT', 'GTC', 'TTG']
  C1 (2 seq): ['GAG', 'GAC']
clusters for (1079999, 3):
  C0 (7 seq): ['GCT', 'GCC', 'GGC', 'CTC', 'GTC', 'TGT', 'TTT']
  C1 (3 seq): ['AAA', 'AAT', 'ACA']
clusters for (1141526, 3):
  C0 (8 seq): ['CTA', 'CCA', 'CTC', 'AAA', 'CAC', 'GGA', 'GTG', 'TTT']
  C1 (2 seq): ['GAT', 'GAG']
clusters for (268746, 3):
  C0 (6 seq): ['GCT', 'GCC', 'GGG', 'GTC', 'TGT', 'TTT']
  C1 (3 seq): ['AAA', 'AAT', 'ACA']
clusters for (1029988, 3):
  C0 (6 seq): ['GAC', 'GAG', 'GCC', 'ACA', 'CAA', 'GGA']
  C1 (3 seq): ['TTG', 'TAG', 'TTT']
clusters for (1195085, 3):
  C0 (7 seq): ['TAG', 'GAG', 'TTG', 'CAA', 'GAC', 'TGT', 'TTT']
  C1 (2 seq): ['GGA', 'ACA']
clusters for (1283071, 3):
  C0 (7 seq): ['GGA', 'GGC', 'GAC', 'GAG', 'AAA', 'ACA', 'AGG'