# Virus UR clusters logos

First run the first cell (computing the clusters), and then the third cell (displaying the clusters' logos). The second cell is optional; it prints the clusters to stdout.

## Computing the clusters

The code below supports two clustering tools:
1. Starcode.
2. MeshClust.

Set the parameters in the cell below and execute it. Then execute the third cell to generate the logo figures, which are also saved to files.



In [3]:
import pandas as pd
import os, sys
import re
from Bio import motifs, SeqIO
from Bio.Seq import Seq
import matplotlib.pyplot as plt
from collections import OrderedDict
import subprocess
from pprint import pprint

import mysequtils as myut
import myseqclusters as mycl
import myseq_logo as mysl
import pald_funcs as mypal # my palindrome functions


def run_clst_tool(clst_tool, infile, outfile, exe_command, flags):
    '''Run the selected clustering tool through the shell.

    Parameters:
        clst_tool:   'strc' builds a Starcode-style command line
                     (<exe><flags> -i <infile> -o <outfile>); any other value
                     builds the alternative form
                     (<exe> <infile> <flags> --output <outfile>).
        infile:      path of the input sequence file (taken literally).
        outfile:     path of the cluster file to be written (taken literally).
        exe_command: executable to run; left unquoted so the shell can still
                     expand a leading '~'.
        flags:       extra command-line flags, already formatted as a string.

    Returns:
        The (exit_status, output) tuple of subprocess.getstatusoutput().
    '''
    import shlex  # local import: only this function needs it

    # Quote the file names so that spaces or shell metacharacters in a path
    # cannot split or alter the command. expanduser() keeps a leading '~'
    # working, since quoting would otherwise disable the shell's expansion.
    q_infile = shlex.quote(os.path.expanduser(infile))
    q_outfile = shlex.quote(os.path.expanduser(outfile))

    if clst_tool == 'strc':
        command = exe_command + flags + ' -i ' + q_infile + ' -o ' + q_outfile
    else:
        command = ' '.join([exe_command, q_infile, flags, '--output ' + q_outfile])
    return subprocess.getstatusoutput(command)

def parse_clst_tool_output(clst_tool, outfile):
    '''Parse the cluster file written by the clustering tool.

    'strc' selects the Starcode parser of myseqclusters; any other value
    selects its MeshClust parser. The parser's result is returned unchanged
    (per the notebook's own comment, a dictionary mapping each centroid to
    the sequences of its cluster).
    '''
    if clst_tool == 'strc':
        parser = mycl.parse_starcode_cls_out
    else:
        parser = mycl.parse_meshclsut_cls_out
    return parser(outfile)

# ===================================================================================================

# --- General -----------------------------------------------------------------
# UR lengths to process.
mlens = [3, 4, 5]
# The number of vtaxids with the most common UR to process (per length).
num_tst_vtaxids = 10

# --- Clustering tool ---------------------------------------------------------
# 'strc' selects Starcode, 'mesh' selects MeshClust.
clst_tool = 'strc'
# Base file name: <clst_bfname>_<vtaxid>_<mlen>.seq holds the URs and the
# matching .clst file holds the clusters.
clst_bfname = 'ur'
# Output directory stem. '_<clst_tool>' is appended below and, for Starcode,
# '_<str_clst_type>' as well (e.g. ./clst_data_strc_sp).
clst_path = './clst_data'

# --- Starcode parameters -----------------------------------------------------
str_exe = '~/work/mystuff/tools/starcode/starcode'
# 'mp' for message-passing algorithm, 'sp' for sphere,
# 'cc' for connected-components.
str_clst_type = 'sp'

# --- MeshClust parameters ----------------------------------------------------
mesh_exe = '~/work/mystuff/tools/MeShClust-master/src/cluster/meshclust'
mesh_id = 0.5

# --- UR information ----------------------------------------------------------
base_path = '/Users/yoramzarai/work/school/Simulation/Viruses/VUR/viruses/'
# Column dtypes used when reading the results spreadsheets.
dtype_rnd = {
    'vtaxid': int,
    'num_cds': int,
    'signf_common': str,
    'signf_F1': str,
    'signf_F2': str,
    'signf_F3': str,
}
# Uniform background for PSSM.
backg = dict.fromkeys('ACGT', 0.25)
# Base name for saved output (e.g. 'v_most_URs_diff'); when non-empty,
# '<base_save_file>.txt' is opened for the prints. Set to '' to disable
# (prints then go to stdout).
# NOTE(review): the original note also mentions a '.png' figure suffix; no
# code using it is visible in this notebook.
base_save_file = ''
# =============================================================================

# Reject unknown tools up front, then derive the tool-specific settings.
if clst_tool not in ('strc', 'mesh'):
    print('Cluster tool {} not supported ! Exiting...'.format(clst_tool))
    sys.exit()

clst_path += '_' + clst_tool
if clst_tool == 'strc':
    print('Clustering tool selected is Starcode')
    print('Clustering using {} algorithm'.format(str_clst_type))
    clst_path += '_' + str_clst_type
    # Any value other than 'sp'/'mp' falls through to the
    # connected-components flags.
    clst_flags = {'sp': ' -s', 'mp': ' -r 1'}.get(str_clst_type, ' -c -d 1')
    clst_flags += ' --print-clusters'
    clst_exe = str_exe
else:
    print('Clustering tool selected is MeshClust')
    clst_flags = ' --id ' + str(mesh_id)
    clst_exe = mesh_exe

# Make sure the directory that receives the .seq/.clst files exists.
os.makedirs(clst_path, exist_ok=True)
mmlens = ['m'+str(m) for m in mlens]
# handle for text file or sys.stdout
ftxt = open(base_save_file+'.txt', 'wt') if base_save_file!='' else sys.stdout
bfdata = os.path.join(clst_path, clst_bfname)

# keys are (vtaxid, mlen), and values are dictionaries containing
# clusters information (where keys are the centroids and values are the
# corresponding cluster's sequences)
ur_clst_info = OrderedDict()
try:
    for mlen, mmlen in zip(mlens, mmlens):
        rnd_model = 'dnt_samp' if mlen==3 else 'syn_perm+dnt_samp'
        xlsx_file = os.path.join(base_path, mmlen, rnd_model, 'Results',
                                 'vsignf_'+rnd_model+'_'+mmlen+'.xlsx')
        df_vrnd = pd.read_excel(xlsx_file, header=0, dtype=dtype_rnd)
        vtaxids = df_vrnd.loc[:, 'vtaxid']  # vtaxid Series
        # find the vtaxids with the most common URs
        sgf = df_vrnd.loc[:, 'signf_common']  # all common URs per vtaxid
        tmp = sgf.apply(lambda x: len(x.split('|')))  # number of URs per vtaxid
        srt_tmp = tmp.sort_values(ascending=False)
        # The patterns depend only on mlen, so compile them once per length.
        gc_re = re.compile('[CG]{'+str(mlen)+'}')
        at_re = re.compile('[AT]{'+str(mlen)+'}')
        # Slicing the index (instead of range(num_tst_vtaxids)) avoids an
        # IndexError when the sheet has fewer rows than requested.
        for indx in srt_tmp.index[:num_tst_vtaxids]:
            # indx is an index *label*, so look it up with .loc, not .iloc.
            vtaxid = vtaxids.loc[indx]
            ur = sgf.loc[indx].split('|')
            eql_nt = {s for s in ur if len(set(s))==1}  # URs with the same nucleotides (e.g., AAAA)
            gc_nt = {s for s in ur if gc_re.search(s)}  # URs with a run of mlen G and/or C
            at_nt = {s for s in ur if at_re.search(s)}  # URs with a run of mlen A and/or T

            # CHOOSE here what is proc_nt (the URs to show logos)
            #proc_nt = set(ur)-eql_nt # all UR other than the ones with equal NT
            proc_nt = set(ur)
            #proc_nt = gc_nt
            #proc_nt = at_nt
            #print('{}:\n\teql={} \n\tgc={}\n\tat={}\n\tproc={}'.format(vtaxid, eql_nt, gc_nt, at_nt, proc_nt), file=ftxt)
            if proc_nt and list(proc_nt)[0]!='nan':  # 'nan' is generated by pd.read_excel for empty cells
                # NOTE(review): m and rel_info are not used in this cell; the
                # calls are kept in case they validate the sequences -- confirm
                # and remove if they have no such role.
                _, _, _, m, _ = mysl.compute_pssm(proc_nt, backg, backg)
                rel_info = mysl.calc_rel_info(m, 'no')

                # clustering
                # Build both names from the same stem. The previous
                # re.sub('.seq', '.clst', ...) used an unescaped, unanchored
                # pattern that could rewrite an earlier part of the path.
                stem = '_'.join([bfdata, str(vtaxid), str(mlen)])
                infile = stem + '.seq'
                outfile = stem + '.clst'
                #myut.seq2plain_text_file(proc_nt, infile)  # generate input file
                myut.create_fasta_file(infile, proc_nt, proc_nt)  # generate input file
                # run the cluster tool
                ret, sout = run_clst_tool(clst_tool, infile, outfile, clst_exe, clst_flags)
                #print(sout)
                if ret == 0:
                    # parse the cluster tool output
                    ur_clst_info[(vtaxid, mlen)] = parse_clst_tool_output(clst_tool, outfile)
                else:
                    print('Clustering tool {} errored and returned: {}'.format(clst_tool, sout))
finally:
    # Close the text file (if one was opened) even when a step above fails.
    if ftxt is not sys.stdout:
        ftxt.close()

Clustering tool selected is Starcode
Clustering using sp algorithm


## Printing the clusters data (optional)

NOTE: This cell doesn't have to be executed. It is only used to print the clusters.

In [4]:
# MAKE SURE THE CELL ABOVE RUNS FIRST !!!

# ur_clst_info maps (vtaxid, mlen) to that entry's clusters
for key, clusters in ur_clst_info.items():
    print('clusters for {}:'.format(key))
    # only clusters holding more than one sequence get a subplot (logo)
    n_multi = sum(len(seqs) > 1 for seqs in clusters.values())
    print('number of subplots = ', n_multi)
    for idx, seqs in enumerate(clusters.values()):
        print('  C{} ({} seq): {}'.format(idx, len(seqs), seqs))




clusters for (1605721, 3):
number of subplots =  3
  C0 (8 seq): ['AGC', 'ATT', 'CAC', 'CCC', 'GCT', 'TCC', 'TGA', 'TTC']
  C1 (2 seq): ['CTG', 'GTT']
  C2 (2 seq): ['GAA', 'GGA']
clusters for (1273750, 3):
number of subplots =  2
  C0 (10 seq): ['CTA', 'CTC', 'CAA', 'CCG', 'CGC', 'AAA', 'AGA', 'CAT', 'GTC', 'TTG']
  C1 (2 seq): ['GAG', 'GAC']
clusters for (1079999, 3):
number of subplots =  2
  C0 (7 seq): ['GCT', 'GCC', 'GGC', 'CTC', 'GTC', 'TGT', 'TTT']
  C1 (3 seq): ['AAA', 'AAT', 'ACA']
clusters for (1141526, 3):
number of subplots =  2
  C0 (8 seq): ['CTA', 'CCA', 'CTC', 'AAA', 'CAC', 'GGA', 'GTG', 'TTT']
  C1 (2 seq): ['GAT', 'GAG']
clusters for (268746, 3):
number of subplots =  2
  C0 (6 seq): ['GCT', 'GCC', 'GGG', 'GTC', 'TGT', 'TTT']
  C1 (3 seq): ['AAA', 'AAT', 'ACA']
clusters for (1029988, 3):
number of subplots =  2
  C0 (6 seq): ['GAC', 'GAG', 'GCC', 'ACA', 'CAA', 'GGA']
  C1 (3 seq): ['TTG', 'TAG', 'TTT']
clusters for (1195085, 3):
number of subplots =  2
  C0 (7 seq): 

## Displaying the clusters' logos

MAKE SURE THE FIRST CELL IS RUN FIRST !!

In [None]:
# MAKE SURE THE FIRST CELL RUNS FIRST !!!
import matplotlib.pyplot as plt
import myseq_logo as mysl
import pald_funcs as mypal # my palindrome functions
from Bio import motifs, SeqIO
from Bio.Seq import Seq

figs_path = './clst_figs'  # '_<clst_tool>_<str_clst_type>' is appended below
figs_bfname = 'ur' # file name is figs_bfname_<vtaxid>_<mlen>.png
backg = {'A':0.25,'C':0.25,'G':0.25,'T':0.25}  # background for PSSM 
# =========================================================================================
figs_path += '_'+clst_tool+'_'+str_clst_type 
os.makedirs(figs_path, exist_ok=True)
base_fname = os.path.join(figs_path, figs_bfname)
# Working directory, used only to report where each figure was saved.
# Looked up once here instead of running the `!pwd` shell escape per figure.
cwd = os.getcwd()

# ur_clst_info contains clusters information
for k in ur_clst_info:
    vtaxid, mlen = k
    fname = '_'.join([base_fname, str(vtaxid), str(mlen)]) + '.png'
    num_urs = sum(len(v) for v in ur_clst_info[k].values())
    # processing clusters that contain at least two sequences
    clsts = [v for v in ur_clst_info[k].values() if len(v)>1]
    if not clsts:
        # plt.subplots() rejects zero rows, so there is nothing to draw here.
        print('No cluster with at least two sequences for {}; no figure generated'.format(k))
        continue
    # One logo per row; squeeze=False keeps axs 2-D even for a single cluster.
    fig, axs = plt.subplots(len(clsts), 1, figsize=(6, 3*len(clsts)), squeeze=False, constrained_layout=True)
    fig.suptitle('m'+str(mlen)+':'+str(vtaxid)+' ('+str(num_urs)+' URs)', fontsize=25, color='blue')
    for i, c in enumerate(clsts):
        ax = axs[i, 0]
        _, _, _, m, _ = mysl.compute_pssm(c, backg, backg)
        rel_info = mysl.calc_rel_info(m, 'no')
        maxy = mysl.gen_nt_sequence_logo(ax, rel_info)
        ax.set_title('('+str(len(c))+' cURs)', fontsize=15, color='black')
        ax.set_ylabel('Bits')
        # x axis spans the mlen nucleotide positions; y axis is capped at the
        # value returned by gen_nt_sequence_logo.
        ax.axis([0.5, mlen+0.5, 0, maxy])
        ax.set_xticks(range(1, mlen+1))

    # save plot
    fig.savefig(fname, dpi=200)
    print('Figure saved in {}/{}'.format(cwd, fname))


    
    

# Scratch Pad

In [None]:
# SCRATCH PAD
########################
from pprint import pprint

# tuple unpacking
a = (0, 1)
b, c = a[0], a[1]
print(b, c)

# a small dict of lists to experiment with
d = dict(a=[1, 2], b=[1, 2, 3], c=[2], d=[1, 2, 3, 4, 5])
pprint(d)

# keep only the values that hold more than one element
onlyv = [vals for vals in d.values() if len(vals) > 1]
for i, c in enumerate(onlyv):
    print(i, c)

# same filter, but keeping the keys as well
k = {name: vals for name, vals in d.items() if len(vals) > 1}
pprint(k)
print(len(k))