UR Clustering - scratch pad

In [None]:
%reset -f

from Bio import SeqIO
from Bio.Cluster import kcluster


file = 'ur.fasta'
ur = [record.id for record in SeqIO.parse(file, 'fasta')]
print(ur)

# clustering


In [None]:
%%latex

This is a latex cell. You can define $\lambda_i$, for all $i\in\{0,\dots,n\}$ and
\[
a = \frac{m}{k}.    
\]
and
\begin{equation}
\dot{x} = q_{i-1}(x) - q_i(x), \quad \text{for all } i=1,\dots,n.
\label{eq:sample}
\end{equation}

# General Purpose Functions

In [1]:
from Bio import SeqIO
def my_fasta_read(fname):
    '''Reads fasta file. 
    The functions returns a list of the sequences, a list of the sequences IDs 
    and a list of the sequences descriptions (content of the text following >>)'''
    seq = []
    iD = []
    desc = []  # description 
    for record in SeqIO.parse(fname, "fasta"):
        desc.append(record.description)
        seq.append(str(record.seq))
        iD.append(record.id)
    return seq, iD, desc

## Functions for nucleotide clustering

In [2]:
def NT_base_seq_score(two_seqs, eql=+5, neql=-4):
    '''Returns the score of the distance between two sequences. Default scorings are nuc44-based.'''
    return(sum([eql if s1==s2 else neql for s1,s2 in zip(two_seqs[0], two_seqs[1])]))


import itertools as itr
def NT_pair_seqs_score(all_seqs, score_func, eql=+5, neql=-4):
    '''Returns a dictionary where the keys are all possible unique pairs of all_seqs, and the values are
    the corresponding scores based on score_func'''
    scores = {}
    for a in itr.combinations(range(len(all_seqs)), 2):
        #scores[all_seqs[a[0]]+'~'+all_seqs[a[1]]] = score_func((all_seqs[a[0]], all_seqs[a[1]]), eql, neql)
        scores[(all_seqs[a[0]], all_seqs[a[1]])] = score_func((all_seqs[a[0]], all_seqs[a[1]]), eql, neql)
    return scores





In [7]:
'''Testing the functions above'''

from pprint import pprint
import mysequtils as myut

#A = ('CTGC', 'ATGG')
#print('base score of {}:{} is {}'.format(A[0], A[1], NT_base_seq_score(A)))


import itertools as itr
#all_seqs = ('ATGC', 'ATTG', 'ACGT', 'ATGG')
#all_scores=NT_pair_seqs_score(all_seqs, NT_base_seq_score)
#pprint(all_scores)


from Bio import SeqIO
ur_file = 'ur.fasta'
ur1_file = 'ur1.fasta'

seq, iD, desc = my_fasta_read(ur1_file)
#for i, s, d in zip(iD, seq, desc):
#    print('id:{}, desc:{}, seq:{}'.format(i,d, s))





#urs = [x.id for x in list(SeqIO.parse(ur_file, "fasta"))]
urs,_,_ = my_fasta_read(ur_file)
#print(urs)
#urs_scores=NT_pair_seqs_score(urs, NT_base_seq_score, 1, -1)

# this is the UR set of vtaxid=1090134
myur = { 'TCA', 'TGT', 'AGT', 'TTT', 'GAG', 'CAA', 'CTC', 'GAC' }
urs_scores=NT_pair_seqs_score(list(myur), NT_base_seq_score, 1, -1)
#pprint(urs_scores)

from operator import itemgetter
aa = sorted(urs_scores.items(), key=itemgetter(1), reverse=True)
#print(aa)
bb = sorted(aa, key=lambda entry: entry[0][0])
#bb = sorted(urs_scores.items(), key=lambda entry: entry[0][0])

for a in bb:
    print('{}:{}'.format(a[0], a[1]))
    
# saving ur in a fasta file
fasta_name = './ur2.fasta'
with open(fasta_name, 'wt') as fout:
    print(myut.seq_create_fasta(myur,myur), file=fout)
print('file saved in ', fasta_name)


('AGT', 'TGT'):1
('AGT', 'TTT'):-1
('AGT', 'CTC'):-3
('AGT', 'TCA'):-3
('AGT', 'CAA'):-3
('CTC', 'TTT'):-1
('CTC', 'CAA'):-1
('CTC', 'TCA'):-3
('CTC', 'TGT'):-3
('GAC', 'GAG'):1
('GAC', 'CTC'):-1
('GAC', 'CAA'):-1
('GAC', 'AGT'):-3
('GAC', 'TTT'):-3
('GAC', 'TCA'):-3
('GAC', 'TGT'):-3
('GAG', 'CAA'):-1
('GAG', 'AGT'):-3
('GAG', 'CTC'):-3
('GAG', 'TTT'):-3
('GAG', 'TCA'):-3
('GAG', 'TGT'):-3
('TCA', 'TGT'):-1
('TCA', 'CAA'):-1
('TGT', 'CAA'):-3
('TTT', 'TGT'):1
('TTT', 'TCA'):-1
('TTT', 'CAA'):-3
file saved in  ./ur2.fasta


# Parse the output of starcode

In [34]:
import csv

infile = 'ur2.fasta'
file = 'ur_clust'  # startcode output

ur_seq, _, _ = my_fasta_read(infile)

with open(file, 'rt') as fin:
    cin = csv.reader(fin, delimiter=' ')
    clusts = [r for r in cin]

print('urs = {}'.format(ur_seq))
print('Found {} clusters:'.format(len(clusts)))
for x in clusts:
    info = x[0].split('\t')
    #print('centroid={}, members={}'.format(info[0], info[-1]))
    print(info[-1].split(','))



urs = ['GAC', 'GAG', 'AGT', 'CTC', 'TTT', 'TCA', 'TGT', 'CAA']
Found 2 clusters:
['CAA', 'CTC', 'GAC', 'GAG', 'TCA']
['AGT', 'TGT', 'TTT']
