UR Clustering - scratch pad

In [None]:
%reset -f

from Bio import SeqIO
from Bio.Cluster import kcluster


# Path of the FASTA file read by this cell.
file = 'ur.fasta'
# Collect the `id` attribute of every record that SeqIO.parse yields for the file.
ur = [record.id for record in SeqIO.parse(file, 'fasta')]
# Print the whole list of ids for inspection.
print(ur)

# clustering


In [None]:
%%latex

This is a LaTeX cell. You can define $\lambda_i$ for all $i\in\{0,\dots,n\}$, and
\[
a = \frac{m}{k}.    
\]
and
\begin{equation}
\dot{x} = q_{i-1}(x) - q_i(x), \quad \text{for all } i=1,\dots,n.
\label{eq:sample}
\end{equation}

# General Purpose Functions

In [1]:
from Bio import SeqIO
def my_fasta_read(fname):
    '''Read a FASTA file into three parallel lists.

    fname is handed unchanged to SeqIO.parse together with the "fasta"
    format name.

    Returns a tuple (sequences, ids, descriptions). Entry k of each list
    comes from the k-th parsed record: str(record.seq), record.id and
    record.description respectively.
    '''
    sequences, identifiers, descriptions = [], [], []
    for rec in SeqIO.parse(fname, "fasta"):
        sequences.append(str(rec.seq))
        identifiers.append(rec.id)
        descriptions.append(rec.description)
    return sequences, identifiers, descriptions

## Functions for nucleotide clustering

In [2]:
def NT_base_seq_score(two_seqs, eql=+5, neql=-4):
    '''Score two sequences by comparing them position by position.

    two_seqs -- indexable whose items 0 and 1 are the sequences to compare.
    eql      -- value added for every position where the two symbols are equal.
    neql     -- value added for every position where they differ.

    Positions are paired with zip, so only the overlap up to the shorter
    sequence contributes. The default values are described as nuc44-based.
    Returns the accumulated score (0 when there is nothing to compare).
    '''
    first, second = two_seqs[0], two_seqs[1]
    total = 0
    for left, right in zip(first, second):
        total = total + (eql if left == right else neql)
    return total


import itertools as itr
def NT_pair_seqs_score(all_seqs, score_func, eql=+5, neql=-4):
    '''Score every unordered pair of entries in all_seqs.

    all_seqs   -- indexable collection of strings.
    score_func -- callable invoked as score_func((a, b), eql, neql).
    eql, neql  -- forwarded unchanged to score_func.

    Returns a dict keyed by "a~b" (the two entries joined with a tilde, in
    their all_seqs order) whose values are the results of score_func. Pairs
    that produce the same key overwrite one another.
    '''
    index_pairs = itr.combinations(range(len(all_seqs)), 2)
    pair_scores = {}
    for first_idx, second_idx in index_pairs:
        left = all_seqs[first_idx]
        right = all_seqs[second_idx]
        pair_scores[left + '~' + right] = score_func((left, right), eql, neql)
    return pair_scores





In [None]:
'''Testing the functions above'''

from pprint import pprint

#A = ('CTGC', 'ATGG')
#print('base score of {}:{} is {}'.format(A[0], A[1], NT_base_seq_score(A)))


import itertools as itr
#all_seqs = ('ATGC', 'ATTG', 'ACGT', 'ATGG')
#all_scores=NT_pair_seqs_score(all_seqs, NT_base_seq_score)
#pprint(all_scores)


from Bio import SeqIO
# FASTA inputs used by this test cell.
ur_file = 'ur.fasta'
ur1_file = 'ur1.fasta'

# Read ur1.fasta and print one line per record.
# NOTE(review): binding the name `id` here shadows the builtin id() for the rest of the kernel session.
seq, id, desc = my_fasta_read(ur1_file)
for i, s, d in zip(id, seq, desc):
    print('id:{}, desc:{}, seq:{}'.format(i,d, s))



#urs = [x.id for x in list(SeqIO.parse(ur_file, "fasta"))]
# Keep only the first value returned by my_fasta_read (the sequence strings);
# the ids and descriptions are discarded.
urs,_,_ = my_fasta_read(ur_file)
#print(urs)
# Score every unordered pair of sequences with the default match/mismatch values.
urs_scores=NT_pair_seqs_score(urs, NT_base_seq_score)
#pprint(urs_scores)

from operator import itemgetter
# Sort the (pair key, score) items by score, highest first, and print the full list.
aa = sorted(urs_scores.items(), key=itemgetter(1), reverse=True)
print(aa)
