In [1]:
#!/usr/bin/env python

from __future__ import print_function, division

from operator import itemgetter
import os
import sys
import tempfile
import warnings

try:
    from Bio import pairwise2
    from Bio.SubsMat import MatrixInfo as matlist
except ImportError as exception:
    print("[!] Could not import Biopython modules", file=sys.stderr)
    raise exception

#
def align_sequences(sequence_A, sequence_B, **kwargs):
    """
    Performs a global pairwise alignment between two sequences
    using the BLOSUM62 matrix and the Needleman-Wunsch algorithm
    as implemented in Biopython. Returns the alignment, the sequence
    identity and the residue mapping between both original sequences.
    """

    def _calculate_identity(sequenceA, sequenceB):
        """
        Returns the percentage of identical characters between two sequences.
        Assumes the sequences are aligned.
        """

        sa, sb, sl = sequenceA, sequenceB, len(sequenceA)
        matches = [sa[i] == sb[i] for i in xrange(sl)]
        seq_id = (100 * sum(matches)) / sl

        gapless_sl = sum([1 for i in xrange(sl) if (sa[i] != '-' and sb[i] != '-')])
        gap_id = (100 * sum(matches)) / gapless_sl
        return (seq_id, gap_id)

    #
    matrix = kwargs.get('matrix', matlist.blosum62)
    gap_open = kwargs.get('gap_open', -10.0)
    gap_extend = kwargs.get('gap_extend', -0.5)

    alns = pairwise2.align.globalds(sequence_A, sequence_B,
                                    matrix, gap_open, gap_extend)

    best_aln = alns[0]
    aligned_A, aligned_B, score, begin, end = best_aln

    # Calculate sequence identity
    seq_id, g_seq_id = _calculate_identity(aligned_A, aligned_B)
    return ((aligned_A, aligned_B), seq_id, g_seq_id)

In [2]:
import os
pdb_file = open('/home/tongwade780/pdb_website/onebyone_cluster_code/representative_pdbname.csv').readlines ()

pdbname = []
for line in pdb_file:
    pdbname.append(line.rstrip('\r\n').split(',')[0][0:4].lower())

In [3]:
sequence_A = 'MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG'
sequence_B = 'AAGSITTLPALPEDGGSGAFPPGHFKDPKRLYCKNGGFFLRIHPDGRVDGVREKSDPHIKLQLQAEERGVVSIKGVSANRYLAMKEDGRLLASKSVTDECFFFERLESNNYNTYRSRKYTSWYVALKRTGQYKLGSKTGPGQKAILFLPMSAKS'

((aligned_A, aligned_B), seq_id, g_seq_id) = align_sequences(sequence_A, sequence_B)

seq_id

In [2]:
pdbname = ['1ubq','1bas','3ta8','1azp']

In [2]:
pdbname = ['5ctl','1kh8','2xjx','1ufp','1ymb','4rvr','3dn0','1xr1','4hy8','4n1m','2qu9','1c7p','1q21','4gr3']

In [49]:
sequence_A = ''.join(open('/home/tongwade780/pdb_website/onebyone_cluster_code/test_seq_alin/{0}.fasta.txt'.format(pdbname[0])).read().split('\n')[1:])

#sequence_B = ''.join(open('/home/tongwade780/pdb_website/onebyone_cluster_code/test_seq_alin/{0}.fasta.txt'.format(pdbname[2])).read().split('\n')[1:])

b = open('/home/tongwade780/pdb_website/onebyone_cluster_code/test_seq_alin/{0}.fasta.txt'.format(pdbname[3])).read().split('\n')[1:]
sequence_B = ''
for line in b:
    if line == '':
        break
    elif line[0] == '>':
        break
    else:
        sequence_B = sequence_B+line
((aligned_A, aligned_B), seq_id, g_seq_id) = align_sequences(sequence_A, sequence_B)
seq_id

6.481481481481482

In [None]:
import numpy
matrix = numpy.zeros((2792,2792))
problem_pdb = []
row = -1
for name1 in pdbname:
    row = row+1
    print(row)
    sequence_A = ''
    a = open('/home/tongwade780/pdb_website/onebyone_cluster_code/representative_fasta_sequence/{0}.fasta.txt'.format(name1)).read().split('\n')[1:]
    for line in a:
        if line == '':
            break
        elif line[0] == '>':
            break
        else:
            sequence_A = sequence_A+line    
    for colunm in range(row,2792):
        name2 = pdbname[colunm]
        b = open('/home/tongwade780/pdb_website/onebyone_cluster_code/representative_fasta_sequence/{0}.fasta.txt'.format(name2)).read().split('\n')[1:]
        sequence_B = ''
        for line in b:
            if line == '':
                break
            elif line[0] == '>':
                break
            else:
                sequence_B = sequence_B+line

        ((aligned_A, aligned_B), seq_id, g_seq_id) = align_sequences(sequence_A, sequence_B)
        matrix[row,colunm] = seq_id

0


In [None]:
matrix

In [19]:
sa, sb, sl = sequence_A, sequence_B, len(sequence_A)
matches = [sa[i] == sb[i] for i in xrange(sl)]
seq_id = (100 * sum(matches)) / sl

gapless_sl = sum([1 for i in xrange(sl) if (sa[i] != '-' and sb[i] != '-')])
gap_id = (100 * sum(matches)) / gapless_sl

IndexError: string index out of range

In [33]:
xrange(330)

xrange(330)

In [12]:
import pandas as pd
matrix_data=pd.DataFrame(matrix)

matrix_data.columns = pdbname

matrix_data.index = pdbname


SyntaxError: invalid syntax (<ipython-input-12-48fc22c3be67>, line 8)

In [14]:
matrix_data.to_csv('/home/tongwade780/pdb_website/onebyone_cluster_code/MD_14_pdb_identity.csv')

In [15]:
matrix[matrix < 30] = 0

matrix[matrix > 30] = 1


array([[ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  1.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
         0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0., 

In [18]:
matrix_data=pd.DataFrame(matrix)

matrix_data.columns = pdbname
matrix_data.index = pdbname
matrix_data.to_csv('/home/tongwade780/pdb_website/onebyone_cluster_code/MD_14_pdb_like_or_not.csv')