In [1]:
import os
import sys

sys.path.append('../..')
from pathlib import Path
from seq import ReadSeq, AlignSeq
from functools import reduce
from prody import parsePDB, LOGGER
LOGGER.verbosity = 'none'

from download_protein import DownloadProtein

In [2]:
# Input files of the example entry (PDB 1VLK, chain A) used by the cells below.
target_dir = '../../../pdb/scop_cl_equal_globular100/1VLK_A/'
fasta_path = os.path.join(target_dir, '1VLK_A.fasta')
pdb_path = os.path.join(target_dir, '1VLK.pdb')
chain = 'A'

In [3]:
fasta_seq = ReadSeq.fasta2seq(fasta_path)
fasta_seq

'CDNFPQMLRDLRDAFSRVKTFFQTKDEVDNLLLKESLLEDFKGYLGCQALSEMIQFYLEEVMPQAENQDPEAKDHVNSLGENLKTLRLRLRRCHRFLPCENKSKAVEQIKNAFNKLQEKGIYKAMSEFDIFINYIEAYMTIK'

In [4]:
pdb_seq, pdb_resnum = ReadSeq.pdb2seq(pdb_path, chain, read_HETATM=True)
pdb_seq

'CDNFPQMLRDLRDAFSRVKTFFQTKDEVDNLLLKESLLEDFKGYLGCQALSEMIQFYLEEVMPQAENQDPEAKDHVNSLGENLKTLRLRLRRCHRFLPCENKSKAVEQIKNAFNKLQEKGIYKAMSEFDIFINYIEAYMTIKXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

In [5]:
AlignSeq.align_fasta_and_pdb(fasta_seq, pdb_seq)

(array(['C', 'D', 'N', 'F', 'P', 'Q', 'M', 'L', 'R', 'D', 'L', 'R', 'D',
        'A', 'F', 'S', 'R', 'V', 'K', 'T', 'F', 'F', 'Q', 'T', 'K', 'D',
        'E', 'V', 'D', 'N', 'L', 'L', 'L', 'K', 'E', 'S', 'L', 'L', 'E',
        'D', 'F', 'K', 'G', 'Y', 'L', 'G', 'C', 'Q', 'A', 'L', 'S', 'E',
        'M', 'I', 'Q', 'F', 'Y', 'L', 'E', 'E', 'V', 'M', 'P', 'Q', 'A',
        'E', 'N', 'Q', 'D', 'P', 'E', 'A', 'K', 'D', 'H', 'V', 'N', 'S',
        'L', 'G', 'E', 'N', 'L', 'K', 'T', 'L', 'R', 'L', 'R', 'L', 'R',
        'R', 'C', 'H', 'R', 'F', 'L', 'P', 'C', 'E', 'N', 'K', 'S', 'K',
        'A', 'V', 'E', 'Q', 'I', 'K', 'N', 'A', 'F', 'N', 'K', 'L', 'Q',
        'E', 'K', 'G', 'I', 'Y', 'K', 'A', 'M', 'S', 'E', 'F', 'D', 'I',
        'F', 'I', 'N', 'Y', 'I', 'E', 'A', 'Y', 'M', 'T', 'I', 'K', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-'

In [6]:
align_pdbseq, align_fastaseq, align_fastaindices, align_pdbindices = AlignSeq.align_fasta_and_pdb(fasta_seq, pdb_seq)

In [7]:
mol = parsePDB(pdb_path, chain=chain)

In [8]:
# Select the aligned PDB residues by residue index. The index list is joined
# with str.join, which yields the same text as the reduce/lambda concatenation
# without rebuilding the string for every element.
sel_mol = mol.select('resindex {}'.format(' '.join(str(index) for index in align_pdbindices)))

In [9]:
fasta_resnum = align_fastaindices + 1

In [10]:
resnum_convert_dict = dict(zip(align_pdbindices, fasta_resnum))

In [11]:
new_resnum = [resnum_convert_dict[resindex] for resindex in sel_mol.getResindices()]

In [12]:
sel_mol.setResnums(new_resnum)

In [13]:
sel_mol.getResnums()

array([  1,   1,   1, ..., 142, 142, 142])

In [14]:
sel_mol.select('name CA').getResnums()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142])

In [15]:
def fix_resnum(fasta_path, pdb_path, chain):
    """Renumber the aligned residues of a PDB chain after the FASTA sequence.

    The FASTA sequence and the sequence read from the PDB chain are aligned
    with ``AlignSeq.align_fasta_and_pdb``. The residues selected through the
    returned PDB indices then receive the residue number ``FASTA index + 1``.

    Args:
        fasta_path: FASTA file, passed to ``ReadSeq.fasta2seq``.
        pdb_path: PDB file, passed to ``ReadSeq.pdb2seq`` and ``parsePDB``.
        chain: Chain identifier to read from the PDB file.

    Returns:
        A tuple ``(sel_mol, first_old_resnum, last_old_resnum)``: the ProDy
        selection with its residue numbers rewritten, plus the first and last
        residue number the selection carried before the rewrite.

    Raises:
        ValueError: If the alignment yields no PDB indices, the chain cannot
            be parsed, or the selection matches no atoms. Errors raised by
            ``AlignSeq.align_fasta_and_pdb`` itself propagate unchanged.
    """
    fasta_seq = ReadSeq.fasta2seq(fasta_path)
    pdb_seq, _ = ReadSeq.pdb2seq(pdb_path, chain=chain, read_HETATM=True)
    _, _, align_findices, align_pindices = AlignSeq.align_fasta_and_pdb(fasta_seq, pdb_seq)
    if len(align_pindices) == 0:
        raise ValueError('No aligned residues for chain {} of {}'.format(chain, pdb_path))

    mol = parsePDB(pdb_path, chain=chain)
    # NOTE(review): parsePDB is presumed to return None when nothing could be
    # parsed for the requested chain - confirm against the ProDy documentation.
    if mol is None:
        raise ValueError('Chain {} could not be parsed from {}'.format(chain, pdb_path))

    sel_mol = mol.select('resindex {}'.format(' '.join(str(index) for index in align_pindices)))
    if sel_mol is None:
        # Previously the indices were printed and the next line failed with an
        # AttributeError, which callers catching ValueError did not handle.
        raise ValueError('Selection is empty for residue indices {}'.format(list(align_pindices)))

    # Keep the original numbering before it is overwritten below.
    old_resnum = sel_mol.getResnums()
    print(old_resnum[0], old_resnum[-1])

    # FASTA indices are zero-based; residue numbers start at 1.
    fasta_resnum = align_findices + 1
    convert_resnum_dict = dict(zip(align_pindices, fasta_resnum))
    # One entry per atom: every atom takes the number of the residue it belongs to.
    new_resnum = [convert_resnum_dict[resindex] for resindex in sel_mol.getResindices()]
    sel_mol.setResnums(new_resnum)
    return sel_mol, old_resnum[0], old_resnum[-1]

In [16]:
def align_fasta_pdb(fasta_path, pdb_path, chain):
    """Align a FASTA sequence with a PDB chain, accepting any alignment length.

    Both sequences are read from disk (HETATM records included for the PDB
    chain) and handed to ``AlignSeq.align_fasta_and_pdb`` with
    ``alignment_percentage_threshold=0``.

    Returns:
        The four values produced by ``AlignSeq.align_fasta_and_pdb``, in the
        same order: two aligned sequences followed by the FASTA and the PDB
        index arrays.
    """
    # NOTE(review): in the displayed outputs the first array carries the FASTA
    # residues with gaps - confirm which sequence each position holds.
    sequence_from_fasta = ReadSeq.fasta2seq(fasta_path)
    sequence_from_pdb, _unused_resnum = ReadSeq.pdb2seq(pdb_path, chain=chain, read_HETATM=True)
    aligned_first, aligned_second, fasta_indices, pdb_indices = AlignSeq.align_fasta_and_pdb(
        sequence_from_fasta,
        sequence_from_pdb,
        alignment_percentage_threshold=0,
    )
    return aligned_first, aligned_second, fasta_indices, pdb_indices

In [17]:
def align_fasta_pdb_simple(fasta_path, pdb_path, chain):
    """Align a FASTA sequence with a PDB chain via ``AlignSeq.align_seq``.

    After the alignment succeeds, the two raw input sequences are printed
    (FASTA first, then PDB) so they can be compared by eye.

    Returns:
        The four values produced by ``AlignSeq.align_seq``, in the same order.
    """
    sequence_from_fasta = ReadSeq.fasta2seq(fasta_path)
    sequence_from_pdb, _unused_resnum = ReadSeq.pdb2seq(pdb_path, chain=chain, read_HETATM=True)
    aligned_first, aligned_second, fasta_indices, pdb_indices = AlignSeq.align_seq(
        sequence_from_fasta,
        sequence_from_pdb,
    )
    for raw_sequence in (sequence_from_fasta, sequence_from_pdb):
        print(raw_sequence)
    return aligned_first, aligned_second, fasta_indices, pdb_indices

In [18]:
fix_resnum(fasta_path, pdb_path, chain)

12 157


(<Selection: 'resindex 0 1 2 ...138 139 140 141' from 1VLKA (1118 atoms)>,
 12,
 157)

In [19]:
fasta_dir = Path('../../../fasta/scop_cl_equal_globular100')

In [39]:
# Number of entries for which fix_resnum raised ValueError in the scan below.
num_error = 0
def get_resnum(fasta_path):
    """Read the residue range recorded in the header line of a FASTA file.

    The ninth whitespace-separated field of the first line must look like
    ``<label>:<start>-<end>``; the text after the first colon is parsed.

    Args:
        fasta_path: Path of the FASTA file to read.

    Returns:
        ``(start, end)`` as a tuple of ints.

    Raises:
        IndexError: If the header has fewer than nine fields or the ninth
            field contains no colon.
        ValueError: If the range is not of the form ``<int>-<int>``.
    """
    import re  # local import: the notebook's import cell does not provide it

    with open(fasta_path, 'r') as f:
        header = f.readline()
    resnumstr = header.split()[8].split(':')[1]
    # A plain split('-') breaks on a signed start such as "-5-100", so the two
    # numbers are matched explicitly, each with an optional sign.
    match = re.fullmatch(r'([+-]?\d+)-([+-]?\d+)', resnumstr)
    if match is None:
        raise ValueError(
            'Cannot parse residue range {!r} in header of {}'.format(resnumstr, fasta_path)
        )
    return int(match.group(1)), int(match.group(2))

# Scan every FASTA entry: fetch the native PDB file when it is missing,
# renumber it with fix_resnum and compare the first/last original residue
# numbers with the range recorded in the FASTA header.
for fasta in fasta_dir.glob('*.fasta'):
    stem_fields = fasta.stem.split('_')
    pdb_id = stem_fields[0]
    chain = stem_fields[1]
    print(pdb_id, chain)
    scop_start, scop_end = get_resnum(fasta)
    native_path = (Path('native_pdb') / pdb_id).with_suffix('.pdb')
    if not native_path.exists():
        DownloadProtein.download_native_pdb(pdb_id, chain, native_path)
    try:
        _, pdb_start, pdb_end = fix_resnum(fasta, native_path, chain)
    except ValueError:
        # Count the failure and report which files and header range were involved.
        num_error += 1
        print(
            str(fasta),
            native_path,
            'start: {}, end: {}'.format(scop_start, scop_end),
            sep='\n',
        )
    else:
        ranges_agree = (pdb_start, pdb_end) == (scop_start, scop_end)
        print(
            'Great!!!'
            if ranges_agree
            else 'No: pstart: {}, sstart: {}, pend: {}, send: {}'.format(
                pdb_start, scop_start, pdb_end, scop_end
            )
        )
    print()
print(num_error)

4EFH A
148 374
Great!!!

2UUB E
74 154
Great!!!

4WE5 A
4 324
Great!!!

5V93 o
2 88
Great!!!

1HV5 A
102 262
Great!!!

2V2F F
268 651
Great!!!

4HRR H
2 151
Great!!!

5XX9 B
1 157
Great!!!

1IFV A
1 155
Great!!!

1W3B A
16 400
No: pstart: 16, sstart: 13, pend: 400, send: 400

3DMI A
46 132
Great!!!

2F2H A
586 665
Great!!!

1DE4 A
4 181
Great!!!

4ZPL A
208 314
Great!!!

6KNA A
1 76
Great!!!

3RHH C
8 483
Great!!!

2WP2 B
27 136
Great!!!

5CZJ B
22 340
Great!!!

1F2E A
81 201
Great!!!

2BWJ A
5 196
Great!!!

1AYZ A
2 154
Great!!!

1I0D A
35 365
Great!!!

3JAI SS
6 142
Great!!!

4PNE B
13 282
Great!!!

2MR9 A
4 43
Great!!!

1P3C A
1 215
Great!!!

2I2Y A
65 150
Great!!!

6K9F B
451 537
Great!!!

6O8W g
2 155
Great!!!

2D9Y A
8 111
Great!!!

2VKE A
47 208
Great!!!

2DS4 A
8 113
Great!!!

1OHE A
199 380
Great!!!

3O1D A
Alignment length is not sufficient.
238 298
../../../fasta/scop_cl_equal_globular100/3O1D_A.fasta
native_pdb/3O1D.pdb
start: 156, end: 453

1RSY A
140 265
Great!!!

6CD6 A


In [59]:
# Inspect entry 3M0G, chain A, with the plain AlignSeq.align_seq wrapper; the
# helper also prints both input sequences (FASTA first, then PDB).
fasta_path = '../../../fasta/scop_cl_equal_globular100/3M0G_A.fasta'
pdb_path = 'native_pdb/3M0G.pdb'
chain = 'A'
align_fasta_pdb_simple(fasta_path, pdb_path, chain)

SERLKEVQDGVEAAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMEQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHRKWDEATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAIIAGADRALLAAYATALGLAFQIADDILDVEGDEEAAGKRLGKDAEAHKATFVSLLGLPGAKARAADLVAEAEAALAPYGEAAATLRACARYVIE
SLSERLKEVQDAVETAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMAQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIEXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


(array(['-', '-', 'S', 'E', 'R', 'L', 'K', 'E', 'V', 'Q', 'D', 'G', 'V',
        'E', 'A', 'A', 'M', 'A', 'A', 'A', 'I', 'G', 'R', 'L', 'P', 'A',
        'G', 'D', 'L', 'R', 'D', 'A', 'M', 'A', 'Y', 'A', 'A', 'Q', 'G',
        'G', 'K', 'R', 'L', 'R', 'A', 'F', 'L', 'A', 'I', 'E', 'S', 'A',
        'A', 'I', 'H', 'G', 'I', 'S', 'M', 'E', 'Q', 'A', 'M', 'P', 'A',
        'A', 'L', 'A', 'V', 'E', 'A', 'L', 'H', 'A', 'Y', 'S', 'L', 'V',
        'H', 'D', 'D', 'M', 'P', 'C', 'M', 'D', 'N', 'D', 'D', 'L', 'R',
        'R', 'G', 'L', 'P', 'T', 'V', 'H', 'R', 'K', 'W', 'D', 'E', 'A',
        'T', 'A', 'V', 'L', 'A', 'G', 'D', 'A', 'L', 'Q', 'T', 'L', 'A',
        'F', 'E', 'L', 'C', 'T', 'D', 'P', 'V', 'L', 'G', 'S', 'A', 'E',
        'N', 'R', 'V', 'A', 'L', 'V', 'A', 'A', 'L', 'A', 'Q', 'A', 'S',
        'G', 'A', 'E', 'G', 'M', 'V', 'Y', 'G', 'Q', 'A', 'L', 'D', 'I',
        'A', 'A', 'E', 'T', 'A', 'A', 'V', 'P', 'L', 'T', 'L', 'D', 'E',
        'I', 'I', 'R', 'L', 'Q', 'A', 'G', 'K', 'T'

In [57]:
fasta_seq = 'MSLSERLKEVQDAVETAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMAQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILDVEGNEEAAGKRLGKDAEAHKATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIERDKEGHHHHHH'
pdb_seq = 'SERLKEVQDAVETAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMAQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIESERLKEVQDAVETAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMAQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILDVKATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIE'
pdb_seq_not_hetatm = 'SERLKEVQDAVETAAAAIGRLPAGDLRDAAYAAQGGKRLRAFLAIESAAIHGISAQAPAALAVEALHAYSLVHDDPCDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIESERLKEVQDAVETAAAAIGRLPAGDLRDAAYAAQGGKRLRAFLAIESAAIHGISAQAPAALAVEALHAYSLVHDDPCDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILDVKATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIE'
# The failing alignment is what this cell demonstrates, so the error is caught
# and shown instead of aborting a "Restart & Run All" of the notebook.
try:
    alignment_result = AlignSeq.align_fasta_and_pdb(fasta_seq, pdb_seq)
except ValueError as error:
    alignment_result = None
    print('ValueError: {}'.format(error))
alignment_result

Alignment length is not sufficient.
216 297


ValueError: Alignment length is not sufficient

In [54]:
AlignSeq.align_seq(fasta_seq, pdb_seq)

(array(['S', 'E', 'R', 'L', 'K', 'E', 'V', 'Q', 'D', 'A', 'V', 'E', 'T',
        'A', 'M', 'A', 'A', 'A', 'I', 'G', 'R', 'L', 'P', 'A', 'G', 'D',
        'L', 'R', 'D', 'A', 'M', 'A', 'Y', 'A', 'A', 'Q', 'G', 'G', 'K',
        'R', 'L', 'R', 'A', 'F', 'L', 'A', 'I', 'E', 'S', 'A', 'A', 'I',
        'H', 'G', 'I', 'S', 'M', 'A', 'Q', 'A', 'M', 'P', 'A', 'A', 'L',
        'A', 'V', 'E', 'A', 'L', 'H', 'A', 'Y', 'S', 'L', 'V', 'H', 'D',
        'D', 'M', 'P', 'C', 'M', 'D', 'N', 'D', 'D', 'L', 'R', 'R', 'G',
        'L', 'P', 'T', 'V', 'H', 'K', 'K', 'W', 'D', 'D', 'A', 'T', 'A',
        'V', 'L', 'A', 'G', 'D', 'A', 'L', 'Q', 'T', 'L', 'A', 'F', 'E',
        'L', 'C', 'T', 'D', 'P', 'V', 'L', 'G', 'S', 'A', 'E', 'N', 'R',
        'V', 'A', 'L', 'V', 'A', 'A', 'L', 'A', 'Q', 'A', 'S', 'G', 'A',
        'E', 'G', 'M', 'V', 'Y', 'G', 'Q', 'A', 'L', 'D', 'I', 'A', 'A',
        'E', 'T', 'A', 'A', 'V', 'P', 'L', 'T', 'L', 'D', 'E', 'I', 'I',
        'R', 'L', 'Q', 'A', 'G', 'K', 'T', 'G', 'A'

In [55]:
# No chain is passed to parsePDB here, so the selection is not limited to one
# chain; the sequence displayed below contains the same stretch twice
# (presumably once per chain - confirm against the file).
pdb_path = 'native_pdb/3M0G.pdb'
mol = parsePDB(pdb_path)
mol.select('resnum 4 to 286').select('name CA').getSequence()

'SERLKEVQDAVETAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMAQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIESERLKEVQDAVETAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMAQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILDVKATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIE'

In [56]:
# Same selection restricted to 'not hetatm'. Compared with the previous cell
# the M residues are missing from the displayed sequence - presumably they are
# stored as HETATM records (e.g. selenomethionine); confirm against the file.
mol = parsePDB(pdb_path)
mol.select('resnum 4 to 286').select('name CA').select('not hetatm').getSequence()

'SERLKEVQDAVETAAAAIGRLPAGDLRDAAYAAQGGKRLRAFLAIESAAIHGISAQAPAALAVEALHAYSLVHDDPCDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIESERLKEVQDAVETAAAAIGRLPAGDLRDAAYAAQGGKRLRAFLAIESAAIHGISAQAPAALAVEALHAYSLVHDDPCDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILDVKATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIE'

In [44]:
# Entry 3M0G, chain A, aligned through align_fasta_pdb, which calls
# AlignSeq.align_fasta_and_pdb with alignment_percentage_threshold=0.
fasta_path = '../../../fasta/scop_cl_equal_globular100/3M0G_A.fasta'
pdb_path = 'native_pdb/3M0G.pdb'
chain = 'A'
align_fasta_pdb(fasta_path, pdb_path, chain)

(array(['-', '-', 'S', 'E', 'R', 'L', 'K', 'E', 'V', 'Q', 'D', 'G', 'V',
        'E', 'A', 'A', 'M', 'A', 'A', 'A', 'I', 'G', 'R', 'L', 'P', 'A',
        'G', 'D', 'L', 'R', 'D', 'A', 'M', 'A', 'Y', 'A', 'A', 'Q', 'G',
        'G', 'K', 'R', 'L', 'R', 'A', 'F', 'L', 'A', 'I', 'E', 'S', 'A',
        'A', 'I', 'H', 'G', 'I', 'S', 'M', 'E', 'Q', 'A', 'M', 'P', 'A',
        'A', 'L', 'A', 'V', 'E', 'A', 'L', 'H', 'A', 'Y', 'S', 'L', 'V',
        'H', 'D', 'D', 'M', 'P', 'C', 'M', 'D', 'N', 'D', 'D', 'L', 'R',
        'R', 'G', 'L', 'P', 'T', 'V', 'H', 'R', 'K', 'W', 'D', 'E', 'A',
        'T', 'A', 'V', 'L', 'A', 'G', 'D', 'A', 'L', 'Q', 'T', 'L', 'A',
        'F', 'E', 'L', 'C', 'T', 'D', 'P', 'V', 'L', 'G', 'S', 'A', 'E',
        'N', 'R', 'V', 'A', 'L', 'V', 'A', 'A', 'L', 'A', 'Q', 'A', 'S',
        'G', 'A', 'E', 'G', 'M', 'V', 'Y', 'G', 'Q', 'A', 'L', 'D', 'I',
        'A', 'A', 'E', 'T', 'A', 'A', 'V', 'P', 'L', 'T', 'L', 'D', 'E',
        'I', 'I', 'R', 'L', 'Q', 'A', 'G', 'K', 'T'

In [34]:
# Residues 190 to 251 are skipped in the structure, so this entry is not a problem.
fasta_path = '../../../fasta/scop_cl_equal_globular100/3O1D_A.fasta'
pdb_path = 'native_pdb/3O1D.pdb'
chain = 'A'
align_fasta_pdb_simple(fasta_path, pdb_path, chain)

LSDEQMQIINSLVEAHHKTYDDSYSDFVRFRPPVREGPVTRSASRAASLHSLSDASSDSFNHSPESVDTKLNFSNLLMMYQDSGSPDSSEEDQQSRLSMLPHLADLVSYSIQKVIGFAKMIPGFRDLTAEDQIALLKSSAIEIIMLRSNQSFSLEDMSWSCGGPDFKYCINDVTKAGHTLELLEPLVKFQVGLKKLKLHEEEHVLLMAICLLSPDRPGVQDHVRIEALQDRLCDVLQAYIRIQHPGGRLLYAKMIQKLADLRSLNEEHSKQYRSLSFQPEHSMQLTPLVLEVFGSEVS
HMLSDEQMQIINSLVEAHHKTYDDSYSDFVRFRPPVRRLSMLPHLADLVSYSIQKVIGFAKMIPGFRDLTAEDQIALLKSSAIEIIMLRSNQSFSLEDMSWSCGGPDFKYCINDVTKAGHTLELLEPLVKFQVGLKKLKLHEEEHVLLMAICLLSPDRPGVQDHVRIEALQDRLCDVLQAYIRIQHPGGRLLYAKMIQKLADLRSLNEEHSKQYRSLSFQPEHSMQLTPLVLEVFGSEVSXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


(array(['-', '-', 'L', 'S', 'D', 'E', 'Q', 'M', 'Q', 'I', 'I', 'N', 'S',
        'L', 'V', 'E', 'A', 'H', 'H', 'K', 'T', 'Y', 'D', 'D', 'S', 'Y',
        'S', 'D', 'F', 'V', 'R', 'F', 'R', 'P', 'P', 'V', 'R', 'R', 'L',
        'S', 'M', 'L', 'P', 'H', 'L', 'A', 'D', 'L', 'V', 'S', 'Y', 'S',
        'I', 'Q', 'K', 'V', 'I', 'G', 'F', 'A', 'K', 'M', 'I', 'P', 'G',
        'F', 'R', 'D', 'L', 'T', 'A', 'E', 'D', 'Q', 'I', 'A', 'L', 'L',
        'K', 'S', 'S', 'A', 'I', 'E', 'I', 'I', 'M', 'L', 'R', 'S', 'N',
        'Q', 'S', 'F', 'S', 'L', 'E', 'D', 'M', 'S', 'W', 'S', 'C', 'G',
        'G', 'P', 'D', 'F', 'K', 'Y', 'C', 'I', 'N', 'D', 'V', 'T', 'K',
        'A', 'G', 'H', 'T', 'L', 'E', 'L', 'L', 'E', 'P', 'L', 'V', 'K',
        'F', 'Q', 'V', 'G', 'L', 'K', 'K', 'L', 'K', 'L', 'H', 'E', 'E',
        'E', 'H', 'V', 'L', 'L', 'M', 'A', 'I', 'C', 'L', 'L', 'S', 'P',
        'D', 'R', 'P', 'G', 'V', 'Q', 'D', 'H', 'V', 'R', 'I', 'E', 'A',
        'L', 'Q', 'D', 'R', 'L', 'C', 'D', 'V', 'L'

In [38]:
align_fasta_pdb(fasta_path, pdb_path, chain)

(array(['-', '-', 'L', 'S', 'D', 'E', 'Q', 'M', 'Q', 'I', 'I', 'N', 'S',
        'L', 'V', 'E', 'A', 'H', 'H', 'K', 'T', 'Y', 'D', 'D', 'S', 'Y',
        'S', 'D', 'F', 'V', 'R', 'F', 'R', 'P', 'P', 'V', 'R', 'R', 'L',
        'S', 'M', 'L', 'P', 'H', 'L', 'A', 'D', 'L', 'V', 'S', 'Y', 'S',
        'I', 'Q', 'K', 'V', 'I', 'G', 'F', 'A', 'K', 'M', 'I', 'P', 'G',
        'F', 'R', 'D', 'L', 'T', 'A', 'E', 'D', 'Q', 'I', 'A', 'L', 'L',
        'K', 'S', 'S', 'A', 'I', 'E', 'I', 'I', 'M', 'L', 'R', 'S', 'N',
        'Q', 'S', 'F', 'S', 'L', 'E', 'D', 'M', 'S', 'W', 'S', 'C', 'G',
        'G', 'P', 'D', 'F', 'K', 'Y', 'C', 'I', 'N', 'D', 'V', 'T', 'K',
        'A', 'G', 'H', 'T', 'L', 'E', 'L', 'L', 'E', 'P', 'L', 'V', 'K',
        'F', 'Q', 'V', 'G', 'L', 'K', 'K', 'L', 'K', 'L', 'H', 'E', 'E',
        'E', 'H', 'V', 'L', 'L', 'M', 'A', 'I', 'C', 'L', 'L', 'S', 'P',
        'D', 'R', 'P', 'G', 'V', 'Q', 'D', 'H', 'V', 'R', 'I', 'E', 'A',
        'L', 'Q', 'D', 'R', 'L', 'C', 'D', 'V', 'L'

In [43]:
# Entry 3O1D, chain A, aligned through align_fasta_pdb
# (alignment_percentage_threshold=0), so a short alignment is not rejected.
fasta_path = '../../../fasta/scop_cl_equal_globular100/3O1D_A.fasta'
pdb_path = 'native_pdb/3O1D.pdb'
chain = 'A'
align_fasta_pdb(fasta_path, pdb_path, chain)

(array(['-', '-', 'L', 'S', 'D', 'E', 'Q', 'M', 'Q', 'I', 'I', 'N', 'S',
        'L', 'V', 'E', 'A', 'H', 'H', 'K', 'T', 'Y', 'D', 'D', 'S', 'Y',
        'S', 'D', 'F', 'V', 'R', 'F', 'R', 'P', 'P', 'V', 'R', 'R', 'L',
        'S', 'M', 'L', 'P', 'H', 'L', 'A', 'D', 'L', 'V', 'S', 'Y', 'S',
        'I', 'Q', 'K', 'V', 'I', 'G', 'F', 'A', 'K', 'M', 'I', 'P', 'G',
        'F', 'R', 'D', 'L', 'T', 'A', 'E', 'D', 'Q', 'I', 'A', 'L', 'L',
        'K', 'S', 'S', 'A', 'I', 'E', 'I', 'I', 'M', 'L', 'R', 'S', 'N',
        'Q', 'S', 'F', 'S', 'L', 'E', 'D', 'M', 'S', 'W', 'S', 'C', 'G',
        'G', 'P', 'D', 'F', 'K', 'Y', 'C', 'I', 'N', 'D', 'V', 'T', 'K',
        'A', 'G', 'H', 'T', 'L', 'E', 'L', 'L', 'E', 'P', 'L', 'V', 'K',
        'F', 'Q', 'V', 'G', 'L', 'K', 'K', 'L', 'K', 'L', 'H', 'E', 'E',
        'E', 'H', 'V', 'L', 'L', 'M', 'A', 'I', 'C', 'L', 'L', 'S', 'P',
        'D', 'R', 'P', 'G', 'V', 'Q', 'D', 'H', 'V', 'R', 'I', 'E', 'A',
        'L', 'Q', 'D', 'R', 'L', 'C', 'D', 'V', 'L'

In [22]:
# Inspect entry 5JT8, chain A, with the plain AlignSeq.align_seq wrapper; the
# printed FASTA and PDB sequences differ in a few single positions
# (e.g. "RYDSSYQ" versus "RYDPSYQ").
fasta_path = '../../../fasta/scop_cl_equal_globular100/5JT8_A.fasta'
pdb_path = 'native_pdb/5JT8.pdb'
chain = 'A'
align_fasta_pdb_simple(fasta_path, pdb_path, chain)

EIKTFEQFKKVFGKVYRNAEEEARREHHFKEQLKWVEEHNGIDGVEYAINEYSDMSEQEFSFHLSGGGLNFTYMKMEAAKEPLINTYGSLPQNFDWRQKARLTRIRQQGSCGSCWAFAAAGVAESLYSIQKQQSIELSEQELVDCTYNRYDSSYQCNGCGSGYSTEAFKYMIRTGLVEEENYPYNMRTQWCNPDVEGQRYHVSGYQQLRYQSSDEDVMYTIQQHGPVVIYMHGSNNYFRNLGNGVLRGVAYNDAYTDHAVILVGWGTVQGVDYWIIRNSWGTGWGNGGYGYVERGHNSLGINNFVTYATL
EIKTFEQFKKVFGKVYRNAEEEARREHHFKEQLKWVEEHNGIDGVEYAINEYSDMSEQEFSFHLSGGGLNFTYMKMEAAKEPLINTYGSLPQNFDWRQKARLTRIRQQGSCGSCWAFAAAGVAESLYSIQKQQSIELSEQELVDCTYNRYDPSYQCNGCGSGYSTEAFKYMIRTGLVEERNYPYNMRTQWCDPDVEGQRYHVSGYQQLRYHSSDEDVMYTIQQHGPVVIYMHGSNNYFRNLGNGVLRGVAYNDAYTDHAVILVGWGTVQGVDYWIIRNSWGTGWGNGGYGYVERGHNSLGINNYVTYATLXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


(array(['E', 'I', 'K', 'T', 'F', 'E', 'Q', 'F', 'K', 'K', 'V', 'F', 'G',
        'K', 'V', 'Y', 'R', 'N', 'A', 'E', 'E', 'E', 'A', 'R', 'R', 'E',
        'H', 'H', 'F', 'K', 'E', 'Q', 'L', 'K', 'W', 'V', 'E', 'E', 'H',
        'N', 'G', 'I', 'D', 'G', 'V', 'E', 'Y', 'A', 'I', 'N', 'E', 'Y',
        'S', 'D', 'M', 'S', 'E', 'Q', 'E', 'F', 'S', 'F', 'H', 'L', 'S',
        'G', 'G', 'G', 'L', 'N', 'F', 'T', 'Y', 'M', 'K', 'M', 'E', 'A',
        'A', 'K', 'E', 'P', 'L', 'I', 'N', 'T', 'Y', 'G', 'S', 'L', 'P',
        'Q', 'N', 'F', 'D', 'W', 'R', 'Q', 'K', 'A', 'R', 'L', 'T', 'R',
        'I', 'R', 'Q', 'Q', 'G', 'S', 'C', 'G', 'S', 'C', 'W', 'A', 'F',
        'A', 'A', 'A', 'G', 'V', 'A', 'E', 'S', 'L', 'Y', 'S', 'I', 'Q',
        'K', 'Q', 'Q', 'S', 'I', 'E', 'L', 'S', 'E', 'Q', 'E', 'L', 'V',
        'D', 'C', 'T', 'Y', 'N', 'R', 'Y', 'D', 'S', 'S', 'Y', 'Q', 'C',
        'N', 'G', 'C', 'G', 'S', 'G', 'Y', 'S', 'T', 'E', 'A', 'F', 'K',
        'Y', 'M', 'I', 'R', 'T', 'G', 'L', 'V', 'E'

In [25]:
from Bio import pairwise2
import numpy as np
# Minimal reproduction: the two sequences differ by a single substitution (S/P).
seqA = 'DSSY'
seqB = 'DPSY'
# Positional arguments after the sequences are the match score, the mismatch
# score, the gap-open penalty and the gap-extend penalty. With a mismatch at
# -10 and a gap at -2, two gaps (3 * 5 - 2 - 2 = 11) beat one substitution.
alignment_list = pairwise2.align.globalms(seqA, seqB, 5, -10, -2, -1)
print(alignment_list)

# Keep the first alignment whose non-gap columns, summed over both sequences,
# have the smallest index variance.
alignment = None
var_indices_min = float('inf')
for candidate in alignment_list:
    residue_columns_a = np.flatnonzero(np.array(list(candidate.seqA)) != '-')
    residue_columns_b = np.flatnonzero(np.array(list(candidate.seqB)) != '-')
    var_indices = np.var(residue_columns_a) + np.var(residue_columns_b)
    if var_indices < var_indices_min:
        alignment, var_indices_min = candidate, var_indices

align_seqA = np.array(list(alignment.seqA))
align_seqB = np.array(list(alignment.seqB))
print(align_seqA, '', align_seqB, sep='\n')
# Cause of the bug
"""
DSSY
DPSY
"""

[Alignment(seqA='DS-SY', seqB='D-PSY', score=11.0, start=0, end=5), Alignment(seqA='D-SSY', seqB='DPS-Y', score=11.0, start=0, end=5)]
['D' '-' 'S' 'S' 'Y']

['D' 'P' 'S' '-' 'Y']


'\nDSSY\nDPSY\n'

In [26]:
print(align_seqA, align_seqB)
# Drop from each sequence the columns in which the other sequence has a gap,
# then record where the surviving columns of the opposite sequence hold residues.
align_seqAc = align_seqA[align_seqB != '-']
align_seqBc = align_seqB[align_seqA != '-']
align_indicesA = np.flatnonzero(align_seqBc != '-')
align_indicesB = np.flatnonzero(align_seqAc != '-')
print(align_seqAc, align_seqBc, align_indicesA, align_indicesB, sep='\n')

['D' '-' 'S' 'S' 'Y'] ['D' 'P' 'S' '-' 'Y']
['D' '-' 'S' 'Y']
['D' 'S' '-' 'Y']
[0 1 3]
[0 2 3]


In [27]:
from Bio import pairwise2
import numpy as np
# Second reproduction: seqA holds "CK" where seqB holds a single "S".
seqA = 'AVHCKAG'
seqB = 'AVHSAG'
alignment_list = pairwise2.align.globalms(seqA, seqB, 5, -10, -2, -1)

# Keep the first alignment whose non-gap columns of seqA alone have the
# smallest index variance.
alignment = None
var_indices_min = float('inf')
for candidate in alignment_list:
    residue_columns = np.flatnonzero(np.array(list(candidate.seqA)) != '-')
    var_indices = np.var(residue_columns)
    if var_indices < var_indices_min:
        alignment, var_indices_min = candidate, var_indices

align_seqA = np.array(list(alignment.seqA))
align_seqB = np.array(list(alignment.seqB))
# Follow-up steps left disabled in this experiment (presumably taken from the
# AlignSeq implementation - confirm):
# align_seqA, align_seqB = cls._correct_misalignment_by_X(align_seqA, align_seqB)
# align_seqA, align_seqB = align_seqA[np.where(align_seqB != '-')], align_seqB[np.where(align_seqA != '-')]
# align_indicesA = np.where(align_seqB != '-')[0]
# align_indicesB = np.where(align_seqA != '-')[0]

In [28]:
align_seqA

array(['A', 'V', 'H', 'C', 'K', '-', 'A', 'G'], dtype='<U1')

In [29]:
align_seqB

array(['A', 'V', 'H', '-', '-', 'S', 'A', 'G'], dtype='<U1')

In [30]:
np.where(align_seqA != '-')[0]

array([0, 1, 2, 3, 4, 6, 7])

In [32]:
# Inspect entry 1OHE, chain A: the printed PDB sequence starts well before the
# FASTA sequence, and the two differ at one position ("AVHCKAG" versus "AVHSKAG").
fasta_path = '../../../fasta/scop_cl_equal_globular100/1OHE_A.fasta'
pdb_path = 'native_pdb/1OHE.pdb'
chain = 'A'
align_fasta_pdb_simple(fasta_path, pdb_path, chain)

SFNLDEYEHYEKAENGDLNWIIPDRFIAFCGPHSRARLESGYHQHSPETYIQYFKNHNVTTIIRLNKRMYDAKRFTDAGFDHHDLFFADGSTPTDAIVKEFLDICENAEGAIAVHCKAGLGRTGTLIACYIMKHYRMTAAETIAWVRICRPGSVIGPQQQFLVMKQTNLWLEGDYFRQKLKG
RDPQDDVYLDITDRLCFAILYSRPKSASNVHYFSIDNELEYENFYADFGPLNLAMVYRYCCKINKKLKSITMLRKKIVHFTGSDQRKQANAAFLVGCYMVIYLGRTPEEAYRILIFGETSYIPFRDAAYGSCNFYITLLDCFHAVKKAMQYGFLNFNSFNLDEYEHYEKAENGDLNWIIPDRFIAFCGPHSRARLESGYHQHSPETYIQYFKNHNVTTIIRLNKRMYDAKRFTDAGFDHHDLFFADGSTPTDAIVKEFLDICENAEGAIAVHSKAGLGRTGTLIACYIMKHYRMTAAETIAWVRICRPGSVIGPQQQFLVMKQTNLWLEGDYFRQKLKGXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX


(array(['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
        '-', 'S', 'F', 'N', 'L', 'D', 'E', 'Y', 'E', 'H', 'Y', 'E', 'K',
        'A', 'E', 'N', 'G', 'D', 'L', 'N', 'W', 'I'

In [33]:
# Entry 2BWJ, chain A, aligned through align_fasta_pdb
# (alignment_percentage_threshold=0); the chain is passed as a literal here.
fasta_path = '../../../fasta/scop_cl_equal_globular100/2BWJ_A.fasta'
pdb_path = 'native_pdb/2BWJ.pdb'
align_fasta_pdb(fasta_path, pdb_path, 'A')

(array(['-', '-', 'M', 'E', 'D', 'L', 'R', 'K', 'C', 'K', 'I', 'I', 'F',
        'I', 'I', 'G', 'G', 'P', 'G', 'S', 'G', 'K', 'G', 'T', 'Q', 'C',
        'E', 'K', 'L', 'V', 'E', 'K', 'Y', 'G', 'F', 'T', 'H', 'L', 'S',
        'T', 'G', 'E', 'L', 'L', 'R', 'E', 'E', 'L', 'A', 'S', 'E', 'S',
        'E', 'R', 'S', 'K', 'L', 'I', 'R', 'D', 'I', 'M', 'E', 'R', 'G',
        'D', 'L', 'V', 'P', 'S', 'G', 'I', 'V', 'L', 'E', 'L', 'L', 'K',
        'E', 'A', 'M', 'V', 'A', 'S', 'L', 'G', 'D', 'T', 'R', 'G', 'F',
        'L', 'I', 'D', 'G', 'Y', 'P', 'R', 'E', 'V', 'K', 'Q', 'G', 'E',
        'E', 'F', 'G', 'R', 'R', 'I', 'G', 'D', 'P', 'Q', 'L', 'V', 'I',
        'C', 'M', 'D', 'C', 'S', 'A', 'D', 'T', 'M', 'T', 'N', 'R', 'L',
        'L', 'Q', 'R', 'S', 'R', 'S', 'S', 'L', 'P', 'V', 'D', 'D', 'T',
        'T', 'K', 'T', 'I', 'A', 'K', 'R', 'L', 'E', 'A', 'Y', 'Y', 'R',
        'A', 'S', 'I', 'P', 'V', 'I', 'A', 'Y', 'Y', 'E', 'T', 'K', 'T',
        'Q', 'L', 'H', 'K', 'I', 'N', 'A', 'E', 'G'

In [47]:
# Check how parsePDB handles a two-character chain argument.
# NOTE(review): the chain IDs displayed further down are 'B'; whether 'BC' is
# read as one two-letter chain or as chains B and C should be confirmed.
pdb_path = 'native_pdb/4V52.pdb'
mol = parsePDB(pdb_path, chain='BC')
mol

<AtomGroup: 4V52BC (2087 atoms)>

In [49]:
mol.getResnums()

array([  1,   1,   1, ..., 303, 304, 305])

In [52]:
mol.getChids()

array(['B', 'B', 'B', ..., 'B', 'B', 'B'], dtype='<U2')