In [1]:
from pathlib import Path
import numpy as np
from prody import parsePDB, writePDB, LOGGER
LOGGER.verbosity = 'None'

In [2]:
# Directory searched for the per-target FASTA files (`<target>.fasta`).
fasta_dir = Path('../../../fasta/scop_cl_equal_globular100/')
# Directory holding the native structures, looked up as `<pdb_id>.pdb`.
native_dir = Path('native_pdb/')
# Output directory for structures cut to the residue range read from the FASTA header.
cut_pdb_dir = Path('cut_pdb/')
# Output directory for structures written after residue renumbering.
correct_pdb_dir = Path('correct_pdb/')

In [3]:
import sys
sys.path.append('../..')
from seq import ReadSeq, AlignSeq

In [4]:
def correct_renums(mol) -> np.ndarray:
    """Make residue numbers unique when distinct residues share one number.

    Atoms are visited in order. Every time the residue index changes while the
    original residue number stays the same, a running offset is increased by
    one; that offset is added to the number of this and all following atoms.

    Args:
        mol (prody.Atom.atomic): mol read by prody. Only ``getResnums()`` and
            ``getResindices()`` are used.

    Returns:
        np.ndarray: per-atom residue numbers after modification.
    """
    offset = 0
    corrected = []
    prev_resnum = prev_resindex = float('inf')
    for cur_resnum, cur_resindex in zip(mol.getResnums(), mol.getResindices()):
        residue_changed = prev_resindex != cur_resindex
        # Same number but a different residue: shift everything from here on.
        if residue_changed and prev_resnum == cur_resnum:
            offset += 1
        corrected.append(cur_resnum + offset)
        prev_resnum, prev_resindex = cur_resnum, cur_resindex
    return np.array(corrected)

def cut_native(native_path: str, chain: str, start: str, end: str, out_pdb_path: str):
    """Cut one chain of a native structure to a residue range and save it.

    The chain is parsed with ProDy, restricted to ``resnum <start> to <end>``,
    renumbered with ``correct_renums`` and written to ``out_pdb_path``.

    Args:
        native_path: path of the native PDB file.
        chain: chain identifier passed to ``parsePDB``.
        start: first residue number of the range, as a string.
        end: last residue number of the range, as a string.
        out_pdb_path: path of the PDB file to write.

    Raises:
        ValueError: if nothing could be parsed for the chain, or if the
            residue range selects no atoms.
    """
    mol = parsePDB(native_path, chain=chain)
    # parsePDB is expected to give None when no atoms were parsed; fail with a
    # clear message instead of an AttributeError further down.
    if mol is None:
        raise ValueError(
            'no atoms parsed from {} for chain {}'.format(native_path, chain))
    region = mol.select('resnum ' + start + ' to ' + end)
    # ProDy's select returns None when the selection matches no atoms.
    if region is None:
        raise ValueError(
            'no atoms in residue range {} to {} of chain {} in {}'
            .format(start, end, chain, native_path))
    region.setResnums(correct_renums(region))
    writePDB(str(out_pdb_path), region)

In [5]:
def read_fasta(fasta_path):
    """Read a target FASTA file together with the residue range in its header.

    The range comes from the first line of the file: its ninth
    whitespace-separated field is split on ':', and the second piece is split
    on '-' into exactly two parts.

    Args:
        fasta_path: path of the FASTA file.

    Returns:
        tuple: ``(seq, start, end)`` where ``seq`` is the value returned by
        ``ReadSeq.fasta2seq`` and ``start`` / ``end`` are strings.
    """
    seq = ReadSeq.fasta2seq(fasta_path)
    with open(fasta_path, 'r') as handle:
        header_fields = handle.readline().split()
    residue_range = header_fields[8].split(':')[1]
    start, end = residue_range.split('-')
    return seq, start, end

In [6]:
def save_cutted_pdb(target: str):
    """Cut the native structure of ``target`` to the range given in its FASTA header.

    Args:
        target: name of the form ``<pdb_id>_<chain>``; it selects
            ``fasta_dir/<target>.fasta``, ``native_dir/<pdb_id>.pdb`` and the
            output file ``cut_pdb_dir/<target>.pdb``.
    """
    name_parts = target.split('_')
    pdb_id, chain = name_parts[0], name_parts[1]
    # Only the residue range is needed here; the sequence itself is ignored.
    _, scop_start, scop_end = read_fasta((fasta_dir / target).with_suffix('.fasta'))
    cut_native(
        (native_dir / pdb_id).with_suffix('.pdb'),
        chain,
        scop_start,
        scop_end,
        (cut_pdb_dir / target).with_suffix('.pdb'),
    )

In [7]:
# Try the cutting step on a single target.
target = '1IRU_I'
save_cutted_pdb(target)

In [8]:
# Same steps as save_cutted_pdb, spelled out so the intermediate values stay
# available in the notebook namespace for the cells below.
target = '1IRU_I'
pdb_id = target.split('_')[0]
chain = target.split('_')[1]
fasta_path = (fasta_dir / target).with_suffix('.fasta')
pdb_path = (native_dir / pdb_id).with_suffix('.pdb')
out_pdb_path = (cut_pdb_dir / target).with_suffix('.pdb')
fasta_seq, scop_start, scop_end = read_fasta(fasta_path)
cut_native(pdb_path, chain, scop_start, scop_end, out_pdb_path)
# Read the sequence and residue numbers back from the file that was just written.
pdb_seq, pdb_resnum = ReadSeq.pdb2seq(out_pdb_path, chain, read_HETATM=True)

In [9]:
# Show the sequence read from the target's FASTA file.
fasta_seq

'TTIAGVVYKDGIVLGADTRATEGMVVADKNCSKIHFISPNIYCCGAGTAADTDMTTQLISSNLELHSLSTGRLPRVVTANRMLKQMLFRYQGYIGAALVLGGVDVTGPHLYSIYPHGSTDKLPYVTMGSGSLAAMAVFEDKFRPDMEEEEAKKLVSEAIAAGIFNDLGSGSNIDLCVISKSKLDFLRPYSVPNKKGTRFGRYRCEKGTNAVLTEKVTTLE'

In [10]:
# Show the sequence read back from the cut PDB file, for comparison with fasta_seq.
pdb_seq

'TTIAGVVYKDGIVLGADTRATEGMVVADKNCSKIHFISPNIYCCGAGTAADTDMTTQLISSNLELHSLSTGRLPRVVTANRMLKQMLFRYRGYIGAALVLGGVDVTGPHLYSIYPHGSTDKLPYVTMGSGSLAAMAVFEDKFRPDMEEEEAKNLVSEAIAAGIFNDLGSGSNIDLCVISKNKLDFLRPYTVPNKKGTRLGRYRCEKGTTAVLTEKITPLE'

In [11]:
from Bio import pairwise2
# TODO: were the alignment mistakes so noticeable because the mismatch penalty was too large?
# globalms scoring arguments: match=5, mismatch=-1, gap open=-2, gap extend=-1.
pairwise2.align.globalms(fasta_seq, pdb_seq, 5, -1, -2, -1)

[Alignment(seqA='TTIAGVVYKDGIVLGADTRATEGMVVADKNCSKIHFISPNIYCCGAGTAADTDMTTQLISSNLELHSLSTGRLPRVVTANRMLKQMLFRYQGYIGAALVLGGVDVTGPHLYSIYPHGSTDKLPYVTMGSGSLAAMAVFEDKFRPDMEEEEAKKLVSEAIAAGIFNDLGSGSNIDLCVISKSKLDFLRPYSVPNKKGTRFGRYRCEKGTNAVLTEKVTTLE', seqB='TTIAGVVYKDGIVLGADTRATEGMVVADKNCSKIHFISPNIYCCGAGTAADTDMTTQLISSNLELHSLSTGRLPRVVTANRMLKQMLFRYRGYIGAALVLGGVDVTGPHLYSIYPHGSTDKLPYVTMGSGSLAAMAVFEDKFRPDMEEEEAKNLVSEAIAAGIFNDLGSGSNIDLCVISKNKLDFLRPYTVPNKKGTRLGRYRCEKGTTAVLTEKITPLE', score=1052.0, start=0, end=220)]

In [12]:
# Compare the two sequences position by position, treating FASTA position i
# (1-based) as residue number i of the PDB-derived sequence.
pdb_resnum_resname_dict = dict(zip(pdb_resnum, pdb_seq))
missing = 0  # FASTA positions with no residue of that number in the PDB sequence
diff = 0  # positions present in both whose residue letters disagree
for i, r in enumerate(fasta_seq):
    fasta_resnum = i + 1
    if fasta_resnum in pdb_resnum_resname_dict:
        if r != pdb_resnum_resname_dict[fasta_resnum]:
            diff += 1
    else:
        missing += 1
print('length:', len(fasta_seq))
print('diff:', diff)
print('missing:', missing)

length: 220
diff: 8
missing: 0


In [13]:
def align_fasta_pdb(target):
    """Globally align a target's FASTA sequence with its native chain sequence.

    Args:
        target: name of the form ``<pdb_id>_<chain>``.

    Returns:
        The list of alignments returned by ``pairwise2.align.globalms`` with
        the scores 5, -1, -2, -1.
    """
    name_parts = target.split('_')
    pdb_id = name_parts[0]
    chain = name_parts[1]
    fasta_seq = ReadSeq.fasta2seq((fasta_dir / target).with_suffix('.fasta'))
    native_path = (native_dir / pdb_id).with_suffix('.pdb')
    # Only the sequence is needed; the residue numbers are discarded.
    pdb_seq, _ = ReadSeq.pdb2seq(native_path, chain, read_HETATM=True)
    return pairwise2.align.globalms(fasta_seq, pdb_seq, 5, -1, -2, -1)

# Align every target and print only those for which globalms returns more
# than one alignment.
for fasta in fasta_dir.glob('*.fasta'):
    target = fasta.stem
    alignment_list = align_fasta_pdb(target)
    if len(alignment_list) > 1:
        print(target)
        for alignment in alignment_list:
            print(alignment)
        print()

2I2Y_A
Alignment(seqA='----------------------------------------------------------------MHRDSCPLDCKVYVGNLGNNGNKTELERAFGYYGPLRSVWVARNPPGFAFVEFEDPRDAADAVRELDGRTLCGCRVRVELSNGEKR', seqB='MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTEGSHHHHHHMHRDSCPLDCKVYVGNLGNNGNKTELERAFGYYGPLRSVWVARNPPGFAFVEFEDPRDAADAVRELDGRTLCGCRVRVELSNGEKR', score=365.0, start=0, end=150)
Alignment(seqA='M----------------------------------------------------------------HRDSCPLDCKVYVGNLGNNGNKTELERAFGYYGPLRSVWVARNPPGFAFVEFEDPRDAADAVRELDGRTLCGCRVRVELSNGEKR', seqB='MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTEGSHHHHHHMHRDSCPLDCKVYVGNLGNNGNKTELERAFGYYGPLRSVWVARNPPGFAFVEFEDPRDAADAVRELDGRTLCGCRVRVELSNGEKR', score=365.0, start=0, end=150)

3M0G_A
Alignment(seqA='--SERLKEVQDGVEAAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMEQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHRKWDEATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAIIAGADRALLAAYATALGLAFQIADDILDVEGDEEAAGKRLGKDAEAHKATFVSLLGLPGA

In [14]:
# Print every alignment returned for one specific target.
target = '1VLK_A'
alignment_list = align_fasta_pdb(target)
for alignment in alignment_list:
    print(alignment)

Alignment(seqA='CDNFP----QMLRDLRDAFSRVKTFFQTKDEVDNLLLKESLLEDFKGYLGCQALSEMIQFYLEEVMPQAENQDPEAKDHVNSLGENLKTLRLRLRRCHRFLPCENKSKAVEQIKNAFNKLQEKGIYKAMSEFDIFINYIEAYMTIK----------------------------------------------------------------------------------------------------------------------', seqB='CDNFP----QMLRDLRDAFSRVKTFFQTKDEVDNLLLKESLLEDFKGYLGCQALSEMIQFYLEEVMPQAENQDPEAKDHVNSLGENLKTLRLRLRRCHRFLPCENKSKAVEQIKNAFNKLQEKGIYKAMSEFDIFINYIEAYMTIK-------------------------------------------XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX', score=586.0, start=0, end=264)


## 想定
1. pdbをダウンロード
2. correct resnumで残基番号を修正
3. 残基番号をもとに配列を生成．このときmissing residueは'-'か何かにする
4. fastaと生成した配列をアライメント

In [15]:
# Renumber one chain with correct_renums and write it out, as a worked example.
target = '4PNE_B'
pdb_id = target.split('_')[0]
chain = target.split('_')[1]
fasta_path = (fasta_dir / target).with_suffix('.fasta')
pdb_path = (native_dir / pdb_id).with_suffix('.pdb')
fasta_seq = ReadSeq.fasta2seq(fasta_path)
# NOTE: the output file is named after pdb_id here, whereas get_native_target
# names its output after the full target (pdb_id plus chain).
correct_pdb_path = (correct_pdb_dir / pdb_id).with_suffix('.pdb')
mol = parsePDB(pdb_path, chain=chain)
new_resnum = correct_renums(mol)
mol.setResnums(new_resnum)
writePDB(str(correct_pdb_path), mol)

'correct_pdb/4PNE.pdb'

In [16]:
# Show the FASTA sequence of the target.
fasta_seq

'APTSQQVGQMYDLVTPLLNSVAGGPCAIHHGYWENDGRASWQQAADRLTDLVAERTVLDGGVRLLDVGCGTGQPALRVARDNAIQITGITVSQVQVAIAADCARERGLSHRVDFSCVDAMSLPYPDNAFDAAWAMQSLLEMSEPDRAIREILRVLKPGGILGVTEVVKREAGGGMPVSGDRWPTGLRICLAEQLLESLRAAGFEILDWEDVSSRTRYFMPQFAEELAAHQHGIADRYGPAVAGWAAAVCDYEKYAHDMGYAILTARKPVG'

In [17]:
# Read sequence and residue numbers from the renumbered file, with
# insert_gap=False; the gaps are inserted by hand in the cells below.
pdb_seq, pdb_resnums = ReadSeq.pdb2seq(correct_pdb_path, chain, read_HETATM=True, insert_gap=False)

In [18]:
# Show the residue numbers read from the renumbered file.
pdb_resnums

array([  4,   5,   6,   7,  13,  14,  15,  16,  17,  18,  19,  20,  21,
        22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,
        35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
        48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,
        61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,
        74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,
        87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
       100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
       152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
       165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177,
       178, 179, 180, 181, 182, 183, 186, 187, 188, 189, 190, 19

In [19]:
# Show the sequence read from the renumbered file.
pdb_seq

'SGIPAPTSQQVGQMYDLVTPLLNSVAGGPCAIHHGYWENDGRASWQQAADRLTDLVAERTVLDGGVRLLDVGCGTGQPALRVARDNAIQITGITVSQVQVAIAADCARERGLSHRVDFSCVDAMSLPYPDNAFDAAWAMQSLLEMSEPDRAIREILRVLKPGGILGVTEVVKREAGMPVSGDRWPTGLRICLAEQLLESLRAAGFEILDWEDVSSRTRYFMPQFAEELAAHQHGIADRYGPAVAGWAAAVCDYEKYAHDMGYAILTARKPVGCXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

In [20]:
# Number of residue-number slots from the first to the last entry, inclusive.
# NOTE(review): this uses the first/last entries, so it assumes pdb_resnums is
# ascending - TODO confirm.
seq_len = pdb_resnums[-1] - pdb_resnums[0] + 1
# All-gap template; the observed residues are written into it in a later cell.
seq_array = np.array(['-'] * seq_len)
seq_array

array(['-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-',
       '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-', '-

In [21]:
# Zero-based slot of each residue: its number relative to the first residue number.
seq_indices = pdb_resnums - pdb_resnums[0]
seq_indices

array([  0,   1,   2,   3,   9,  10,  11,  12,  13,  14,  15,  16,  17,
        18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,
        31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,
        44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,
        57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,
        83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
        96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
       109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
       122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
       135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147,
       148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
       161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173,
       174, 175, 176, 177, 178, 179, 182, 183, 184, 185, 186, 18

In [22]:
# Write each residue letter into its slot; slots that are never written keep '-'.
seq_array[seq_indices] = list(pdb_seq)
seq_array

array(['S', 'G', 'I', 'P', '-', '-', '-', '-', '-', 'A', 'P', 'T', 'S',
       'Q', 'Q', 'V', 'G', 'Q', 'M', 'Y', 'D', 'L', 'V', 'T', 'P', 'L',
       'L', 'N', 'S', 'V', 'A', 'G', 'G', 'P', 'C', 'A', 'I', 'H', 'H',
       'G', 'Y', 'W', 'E', 'N', 'D', 'G', 'R', 'A', 'S', 'W', 'Q', 'Q',
       'A', 'A', 'D', 'R', 'L', 'T', 'D', 'L', 'V', 'A', 'E', 'R', 'T',
       'V', 'L', 'D', 'G', 'G', 'V', 'R', 'L', 'L', 'D', 'V', 'G', 'C',
       'G', 'T', 'G', 'Q', 'P', 'A', 'L', 'R', 'V', 'A', 'R', 'D', 'N',
       'A', 'I', 'Q', 'I', 'T', 'G', 'I', 'T', 'V', 'S', 'Q', 'V', 'Q',
       'V', 'A', 'I', 'A', 'A', 'D', 'C', 'A', 'R', 'E', 'R', 'G', 'L',
       'S', 'H', 'R', 'V', 'D', 'F', 'S', 'C', 'V', 'D', 'A', 'M', 'S',
       'L', 'P', 'Y', 'P', 'D', 'N', 'A', 'F', 'D', 'A', 'A', 'W', 'A',
       'M', 'Q', 'S', 'L', 'L', 'E', 'M', 'S', 'E', 'P', 'D', 'R', 'A',
       'I', 'R', 'E', 'I', 'L', 'R', 'V', 'L', 'K', 'P', 'G', 'G', 'I',
       'L', 'G', 'V', 'T', 'E', 'V', 'V', 'K', 'R', 'E', 'A', '-

In [23]:
# Collapse the filled template into a single gap-padded sequence string.
pdb_seq_correct = ''.join(seq_array)
pdb_seq_correct

'SGIP-----APTSQQVGQMYDLVTPLLNSVAGGPCAIHHGYWENDGRASWQQAADRLTDLVAERTVLDGGVRLLDVGCGTGQPALRVARDNAIQITGITVSQVQVAIAADCARERGLSHRVDFSCVDAMSLPYPDNAFDAAWAMQSLLEMSEPDRAIREILRVLKPGGILGVTEVVKREA--GMPVSGDRWPTGLRICLAEQLLESLRAAGFEILDWEDVSSRTRYFMPQFAEELAAHQHGIADRYGPAVAGWAAAVCDYEKYAHDMGYAILTARKPVG------------------CX--------------------------------------------------------------------------------------------------XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX'

In [24]:
# Align the FASTA sequence against the gap-padded PDB sequence with the same
# globalms scores as before (5, -1, -2, -1).
pairwise2.align.globalms(fasta_seq, pdb_seq_correct, 5, -1, -2, -1)

[Alignment(seqA='---------APTSQQVGQMYDLVTPLLNSVAGGPCAIHHGYWENDGRASWQQAADRLTDLVAERTVLDGGVRLLDVGCGTGQPALRVARDNAIQITGITVSQVQVAIAADCARERGLSHRVDFSCVDAMSLPYPDNAFDAAWAMQSLLEMSEPDRAIREILRVLKPGGILGVTEVVKREAGGGMPVSGDRWPTGLRICLAEQLLESLRAAGFEILDWEDVSSRTRYFMPQFAEELAAHQHGIADRYGPAVAGWAAAVCDYEKYAHDMGYAILTARKPVG---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------', seqB='SGIP-----APTSQQVGQMYDLVTPLLNSVAGGPCAIHHGYWENDGRASWQQAADRLTDLVAERTVLDGGVRLLDVGCGTGQPALRVARDNAIQITGITVSQVQVAIAADCARERGLSHRVDFSCVDAMSLPYPDNAFDAAWAMQSLLEMSEPDRAIREILRVLKPGGILGVTEVVKREA--GMPVSGDRWPTGLRICLAEQLLESLRAAGFEILDWEDVSSRTRYFMPQFAEELAAHQHGIADRYGPAVAGWAAAVCDYEKYAHDMGYAILTARKPVG------------------CX--------------------------------------------------------------------------------------------------XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX

In [25]:
def align_fasta_pdb(target):
    """Align a target's FASTA sequence with a gap-padded sequence of its native chain.

    The native chain is parsed, renumbered with ``correct_renums`` and turned
    into a sequence plus residue numbers by ``ReadSeq.mol2seq``. Each residue
    letter is then placed at the slot given by its residue number, with '-' in
    every slot that has no residue, and the padded string is globally aligned
    with the FASTA sequence.

    This definition replaces the earlier ``align_fasta_pdb`` of this notebook.

    Args:
        target: name of the form ``<pdb_id>_<chain>``.

    Returns:
        The list of alignments returned by ``pairwise2.align.globalms``, or
        None when placing the residues raised an IndexError (the inputs are
        printed in that case).
    """
    pdb_id = target.split('_')[0]
    chain = target.split('_')[1]
    fasta_path = (fasta_dir / target).with_suffix('.fasta')
    pdb_path = (native_dir / pdb_id).with_suffix('.pdb')
    fasta_seq = ReadSeq.fasta2seq(fasta_path)
    mol = parsePDB(pdb_path, chain=chain)
    mol.setResnums(correct_renums(mol))
    pdb_seq, pdb_resnum = ReadSeq.mol2seq(mol, insert_gap=False)
    lowest_resnum = np.min(pdb_resnum)
    seq_len = np.max(pdb_resnum) - lowest_resnum + 1
    seq_array = np.array(['-'] * seq_len)
    # Offset by the lowest residue number, the same origin seq_len is based on.
    # Offsetting by pdb_resnum[0] yields negative indices whenever the first
    # residue is not the lowest one, and NumPy silently wraps those around to
    # the end of the array.
    seq_indices = pdb_resnum - lowest_resnum
    try:
        seq_array[seq_indices] = list(pdb_seq)
    except IndexError as e:
        print(target, e)
        print(fasta_seq)
        print(pdb_seq)
        print(pdb_resnum)
        print(seq_indices)
        return None
    pdb_seq_correct = ''.join(seq_array)
    alignment_list = pairwise2.align.globalms(fasta_seq, pdb_seq_correct, 5, -1, -2, -1)
    return alignment_list

# Align every target and print only those for which globalms returns more
# than one alignment.
for fasta in fasta_dir.glob('*.fasta'):
    target = fasta.stem
    alignment_list = align_fasta_pdb(target)
    # align_fasta_pdb returns None (after printing its inputs) when it could
    # not build the padded sequence; skip such targets instead of failing on
    # len(None).
    if alignment_list is None:
        continue
    if len(alignment_list) > 1:
        print(target)
        for alignment in alignment_list:
            print(alignment)
        print()

2I2Y_A
Alignment(seqA='----------------------------------------------------------------MHRDSCPLDCKVYVGNLGNNGNKTELERAFGYYGPLRSVWVARNPPGFAFVEFEDPRDAADAVRELDGRTLCGCRVRVELSNGEKR', seqB='MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTEGSHHHHHHMHRDSCPLDCKVYVGNLGNNGNKTELERAFGYYGPLRSVWVARNPPGFAFVEFEDPRDAADAVRELDGRTLCGCRVRVELSNGEKR', score=365.0, start=0, end=150)
Alignment(seqA='M----------------------------------------------------------------HRDSCPLDCKVYVGNLGNNGNKTELERAFGYYGPLRSVWVARNPPGFAFVEFEDPRDAADAVRELDGRTLCGCRVRVELSNGEKR', seqB='MQYKLILNGKTLKGETTTEAVDAATAEKVFKQYANDNGVDGEWTYDDATKTFTVTEGSHHHHHHMHRDSCPLDCKVYVGNLGNNGNKTELERAFGYYGPLRSVWVARNPPGFAFVEFEDPRDAADAVRELDGRTLCGCRVRVELSNGEKR', score=365.0, start=0, end=150)

3M0G_A
Alignment(seqA='--SERLKEVQDGVEAAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMEQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHRKWDEATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAIIAGADRALLAAYATALGLAFQIADDILDVEGDEEAAGKRLGKDAEAHKATFVSLLGLPGA

In [26]:
from functools import reduce

def get_native_target(target):
    """Write the part of a native chain that aligns to the target's FASTA sequence.

    Steps, as implemented below:
      1. Parse chain ``<chain>`` of ``native_dir/<pdb_id>.pdb`` and renumber it
         with ``correct_renums``.
      2. Get a sequence and residue numbers from ``ReadSeq.mol2seq`` and align
         the sequence to the FASTA sequence with
         ``AlignSeq.align_fasta_and_pdb`` (threshold 0.7).
      3. Select the atoms whose residue number is in
         ``pdb_resnums[align_pindices]``.
      4. Print a message when the first/last selected residue number differs
         from the range read from the FASTA header.
      5. Renumber the selected residues to ``align_findices + 1`` and write
         them to ``correct_pdb_dir/<target>.pdb``.

    NOTE(review): ``align_findices`` / ``align_pindices`` are presumably the
    aligned positions in the FASTA and PDB sequences - confirm against AlignSeq.

    Args:
        target: name of the form ``<pdb_id>_<chain>``.
    """
    pdb_id = target.split('_')[0]
    chain = target.split('_')[1]
    fasta_path = (fasta_dir / target).with_suffix('.fasta')
    pdb_path = (native_dir / pdb_id).with_suffix('.pdb')
    correct_pdb_path = (correct_pdb_dir / target).with_suffix('.pdb')
    fasta_seq = ReadSeq.fasta2seq(fasta_path)
    mol = parsePDB(pdb_path, chain=chain)
    new_resnums = correct_renums(mol)
    mol.setResnums(new_resnums)
    # TODO: the resnums returned here are reordered relative to the order of
    # mol's own resnums, which causes a bug.
    pdb_seq, pdb_resnums = ReadSeq.mol2seq(mol)
    # Earlier manual gap-insertion approach, kept for reference:
    # pdb_seq, pdb_resnum = ReadSeq.mol2seq(mol, insert_gap=False)
    # if pdb_resnum[0] != np.min(pdb_resnum):
    #     print('different!!!', pdb_resnum[0], np.min(pdb_resnum))
    #     del_indices = np.where(pdb_resnum < pdb_resnum[0])[0]
    #     pdb_seq = ''.join(np.delete(list(pdb_seq), del_indices, 0))
    #     pdb_resnum = np.delete(pdb_resnum, del_indices, 0)
    # if pdb_resnum[0] != np.min(pdb_resnum):
    #     print('different again!!!', pdb_resnum[0], np.min(pdb_resnum))
    # seq_len = np.max(pdb_resnum) - np.min(pdb_resnum) + 1
    # seq_array = np.array(['-'] * seq_len)
    # seq_indices = pdb_resnum - pdb_resnum[0]
    # seq_array[seq_indices] = list(pdb_seq)
    # pdb_seq_correct = ''.join(seq_array)
    align_fseq, align_pseq, align_findices, align_pindices = AlignSeq.align_fasta_and_pdb(fasta_seq, pdb_seq, alignment_percentage_threshold=0.7)
    # TODO: the order of pdb_seq_correct and mol.getResindices() differs in
    # places, so using align_pindices directly as residue indices is off.
    # sel_mol = mol.select('resindex {}'.format(reduce(lambda a, b: str(a) + ' ' + str(b), align_pindices)))
    # Select by residue number instead: build 'resnum n1 n2 ...' from the
    # residue numbers at the aligned PDB positions.
    sel_pdb_resnums = pdb_resnums[align_pindices]
    sel_mol = mol.select('resnum {}'.format(reduce(lambda a, b: str(a) + ' ' + str(b), sel_pdb_resnums)))
    assert sel_mol is not None
    # pdb_resnum = sel_mol.getResnums()
    # Compared as strings because read_fasta returns the header range as strings.
    pdb_resnum_start, pdb_resnum_end = str(sel_pdb_resnums[0]), str(sel_pdb_resnums[-1])
    _, scop_resnum_start, scop_resnum_end = read_fasta(fasta_path)
    if pdb_resnum_start != scop_resnum_start or pdb_resnum_end != scop_resnum_end:
        print('The residue number of pdb and scop are different')
        print('pdb start: {}, scop start: {}, pdb end: {}, scop end:{}'
                .format(pdb_resnum_start, scop_resnum_start, pdb_resnum_end, scop_resnum_end))
    # New numbering: 1-based positions of the aligned FASTA residues.
    fasta_resnum = align_findices + 1
    # TODO: here too, using align_pindices directly would be off.
    convert_resnum_dict = dict(zip(sel_pdb_resnums, fasta_resnum))
    # TODO: use Resnums rather than Resindices.
    new_resnum = [convert_resnum_dict[resnum] for resnum in sel_mol.getResnums()]
    sel_mol.setResnums(new_resnum)
    writePDB(str(correct_pdb_path), sel_mol)

In [27]:
# Run get_native_target on every FASTA file, printing each target name first
# so that any message or exception can be attributed to its target.
for fasta in fasta_dir.glob('*.fasta'):
    target = fasta.stem
    print(target)
    get_native_target(target)
    # Disabled alternative that printed ValueError instead of stopping:
    # try:
    #     get_native_target(target)
    # except ValueError as e:
    #     print(e)

4EFH_A
2UUB_E
4WE5_A
5V93_o
1HV5_A
2V2F_F
4HRR_H
5XX9_B
1IFV_A
1W3B_A
3DMI_A
2F2H_A
1DE4_A
4ZPL_A
6KNA_A
3RHH_C
2WP2_B
5CZJ_B
1F2E_A
2BWJ_A
1AYZ_A
1I0D_A
3JAI_SS
4PNE_B
2MR9_A
1P3C_A
2I2Y_A
6K9F_B
6O8W_g
2D9Y_A
2VKE_A
2DS4_A
1OHE_A
3O1D_A
1RSY_A
6CD6_A
1VLK_A
5GJH_A
1I4K_A
4IOP_B
2DKF_A
5DE0_C
1WLU_A
3GFO_A
5JP5_A
1EA9_C
4RM4_A
6SPF_T
5JT8_A
2CNW_D
2HSZ_A
1HTA_A
4G12_A
1TW4_A
1ST9_A
4NBU_B
6KWQ_A
1YR0_A
3M0G_A
1MZE_A
2RKG_A
1V2F_A
1XWN_A
1XZP_A
3JB4_B
1S3J_A
1KWF_A
1OH0_A
1UTI_A
1IRU_I
2GH9_A
5Y4E_A
4NYN_A
1EXR_A
2P6R_A
2UUB_Q
3HRX_A
1UB7_A
2CYB_B
4V52_BC
1KA9_F
2B5A_A
1W8O_A
2V90_C
The residue number of pdb and scop are different
pdb start: 247, scop start: 247, pdb end: 338, scop end:339
4KE6_E
1RE5_A
3CRN_A
3TJ3_A
2YV3_A
2C21_A
1V70_A
1BQU_A
1G4I_A
1QNI_A
1K28_A
4M1Q_B
3GM8_A
2ZJR_H
1RT8_A
1RHF_A


# Test that the extracted PDB sequence matches the FASTA sequence for all targets

In [28]:
# NOTE: this rebinds the notebook-level cut_pdb_dir (set to 'cut_pdb/' near the
# top) to the directory of renumbered structures. Cells below read from here,
# and save_cutted_pdb would also write here if it were re-run after this cell.
cut_pdb_dir = Path('correct_pdb')

def test_match_from_seq(fasta_seq, pdb_seq, pdb_resnum):
    fasta_seq_array = np.array(list(fasta_seq))
    pdb_seq_array = np.copy(fasta_seq_array)
    pdb_seq_array[pdb_resnum - 1] = list(pdb_seq)
    num_diff = np.count_nonzero(fasta_seq_array != pdb_seq_array)
    num_missing = len(fasta_seq) - len(pdb_seq)
    return num_diff, num_missing

def test_match(target):
    """Print how well the written PDB of ``target`` agrees with its FASTA sequence.

    Reads ``fasta_dir/<target>.fasta`` and ``cut_pdb_dir/<target>.pdb``, then
    prints the FASTA length followed by the mismatch and missing-residue
    counts from ``test_match_from_seq``.

    Args:
        target: name of the form ``<pdb_id>_<chain>``.
    """
    chain = target.split('_')[1]
    fasta_seq = ReadSeq.fasta2seq((fasta_dir / target).with_suffix('.fasta'))
    pdb_seq, pdb_resnum = ReadSeq.pdb2seq(
        (cut_pdb_dir / target).with_suffix('.pdb'),
        chain=chain,
        read_HETATM=True,
        insert_gap=False,
    )
    print('length:', len(fasta_seq))
    diff, missing = test_match_from_seq(fasta_seq, pdb_seq, pdb_resnum)
    print('diff:', diff)
    print('missing:', missing)


for fasta in fasta_dir.glob('*.fasta'):
    target = fasta.stem
    print(target)
    test_match(target)
    print()

4EFH_A
length: 227
diff: 0
missing: 0

2UUB_E
length: 81
diff: 0
missing: 0

4WE5_A
length: 321
diff: 0
missing: 0

5V93_o
length: 87
diff: 0
missing: 0

1HV5_A
length: 161
diff: 0
missing: 0

2V2F_F
length: 384
diff: 0
missing: 17

4HRR_H
length: 150
diff: 3
missing: 0

5XX9_B
length: 157
diff: 0
missing: 0

1IFV_A
length: 155
diff: 0
missing: 0

1W3B_A
length: 388
diff: 2
missing: 0

3DMI_A
length: 87
diff: 0
missing: 0

2F2H_A
length: 80
diff: 0
missing: 0

1DE4_A
length: 178
diff: 0
missing: 0

4ZPL_A
length: 107
diff: 0
missing: 0

6KNA_A
length: 76
diff: 0
missing: 0

3RHH_C
length: 476
diff: 0
missing: 0

2WP2_B
length: 110
diff: 0
missing: 0

5CZJ_B
length: 319
diff: 0
missing: 0

1F2E_A
length: 121
diff: 0
missing: 0

2BWJ_A
length: 192
diff: 1
missing: 0

1AYZ_A
length: 153
diff: 0
missing: 0

1I0D_A
length: 331
diff: 0
missing: 0

3JAI_SS
length: 137
diff: 0
missing: 0

4PNE_B
length: 270
diff: 0
missing: 2

2MR9_A
length: 40
diff: 0
missing: 0

1P3C_A
length: 215
diff: 0
mi

In [29]:
# Load one target by hand to inspect it; 2V2F_F is reported above with missing: 17.
target = '2V2F_F'
chain = target.split('_')[1]
pdb_path = (cut_pdb_dir / target).with_suffix('.pdb')
fasta_path = (fasta_dir / target).with_suffix('.fasta')
fasta_seq = ReadSeq.fasta2seq(fasta_path)
pdb_seq, pdb_resnum = ReadSeq.pdb2seq(pdb_path, chain=chain, read_HETATM=True, insert_gap=False)

In [30]:
# Show the FASTA sequence of the target.
fasta_seq

'YPAYMDNYLKEVINQVEQETGYNLLTTGMDVYTNVDQEAQKHLWDIYNSDQYVSYPDDDLQVASTVVDVSNGKVIAQLGARHQASNVSFGTNQAVETNRDWGSAMKPITDYAPAIEYGVYDSTATMVNDIPYNYPGTSTPVYNWDRAYFGNITLQYALQQSRNVTAVETLNKVGLDRAKTFLNGLGIDYPSMHYANAISSNTTESNKQYGASSEKMAAAYAAFANGGIYHKPMYINKVVFSDGSKKEFSDVGTRAMKETTAYMMTEMMKTVLAYGTGRGAYLPWLAQAGKTGTSNYTDDEIEKHIKNTGYVAPDEMFVGYTRKYSMAVWTGYSNRLTPIVGDGFLVAAKVYRSMITYLSEGSNPEDWNIPEGLYRNGEFVFKNG'

In [31]:
# Show the sequence read from the target's PDB file (no gaps inserted).
pdb_seq

'YPAYMDNYLKEVINQVEQETGYNLLTTGMDVYTNVDQEAQKHLWDIYNSDQYVSYPDDDLQVASTVVDVSNGKVIAQLGARHSFGTNQAVETNRDWGSAMKPITDYAPAIEYGVYDSTATMVNDIPYNYPGTSTPVYNWDRAYFGNITLQYALQQSRNVTAVETLNKVGLDRAKTFLNGLGIDYPSMHYANAISSNTTESNKQYGASSEKMAAAYAAFANGGIYHKPMYINKVVFSDGSKKEFSDVGTRAMKETTAYMMTEMMKTVLAYGTGRGAYLPWLAQAGKTGTSNYTDYVAPDEMFVGYTRKYSMAVWTGYSNRLTPIVGDGFLVAAKVYRSMITYLSEGSPEDWNIPEGLYRNGEFVFKNG'

In [32]:
# Show the residue numbers that belong to the letters of pdb_seq.
pdb_resnum

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  88,  89,  90,  91,  92,  93,  94,  95,  96,
        97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
       110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
       123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135,
       136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
       149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161,
       162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174,
       175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 18

In [33]:
# All-gap template as long as the FASTA sequence; each residue letter goes to
# index (residue number - 1), so positions without a residue stay '-'.
pdb_seq_insert = np.array(['-'] * len(fasta_seq))
pdb_seq_insert[pdb_resnum - 1] = list(pdb_seq)

In [34]:
# Show the gap-padded PDB sequence array.
pdb_seq_insert

array(['Y', 'P', 'A', 'Y', 'M', 'D', 'N', 'Y', 'L', 'K', 'E', 'V', 'I',
       'N', 'Q', 'V', 'E', 'Q', 'E', 'T', 'G', 'Y', 'N', 'L', 'L', 'T',
       'T', 'G', 'M', 'D', 'V', 'Y', 'T', 'N', 'V', 'D', 'Q', 'E', 'A',
       'Q', 'K', 'H', 'L', 'W', 'D', 'I', 'Y', 'N', 'S', 'D', 'Q', 'Y',
       'V', 'S', 'Y', 'P', 'D', 'D', 'D', 'L', 'Q', 'V', 'A', 'S', 'T',
       'V', 'V', 'D', 'V', 'S', 'N', 'G', 'K', 'V', 'I', 'A', 'Q', 'L',
       'G', 'A', 'R', 'H', '-', '-', '-', '-', '-', 'S', 'F', 'G', 'T',
       'N', 'Q', 'A', 'V', 'E', 'T', 'N', 'R', 'D', 'W', 'G', 'S', 'A',
       'M', 'K', 'P', 'I', 'T', 'D', 'Y', 'A', 'P', 'A', 'I', 'E', 'Y',
       'G', 'V', 'Y', 'D', 'S', 'T', 'A', 'T', 'M', 'V', 'N', 'D', 'I',
       'P', 'Y', 'N', 'Y', 'P', 'G', 'T', 'S', 'T', 'P', 'V', 'Y', 'N',
       'W', 'D', 'R', 'A', 'Y', 'F', 'G', 'N', 'I', 'T', 'L', 'Q', 'Y',
       'A', 'L', 'Q', 'Q', 'S', 'R', 'N', 'V', 'T', 'A', 'V', 'E', 'T',
       'L', 'N', 'K', 'V', 'G', 'L', 'D', 'R', 'A', 'K', 'T', 'F

In [52]:
def test_match(fasta_seq, pdb_seq, pdb_resnum):
    fasta_seq_array = np.array(list(fasta_seq))
    pdb_seq_array = np.copy(fasta_seq_array)
    pdb_seq_array[pdb_resnum - 1] = list(pdb_seq)
    num_diff = np.count_nonzero(fasta_seq_array != pdb_seq_array)
    num_missing = len(fasta_seq) - len(pdb_seq)
    return num_diff, num_missing

In [38]:
# Load another target for inspection. Unlike the cells above, the PDB
# sequence is read with insert_gap=True here.
target = '3M0G_A'
chain = target.split('_')[1]
pdb_path = (cut_pdb_dir / target).with_suffix('.pdb')
fasta_path = (fasta_dir / target).with_suffix('.fasta')
fasta_seq = ReadSeq.fasta2seq(fasta_path)
pdb_seq, pdb_resnum = ReadSeq.pdb2seq(pdb_path, chain=chain, read_HETATM=True, insert_gap=True)

In [37]:
# Show the PDB-derived sequence currently bound to pdb_seq.
pdb_seq

'SERLKEVQDAVETAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMAQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHKKWDDATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAILAGADRGPLTAYATALGLAFQIADDILATFVSLLGLAGAKSRAADLVAEAEAALAPYGEAASTLRACARYVIE'

In [41]:
# Show the FASTA sequence currently bound to fasta_seq.
fasta_seq

'SERLKEVQDGVEAAMAAAIGRLPAGDLRDAMAYAAQGGKRLRAFLAIESAAIHGISMEQAMPAALAVEALHAYSLVHDDMPCMDNDDLRRGLPTVHRKWDEATAVLAGDALQTLAFELCTDPVLGSAENRVALVAALAQASGAEGMVYGQALDIAAETAAVPLTLDEIIRLQAGKTGALISFAAQAGAIIAGADRALLAAYATALGLAFQIADDILDVEGDEEAAGKRLGKDAEAHKATFVSLLGLPGAKARAADLVAEAEAALAPYGEAAATLRACARYVIE'

In [42]:
# Align the two sequences with the project helper AlignSeq.align_seq and
# display whatever it returns.
AlignSeq.align_seq(fasta_seq, pdb_seq)

(array(['S', 'E', 'R', 'L', 'K', 'E', 'V', 'Q', 'D', 'G', 'V', 'E', 'A',
        'A', 'M', 'A', 'A', 'A', 'I', 'G', 'R', 'L', 'P', 'A', 'G', 'D',
        'L', 'R', 'D', 'A', 'M', 'A', 'Y', 'A', 'A', 'Q', 'G', 'G', 'K',
        'R', 'L', 'R', 'A', 'F', 'L', 'A', 'I', 'E', 'S', 'A', 'A', 'I',
        'H', 'G', 'I', 'S', 'M', 'E', 'Q', 'A', 'M', 'P', 'A', 'A', 'L',
        'A', 'V', 'E', 'A', 'L', 'H', 'A', 'Y', 'S', 'L', 'V', 'H', 'D',
        'D', 'M', 'P', 'C', 'M', 'D', 'N', 'D', 'D', 'L', 'R', 'R', 'G',
        'L', 'P', 'T', 'V', 'H', 'R', 'K', 'W', 'D', 'E', 'A', 'T', 'A',
        'V', 'L', 'A', 'G', 'D', 'A', 'L', 'Q', 'T', 'L', 'A', 'F', 'E',
        'L', 'C', 'T', 'D', 'P', 'V', 'L', 'G', 'S', 'A', 'E', 'N', 'R',
        'V', 'A', 'L', 'V', 'A', 'A', 'L', 'A', 'Q', 'A', 'S', 'G', 'A',
        'E', 'G', 'M', 'V', 'Y', 'G', 'Q', 'A', 'L', 'D', 'I', 'A', 'A',
        'E', 'T', 'A', 'A', 'V', 'P', 'L', 'T', 'L', 'D', 'E', 'I', 'I',
        'R', 'L', 'Q', 'A', 'G', 'K', 'T', 'G', 'A'