In [1]:
import pandas as pd
import numpy as np
import urllib.request
from bs4 import BeautifulSoup

# Negatome "combined stringent" list of protein pairs (tab-separated).
# No header option is passed, so pandas uses the first line of the file as
# the column names.
negatome_url = 'http://mips.helmholtz-muenchen.de/proj/ppi/negatome/combined_stringent.txt'
database = pd.read_csv(negatome_url, sep='\t')

Приведем табличку с невзаимодействующими белками из Negatome в человеческий вид.

In [2]:
# The two current column names are the accessions of the first pair in the
# file; replace them with descriptive column names.
database = database.rename(columns={'Q6ZNK6': 'Protein_A', 'Q9Y4K3': 'Protein_B'})

In [3]:
# Show the table after the columns were renamed.
database

Unnamed: 0,Protein_A,Protein_B
0,Q9NR31,Q15797
1,P11627,P53986
2,P33176,Q96EK5
3,Q9NPY3,P02745
4,P03211,Q15796
...,...,...
6130,Q9WUF4,O88384
6131,Q9WX78,Q9Z9H6
6132,Q9XJR1,Q9XJR3
6133,Q9XJR3,Q9XJR6


In [4]:
# The pair that ended up as the header is put back as the first data row,
# followed by every row of `database`, with a fresh 0..n-1 index.
data = [{'Protein_A': 'Q6ZNK6', 'Protein_B': 'Q9Y4K3'}]
d = pd.concat([pd.DataFrame(data), database], ignore_index=True)

Для каждого из белков, используя код страницы в UniProt, определяем, является ли он человеческим.

In [6]:
import time
from tqdm import tqdm

# Wall-clock start, used to report the total run time after the loop.
begin = time.time()

# One 0/1 flag per row of `d`. The length is taken from the table itself
# instead of a hard-coded row count, so it always matches the column that is
# later assigned from this list.
arr_human = [0] * len(d)

def check(protein):
    """Tell whether a UniProt entry page reports a human protein.

    Args:
        protein: UniProt accession; it is appended to the entry URL.

    Returns:
        True when the page element with id ``content-organism`` has the text
        ``"Homo sapiens (Human)"``; False when the text differs or the
        element is absent.

    Raises:
        urllib.error.URLError: network/HTTP failures from ``urlopen`` are not
            caught here and propagate to the caller.
    """
    s = 'https://www.uniprot.org/uniprot/' + protein
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(s) as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, 'html.parser')
    organism = soup.find("div", {"id": "content-organism"})
    # An explicit None test replaces the former bare `except:`, which also
    # swallowed unrelated errors such as KeyboardInterrupt.
    if organism is None:
        return False
    return organism.text == "Homo sapiens (Human)"

def do(i):
    """Store in ``arr_human[i]`` whether both proteins of row ``i`` are human.

    Writes 1 when ``check`` is true for Protein_A and Protein_B of row ``i``
    of ``d``, otherwise 0. Protein_B is not checked if Protein_A fails.
    """
    first, second = d.Protein_A[i], d.Protein_B[i]
    both_human = check(first) and check(second)
    arr_human[i] = 1 if both_human else 0
    
# Classify every pair in row order; tqdm renders a progress bar.
for i in tqdm(range(len(d)), position=0, leave=True):
    do(i)

# Report the elapsed wall-clock time in seconds.
elapsed = time.time() - begin
print(elapsed)

100%|██████████| 6136/6136 [2:42:17<00:00,  1.59s/it]   

9737.73421573639





In [7]:
# Attach the per-row 0/1 flags as a new column and preview the first 20 rows.
d = d.assign(human_or_not=arr_human)
display(d.head(20))

Unnamed: 0,Protein_A,Protein_B,human_or_not
0,Q6ZNK6,Q9Y4K3,1
1,Q9NR31,Q15797,1
2,P11627,P53986,0
3,P33176,Q96EK5,1
4,Q9NPY3,P02745,1
5,P03211,Q15796,0
6,Q9SYQ8,O80809,0
7,Q99576,O43524,1
8,P55196,Q9NYB0,1
9,Q7L7X3,O60566,1


Оставляем только пары, где оба белка человеческие.

In [13]:
# Keep only the rows flagged 1 (both proteins human).
d = d.loc[d['human_or_not'] == 1]

Сохраняем на всякий случай.

In [14]:
# Checkpoint the filtered pairs to disk (CSV without the index column).
d.to_csv(path_or_buf="done_proteins", index=False)

In [15]:
# Preview the last 20 rows of the filtered table.
d.iloc[-20:]

Unnamed: 0,Protein_A,Protein_B,human_or_not
5460,P62877,Q15843,1
5461,P62877,Q16531,1
5578,Q04637,Q32Q75,1
5583,Q08209,Q5F2G0,1
5606,Q13291,P06241,1
5607,Q14653,P05412,1
5609,Q15019,Q16181,1
5610,Q15019,Q16181,1
5611,Q15370,Q16665,1
5612,Q15370,Q16665,1


In [16]:
# The flag column has served its purpose as a filter; remove it.
d = d.drop(columns=['human_or_not'])

In [17]:
# Renumber the rows 0..n-1; the previous row labels are kept as a new
# 'index' column.
d = d.reset_index(drop=False)

In [19]:
# Discard the 'index' column holding the old row labels.
d = d.drop(columns=['index'])

In [20]:
# Show the human-only pairs after re-indexing.
d

Unnamed: 0,Protein_A,Protein_B
0,Q6ZNK6,Q9Y4K3
1,Q9NR31,Q15797
2,P33176,Q96EK5
3,Q9NPY3,P02745
4,Q99576,O43524
...,...,...
1434,Q8WUW1,Q9Y2A7
1435,Q92558,Q9Y2A7
1436,Q9NQB7,O00512
1437,Q9UC78,P00742


Добавим для каждого белка определяющую его последовательность аминокислот, используя код соответствующей страницы сайта UniProt.

In [27]:
import time
from tqdm import tqdm

# Accumulators for the downloaded sequences of Protein_A and Protein_B; the
# loop below appends to both once per row.
am_ac_prot_a, am_ac_prot_b = [], []

def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for symbol in inputString:
        if symbol.isdigit():
            return True
    return False

def check(protein):
    """Fetch the amino-acid sequence shown on a UniProt entry page.

    Note: this rebinds the name ``check`` that an earlier cell of this
    notebook uses for the human-organism test.

    Args:
        protein: UniProt accession; it is appended to the entry URL.

    Returns:
        The text children of the page's ``<pre class="sequence">`` element,
        concatenated with spaces removed. Children that are not strings or
        that contain a digit are skipped (presumably position markers --
        confirm against the page markup).

    Raises:
        ValueError: if the page has no ``<pre class="sequence">`` element.
        urllib.error.URLError: network/HTTP failures from ``urlopen``.
    """
    s = 'https://www.uniprot.org/uniprot/' + protein
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(s) as response:
        html_doc = response.read()
    soup = BeautifulSoup(html_doc, 'html.parser')
    sequence_block = soup.find("pre", {"class": "sequence"})
    if sequence_block is None:
        # Fail with the offending accession rather than an opaque
        # "'NoneType' object has no attribute 'contents'".
        raise ValueError('no sequence block found on UniProt page for ' + str(protein))
    fragments = [
        row for row in sequence_block.contents
        if isinstance(row, str) and not hasNumbers(row)
    ]
    return ''.join(fragments).replace(' ', '')

def do(i):
    """Download the sequences of both proteins in row ``i`` of ``d``.

    The Protein_A sequence is appended to ``am_ac_prot_a`` and the Protein_B
    sequence to ``am_ac_prot_b``, in that order.
    """
    pair = (d.Protein_A[i], d.Protein_B[i])
    am_ac_prot_a.append(check(pair[0]))
    am_ac_prot_b.append(check(pair[1]))
    
    
# Fetch both sequences for every pair, in row order, with a progress bar.
for i in tqdm(range(len(d)), position=0, leave=True):
    do(i)

100%|██████████| 1439/1439 [54:51<00:00,  2.29s/it] 


In [31]:
# Attach the downloaded sequences as two new columns, then show the table.
d = d.assign(
    amino_acid_prot_A=am_ac_prot_a,
    amino_acid_prot_B=am_ac_prot_b,
)
d

Unnamed: 0,Protein_A,Protein_B,amino_acid_prot_A,amino_acid_prot_B
0,Q6ZNK6,Q9Y4K3,MEKPLTVLRVSLYHPTLGPSAFANVPPRLQHDTSPLLLGRGQDAHL...,MSLLNCENSCGSSQSESDCCVAMASSCSAVTKDDSVGGTASTGNLS...
1,Q9NR31,Q15797,MSFIFEWIYNGFSSVLQFLGLYKKSGKLVFLGLDNAGKTTLLHMLK...,MNVTSLFSFTSPAVKRLLGWKQGDEEEKWAEKAVDALVKKLKKKKG...
2,P33176,Q96EK5,MADLAECNIKVMCRFRPLNESEVNRGDKYIAKFQGEDTVVIASKPY...,MANVPWAEVCEKFQAALALSRVELHKNPEKEPYKSKYSARALLEEV...
3,Q9NPY3,P02745,MATSMGLLLLLLLLLTQPGAGTGADTEAVVCVGTACYTAHSGKLSA...,MEGPRGWLVLCVLAISLASMVTEDLCRAPDGKKGEAGRPGRRGRPG...
4,Q99576,O43524,MNTEMYQTPMEVAVYQLHNFSISFFSSLLGGDVVSVKLDNSASGAS...,MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK...
...,...,...,...,...
1434,Q8WUW1,Q9Y2A7,MAGQEDPVQREIHQDWANREYIEIITSSIKKIADFLNSFDMSCRSR...,MSRSVLQPSQQKLAEKLTILNDRGVGMLTRLYNIKKACGDPKAKPS...
1435,Q92558,Q9Y2A7,MPLVKRNIDPRHLCHTALPRGIKNELECVTNISLANIIRQLSSLSK...,MSRSVLQPSQQKLAEKLTILNDRGVGMLTRLYNIKKACGDPKAKPS...
1436,Q9NQB7,O00512,MPQLNGGGGDDLGANDELISFKDEGEQEEKSSENSSAERDLADVKS...,MHSSNPKVRSSPSGNTQSSPKSKQEVMVRPPTVMSPSGNPQLDSKF...
1437,Q9UC78,P00742,MYSNVIGTVTSGKRKVYLLSLLLIGFWDCVTCHGSPVDICTAKPRD...,MGRPLHLVLLSASLAGLLLLGESLFIRREQANNILARVTRANSFLE...


Получаем табличку невзаимодействующих человеческих белков и их последовательности аминокислот. Выгрузим табличку от греха подальше.

In [32]:
# Persist the negative (non-interacting) pairs with their sequences.
d.to_csv(path_or_buf="negative_table", index=False)

Теперь приведем в нужный вид табличку взаимодействующих белков из BioGrid.

In [33]:
# BioGRID Homo sapiens interaction table (tab-separated), read from the
# user's home directory.
second_db = pd.read_csv(
    filepath_or_buffer='~/BIOGRID-ORGANISM-Homo_sapiens-4.1.190.tab.txt',
    sep='\t',
)

In [34]:
# Drop the columns that are not used further in this notebook.
second_db = second_db.drop(
    columns=[
        'ALIASES_FOR_A',
        'ALIASES_FOR_B',
        'EXPERIMENTAL_SYSTEM',
        'SOURCE',
        'PUBMED_ID',
        'ORGANISM_A_ID',
        'ORGANISM_B_ID',
    ]
)

In [35]:
# Show the trimmed BioGRID table.
second_db

Unnamed: 0,INTERACTOR_A,INTERACTOR_B,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B
0,ETG6416,ETG2318,MAP2K4,FLNC
1,ETG84665,ETG88,MYPN,ACTN2
2,ETG90,ETG2339,ACVR1,FNTA
3,ETG2624,ETG5371,GATA2,PML
4,RP4-547C9.3,ETG6774,RPA2,STAT3
...,...,...,...,...
647862,ETG64750,ETG7316,SMURF2,UBC
647863,ETG57154,ETG57154,SMURF1,SMURF1
647864,ETG64750,ETG64750,SMURF2,SMURF2
647865,ETG57154,ETG7332,SMURF1,UBE2L3


Узнаем, какие идентификаторы будут иметь данные белки в системе UniProt.

In [37]:
# Write every distinct OFFICIAL_SYMBOL_B value, one per line. The `with`
# block closes (and therefore flushes) the file; previously the handle was
# never closed, so the tail of the output could stay unwritten.
with open("unique_gene_prot_b", "w") as out1:
    for item in second_db.OFFICIAL_SYMBOL_B.unique():
        out1.write(item + '\n')

In [38]:
# Write every distinct OFFICIAL_SYMBOL_A value, one per line. The `with`
# block closes (and therefore flushes) the file; previously the handle was
# never closed, so the tail of the output could stay unwritten.
with open("unique_gene_prot_a", "w") as out2:
    for item in second_db.OFFICIAL_SYMBOL_A.unique():
        out2.write(item + '\n')

Выгрузив столбцы с gene name, преобразуем их в идентификаторы UniProt, загрузив непосредственно на сайт. Для этого разобьем каждый список на несколько частей, так как сайт не может обработать файлы такого размера, а затем снова сольем результаты в один файл. Заметим, что какое-то количество данных не было найдено в системе UniProt. Обрабатывались лишь уникальные значения.

In [42]:
# UniProt ID-mapping result for the OFFICIAL_SYMBOL_A gene names.
# The rename is applied to the frame that was just loaded; the earlier code
# called `df.rename(...)` on a name never defined in this notebook, which
# raises NameError on a fresh kernel (or silently uses stale data).
df_a = (
    pd.read_csv('~/megred.tab', sep='\t')
    .drop(['Status'], axis=1)
    .rename(columns={'yourlist:M20201026A94466D2655679D1FD8953E075198DA808DF3FI' : 'OFFICIAL_SYMBOL',
                     'Entry' : 'UNIPROT_SYMBOL',
                     'Sequence' : 'SEQUENCE'})
)
df_a

Unnamed: 0,UNIPROT_SYMBOL,OFFICIAL_SYMBOL,SEQUENCE
0,P45985,MAP2K4,MAAPSPSGGGGSGGGSGSGTPGPVGSPAPGHPAVSSMQGKRKALKL...
1,Q86TC9,MYPN,MQDDSIEASTSISQLLRESYLAETRHRGNNERSRAEPSSNPCHFGS...
2,Q04771,ACVR1,MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNED...
3,P23769,GATA2,MEVAPEQPRWMAHPAVLNAQHPDSHHPGLAHNYMEPAQLLPPDEVD...
4,P15927,RPA2,MWNSGFESYGSSSYGGAGGYTQSPGGFGSPAPSQAEKKSRARAQHI...
...,...,...,...
15010,A6NFD8,HELT,MSDKLKERKRTPVSHKVIEKRRRDRINRCLNELGKTVPMALAKQSS...
15011,Q6XD76,ASCL4,METRKPAERLALPYSLRTAPLGVPGTLPGLPRRDPLRVALRLDAAC...
15012,Q6ZS10,CLEC17A,MHNLYSITGYPDPPGTMEEEEEDDDYENSTPPYKDLPPKPGTMEEE...
15013,Q8NFT8,DNER,MQPRRAQAPGAQLLPALALLLLLLGAGPRGSSLANPVPAAPLSAPG...


In [41]:
# UniProt ID-mapping result for the OFFICIAL_SYMBOL_B gene names, with the
# columns renamed to the names used in the rest of the notebook.
df_b = (
    pd.read_csv('~/merged_.tab', sep='\t')
    .rename(columns={
        'yourlist:M202010275C475328CEF75220C360D524E9D456CE024E8A0': 'OFFICIAL_SYMBOL',
        'Entry': 'UNIPROT_SYMBOL',
        'Sequence': 'SEQUENCE',
    })
)
df_b

Unnamed: 0,UNIPROT_SYMBOL,SEQUENCE,OFFICIAL_SYMBOL
0,Q14315,MMNNSGYSDAGLGLGDETDEMPSTEKDLAEDAPWKKIQQNTFTRWC...,FLNC
1,P35609,MNQIEPGVQYNYVYDEDEYMIQEEEWDRDLLLDPAWEKQQRKTFTA...,ACTN2
2,P49354,MAATEGVGEAAQGGEPGQPAQPPPQPHPPPPQQQHKEEMAAEAGEA...,FNTA
3,P29590,MEPAPARSPRPQQDPARPQEPTMPPPETPSEGRQPSPSPSPTERAP...,PML
4,P40763,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...,STAT3
...,...,...,...
20313,O75881,MAGEVSAATGRFSLERLGLPGLALAAALLLLALCLLVRRTRRPGEP...,CYP7B1
20314,Q86UD5,MGDEDKRITYEDSEPSTGMNYTPSMHQEAQEETVMKLKGIDANEPT...,SLC9B2
20315,P56597,MEISMPPPQIYVEKTLAIIKPDIVDKEEEIQDIILRSGFTIVQRRK...,NME5
20316,P47900,MTEVLWPAVPNGTDAAFLAGPGSSWGNSTVASTAAVSSSFKCALTK...,P2RY1


Затем пробегаем по исходной таблице и, если gene name был найден и переведен, записываем для него его идентификатор в UniProt и последовательность аминокислот.

In [101]:
# arr:      gene symbol -> (UniProt accession, sequence), kept only for
#           symbols that occur exactly once across all rows of df_a.
# negative: symbols seen more than once; they are removed from `arr` and
#           never added back.
arr = {}
negative = set()
for symbols, uniprot_sym, gene_seq in zip(df_a['OFFICIAL_SYMBOL'],
                                          df_a['UNIPROT_SYMBOL'],
                                          df_a['SEQUENCE']):
    # A single row may list several comma-separated symbols.
    for off_sym in symbols.split(','):
        if off_sym in negative:
            continue
        if off_sym in arr:
            negative.add(off_sym)
            del arr[off_sym]
        else:
            arr[off_sym] = (uniprot_sym, gene_seq)

In [102]:
# arr_:      gene symbol -> (UniProt accession, sequence), kept only for
#            symbols that occur exactly once across all rows of df_b.
# negative_: symbols seen more than once; they are removed from `arr_` and
#            never added back.
arr_ = {}
negative_ = set()
for symbols, uniprot_sym, gene_seq in zip(df_b['OFFICIAL_SYMBOL'],
                                          df_b['UNIPROT_SYMBOL'],
                                          df_b['SEQUENCE']):
    # A single row may list several comma-separated symbols.
    for off_sym in symbols.split(','):
        if off_sym in negative_:
            continue
        if off_sym in arr_:
            negative_.add(off_sym)
            del arr_[off_sym]
        else:
            arr_[off_sym] = (uniprot_sym, gene_seq)

In [103]:
# Keep the interactions whose A-side symbol has a mapping in `arr`, then
# add that mapping's accession and sequence as two new columns.
has_mapping_a = [off_sym in arr for off_sym in second_db['OFFICIAL_SYMBOL_A']]
debug = second_db[has_mapping_a]
mapped_a = [arr[off_sym] for off_sym in debug['OFFICIAL_SYMBOL_A']]
debug = debug.assign(
    UNIPROT_SYMBOL_A=[uniprot_sym for uniprot_sym, _ in mapped_a],
    SEQUENCE_A=[gene_seq for _, gene_seq in mapped_a],
)

In [104]:
# Show the interactions annotated with the A-side accession and sequence.
debug

Unnamed: 0,INTERACTOR_A,INTERACTOR_B,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B,UNIPROT_SYMBOL_A,SEQUENCE_A
0,ETG6416,ETG2318,MAP2K4,FLNC,P45985,MAAPSPSGGGGSGGGSGSGTPGPVGSPAPGHPAVSSMQGKRKALKL...
1,ETG84665,ETG88,MYPN,ACTN2,Q86TC9,MQDDSIEASTSISQLLRESYLAETRHRGNNERSRAEPSSNPCHFGS...
2,ETG90,ETG2339,ACVR1,FNTA,Q04771,MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNED...
3,ETG2624,ETG5371,GATA2,PML,P23769,MEVAPEQPRWMAHPAVLNAQHPDSHHPGLAHNYMEPAQLLPPDEVD...
4,RP4-547C9.3,ETG6774,RPA2,STAT3,P15927,MWNSGFESYGSSSYGGAGGYTQSPGGFGSPAPSQAEKKSRARAQHI...
...,...,...,...,...,...,...
647862,ETG64750,ETG7316,SMURF2,UBC,Q9HAU4,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...
647863,ETG57154,ETG57154,SMURF1,SMURF1,Q9HCE7,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...
647864,ETG64750,ETG64750,SMURF2,SMURF2,Q9HAU4,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...
647865,ETG57154,ETG7332,SMURF1,UBE2L3,Q9HCE7,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...


In [105]:
# Keep the rows whose B-side symbol has a mapping in `arr_`, then add that
# mapping's accession and sequence as two new columns.
has_mapping_b = [off_sym in arr_ for off_sym in debug['OFFICIAL_SYMBOL_B']]
debug_ = debug[has_mapping_b]
mapped_b = [arr_[off_sym] for off_sym in debug_['OFFICIAL_SYMBOL_B']]
debug_ = debug_.assign(
    UNIPROT_SYMBOL_B=[uniprot_sym for uniprot_sym, _ in mapped_b],
    SEQUENCE_B=[gene_seq for _, gene_seq in mapped_b],
)

In [106]:
# Show the interactions annotated on both the A and B side.
debug_

Unnamed: 0,INTERACTOR_A,INTERACTOR_B,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B,UNIPROT_SYMBOL_A,SEQUENCE_A,UNIPROT_SYMBOL_B,SEQUENCE_B
0,ETG6416,ETG2318,MAP2K4,FLNC,P45985,MAAPSPSGGGGSGGGSGSGTPGPVGSPAPGHPAVSSMQGKRKALKL...,Q14315,MMNNSGYSDAGLGLGDETDEMPSTEKDLAEDAPWKKIQQNTFTRWC...
1,ETG84665,ETG88,MYPN,ACTN2,Q86TC9,MQDDSIEASTSISQLLRESYLAETRHRGNNERSRAEPSSNPCHFGS...,P35609,MNQIEPGVQYNYVYDEDEYMIQEEEWDRDLLLDPAWEKQQRKTFTA...
2,ETG90,ETG2339,ACVR1,FNTA,Q04771,MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNED...,P49354,MAATEGVGEAAQGGEPGQPAQPPPQPHPPPPQQQHKEEMAAEAGEA...
3,ETG2624,ETG5371,GATA2,PML,P23769,MEVAPEQPRWMAHPAVLNAQHPDSHHPGLAHNYMEPAQLLPPDEVD...,P29590,MEPAPARSPRPQQDPARPQEPTMPPPETPSEGRQPSPSPSPTERAP...
4,RP4-547C9.3,ETG6774,RPA2,STAT3,P15927,MWNSGFESYGSSSYGGAGGYTQSPGGFGSPAPSQAEKKSRARAQHI...,P40763,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...
...,...,...,...,...,...,...,...,...
647862,ETG64750,ETG7316,SMURF2,UBC,Q9HAU4,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...,P0CG48,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...
647863,ETG57154,ETG57154,SMURF1,SMURF1,Q9HCE7,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...,Q9HCE7,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...
647864,ETG64750,ETG64750,SMURF2,SMURF2,Q9HAU4,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...,Q9HAU4,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...
647865,ETG57154,ETG7332,SMURF1,UBE2L3,Q9HCE7,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...,P68036,MAASRRLMKELEEIRKCGMKNFRNIQVDEANLLTWQGLIVPDNPPY...


In [107]:
# Remove the two INTERACTOR_* identifier columns.
debug_ = debug_.drop(columns=['INTERACTOR_A', 'INTERACTOR_B'])

In [108]:
# Final column order: accessions, then gene symbols, then sequences.
columns_titles = [
    "UNIPROT_SYMBOL_A",
    "UNIPROT_SYMBOL_B",
    "OFFICIAL_SYMBOL_A",
    "OFFICIAL_SYMBOL_B",
    "SEQUENCE_A",
    "SEQUENCE_B",
]
debug_ = debug_.reindex(columns_titles, axis='columns')

In [109]:
# Show the table with the columns in their final order.
debug_

Unnamed: 0,UNIPROT_SYMBOL_A,UNIPROT_SYMBOL_B,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B,SEQUENCE_A,SEQUENCE_B
0,P45985,Q14315,MAP2K4,FLNC,MAAPSPSGGGGSGGGSGSGTPGPVGSPAPGHPAVSSMQGKRKALKL...,MMNNSGYSDAGLGLGDETDEMPSTEKDLAEDAPWKKIQQNTFTRWC...
1,Q86TC9,P35609,MYPN,ACTN2,MQDDSIEASTSISQLLRESYLAETRHRGNNERSRAEPSSNPCHFGS...,MNQIEPGVQYNYVYDEDEYMIQEEEWDRDLLLDPAWEKQQRKTFTA...
2,Q04771,P49354,ACVR1,FNTA,MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNED...,MAATEGVGEAAQGGEPGQPAQPPPQPHPPPPQQQHKEEMAAEAGEA...
3,P23769,P29590,GATA2,PML,MEVAPEQPRWMAHPAVLNAQHPDSHHPGLAHNYMEPAQLLPPDEVD...,MEPAPARSPRPQQDPARPQEPTMPPPETPSEGRQPSPSPSPTERAP...
4,P15927,P40763,RPA2,STAT3,MWNSGFESYGSSSYGGAGGYTQSPGGFGSPAPSQAEKKSRARAQHI...,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...
...,...,...,...,...,...,...
647862,Q9HAU4,P0CG48,SMURF2,UBC,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...
647863,Q9HCE7,Q9HCE7,SMURF1,SMURF1,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...
647864,Q9HAU4,Q9HAU4,SMURF2,SMURF2,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...
647865,Q9HCE7,P68036,SMURF1,UBE2L3,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...,MAASRRLMKELEEIRKCGMKNFRNIQVDEANLLTWQGLIVPDNPPY...


In [110]:
# Renumber the rows 0..n-1, discarding the row labels inherited from the
# unfiltered BioGRID table, and show the result.
debug_ = debug_.reset_index(drop=True)
debug_

Unnamed: 0,UNIPROT_SYMBOL_A,UNIPROT_SYMBOL_B,OFFICIAL_SYMBOL_A,OFFICIAL_SYMBOL_B,SEQUENCE_A,SEQUENCE_B
0,P45985,Q14315,MAP2K4,FLNC,MAAPSPSGGGGSGGGSGSGTPGPVGSPAPGHPAVSSMQGKRKALKL...,MMNNSGYSDAGLGLGDETDEMPSTEKDLAEDAPWKKIQQNTFTRWC...
1,Q86TC9,P35609,MYPN,ACTN2,MQDDSIEASTSISQLLRESYLAETRHRGNNERSRAEPSSNPCHFGS...,MNQIEPGVQYNYVYDEDEYMIQEEEWDRDLLLDPAWEKQQRKTFTA...
2,Q04771,P49354,ACVR1,FNTA,MVDGVMILPVLIMIALPSPSMEDEKPKVNPKLYMCVCEGLSCGNED...,MAATEGVGEAAQGGEPGQPAQPPPQPHPPPPQQQHKEEMAAEAGEA...
3,P23769,P29590,GATA2,PML,MEVAPEQPRWMAHPAVLNAQHPDSHHPGLAHNYMEPAQLLPPDEVD...,MEPAPARSPRPQQDPARPQEPTMPPPETPSEGRQPSPSPSPTERAP...
4,P15927,P40763,RPA2,STAT3,MWNSGFESYGSSSYGGAGGYTQSPGGFGSPAPSQAEKKSRARAQHI...,MAQWNQLQQLDTRYLEQLHQLYSDSFPMELRQFLAPWIESQDWAYA...
...,...,...,...,...,...,...
515236,Q9HAU4,P0CG48,SMURF2,UBC,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...
515237,Q9HCE7,Q9HCE7,SMURF1,SMURF1,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...
515238,Q9HAU4,Q9HAU4,SMURF2,SMURF2,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...,MSNPGGRRNGPVKLRLTVLCAKNLVKKDFFRLPDPFAKVVVDGSGQ...
515239,Q9HCE7,P68036,SMURF1,UBE2L3,MSNPGTRRNGSSIKIRLTVLCAKNLAKKDFFRLPDPFAKIVVDGSG...,MAASRRLMKELEEIRKCGMKNFRNIQVDEANLLTWQGLIVPDNPPY...


Сохраним полученную таблицу взаимодействующих человеческих белков и их последовательности аминокислот.

In [111]:
# Persist the positive (interacting) pairs with their sequences.
debug_.to_csv(path_or_buf="positive_table_done", index=False)