In [1]:
import pandas as pd
import seaborn as sns
import time
import re
import nltk
import math
import os
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
def fasta_parser(filename):
    if os.path.exists(filename):
        pass
    else:                     
        print("The file, %s, does not exist" % filename)
        
    file = open(filename,mode='r')
    rec_all = file.read()
    file.close()
    # remove all whitespace from string all_of_it
    rec_all = rec_all.replace(' ','')
    # split records by > 
    records = rec_all.split('>')
    # Parse out the headers & sequences for each record
    headers = []
    sequences = []
    for rec in records:
        s = ''
        data = rec.split('\n')
        sq = s.join(data[1:])
        if len(data[0]) > 0:
            headers.append(data[0])
   
        if (len(sq) > 0):
            sequences.append(s.join(data[1:]))
    
    return headers, sequences

In [3]:
import glob
glob.glob('*.fasta')

['ios_7.fasta',
 'ios_8.fasta',
 'iso_1.fasta',
 'iso_2.fasta',
 'iso_3.fasta',
 'iso_4.fasta',
 'iso_5.fasta',
 'iso_6.fasta',
 'isomerase_2000.fasta',
 'uniprot-isomerase.fasta']

In [4]:
iso_head, iso_seqs = fasta_parser('uniprot-isomerase.fasta')

    

In [5]:
for hd in iso_head:
    print(hd)
    print()

sp|P61615|IDI2_SACSHIsopentenyl-diphosphatedelta-isomeraseOS=SaccharolobusshibataeOX=2286GN=fniPE=1SV=1

sp|Q746I8|IDI2_THET2Isopentenyl-diphosphatedelta-isomeraseOS=Thermusthermophilus(strainHB27/ATCCBAA-163/DSM7039)OX=262724GN=fniPE=1SV=1

sp|Q8ZWV0|PGMI_PYRAEBifunctionalphosphoglucose/phosphomannoseisomeraseOS=Pyrobaculumaerophilum(strainATCC51768/IM2/DSM7523/JCM9630/NBRC100827)OX=178306GN=PAE1610PE=1SV=1

sp|P42126|ECI1_HUMANEnoyl-CoAdeltaisomerase1,mitochondrialOS=HomosapiensOX=9606GN=ECI1PE=1SV=1

sp|Q9WUR2|ECI2_MOUSEEnoyl-CoAdeltaisomerase2,mitochondrialOS=MusmusculusOX=10090GN=Eci2PE=1SV=2

sp|O75521|ECI2_HUMANEnoyl-CoAdeltaisomerase2,mitochondrialOS=HomosapiensOX=9606GN=ECI2PE=1SV=4

sp|Q5XIC0|ECI2_RATEnoyl-CoAdeltaisomerase2,mitochondrialOS=RattusnorvegicusOX=10116GN=Eci2PE=1SV=1

sp|Q46822|IDI_ECOLIIsopentenyl-diphosphateDelta-isomeraseOS=Escherichiacoli(strainK12)OX=83333GN=idiPE=1SV=1

sp|Q15125|EBP_HUMAN3-beta-hydroxysteroid-Delta(8),Delta(7)-isomeraseOS=HomosapiensOX=960

sp|A0QXE5|DERI1_MYCS2D-erythrulose-4-phosphateisomerase1OS=Mycolicibacteriumsmegmatis(strainATCC700084/mc(2)155)OX=246196GN=derI1PE=1SV=1

sp|A2SKF8|G6PI_METPPGlucose-6-phosphateisomeraseOS=Methylibiumpetroleiphilum(strainATCCBAA-1232/LMG22953/PM1)OX=420662GN=pgiPE=3SV=1

sp|P0C1J5|FKB2B_RHIO9FK506-bindingprotein2BOS=Rhizopusdelemar(strainRA99-880/ATCCMYA-4621/FGSC9543/NRRL43880)OX=246409GN=FKBP3PE=3SV=1

sp|Q6E7F1|FCF2_ECOLXdTDP-fucopyranosemutaseOS=EscherichiacoliOX=562GN=fcf2PE=1SV=1

sp|Q2J390|DAPF_RHOP2DiaminopimelateepimeraseOS=Rhodopseudomonaspalustris(strainHaA2)OX=316058GN=dapFPE=3SV=1

sp|Q6ND59|DAPF_RHOPADiaminopimelateepimeraseOS=Rhodopseudomonaspalustris(strainATCCBAA-98/CGA009)OX=258594GN=dapFPE=3SV=1

sp|Q3BR53|GPMA_XANC52,3-bisphosphoglycerate-dependentphosphoglyceratemutaseOS=Xanthomonascampestrispv.vesicatoria(strain85-10)OX=316273GN=gpmAPE=3SV=1

sp|B9JUB9|DAPF_AGRVSDiaminopimelateepimeraseOS=Agrobacteriumvitis(strainS4/ATCCBAA-846)OX=311402GN=dapFPE=3SV=1

sp|Q81DD2


sp|B2J7M9|GSA_NOSP7Glutamate-1-semialdehyde2,1-aminomutaseOS=Nostocpunctiforme(strainATCC29133/PCC73102)OX=63737GN=hemLPE=3SV=1

sp|A1R898|GSA_PAEATGlutamate-1-semialdehyde2,1-aminomutaseOS=Paenarthrobacteraurescens(strainTC1)OX=290340GN=hemLPE=3SV=1

sp|Q5R8T8|GNPI1_PONABGlucosamine-6-phosphateisomerase1OS=PongoabeliiOX=9601GN=GNPDA1PE=2SV=1

sp|A4XK09|GSA_CALS8Glutamate-1-semialdehyde2,1-aminomutaseOS=Caldicellulosiruptorsaccharolyticus(strainATCC43494/DSM8903/Tp8T6331)OX=351627GN=hemLPE=3SV=1

sp|Q71XG0|GPMA_LISMF2,3-bisphosphoglycerate-dependentphosphoglyceratemutaseOS=Listeriamonocytogenesserotype4b(strainF2365)OX=265669GN=gpmAPE=3SV=1

sp|Q3A7W5|GSA_PELCDGlutamate-1-semialdehyde2,1-aminomutaseOS=Pelobactercarbinolicus(strainDSM2380/NBRC103641/GraBd1)OX=338963GN=hemLPE=3SV=1

sp|B6ELS3|FABA_ALISL3-hydroxydecanoyl-[acyl-carrier-protein]dehydrataseOS=Aliivibriosalmonicida(strainLFI1238)OX=316275GN=fabAPE=3SV=1

sp|C4ZT65|DEOB_ECOBWPhosphopentomutaseOS=Escherichiacoli(strainK12/MC41

sp|B8GU31|HIS4_THISH1-(5-phosphoribosyl)-5-[(5-phosphoribosylamino)methylideneamino]imidazole-4-carboxamideisomeraseOS=Thioalkalivibriosulfidiphilus(strainHL-EbGR7)OX=396588GN=hisAPE=3SV=1

sp|O27668|HACA_METTHProbablemethanogenhomoaconitaselargesubunitOS=Methanothermobacterthermautotrophicus(strainATCC29096/DSM1053/JCM10044/NBRC100330/DeltaH)OX=187420GN=hacAPE=3SV=1

sp|A1WR23|HIS4_VEREI1-(5-phosphoribosyl)-5-[(5-phosphoribosylamino)methylideneamino]imidazole-4-carboxamideisomeraseOS=Verminephrobactereiseniae(strainEF01-2)OX=391735GN=hisAPE=3SV=1

sp|A1VRR1|LEUD_POLNA3-isopropylmalatedehydratasesmallsubunitOS=Polaromonasnaphthalenivorans(strainCJ2)OX=365044GN=leuDPE=3SV=1

sp|C5D5M1|ILVC_GEOSWKetol-acidreductoisomerase(NADP(+))OS=Geobacillussp.(strainWCH70)OX=471223GN=ilvCPE=3SV=1

sp|A8ALM7|LEUC_CITK83-isopropylmalatedehydrataselargesubunitOS=Citrobacterkoseri(strainATCCBAA-895/CDC4225-83/SGSC4696)OX=290338GN=leuCPE=3SV=1

sp|Q88FY4|MAIA_PSEPKMaleateisomeraseOS=Pseudomonasputida(stra


sp|A5UA16|HIS4_HAEIE1-(5-phosphoribosyl)-5-[(5-phosphoribosylamino)methylideneamino]imidazole-4-carboxamideisomeraseOS=Haemophilusinfluenzae(strainPittEE)OX=374930GN=hisAPE=3SV=1

sp|C1AGA3|LEUC_MYCBT3-isopropylmalatedehydrataselargesubunitOS=Mycobacteriumbovis(strainBCG/Tokyo172/ATCC35737/TMC1019)OX=561275GN=leuCPE=3SV=1

sp|Q03KB3|LEUC_STRTD3-isopropylmalatedehydrataselargesubunitOS=Streptococcusthermophilus(strainATCCBAA-491/LMD-9)OX=322159GN=leuCPE=3SV=1

sp|A0RZ77|HIS4_CENSY1-(5-phosphoribosyl)-5-[(5-phosphoribosylamino)methylideneamino]imidazole-4-carboxamideisomeraseOS=Cenarchaeumsymbiosum(strainA)OX=414004GN=hisAPE=3SV=1

sp|Q7NH80|ILVC_GLOVIKetol-acidreductoisomerase(NADP(+))OS=Gloeobacterviolaceus(strainATCC29082/PCC7421)OX=251221GN=ilvCPE=3SV=1

sp|Q9HZA4|LEUD_PSEAE3-isopropylmalatedehydratasesmallsubunitOS=Pseudomonasaeruginosa(strainATCC15692/DSM22644/CIP104116/JCM14847/LMG12228/1C/PRS101/PAO1)OX=208964GN=leuDPE=3SV=1

sp|Q1R079|HIS4_CHRSD1-(5-phosphoribosyl)-5-[(5-phosph

sp|P67911|HLDD_ECO57ADP-L-glycero-D-manno-heptose-6-epimeraseOS=EscherichiacoliO157:H7OX=83334GN=hldDPE=1SV=1

sp|B1X953|HLDD_ECODHADP-L-glycero-D-manno-heptose-6-epimeraseOS=Escherichiacoli(strainK12/DH10B)OX=316385GN=hldDPE=3SV=1

sp|B7K2I1|GSA_RIPO1Glutamate-1-semialdehyde2,1-aminomutaseOS=Rippkaeaorientalis(strainPCC8801)OX=41431GN=hemLPE=3SV=1

sp|Q5PC05|HLDD_SALPAADP-L-glycero-D-manno-heptose-6-epimeraseOS=SalmonellaparatyphiA(strainATCC9150/SARB42)OX=295319GN=hldDPE=3SV=1

sp|C5BB97|HLDD_EDWI9ADP-L-glycero-D-manno-heptose-6-epimeraseOS=Edwardsiellaictaluri(strain93-146)OX=634503GN=hldDPE=3SV=1

sp|B5F8R5|GSA_SALA4Glutamate-1-semialdehyde2,1-aminomutaseOS=Salmonellaagona(strainSL483)OX=454166GN=hemLPE=3SV=1

sp|Q7VKK8|HLDD_HAEDUADP-L-glycero-D-manno-heptose-6-epimeraseOS=Haemophilusducreyi(strain35000HP/ATCC700724)OX=233412GN=hldDPE=3SV=1

sp|Q81S27|ILVC2_BACANKetol-acidreductoisomerase(NADP(+))2OS=BacillusanthracisOX=1392GN=ilvC2PE=3SV=1

sp|Q73A47|ILVC2_BACC1Ketol-acidreductois


sp|A5I1A6|DRDI_CLOBH5-deoxyribose1-phosphateisomeraseOS=Clostridiumbotulinum(strainHall/ATCC3502/NCTC13319/TypeA)OX=441771GN=drdIPE=3SV=1

sp|O51602|GPMA_BORBU2,3-bisphosphoglycerate-dependentphosphoglyceratemutaseOS=Borreliaburgdorferi(strainATCC35210/B31/CIP102532/DSM4680)OX=224326GN=gpmAPE=1SV=2

sp|B5YY93|FADB_ECO5EFattyacidoxidationcomplexsubunitalphaOS=EscherichiacoliO157:H7(strainEC4115/EHEC)OX=444450GN=fadBPE=3SV=1

sp|Q63GB4|GSA1_BACCZGlutamate-1-semialdehyde2,1-aminomutase1OS=Bacilluscereus(strainZK/E33L)OX=288681GN=hemL1PE=3SV=1

sp|Q97FP8|G6PI_CLOABGlucose-6-phosphateisomeraseOS=Clostridiumacetobutylicum(strainATCC824/DSM792/JCM1419/LMG5710/VKMB-1787)OX=272562GN=pgiPE=3SV=1

sp|Q2YIQ6|ERYH_BRUA2L-erythrulose-1-phosphateisomeraseOS=Brucellaabortus(strain2308)OX=359391GN=eryHPE=1SV=1

sp|Q81J03|GLMM_BACCRPhosphoglucosaminemutaseOS=Bacilluscereus(strainATCC14579/DSM31/JCM2152/NBRC15305/NCIMB9373/NRRLB-3711)OX=226900GN=glmMPE=3SV=1

sp|B1XAK8|FADB_ECODHFattyacidoxidationcomple


sp|Q8XA10|RLUA_ECO57Dual-specificityRNApseudouridinesynthaseRluAOS=EscherichiacoliO157:H7OX=83334GN=rluAPE=3SV=3

sp|B7K6D3|RPIA_RIPO1Ribose-5-phosphateisomeraseAOS=Rippkaeaorientalis(strainPCC8801)OX=41431GN=rpiAPE=3SV=1

sp|B2K0R4|RPIA_YERPBRibose-5-phosphateisomeraseAOS=YersiniapseudotuberculosisserotypeIB(strainPB1/+)OX=502801GN=rpiAPE=3SV=1

sp|Q8RLY6|RPIA_ENTCLRibose-5-phosphateisomeraseAOS=EnterobactercloacaeOX=550GN=rpiAPE=3SV=1

sp|Q7VP95|RPIA_HAEDURibose-5-phosphateisomeraseAOS=Haemophilusducreyi(strain35000HP/ATCC700724)OX=233412GN=rpiAPE=3SV=1

sp|B0TY81|RPIA_FRAP2Ribose-5-phosphateisomeraseAOS=Francisellaphilomiragiasubsp.philomiragia(strainATCC25017)OX=484022GN=rpiAPE=3SV=1

sp|Q7NPM5|RPIA_GLOVIRibose-5-phosphateisomeraseAOS=Gloeobacterviolaceus(strainATCC29082/PCC7421)OX=251221GN=rpiAPE=3SV=1

sp|B8DTD1|TIG_BIFA0TriggerfactorOS=Bifidobacteriumanimalissubsp.lactis(strainAD011)OX=442563GN=tigPE=3SV=1

sp|Q2KYQ1|RPIA_BORA1Ribose-5-phosphateisomeraseAOS=Bordetellaavium(stra


sp|A8G507|TPIS_PROM2TriosephosphateisomeraseOS=Prochlorococcusmarinus(strainMIT9215)OX=93060GN=tpiAPE=3SV=1

sp|Q8P7U9|TRUB_XANCPtRNApseudouridinesynthaseBOS=Xanthomonascampestrispv.campestris(strainATCC33913/DSM3586/NCPPB528/LMG568/P25)OX=190485GN=truBPE=3SV=1

sp|A2RFQ3|TPIS_STRPGTriosephosphateisomeraseOS=StreptococcuspyogenesserotypeM5(strainManfredo)OX=160491GN=tpiAPE=3SV=1

sp|Q48UK4|TPIS_STRPMTriosephosphateisomeraseOS=StreptococcuspyogenesserotypeM28(strainMGAS6180)OX=319701GN=tpiAPE=3SV=1

sp|Q31XA7|TRUD_SHIBStRNApseudouridinesynthaseDOS=Shigellaboydiiserotype4(strainSb227)OX=300268GN=truDPE=3SV=1

sp|P66942|TPIS_STRPNTriosephosphateisomeraseOS=Streptococcuspneumoniaeserotype4(strainATCCBAA-334/TIGR4)OX=170187GN=tpiAPE=3SV=1

sp|A5USG0|TRUA_ROSS1tRNApseudouridinesynthaseAOS=Roseiflexussp.(strainRS-1)OX=357808GN=truAPE=3SV=1

sp|Q5LNR0|TRUA_RUEPOtRNApseudouridinesynthaseAOS=Ruegeriapomeroyi(strainATCC700808/DSM15171/DSS-3)OX=246200GN=truAPE=3SV=1

sp|Q8UJ53|TRUB_AGRFCtRNApseud

sp|O67108|TOP4A_AQUAEType2topoisomerasesubunitAOS=Aquifexaeolicus(strainVF5)OX=224324GN=gyrAPE=1SV=2

sp|B7I4X8|TIG_ACIB5TriggerfactorOS=Acinetobacterbaumannii(strainAB0057)OX=480119GN=tigPE=3SV=1

sp|B2S468|TRUA_TREPStRNApseudouridinesynthaseAOS=Treponemapallidumsubsp.pallidum(strainSS14)OX=455434GN=truAPE=3SV=1

sp|O05208|TOP6A_SACSHType2DNAtopoisomerase6subunitAOS=SaccharolobusshibataeOX=2286GN=top6APE=1SV=1

sp|A5GQ11|TIG_SYNR3TriggerfactorOS=Synechococcussp.(strainRCC307)OX=316278GN=tigPE=3SV=1

sp|Q70JN8|TPIS_KLUMATriosephosphateisomeraseOS=KluyveromycesmarxianusOX=4911GN=TPI1PE=3SV=1

sp|P48493|TPIS_LACSATriosephosphateisomerase,cytosolic(Fragment)OS=LactucasativaOX=4236PE=2SV=1

sp|P48499|TPIS_LEIMETriosephosphateisomeraseOS=LeishmaniamexicanaOX=5665PE=1SV=1

sp|P44990|SGBU_HAEINPutativeL-ribulose-5-phosphate3-epimeraseSgbUOS=Haemophilusinfluenzae(strainATCC51907/DSM11121/KW20/Rd)OX=71421GN=sgbUPE=3SV=1

sp|Q8UEY3|TPIS_AGRFCTriosephosphateisomeraseOS=Agrobacteriumfabrum(strainC


sp|Q87WQ1|TPIS_PSESMTriosephosphateisomeraseOS=Pseudomonassyringaepv.tomato(strainATCCBAA-871/DC3000)OX=223283GN=tpiAPE=3SV=1

sp|Q5F5V8|TRUA_NEIG1tRNApseudouridinesynthaseAOS=Neisseriagonorrhoeae(strainATCC700825/FA1090)OX=242231GN=truAPE=3SV=1

sp|A0B9D8|TRPF_METTPN-(5'-phosphoribosyl)anthranilateisomeraseOS=Methanothrixthermoacetophila(strainDSM6194/JCM14653/NBRC101360/PT)OX=349307GN=trpFPE=3SV=1

sp|A9M342|TRPF_NEIM0N-(5'-phosphoribosyl)anthranilateisomeraseOS=NeisseriameningitidisserogroupC(strain053442)OX=374833GN=trpFPE=3SV=1

sp|B1ID11|TRUA_STRPItRNApseudouridinesynthaseAOS=Streptococcuspneumoniae(strainHungary19A-6)OX=487214GN=truAPE=3SV=1

sp|Q9K0C6|TRPF_NEIMBN-(5'-phosphoribosyl)anthranilateisomeraseOS=NeisseriameningitidisserogroupB(strainMC58)OX=122586GN=trpFPE=3SV=1

sp|Q5R875|TMX3_PONABProteindisulfide-isomeraseTMX3OS=PongoabeliiOX=9601GN=TMX3PE=2SV=1

sp|Q1GZC0|SURA_METFKChaperoneSurAOS=Methylobacillusflagellatus(strainKT/ATCC51484/DSM6875)OX=265072GN=surAPE=3SV=1

sp|

sp|Q5V4R5|TOP6B_HALMAType2DNAtopoisomerase6subunitBOS=Haloarculamarismortui(strainATCC43049/DSM3752/JCM8966/VKMB-1809)OX=272569GN=top6BPE=3SV=1

sp|A0KTZ2|TPIS_SHESATriosephosphateisomeraseOS=Shewanellasp.(strainANA-3)OX=94122GN=tpiAPE=3SV=1

sp|Q9HR31|TOP6B_HALSAType2DNAtopoisomerase6subunitBOS=Halobacteriumsalinarum(strainATCC700922/JCM11081/NRC-1)OX=64091GN=top6BPE=3SV=1

sp|C3LNM7|TIG_VIBCMTriggerfactorOS=VibriocholeraeserotypeO1(strainM66-2)OX=579112GN=tigPE=3SV=1

sp|A8FYS4|TPIS_SHESHTriosephosphateisomeraseOS=Shewanellasediminis(strainHAW-EB3)OX=425104GN=tpiAPE=3SV=1

sp|Q487E7|TRUD_COLP3tRNApseudouridinesynthaseDOS=Colwelliapsychrerythraea(strain34H/ATCCBAA-681)OX=167879GN=truDPE=3SV=1

sp|Q8CXQ2|TRUB_MYCPEtRNApseudouridinesynthaseBOS=Mycoplasmapenetrans(strainHF-2)OX=272633GN=truBPE=3SV=1

sp|B9JG42|TRPF_AGRRKN-(5'-phosphoribosyl)anthranilateisomeraseOS=Agrobacteriumradiobacter(strainK84/ATCCBAA-868)OX=311403GN=trpFPE=3SV=1

sp|Q0I4X5|TRUA_HAES1tRNApseudouridinesynthaseAOS=Hae

sp|O25730|6PGL_HELPY6-phosphogluconolactonaseOS=Helicobacterpylori(strainATCC700392/26695)OX=85962GN=pglPE=3SV=1

sp|Q6M0E4|AMPPA_METMPAMPphosphorylaseOS=Methanococcusmaripaludis(strainS2/LL)OX=267377GN=MMP0327PE=3SV=1

sp|Q9LS01|AOC3_ARATHAlleneoxidecyclase3,chloroplasticOS=ArabidopsisthalianaOX=3702GN=AOC3PE=2SV=1

sp|P63537|END4_STAAMProbableendonuclease4OS=Staphylococcusaureus(strainMu50/ATCC700699)OX=158878GN=nfoPE=3SV=1

sp|A4QCJ4|G6PI_CORGBGlucose-6-phosphateisomeraseOS=Corynebacteriumglutamicum(strainR)OX=340322GN=pgiPE=3SV=1

sp|Q08A39|FADB_SHEFNFattyacidoxidationcomplexsubunitalphaOS=Shewanellafrigidimarina(strainNCIMB400)OX=318167GN=fadBPE=3SV=1

sp|A3CYJ4|FADB_SHEB5FattyacidoxidationcomplexsubunitalphaOS=Shewanellabaltica(strainOS155/ATCCBAA-1091)OX=325240GN=fadBPE=3SV=1

sp|B7NVV2|FUCM_ECO7IL-fucosemutarotaseOS=EscherichiacoliO7:K1(strainIAI39/ExPEC)OX=585057GN=fucUPE=3SV=1

sp|Q54SS0|ECH1_DICDIDelta(3,5)-Delta(2,4)-dienoyl-CoAisomerase,mitochondrialOS=Dictyosteliumdiscoid

sp|B1LHM8|GMHA_ECOSMPhosphoheptoseisomeraseOS=Escherichiacoli(strainSMS-3-5/SECEC)OX=439855GN=gmhAPE=3SV=1

sp|Q12QG1|DEOB_SHEDOPhosphopentomutaseOS=Shewanelladenitrificans(strainOS217/ATCCBAA-1090/DSM15013)OX=318161GN=deoBPE=3SV=1

sp|B7K1E7|DAPF_RIPO1DiaminopimelateepimeraseOS=Rippkaeaorientalis(strainPCC8801)OX=41431GN=dapFPE=3SV=1

sp|A1AWJ5|DAPF_RUTMCDiaminopimelateepimeraseOS=Ruthiamagnificasubsp.CalyptogenamagnificaOX=413404GN=dapFPE=3SV=1

sp|P81104|FKB70_PINPS70kDapeptidyl-prolylisomerase(Fragment)OS=PinuspinasterOX=71647PE=1SV=1

sp|Q8EHK2|DEOB_SHEONPhosphopentomutaseOS=Shewanellaoneidensis(strainMR-1)OX=211586GN=deoBPE=3SV=1

sp|C1CSS1|GPMA_STRZT2,3-bisphosphoglycerate-dependentphosphoglyceratemutaseOS=Streptococcuspneumoniae(strainTaiwan19F-14)OX=487213GN=gpmAPE=3SV=1

sp|A4WG08|DAPF_ENT38DiaminopimelateepimeraseOS=Enterobactersp.(strain638)OX=399742GN=dapFPE=3SV=1

sp|B3QSA6|GSA_CHLT3Glutamate-1-semialdehyde2,1-aminomutaseOS=Chloroherpetonthalassium(strainATCC35110/GB-78)O

In [6]:
for sq in iso_seqs:
    print(sq)
    print()

MPDIVNRKVEHVEIAAFENVDGLSSSTFLNDVILVHQGFPGISFSEINTKTKFFRKEISVPVMVTGMTGGRNELGRINKIIAEVAEKFGIPMGVGSQRVAIEKAEARESFAIVRKVAPTIPIIANLGMPQLVKGYGLKEFQDAIQMIEADAIAVHLNPAQEVFQPEGEPEYQIYALEKLRDISKELSVPIIVKESGNGISMETAKLLYSYGIKNFDTSGQGGTNWIAIEMIRDIRRGNWKAESAKNFLDWGVPTAASIMEVRYSVPDSFLVGSGGIRSGLDAAKAIALGADIAGMALPVLKSAIEGKESLEQFFRKIIFELKAAMMLTGSKDVDALKKTSIVILGKLKEWAEYRGINLSIYEKVRKRE

MNIRERKRKHLEACLEGEVAYQKTTTGLEGFRLRYQALAGLALGEVDLTTPFLGKTLKAPFLIGAMTGGEENGERINLALAEAAEALGVGMMLGSGRILLERPEALRSFRVRKVAPKALLIANLGLAQLRRYGRDDLLRLVEALEADALAFHVNPLQEAVQRGDTDFRGLVERLAELLPLPFPVMVKEVGHGLSREAALALRDLPLAAVDVAGAGGTSWARVEEWVRFGEVRHPELCEIGIPTARAILEVREVLPHLPLVASGGVYTGTDGAKALALGADLLAVARPLLRPALEGAERVAAWIGDYLEELRTALFAIGAKNPKEARGRVERV

MSQLLQDYLNWENYILRRVDFPTSYVVEGEVVRIEAMPRLYISGMGGSGVVADLIRDFSLTWNWEVEVIAVKDYFLKARDGLLIAVSYSGNTIETLYTVEYAKRRRIPAVAITTGGRLAQMGVPTVIVPKASAPRAALPQLLTAALHVVAKVYGIDVKIPEGLEPPNEALIHKLVEEFQKRPTIIAAESMRGVAYRVKNEFNENAKIEPSVEILPEAHHNWIEGSERAVVALTSPHIPKEHQERVKATVEIVGGSIYAVEMHPKGVLSFLRDVGIASVKLAEIRGVNPLATPRIDA

MKLKTMTMEWTGDSLILIDQRRLPFEEVYVTCADYRAVALSIKEMVVRGAPAIGATAAFGYVLGAKEILKKSHNYEQVVMQMKNVKETLAKTRPTAVNLFWALERMEKRLIRHGKYEGLVKVLEDEALKIAKEDIEVNKAIGRNGAQLLQDGFTVLTHCNAGALATVDYGTALGVLRAAKEQGKKIKVYADETRPYLQGARLTAWELMKDGFDVTLISDNMAGWVMKQGKINAVIVGADRIAANGDVANKIGTYMVAVLANRHGIPFYVAAPLSTIDMSIKSGKEIPIEERSHEEVLTCGGKRVAPNNVNVYNPAFDVTDHELVTAIITEKGVVYPPYEENIKKLFEEGI

MSLLEQLDKNIAASGGLIVSCQPVPGSPLDKPEIVAAMALAAEQAGAVAVRIEGIDNLRMTRSLVSVPIIGIIKRDLDESPVRITPFLDDVDALAQAGAAIIAVDGTARQRPVAVEALLARIHHHHLLAMADCSSVDDGLACQRLGADIIGTTMSGYTTPDTPEEPDLPLVKALHDAGCRVIAEGRYNSPALAAEAIRYGAWAVTVGSAITRLEHICGWYNDALKKAAS

MNKEQFLEKIKGGIIVSCQALPGEPLYSENGGVMPLMAKAAEQAGAVAIRANSVRDIKEIQKAVSLPIIGIIKKDYLPQKPFITATMKEIDELVATSCEVIALDCTLRERHDGLTINQFIKKIKEKYPTQILMADCSNFKECENAYSAGVDFVGTTLSGYTEESKKQDGPDFELLEKLVEAKIPVIAEGRIHSPEQAQYVQKIGVDGMVIGGAITRPLEIASRFVQAVESVEKL

MSSLEAIRYARGNLELLDQLALPLETKYIDVRDCNACWRCIKDMNVRGAPAIAIAAALALAVELEAKRGTLTTCEAAEAFVRERFDHMYTSRPTAVNLGEAKNRIQALAKRLSESGDVSGMIEGVIEGCEAMHAEDVASCRAIGDKGAAALLRACGAKDGENIKVMTCCNTGSLATAGYGT


MVRAKRKLDHIEYALSTGQSRTHGFHDIDFVHQSLPNSNYDTITCETKIGELSLSSPIFINAMTGGGGEKTLHINEQLAYVAKHHNLAMAVGSQMAALKDESEAASYKVIRKVNPNGIFFANLGSEATIEQAERAVDMIEANALQIHLNVIQELTMPEGDRDFTGVLQRIEKIVLNSKVPIIVKEVGFGMSKETMQQLVNVGVTAIDIGGQGGTNFAAVENERRQRMLSYFNNWGIQTATSIIEATSTNNNLSFIASGGIQTALDVAKAIALGANTTAFAGYFLRILMQDGIEKLVDEIELLHTDLKFIMTALGAKTIEELQSVPLVVKGETYHWLMQRGIDTAHYSRR

MGELAKEILPVNIEDELKQSYLDYAMSVIVGRALPDARDGLKPVHRRVLYAMSELGNDWNKPYKKSARVVGDVIGKYHPHGDTAVYDTIVRMAQPFSLRYMLVDGQGNFGSVDGDNAAAMRYTEVRMAKLAHELLADLEKETVDWVPNYDGTEQIPAVMPTKIPNLLVNGSSGIAVGMATNIPPHNLGEVIDGCLALMDNPDLTVDELMQYIPGPDFPTAGIINGRAGIIEAYRTGRGRIYIRARAVVEEMEKGGGREQIIITELPYQLNKARLIEKIAELVKEKKIEGISELRDESDKDGMRVVIELRRGEVGEVVLNNLYAQTQLQSVFGINVVALVDGQPRTLNLKDMLEVFVRHRREVVTRRTVYELRKARERGHILEGQAVALSNIDPVIELIKSSPTPAEAKERLIATAWESSAVEAMVERAGADACRPEDLDPQYGLRDGKYYLSPEQAQAILELRLHRLTGLEHEKLLSEYQEILNLIGELIRILTNPARLMEVIREELEAVKAEFGDARRTEIVASQVDLTIADLITEEDRVVTISHGGYAKSQPLAAYQAQRRGGKGKSATGMKDEDYIEHLLVANSHATLLLFSSKGKVYWLRTFEIPEASRTARGRPLVNLLPLDEGERITAMLQIDLEALQQNGG


MGTTSSQSKTLYQKLYDAHIVHEAPNETPLLYIDRHLVHEVTSPQAFDGLRAMGRPVRQPGKTFATMDHNVSTQTKDINASGEMARIQMQELIKNCAEFGVSLYDLNHPFQGIVHVIGPEQGMTLPGMTIVCGDSHTATHGAFGSLAFGIGTSEVEHVLATQTLKQGRAKTMRIEVNGTVGAGITAKDIVLAIIGKTGSAGGTGHVVEFCGSAIEALSMEGRMTLCNMAIEMGAKAGLVAPDDTTFAYLKGRQFAPTGEQWEQGVAYWRTLKSDADAQFDTIVTLDAADIAPQVTWGTNPGQVIAVNQIIPAPESFSDPVERASAEKALAYMDLRPGIKLTEVAIDKVFIGSCTNSRIEDLRAAAAIAQGRKVAKGVQAIVVPGSGPVKAQAEAEGLDKIFIAAGFEWRLPGCSMCLAMNNDRLEPGERCASTSNRNFEGRQGRGGRTHLVSPAMAAAAAVSGHFADVRELSAATH

MPEINTSHLDEKQVQLLAEMCILIDENDNKIGADTKKNCHLNENIDKGLLHRAFSVFLFNTENKLLLQQRSDAKITFPGCFTNSCCSHPLSNPGELEENNAIGVKRAAKRRLKAELGIPLEEVDLNEMDYLTRIYYKAQSDGIWGEHEVDYILFLRKNVTLNPDPNEIKSYCYVSKEEVREILKKAASGEIKLTPWFKIIADTFLFKWWDNLNHLSPFVDHEKIHRL

MQAYNTQTGIVCPLDRSNVDTDQIIPKQFLKSIKRTGFGVNLFDDWRYLDEGFPGQDNSKRPINPDFVLNKPRYQGATILLARDNFGCGSSREHAPWALSEYGFRTVIAPSFADIFYNNCFKNGMLPIVLTEAQVDDLFEQCLANEGYELTADLERQVVVTPDGTEYPFEVDEFRKHCLLNGLDDIGLTLQQSEAIKAYEAKMQQNTPWIFKEVRA

MSSRKLTLGVSLKMYFGYQQTLDWCQKIHEIAEQHPLASLPSARLFVLPAFPTLAPVVQRFAQSPVHVGAQDLH


MSRNQQLFEQSQKFIPGGVNSPVRAFKSVGGTPVFFRKGEGAYAWDADDKSYIDYVGSWGPLILGHAHPEVVDAVYAAAKNGLTFGAPTEAELEIAELLCRLVPSIEQVRLVSSGTEATMSAIRLARGYTARNRIIKFEGCYHGHDDALLVKAGSGALTFGHPSSAGVPVETASSTVVLDYNDLAGVEQAFNQFGAEIAAVIVEPVAGNMNLIAPQPGFLAGLRELCTRHGSVLIFDEVMTGFRVGLECAQGLYGIKPDLTTLGKVIGGGMPMAAFGGRREIMQCLAPVGAVYQAGTLSGNPVAVAAGLATLKLVQVPGFYDKLAARTRKLTEGLAAAAAKQGVVFCAEAVGGMFGLYFRESAPKSYAEVMSCDREAFNGFFHAMLEEGIYFAPSAFEAGFVSAAHGDAEISKTLATAEKIFARE

MSKYKRIFTIVIDSLGIGAMNDSEKYGDVNVDTLGHIAESVDTFNIPNLQKMGIANLHPIKHVAPVENPIGYQAKMAEASVGKDTMTGHWEMMGLHITKPFKTFTDTGFPQELLDELTARTGHKIVGNKSASGTEILDELGEHQIATGDMIVYTSADSVLQICGQEETFGLEELYRCCEIARELTLKDEWKVGRIIARPYLGTKKGEFKRTSNRHDYALKPYGRTVLNELKDNNFDVISVGKIKDIFDGEGITEGNKSKSSVHGMEQTLEIMDRDFTGFCFINLVDFDALWGHRRNPQGYAEELEKFDVNLGKVLEKLHEDDLLIITADHGNDPTYTGTDHTREYVPFLAYSPSMKGHGQLETPKTFATIGATIADNFGLKMPEGTIGESVLNKLV

MQKSEALFTRAQKTIPGGVNSPVRAFKAVGGTPRFITKADGAYMWDADGKQYIDYIQSWGPMILGHNNAPIREAVIEASCSGLSFGAPTEAEVIMAELVSEMVPSMEMVRMVNSGTEATMSAIRLARGYTSRNKIVKFEGCYHGHADSLLVKAGSGALTLGVPSSPGVPANVAE

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [7]:
print(len(iso_seqs))

18716


In [8]:
iso_seqs2 = list(set(iso_seqs))
print(len(iso_seqs2))

15534


In [9]:
from Bio import SeqIO
records = list(SeqIO.parse("uniprot-isomerase.fasta", "fasta"))


In [10]:
seq_ids =[]
seqs = []
#for record in SeqIO.parse("isomerase_2000.fasta", "fasta"):
for record in SeqIO.parse("uniprot-isomerase.fasta", "fasta"):
    print(repr(record.seq))
    print()
    seq_ids.append(record.id)
    seqs.append(str(record.seq))

Seq('MPDIVNRKVEHVEIAAFENVDGLSSSTFLNDVILVHQGFPGISFSEINTKTKFF...KRE', SingleLetterAlphabet())

Seq('MNIRERKRKHLEACLEGEVAYQKTTTGLEGFRLRYQALAGLALGEVDLTTPFLG...ERV', SingleLetterAlphabet())

Seq('MSQLLQDYLNWENYILRRVDFPTSYVVEGEVVRIEAMPRLYISGMGGSGVVADL...RLQ', SingleLetterAlphabet())

Seq('MALVASVRVPARVLLRAGARLPGAALGRTERAAGGGDGARRFGSQRVLVEPDAG...EKG', SingleLetterAlphabet())

Seq('MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALNQVKLLKKD...PKL', SingleLetterAlphabet())

Seq('MAMAYLAWRLARRSCPSSLQVTSFPVVQLHMNRTAMRASQKDFENSMNQVKLLK...SKL', SingleLetterAlphabet())

Seq('MAAVTWSRARCWCPSLLQVLRLPVTKLHLGRPAMRATQQDFENAMNQVKLLKKD...PKL', SingleLetterAlphabet())

Seq('MQTEHVILLNAQGVPTGTLEKYAAHTADTRLHLAFSSWLFNAKGQLLVTRRALS...QLK', SingleLetterAlphabet())

Seq('MTTNAGPLHPYWPQHLRLDNFVPNDRPTWHILAGLFSVTGVLVVTTWLLSGRAA...KKN', SingleLetterAlphabet())

Seq('MAKSQIWFGFALLALLLVSAVADDVVVLTDDSFEKEVGKDKGALVEFYAPWCGH...ASS', SingleLetterAlphabet())

Seq('MSQEIRQNEKISYRIEGPFFIIHLMNPDNLNALEGEDYIYLGELLELADRNRDV...HRL', Si

Seq('MRVLVTGGSGYIGSHTCVQLLQNGHDVVILDNLCNSKRSVLPVIERLGGKHPTF...YSD', SingleLetterAlphabet())

Seq('MYDYIIVGSGLFGAVCANELKKLNKKVLVIEKRNHIGGNAYTEDCEGIQIHKYG...STD', SingleLetterAlphabet())

Seq('MAEKVLVTGGAGYIGSHTVLELLEAGYLPVVIDNFHNAFRGGGSLPESLRRVQE...TQA', SingleLetterAlphabet())

Seq('MLQVYLVRHGETQWNAERRIQGQSDSPLTAKGEQQAMQVATRAKELGITHIISS...LQR', SingleLetterAlphabet())

Seq('MVKLVFARHGESEWNKANLFTGWADVDLSEKGTQQAIDAGKLIKEAGIEFDLAF...GGE', SingleLetterAlphabet())

Seq('MPRMFGTDGVRGLANRDLTAQLALDLGDAAVRVLGDDERSEFESRRRALVGRDT...LTL', SingleLetterAlphabet())

Seq('MSNRKYFGTDGIRGKVGDTPITPDFVLKLGWAAGKVLARHGSRKIIIGKDTRIS...AVS', SingleLetterAlphabet())

Seq('MSKSPVAIIILDGFGKRAETVGNAVAQANKPNFDRYWANFPHGELKAAGLDVGL...IQK', SingleLetterAlphabet())

Seq('MGRLFGTDGVRGVANADLTAELALGLSVAAAHVLAEAGTFAGHRPTAVVGRDPR...ALG', SingleLetterAlphabet())

Seq('MSKSPVAIIILDGFGKRAETVGNAVAQANKPNFDRYWADFPHGELKAAGLDVGL...IQK', SingleLetterAlphabet())

Seq('MSKSENLYSAARELIPGGVNSPVRAFTGVGGTPLFIEKADGAYLYDVDGKAYID...AKL', Si

Seq('MQRSPLEKASVVSKLFFSWTRPILRKGYRQRLELSDIYQIPSADSADNLSEKLE...TRL', SingleLetterAlphabet())

Seq('MQRSPLEKASVVSKLFFSWTRPVLKKGYRQRLELSDIYQIPSADSADNLSEKLE...TRL', SingleLetterAlphabet())

Seq('MTGRGIPGAVAVHHVAYTVPDLDQAVEFFTEVIGAELAYTLVQDAAGDWMTRKL...GAS', SingleLetterAlphabet())

Seq('MQAATVVINRRALRHNLQRLRELAPASKMVAVVKANAYGHGLLETARTLPDADA...YVD', SingleLetterAlphabet())

Seq('MAYVKLSEKALKHNLETVSKKAGGSDKIAAVLKDNAYGHGIEGFAKKISRLGIK...WID', SingleLetterAlphabet())

Seq('MEAASTWALLLALLLLLLLLSLTLFRTPARGYLPPGPTPLPLLGNLLQLRPGAL...QSR', SingleLetterAlphabet())

Seq('MATIIINKENFYHNLNQIALKTGSVEKIAIVLKDNAYGHGLQLMASLSAEFGIR...VDR', SingleLetterAlphabet())

Seq('MRPATALIDLDALRHNYRLARSRHGGRALAVVKANAYGHGAVRCAQALAAEADG...YRG', SingleLetterAlphabet())

Seq('MEHFYRDTWVEVDLDAIEQNVTNALRLYKDREMNLMAVVKANGYGHGAVEVSQA...NLL', SingleLetterAlphabet())

Seq('MRATKAIIHLDNLQYNIKEIKKRLNKNVKICLPVKADAYGHGAVRVAVAAIRAG...TIQ', SingleLetterAlphabet())

Seq('MPRPILATVHTAALRHNLDRARRAAVDARVWAVVKANAYGHGIERVYEGLRGAD...DGE', Si

Seq('MASERLADGDSRYYLLKVAHEQFGCAPGELSEDQLQQADRIIGRQRHIEDAVLR...AHG', SingleLetterAlphabet())

Seq('MNPWQRFARQRLARSRWNRDPAALDPADTPAFEQAWQRQCHMEQTIVARVPEGD...LCG', SingleLetterAlphabet())

Seq('MSNQEAIGLIDSGVGGLTVLKEALKQLPNERLIYLGDTARCPYGPRPAEQVVQF...END', SingleLetterAlphabet())

Seq('MPMKPLDGETTSTEKLSPVLRPTVLIFDSGVGGLSVYNEIRTLLPDLHYLYAFD...LAL', SingleLetterAlphabet())

Seq('MNPWQRFARQRLARSRWNRDPAALDPADTPAFEQAWQRQCHMEQTIVARVPEGD...LCG', SingleLetterAlphabet())

Seq('MSLEAIKYKKGRLEILNQLLLPHESTYEVVSDTEDGWKAIREMKVRGAPAIAIV...LKG', SingleLetterAlphabet())

Seq('MADSNSKHQDSLQSICYKRGSLQLLDQRKLPLETVYLEIKGADDGWHAIRDMVV...CCK', SingleLetterAlphabet())

Seq('MSLLAQLDQKIAANGGLIVSCQPVPDSPLDKPEIVAAMALAAEQAGAVAIRIEG...AVL', SingleLetterAlphabet())

Seq('MTSVRKRKMNRSSVKKVSRRNKDKQKKKLIACNPIIAKNWDYSLTLAQNYEKLG...NIA', SingleLetterAlphabet())

Seq('MSGAPIGVFDSGVGGLTVARAILDQLPGESLLYIGDTAHAPYGPRPIAEVRRYA...VGR', SingleLetterAlphabet())

Seq('MAEKTQRIGIFDSGLGGTTVLKELINSLPNEDYIYYGDNGNFPYGSGKTKNELQ...IPK', Si

Seq('MNTLFDKIWDAHVVTTVEDGPTQLYIDRLYCHEVTSPQAFAGLRARGIKVFRPE...ELM', SingleLetterAlphabet())

Seq('MVRAKRKLDHIEYALSTGQSRTHGFHDIDFVHQSLPNSSYETITCETKIGELSL...SRR', SingleLetterAlphabet())

Seq('MWRLKIAEGGNNPYIYSTNNFVGRQTWEFDPEAGTPEERAQVEEARENFWRDRF...KND', SingleLetterAlphabet())

Seq('MVRAKRKLDHIEYALSTGQSRTHGFHDIDFVHQSLPNSSYETITCETKIGELSL...SRR', SingleLetterAlphabet())

Seq('MSGPRTLFEKIWSTHVVCVPDDQPPILYIDRHYVHEVTSPQAFDGLRAAGRKVR...EYK', SingleLetterAlphabet())

Seq('MARVLKAAAANAVGLFSRLQAPIPTVRASSTSQPLDQVTGSVWNLGRLNHVAIA...EQA', SingleLetterAlphabet())

Seq('MSEHQSLPAPEASTEVRVAIVGVGNCASSLVQGVEYYYNADDTSTVPGLMHVRF...IIG', SingleLetterAlphabet())

Seq('MFIENFKVESPNVKYTESEIHSVYDYQTTELVHDEKNGTYQWTVKPKTVKYEFK...EYK', SingleLetterAlphabet())

Seq('MSGKTIFDKLWDQHVIAGNEGEPQLLYIDLHVIHEVTSPQAFQGLREAGRRVRR...VTD', SingleLetterAlphabet())

Seq('MAEKFIKHTGLVVPLDAANVDTDAIIPKQFLQKVTRTGFGAHLFNDWRFLDEKG...FMN', SingleLetterAlphabet())

Seq('MSAPRTLYDKIWDDHLVDEQADGTCLLYIDRHLVHEVTSPQAFEGLRMTGRKVR...EWN', Si

Seq('MAQAPVSPVVLVILDGWGYRQSTEANAIALAKTPVMDCLWTSYPRTLIHTSGKD...ISR', SingleLetterAlphabet())

Seq('MLQVYLVRHGETLWNAARRIQGQSDSPLTEIGIRQAHLVAQRVRNQGITHIISS...LQR', SingleLetterAlphabet())

Seq('MVFKNSIDLYKKAVELIPGGVNSPVRAFKSVNREAPIFIKKGQGAKIWDEDDNE...SGK', SingleLetterAlphabet())

Seq('MKNINPTQTSAWQALQKHYDEMKDVTIAELFANDSDRFAKFSATFDDLMLVDFS...WRA', SingleLetterAlphabet())

Seq('MSARKYFGTDGIRGRVGQGVISADFVLRLGNALGRVLTQIRGKRPLVLIGKDTR...DAA', SingleLetterAlphabet())

Seq('MSSRKYFGTDGIRGRVGQGVISADFVLRLGNALGRVLTAGRSKRPLVLIGKDTR...DAA', SingleLetterAlphabet())

Seq('MPTNLHAWNALQQHHDAIRDHQMADWFAENGADRVRQFSLEAAGLYLDYSKNRI...RNG', SingleLetterAlphabet())

Seq('MATKSNEENIAEFKGHNEIQIELMKEECIVVDNDDKPIRPGSKKETHLMVNINN...HRY', SingleLetterAlphabet())

Seq('MFKENTIKLGIAPIAWTNDDMPELGAENTFEQCISEMALAGFNGSEVGNKYPRN...AGL', SingleLetterAlphabet())

Seq('MAEKVLVTGGAGYIGSHTVLELLEAGYLPVVIDNFHNAFRGGGSLPESLRRVQE...TQA', SingleLetterAlphabet())

Seq('MPLTEVVLVDENDKPTGVMEKQEAHVKGELHRAITVYIFNSRQQLLLQQRAEEK...QSR', Si

Seq('MTKQHAYTREDLLRCARGELFGPGNAQLPAPNMLMVDRITHISDVGGKYGKGEM...DSF', SingleLetterAlphabet())

Seq('MTRQESYNFDELLLSGHGRLFGPGNPQLPAPQMLMFHRITQMSEEGGAYDKGQV...SQG', SingleLetterAlphabet())

Seq('MSNHKEFSEALKVIPGGVDSPVRAFKNVGSEPFMVQKGKGAYIYDIEGNKYLDF...NKK', SingleLetterAlphabet())

Seq('MYKLVLIRHGESTWNLDNRFTGWTDVDLTETGIAQAKNSGQLLKAEGYDFDLAY...VAA', SingleLetterAlphabet())

Seq('MQSIFGTDGIRGRFNEEITYSLAYKVGYALGSNLENNNPILIGRDTRISGDILL...IMN', SingleLetterAlphabet())

Seq('MTFLQYFKTDGIRGKVGVNPITPDFLLKLGWSIGIVLGKNKTQKIIIGRDTRIS...KLL', SingleLetterAlphabet())

Seq('MLQVYLVRHGETQWNAERRIQGQSDSPLTAKGEQQAMQVGERARSLGITHIISS...LQR', SingleLetterAlphabet())

Seq('MGRRYFGTDGIRGTVGDAPITPDFVLRLGYAAGKVLAGSADVAAGSRPTVLIGK...ATT', SingleLetterAlphabet())

Seq('MKLSFTKMHGAGNDFVVLDGYTRALPPLTGAQVRALADRHFGIGADQLLLVEKP...LPA', SingleLetterAlphabet())

Seq('MGMVVPSAAPASEELFRRAERVVPGGVNSPVRAFRAVGGTPRFMVSGNGPYLTD...GRS', SingleLetterAlphabet())

Seq('MTTAKRPLALLILDGWGYRENPHNNAIFHARTPVLDKLNAQFPNSLISGSGLDV...LKE', Si

Seq('MTTSYLTFLAVAVGPPLVALGVVRAARWDGDRARAAGVGILLALALSYTTPWDN...RWR', SingleLetterAlphabet())

Seq('MCTLSFMYPNSLLDGTCKTVALGDSKPRYNKQRSSCFDPLIIGNCTDQQQLCGL...TLA', SingleLetterAlphabet())

Seq('MQRSPLEKASIFSKLFFSWTRPILRKGYRQRLELSDIYHISSSDSADNLSEKLE...TRL', SingleLetterAlphabet())

Seq('MQRSPLEKASVVSKLFFSWTRPILKKGYRQRLELSDIYQIPSADSADNLSEKLE...TRL', SingleLetterAlphabet())

Seq('MPRPISATIHTAALANNLSVVRRHAAQSKVWAIVKANAYGHGLARVFPGLRGTD...RAE', SingleLetterAlphabet())

Seq('MPRPISATIHTAALANNLSVVRRHAAQSKVWAIVKANAYGHGLARVFPGLRGTD...RAE', SingleLetterAlphabet())

Seq('MRVVKVKKTVVIIGGGAAGMSAASRVKRLKPEWDVKVFEATEWVSHAPCGIPYV...LKF', SingleLetterAlphabet())

Seq('MLRLPAVLRQIRPVSRALAPHLTRAYAKDVKFGADARALMLQGVDLLADAVAVT...GMF', SingleLetterAlphabet())

Seq('MLNRLLIFITLALAVRALDCSSKELQQYNLESVKGTYSISNIKSTPPSKTNITW...SAV', SingleLetterAlphabet())

Seq('MDEIQIASILKSSELFPIPQGVKLSYGTAGFRGDAKLLESTVYRVGILSALRSL...GSS', SingleLetterAlphabet())

Seq('MVSKTWICGFISIITVVQALSCEKHDVLKKYQVGKFSSLTSTERDTPPSTTIEK...SAV', Si

Seq('MGRRYFGTDGIRGTVGEGPITPDFVLRLGYAAGKVLASSAEVAAGSRPTVLIGK...ATT', SingleLetterAlphabet())

Seq('MELKNKKWTDEEFHKQREEVLQQWPTGKEVDLQEAVDYLKKIPAEKNFAEKLVL...RPE', SingleLetterAlphabet())

Seq('MLQVYLVRHGETQWNAERRIQGQSDSPLTAKGEQQAMQVGERARSLSITHIISS...LQR', SingleLetterAlphabet())

Seq('MNDTKDNKVYSSARHSKYWQKLKAAAESKWSLAALFAQDNTRTQRFSAQSGALY...QLS', SingleLetterAlphabet())

Seq('MGRMFGTDGVRGVANTELTARIAYDLGRAGAYVLTEGAHKPKILVAKDTRISGD...KYN', SingleLetterAlphabet())

Seq('MKVNFTKMQGSGNDFVVIDATKTPFQLTTSQIQKMANRRFGVGFDQLLVIEPPK...WLD', SingleLetterAlphabet())

Seq('MVSSLSRNGNSHHNGVNVAITSGFSTKLAPLPNGPLFGTDGIRGKAGDLLTAPF...LAN', SingleLetterAlphabet())

Seq('MGRMFGTDGVRGVANKELTADLAYKLGKAGAFILTEGTHRPKILVGMDTRISGD...KVK', SingleLetterAlphabet())

Seq('MAELAQFAKMNGLGNAIIVADMRGRADRVRPEAAQRLASDPATHFDQIMAIHDP...GAA', SingleLetterAlphabet())

Seq('MSARKYFGTDGIRGRVGQGVISADFVLRLGNALGRVLTQGRSKRPLVLIGKDTR...DAA', SingleLetterAlphabet())

Seq('MSVSKKPMVLVILDGYGYREEQQDNAILNAKTPVMDALWAKRPHTLIDASGLEV...IVE', Si

Seq('MRQAWRWFGPEAGVPLDAVRQAGATDIVSALHEVPIGQEWTSAQIVERKNLIES...PNV', SingleLetterAlphabet())

Seq('MSFRWYGESDPVSLQYIRQIPGVTHIVSAIYDEPVGEVWPAHKIDALKATIERA...ATQ', SingleLetterAlphabet())

Seq('MNLSFRWYGADDAVKLQYIRQIPSIKSIVTAIYDVPVGEKWSIEAILKLKNEVE...EDK', SingleLetterAlphabet())

Seq('MSLTTLPGVCGIGLRAPHYREALDARPELGWVEVHSENFFDGGTPLAMLRRVAE...ERS', SingleLetterAlphabet())

Seq('MASQLRLHLAATPPLLPHRRPHLARPLCPTLNPIRAPLPPLSRVLSHARPARAV...LPW', SingleLetterAlphabet())

Seq('MAERIQKLLSQWGIASRRHAEEMILAGRVSVNGKVANLGDKADPQQDFLSVDGK...NLR', SingleLetterAlphabet())

Seq('MRLKCTISYDGHLFNGYQVQPGKRTVQDELEKALAVLHKSKDRIPVVSSGRTDS...YDN', SingleLetterAlphabet())

Seq('MIEFDNLTYLHGKPQGTGLLKANPEDFVVVEDLGFEPDGEGEHILVRILKNGCN...IAE', SingleLetterAlphabet())

Seq('MARRGKKKGRPISGWVIFDKPKGMGSTEAVSKIKWLFSAEKAGHAGTLDPLASG...TAG', SingleLetterAlphabet())

Seq('MRIALGIQYDGAAFCGWQSQPHGKTVQDALERSLAEFAQTSLHTTVAGRTDTGV...GRT', SingleLetterAlphabet())

Seq('MRKRLKTRIKICGMCSPEDMEMAALYGADAVGFITEVPIESPRKLDSDTAASLI...AFL', Si

Seq('MSDQQQPPVYKIALGIEYDGSKYYGWQRQNEVRSVQEKLEKALSQVANEPITVF...LAD', SingleLetterAlphabet())

Seq('MEGVVLLHKPKGMTSHDCVFKLRKILREKRIGHTGTLDPDVTGVLPICVGRATK...LKL', SingleLetterAlphabet())

Seq('MIVKICGLKKAVDVAAAVDNGADMIGFVFAKSKRQVTVEKAHELAKNIPANVKK...GVE', SingleLetterAlphabet())

Seq('MRKAIIAGNWKMNKTVDEAVKVVEELKPLVKDATCDVVVCPTFVCLDAVKKATA...VNY', SingleLetterAlphabet())

Seq('MEGVVLLHKPKGMTSHDCVFKLRKILREKRIGHTGTLDPDVTGVLPICVGRATK...LKL', SingleLetterAlphabet())

Seq('MRRPMVAGNWKMHGTRASVAELINGLRHLALPSGVDVAVFPPCLYINQVIDGLK...AGN', SingleLetterAlphabet())

Seq('MDDSSSLASPVQSILAAARRRDPPTRRCTVSARSLPAALHSAAADGRTPIIAEV...VTA', SingleLetterAlphabet())

Seq('MNHSRSHALFSQAQNLMPGGVNSPVRAFKSVGGEPFFVARADGAYLFDVDGNRY...VMR', SingleLetterAlphabet())

Seq('MSTHRLVMVRHGESTWNQENRFCGWFDAELSEKGAEEAKKAAQAIKDAKMEFDI...KAK', SingleLetterAlphabet())

Seq('MNKPIGVIDSGVGGLTVAKEIMRQLPNETIYYLGDIARCPYGPRPGEEVKKFTT...DDS', SingleLetterAlphabet())

Seq('MWGDKMRYFPIGVFDSGVGGLTVLKRLMEVLPQENYVYFGDTRRVPYGDRSKEE...TKY', Si

Seq('MTPFMTEDFLLDTEFARRLYHDYAKDQPIFDYHCHLPPQQIAEDYRFKNLYDIW...ELN', SingleLetterAlphabet())

Seq('MARPLSFHEDRLFPSDPATRSYARGLYALVKDLPIISPHGHTDPSWFATNAPFQ...AAE', SingleLetterAlphabet())

Seq('MKMTFRWYGEGNDSITLKQIKQIPGCSGLMGVIDQYAAGEVWEESVIAEYVEHV...SSK', SingleLetterAlphabet())

Seq('MEHTWRWFGPNDETTLTDIRQTGATGVVTALHEIPNGEVWPVEAIKARKAMIEA...STL', SingleLetterAlphabet())

Seq('MLSKQIPLGIYEKALPAGECWLERLRLAKTLGFDFVEMSVDETDERLSRLDWSR...EAA', SingleLetterAlphabet())

Seq('MLSKQIPLGIYEKALPAGECWLERLQLAKMLGFDFVEMSVDETDDRLSRLDWSR...EAA', SingleLetterAlphabet())

Seq('MVKTNVTIMRGGTSKGVFFHESAMPNNKNEWEPFLLDVMGSPDKRQIDGLGGGN...QMM', SingleLetterAlphabet())

Seq('MTDILNKILATKAQEVAAQKAAVNAEHIRTLAAEAAPVRSFIDSIRGKHRLNLP...KLF', SingleLetterAlphabet())

Seq('MARDEHDSRYLTAELPGTGGLFKETPEDFLVEEISLYLPCGEGEHTYAVIEKRG...NGA', SingleLetterAlphabet())

Seq('MPVIAADKPLHLTSHDVVNRARRALGTKRVGHTGTLDPLATGVVVLCVDDSTKL...AWA', SingleLetterAlphabet())

Seq('MQREDVLGEALKLLELQGIANTTLEMVAERVDYPLDELRRFWPDKEAILYDALR...ALT', Si

Seq('MYRYKISVEYLGTNLAGWQRQSGVMSVQQILEEAIYKFSGEQVTLFGSGRTDAG...ADY', SingleLetterAlphabet())

Seq('MAFAPMGPEASFFDVLDRHRESLLAALRRGGREPPTGGSRLASSSEVLASIENI...GWI', SingleLetterAlphabet())

Seq('MSDQQQPPVYKIALGIEYDGSKYYGWQRQNEVRSVQEKLEKALSQVANEPITVF...LAD', SingleLetterAlphabet())

Seq('MRKPIIAGNWKMNKTAAKAGQFAEDVKNNVPSSDAVESVVAAPALFLQELVRLT...AVK', SingleLetterAlphabet())

Seq('MGRPRRRGRDINGVLLLDKPLGLSSNDVLQKVKRLFSANRAGHTGALDPLATGM...NKK', SingleLetterAlphabet())

Seq('MSGRKFFVGGNWKMNGDKKSIEELANTLNSAKLNPDTEVVCGAPTIYLDYARSK...AKA', SingleLetterAlphabet())

Seq('MILLEKTQEKKINDKEELIVKEEVETNWDYGCNPYERKIEDLIKYGVVVVDKPR...RKK', SingleLetterAlphabet())

Seq('METFEISDFKEHAKKKSMWAGALNKVTISGLMGVFTEDEDLMALPIHRDHCPAL...KFH', SingleLetterAlphabet())

Seq('MRYKAVVEYEGTSFIGWQRQHGVVGKSVQESIERSIKEFCQQSVIVYAAGRTDA...VDY', SingleLetterAlphabet())

Seq('MPSVLENILKDKLLEVAVLKKNHALPINIAPSDRDFKKALLEKKTSFILEYKKA...REY', SingleLetterAlphabet())

Seq('MKNISTKESFFKFKFNNRSLYCWGAFMKLRMKPEDFIVEEIIDFNKIAGDRCYL...RKR', Si

Seq('SITELCKTLSAGPLDPNTEVVVGCPAPYLSLARSLLPETIGVAAQNCYKVAKGA...KPD', SingleLetterAlphabet())

Seq('MRRLIAANWKMNKTVKETEEYINTFLKFVEHPESREILICPPFTSLYVAGKMLQ...YSF', SingleLetterAlphabet())

Seq('MAPSRKFFVGGNWKMNGRKQNLGELIGTLNAAKVPADTEVVCAPPTAYIDFARQ...AKQ', SingleLetterAlphabet())

Seq('MSKTIIIGNWKMNKTFTQTKEFFSAFNQLYIENKNKINQNLDFAVALPAINVAA...LNK', SingleLetterAlphabet())

Seq('MPKKFIVANWKMHKTVREALAFLDEFIPITKGLNGREIGIAPTFICIESVGKVL...VKY', SingleLetterAlphabet())

Seq('MADPKTKGRGSGGNGSGRRLVIVESPTKARKLASYLGSGYIVESSRGHIRDLPR...KRD', SingleLetterAlphabet())

Seq('MSGDVAAENSIHIQNGGSCEVVQSNGVTTNGHGHHHHHHSSSSSSSKHKSSSKD...YRF', SingleLetterAlphabet())

Seq('MKRVALLVQYDGSHYSGWQKQKNATTVQEILDRALLKITNHTVQTFAAGRTDAG...GFS', SingleLetterAlphabet())

Seq('MAVSVKICGLTEAAGLAAAVDAGARYVGFVFFPKSPRHVTPGTAAELAAQVPLG...PIL', SingleLetterAlphabet())

Seq('MRVKICGITQPQQSIAIASLGATALGFICVPNSPRYVTTSQIRAAVAELPADID...SRE', SingleLetterAlphabet())

Seq('MARDEVRRILPADIKREVVVKDEKAETNPKWGFPPEKRPIEMHIQFGIINLDKP...EKE', Si

Seq('MKSKKQTKLLLILDGWGYSKTTKNNAIALANTSVWDRLNQTFPHSLIHTSGKDV...TFK', SingleLetterAlphabet())

Seq('MLKTINPTQTQAWNALTAHFESAQDMDLKDLFAQDAARFDKFSARFGSDILVDY...FKA', SingleLetterAlphabet())

Seq('MLKTINPTQTQAWNALTAHFESAQDMDLKDLFAQDAARFDKFSARFGSDILVDY...FKA', SingleLetterAlphabet())

Seq('MTQLVLIRHGESTWNLENRFTGWTDVELTPTGVAQAQQAGRLLKQAGIDFDTVY...ADG', SingleLetterAlphabet())

Seq('MARQYFGTDGIRGRVNTSPMTAETALRLSIAAARTFAPEGGREVVIGRDTRRSG...GNS', SingleLetterAlphabet())

Seq('MSYFGTDGIRGLFGKFPITPDFVLKLGYVTGQVLVEQNPNPKKKPSVVIGKDTR...VMA', SingleLetterAlphabet())

Seq('MSKKALLIICDGWGIGDKGKDDVIFNTPTPYWDELLKTYPASQLQASGENVGLP...LIK', SingleLetterAlphabet())

Seq('MSYFGTDGIRGKFGEMPITPEFALKLGFAAGKVLKRTSPINKPIVVLGKDTRLS...HAA', SingleLetterAlphabet())

Seq('MASPALISETEAWKDLKSHLEGIKKTHLRELMGDTERCQSMMVEFDNIFLDYSR...PNI', SingleLetterAlphabet())

Seq('MAEAKNRVCLVVIDGWGISNETKGNAILNAKTPVMDELCVMNSHPIQAHGLHVG...NEA', SingleLetterAlphabet())

Seq('MRALVTGAAGFIGSTLVDRLLADGHSVVGLDNFATGRATNLEHLADNSAHVFVE...HTD', Si

Seq('MIEFDNLTYLHGKPQGTGLLKANPEDFVVVEDLGFEPDGEGEHILVRILKNGCN...IAE', SingleLetterAlphabet())

Seq('MFIKICGIKTPNELKIIEKYGDFTGVILECVSKRKIGVESAKNLIEISNIPVFA...VKL', SingleLetterAlphabet())

Seq('MPRYKLTVEYDGTPYVGWQRQENGHAVQNAIELAFKKFCGEDLTLSAAGRTDAG...FPY', SingleLetterAlphabet())

Seq('MELDEVIITRAIFDEYSKTFLDYTDIDVALVGGGPANLVAAKYLAEAGVKVALY...KTI', SingleLetterAlphabet())

Seq('MKKFYIAANWKMNMNRAEAKQLATEMKAGLKDGKNKYMIAPSFTLLQDVASVLK...FSE', SingleLetterAlphabet())

Seq('MKHLIIVESPAKAKTIKNFLDKNYEVVASKGHVRDLSKFALGIKIDETGFTPNY...DNG', SingleLetterAlphabet())

Seq('MRRPVVMGNWKLNGSKAMVTDLLNGLNAELEGVEGVDVVVAPPAMYLDLAERLI...KKA', SingleLetterAlphabet())

Seq('MASKPQPIAAANWKCNGSESLLVPLIETLNAATFDHDVQCVVAPTFLHIPMTKA...ATK', SingleLetterAlphabet())

Seq('MSETTQAAAIQAAAVAENERAPLKIALGIEYDGSQYYGWQRQIDVASVQACLEK...LAD', SingleLetterAlphabet())

Seq('MRRPVVMGNWKLNGSKTMVAELLTGLNAELEGVEGVDVAVAPPALYIDLAERLI...MKA', SingleLetterAlphabet())

Seq('MTAEPSSAPPEQPILRRIALSLQYEGSDFCGWQRQNNARSVQAVLETAIAQLDP...ELQ', Si

Seq('MSTGFFGDIQKIKYEGPDSTNPLAFRHYQPDEIVLGKRMEDHLRFAVAYWHTFT...RYV', SingleLetterAlphabet())

Seq('MTDFFAGIPQIRYEGEGSSNEFAFRHYNPDEVILGKRMEDHLRFAVAWWHSFAW...RFV', SingleLetterAlphabet())

Seq('MQAYFDQLDRVRYEGPQSTNPLAFRHYNPDELVLGKRMEDHLRFAACYWHTFCW...FDK', SingleLetterAlphabet())

Seq('MQAYFDQLDRVRYEGPQSTNPLAFRHYNPDELVLGKRMEDHLRFAACYWHTFCW...FDK', SingleLetterAlphabet())

Seq('MKPQVYHVDAFTSQPFRGNSAGVVFPADNLSEAQMQLIARELGHSETAFLLHSD...IKL', SingleLetterAlphabet())

Seq('MKIVNITTQVESIELKTPFKTALRQTSHVEFVRVEVECDNGFVGIGEASATKVI...KPL', SingleLetterAlphabet())

Seq('MSTGFFGDIAKIKYEGPESTNPLAFRHYNPDEIVLGKRMEDHLRFAVAYWHTFV...RYV', SingleLetterAlphabet())

Seq('MIMRFGYVSHAMALWDCSPAKTMTFTSFKKLSKQEREDKLYHVIRQNLEHTIRI...LQW', SingleLetterAlphabet())

Seq('MNYQPTPEDRFTFGLWTVGWQGRDPFGDATRRALDPVETVQRLAGLGAHGVTFH...ARG', SingleLetterAlphabet())

Seq('MSYQPTPEDRFTFGLWTVGWQGRDPFGDATRRALDPVETVQRLAELGAHGVTFH...ARG', SingleLetterAlphabet())

Seq('MSFQPTPEDKFTFGLWTVGWQGRDPFGDATRPGLDPVETVQRLAELGAYGVTFH...RKR', Si


Seq('MQVGCHVSISGGIDRSVDNAVERGCTAFQIFSRNPRGWRAKEISKDEAGTFKKK...LAG', SingleLetterAlphabet())

Seq('MERAISPGLLVRALLLLLLLLGLAARTVAAGRARGLPAPTAEAAFGLGAAAAPT...PSN', SingleLetterAlphabet())

Seq('MIDEAIRSISKKYTIGGHISVAGGLHNGPARAAVFGFPTFQFFSKNQMRWSSPP...IGE', SingleLetterAlphabet())

Seq('MKFVGAHVSASGGVDNAPLNAMAIGAKAFAVFAKNQRQWVAKPLEEKTIEAFKK...VKK', SingleLetterAlphabet())

Seq('MAQLTEHPAHSALVAHHERVRDLHMRDLFAADSGRFDKLSLRLGDILFDYSKNR...LGD', SingleLetterAlphabet())

Seq('MAGPLPPLNQEPAFQKLQEYYDSKAKDLNIKDLFVKDSKRFSKYSLRLHTQNDG...NWK', SingleLetterAlphabet())

Seq('MVILKGIPSVLTPELLYVLAQMGHGDELVLADANFPTSSVCKCGPVEIRADGVR...EQC', SingleLetterAlphabet())

Seq('MEEFKNLKEHYENIGKNINMRKEFESNYGATRFKDFSKEVNISKQVGTILLDYS...SKL', SingleLetterAlphabet())

Seq('MTQINNRSVWFVIGTQHLYGVETLRQVERHGQQVVDSLNRSGILPFRLQIRPLV...NQR', SingleLetterAlphabet())

Seq('MSPPVSVTKMQVENYAFAPTVNPAGSTNTLFLAGAGHRGLEIEGKFVKFTAIGV...IGV', SingleLetterAlphabet())

Seq('MSRITTSLDHYEVWFLTGSQNLYGEETLQQVAEQSQEIARQLEEASDIPVRVVW...QGL', S

Seq('MNILKISKQTLRNNIKIIREYIGNAKMCFPVKANAYGHGIEDIVENTHDLVDFF...KII', SingleLetterAlphabet())

Seq('MDSRPTVVEIDLAALRHNFSLVQKRVPEGCGLLAVVKADAYGHGFQYVSEELEK...YLG', SingleLetterAlphabet())

Seq('MDSRPTVVEIDLAALRHNFSLVQKRVPEGCGLLAVVKADAYGHGFQYVSEELEK...YLG', SingleLetterAlphabet())

Seq('MLRRASFVEVDTSSLRHNFHAAKNAIPKDAHIMAVVKANAYGVGALKASEVFLQ...VYV', SingleLetterAlphabet())

Seq('MADIIARLREDGIQKRVIQEGRGALPDFQDGTKATFHYRTLCSDEEGAVLDDSR...FSH', SingleLetterAlphabet())

Seq('MGACAMNPQALKGSAMLAAAMLLASGAAMADVAPQAKAPTIAKELQQAKTYTIS...LQD', SingleLetterAlphabet())

Seq('MADQDQLKALRLRIDSLDEKLLELISERARCAQDVARVKTQTLGEGEAPVFYRP...AVL', SingleLetterAlphabet())

Seq('MAESINSFDSKSTLQVGEKSYDYFALDAVPGMEKLPYSLKVLGENLLRNEDGKN...SGK', SingleLetterAlphabet())

Seq('MVTGWHRPTWIEIDRAAIRENIKNEQNKLPENVDLWAVVKANAYGHGIIEVART...YIH', SingleLetterAlphabet())

Seq('MVTGWHRPTWIEIDRAAIRENIKNEQNKLPDKVALWAVVKANAYGHGIIETAKI...YIN', SingleLetterAlphabet())

Seq('MLKQVIKTVSSSQAPKKYFFKQFCTSTTEKKGRVGLVTLNRPKSLNALSDGLIS...HNK', Si

In [11]:
print(len(seq_ids))
print()
print(len(seqs))

18716

18716


In [12]:
seqs[0:3]

['MPDIVNRKVEHVEIAAFENVDGLSSSTFLNDVILVHQGFPGISFSEINTKTKFFRKEISVPVMVTGMTGGRNELGRINKIIAEVAEKFGIPMGVGSQRVAIEKAEARESFAIVRKVAPTIPIIANLGMPQLVKGYGLKEFQDAIQMIEADAIAVHLNPAQEVFQPEGEPEYQIYALEKLRDISKELSVPIIVKESGNGISMETAKLLYSYGIKNFDTSGQGGTNWIAIEMIRDIRRGNWKAESAKNFLDWGVPTAASIMEVRYSVPDSFLVGSGGIRSGLDAAKAIALGADIAGMALPVLKSAIEGKESLEQFFRKIIFELKAAMMLTGSKDVDALKKTSIVILGKLKEWAEYRGINLSIYEKVRKRE',
 'MNIRERKRKHLEACLEGEVAYQKTTTGLEGFRLRYQALAGLALGEVDLTTPFLGKTLKAPFLIGAMTGGEENGERINLALAEAAEALGVGMMLGSGRILLERPEALRSFRVRKVAPKALLIANLGLAQLRRYGRDDLLRLVEALEADALAFHVNPLQEAVQRGDTDFRGLVERLAELLPLPFPVMVKEVGHGLSREAALALRDLPLAAVDVAGAGGTSWARVEEWVRFGEVRHPELCEIGIPTARAILEVREVLPHLPLVASGGVYTGTDGAKALALGADLLAVARPLLRPALEGAERVAAWIGDYLEELRTALFAIGAKNPKEARGRVERV',
 'MSQLLQDYLNWENYILRRVDFPTSYVVEGEVVRIEAMPRLYISGMGGSGVVADLIRDFSLTWNWEVEVIAVKDYFLKARDGLLIAVSYSGNTIETLYTVEYAKRRRIPAVAITTGGRLAQMGVPTVIVPKASAPRAALPQLLTAALHVVAKVYGIDVKIPEGLEPPNEALIHKLVEEFQKRPTIIAAESMRGVAYRVKNEFNENAKIEPSVEILPEAHHNWIEGSERAVVALTSPHIPKEHQERVKATVEIVGGSIYAVEMHPKGVLSFLRDVGIASVKLAEIRGVNP

In [13]:
#print(len(records))

In [14]:
seqID = []
for item in records:
    seqID.append(item.id)
    print(item.id)
    print()

sp|P61615|IDI2_SACSH

sp|Q746I8|IDI2_THET2

sp|Q8ZWV0|PGMI_PYRAE

sp|P42126|ECI1_HUMAN

sp|Q9WUR2|ECI2_MOUSE

sp|O75521|ECI2_HUMAN

sp|Q5XIC0|ECI2_RAT

sp|Q46822|IDI_ECOLI

sp|Q15125|EBP_HUMAN

sp|O22263|PDI21_ARATH

sp|Q05871|ECI1_YEAST

sp|P26439|3BHS2_HUMAN

sp|P70245|EBP_MOUSE

sp|P14060|3BHS1_HUMAN

sp|Q62878|3BHS4_RAT

sp|P22071|3BHS1_RAT

sp|Q60555|3BHS1_MESAU

sp|P24815|3BHS1_MOUSE

sp|P27365|3BHS1_MACMU

sp|O35469|3BHS6_MOUSE

sp|P06744|G6PI_HUMAN

sp|P06745|G6PI_MOUSE

sp|P69922|FUCI_ECOLI

sp|P0A6T1|G6PI_ECOLI

sp|P34795|G6PI_ARATH

sp|Q9XI01|PDI11_ARATH

sp|Q13907|IDI1_HUMAN

sp|Q6P6V0|G6PI_RAT

sp|Q9FF55|PDI14_ARATH

sp|Q9SRG3|PDI12_ARATH

sp|Q8H103|G6PIP_ARATH

sp|P15496|IDI1_YEAST

sp|Q8VX13|PDI13_ARATH

sp|Q9Y237|PIN4_HUMAN

sp|Q9BXS1|IDI2_HUMAN

sp|P9WQP7|3BHS_MYCTU

sp|Q16518|RPE65_HUMAN

sp|P16250|HIS4_STRCO

sp|Q9WYP7|IOLO_THEMA

sp|Q28175|RPE65_BOVIN

sp|Q9YGX2|RPE65_CHICK

sp|Q91ZQ5|RPE65_MOUSE

sp|E1XUJ2|LDI_CASDE

sp|Q9TVB8|RPE65_CANLF

sp|O70276|RPE65_RAT

sp|Q

sp|P37747|GLF_ECOLI

sp|Q5R8D0|GALE_PONAB

sp|C4ZT77|GPMB_ECOBW

sp|Q1JL20|GPMA_STRPC

sp|A1A2Z2|GLMM_BIFAA

sp|Q6D9B6|GLMM_PECAS

sp|Q71WX0|GPMI_LISMF

sp|B1W3X3|GLMM_STRGG

sp|Q8Y4I4|GPMI_LISMO

sp|A7ZHP6|GSA_ECO24

sp|Q5ZY71|GPMI_LEGPH

sp|B7UIK1|GSA_ECO27

sp|Q2YLR4|DAPF_BRUA2

sp|Q2KVQ6|GLMM_BORA1

sp|Q1JG44|GPMA_STRPD

sp|B0CIT6|DAPF_BRUSI

sp|Q0TLH7|GSA_ECOL5

sp|A4W2Q0|GLMM_STRS2

sp|P56600|GAL10_CANMA

sp|A1AR93|GLMM_PELPD

sp|B3Q0A9|G6PI_RHIE6

sp|A0QVY0|DAPF_MYCS2

sp|Q10175|FKBPH_SCHPO

sp|P57649|DAPF_BUCAI

sp|P0CP95|FKBP_CRYNB

sp|Q0AV62|GLMM_SYNWW

sp|Q21M11|G6PI_SACD2

sp|Q4FTU8|GPMI_PSYA2

sp|A0RRK2|GLMM_CAMFF

sp|A0AKM3|GLMM_LISW6

sp|Q5L588|GLMM_CHLAB

sp|Q6LUX5|IDI_PHOPR

sp|Q99YH2|LACB1_STRP1

sp|Q44428|LEUD_ACTTI

sp|P31106|MIP_TATMI

sp|Q08E11|PPIC_BOVIN

sp|C6AR33|GSA_TERTT

sp|Q2FHT1|MURI_STAA3

sp|Q59641|PPIA_PSEAE

sp|Q9ZVQ4|GSTZ2_ARATH

sp|Q8Y759|PRSA1_LISMO

sp|Q9HUK1|PARC_PSEAE

sp|Q8IXY8|PPIL6_HUMAN

sp|Q93KF4|PARC_STAAN

sp|Q9SCY0|PGMP_ARATH

sp|A6Q504|M

sp|Q7VDA1|GSA_PROMA

sp|B9DZG0|GSA_CLOK1

sp|B5BC52|GPMA_SALPK

sp|Q57726|FKBP1_METJA

sp|B4SZH5|GPMA_SALNS

sp|Q5FQZ4|DAPF_GLUOX

sp|Q63B92|GPMA_BACCZ

sp|A5UDX5|DAPF_HAEIE

sp|P0CP97|FKBP2_CRYNB

sp|A1UTM4|GPMA_BARBK

sp|Q324G4|GPMA_SHIBS

sp|O32755|G3P_LACDE

sp|B7GIK0|GSA1_ANOFW

sp|Q16538|GP162_HUMAN

sp|P42799|GSA1_ARATH

sp|B6YS11|DAPF_AZOPC

sp|Q3A201|G6PI_PELCD

sp|Q1GPI7|DAPF_SPHAL

sp|A5V2D8|GLMM_SPHWW

sp|Q1D498|GLMM_MYXXD

sp|Q3B1A1|GSA_PELLD

sp|Q3B3Q8|G6PI_PELLD

sp|Q81XR2|DAPF_BACAN

sp|B7JNG6|GSA1_BACC0

sp|B2A4R9|GLMM_NATTJ

sp|Q5F746|GLMM_NEIG1

sp|Q8FD84|GLMM_ECOL6

sp|A9M1R2|GLMM_NEIM0

sp|B7HQZ0|GLMM_BACC7

sp|A9KBA0|GSA_COXBN

sp|Q1CFN6|GPMA_YERPN

sp|B6I1P9|GLMM_ECOSE

sp|Q9HN20|GLME_HALSA

sp|P0DKY4|CEEP_RUMAL

sp|P51078|CHS5_MEDSA

sp|Q2IBE4|CFTR_PONAB

sp|Q9HUN4|ALR1_PSEAE

sp|Q3BNX4|ALR_XANC5

sp|Q8U1M0|CDR_PYRFU

sp|Q8KB67|ALR_CHLTE

sp|Q93TU6|CAMK_RHOSO

sp|B0TYB0|ALR_FRAP2

sp|A7KAK7|ATG27_PICAN

sp|Q8RGA2|ALR_FUSNN

sp|Q9CYR6|AGM1_MOUSE

sp|A8AC98|APGM_I

sp|P9WNC5|FOLB_MYCTU

sp|B8EBE0|GMHA_SHEB2

sp|A8EVR5|GMHA_ARCB4

sp|B9KYT9|GPMI_THERP

sp|A1WBR8|GSA_ACISJ

sp|P0A0T0|G6PI_XANAC

sp|B1Y3R5|GPMA_LEPCP

sp|A1SE06|GSA_NOCSJ

sp|A4W6Q1|GSA_ENT38

sp|Q8FL16|GSA_ECOL6

sp|B9DSE8|GLMM_STRU0

sp|B7N823|GSA_ECOLU

sp|B2JJY4|DAPF_PARP8

sp|P95575|GLMM_PSEU2

sp|Q5R941|FKB14_PONAB

sp|Q17X40|GSA_HELAH

sp|A1SQZ9|DAPF_PSYIN

sp|Q116W4|GLMM_TRIEI

sp|A1AXW7|GPMI_RUTMC

sp|Q5PA34|GLMM_ANAMM

sp|Q7TYQ1|MBTI_MYCBO

sp|A6TEB3|LSRG_KLEP7

sp|A8A071|LSRG_ECOHS

sp|C5BG15|HIS4_EDWI9

sp|Q18C83|ILVC_CLOD6

sp|A4FYE4|ILVC_METM5

sp|B5EDR5|HIS4_GEOBB

sp|B1LXF3|ILVC_METRJ

sp|Q3IT10|HIS4_NATPD

sp|Q82WM5|HIS4_NITEU

sp|A1B387|HIS4_PARDP

sp|B2JHY0|HIS4_PARP8

sp|Q1CLU7|GSA_YERPN

sp|Q7NTL6|HLDD_CHRVO

sp|Q1LQG2|HLDD_CUPMC

sp|A6WP21|HIS4_SHEB8

sp|Q47GJ3|HLDD_DECAR

sp|P67913|HLDD_SALTI

sp|B7L745|HLDD_ECO55

sp|A1AHF5|HLDD_ECOK1

sp|A4XC73|GSA_SALTO

sp|A9GB86|HIS4_SORC5

sp|A8G9U7|GSA_SERP5

sp|A9R683|HLDD_YERPG

sp|Q51061|HLDD_NEIGO

sp|B2T625|HLDD_PAR


sp|Q034I2|KDUI_LACP3

sp|B3PR22|LEUC_RHIE6

sp|B7NHH9|LEUD_ECO7I

sp|B7MNS9|LEUD_ECO81

sp|C4ZPZ4|LEUD_ECOBW

sp|Q9FPK7|INO1_MAIZE

sp|Q553V2|MCEE_DICDI

sp|B7IN95|LEUC_BACC2

sp|B7HKC6|LEUC_BACC7

sp|B7JFZ4|HIS4_BACC0

sp|A7GMU3|LEUC_BACCN

sp|Q8W3Z2|LUPS_BETPL

sp|Q7XZF7|GYRA_ORYSJ

sp|Q28W60|LEUC_JANSC

sp|B5Y1W4|LEUC_KLEP3

sp|A6T4L7|LEUC_KLEP7

sp|Q9K8F0|LEUC_BACHD

sp|A9VMA7|IDI2_BACMK

sp|B8IPH4|HIS4_METNO

sp|A0R7G6|INO1_MYCS2

sp|B8GJY2|HIS4_METPE

sp|Q41107|INO1_PHAVU

sp|B1LG08|LEUD_ECOSM

sp|B7N7U6|LEUD_ECOLU

sp|A0A0U2UXG3|ITP1_USTMD

sp|Q0SMG9|IDI2_BORAP

sp|Q31MY7|ILVC_SYNE7

sp|Q9D1I5|MCEE_MOUSE

sp|C5B7R1|LEUD_EDWI9

sp|Q6AYK3|INO1_RAT

sp|Q9FYV1|INO1_SESIN

sp|O48965|IDI2_CAMAC

sp|B3EFC7|IDI2_CHLL2

sp|B7MDH8|HIS4_ECO45

sp|Q65GJ0|LEUC_BACLD

sp|B7L9Q1|HIS4_ECO55

sp|Q92IZ6|GYRA_RICCN

sp|Q2NC39|LEUD_ERYLH

sp|P62352|HIS4_BACC1

sp|B7INA2|HIS4_BACC2

sp|Q9UZ06|LEUD1_PYRAB

sp|A1BDG7|IDI2_CHLPD

sp|P41080|GYRA_RICPR

sp|Q98EF1|LEUC_RHILO

sp|Q92A29|ILVC_LISIN

sp|B2T

sp|B8E2W8|ILVC_DICTD

sp|A5IUK6|LEUD_STAA9

sp|P60582|HIS4_RHOPA

sp|O30725|HIS4_RHOCB

sp|A3PXP9|ILVC_MYCSJ

sp|Q2RNA6|HIS4_RHORT

sp|A1RJ17|HIS4_SHESW

sp|A4FPX3|GSA_SACEN

sp|Q1GNC2|HIS4_SPHAL

sp|A1S3U0|GSA_SHEAM

sp|B8EBU3|GSA_SHEB2

sp|Q40424|LCYB_NARPS

sp|Q2R1V8|GME2_ORYSJ

sp|Q44634|EPMB_BUCAP

sp|A0L900|G6PI_MAGMM

sp|Q5ZXU2|DEOB_LEGPH

sp|Q483D3|G6PI2_COLP3

sp|C1DQ86|GMHA_AZOVD

sp|C6DKL9|DEOB_PECCP

sp|B1LME7|FADJ_ECOSM

sp|A4SGN4|GMHA_CHLPM

sp|Q2J2I6|FABA_RHOP2

sp|B0VMS2|G6PI_ACIBS

sp|C4LC62|GMHA_TOLAT

sp|I1RVD9|FUSA2_GIBZE

sp|Q8KAW3|GMHA_CHLTE

sp|B4TDZ4|FABA_SALHS

sp|Q5E847|G6PI_ALIF1

sp|Q1CLE7|GMHA_YERPN

sp|B0USS3|GMHA_HISS2

sp|B2SRM8|GPMA_XANOP

sp|Q62JL8|G6PI_BURMA

sp|O67693|DAPF_AQUAE

sp|B7H1I0|FADB_ACIB3

sp|C3MBY8|GPMA_SINFN

sp|Q11MA0|G6PI_CHESB

sp|Q8CMH6|DEOB_STRA5

sp|B0T2Y6|G6PI_CAUSK

sp|A6UEW3|GPMA_SINMW

sp|Q04L81|DEOB_STRP2

sp|Q6D1H3|GMHA_PECAS

sp|B2IEV6|GPMA_BEII9

sp|B4SGD2|GMHA_PELPB

sp|Q9PK16|G6PI_CHLMU

sp|A1BFF9|G6PI_CHLPD

sp|Q9CNL2|G

sp|Q5L3G4|ALR_GEOKA

sp|Q03T06|ALR_LACBA

sp|P0A9K0|CMPDT_SHIFL

sp|Q887Q3|ALGG_PSESM

sp|P70920|ACNA_BRADU

sp|A3MWR8|APGM_PYRCJ

sp|Q9RTN7|ACNA_DEIRA

sp|A0QX20|ACNA_MYCS2

sp|Q68VV0|ACNA_RICTY

sp|Q8FLA6|CAID_ECOL6

sp|O59251|AMPPA_PYRHO

sp|Q8FB44|G6PI_ECOL6

sp|B8F6Y2|FUCM_HAEPS

sp|B1H1F9|ERO1A_XENTR

sp|Q042K0|G6PI_LACGA

sp|A1ST41|GLMM_PSYIN

sp|B8F7H5|GSA_HAEPS

sp|Q05026|GALE_NEIGO

sp|Q9KWF6|IDI2_KITGR

sp|A9HS54|LEUD_GLUDA

sp|Q2RV54|LEUD_RHORT

sp|Q4QLS1|LEUD_HAEI8

sp|A5F449|ILVC_VIBC3

sp|Q15RU5|HIS4_PSEA6

sp|Q92BX2|IDI2_LISIN

sp|A6UW89|IDI2_META3

sp|P33769|GYRB_BORBU

sp|A2RKR1|LEUD_LACLM

sp|B3VKQ2|IREB2_PIG

sp|Q3IUB0|IDI2_NATPD

sp|Q72RC5|LEUD_LEPIC

sp|P03639|LYS_BPPHS

sp|A0AK95|LEUD_LISW6

sp|A4WH01|IDI2_PYRAR

sp|Q16HW7|GNPI_AEDAE

sp|Q4PEN1|FER5_USTMA

sp|Q0A6I9|GMHA_ALKEH

sp|Q48K61|FABA_PSE14

sp|B2U3R4|GMHA_SHIB3

sp|B8IJW7|DEOB_METNO

sp|B1LYJ9|DEOB_METRJ

sp|A5W140|FABA_PSEP1

sp|Q5HFA5|GSA1_STAAC

sp|B7NFE7|FADB_ECOLU

sp|Q8DQD0|DEOB_STRR6

sp|Q632C7|DA

sp|Q1R246|GPMB_ECOUT

sp|Q8DP16|GLMM_STRR6

sp|Q1BXC7|GLMM_BURCA

sp|A1KM64|DAPF_MYCBP

sp|C0MCW5|GPMA_STRS7

sp|A3NTW6|GLMM_BURP0

sp|Q82U84|DAPF_NITEU

sp|Q944B0|FK161_ARATH

sp|A0Q628|GSA_FRATN

sp|Q8R840|GLMM_CALS4

sp|Q98QA7|GPMI_MYCPU

sp|B2UW73|GLMM_HELPS

sp|Q8RHY5|GLME_FUSNN

sp|Q9D1M7|FKB11_MOUSE

sp|Q5LIL0|GPMI_BACFN

sp|Q88DV3|GLMM_PSEPK

sp|P0A9C3|GALM_ECOLI

sp|A4TRI7|GLMM_YERPP

sp|A8GLC2|GPMI_SERP5

sp|Q3B8D5|KATL2_XENLA

sp|Q8G4W1|LEUD_BIFLO

sp|B9KK15|IDI_RHOSK

sp|P50310|PGK1_CRIGR

sp|D2QYP6|NNRE_PIRSD

sp|A2RG97|MURI_STRPG

sp|A0LT86|ARAA_ACIC1

sp|Q8G7J3|ARAA_BIFLO

sp|A7ZHF3|ARAA_ECO24

sp|B3PD57|ARAA_CELJU

sp|Q5PDF2|ARAA_SALPA

sp|P28012|CFI1_MEDSA

sp|O33947|CATC1_ACILW

sp|Q8W3Z3|CAS2_BETPL

sp|O23390|BARS1_ARATH

sp|Q87EG7|6PGL_XYLFT

sp|Q46ZL2|3HAPM_CUPPJ

sp|Q60326|APGM2_METJA

sp|Q9RSA0|APGM_DEIRA

sp|Q9YBI2|APGM_AERPE

sp|Q74C57|APGM_GEOSL

sp|A1AEY9|FUCI_ECOK1

sp|B5XUY5|FUCI_KLEP3

sp|Q8FEE7|FUCI_ECOL6

sp|Q57KE0|FUCI_SALCH

sp|B5RFL6|FADB_SALG2

sp|B0

sp|A7FLX6|TRUD_YERP3

sp|B7JW76|TIG_RIPO1

sp|Q2W3H8|TIG_MAGSA

sp|Q0AQ08|TIG_MARMM

sp|Q9RT21|TIG_DEIRA

sp|A9MWX9|TIG_SALPB

sp|A9C1U7|TIG_DELAS

sp|B4M0H8|SPAST_DROVI

sp|Q4L716|TIG_STAHJ

sp|P0A9L2|SLYD_SHIFL

sp|Q833J0|TPIS_ENTFA

sp|C7J0A2|TOP3A_ORYSJ

sp|Q97P31|TRPF_STRPN

sp|Q73E74|TOP3_BACC1

sp|P96583|TOP3_BACSU

sp|Q318K9|TRUA_PROM9

sp|A5IKT2|TRPF_THEP1

sp|B6JPU4|TPIS_HELP2

sp|Q8P7R6|TRPF_XANCP

sp|Q2P0U0|TRPF_XANOM

sp|B2SVN9|TRPF_XANOP

sp|Q0IE18|TIG_SYNS3

sp|Q8DLI3|TIG_THEEB

sp|Q5FL49|TPIS_LACAC

sp|Q03SL6|TPIS_LACBA

sp|B3WCW6|TPIS_LACCB

sp|Q5JH82|TOP6A_THEKO

sp|B9KAT4|TIG_THENN

sp|A5IJ92|TIG_THEP1

sp|A8YUE4|TPIS_LACH4

sp|Q8XEV3|SURA_SALTI

sp|Q6Q883|SIRP_LEPMC

sp|Q0A773|TPIS_ALKEH

sp|C3NDW7|TOP6B_SULIY

sp|B6YRU5|TPIS_AZOPC

sp|A1WSF3|TRUA_VEREI

sp|B7JFG5|TPIS_BACC0

sp|Q8KEH1|TRPF_CHLTE

sp|B1XV49|TRUA_POLNS

sp|A3D7L0|TPIS_SHEB5

sp|A5IM09|TRUA_THEP1

sp|Q87EV2|TRUB_XYLFT

sp|B2UNB6|TRUA_AKKM8

sp|Q650L8|TRUB_BACFR

sp|B7J0P9|TRUB_BORBZ

sp|Q2G5F3|TRUB_NO

sp|Q7VKN0|NANE_HAEDU

sp|A9VFF7|NAGB_BACMK

sp|B7J183|NAGB_BORBZ

sp|T2KN98|PLH5_FORAG

sp|B9KQ55|MTNA_RHOSK

sp|Q9CKB7|NANM_PASMU

sp|Q0SQB4|NAGB_CLOPS

sp|Q8SVM5|PMM_ENCCU

sp|B2FKI7|MTNA_STRMK

sp|Q5WA72|PDI15_ORYSJ

sp|A1SS81|NAGB_PSYIN

sp|A9MUG8|NAGB_SALPB

sp|Q83LF6|RLUE_SHIFL

sp|Q82TX6|RPIA_NITEU

sp|Q96AT9|RPE_HUMAN

sp|P26405|RFBK_SALTY

sp|Q0T7E7|TIG_SHIF8

sp|C3MB98|TRPF_SINFN

sp|P66934|TIG_STAAM

sp|P67007|TRPF_STAAW

sp|B1YLE0|TPIS_EXIS2

sp|A5FE44|TPIS_FLAJ1

sp|Q0BTX8|TPIS_GRABC

sp|Q0B004|TRPF_SYNWW

sp|B5YKL4|TRPF_THEYD

sp|P0C2W6|TOP3_STAAB

sp|Q7A455|TOP3_STAAN

sp|Q4L8B8|TOP3_STAHJ

sp|C1CPR1|TIG_STRZT

sp|Q67SK1|TIG_SYMTH

sp|P55275|TPIS_HELVI

sp|B0R4D2|TOP6A_HALS3

sp|A1JHZ7|TPIS_YERE8

sp|Q468I6|TOP6A_METBF

sp|Q3IPW6|TOP6B_NATPD

sp|Q5LZV2|TRUB_STRT1

sp|P46155|TOP1_KLEAE

sp|Q83GI1|TPIS_TROWT

sp|Q5JD25|THI4_THEKO

sp|A0Q7W5|TRUB_FRATN

sp|Q73KE3|TRUA_TREDE

sp|Q4L5V2|TOP1_STAHJ

sp|Q73F64|TRUA1_BACC1

sp|Q32K41|SURA_SHIDS

sp|P41512|TOP1_XENLA

sp|A8GUM4|T


sp|Q9CWX4|RUSD4_MOUSE

sp|B1Z9C6|TIG_METPB

sp|P04789|TPIS_TRYBB

sp|Q9X4E3|TRPF_RHOS4

sp|Q7JNS1|TPIS_DROSI

sp|O61078|TOP2_LEICH

sp|Q2FXQ6|TIG_STAA8

sp|P0AA43|RSUA_ECOLI

sp|P67005|TRPF_STAAM

sp|A5ITK0|TIG_STAA9

sp|Q49YA7|TIG_STAS1

sp|C0M8K8|TIG_STRE4

sp|B1I7S8|TRPF_STRPI

sp|P96763|TPIS_FRATH

sp|Q65N90|TOP3_BACLD

sp|Q8Z6F5|TOP3_SALTI

sp|P43704|TOP3_HAEIN

sp|Q10XS2|TRPF_TRIEI

sp|Q8PJ26|TRPF_XANAC

sp|Q7X222|TPIS_KLEPN

sp|Q0C197|TPIS_HYPNA

sp|B5YI37|TIG_THEYD

sp|Q468I5|TOP6B_METBF

sp|Q2GJB3|TIG_ANAPZ

sp|Q2NJE2|TIG_AYWBP

sp|H2VFI5|SIAA_NEIMB

sp|Q6MSF2|TPIS_MYCMS

sp|Q8THB0|TPIS_METAC

sp|A6KXL2|TPIS_BACV8

sp|B7HEA5|TIG_BACC4

sp|Q12UK2|TPIS_METBU

sp|Q0S212|TRUB_RHOJR

sp|A3CV22|TPIS_METMJ

sp|Q254S9|TRPF_CHLFF

sp|Q1D8I9|TPIS_MYXXD

sp|Q92SW2|TRUB_RHIME

sp|A2RCK6|TRUA_STRPG

sp|Q0I7Z1|TRUB_SYNS3

sp|Q1J4W4|TRUA_STRPF

sp|Q87T31|TPIS_VIBPA

sp|Q975L5|TRUB_SULTO

sp|C6A238|TRUA_THESM

sp|Q07UH2|TRPF_RHOP5

sp|C4LF13|TRUA_TOLAT

sp|Q0BN06|TRUB_FRATO

sp|Q57TG8|SURA_S

sp|O95900|TRUB2_HUMAN

sp|P0ABZ9|SURA_SHIFL

sp|P46799|TOP1_THEMA

sp|Q6A6S8|TRUA_CUTAK

sp|C0PZZ5|TRUA_SALPC

sp|Q38W87|TRUB_LACSS

sp|Q71ZZ4|TRUB_LISMF

sp|Q609C2|TRUB_METCA

sp|P34203|TOP2_ASFM2

sp|Q5E455|TRUA_ALIF1

sp|Q6FF44|TPIS_ACIAD

sp|B0VE65|TPIS_ACIBY

sp|B7GNA1|TRUB_BIFLS

sp|P59877|TRUB_BLOFL

sp|P59879|TRUB_MYCGA

sp|Q9Z8L9|TRUB_CHLPN

sp|P45291|TRUA_HAEIN

sp|P55985|TRUD_HELPY

sp|B0URU7|TRUD_HISS2

sp|Q3YUF1|ULAE_SHISS

sp|B7M9G0|ULAF_ECO8A

sp|C4ZR75|ULAF_ECOBW

sp|Q0TBN7|XYLA_ECOL5

sp|B7N095|UXAC_ECO81

sp|Q65V66|UXUA_MANSM

sp|B7ND80|UXAC_ECOLU

sp|P65843|Y1738_MYCBO

sp|A9KNI7|UXAC_LACP7

sp|Q6A5D0|UXAC_CUTAK

sp|B4THM6|UXUA_SALHS

sp|P0A2M8|UXUA_SALTI

sp|Q82KZ4|Y2218_STRAW

sp|A5IL06|UXAC_THEP1

sp|Q8EH68|TYRA_SHEON

sp|Q7MBZ3|UXAC_VIBVY

sp|Q02473|PRSA_LACPA

sp|Q2YWT2|PPI1_STAAB

sp|A6V291|HLDD_PSEA7

sp|P0AFL5|PPIA_ECO57

sp|B2S303|MURI_TREPS

sp|O83421|MURI_TREPA

sp|P84343|PPIA_NEOCA

sp|P65763|PPIA_MYCBO

sp|A2ZCP0|MTNA_ORYSI

sp|Q9VTZ6|PMM_DROME

sp|O4397

sp|Q9XB60|CARB_PECCC

sp|B6D9A8|BEA1_BEABA

sp|Q2QLF9|CFTR_CALJA

sp|Q8XAC2|AGAS_ECO57

sp|O87872|BAMA_THAAR

sp|O43102|CBF5_ASPFU

sp|Q2Y763|ALR_NITMU

sp|Q98A05|ALR1_RHILO

sp|P0A6B5|ALR1_SHIFL

sp|A9L126|ALR_SHEB9

sp|Q07XT3|ALR_SHEFN

sp|A8H8K6|ALR_SHEPA

sp|A4Y3F6|ALR_SHEPC

sp|Q8EXP7|CBIC_LEPIN

sp|Q4V7F2|CREL1_RAT

sp|Q03DZ2|ALR_PEDPA

sp|Q0HF52|ALR_SHESM

sp|B9XEK4|AXEP_PEDPL

sp|B2TX62|ALR_SHIB3

sp|P29012|ALR2_ECOLI

sp|Q3V7I5|ALR_PHOPR

sp|Q9HTQ2|ALR2_PSEAE

sp|Q327W0|ALR_SHIDS

sp|Q5MQ85|CPS2_ORYSI

sp|Q3JZK5|ALR_STRA1

sp|Q8E3M9|ALR_STRA3

sp|Q9CYA0|CREL2_MOUSE

sp|P63483|ALR2_STAAN

sp|Q48PY9|ALR_PSE14

sp|Q0JF02|CPS4_ORYSJ

sp|Q8DSF4|ALR_STRMU

sp|Q42560|ACO1_ARATH

sp|P33162|ALR_PSEFL

sp|B8ZMG1|ALR_STRPJ

sp|B0VNF6|ALR_ACIBS

sp|A5G2H9|ALR_ACICJ

sp|Q48RM8|ALR_STRPM

sp|Q4ZZW2|ALR_PSEU2

sp|B5XI21|ALR_STRPZ

sp|A4W3N5|ALR_STRS2

sp|P50983|CA1_CONIM

sp|A0LFQ9|ALR_SYNFM

sp|A1WQY3|ALR_VEREI

sp|Q2KXU7|ALR_BORA1

sp|H1ZZA4|AUAG_STIAU

sp|Q9PP26|ALR_CAMJE

sp|A8Z076|CDR_S


sp|Q55508|TRPC_SYNY3

sp|Q3YU59|UXUA_SHISS

sp|Q1CG57|UXUA_YERPN

sp|O34893|YNGF_BACSU

sp|Q8XQR9|Y4651_RALSO

sp|Q68WY2|Y383_RICTY

sp|Q9ABF0|TRUA_CAUVC

sp|A6VCJ9|TRUB_PSEA7

sp|B8F382|TRUA_HAEPS

sp|Q319E0|TRUB_PROM9

sp|G3XD61|WBPI_PSEAE

sp|P09095|TYCA_BREPA

sp|P47610|Y370_MYCGE

sp|P0C1B2|ULAE_SHIDS

sp|O67500|Y1546_AQUAE

sp|Q814P5|Y5378_BACCR

sp|Q9HY42|Y3578_PSEAE

sp|Q9PJZ7|Y679_CHLMU

sp|A1STH1|Y944_PSYIN

sp|B3WAI6|TRUA_LACCB

sp|P60343|TRUB_CORDI

sp|Q5L629|TRUB_CHLAB

sp|B8GYF0|TRUA_CAUVN

sp|A4QEY6|TRUB_CORGB

sp|Q47D96|TRUB_DECAR

sp|A5W822|TRUD_PSEP1

sp|A9MQL9|UXAC_SALAR

sp|C0PYB4|UXUA_SALPC

sp|Q57JX8|UXAC_SALCH

sp|Q57JY0|UXUA_SALCH

sp|P9WHQ3|Y1540_MYCTU

sp|Q8A7U2|UXUA_BACTN

sp|Q8EFG5|Y2008_SHEON

sp|B7GJ99|TRUA_ANOFW

sp|B0BTK6|TPIS_ACTPJ

sp|A0QIX1|TRUB_MYCA1

sp|Q65JH8|TRUB_BACLD

sp|Q8ZH72|TRUC_YERPE

sp|Q196X4|TOP2_IIV3

sp|Q2IFQ4|TRUD_ANADE

sp|Q89BP1|TRUA_BRADU

sp|P60346|TRUB_NANEQ

sp|A7NCA0|TRUA_FRATF

sp|Q1IF41|TRUB_PSEE4

sp|B0USP1|TRUA_HISS2

sp|A

## Create pandas dataframe using seqID & sequences

In [15]:
# create dataframe of 2000 additional isomerase sequences
#d_iso = {'name':seq_ids,'sequence':seqs}
d_iso = {'name':seqID,'sequence':iso_seqs}
df = pd.DataFrame(d_iso)
df.head()

Unnamed: 0,name,sequence
0,sp|P61615|IDI2_SACSH,MPDIVNRKVEHVEIAAFENVDGLSSSTFLNDVILVHQGFPGISFSE...
1,sp|Q746I8|IDI2_THET2,MNIRERKRKHLEACLEGEVAYQKTTTGLEGFRLRYQALAGLALGEV...
2,sp|Q8ZWV0|PGMI_PYRAE,MSQLLQDYLNWENYILRRVDFPTSYVVEGEVVRIEAMPRLYISGMG...
3,sp|P42126|ECI1_HUMAN,MALVASVRVPARVLLRAGARLPGAALGRTERAAGGGDGARRFGSQR...
4,sp|Q9WUR2|ECI2_MOUSE,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...


In [16]:
d_iso2 = {'name':seq_ids,'sequence':seqs}

df2 = pd.DataFrame(d_iso2)
df2.head()

Unnamed: 0,name,sequence
0,sp|P61615|IDI2_SACSH,MPDIVNRKVEHVEIAAFENVDGLSSSTFLNDVILVHQGFPGISFSE...
1,sp|Q746I8|IDI2_THET2,MNIRERKRKHLEACLEGEVAYQKTTTGLEGFRLRYQALAGLALGEV...
2,sp|Q8ZWV0|PGMI_PYRAE,MSQLLQDYLNWENYILRRVDFPTSYVVEGEVVRIEAMPRLYISGMG...
3,sp|P42126|ECI1_HUMAN,MALVASVRVPARVLLRAGARLPGAALGRTERAAGGGDGARRFGSQR...
4,sp|Q9WUR2|ECI2_MOUSE,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...


In [17]:
df.shape

(18716, 2)

In [18]:
df2.shape

(18716, 2)

In [19]:
# remove non-coding amino acid letters like X,B,U
df['sequence'] = df['sequence'].str.replace('U','G')
df['sequence'] = df['sequence'].str.replace('X','G')
df['sequence'] = df['sequence'].str.replace('B','G')

In [20]:
# remove non-coding amino acid letters like X,B,U
df2['sequence'] = df2['sequence'].str.replace('U','G')
df2['sequence'] = df2['sequence'].str.replace('X','G')
df2['sequence'] = df2['sequence'].str.replace('B','G')

In [21]:
#df['MW'] = df['sequence'].apply(lambda seq: ProteinAnalysis(seq).molecular_weight())
#df.head()

In [22]:
#df2['MW'] = df2['sequence'].apply(lambda seq: ProteinAnalysis(seq).molecular_weight())
#df2.head()

In [23]:
# Create 2 columns for the dataframe
df['classification'] = 'ISOMERASE'
df['type'] = 2

df.head(10)

Unnamed: 0,name,sequence,classification,type
0,sp|P61615|IDI2_SACSH,MPDIVNRKVEHVEIAAFENVDGLSSSTFLNDVILVHQGFPGISFSE...,ISOMERASE,2
1,sp|Q746I8|IDI2_THET2,MNIRERKRKHLEACLEGEVAYQKTTTGLEGFRLRYQALAGLALGEV...,ISOMERASE,2
2,sp|Q8ZWV0|PGMI_PYRAE,MSQLLQDYLNWENYILRRVDFPTSYVVEGEVVRIEAMPRLYISGMG...,ISOMERASE,2
3,sp|P42126|ECI1_HUMAN,MALVASVRVPARVLLRAGARLPGAALGRTERAAGGGDGARRFGSQR...,ISOMERASE,2
4,sp|Q9WUR2|ECI2_MOUSE,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,ISOMERASE,2
5,sp|O75521|ECI2_HUMAN,MAMAYLAWRLARRSCPSSLQVTSFPVVQLHMNRTAMRASQKDFENS...,ISOMERASE,2
6,sp|Q5XIC0|ECI2_RAT,MAAVTWSRARCWCPSLLQVLRLPVTKLHLGRPAMRATQQDFENAMN...,ISOMERASE,2
7,sp|Q46822|IDI_ECOLI,MQTEHVILLNAQGVPTGTLEKYAAHTADTRLHLAFSSWLFNAKGQL...,ISOMERASE,2
8,sp|Q15125|EBP_HUMAN,MTTNAGPLHPYWPQHLRLDNFVPNDRPTWHILAGLFSVTGVLVVTT...,ISOMERASE,2
9,sp|O22263|PDI21_ARATH,MAKSQIWFGFALLALLLVSAVADDVVVLTDDSFEKEVGKDKGALVE...,ISOMERASE,2


In [24]:
df.shape

(18716, 4)

In [25]:
# remove the duplicate protein sequences
df = df.drop_duplicates(subset='sequence', keep="first")
df.shape

(15534, 4)

In [26]:
df.head(10)

Unnamed: 0,name,sequence,classification,type
0,sp|P61615|IDI2_SACSH,MPDIVNRKVEHVEIAAFENVDGLSSSTFLNDVILVHQGFPGISFSE...,ISOMERASE,2
1,sp|Q746I8|IDI2_THET2,MNIRERKRKHLEACLEGEVAYQKTTTGLEGFRLRYQALAGLALGEV...,ISOMERASE,2
2,sp|Q8ZWV0|PGMI_PYRAE,MSQLLQDYLNWENYILRRVDFPTSYVVEGEVVRIEAMPRLYISGMG...,ISOMERASE,2
3,sp|P42126|ECI1_HUMAN,MALVASVRVPARVLLRAGARLPGAALGRTERAAGGGDGARRFGSQR...,ISOMERASE,2
4,sp|Q9WUR2|ECI2_MOUSE,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,ISOMERASE,2
5,sp|O75521|ECI2_HUMAN,MAMAYLAWRLARRSCPSSLQVTSFPVVQLHMNRTAMRASQKDFENS...,ISOMERASE,2
6,sp|Q5XIC0|ECI2_RAT,MAAVTWSRARCWCPSLLQVLRLPVTKLHLGRPAMRATQQDFENAMN...,ISOMERASE,2
7,sp|Q46822|IDI_ECOLI,MQTEHVILLNAQGVPTGTLEKYAAHTADTRLHLAFSSWLFNAKGQL...,ISOMERASE,2
8,sp|Q15125|EBP_HUMAN,MTTNAGPLHPYWPQHLRLDNFVPNDRPTWHILAGLFSVTGVLVVTT...,ISOMERASE,2
9,sp|O22263|PDI21_ARATH,MAKSQIWFGFALLALLLVSAVADDVVVLTDDSFEKEVGKDKGALVE...,ISOMERASE,2


## Save dataframe of isomerase sequences as a csv file

In [28]:
df.to_csv('uniprot-isomerase.csv',index = None)

proteins = pd.read_csv('uniprot-isomerase.csv')

proteins.head()

Unnamed: 0,name,sequence,classification,type
0,sp|P61615|IDI2_SACSH,MPDIVNRKVEHVEIAAFENVDGLSSSTFLNDVILVHQGFPGISFSE...,ISOMERASE,2
1,sp|Q746I8|IDI2_THET2,MNIRERKRKHLEACLEGEVAYQKTTTGLEGFRLRYQALAGLALGEV...,ISOMERASE,2
2,sp|Q8ZWV0|PGMI_PYRAE,MSQLLQDYLNWENYILRRVDFPTSYVVEGEVVRIEAMPRLYISGMG...,ISOMERASE,2
3,sp|P42126|ECI1_HUMAN,MALVASVRVPARVLLRAGARLPGAALGRTERAAGGGDGARRFGSQR...,ISOMERASE,2
4,sp|Q9WUR2|ECI2_MOUSE,MAAVTWSRARCWCPSVLQVFRLQVAKLHLGRPTMRASQQDFENALN...,ISOMERASE,2
