Import all raw sequence and accession code data for further analysis

In [1]:
import urllib, urllib.parse, urllib.request, re, csv
import pandas as pd
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna, generic_rna, generic_protein
from Bio import pairwise2, SeqIO

## IMPORT SEER Library accession numbers from Chris Gregg's excel sheet
SEERids = []
with open('SEER Library IDs.csv','r') as f:
    reader = csv.reader(f)
    for row in reader:
        for i in row:
            SEERids.append(i)
SEERids = list(set(SEERids))

## IMPORT SEER Library sequences from Chris Gregg's excel sheet    
SEERseqs = []
with open('SEER Library seqs.csv','r') as f:
    reader = csv.reader(f)
    for row in reader:
        for i in row:
            SEERseqs.append(i)
SEERseqs = list(set(SEERseqs))
           
## Dictionary for Import of FASTA files
fasta_files = {'PF03837f': "PF03837_full.fasta",
               #'PF03837u': "PF03837_ncbi.fasta",
               'PF03837n': "PF03837_uniprot.fasta",
               'PF04098f': "PF04098_full.fasta",
               #'PF04098u': "PF04098_ncbi.fasta",
               'PF04098n': "PF04098_uniprot.fasta",
               'PF04404f': "PF04404_full.fasta",
               #'PF04404u': "PF04404_ncbi.fasta",
               'PF04404n': "PF04404_uniprot.fasta",
               'COG3723': "RecT_family_COG3723_alignment.fasta",
               'PRK09846': "RecT_family_PRK09846_alignment.fasta",
               'TIGR01913': "RecT_family_TIGR01913_alignment.fasta",
               'uprotRecTs': "uniprotSearch-rect.fasta",
               'uprotBets': "uniprotSearch-bet+recombinase.fasta"
              }

fasta_seqs = {}
all_IDs = []

for fasta in fasta_files.items():
    fasta_seqs[fasta[0]] = {}
    
    ## Import data
    temp_seqs = SeqIO.index(fasta[1], "fasta")
    
    ## Clean up IDs and create sequence dictionary from fasta seq record from UniProt search
    if fasta[0][-1] == 's':
        for seq in temp_seqs:
            name = re.split('\|',temp_seqs[seq].id)[1]
            fasta_seqs[fasta[0]][name] = temp_seqs[seq].seq
    
    ## Clean up IDs and create sequence dictionary from fasta seq record from pFam alignment
    elif fasta[0][-1] in ['f','u','n']:
        for seq in temp_seqs:
            name = re.split('\/',temp_seqs[seq].id)[0]
            fasta_seqs[fasta[0]][name] = temp_seqs[seq].seq
    
    ## Create sequence dictionary from fasta seq record for files that don't need cleanup
    else:
        for seq in temp_seqs:
            name = temp_seqs[seq].id
            fasta_seqs[fasta[0]][name] = temp_seqs[seq].seq
            
    all_IDs += list(fasta_seqs[fasta[0]].keys())
    n = len(fasta_seqs[fasta[0]].keys())
    print("There are",n,"unique protein accession codes from",fasta[1])

all_IDs = list(set(all_IDs))
print("Total unique accessions: ",len(all_IDs))


There are 6 unique protein accession codes from RecT_family_TIGR01913_alignment.fasta
There are 3012 unique protein accession codes from uniprotSearch-rect.fasta
There are 2686 unique protein accession codes from PF03837_uniprot.fasta
There are 150 unique protein accession codes from uniprotSearch-bet+recombinase.fasta
There are 1175 unique protein accession codes from PF04404_uniprot.fasta
There are 122 unique protein accession codes from PF04404_full.fasta
There are 9 unique protein accession codes from RecT_family_COG3723_alignment.fasta
There are 487 unique protein accession codes from PF04098_full.fasta
There are 17 unique protein accession codes from RecT_family_PRK09846_alignment.fasta
There are 222 unique protein accession codes from PF03837_full.fasta
There are 1409 unique protein accession codes from PF04098_uniprot.fasta
Total unique accessions:  6992


Search list of sequences and accession codes from the SEER library and other databases against UniProt Protein accession codes.  The goal here is to standardize the format and information for each of the groups of protein sequence libraries and families.

In [2]:
all_IDs

['F6ALE4',
 'A0A0Y7I8I4',
 'D7FGZ9',
 'A0A073G0S0',
 'A0A017K4Y7',
 'C9CD62',
 'S0K3A7_9ENTE',
 'M2SML7',
 'F1RT75',
 'A4IS95',
 'X1JQN3',
 'A0A0F9U6F5',
 'Q8ZQA9_SALTY',
 'R6QSG7_9CLOT',
 'A0A0F8ZT94',
 'Q55E38_DICDI',
 'A0A0F9A4A2',
 'S4B661',
 'A0A0H0CVU4',
 'Q38143',
 'A0A0U5EW20',
 'A0A0P7LMT4',
 'Q7P5T6',
 'A1BYL2',
 'L4VJW7',
 'A0A0N1QUW4',
 'U0EK68',
 'A0A0W0LN32',
 'A0A0D3WSW7',
 'X0Y9E7',
 'I0VN51',
 'A0A0A7FSW5_9CLOT',
 'K0KFN7',
 'K4X819',
 'A0A072WTC5',
 'A0A0F9Q289',
 'A0A0F9GA79',
 'A0A0V8JPF1',
 'S1DY43',
 'R7QUX8',
 'A0A0E2HL91',
 'H4ZQX4',
 'A0A072MS00',
 'H2AYA3_KAZAF',
 'W2CA08',
 'N1ZFD4',
 'Q1QRC9',
 'L5LFX3',
 'A0A0E8ZK55',
 'G5AX77',
 'A0A0L7ZBW6',
 'U2Q670',
 'J8N9T7',
 'F7A626_MACMU',
 'S7UCS4',
 'Q833D2_ENTFA',
 'H8X6X7',
 'E8YC23',
 'Q9KJ60',
 'W2G657',
 'A0A0F3FRD4',
 'W5SPG7',
 'U5U416',
 'V6JX32',
 'Q32GM7',
 'G5BWP4_HETGA',
 'U5PVJ2',
 'C2LH21',
 'A0A0C5J0W6',
 'W1ERA1',
 'M1E1W1',
 'A0A0E2BL88',
 'H5DZN1',
 'Q24LE7',
 'V2N829',
 'A0A0F7JNA6',
 'G8NBB7',

In [3]:
## Query list of UniProt IDs to pull down UniParc codes.  Output list of matched and 
## unmatched IDs.

url = 'http://www.uniprot.org/mapping/'
contact = "wannier@gmail.com" 
query = '\t'.join(all_IDs)
params = {'from':'ACC','to':'UPARC','format':'tab','query':query}

data = urllib.parse.urlencode(params)
binary_data = data.encode('ascii')
request = urllib.request.Request(url,binary_data)
request.add_header('User-Agent', 'Python %s' % contact)
response = urllib.request.urlopen(request)
page = str(response.read(200000))
decoded_page = bytes(page, "utf-8").decode("unicode_escape")

matched = {}
unmatched = []

for line in decoded_page.splitlines(): #parse response and create list of matches
    split = line.split('\t')
    if len(split) > 1 and split[1] != 'To':
        matched[split[0]] = split[1]
        
for item in all_IDs: #append unmatched items to list
    if item not in matched.keys():
        unmatched.append(item)

print("Matched",len(matched.keys()),"IDs with UniProtKB AC accession codes.")
print(len(unmatched),"IDs remain unmatched")

Matched 6255 IDs with UniProtKB AC accession codes.
737 IDs remain unmatched


In [4]:
## Query list of RefSeq IDs to pull down UniParc codes.  Create new list to store IDs matched to uniProt codes
## instead of uniParc codes. 
url = 'http://www.uniprot.org/mapping/'
contact = "wannier@gmail.com" 
query = '\t'.join(unmatched)
params = {'from':'P_REFSEQ_AC','to':'ACC','format':'tab','query':query}

data = urllib.parse.urlencode(params)
binary_data = data.encode('ascii')
request = urllib.request.Request(url, binary_data)
request.add_header('User-Agent', 'Python %s' % contact)
response = urllib.request.urlopen(request)
page = str(response.read(200000))
decoded_page = bytes(page, "utf-8").decode("unicode_escape")

semi_matched = {}
temp_dict = {}
for line in decoded_page.splitlines():
    split = line.split('\t')
    if len(split) > 1 and split[1] != 'To':
        temp_dict[split[1]] = split[0]
        
semi_matched.update(temp_dict)

for item in all_IDs:
    if item in temp_dict.values():
        unmatched.remove(item)

print("Matched",len(temp_dict.keys()),"IDs with RefSeq Protein accession codes.")
print(len(unmatched),"items remain unmatched")

Matched 45 IDs with RefSeq Protein accession codes.
717 items remain unmatched


In [5]:
## Query list of EMBL IDs to pull down UniParc codes.  Output list of matched and 
## unmatched IDs.
url = 'http://www.uniprot.org/mapping/'
contact = "wannier@gmail.com" 
query = '\t'.join(unmatched)
params = {'from':'EMBL','to':'ACC','format':'tab','query':query}

data = urllib.parse.urlencode(params)
binary_data = data.encode('ascii')
request = urllib.request.Request(url, binary_data)
request.add_header('User-Agent', 'Python %s' % contact)
response = urllib.request.urlopen(request)
page = str(response.read(200000))
decoded_page = bytes(page, "utf-8").decode("unicode_escape")

temp_dict = {}
for line in decoded_page.splitlines():
    split = line.split('\t')
    if len(split) > 1 and split[1] != 'To':
        temp_dict[split[1]] = split[0]

semi_matched.update(temp_dict)

for item in all_IDs:
    if item in temp_dict.values():
        unmatched.remove(item)

#check for missing items that are not UniParc codes
for item in unmatched:
    if item[0:3] != 'UPI':
        print('missing:',item)

print("Matched",len(temp_dict.keys()),"IDs with EMBL/GenBank/DDBJ CDS accession codes.")

missing: A0A017K4Y7
missing: C9CD62
missing: H4ZQX4
missing: A0A0E8ZK55
missing: S7UCS4
missing: H5DZN1
missing: V2N829
missing: E6IQ47
missing: A0A0E8Z799
missing: A0A0F6UNN4
missing: T2P1F7
missing: A0A0A1SLA0
missing: I4J2Z2
missing: A0A0E8L1Z1
missing: A0A017JXQ6
missing: S4FQ15
missing: J7AV53
missing: A0A0C7PI27
missing: C7VGA1
missing: A0A0C6G4B2
missing: R0MRP8
missing: R3Y166
missing: E4JB24
missing: E6HNV5
missing: H4ZX22
missing: I4V7E2
missing: L8A101
missing: R3BLI3
missing: A0A0G2MW14
missing: D3LK12
missing: V7RHQ2
missing: S0XVW4
missing: C3NU24
missing: R3YA23
missing: C7YBX3
missing: A0A010EMH0
missing: A0A0E2K793
missing: A0A0C6G5V6
missing: A0A0F6WKD4
missing: V5GTD0_PSEBG
missing: X6RP69
missing: K8C0F6
missing: X6RUL6
missing: A0A010EUI5
missing: X7I0D6
missing: A0A052ILE8_9BORD
missing: S0VKT4
missing: U8MQ31
missing: A0A0E8AAH6
missing: W8GKQ4
missing: A0A011IPJ5
missing: S1GT82
missing: A0A098C944
missing: A0A066Z9B9
missing: A0A0E2R5U2
missing: R0VI54
missing:

Finalize list of Uniparc codes

In [6]:
url = 'http://www.uniprot.org/mapping/'
contact = "wannier@gmail.com" 
query = '\t'.join(semi_matched.keys())
params = {'from':'ACC','to':'UPARC','format':'tab','query':query}

data = urllib.parse.urlencode(params)
binary_data = data.encode('ascii')
request = urllib.request.Request(url, binary_data)
request.add_header('User-Agent', 'Python %s' % contact)
response = urllib.request.urlopen(request)
page = str(response.read(200000))
decoded_page = bytes(page, "utf-8").decode("unicode_escape")

parcIDs = list(unmatched) + list(matched.values())
for line in decoded_page.splitlines():
    split = line.split('\t')
    if len(split) > 1 and split[1] != 'To':
        parcIDs.append(split[1])
        matched[split[0]] = split[1]

parcIDfinal = set(parcIDs)

print(len(parcIDs),'total entries matched to UniParc codes',len(parcIDs)-len(parcIDfinal),
      'of which are repetitive, leaving a total library size of:',len(parcIDfinal))

7017 total entries matched to UniParc codes 1982 of which are repetitive, leaving a total library size of: 5035


Match each UniParc reference to a RefSeq number, which is the NCBI equivalent to a UniParc code.

In [7]:
query = '\t'.join(parcIDfinal)
params = {'from':'ACC','to':'P_REFSEQ_AC','format':'tab','query':query}

data = urllib.parse.urlencode(params)
binary_data = data.encode('ascii')
request = urllib.request.Request(url, binary_data)
request.add_header('User-Agent', 'Python %s' % contact)
response = urllib.request.urlopen(request)
page = str(response.read(200000))
decoded_page = bytes(page, "utf-8").decode("unicode_escape")

parcRefSeq = {}
for line in decoded_page.splitlines():
    split = line.split('\t')
    if len(split) > 1 and split[1] != 'To':
        parcRefSeq[split[0]] = split[1]

print(len(parcRefSeq.keys()),'RefSeq entries found from the UniParc list')

2821 RefSeq entries found from the UniParc list


Download all Parc sequences and search for all UniProt entries associated with each UniParc protein sequence.  Then in the next cell, translate the set of SEER sequences and check them against the UniParc sequences.  There are expected to be many sequences that are effectively broken because of a mistake in the text hashing by Chris Gregg in ordering Lib1


In [8]:
parcSeqs = {}
parcACCs = {}

## Get UniParc sequences
base_url = 'http://www.uniprot.org/uniparc/'

for acc in parcIDfinal:
    url = base_url + str(acc) + '.fasta'
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    page = str(response.read(200000))
    parcSeqs[acc] = ''.join(re.split('\\\\n',page)[1:-1]) #store sequence from fasta file in dic

## Get UniProt accessions
query = '\t'.join(parcIDfinal)
url = 'http://www.uniprot.org/mapping/'
params = {'from':'UPARC','to':'ACC','format':'tab','query':query}

data = urllib.parse.urlencode(params)
binary_data = data.encode('ascii')
request = urllib.request.Request(url, binary_data)
request.add_header('User-Agent', 'Python %s' % contact)
response = urllib.request.urlopen(request)
page = str(response.read(200000))
decoded_page = bytes(page, "utf-8").decode("unicode_escape")

for line in decoded_page.splitlines():
    split = line.split('\t')
    if len(split) > 1 and split[1] != 'To':
        if split[0] in parcACCs.keys():
            parcACCs[split[0]].append(split[1])
        else:
            parcACCs[split[0]] = [split[1]]

HTTPError: HTTP Error 500: Internal Server Error

In [None]:
seqsTranslated = {}
i = []
for s in SEERseqs:
    i = Seq(s, generic_dna)
    seqsTranslated[s] = str(i.translate())

incount = 0
partcount = 0
outcount = 0
parcPartialSeq = {}
parcPresentSeq = {}
parcSEERNT = {}
parcCut = {}
for s in seqsTranslated.items():
    part = 0
    whole = 0
    pProt = ''
    pSeq = ''
    wProt = ''
    wSeq = ''
    for t in parcSeqs.items():
        if s[1][:-1] in t[1]:
            whole = 1
            wProt = t[0]
            wSeq = s[1][:-1]
        elif s[1][60:-1] in t[1]:
            part = 1
            pProt = t[0]
            pSeq = s[1][:-1]  
            wSeq = t[1]
    if whole == 1:
        incount += 1
        parcPresentSeq[wProt] = wSeq
        parcSEERNT[wProt] = s[0]
    elif part == 1:
        partcount += 1
        align = pairwise2.align.globalms(wSeq,pSeq, 2, -1, -.5, -.1)[0][1]
        parcPartialSeq[pProt] = align
        parcCut[pProt] = str(align.find('-')+1) + '-' + str(align.rfind('-')+1)
        parcSEERNT[pProt] = s[0]
    else:
        outcount += 1

print("there are ",incount,"complete sequence(s) found in the parcSeqs,",
      partcount,"partial sequence(s) found (which have gaps near the N-terminus",
      "because of a mistaken synthesis order, and",outcount,"sequence(s) missing."
     )

The next series of cells deals with getting info from UniProt entries associated with each UniParc entry and assigning them to dictionaries associated with both UniProt and UniParc entries.  Dictionaries are called either protX or parcX depending on if they use the UniProt accession number as they key or the UniParc accession number as the key.  Any UniParc entry comprises only one protein sequence, while many UniProt entries may refer to the same UniParc sequence, as identical protein sequences may come from a variety of different organisms.  

The UniParc entries for which no UniProt entry can be found are listed at the ened.

In [None]:
## Get UniProt entries in text format
base_url = 'http://www.uniprot.org/uniprot/'
protEntry = {}

for parc in parcACCs.keys():
    for acc in parcACCs[parc]:
        url = base_url + str(acc) + '.txt'
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)
        page = str(response.read(20000))
        protEntry[acc] = page
        

In [None]:
## Get UniParc entries in text format
base_url = 'http://www.uniprot.org/uniparc/'
parcSeq = {}

for parc in parcIDfinal:
    url = base_url + str(parc) + '.fasta'
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    page = str(response.read(20000))
    parcSeq[parc] = "".join(re.split("\\\\n",page)[1:-1])

In [None]:
protPfam = {}
protOrganism = {}
protPhylo = {}
protHost = {}
protProtein = {}

for entry in protEntry.items():
    organism = ''
    pfam = ''
    phylo = ''
    host = ''
    protein = ''
    for line in re.split('\\\\n',entry[1]):
        if line[0:2] == 'OS':
            organism += line[5:]
        elif line[0:9] == 'DR   Pfam':
            pfam += line[11:]
        elif line[0:2] == 'OC':
            phylo += line[5:]
        elif line[0:2] == 'OH':
            host += line[5:]
        elif line[0:13] == 'DE   SubName:':
            protein = re.search('(?<=\=)(.*?)(?=[\w\s][;{])',line).group(0)
    if organism != '':
        protOrganism[entry[0]] = organism
    if pfam != '':
        protPfam[entry[0]] = pfam.split(';')[0:2]
    if phylo != '':
        protPhylo[entry[0]] = list(map(lambda x: re.sub('[\s]+','',x), phylo.split(';')))
    if host != '':
        protHost[entry[0]] = host.split(';')
    if protein != '':
        protProtein[entry[0]] = protein

In [None]:
parcOrganism = {}
parcPfam = {}
parcBPhylum = {}
parcBClass = {}
parcBOrder = {}
parcBFamily = {}
parcEFamily = {}
parcVPhylo = {}
parcHost = {}
parcProtein = {}

for parc in parcACCs.keys(): #go through every uniParc entry
    organism = []
    pfam = []
    Bphylo = []
    Vphylo = []
    Ephylo = []
    host = []
    name = []
    for prot in parcACCs[parc]: #go through every uniProt entry associated with each uniParc one
        if prot in protOrganism.keys():
            organism.append(protOrganism[prot])
        if prot in protPfam.keys():
            pfam.append(','.join(protPfam[prot]))
        if prot in protPhylo.keys():
            i = protPhylo[prot]
            if i[0] == 'Bacteria' and len(i) > 4:
                Bphylo.append(i[1:5])
            if i[0] == 'Viruses' and len(i) > 3:
                Vphylo.append(i[3])
            if i[0] == 'Eukaryota' and len(i) > 3:
                Ephylo.append(i[-2:-1])
        if prot in protHost.keys():
            host.append(','.join(protHost[prot]))
        if prot in protProtein.keys():
            name.append(protProtein[prot])
        if prot in protSeq.keys():
            seq.append(protSeq[prot])
    if len(organism) > 0:
        parcOrganism[parc] = ','.join(list(set(organism)))
    if len(pfam) > 0:
        parcPfam[parc] = ','.join(list(set(pfam)))
    if len(Bphylo) > 0:
        parcBPhylum[parc] = ','.join(list(set([item[0] for item in Bphylo])))
        parcBClass[parc] = ','.join(list(set([item[1] for item in Bphylo])))
        parcBOrder[parc] = ','.join(list(set([item[2] for item in Bphylo])))
        parcBFamily[parc] = ','.join(list(set([item[3] for item in Bphylo])))
    if len(Ephylo) > 0:
        parcEFamily[parc] = ','.join(list(set([item[0] for item in Ephylo])))
    if len(Vphylo) > 0:
        parcVPhylo[parc] = ','.join(list(set(Vphylo)))
    if len(host) > 0:
        parcHost[parc] = ','.join(list(set(host)))
    if len(name) > 0:
        parcProtein[parc] = ','.join(list(set(name)))

Find UniParc IDs for which there are no associated UniProt IDs, and instead query the RefSeq database for relevant information.

In [None]:
missingIDs = set(parcIDfinal)-set(parcACCs.keys())
print("UParc accessions with no UProt entry:\n",missingIDs,'\n')

## Get RefSeq entries from the web
base_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&rettype=gp&retmode=text&id='

for i in missingIDs:
    ref = parcRefSeq[i]
    url = base_url + str(parcRefSeq[i])
    request = urllib.request.Request(url)
    response = urllib.request.urlopen(request)
    page = str(response.read(200000))
    ## Parse information out of GenBank files
    phylo = []
    name = ''
    organism = ''
    j = 0
    for line in re.split('\\\\n',page):
        split = re.findall('[\w\-.]+',line)
        if j == 1:
            for term in split:
                phylo.append(term)
            j += 1
        if len(split) > 0:
            if split[0] == 'Bacteria' or split[0] == 'Viruses':
                for term in split:
                    phylo.append(term)
                j += 1
            elif split[0] == 'DEFINITION':
                name = ' '.join(split[1:])
            elif split[0] == 'ORGANISM':
                organism = ' '.join(split[1:])
    if phylo[0] == 'Bacteria':
        parcBPhylum[i] = phylo[1]
        parcBClass[i] = phylo[2]
        parcBOrder[i] = phylo[3]
        parcBFamily[i]  = phylo[4]
    if phylo[0] == 'Viruses':
        parcVPhylo[i] = phylo[4]
    if len(organism) > 0:
        parcOrganism[i] = organism
    if len(name) > 0:
        parcProtein[i] = name

Figure out the source of each of the unique protein entries, and often they are present in multiple entry databases.  Here, these databases include the SEER library from Chris Gregg, and many protein family databases found online.

In [None]:
# initiate a sample source dictionary with zeros, and then populate it into all IDs
s = dict((source,0) for source in fasta_files.keys())
parcSource = dict((x,dict(s)) for x in parcSeqs.keys())

for item in matched.items():
    for source in fasta_files.keys():
        if item[0] in fasta_seqs[source].keys():
            parcSource[item[1]][source] = 1
for item in semi_matched.items():
    for source in fasta_files.keys():
        if item[1] in fasta_seqs[source].keys():
            parcSource[matched[item[0]]][source] = 1
for item in unmatched:
    for source in fasta_files.keys():
        if item in fasta_seqs[source].keys():
            parcSource[item][source] = 1
##    if item[0] in PF03837seqs.keys():
##        parcSource[item[1]]['PF03837'] = 1
##    if item[0] in COG3723seqs.keys():
##        parcSource[item[1]]['COG3723'] = 1
##    if item[0] in PRK09846seqs.keys():
##        parcSource[item[1]]['PRK09846'] = 1
##    if item[0] in TIGR01913seqs.keys():
##        parcSource[item[1]]['TIGR01913'] = 1
##    if item[0] in uprotRecTseqs.keys():
##        parcSource[item[1]]['uprotRecT'] = 1
##for item in semi_matched.items():
##    if item[1] in SEERids:
##        parcSource[matched[item[0]]]['SEER'] = 1
##    if item[1] in PF03837seqs.keys():
##        parcSource[matched[item[0]]]['PF03837'] = 1
##    if item[1] in COG3723seqs.keys():
##        parcSource[matched[item[0]]]['COG3723'] = 1
##    if item[1] in PRK09846seqs.keys():
##        parcSource[matched[item[0]]]['PRK09846'] = 1
##    if item[1] in TIGR01913seqs.keys():
##        parcSource[matched[item[0]]]['TIGR01913'] = 1
##    if item[1] in uprotRecTseqs.keys():
##        parcSource[matched[item[0]]]['uprotRecT'] = 1 
##for item in unmatched:
##    if item in SEERids:
##        parcSource[item]['SEER'] = 1
##    if item in PF03837seqs.keys():
##        parcSource[item]['PF03837'] = 1
##    if item in COG3723seqs.keys():
##        parcSource[item]['COG3723'] = 1
##    if item in PRK09846seqs.keys():
##        parcSource[item]['PRK09846'] = 1
##    if item in TIGR01913seqs.keys():
##        parcSource[item]['TIGR01913'] = 1
##    if item in uprotRecTseqs.keys():
##        parcSource[item]['uprotRecT'] = 1

Create Pandas Dataframe to contain all of the relevant info acquired. Output all of this data to a csv file.

In [None]:
d = {}
parcDATA = {}
for item in parcACCs.items():
    d[item[0]] = ','.join(item[1])

for parc in parcIDfinal:
    parcDATA[parc] = {'UniProt ACCs':d.get(parc,'-'),
                      'Protein Name':parcProtein.get(parc,'-'),
                      'Pfam ID':parcPfam.get(parc,'-'),
                      'Organism':parcOrganism.get(parc,'-'),
                      'Phylum (Bacterial)':parcBPhylum.get(parc,'-'),
                      'Class (Bacterial)':parcBClass.get(parc,'-'),
                      'Order (Bacterial)':parcBOrder.get(parc,'-'),
                      'Family (Bacterial)':parcBFamily.get(parc,'-'),
                      'Family (Eukaryotic)':parcEFamily.get(parc,'-'),
                      'Phylogeny (Viral)':parcVPhylo.get(parc,'-'),
                      'Viral Host':parcHost.get(parc,'-'),
                      'Truncated Sequence in SEER Lib':parcPartialSeq.get(parc,'-'),
                      'Truncation Coordinates':parcCut.get(parc,'-'),
                      'SEER NT Sequence':parcSEERNT.get(parc,'-'),
                      'RefSeq ID':parcRefSeq.get(parc,'-'),
                      'Sequence':parcSeq.get(parc,'-'),
                      'Source: SEER':parcSource.get(parc,'-')['SEER'],
                      'Source: PF03837':parcSource.get(parc,'-')['PF03837'],
                      'Source: COG3723':parcSource.get(parc,'-')['COG3723'],
                      'Source: PRK09846':parcSource.get(parc,'-')['PRK09846'],
                      'Source: TIGR01913':parcSource.get(parc,'-')['TIGR01913'],
                      'Source: uprotRecT':parcSource.get(parc,'-')['uprotRecT']
                     }

parcDF = pd.DataFrame(parcDATA).transpose()

print("SEER library protein families:\n",parcDF['Pfam ID'].value_counts())
print("\nSEER library Bacterial Phyla:\n",
      parcDF['Phylum (Bacterial)'].str.split(',').apply(lambda x: pd.Series(x).value_counts()).sum().sort_values(ascending = False)
     )
print("\nSEER library Bacterial Classes:\n",
      parcDF['Class (Bacterial)'].str.split(',').apply(lambda x: pd.Series(x).value_counts()).sum().sort_values(ascending = False)
     )
print("\nSEER library Bacterial Orders:\n",
      parcDF['Order (Bacterial)'].str.split(',').apply(lambda x: pd.Series(x).value_counts()).sum().sort_values(ascending = False)
     )
print("\nSEER library Bacterial Families:\n",
      parcDF['Family (Bacterial)'].str.split(',').apply(lambda x: pd.Series(x).value_counts()).sum().sort_values(ascending = False)
     )

parcDF.to_csv('SEER Data.csv')


In [None]:
protEntry['A0A109PXU5']

In [None]:
"".join(re.split("\\\\n",parcSeq['UPI0001A17AF7'])[1:-1])

In [None]:
parcSeq['UPI000237E76E']