In [151]:
import re
import requests
from bs4 import BeautifulSoup
# Pattern for abbreviated binomial names such as "E. coli": one capital letter, a period,
# a single space, then one or more lowercase letters.
bacteria_regex = r'[A-Z]\. [a-z]+'

In [265]:
def get_sequence(splits):
    """Return the element of `splits` that directly follows the 'Sequence:' marker.

    Raises ValueError when the marker is absent (propagated from list.index).
    """
    marker_position = splits.index('Sequence:')
    return splits[marker_position + 1]

def get_additional_info(splits):
    """Return the element of `splits` that directly follows the 'Additional info:' marker.

    Raises ValueError when the marker is absent (propagated from list.index).
    """
    label_at = splits.index('Additional info:')
    following = label_at + 1
    return splits[following]

def _parse_strains_from_additional_info(info):

    bacteria = {} 
    fields = info.split(')')

    for f in fields:
        unit_splits = f.split('MIC')
        if len(unit_splits) < 2:
            continue
        unit = unit_splits[1]
        unit_string = unit
        for c in unit:
            if not(c.isalnum() or c in ['.', '/', ' ', '-']):
                unit_string = unit_string.replace(c, '')
        if not any(c.isnumeric() for c in unit_string):
            continue
        unit_string = unit_string.strip()  # Remove start and ending spaces
        unit_string_splits = unit_string.split()
        if len(unit_string_splits) < 2:
            continue
        elif len(unit_string_splits) == 2:
            value = unit_string_splits[0]
        else:
            value = [unit_string_splits[0] + unit_string_splits[1]]
        if '-' in value:  # Sometimes they give a range, like "11-22 uM"
            bounds = value.split('-')
            value = str(sum(float(b.strip()) for b in bounds)/len(bounds))  # Take the average

        unit_dict = {
            'value': value,
            'unit': unit_string_splits[1]
        }
        bacteria_split = unit_splits[0]
        for bacteria_list in bacteria_split.split(','):  # Commas not allowed in bacteria names
            for bacterium in re.findall(bacteria_regex, bacteria_list):
                bacteria[bacterium] = unit_dict
    return bacteria

In [249]:
def parse_strains(splits):
    """Locate the 'Additional info:' text in `splits` and return the strain data parsed from it."""
    return _parse_strains_from_additional_info(get_additional_info(splits))

In [252]:
# Label for the scraped database; it is also used as the output file name when results are saved.
DATABASE_NAME = 'UNMC'
# Prefix of the per-entry query URL; the numeric entry ID is appended to it.
url_base = 'http://aps.unmc.edu/AP/database/query_output.php?ID='

# Exclusive upper bound of the entry IDs requested by the scraping loop (IDs 1 .. NUM_BACTERIA - 1).
# NOTE(review): despite the name this looks like a count of database entries, not of bacteria — confirm.
NUM_BACTERIA = 2887
# Filled by the scraping loop: maps each entry's sequence to its parsed strain data.
all_bacteria = {}

In [250]:
"""i = 1
url = url_base + str(i)
content = requests.get(url).content
soup = BeautifulSoup(content, 'html.parser')
text = soup.text
splits = text.split('\n')
sequence = get_sequence(splits)
info = get_additional_info(splits)

bacteria = {} 
fields = info.split(')')
print("FIELDS", fields)
for f in fields[:3]:
    unit_splits = f.split('MIC')
    if len(unit_splits) < 2:
        continue
    unit = unit_splits[1]
    unit_string = unit
    for c in unit:
        if not(c.isalnum() or c in ['.', '/', ' ', '-']):
            unit_string = unit_string.replace(c, '')
    if not any(c.isnumeric() for c in unit_string):
        continue
    unit_string = unit_string.strip()  # Remove start and ending spaces
    unit_string_splits = unit_string.split()
        if len(unit_string_splits) < 2:
            continue
        if len(unit_string_splits) == 2:
            value = unit_string_splits[0]
        elif len(unit_string_splits) > 2:
            value = [unit_string_splits[0] + unit_string_splits[1]]
    if '-' in value:  # Sometimes they give a range, like "11-22 uM"
        bounds = value.split('-')
        value = str(sum(float(b.strip()) for b in bounds if b)/len(bounds))  # Take the average
    unit_dict = {
        'value': value,
        'unit': unit_string_splits[1]
    }
    bacteria_split = unit_splits[0]
    for bacteria_list in bacteria_split.split(','):  # Commas not allowed in bacteria names
        for bacterium in re.findall(bacteria_regex, bacteria_list):
            bacteria[bacterium] = unit_dict"""

('FIELDS', [u'A frog used for "hunting magic" by several groups of Panoan-speaking Indians in the borderline between Brazil and Peru is identified as Phyllomedusa bicolor. This natural peptide, isolated from that frog skin,  may contain a D amino acid residue, since it is not identical in chromatographic properties to the synthetic peptide (Proc Natl Acad Sci U S A. 1992 Nov 15;89(22', u':10960-3', u'. Synthetic adenoregulin enhanced the binding of agonists to several G-protein-coupled receptors in rat brain membranes. Active against M. canis IP 1194, T. rubrum IP 1400-82, A. simii IP 1063-74, A. caviae IP 67-16 P, E. coli IP 76-24, N. brasiliensis IP 16-80, C. neoformans IP 960-67, C. neoformans IP 962-67, and C. albicans (MIC 10-60 ug/ml', u'. A helix-hinge-helix structural motif (helix 1: 1-8; helix 2: 11-31', u' was found in complex with SDS2003 micelles. The N-terminal segment residues 1-11 is critical for antibacterial activity (Lequin O et al. 2003 Biochemistry 42: 10311-23', u'

In [267]:
# Seconds to wait on a single HTTP request. Without a timeout, requests can block
# forever on a stalled connection and the whole scrape would hang.
REQUEST_TIMEOUT_SECONDS = 30

# Fetch every entry page, pull out its sequence and store the parsed strain data under it.
for i in range(1, NUM_BACTERIA):
    url = url_base + str(i)
    content = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS).content
    soup = BeautifulSoup(content, 'html.parser')
    text = soup.text
    splits = text.split('\n')
    try:
        sequence = get_sequence(splits)
        all_bacteria[sequence] = parse_strains(splits)
    except (ValueError, IndexError) as error:
        # A page without the expected markers (e.g. an error page or a missing ID)
        # should not throw away the entries that were already collected.
        print('Skipping entry %d: could not parse page (%s)' % (i, error))
        continue
    # Uncomment for coarse progress output:
    #if i % 100 == 0:
    #   print(i)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800


In [268]:
# Show every scraped sequence next to the strain data parsed for it.
for sequence_key, strain_data in all_bacteria.items():
    print(sequence_key, strain_data, '\n')

(u'GLPRKILCAIAKKKGKCKGPLKLVCKC', {u'B. subtilis': {'unit': u'uM', 'value': u'0.4'}}, '\n')
(u'WNDTGKDADGSEY', {}, '\n')
(u'GFGCPGDAYQCSEHCRALGGGRTGGYCAGPWYLGHPTCTCSF', {u'S. agalactiae': {'unit': u'ug/ml', 'value': '0.53'}, u'E. faecium': {'unit': u'ug/ml', 'value': u'16'}, u'S. pneumonia': {'unit': u'ug/ml', 'value': u'0.25'}, u'S. epidermidis': {'unit': u'ug/ml', 'value': u'16'}, u'S. pyogenes': {'unit': u'ug/ml', 'value': '0.53'}, u'S. aureus': {'unit': u'ug/ml', 'value': u'16'}}, '\n')
(u'GFFSLIKGVAKIATKGLAKNLGKMGLDLVGCKISKEC', {}, '\n')
(u'LVKDNPLDISPKQVQALCTDLVIRCMCCC', {u'C. michiganensis': {'unit': u'uM', 'value': u'3.1'}, u'C. albicans': {'unit': u'uM', 'value': u'25'}, u'S. aureus': {'unit': u'uM', 'value': u'3.1'}, u'R. solani': {'unit': u'uM', 'value': u'25'}}, '\n')
(u'SDCNINSNTAADVILCFNQVGSCALCSPTLVGGPVP', {}, '\n')
(u'VKLIQIRIWIQYVTVLQMFSMKTKQ', {}, '\n')
(u'YPELQQDLIARLL', {u'B. subtilis': {'unit': u'uM', 'value': u'3.1'}, u'E. coli': {'unit': u'uM', 'value': u'50'}, u'

(u'VNPSYRLDPESRPQCEAHCGQLGMRLGAIVIMGTATGCVCEPKEAATPESR', {}, '\n')
(u'QYRHRCCAWGPGRKYCKRWC', {}, '\n')
(u'RRICRCRIGRCLGLEVYFGVCFLHGRLARRCCR', {}, '\n')
(u'ASVATELRCQCLQTLQGIHPKNIQSVNVKSPGPHCAQTEVIATLKNGRKACLNPASPIVKKIIEKMLNSDKSN', {}, '\n')
(u'MTPFWRGVSLRPIGASCRDDSECITRLCRKRRCSLSVAQE', {}, '\n')
(u'CGETCIYIPCFTEAVGCKCKDKVCYKN', {}, '\n')
(u'RWRWWRWRR', {}, '\n')
(u'GYYCPFRQDKCHRHCRSFGRKAGYCGNFLKRTCICVKK', {u'P. aeruginosa': {'unit': u'uM', 'value': u'30'}, u'S. aureus': {'unit': u'uM', 'value': u'60'}, u'L. monocytogenes': {'unit': u'uM', 'value': u'15'}, u'L. grayi': {'unit': u'uM', 'value': u'8'}, u'L. fleischmannii': {'unit': u'uM', 'value': u'60'}, u'L. seeligeri': {'unit': u'uM', 'value': u'15'}}, '\n')
(u'FLPFLAGLFGKIF', {u'E. faecalis': {'unit': u'uM', 'value': u'18.8'}, u'S. aureus': {'unit': u'uM', 'value': u'150'}, u'N. asteroides': {'unit': u'uM', 'value': u'75'}}, '\n')
(u'GLPVCGETCFGGTCNTPGCSCTDPICTRD', {}, '\n')
(u'KRGLWESLKRKATKLGDDIRNTLRNFKIKFPVPRQG', {}, '\n')
(u'FQTSE

In [271]:
# Save the results: the dict's string form goes into a file named after the database.
with open(DATABASE_NAME, 'w') as output_file:
    serialized_results = str(all_bacteria)
    output_file.write(serialized_results)