In [151]:
import re
import requests
from bs4 import BeautifulSoup
# Pattern for abbreviated organism names such as 'E. coli': one capital letter,
# a period, a single space, then one or more lowercase ASCII letters.
bacteria_regex = r'[A-Z]\. [a-z]+'

In [200]:
def get_sequence(splits):
    """Return the entry that directly follows the 'Sequence:' marker.

    `splits` is a list of text lines. Raises ValueError when the marker is
    absent and IndexError when the marker is the last element.
    """
    marker_position = splits.index('Sequence:')
    return splits[marker_position + 1]

def get_additional_info(splits):
    """Return the entry that directly follows the 'Additional info:' marker.

    `splits` is a list of text lines. Raises ValueError when the marker is
    absent and IndexError when the marker is the last element.
    """
    marker_position = splits.index('Additional info:')
    return splits[marker_position + 1]

def _parse_strains_from_additional_info(info):
    bacteria = {} 
    fields = info.split(')')
    for f in fields:
        unit_splits = f.split('MIC')
        if len(unit_splits) < 2:
            continue
        unit = unit_splits[1]
        unit_string = unit
        for c in unit:
            if not(c.isalnum() or c in ['.', '/', '-']):
                unit_string = unit_string.replace(c, '')
        if not any(c.isnumeric() for c in unit_string):
            continue
        unit_string = unit_string.strip()  # Remove start and ending spaces
        unit_string_splits = unit_string.split()
        if len(unit_string_splits) < 2:
            continue
        value = unit_string_splits[0]
        if '-' in value:  # Hyphenated
            bounds = value.split('-')
            value = str(sum(float(b) for b in bounds)/len(bounds))
        unit_dict = {
            'value': value,
            'unit': unit_string_splits[1]
        }
        bacteria_split = unit_splits[0]
        for bacteria_list in bacteria_split.split(','):  # Commas not allowed in bacteria names
            for bacterium in re.findall(bacteria_regex, bacteria_list):
                bacteria[bacterium] = unit_dict
    return bacteria

In [201]:
def parse_strains(splits):
    """Return the per-organism MIC data for one page.

    `splits` is the page text as a list of lines. The entry after the
    'Additional info:' marker is extracted and handed to the strain parser.
    """
    return _parse_strains_from_additional_info(get_additional_info(splits))

In [202]:
# Accumulator filled by the download loop: sequence -> parsed strain data.
all_bacteria = dict()
# Query URL prefix; the numeric entry ID is appended to it.
url_base = 'http://aps.unmc.edu/AP/database/query_output.php?ID='
# Exclusive upper bound for the entry IDs visited by the download loop.
NUM_BACTERIA = 2887

In [None]:
# Download every entry page and store its parsed strain data under the entry's
# sequence. IDs run from 1 to NUM_BACTERIA - 1 (range() excludes the upper bound).
failed_ids = {}  # entry ID -> repr of the error, for pages that were skipped
for i in range(1, NUM_BACTERIA):
    url = url_base + str(i)
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        content = response.content
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.text
        splits = text.split('\n')
        sequence = get_sequence(splits)
        all_bacteria[sequence] = parse_strains(splits)
    except (requests.RequestException, ValueError, IndexError) as err:
        # One unreachable or malformed page must not abort the whole scrape.
        # Record it so it can be inspected or retried afterwards.
        failed_ids[i] = repr(err)
        print('ID %d skipped: %r' % (i, err))
    if i % 100 == 0:
        print(i)

100
200


In [172]:
# Dump every scraped sequence together with its parsed strain data.
for sequence_key, strain_data in all_bacteria.items():
    print(sequence_key, strain_data)

(u'GLPRKILCAIAKKKGKCKGPLKLVCKC', {u'B. subtilis': {'unit': u'uM', 'value': u'0.4'}})
(u'WNDTGKDADGSEY', {})
(u'GFGCPGDAYQCSEHCRALGGGRTGGYCAGPWYLGHPTCTCSF', {u'S. agalactiae': {'unit': u'ug/ml', 'value': u'0.061'}, u'E. faecium': {'unit': u'ug/ml', 'value': u'16'}, u'S. pneumonia': {'unit': u'ug/ml', 'value': u'0.25'}, u'S. epidermidis': {'unit': u'ug/ml', 'value': u'16'}, u'S. pyogenes': {'unit': u'ug/ml', 'value': u'0.061'}, u'S. aureus': {'unit': u'ug/ml', 'value': u'16'}})
(u'GFFSLIKGVAKIATKGLAKNLGKMGLDLVGCKISKEC', {})
(u'LVKDNPLDISPKQVQALCTDLVIRCMCCC', {u'C. michiganensis': {'unit': u'uM', 'value': u'3.1'}, u'C. albicans': {'unit': u'uM', 'value': u'25'}, u'S. aureus': {'unit': u'uM', 'value': u'3.1'}, u'R. solani': {'unit': u'uM', 'value': u'25'}})
(u'SDCNINSNTAADVILCFNQVGSCALCSPTLVGGPVP', {})
(u'VKLIQIRIWIQYVTVLQMFSMKTKQ', {})
(u'YPELQQDLIARLL', {u'B. subtilis': {'unit': u'uM', 'value': u'3.1'}, u'E. coli': {'unit': u'uM', 'value': u'50'}, u'S. aureus': {'unit': u'uM', 'value': u

(u'TPPQS', {})
(u'CIGNGGRCNENVGPPYCCSGFCLRQPNQGYGVCRNR', {})
(u'FFGHLFRGIINVGKHIHGLLSG', {u'E. coli': {'unit': u'uM', 'value': u'5'}, u'E. cloacae': {'unit': u'uM', 'value': u'25'}})
(u'FCTMIPIPRCY', {})
(u'NEMGGPLVVEARTCESQSHKFKGTCLSDTNCANVCHSERFSGGKCRGFRRRCFCTTHC', {})
(u'FPPPGESAVDMSFFYALSNP', {})
(u'LLEDGTTEILDHVCNFRVMPRLRSWELYFRGDVWCPGWTVIKGESLTRSRTRVVNKAVADFAQKALAQGLITQEDAQPLLE', {})
(u'KTKKKLLKKT', {})
(u'GLFTLIKCAYQLIAPTVACN', {})
(u'SALVGCWTKSYPPNPCFGRG', {u'S. aureus': {'unit': u'uM', 'value': u'11.7'}, u'C. albicans': {'unit': u'uM', 'value': u'46.7'}})
(u'GDPTFCGETCRVIPVCTYSAALGCTCDDRSDGLCKRN', {})
(u'INMKASAAVAKKLL', {u'P. aeruginosa': {'unit': u'ug/ml', 'value': u'120'}, u'E. ocli': {'unit': u'ug/ml', 'value': u'60'}, u'E. coli': {'unit': u'ug/ml', 'value': u'15'}, u'S. aureus': {'unit': u'ug/ml', 'value': u'1.93.75'}, u'C. albicans': {'unit': u'ug/ml', 'value': u'15'}})
(u'GWMSKIASGIGTFLSGMQQ', {})
(u'GLLSGILNTAGGLLGNLIGSLSN', {})
(u'ISQSDAILSAIWSGIKSLF', {u'M. luteus': 

In [181]:
# Spot check: fetch one entry and print the strain data parsed from it.
url = 'http://aps.unmc.edu/AP/database/query_output.php?ID=01846'
response = requests.get(url)
content = response.content
soup = BeautifulSoup(content, 'html.parser')
text = soup.text
splits = text.split('\n')
sequence = get_sequence(splits)
strain_data = parse_strains(splits)
print(strain_data)

{u'B. pyocyaneus': {'unit': u'uM', 'value': '9.9'}, u'C. albicans': {'unit': u'uM', 'value': '9.9'}, u'S. aureus': {'unit': u'uM', 'value': '9.9'}, u'E. coli': {'unit': u'uM', 'value': '9.9'}}


In [199]:
# Leftover interactive debugging: opens a post-mortem pdb session on the most
# recent uncaught exception. It blocks waiting for input and only makes sense
# right after a failure -- NOTE(review): remove before sharing or re-running
# the notebook top to bottom.
import pdb; pdb.pm()

> <ipython-input-196-a947b888cd3b>(28)<genexpr>()
-> value = str(sum(float(b) for b in bounds)/len(bounds))
(Pdb) up
> <ipython-input-196-a947b888cd3b>(28)_parse_strains_from_additional_info()
-> value = str(sum(float(b) for b in bounds)/len(bounds))
(Pdb) unit_string
u'11- 22 uM'
(Pdb) list 22
 17  	            if not(c.isalnum() or c in ['.', '/', ' ', '-']):
 18  	                unit_string = unit_string.replace(c, '')
 19  	        if not any(c.isnumeric() for c in unit_string):
 20  	            continue
 21  	        unit_string = unit_string.strip()  # Remove start and ending spaces
 22  	        unit_string_splits = unit_string.split()
 23  	        if len(unit_string_splits) < 2:
 24  	            continue
 25  	        value = unit_string_splits[0]
 26  	        if '-' in value:  # Hyphenated
 27  	            bounds = value.split('-')
(Pdb) list 17
 12  	        if len(unit_splits) < 2:
 13  	            continue
 14  	        unit = unit_splits[1]
 15  	        unit_string

KeyboardInterrupt: 