In [None]:
import re
import requests
from bs4 import BeautifulSoup
# Pattern for abbreviated species names: one capital letter, a period, a space,
# then one or more lowercase letters (e.g. "E. coli"). Passed to re.findall when
# strain names are extracted from the text preceding each MIC measurement.
bacteria_regex = r'[A-Z]\. [a-z]+'

In [None]:
def get_sequence(splits):
    """Return the entry that directly follows the 'Sequence:' marker.

    Args:
        splits: list of text lines from a record page.

    Raises:
        ValueError: if 'Sequence:' is not one of the entries.
        IndexError: if 'Sequence:' is the final entry.
    """
    marker_position = splits.index('Sequence:')
    following_position = marker_position + 1
    return splits[following_position]

def get_additional_info(splits):
    """Return the entry that directly follows the 'Additional info:' marker.

    Args:
        splits: list of text lines from a record page.

    Raises:
        ValueError: if 'Additional info:' is not one of the entries.
        IndexError: if 'Additional info:' is the final entry.
    """
    info_position = 1 + splits.index('Additional info:')
    return splits[info_position]

def _geometric_mean(b0, b1):
    """Return the geometric mean of two numbers (the midpoint on a log scale)."""
    return (b0 * b1) ** (0.5)


def _parse_mic_measurement(mic_text):
    """Turn the text that follows 'MIC' into ``{'value': str, 'unit': str}``.

    Returns None when the text holds no usable measurement.
    """
    # Keep only alphanumerics and the few symbols that occur in values and units.
    cleaned = ''.join(c for c in mic_text if c.isalnum() or c in './ -')
    if not any(c.isnumeric() for c in cleaned):
        return None
    # Ranges are sometimes written with spaces ("11- 22 uM", "11 - 22 uM");
    # close them up so the whole range is a single token.
    cleaned = re.sub(r'(?<=\d) *- *(?=\d)', '-', cleaned)
    tokens = cleaned.split()
    if len(tokens) < 2:
        return None
    # First token is the value, second the unit; trailing words are ignored.
    value, unit = tokens[0], tokens[1]
    if '-' in value:  # Sometimes they give a range, like "11-22 uM"
        bounds = value.split('-')
        if len(bounds) != 2:
            print('Bounds too long or too short')
            return None
        try:
            low, high = float(bounds[0]), float(bounds[1])
        except ValueError:
            # A hyphenated word (e.g. "non-toxic"), not a numeric range.
            return None
        value = str(_geometric_mean(low, high))  # Geometric mean (log scale)
    return {'value': value, 'unit': unit}


def _parse_strains_from_additional_info(info, strain_pattern=None):
    """Extract per-strain MIC measurements from an 'Additional info' string.

    The text is split on ')' into fields. In each field, the text before the
    first 'MIC' is searched for strain names and the text after it is parsed
    as a value followed by a unit. A numeric range ("11-22 uM") is replaced
    by the geometric mean of its two bounds.

    Args:
        info: free-text description, e.g. "E. coli (MIC 5 uM), ...".
        strain_pattern: regex used to find strain names. Defaults to the
            module-level ``bacteria_regex``.

    Returns:
        dict mapping each strain name to its own ``{'value': str, 'unit': str}``
        dict. Fields without 'MIC', without a digit, with fewer than two
        tokens, or with a hyphenated value that is not a two-number range are
        skipped. A later measurement for the same strain replaces an earlier one.
    """
    pattern = bacteria_regex if strain_pattern is None else strain_pattern
    bacteria = {}
    for field in info.split(')'):
        parts = field.split('MIC')
        if len(parts) < 2:
            continue
        measurement = _parse_mic_measurement(parts[1])
        if measurement is None:
            continue
        for name_chunk in parts[0].split(','):  # Commas not allowed in bacteria names
            for bacterium in re.findall(pattern, name_chunk):
                # Copy so strains from one field do not share a mutable dict.
                bacteria[bacterium] = dict(measurement)
    return bacteria

In [None]:
def parse_strains(splits):
    """Return the strain measurements found in a record's 'Additional info' entry.

    Args:
        splits: list of text lines from a record page.

    Returns:
        The dict produced by ``_parse_strains_from_additional_info`` for the
        entry that follows the 'Additional info:' marker.
    """
    return _parse_strains_from_additional_info(get_additional_info(splits))

In [None]:
# Scrape configuration and the accumulator that the download loop fills in.
DATABASE_NAME = 'UNMC'  # also the path the results are written to
NUM_BACTERIA = 2887  # exclusive upper bound of the record IDs that are requested

# A record ID is appended to this prefix to form each page URL.
url_base = 'http://aps.unmc.edu/AP/database/query_output.php?ID='

# Maps each record's sequence to its parsed strain measurements.
all_bacteria = dict()

In [None]:
# Download every record page and store its strain measurements under the
# record's sequence. Records that share a sequence overwrite one another.
# NOTE(review): range(1, NUM_BACTERIA) stops at NUM_BACTERIA - 1 -- confirm the
# final ID is not meant to be fetched as well.
for i in range(1, NUM_BACTERIA):
    url = url_base + str(i)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        content = response.content
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.text
        splits = text.split('\n')
        sequence = get_sequence(splits)
        all_bacteria[sequence] = parse_strains(splits)
    except (requests.RequestException, ValueError, IndexError) as error:
        # A failed request or a page without the expected markers should not
        # discard everything collected so far; report it and move on.
        print('Skipping ID {}: {}'.format(i, error))
    if i % 100 == 0:
        print(i)

In [None]:
# Save the collected mapping as the text of its Python dict literal.
# The encoding is pinned because scraped text may contain non-ASCII characters,
# which the platform default encoding cannot always represent.
with open(DATABASE_NAME, 'w', encoding='utf-8') as f:
    f.write(str(all_bacteria))