In [None]:
import re
import requests
from bs4 import BeautifulSoup
DATABASE_NAME = 'DRAMP'
ANTIBACTERIA_LINK_PAGES = 326
bacteria_regex = r'(?P<bacterium>[A-Z]\. [a-z]+)(?P<strain>\s?[A-Z]+\s?[0-9]+)?'

In [None]:
# Find links to each DRAMP peptide page
find_links_base = 'http://dramp.cpu-bioinfor.org/browse/ActivityData.php?order=antibacterial&pageNow='
drampids = []
for i in range(1, ANTIBACTERIA_LINK_PAGES):
    url = find_links_base + str(i)
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'html.parser')
    matches = re.finditer('value\=\"(?P<drampid>DRAMP[0-9]+)\"', str(soup))
    drampids += [m.groupdict()['drampid'] for m in matches]

In [None]:
# Get a list of bacteria names
bacteria_list_url_base = 'http://www.thelabrat.com/protocols/Bacterialspecies/byname'
bacteria_names = []
import string
for letter in string.ascii_uppercase:
    url = bacteria_list_url_base + letter + '.shtml'
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'html.parser')
    text = soup.text
    # start = text.index('Y | Z \n\n\') + len("Y | Z \n\n")
    end = start + text[start:].index('\n\n<!--\ngoogle_ad_client')
    for name in text[start:end].split('\n'):
        if name and name.strip() and all(c.isalpha() or c == ' ' for c in name):
            bacteria_names.append(name)

In [None]:
bacteria_list_url_base_2 = 'https://www.ncbi.nlm.nih.gov/books/NBK'
for i in range(818, 844):
    url = bacteria_list_url_base_2 + str(i)
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'html.parser')
    str_soup = str(soup)
    for match in re.finditer('targettype=tax', str_soup):
        start = match.start()
        while str_soup[start] != '>':
            start += 1
        start += 1
        end = start
        while str_soup[end] != '<':
            end += 1
        if str_soup[start].isupper():
            bacteria_names.append(str_soup[start:end])

In [None]:
bacteria_names = list(set(bacteria_names + bnames))

In [None]:
import pickle
pickle.dump(bacteria_names, open("bacteria_list.txt", "w"))

In [None]:
def get_line_splits(soup):
    text = soup.text
    while text != text.replace('\n\n', '\n'):
        text = text.replace('\n\n', '\n')
    return text.split('\n')

def get_sequence(soup_split_by_line):
    return soup_split_by_line[soup_split_by_line.index('Sequence Length') - 1]

def get_modifications(soup):
    ptm_string = '"comments-PTM">PTM</h4></li><li><h5> '
    text = str(soup)
    if ptm_string not in text:
        return []
    rest = text[text.index(ptm_string) + len(ptm_string):]
    ptm = rest[:rest.index('</')]
    if 'erminal amidation' not in rest:
        return ["Unknown modification: " + ptm]
    return ['C-Terminal Amidation']

def get_references(soup_split_by_line):
    references = []
    reference_splits = soup_split_by_line[soup_split_by_line.index('Literature Information'):]
    for i, line in enumerate(reference_splits):
        if line == 'Reference':
            pubmed_id = reference_splits[i - 1]
            reference = reference_splits[i + 3]
            author = reference_splits[i + 1]
            title = reference_splits[i + 5]
            references.append(
                "PubMed ID: %s. Reference: %s. Author: %s. Title: %s"
                % (pubmed_id, reference, author, title)
            )
    return references

def find_longest_bacteria_matches(bacteria_names, text_section):
    # Find bacteria names that match, using only longest form
    # This way we don't keep the one-word abbreviations when two names are present
    bacteria_matches = [b for b in bacteria_names if ' ' + b in text_section.lower()]
    longest_matches = []
    for match in bacteria_matches:
        is_sub = False
        for other_match in bacteria_matches:
            if match in other_match and len(other_match) > len(match):
                is_sub = True
        if not is_sub:
            longest_matches.append(match)
    return longest_matches

def bacteria_text_to_bacteria_and_strain(bacteria_names, line):
    all_matches = []
    matches_from_full_names = find_longest_bacteria_matches(bacteria_names, line)

    strain = re.search('ATCC\s?[0-9]+', line)
    if strain:
        strain = strain.group(0)

    for match in matches_from_full_names:
        if len(match.split()) > 1:
            try:
                all_matches.append((
                    match[0].upper() + '. ' + match.split()[1],
                    strain.strip() if strain else ''
                ))
            except:
                print("????", match, line)
        else:
            splits = line.lower().split()
            for i, split in enumerate(splits):
                if split == match:
                    try:
                        all_matches.append((
                            match[0].upper() + '. ' + splits[i + 1],
                            strain.strip() if strain else ''
                        ))
                    except:
                        print("?????????", match, line)
            
    for regex_match in re.finditer(bacteria_regex, line):
        bacterium = regex_match.groupdict()['bacterium']
        strain = regex_match.groupdict()['strain']
        all_matches.append((
            bacterium,
            strain.strip() if strain else ''
        ))

    return all_matches

def extract_unit_and_value(line):
    value = re.search('[0-9]+\.?[0-9]*', line)
    if not value:
        return ('', '')
    unit = line[value.end():].strip().replace(')', '')
    return (value.group(0), unit)

def get_mic_data(soup_split_by_line, _bacteria_names):
    bacteria_names_lower = [b.lower() for b in bacteria_names]
    all_bacteria = {}
    mic_line_index = sorted(
        [i for i in range(len(soup_split_by_line)) if soup_split_by_line[i] == 'Target Organism']
    )[1] + 1
    mic_line = soup_split_by_line[mic_line_index]
    bacterium_or_unit_and_value_fields = re.split('(\(MIC\s?.*?\))', mic_line)  # Alternating between unit/values and bacteria
    for i, mic_split in enumerate(bacterium_or_unit_and_value_fields):
        if re.search('(\(MIC\s?.*?\))', mic_split):
            bacteria_and_strains = bacteria_text_to_bacteria_and_strain(bacteria_names_lower, bacterium_or_unit_and_value_fields[i - 1])
            unit, value = extract_unit_and_value(mic_split)
            for (bacterium, strain) in bacteria_and_strains:
                all_bacteria[(bacterium, strain)] = {
                    'unit': unit,
                    'value': value
                }
    return all_bacteria

def get_hemolysis_data(soup):  # Return the sentence with the hemolytic data, leaving parsing for later
    text = soup.text
    if 'hemoly' in text:
        for sentence in text.split('.'):
            if 'hemoly' in sentence:
                return sentence
    return None

In [None]:
amps = {}

In [None]:
def parse_soup(soup):
    ssplits = get_line_splits(soup)
    sequence = get_sequence(ssplits)
    if not sequence:
        return None, None
    if 'Patent Information' in str(soup):
        # print('Patent Information')
        return None, None
    modifications = get_modifications(soup)
    references = get_references(ssplits)
    mic_data = get_mic_data(ssplits, bacteria_names)
    url_sources = [url]
    hemolysis_data = get_hemolysis_data(soup)
    return (
        sequence, 
        {
            'modifications': modifications,
            'references': references,
            'bacteria': mic_data,
            'url_sources': url_sources,
            'hemolysis': hemolysis_data,
        }
    )

In [None]:
for i, drampid in enumerate(drampids[len(amps):]):
    url = url_base + drampid
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'html.parser')

    ssplits = get_line_splits(soup)

    sequence = get_sequence(ssplits)
    if not sequence:
        continue

    sequence, results = parse_soup(soup)
    if sequence:
        amps[sequence] = results

    if i % 10 == 0:
        print(i)

In [375]:
with open(DATABASE_NAME + ".data", 'w') as f:
    f.write(str(amps))

In [376]:
amps

{u'LKLLKKLLKLLKKLGK': {'bacteria': {},
  'hemolysis': None,
  'modifications': [],
  'references': [u'PubMed ID: 7726486. Reference: Haynie SL, Crum GA, Doele BA.. Author: Antimicrob Agents Chemother. 1995 Feb;39(2):301-307.. Title: Antimicrobial activities of amphiphilic peptides covalently bonded to a water-insoluble resin.'],
  'url_sources': ['http://dramp.cpu-bioinfor.org/browse/All_Information.php?id=DRAMP04039']},
 u'GLPRKILCAIAKKKGKCKGPLKLVCKC': {'bacteria': {},
  'hemolysis': u'addTo("chartContainer");\n\t\t\t\t            };\n\t\t\t\t        \n\n\t\t\t\t             browser doesn\'t support html5\n\t\t\t\t        \nAmino Acid Distribution\n \n\n\n\n\nComments Information\n\n\n\nFunction The synthetic lasiocepsin possessed antimicrobial activity against both Gram-positive and -negative bacteria, antifungal activity against Candida albicans, and no hemolytic activity against human erythrocytes',
  'modifications': [],
  'references': [u'PubMed ID: 22038181. Reference: Monincov\