In [None]:
import os
import re

import requests
from bs4 import BeautifulSoup
from selenium import webdriver  # We need this for the Javascript
# Pattern for an abbreviated binomial name: one capital letter, a period,
# a space, then a lowercase word (e.g. "E. coli").
bacteria_regex = r'[A-Z]\. [a-z]+'
# Same pattern prefixed with the literal text "MIC ".
bacteria_mic_regex = r'MIC ' + bacteria_regex
# Used as the stem of the output file name when the scraped data is dumped.
DATABASE_NAME = 'DBAASP'
# Location of the chromedriver binary. Set the CHROMEDRIVER_PATH environment
# variable to override it; the original hard-coded path remains the default so
# existing setups keep working. The variable name keeps its original spelling
# because the browser-creation cell refers to it by this name.
chrome_driver_excecutable_path = os.environ.get(
    'CHROMEDRIVER_PATH', '/Users/zswitten/Documents/chromedriver'
)

In [None]:
def find_monomer_table(soup):
    """Return the first ``<table>`` whose markup mentions ``monomerDetailTable``.

    Args:
        soup: Parsed page (anything exposing ``find_all('table')``).

    Returns:
        The first matching table in document order, or ``None`` if no table
        contains the marker text.
    """
    return next(
        (
            candidate
            for candidate in soup.find_all('table')
            if 'monomerDetailTable' in str(candidate)
        ),
        None,
    )

def find_unusual_acid_table(soup):
    """Return the first ``<table>`` whose markup contains ``Unusual or Modified``.

    Args:
        soup: Parsed page (anything exposing ``find_all('table')``).

    Returns:
        The first matching table in document order, or ``None`` if none match.
    """
    marker = 'Unusual or Modified'
    for candidate in soup.find_all('table'):
        if marker not in str(candidate):
            continue
        return candidate
    return None

def find_sequence(soup):
    """Extract the peptide sequence string from the monomer detail table.

    The column is located by finding the ``Sequence`` header; the header index
    is shifted down by one because the first ``<th>`` holds the table name and
    has no matching ``<td>``.

    Args:
        soup: Parsed peptide-card page.

    Returns:
        The ``.string`` of the sequence cell, or ``None`` when the monomer
        table is missing, has no ``Sequence`` header, has no data cells, or
        the computed column index falls outside the available cells.
    """
    monomer_table = find_monomer_table(soup)
    if monomer_table is None:
        # Page without a monomer table: nothing to extract.
        return None

    sequence_index = None
    for i, th in enumerate(monomer_table.find_all('th')):
        if th.string == 'Sequence':
            sequence_index = i - 1  # Subtract one for table name

    cells = monomer_table.find_all('td')
    if sequence_index is None or not cells:
        return None
    if not 0 <= sequence_index < len(cells):
        # Header/cell counts disagree; avoid IndexError or a wrap-around index.
        return None
    return cells[sequence_index].string

def find_references(soup):
    """Build a single reference string for a peptide-card page.

    The prose part is the page text between the word ``References`` and the
    ``Latest Publications`` section marker, with newlines removed and the
    ``.Pubmed`` link label collapsed to a period. Every PubMed URL found in the
    page markup is then appended, each preceded by ``' | '``.

    Args:
        soup: Parsed page exposing ``.text`` and whose ``str()`` is its markup.

    Returns:
        The combined reference string. If either text marker is missing the
        prose part is empty (instead of raising ``ValueError``); PubMed links
        found in the markup are still appended.
    """
    _r = 'References'
    end_marker = '\n\n\n Latest Publications\n\n'
    # Dots escaped so they only match literal periods in the host name.
    pubmed_pattern = r'http://www\.ncbi\.nlm\.nih\.gov/pubmed/[0-9]*'

    text = soup.text
    reference_text = ''
    start = text.find(_r)
    if start != -1:
        start += len(_r)
        # Look for the end marker only after the start marker so an earlier
        # occurrence cannot produce an inverted (empty) slice.
        end = text.find(end_marker, start)
        if end != -1:
            reference_text = text[start:end].replace('\n', '')
            reference_text = reference_text.replace('.Pubmed', '.')

    # finditer yields nothing when there are no links, so no pre-check needed.
    for reference_link in re.finditer(pubmed_pattern, str(soup)):
        reference_text += ' | ' + reference_link.group(0)
    return reference_text

def find_mic_table(soup):
    """Return the first ``<table>`` containing ``Activity Against Target Species``.

    Args:
        soup: Parsed page (anything exposing ``find_all('table')``).

    Returns:
        The first matching table in document order, or ``None`` if none match.
    """
    matching_tables = filter(
        lambda candidate: 'Activity Against Target Species' in str(candidate),
        soup.find_all('table'),
    )
    return next(matching_tables, None)

def find_hemolysis_table(soup):
    """Return the first ``<table>`` whose markup contains ``Hemolytic and Cytotoxic``.

    Args:
        soup: Parsed page (anything exposing ``find_all('table')``).

    Returns:
        The first matching table in document order, or ``None`` if none match.
    """
    found = None
    for candidate in soup.find_all('table'):
        if 'Hemolytic and Cytotoxic' in str(candidate):
            found = candidate
            break
    return found

In [None]:
def find_modifications(soup):
    """Collect every modification label found on a peptide-card page.

    Args:
        soup: Parsed peptide-card page.

    Returns:
        The labels from the monomer table followed by the labels from the
        unusual-acid table, as one list.
    """
    terminus_modifications = modifications_from_monomer_table(soup)
    unusual_acid_modifications = modifications_from_unusual_acids(soup)
    return terminus_modifications + unusual_acid_modifications


def modifications_from_monomer_table(soup):
    """Read N- and C-terminus modifications from the monomer detail table.

    Columns are located by header text; each header index is shifted down by
    one because the first ``<th>`` holds the table name and has no ``<td>``.
    The N-terminus header is matched under the misspelling ``N Termimus`` as
    well as the corrected ``N Terminus``.

    Args:
        soup: Parsed peptide-card page.

    Returns:
        A list containing ``'N-Terminus: <value>'`` and/or
        ``'C-Terminus: <value>'`` (in that order) for each terminus cell with a
        non-empty string. Returns an empty list when the table is missing or
        has no data cells; a terminus whose header is absent is skipped.
    """
    monomer_table = find_monomer_table(soup)
    if monomer_table is None:
        return []

    cterminus_index = None
    nterminus_index = None
    for i, th in enumerate(monomer_table.find_all('th')):
        if th.string == 'C Terminus':
            cterminus_index = i - 1
        elif th.string in ('N Termimus', 'N Terminus'):  # first spelling is [sic]
            nterminus_index = i - 1

    tds = monomer_table.find_all('td')
    if not tds:
        return []

    modifications = []
    # N-terminus first, then C-terminus, to keep the original output order.
    for label, index in (('N-Terminus: ', nterminus_index),
                         ('C-Terminus: ', cterminus_index)):
        if index is None or not 0 <= index < len(tds):
            # Header not present (or out of range for this row): skip it
            # rather than raising.
            continue
        terminus_string = tds[index].string
        if terminus_string:
            modifications.append(label + terminus_string)

    return modifications

def modifications_from_unusual_acids(soup):
    """Flag whether the page lists any unusual or modified amino acids.

    Args:
        soup: Parsed peptide-card page.

    Returns:
        ``['Unusual Acid']`` when the unusual-acid table has at least one
        ``<td>`` cell; otherwise an empty list, including when the table is
        not present on the page.
    """
    unusual_acid_table = find_unusual_acid_table(soup)
    if unusual_acid_table is None:
        # The lookup returns None when no table matches; treat as "none listed".
        return []
    if unusual_acid_table.find_all('td'):
        return ['Unusual Acid']
    return []

In [None]:
def species_to_bacteria(species_cell):
    """Parse a species table cell into an ``(abbreviated name, strain)`` pair.

    Example: a cell whose string is ``'Escherichia coli ATCC 25922'`` yields
    ``('E.coli', 'ATCC 25922')``.

    Args:
        species_cell: Object with a ``.string`` attribute (may be ``None``).

    Returns:
        ``None`` when the cell text contains no "Genus species" pair; otherwise
        a tuple of the abbreviated name (genus initial, period, species, with
        no space) and the strain, where the strain is the first run of capital
        letters followed by digits, or ``None`` if there is no such run.
    """
    cell_text = species_cell.string or ''
    # Raw strings: '\s' in a plain literal is an invalid escape sequence.
    bacterium_match = re.search(r'[A-Z][a-z]+\s[a-z]+', cell_text)
    if not bacterium_match:
        return None
    full_name = bacterium_match.group(0)
    bacterium = full_name[0] + '.' + full_name.split()[1]

    strain_match = re.search(r'[A-Z]+\s?[0-9]+', cell_text)
    strain = strain_match.group(0) if strain_match else None
    return (bacterium, strain)

def find_mic_data(soup):
    """Extract activity values per target organism from the activity table.

    Args:
        soup: Parsed peptide-card page.

    Returns:
        ``None`` when the page has no activity table. Otherwise a dict keyed by
        the value returned from ``species_to_bacteria`` for the row's species
        cell, mapping to ``{'unit': <unit cell string>, 'value': <first number
        found in the activity cell, as a string>}``. Only rows whose activity
        measure is MIC, LC50, IC50 or LD50 are kept. A later row with the same
        key overwrites an earlier one.

    Raises:
        ValueError: If one of the expected column headers is missing.
    """
    mic_table = find_mic_table(soup)
    if not mic_table:
        return None

    # The first <th> is skipped so that header positions line up with the
    # <td> positions in each data row.
    headers = [th.string for th in mic_table.find_all('th')[1:]]
    target_species_index = headers.index('Target Species')
    activity_measure_index = headers.index('Activity Measure')
    activity_index = headers.index('Activity')
    unit_index = headers.index('Unit')
    min_cells = max(target_species_index, activity_measure_index,
                    activity_index, unit_index) + 1

    accepted_measures = ('MIC', 'LC50', 'IC50', 'LD50')
    all_bacteria = {}

    # The first two <tr> elements are skipped, as in the original scraper.
    for row in mic_table.find_all('tr')[2:]:
        tds = row.find_all('td')
        if len(tds) < min_cells:
            # Row too short to hold every indexed column; skip it instead of
            # raising IndexError.
            continue

        bacterium = species_to_bacteria(tds[target_species_index])
        if not bacterium:
            continue

        # Raw string: '\.' in a plain literal is an invalid escape sequence.
        value_match = re.search(r'[0-9]+\.?[0-9]*', tds[activity_index].string or '')
        if not value_match:
            continue

        if tds[activity_measure_index].string in accepted_measures:
            all_bacteria[bacterium] = {
                'unit': tds[unit_index].string,
                'value': value_match.group(0),
            }
    return all_bacteria

def find_hemolysis_data(soup):
    """Extract one record per data row from the hemolysis/cytotoxicity table.

    Args:
        soup: Parsed peptide-card page.

    Returns:
        ``None`` when the page has no such table. Otherwise a list of dicts
        with the keys ``target_cell``, ``unit``, ``peptide_concentration`` and
        ``activity_measure``, each holding the ``.string`` of the matching
        cell. Rows with too few cells to cover those columns are skipped.

    Raises:
        ValueError: If one of the expected column headers is missing.
    """
    hemolysis_table = find_hemolysis_table(soup)
    if hemolysis_table is None:
        return None

    # The first <th> is skipped so that header positions line up with the
    # <td> positions in each data row.
    headers = [th.string for th in hemolysis_table.find_all('th')[1:]]
    # Insertion order matches the key order of the records produced before.
    column_index = {
        'target_cell': headers.index('Target Cell'),
        'unit': headers.index('Unit'),
        'peptide_concentration': headers.index('Peptide Concentration'),
        'activity_measure': headers.index('Activity Measure for Lysis'),
    }
    min_cells = max(column_index.values()) + 1

    hemolysis_data = []
    # The first two <tr> elements are skipped, as in the original scraper.
    for row in hemolysis_table.find_all('tr')[2:]:
        tds = row.find_all('td')
        if len(tds) < min_cells:
            # Row too short to hold every indexed column; skip it instead of
            # raising IndexError.
            continue
        hemolysis_data.append(
            {key: tds[index].string for key, index in column_index.items()}
        )
    return hemolysis_data

In [None]:
# Load the peptide ids to scrape: the text before the first tab on each line
# of the monomer listing file.
with open("DBAASP_monomers.txt", 'r') as f:
    text = f.read()

# The first four lines are skipped (presumably a file header — confirm against
# the export format). Taking the part before the first tab with split() keeps
# a tab-less line intact (slicing with find() == -1 dropped its last
# character), and blank lines no longer yield empty ids.
ids = [
    candidate_id
    for candidate_id in (
        line.split('\t', 1)[0].strip() for line in text.split('\n')[4:]
    )
    if candidate_id
]

In [None]:
# Accumulator for scraped peptides, filled by the scraping loop:
# sequence string -> record dict.
amps = dict()

In [None]:
# Start the Chrome session that the scraping loop uses to load each peptide
# page (via browser.get / browser.page_source).
# NOTE(review): newer Selenium releases replace the `executable_path` keyword
# with a `Service` object — confirm the installed Selenium version before
# changing this call.
# NOTE(review): no cell visible here calls browser.quit(); the session appears
# to stay open after scraping — confirm and close it if so.
browser = webdriver.Chrome(executable_path=chrome_driver_excecutable_path)

In [None]:
# Scrape one DBAASP peptide card per id and store the parsed record in `amps`,
# keyed by peptide sequence.
# NOTE(review): the starting offset is len(amps), but `amps` only gains an
# entry when a sequence is found and is keyed by sequence, so after an
# interruption the offset may not equal the number of ids already processed.
# NOTE(review): two ids sharing a sequence overwrite each other's record.
for id_no, peptide_id in enumerate(ids[len(amps):], start=len(amps)):
    url_base = 'https://dbaasp.org/peptide-card?id='
    url = url_base + str(peptide_id)

    # Parse the browser's current page source for this peptide card.
    browser.get(url)
    soup = BeautifulSoup(browser.page_source, 'html.parser')

    sequence = find_sequence(soup)
    modifications = find_modifications(soup)
    references = find_references(soup)
    mic_data = find_mic_data(soup)
    hemolysis_data = find_hemolysis_data(soup)

    if sequence:
        amps[sequence] = dict(
            hemolysis=hemolysis_data,
            bacteria=mic_data,
            references=references,
            modifications=modifications,
            url_sources=[url],
        )

    # Progress report every 50 ids.
    if not id_no % 50:
        print(id_no, peptide_id)

In [None]:
# Write the textual representation of the scraped dict to "<DATABASE_NAME>.data".
with open(f"{DATABASE_NAME}.data", 'w') as f:
    # repr() of a plain dict is the same text that str() produces.
    f.write(repr(amps))