In [82]:
import re
import requests
from bs4 import BeautifulSoup
# Short name of the source database; used as the stem of the output file name.
DATABASE_NAME = 'DADP'

In [None]:
# Read the saved DADP listing and pull out every peptide identifier, i.e. every
# token of the form SP_<letters/digits/underscores>.
with open("DADP_list.txt", "r") as f:
    initial_links_text = f.read()
# re.findall already returns a list, in order of appearance.
links = re.findall(r"SP_[a-zA-Z0-9_]+", initial_links_text)

In [None]:
def get_sequences(data_table):
    """Return the 'Sequence' column of a peptide data table.

    The first <tr> of ``data_table`` is treated as the header row and every
    following <tr> as a data row. All cell texts are stripped of surrounding
    whitespace before use.

    Returns:
        A list with one sequence string per data row, in table order.

    Raises:
        ValueError: if no header cell reads exactly 'Sequence'.
        IndexError: if the table has no rows, or a data row is shorter than
            the position of the 'Sequence' column.
    """
    def cell_texts(tr):
        # Stripped text of every <td> in one table row.
        return [td.text.strip() for td in tr.findAll('td')]

    all_rows = data_table.findAll('tr')
    header, body = all_rows[0], all_rows[1:]
    column = cell_texts(header).index('Sequence')
    return [cell_texts(tr)[column] for tr in body]

def get_modifications(data_table):
    """Return the modifications recorded for every peptide row of a data table.

    The first <tr> of ``data_table`` is the header row; the column whose
    header reads 'Amidated' decides the result for each following row.

    Returns:
        A list with one entry per data row. Each entry is itself a list:
        ``['C-Terminal']`` when the row's 'Amidated' cell reads 'Yes',
        otherwise ``[]``.

    Raises:
        ValueError: if no header cell reads exactly 'Amidated'.
        IndexError: if the table has no rows, or a data row is shorter than
            the position of the 'Amidated' column.
    """
    rows = data_table.findAll('tr')
    label_row = [td.text.strip() for td in rows[0].findAll('td')]
    modifications_index = label_row.index('Amidated')

    modifications = []
    for row in rows[1:]:
        amidated = row.findAll('td')[modifications_index].text.strip()
        # Always produce a list so that modified and unmodified peptides share
        # one type; a bare string here would be iterated character by
        # character by any consumer that expects the list form.
        modifications.append(['C-Terminal'] if amidated == 'Yes' else [])
    return modifications
    

def get_mic_data(data_table):
    """Extract the MIC columns and the HC50 (hemolysis) column of a data table.

    The first <tr> of ``data_table`` is the header row. Each header cell is
    classified by the string of its first child node:

    * containing 'MIC': the string of the cell's second child node is taken as
      the bacterium name, and the stripped text of that column in every data
      row is stored under that name;
    * otherwise containing 'HC50': the first number found in that column of
      every data row is collected ('' when the cell holds no number).

    Header cells with no children, or whose first child has no single string,
    are ignored.

    Returns:
        A tuple ``(bacteria_columns, hemolysis_column)`` where
        ``bacteria_columns`` maps bacterium name to a list of raw cell strings
        (one per data row) and ``hemolysis_column`` is a list of numeric
        strings (one per data row for each HC50 column found; empty when the
        table has no HC50 column).
    """
    rows = data_table.findAll('tr')
    header_cells = rows[0].findAll('td')
    value_rows = [
        [td.text.strip() for td in row.findAll('td')]
        for row in rows[1:]
    ]

    bacteria_columns = {}
    hemolysis_column = []
    for column_index, header_cell in enumerate(header_cells):
        contents = header_cell.contents
        # .string is None for a node with several children; treat such a
        # header (and an empty one) as unlabelled instead of raising.
        label = (contents[0].string if contents else None) or ''
        if 'MIC' in label:
            bacteria = contents[1].string
            bacteria_columns[bacteria] = [
                value_row[column_index] for value_row in value_rows
            ]
        elif 'HC50' in label:
            for value_row in value_rows:
                # Integer or decimal number. The dot is escaped and the
                # fractional part optional, so plain values such as '50' are
                # kept rather than discarded.
                number = re.search(r'[0-9]+(?:\.[0-9]+)?', value_row[column_index])
                hemolysis_column.append(number.group(0) if number else '')

    return bacteria_columns, hemolysis_column

def get_references(reference_table):
    """Collect the reference strings listed after the 'References:' label.

    Rows of ``reference_table`` are scanned in order. Once a cell whose string
    is exactly 'References:' has been seen, the strings of the remaining cells
    found in that same <tr> are collected, and scanning stops at the end of
    that row.

    Skipped while collecting:
    * cells that only carry a list number ('1.', '10.', ...) and cells whose
      string starts with a digit followed by a dot;
    * cells without a single string (``cell.string`` is None).

    Returns:
        A list of reference strings; empty when no 'References:' cell exists.
    """
    references = []
    in_reference_row = False
    for tr in reference_table.findAll('tr'):
        for cell in tr.findAll('td'):
            cell_string = cell.string
            if in_reference_row and cell_string is not None:
                # First alternative: the original "digit + dot" prefix test.
                # Second alternative: multi-digit numbering cells such as
                # '10.', which a two-character prefix check cannot recognise.
                if not re.match(r'\d\.|\d+\.\s*$', cell_string):
                    references.append(cell_string)
            if cell_string == 'References:':
                in_reference_row = True
        if in_reference_row:
            # References live in the row that holds the label; later rows
            # belong to other sections.
            return references
    return references

In [None]:
# Scraped records keyed by peptide sequence; filled in by the scraping loop below.
amps = {}
# Concentration unit stored with every MIC value.
# NOTE(review): the unit printed in each page's column header is never consulted --
# confirm that all DADP MIC values really are in uM.
unit = 'uM'

In [None]:
url_base = 'http://split4.pmfst.hr/dadp/?a=kartica&id='
# Seconds to wait for the server before giving up on a page; without a timeout
# a stalled connection would hang the cell indefinitely.
request_timeout = 30

# Resume support: skip pages whose URL is already recorded in amps. Counting
# len(amps) is not a valid offset into links, because amps is keyed by
# sequence and one page can contribute several sequences (or a sequence that
# another page already contributed).
already_scraped = set(
    source for entry in amps.values() for source in entry['url_sources']
)
pending_links = [link for link in links if url_base + link not in already_scraped]

for i, link in enumerate(pending_links):
    url = url_base + link
    response = requests.get(url, timeout=request_timeout)
    # Fail on an HTTP error page instead of trying to parse it as a peptide card.
    response.raise_for_status()
    content = response.content
    soup = BeautifulSoup(content, 'html.parser')

    tables = soup.findAll('table')
    reference_table = tables[1]
    data_table = tables[2]
    sequences = get_sequences(data_table)
    bacteria_columns, hemolysis_column = get_mic_data(data_table)
    modifications_column = get_modifications(data_table)
    # References belong to the page, not to a single sequence: parse them once.
    references = get_references(reference_table)
    for s_index, sequence in enumerate(sequences):
        amps[sequence] = {
            # hemolysis_column is empty when the page has no HC50 column.
            'hemolysis': (
                hemolysis_column[s_index] if s_index < len(hemolysis_column) else ''
            ),
            'url_sources': [url],
            'modifications': modifications_column[s_index],
            # Copy so that entries from the same page do not share one list object.
            'references': list(references),
            'bacteria': {
                (bacterium, None): {'unit': unit, 'value': bacteria_columns[bacterium][s_index]}
                for bacterium in bacteria_columns
            }
        }
    # Lightweight progress indicator: one line every ten pages.
    if i % 10 == 0:
        print(i)

In [83]:
# Persist the scraped records to '<DATABASE_NAME>.data'. The file holds the
# dictionary's string form (str(amps)), not JSON.
with open(DATABASE_NAME + ".data", 'w') as f:
    f.write(str(amps))

In [84]:
# Spot-check a single scraped record, looked up by its peptide sequence.
amps['GLFDVVKGVLKGAGKNVAGSLLEQLKCKLSGGC']

{'bacteria': {(u'E. coli', None): {'unit': 'uM', 'value': u'30'},
  (u'S. aureus', None): {'unit': 'uM', 'value': u'30'}},
 'hemolysis': '',
 'modifications': [],
 'references': [u'J.M. Conlon et al. / Toxicon 50 (2007) 746-756. Cytolytic peptides belonging to the brevinin-1 and brevinin-2 families isolated from the skin of the Japanese brown frog, Rana dybowskii'],
 'url_sources': ['http://split4.pmfst.hr/dadp/?a=kartica&id=SP_P0C5X2']}

In [None]:
# Preview a handful of records instead of echoing the whole scraped dictionary,
# which would flood the notebook output and bloat the saved file.
dict(list(amps.items())[:5])