In [None]:
import re
import requests
from bs4 import BeautifulSoup
# Label for this dataset; the final cell uses it as the stem of the output
# file name (DATABASE_NAME + ".data").
DATABASE_NAME = 'PEP_LIFE'
# First and last record ids requested from the site; the scraping loop treats
# both bounds as inclusive (it iterates up to MAX_PEPTIDE_ID + 1).
MIN_PEPTIDE_ID = 1001
MAX_PEPTIDE_ID = 3212

In [None]:
def get_row(soup, row_string):
    """Return the first ``<tr>`` of ``soup`` that mentions ``row_string``.

    Rows are examined in the order ``soup.find_all('tr')`` yields them. A row
    matches when ``row_string`` occurs in the row's ``str()`` form or, failing
    that, in the ``.string`` of any of its ``<td>`` cells.

    Returns:
        The matching row object, or ``None`` when no row matches.
    """
    for candidate in soup.find_all('tr'):
        # Cheap check first: the text appears somewhere in the row's markup.
        matched = row_string in str(candidate)
        # Otherwise look at each cell's own string (cells without a single
        # string child report ``None`` and are ignored).
        matched = matched or any(
            cell.string and row_string in cell.string
            for cell in candidate.find_all('td')
        )
        if matched:
            return candidate
    return None

def get_sequence(soup):
    """Return the peptide sequence shown on a detail page, or ``None``.

    The value is read from the second ``<td>`` of the row that ``get_row``
    finds for ``'SEQUENCE'`` (presumably the first cell holds the label).

    Returns:
        The cell's ``.string`` when it is truthy, otherwise the cell's
        ``.text``. ``None`` when the page has no such row or the row has no
        second cell, so callers can treat both cases as "no record here"
        instead of hitting an IndexError.
    """
    sequence_row = get_row(soup, 'SEQUENCE')
    if sequence_row is None:
        return None

    cells = sequence_row.find_all('td')
    if len(cells) < 2:
        # Row matched but carries no value cell.
        return None

    value_cell = cells[1]
    # ``.string`` is None when the cell has nested markup; fall back to the
    # concatenated text in that case.
    return value_cell.string or value_cell.text

def get_half_life_data(soup):
    """Collect the half-life measurement fields from a detail page.

    Each field is taken from the second ``<td>`` of the row that ``get_row``
    finds for its label: ``'Converted Half-'``, ``'TEST SAMPLE'`` and
    ``'IN VIVO'``.

    Returns:
        A dict with keys ``'half_life_seconds'``, ``'test_sample'`` and
        ``'vivo_vitro'``. Each value is the cell's ``.string`` (or its
        ``.text`` when ``.string`` is falsy), or ``None`` when the page lacks
        that row or the row has no second cell. Values are returned as found
        on the page; no numeric conversion is performed.
    """
    def field(label):
        # Value of the labelled row, or None when the row/cell is absent
        # (previously this raised AttributeError/IndexError).
        row = get_row(soup, label)
        if row is None:
            return None
        cells = row.find_all('td')
        if len(cells) < 2:
            return None
        value_cell = cells[1]
        return value_cell.string or value_cell.text

    return {
        'half_life_seconds': field('Converted Half-'),
        'test_sample': field('TEST SAMPLE'),
        'vivo_vitro': field('IN VIVO'),
    }
    
def get_references(soup):
    """Return the reference link(s) listed on a detail page.

    Looks up the row that ``get_row`` finds for ``'PMID'`` and reads the
    ``href`` of the first ``<a>`` inside it.

    Returns:
        A one-element list holding that ``href``, or an empty list when the
        page has no such row, the row has no link, or the link has no
        ``href`` (previously the first two cases raised AttributeError and
        the last produced ``[None]``).
    """
    references_row = get_row(soup, 'PMID')
    if references_row is None:
        return []

    link = references_row.find('a')
    if link is None:
        return []

    href = link.get('href')
    return [href] if href else []

def get_modifications(soup):
    """List the modification tags that apply to the peptide on a detail page.

    Five rows are read (second ``<td>`` of each, ``.string`` with ``.text``
    as fallback): N-terminal modification, C-terminal modification,
    linear/cyclic, stereochemistry and chemical modifications.

    Returns:
        A list of tags, always in this order when present:
        ``'C-Terminal Amidation'`` (C-terminal value contains ``'Am'``),
        ``'N-Terminal'`` (N-terminal value is not ``'Free'``),
        ``'Cyclic'`` (lower-cased value contains ``'yclic'``),
        ``'Stereochemistry'`` (value is not ``'L'``), and
        ``'Chemical Modification: <value>'`` (value is not ``'None'``).
    """
    def field(label):
        # Second cell of the labelled row; prefers .string, falls back to .text.
        value_cell = get_row(soup, label).find_all('td')[1]
        return value_cell.string or value_cell.text

    n_term = field('N-TER MODIFICATION')
    c_term = field('C-TER MODIFICATION')
    topology = field('LINEAR/CYCLIC')
    chirality = field('STEREO-CHEMISTRY')
    chemical = field('CHEMICAL MODIFICATIONS')

    # (applies?, tag) pairs, kept in the order the tags must be reported.
    checks = (
        ('Am' in c_term, 'C-Terminal Amidation'),
        (n_term != 'Free', 'N-Terminal'),
        ('yclic' in topology.lower(), 'Cyclic'),
        (chirality != 'L', 'Stereochemistry'),
        (chemical != "None", 'Chemical Modification: ' + chemical),
    )
    return [tag for applies, tag in checks if applies]

In [None]:
# Scraped records keyed by (sequence, modifications); filled in by the
# scraping loop. Re-running this cell discards everything collected so far.
amps = dict()

In [None]:
# Fetch every detail page in the configured id range and fold it into `amps`.
#
# Resuming: the start id is MIN_PEPTIDE_ID + len(amps). Assuming `amps` is only
# ever filled by this loop, len(amps) (distinct keys) can never exceed the
# number of ids already visited, so a re-run starts at or before the first
# unvisited id. Pages seen again are recognised by their URL and not re-added.
url_base = 'http://crdd.osdd.net/raghava/peplife/display_sub.php?details='
# Without a timeout requests waits indefinitely on a stalled connection.
REQUEST_TIMEOUT_SECONDS = 30

for i in range(MIN_PEPTIDE_ID + len(amps), MAX_PEPTIDE_ID + 1):
    url = url_base + str(i)
    content = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS).content
    soup = BeautifulSoup(content, 'html.parser')

    sequence = get_sequence(soup)
    if not sequence:
        # No sequence on this page: nothing to record for this id.
        continue

    modifications = tuple(get_modifications(soup))
    references = get_references(soup)
    half_life_data = get_half_life_data(soup)

    key = (sequence, modifications)
    entry = amps.get(key)
    if entry is None:
        amps[key] = {
            'modifications': modifications,
            'references': references,
            'half_life_data': [(half_life_data, url)],
            'url_sources': [url]
        }
    elif url not in entry['url_sources']:
        # Same peptide reported on another page: keep its measurement and
        # provenance instead of silently dropping them.
        entry['half_life_data'].append((half_life_data, url))
        entry['url_sources'].append(url)
        for reference in references:
            if reference not in entry['references']:
                entry['references'].append(reference)

    if i % 100 == 0:
        print(i)

In [None]:
# Write the collected records to "<DATABASE_NAME>.data" as the dict's
# printed (str) form.
output_path = '{}.data'.format(DATABASE_NAME)
with open(output_path, 'w') as data_file:
    serialized = str(amps)
    data_file.write(serialized)