### Import and Logger

In [1]:
import csv
import requests
import pandas as pd
from bs4 import BeautifulSoup

from log import CrawlerLog

# Disease keyword: names the input CSV, the output CSV, and the crawler log.
DISEASE_NAME = 'mers'
# Directory that holds the input CSV read in the "Load Data" cell.
RAW_DATA_PATH = './rawdata'
# Project logger wrapper; its `.logger` attribute receives the error/debug
# records emitted while crawling.
crawer_log = CrawlerLog(DISEASE_NAME)

## Load Data

In [2]:
# Read the raw export for the configured disease and report its dimensions.
raw_csv_path = f'{RAW_DATA_PATH}/{DISEASE_NAME}.csv'
data = pd.read_csv(raw_csv_path)
print(data.shape)

(39, 27)


### Append columns initialized to empty strings

In [3]:
# Add the four output columns, each starting out as an empty string, in the
# order they should appear in the frame.
for new_column in ('field1', 'inclusion_creteria', 'field2', 'exclusion_creteria'):
    data[new_column] = ''

## Crawl Data

### 데이터 요청 및 수집

In [4]:
def get_data(text):
    """Parse the criteria area out of an HTML document.

    Locates the ``<div>`` whose string is exactly 'Criteria', takes the first
    sibling element that follows it, and returns that element's direct child
    tags.

    Parameters
    ----------
    text : str
        HTML markup to parse (the caller passes ``response.text``).

    Returns
    -------
    list
        Direct child tags of the criteria area, in document order. An empty
        list is returned when the 'Criteria' label is missing or nothing
        follows it, so callers can treat "no criteria" as missing items
        instead of handling an exception.
    """
    soup = BeautifulSoup(text, 'lxml')

    # find() yields None when no matching <div> exists; guard before chaining.
    criteria_label = soup.find('div', string='Criteria')
    if criteria_label is None:
        return []

    # The criteria area is the first element after the label.
    following = criteria_label.find_next_siblings()
    if not following:
        return []

    # recursive=False keeps only the top-level children of the area.
    return list(following[0].find_all(True, recursive=False))

def parse_creteria(li_tags):
    """Join the text of each list item into one newline-separated string.

    Parameters
    ----------
    li_tags : iterable
        Objects exposing a ``.text`` attribute (the caller passes the result
        of ``.select('li')``).

    Returns
    -------
    str
        The item texts joined with '\\n'; an empty string when there are no
        items.
    """
    item_texts = [li.text for li in li_tags]
    # join() of an empty list is already '', so no extra fallback is needed.
    return '\n'.join(item_texts)

### criteria contents 예외 처리

In [5]:
# Fetch every trial page and fill the four criteria columns of its row.
#
# Iterating data.index visits every row, including the first one: read_csv has
# already consumed the header line, so the first index label is a real record.
for index in data.index:
    url = data.at[index, 'URL']

    # A timeout keeps one stalled connection from hanging the whole crawl, and
    # a transport failure is logged instead of aborting the remaining rows.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        crawer_log.logger.error(f'{url},REQUEST FAILED,{exc}')
        continue

    # Only a 200 response is parsed. Skipping the row here guarantees that
    # `contents` from a previous iteration is never written into this row.
    if response.status_code != 200:
        crawer_log.logger.error(f'{url},INVALID URL,{response.status_code}')
        continue

    # A page without a criteria area is treated as having no items, so both
    # sections below are reported as missing rather than stopping the loop.
    try:
        contents = get_data(response.text)
    except (AttributeError, IndexError):
        contents = []

    # Assign the columns. The criteria area is read as
    # [inclusion title, inclusion list, exclusion title, exclusion list];
    # each section is filled independently so a missing exclusion part does
    # not discard an inclusion part that was found.
    for offset, title_column, criteria_column, label in (
        (0, 'field1', 'inclusion_creteria', 'Inclusion'),
        (2, 'field2', 'exclusion_creteria', 'Exclusion'),
    ):
        try:
            data.loc[index, title_column] = contents[offset].text or ''
            data.loc[index, criteria_column] = parse_creteria(
                contents[offset + 1].select('li')
            )
        except (IndexError, AttributeError):
            # Section absent or shaped differently: record which URL/section.
            crawer_log.logger.debug(f'{url},{label}')

https://ClinicalTrials.gov/show/NCT02788188,Exclusion
https://ClinicalTrials.gov/show/NCT00843882,Exclusion


In [6]:
# Write the enriched table to the result directory, one file per disease.
result_csv_path = f'./result/{DISEASE_NAME}.csv'
data.to_csv(result_csv_path)