In [None]:
# import necessary modules

import os
from dataclasses import dataclass
from datetime import date, datetime, timedelta
from pprint import pprint
from typing import List, Optional, Tuple

import requests
from bs4 import BeautifulSoup

In [None]:
# template for COVID reports (follows the information from the URL)

@dataclass
class Route:
    """One time-stamped stop on a patient's movement timeline."""
    # Start of the visit.
    from_time: datetime
    # End of the visit; the route parsers pass None when only a start time is listed.
    to_time: Optional[datetime]
    # Free-text description of the place/activity, as scraped.
    description: str
    # Coordinates of the stop, or None when not geocoded.
    # NOTE(review): presumably (latitude, longitude) like Case.addr_gps — confirm.
    addr_gps: Optional[Tuple[float, float]] = None

@dataclass
class Case:
    """One row of the scraped patient table plus details parsed from its expanded text."""
    # Row number from the first table column.
    id: int
    # Case label from the second column, e.g. '용인-383'.
    name: str
    # Address: the table cell text, replaced by the detailed address when the
    # expanded text contains a patient-status section.
    addr: str
    # Sex / age-band cell text, kept as a string.
    age: str
    # Date column of the table; the notebook stores a datetime.date here
    # (a datetime is also accepted since it subclasses date).
    date: date
    # Remaining table columns, stored as scraped text.
    reason: str
    hospital: str
    comment: str
    # (latitude, longitude) from the geocoder, or None when the lookup found nothing.
    addr_gps: Optional[Tuple[float, float]] = None
    # Parsed movement timeline; filled in after construction by parse_routes().
    # Forward reference keeps this class definable on its own.
    routes: Optional[List['Route']] = None
    # Full expanded text of the case, from which the routes are parsed.
    routes_raw: Optional[str] = None
    # Symptom text parsed from the expanded text ('' when not found by the loader).
    symptom: Optional[str] = None

In [None]:
# Target URL: patient-list endpoint on yongin.go.kr.
# The query string requests page 1 (q_currPage=1) with 10 rows per page (q_rowPerPage=10).

url = 'https://www.yongin.go.kr/health/ictsd/INC_selectIctsdPatntList.do?q_currPage=1&q_rowPerPage=10'

In [None]:
# Download the listing page and preview its first 100 characters
# to get a sample for the scraping logic.

# requests has no default timeout, so bound the wait explicitly.
response = requests.get(url, timeout=10)
# Fail here on an HTTP error instead of parsing an error page further down.
response.raise_for_status()
text = response.text
text[:100]

'\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n<script type="text/javascript">\n\t$(document).ready(function(){\r\n\t\t$(\'.btn_plus\').cli'

In [None]:
# Parse the page and collect the cells with class "txt_le expand"
# (the first cell of each patient row).

# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser is
# installed, warns about it, and may build a different tree on another machine.
html = BeautifulSoup(text, 'html.parser')
items = html.find_all(attrs={"class": "txt_le expand"})
items

[<td class="txt_le expand" style="text-align:center !important;">435</td>,
 <td class="txt_le expand" style="text-align:center !important;">434</td>,
 <td class="txt_le expand" style="text-align:center !important;">433</td>,
 <td class="txt_le expand" style="text-align:center !important;">432</td>,
 <td class="txt_le expand" style="text-align:center !important;">431</td>,
 <td class="txt_le expand" style="text-align:center !important;">430</td>,
 <td class="txt_le expand" style="text-align:center !important;">429</td>,
 <td class="txt_le expand" style="text-align:center !important;">428</td>,
 <td class="txt_le expand" style="text-align:center !important;">427</td>,
 <td class="txt_le expand" style="text-align:center !important;">426</td>]

In [None]:
# Take the first matched cell and display its parent element
# (the table row that holds all columns of that case).

item = items[0]
parent = item.parent
parent

<tr>
<td class="txt_le expand" style="text-align:center !important;">435</td>
<td class="txt_le">용인-383</td>
<td class="txt_le">기흥구 영덕동</td>
<td class="txt_le" style="text-align:center !important;">
							여성<br/>(50대)
						</td>
<td class="txt_le" style="text-align:center !important;">10-06</td>
<td class="txt_le">용인외-40번 접촉자</td>
<td class="txt_le"></td>
<td class="txt_le"></td>
<td><span class="btn_plus">더보기</span><span class="btn_minus">닫기</span></td>
</tr>

In [None]:
# Print the text of the last "view_content" element found under the row's parent.
# NOTE(review): index -1 selects the last detail block, which is not necessarily the
# one belonging to `item`; the case-building loop further down indexes by row instead.

detail = parent.parent.find_all(attrs={"class": "view_content"})[-1].text
print(detail)


[환자현황]
○ 용인-376 : 남, 10대
○ 주소 : 수지구 푸른솔로 20, 꽃메마을현대홈타운4차4단지(죽전동)
○ 동거인 : 3명(검사예정)
○ 증상(9.26) : 미각소실, 후각소실
○ 특이사항 : 용인-369 접촉자
 
[발생경위 및 조치사항]
○ 9.30. (수) : 수지구보건소 선별진료소 검체채취
○ 10.1. (목) 09:40 : 민간검사기관(녹십자)에서 양성판정
                          경기도역학조사관 발생보고, 격리병상배정 요청
 
 
추가 조치사항, 세부동선 및 접촉자 정보는 역학조사 완료 후 공개하도록 하겠습니다.
 
- 질병관리본부 ☎ 1339
- 용인시 콜센터 ☎ 1577-1122
- 처인구 보건소 ☎ 031-324-4981
- 기흥구 보건소 ☎ 031-324-6975
- 수지구 보건소 ☎ 031-324-8566
 



In [None]:
# Prototype of the route parser: extracts the time-stamped routes of one case
# from its detail text and collects them in `routes`.

### NOTE THAT THIS CODE ONLY WORKS FOR THE TEST URL
### FOR FUTURE USE, MODIFY IT ACCORDINGLY FOLLOWING THE TEXT FORMATS OF THE URL

# `detail` is already a plain string (it was read from `.text`), so use it directly.
detail_text = detail

pos = detail_text.find('시간대별 주요 동선 및 접촉자 현황')
routes = []

if pos >= 0:
    # Every '○' bullet after the heading is one day; its '-' separated parts are
    # "HH:MM~HH:MM : description" entries.
    route_strs = detail_text[pos:].split('○')[1:]
    for route_str in route_strs:
        date_str, *time_strs = route_str.split('-')
        try:
            month, day = [int(it) for it in date_str.split('.')[:2]]
        except ValueError:
            # Bullet does not start with "month.day"; it is not a route line.
            continue
        for time_str in time_strs:
            try:
                # The last ':' separated piece is the description; what precedes it
                # is the "from~to" time range (which itself contains ':').
                *heads, description = time_str.split(':')
                time_str = ':'.join(heads)
                from_str, *to_strs = time_str.split('~')
                fhour, fminute = [int(it) for it in from_str.strip().split(':')]
                from_time = datetime(year=2020, month=month, day=day, hour=fhour, minute=fminute)
                to_time = None
                if [it for it in to_strs if it.strip()]:
                    to_str = to_strs[0]
                    thour, tminute = [int(it) for it in to_str.strip().split(':')]
                    to_time = datetime(year=2020, month=month, day=day, hour=thour, minute=tminute)
                    if to_time < from_time:
                        # The range crosses midnight (e.g. 15:00~00:30).
                        to_time += timedelta(days=1)
            except ValueError:
                # Not a parsable time entry (e.g. footer lines that also start with '-').
                continue

            description = description.strip()
            print(f'{from_time=}, {to_time=}, {description=}')
            routes.append(Route(
                from_time,
                to_time,
                description,
            ))

routes

In [None]:
# Unpack the stripped text of the row's "txt_le" cells into one variable per column;
# any cells beyond the first eight are discarded by `*_`.
# NOTE(review): `id` shadows the Python builtin for the rest of the session.
children = item.parent.find_all(attrs={"class": "txt_le"})
id, name, addr, age, date_str, reason, hospital, comment, *_  = [it.text.strip() for it in children]

# Display the raw date string (month-day, no year)
date_str

'09-24'

In [None]:
# First "txt_le" cell of the row
child = children[0]

# Display its text (the value that was unpacked into `id`)
child.text

'403'

In [None]:
def parse_routes(case: Case, year: int = 2020):
    '''
    Parse the time-stamped routes out of ``case.routes_raw``.

    The text after the heading '시간대별 주요 동선' is split into '○' bullets
    (one per day, starting with "month.day") and each bullet into '-' separated
    "HH:MM~HH:MM : description" entries. Parts that do not match this layout are
    skipped. Note that this works exclusively for the test URL; for further use
    it should be modified to follow the text format of the new source.

    Parameters:
        case: the case whose ``routes_raw`` text is parsed (None is treated as empty).
        year: year to attach to the parsed dates, since the text only carries
            month and day.

    Returns:
        A list of Route objects in order of appearance; empty when the heading
        is absent. ``to_time`` is None when an entry has no end time, and is moved
        to the next day when the range crosses midnight.
    '''
    raw = case.routes_raw or ''
    pos = raw.find('시간대별 주요 동선')
    routes = []
    if pos < 0:
        return routes

    # Slice the case's own text (not a notebook global) from the heading onwards.
    route_strs = raw[pos:].split('○')[1:]
    for route_str in route_strs:
        date_str, *time_strs = route_str.split('-')
        try:
            month, day = [int(it) for it in date_str.split('.')[:2]]
        except ValueError:
            # Bullet does not start with "month.day"; it is not a route line.
            continue
        for time_str in time_strs:
            try:
                # The last ':' separated piece is the description; what precedes it
                # is the "from~to" time range (which itself contains ':').
                *heads, description = time_str.split(':')
                from_str, *to_strs = ':'.join(heads).split('~')
                fhour, fminute = [int(it) for it in from_str.strip().split(':')]
                from_time = datetime(year=year, month=month, day=day, hour=fhour, minute=fminute)
                to_time = None
                if [it for it in to_strs if it.strip()]:
                    thour, tminute = [int(it) for it in to_strs[0].strip().split(':')]
                    to_time = datetime(year=year, month=month, day=day, hour=thour, minute=tminute)
                    if to_time < from_time:
                        # The range crosses midnight (e.g. 15:00~00:30).
                        to_time += timedelta(days=1)
            except ValueError:
                # Not a parsable time entry (e.g. footer lines that also start with '-').
                continue

            # Keep only the first line of the description.
            routes.append(Route(
                from_time,
                to_time,
                description.split('\n')[0].strip(),
            ))
    return routes

In [None]:
# Build one Case per table row and attach coordinates from the Naver geocode API.
# For usage outside South Korea, use a different geocode API.

GEOCODE_URL = 'https://naveropenapi.apigw.ntruss.com/map-geocode/v2/geocode-js'
# Prefer a client ID supplied through the environment; the literal is kept only as a
# fallback so the notebook keeps running unchanged. Avoid committing real credentials.
GEOCODE_CLIENT_ID = os.environ.get('NCP_APIGW_API_KEY_ID', '83bfuniegk')
# The table only shows month-day, so the year has to be supplied.
REPORT_YEAR = 2020

cases = []

for i, elem in enumerate(html.find_all(attrs={"class": "txt_le expand"})):
    parent = elem.parent
    children = parent.find_all(attrs={"class": "txt_le"})
    case_id, name, addr, age, date_str, reason, hospital, comment, *_ = [it.text.strip() for it in children]
    month, day = date_str.split('-')

    # The i-th "view_content" element holds the expanded text of the i-th row.
    detail = parent.parent.find_all(attrs={"class": "view_content"})[i].text

    symptom = ''
    if '[환자현황]' in detail:
        # Positional layout of the patient-status bullets: the 2nd one is the
        # address and the 4th the symptoms. Guard the indexes so that a shorter
        # report does not abort the whole loop.
        bullets = detail.split('○')
        if len(bullets) > 2:
            addr = bullets[2].split(':')[-1].strip()
        if len(bullets) > 4:
            symptom = bullets[4].split(':')[-1].strip().split('\n')[0].strip()

    # Geocoding is best effort: a failed lookup leaves addr_gps as None.
    addr_gps = None
    try:
        res = requests.get(
            GEOCODE_URL,
            params={
                'query': addr,
                'X-NCP-APIGW-API-KEY-ID': GEOCODE_CLIENT_ID,
            },
            timeout=10,  # requests has no default timeout
        ).json()
    except (requests.RequestException, ValueError) as exc:
        print(f'geocoding failed for {addr!r}: {exc}')
        res = {}

    if res.get('status') == 'OK' and res.get('addresses'):
        real_addr = res['addresses'][0]
        # The API returns x = longitude and y = latitude; store (lat, lng).
        addr_gps = (float(real_addr['y']), float(real_addr['x']))

    case = Case(
        id=int(case_id),
        age=age,
        name=name,
        addr=addr,
        addr_gps=addr_gps,
        date=date(year=REPORT_YEAR, month=int(month), day=int(day)),
        reason=reason,
        hospital=hospital,
        symptom=symptom,
        comment=comment,
        routes_raw=detail
    )
    case.routes = parse_routes(case)
    cases.append(case)

In [None]:
# Preview every case as a record with 'title', 'content' (HTML snippet) and 'latlng' keys

[
    dict(
        title=case.name,
        content=f'''
- 이름: {case.name}<br/>
- 날짜: {case.date}<br/>
- 증상: {case.symptom}<br/>
- 병원: {case.hospital}
'''.strip(),
        latlng=case.addr_gps,
    )
    for case in cases
]

[{'title': '용인-356',
  'content': '- 이름: 용인-356<br/>\n- 날짜: 2020-09-24<br/>\n- 증상: <br/>\n- 병원:',
  'latlng': (37.303883, 127.12345)},
 {'title': '용인외-36',
  'content': '- 이름: 용인외-36<br/>\n- 날짜: 2020-09-21<br/>\n- 증상: 감기 기운<br/>\n- 병원: 안성생활치료센터',
  'latlng': (37.3322771, 127.1101478)},
 {'title': '용인-355',
  'content': '- 이름: 용인-355<br/>\n- 날짜: 2020-09-20<br/>\n- 증상: 발열, 근육통<br/>\n- 병원: 경기도의료원 안성병원',
  'latlng': (37.2770259, 127.1534814)},
 {'title': '용인-354',
  'content': '- 이름: 용인-354<br/>\n- 날짜: 2020-09-19<br/>\n- 증상: 무증상<br/>\n- 병원: 안산생활치료센터',
  'latlng': (37.3357099, 127.085475)},
 {'title': '용인외-35',
  'content': '- 이름: 용인외-35<br/>\n- 날짜: 2020-09-19<br/>\n- 증상: 미각 소실<br/>\n- 병원: 경기도의료원 수원병원',
  'latlng': (37.3072931, 127.0735193)},
 {'title': '용인-353',
  'content': '- 이름: 용인-353<br/>\n- 날짜: 2020-09-19<br/>\n- 증상: 발열, 근육통, 설사<br/>\n- 병원: 경기도의료원 이천병원',
  'latlng': (37.2946074, 127.1270503)},
 {'title': '용인-352',
  'content': '- 이름: 용인-352<br/>\n- 날짜: 2020-09-18<br/>\n- 증상: 무증상<br/>

In [None]:
# Inspect one sample case (index 5 of `cases`): print its name and address,
# then display its parsed routes.

case = cases[5]
print(case.name, case.addr)
case.routes

용인-353 기흥구 언남로 15, 하마비마을 동일하이빌2차(언남동)


[Route(from_time=datetime.datetime(2020, 9, 10, 0, 0), to_time=datetime.datetime(2020, 9, 10, 14, 0), description='자택 기거', addr_gps=None),
 Route(from_time=datetime.datetime(2020, 9, 10, 15, 0), to_time=datetime.datetime(2020, 9, 10, 0, 30), description='타지역 동선(광명시, 자차 이동)', addr_gps=None),
 Route(from_time=datetime.datetime(2020, 9, 11, 1, 30), to_time=datetime.datetime(2020, 9, 11, 14, 0), description='자택 기거', addr_gps=None),
 Route(from_time=datetime.datetime(2020, 9, 11, 15, 0), to_time=datetime.datetime(2020, 9, 11, 0, 30), description='타지역 동선(광명시, 자차 이동)', addr_gps=None),
 Route(from_time=datetime.datetime(2020, 9, 12, 1, 30), to_time=datetime.datetime(2020, 9, 12, 11, 45), description='자택 기거', addr_gps=None),
 Route(from_time=datetime.datetime(2020, 9, 12, 12, 0), to_time=datetime.datetime(2020, 9, 12, 15, 0), description='타지역 동선(서울 금천구, 자차 이동)', addr_gps=None),
 Route(from_time=datetime.datetime(2020, 9, 12, 16, 45), to_time=None, description='귀가 후 자택(자차 이동)', addr_gps=None),
 

In [None]:
# Display the address of the currently selected sample case
case.addr

'기흥구 언남로 15, 하마비마을 동일하이빌2차(언남동)'

In [None]:
# Call the Naver geocode API with a fixed query to inspect the shape of the response

res = requests.get(
    'https://naveropenapi.apigw.ntruss.com/map-geocode/v2/geocode-js',
    params={
        'query': '경기도 용인시',
        # Prefer a client ID from the environment; the literal is only a fallback
        # so the notebook keeps running unchanged. Avoid committing real credentials.
        'X-NCP-APIGW-API-KEY-ID': os.environ.get('NCP_APIGW_API_KEY_ID', '83bfuniegk'),
    },
    timeout=10,  # requests has no default timeout
).json()
res

{'status': 'OK',
 'meta': {'totalCount': 1, 'page': 1, 'count': 1},
 'addresses': [{'roadAddress': '경기도 용인시',
   'jibunAddress': '경기도 용인시',
   'englishAddress': 'Yongin-si, Gyeonggi-do, Republic of Korea',
   'addressElements': [{'types': ['SIDO'],
     'longName': '경기도',
     'shortName': '경기도',
     'code': ''},
    {'types': ['SIGUGUN'], 'longName': '용인시', 'shortName': '용인시', 'code': ''},
    {'types': ['DONGMYUN'], 'longName': '', 'shortName': '', 'code': ''},
    {'types': ['RI'], 'longName': '', 'shortName': '', 'code': ''},
    {'types': ['ROAD_NAME'], 'longName': '', 'shortName': '', 'code': ''},
    {'types': ['BUILDING_NUMBER'],
     'longName': '',
     'shortName': '',
     'code': ''},
    {'types': ['BUILDING_NAME'], 'longName': '', 'shortName': '', 'code': ''},
    {'types': ['LAND_NUMBER'], 'longName': '', 'shortName': '', 'code': ''},
    {'types': ['POSTAL_CODE'], 'longName': '', 'shortName': '', 'code': ''}],
   'x': '127.1775942',
   'y': '37.2410999',
   'distance'

In [None]:
# Display the 'addresses' list of the last geocode response
# (returns None if the key is missing)

res.get('addresses')

[{'roadAddress': '경기도 용인시 기흥구 언남로 15 하마비마을동일하이빌2차아파트',
  'jibunAddress': '경기도 용인시 기흥구 언남동 496 하마비마을동일하이빌2차아파트',
  'englishAddress': '15, Eonnam-ro, Giheung-gu, Yongin-si, Gyeonggi-do, Republic of Korea',
  'addressElements': [{'types': ['SIDO'],
    'longName': '경기도',
    'shortName': '경기도',
    'code': ''},
   {'types': ['SIGUGUN'],
    'longName': '용인시 기흥구',
    'shortName': '용인시 기흥구',
    'code': ''},
   {'types': ['DONGMYUN'], 'longName': '언남동', 'shortName': '언남동', 'code': ''},
   {'types': ['RI'], 'longName': '', 'shortName': '', 'code': ''},
   {'types': ['ROAD_NAME'], 'longName': '언남로', 'shortName': '언남로', 'code': ''},
   {'types': ['BUILDING_NUMBER'],
    'longName': '15',
    'shortName': '15',
    'code': ''},
   {'types': ['BUILDING_NAME'],
    'longName': '하마비마을동일하이빌2차아파트',
    'shortName': '하마비마을동일하이빌2차아파트',
    'code': ''},
   {'types': ['LAND_NUMBER'],
    'longName': '496',
    'shortName': '496',
    'code': ''},
   {'types': ['POSTAL_CODE'],
    'longName': '16918',
 