## Get data from NIID Japan

In [70]:
import requests
from bs4 import BeautifulSoup
import re
import os
from datetime import datetime

def extract_info(text, year):
    """Parse the reporting period of a surveillance week from free text.

    The text is searched for ``week N（M/D～M/D）`` written with full-width
    brackets and a wave dash. The page only gives month/day, so the year is
    supplied by the caller.

    Args:
        text: Text expected to contain the week/date-range pattern.
        year: Year the week belongs to (int or numeric string), or None.

    Returns:
        A ``(start_date, end_date)`` tuple of ``datetime`` objects, or
        ``(None, None)`` when the pattern is absent or ``year`` is None.
    """
    match = re.search(r'week (\d+)\（(\d+/\d+)～(\d+/\d+)\）', text)
    if match is None or year is None:
        return None, None

    year = int(year)
    week = int(match.group(1))
    start_date = datetime.strptime(f"{year}/{match.group(2)}", "%Y/%m/%d")
    end_date = datetime.strptime(f"{year}/{match.group(3)}", "%Y/%m/%d")

    # A week that spans New Year has its two dates in different calendar
    # years. Stamping both with the same year would put the end before the
    # start, so move whichever date falls outside the reporting year.
    if end_date < start_date:
        if week == 1:
            # First week of the year: it started in the previous December.
            start_date = start_date.replace(year=year - 1)
        else:
            # Last week of the year: it ends in the following January.
            end_date = end_date.replace(year=year + 1)

    return start_date, end_date

def extract_title(text):
    """Pull the year and week number out of a page title.

    Looks for a four-digit year followed by `` week `` and a number.

    Returns:
        ``(year, week)`` as the matched strings (leading zeros are kept),
        or ``(None, None)`` when the title does not contain the pattern.
    """
    found = re.search(r'(\d{4}) week (\d+)', text)
    if found is None:
        return None, None
    return found.group(1), found.group(2)

def get_jp_data(i):
    """Scrape one surveillance-table page and download the CSV files it links.

    Every link on the page whose target ends in ``.csv`` is downloaded to
    ``JP/raw data/{n} IDWR {year} week {week}.csv``, where ``n`` is the
    position of the link on the page (0-based).

    Args:
        i: Value of the ``start`` query parameter selecting the page.

    Returns:
        dict with keys ``title``, ``year``, ``week``, ``start_date``,
        ``end_date`` and ``url``. The dates are None when the page has no
        paragraph, the title has no year, or the date pattern is not found.

    Raises:
        requests.HTTPError: a request returned an error status.
        requests.Timeout: a request did not answer within the timeout.
    """
    url = f"https://www.niid.go.jp/niid/en/survaillance-data-table-english.html?start={i}"
    out_dir = 'JP/raw data'
    # Without a timeout a single stalled connection blocks its worker forever.
    timeout = 30

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')

    title = soup.find('h2').text
    title = re.sub(r'[\n\t]', '', title)
    year, week = extract_title(title)

    # date info is in the first paragraph
    first_paragraph = soup.find('p')
    if first_paragraph is None or year is None:
        start_date, end_date = None, None
    else:
        start_date, end_date = extract_info(first_paragraph.text, year)

    # get csv links, resolving relative ones against the page URL
    csv_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.lower().endswith('.csv'):
            if not href.startswith('http'):
                href = requests.compat.urljoin(url, href)
            csv_links.append(href)

    # download every csv file; the link position becomes the file-name prefix
    os.makedirs(out_dir, exist_ok=True)
    for link_index, csv_url in enumerate(csv_links):
        csv_response = requests.get(csv_url, timeout=timeout)
        csv_response.raise_for_status()

        # get file extension
        _, file_extension = os.path.splitext(csv_url)

        with open(f'{out_dir}/{link_index} IDWR {year} week {week}{file_extension}', 'wb') as f:
            f.write(csv_response.content)

    return {
        "title": title,
        "year": year,
        "week": week,
        "start_date": start_date,
        "end_date": end_date,
        "url": url,
    }

In [71]:
# Try the scraper on a single page (start=426) before running it on all pages.
get_jp_data(426)

{'title': 'IDWR Surveillance Data Table 2016 week 01',
 'year': '2016',
 'week': '01',
 'start_date': datetime.datetime(2016, 1, 4, 0, 0),
 'end_date': datetime.datetime(2016, 1, 10, 0, 0),
 'url': 'https://www.niid.go.jp/niid/en/survaillance-data-table-english.html?start=426'}

In [72]:
from multiprocessing import Pool
import multiprocessing

# Values passed to get_jp_data as the page's ?start= parameter (1..604).
# results[k] therefore belongs to start = k + 1.
id_range = range(1, 605, 1)

# Use about 90% of the cores, but never fewer than one worker:
# int(0.9 * 1) is 0 and Pool(processes=0) raises ValueError.
num_processes = max(1, int(0.9 * multiprocessing.cpu_count()))
with Pool(processes=num_processes) as pool:
    results = pool.map(get_jp_data, id_range)

## Reading Data

In [53]:
import pandas as pd
# Opt in to the pandas "future" behaviour for downcasting.
# NOTE(review): per the pandas docs this stops fillna/ffill/replace from
# silently downcasting object-dtype results — confirm for the installed version.
pd.set_option('future.no_silent_downcasting', True)

In [81]:
def _placeholder_frame(end_date, year, week, url):
    """Build the one-row, all-empty frame returned when a file cannot be used."""
    return pd.DataFrame({
        "Area": [None],
        "Date": [end_date],
        "Year": [year],
        "Week": [week],
        "Disease": [None],
        1: [None],
        2: [None],
        "Cases": [None],
        "URL": [url],
    })


def clean_data(i):
    """Reshape one downloaded weekly CSV into a long table.

    The metadata for the week is taken from ``results[i]``. The CSV is read
    without a header (skipping its first three lines). The rows above the
    first row whose second cell is a plain integer are treated as
    (multi-level) column headers, and the rows from there on as data. The
    data is melted to one row per (area, column) and the header levels are
    joined back on.

    Args:
        i: Index into the module-level ``results`` list.

    Returns:
        DataFrame with columns ``Area, Date, Year, Week, Disease, 1, 2,
        Cases, URL``. If the file cannot be read, or no data row is found,
        a message is printed and a single placeholder row with empty
        ``Area``/``Disease``/``Cases`` is returned instead.
    """
    # Look the fields up by key rather than relying on dict ordering.
    entry = results[i]
    year, week = entry["year"], entry["week"]
    end_date, url = entry["end_date"], entry["url"]

    # The file-name prefix is the position of the CSV link on the page.
    # NOTE(review): presumably the wanted table moved from the 2nd to the
    # 1st link in 2018 — confirm against the source pages.
    prefix = 0 if int(year) >= 2018 else 1
    file_name = f'JP/raw data/{prefix} IDWR {year} week {week}.csv'

    try:
        df = pd.read_csv(file_name, header=None, skiprows=3)
    except Exception:
        # Best effort: keep a placeholder so the missing week stays visible.
        # (Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit through.)
        print(f"Error reading {file_name}, i = {i}")
        return _placeholder_frame(end_date, year, week, url)

    # find the data start row: first row whose second cell is all digits
    start_row = None
    if df.shape[1] > 1:
        for row_pos, value in enumerate(df.iloc[:, 1]):
            if re.match(r'^\d+$', str(value)):
                start_row = row_pos
                break
    if start_row is None:
        print(f"No data rows found in {file_name}, i = {i}")
        return _placeholder_frame(end_date, year, week, url)

    # column names: one row per original column, one column per header level;
    # forward-fill so merged header cells apply to every column they span
    df_names = df.iloc[:start_row].T.reset_index()
    df_names = df_names.ffill(axis=0)

    # reshape the data rows to long format, keyed by original column position
    df.columns = range(df.shape[1])
    df = df.iloc[start_row:]
    df = df.melt(id_vars=[0], value_vars=range(1, df.shape[1]), var_name='State', value_name='Cases')
    df = df.rename(columns={0: 'Area'})

    # attach the header levels by column position
    df = pd.merge(df, df_names, left_on='State', right_on='index', how='left')
    df = df.rename(columns={0: 'Disease'})
    # files with fewer header levels lack columns 1 and/or 2
    for level in (1, 2):
        if level not in df.columns:
            df[level] = ''

    df['Date'] = end_date
    df['Year'] = year
    df['Week'] = week
    df['URL'] = url

    return df[['Area', 'Date', 'Year', 'Week', 'Disease', 1, 2, 'Cases', 'URL']]

In [82]:
# Inspect the metadata scraped for one page (list index 425).
results[425]

{'title': 'IDWR Surveillance Data Table 2016 week 01',
 'year': '2016',
 'week': '01',
 'start_date': datetime.datetime(2016, 1, 4, 0, 0),
 'end_date': datetime.datetime(2016, 1, 10, 0, 0),
 'url': 'https://www.niid.go.jp/niid/en/survaillance-data-table-english.html?start=426'}

In [83]:
# Clean a single week as a check before processing every file.
df = clean_data(425)
# unique values in the Disease column
df['Disease'].unique()

array(['Influenza(excld. avian influenza and pandemic influenza)',
       'Respiratory syncytial virus infection',
       'Pharyngoconjunctival fever', 'Group A streptococcal pharyngitis',
       'Infectious gastroenteritis', 'Chickenpox',
       'Hand, foot and mouth disease', 'Erythema infection',
       'Exanthem subitum', 'Pertussis', 'Herpangina', 'Mumps',
       'Acute hemorrhagic conjunctivitis',
       'Epidemic keratoconjunctivitis', 'Bacterial meningitis',
       'Aseptic meningitis', 'Mycoplasma pneumonia',
       'Chlamydial pneumonia(excluding psittacosis)',
       'Infectious gastroenteritis (only by Rotavirus)'], dtype=object)

In [84]:
from multiprocessing import Pool
import multiprocessing

# One task per scraped page; clean_data takes an index into results.
id_range = range(0, len(results), 1)

# Use about 90% of the cores, but never fewer than one worker:
# int(0.9 * 1) is 0 and Pool(processes=0) raises ValueError.
num_processes = max(1, int(0.9 * multiprocessing.cpu_count()))
with Pool(processes=num_processes) as pool:
    alldata = pool.map(clean_data, id_range)

# Stack the per-week frames into one long table.
final_df = pd.concat(alldata, ignore_index=True)

# save to csv
final_df.to_csv('./JP/AllData.csv', index=False)

Error reading JP/raw data/1 IDWR 2012 week 32.csv, i = 603


  final_df = pd.concat(alldata, ignore_index=True)


In [85]:
# last rows of the combined data
final_df.tail()

Unnamed: 0,Area,Date,Year,Week,Disease,1,2,Cases,URL
3208028,Oita,NaT,2012,33,Chlamydial pneumonia(excluding psittacosis),per sentinel,,-,https://www.niid.go.jp/niid/en/survaillance-da...
3208029,Miyazaki,NaT,2012,33,Chlamydial pneumonia(excluding psittacosis),per sentinel,,-,https://www.niid.go.jp/niid/en/survaillance-da...
3208030,Kagoshima,NaT,2012,33,Chlamydial pneumonia(excluding psittacosis),per sentinel,,-,https://www.niid.go.jp/niid/en/survaillance-da...
3208031,Okinawa,NaT,2012,33,Chlamydial pneumonia(excluding psittacosis),per sentinel,,-,https://www.niid.go.jp/niid/en/survaillance-da...
3208032,,NaT,2012,32,,,,,https://www.niid.go.jp/niid/en/survaillance-da...


In [86]:
# Coverage table: one row per Year, one column per Week,
# 1 where that (Year, Week) appears in final_df and 0 where it does not.
yw_df = (
    final_df[['Year', 'Week']]
    .drop_duplicates()                    # distinct (Year, Week) pairs
    .sort_values(by=['Year', 'Week'])
    .reset_index(drop=True)
    .assign(value=1)                      # marker for "present"
    .pivot(index='Year', columns='Week', values='value')
    .fillna(0)                            # absent weeks become 0
)
# write the coverage table to disk
yw_df.to_csv('./JP/YearWeek.csv')

In [91]:
# Drop rows without a disease name (e.g. placeholder rows for unreadable files)
pertussis_df = final_df.dropna(subset=['Disease'])
# Keep rows whose Disease mentions pertussis and whose first sub-header
# mentions "Current"; na=False treats missing text as "no match" instead of
# producing NaN in the mask, which boolean indexing rejects.
pertussis_df = pertussis_df[pertussis_df['Disease'].str.contains('Pertussis', case=False, na=False)]
pertussis_df = pertussis_df[pertussis_df[1].str.contains('Current', case=False, na=False)]
# Keep only the 'Total No.' rows; copy so the column assignment below acts on
# an independent frame rather than a slice (avoids SettingWithCopyWarning).
pertussis_df = pertussis_df[pertussis_df['Area'].isin(['Total No.'])].copy()
# Keep Year >= 2015
pertussis_df['Year'] = pertussis_df['Year'].astype(int)
pertussis_df = pertussis_df[pertussis_df['Year'] >= 2015]

# Arrange by Year and Week
pertussis_df = pertussis_df.sort_values(by=['Year', 'Week'])
# save to csv
pertussis_df.to_csv('./JP/pertussis.csv', index=False)