## Get data

- https://www.health.gov.au/resources/collections/nndss-fortnightly-reports

In [24]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import os
from datetime import datetime
from multiprocessing import Pool
import multiprocessing

def get_file_links(url, timeout=30):
    """Return the first spreadsheet download link found on a web page.

    Fetches ``url``, walks every ``<a href=...>`` element in document order
    and returns the first href ending in ``.csv``, ``.xlsx`` or ``.xls``,
    resolved against ``https://www.health.gov.au``.

    Args:
        url: Address of the page to scan.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The absolute URL of the first matching link, or ``None`` when the
        page contains no such link.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (including when the timeout expires).
    """
    # Imported locally so the function works without changes to the
    # file-level import block.
    from urllib.parse import urljoin

    # Without a timeout a single stalled connection blocks the crawl forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    for link in soup.find_all('a', href=True):
        href = link.get('href')
        if href.endswith(('.csv', '.xlsx', '.xls')):
            # urljoin prefixes site-relative hrefs with the host but leaves
            # already-absolute hrefs intact (plain concatenation would
            # produce "https://www.health.gov.auhttps://...").
            return urljoin('https://www.health.gov.au', href)

    return None

def get_file_lists(url, timeout=30):
    """Collect every fortnightly-report publication linked from a page.

    Fetches ``url`` and, for each ``<a href=...>`` whose href contains the
    NNDSS fortnightly-report publication path, looks up the data file on
    that publication page via ``get_file_links``.

    Args:
        url: Address of the page that lists the report publications.
        timeout: Seconds to wait for the listing request before giving up.

    Returns:
        A list of ``(link text, publication page URL, data file URL)``
        tuples in document order. The data file URL is ``None`` when
        ``get_file_links`` finds no spreadsheet on the publication page.

    Raises:
        requests.exceptions.RequestException: If the listing page cannot be
            fetched (including when the timeout expires).
    """
    # Imported locally so the function works without changes to the
    # file-level import block.
    from urllib.parse import urljoin

    # Loop-invariant, so defined once instead of on every iteration.
    pattern = '/resources/publications/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports'

    # Without a timeout a single stalled connection blocks the crawl forever.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    links_with_text = []
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        if pattern not in href:
            continue

        text = link.get_text(strip=True)
        # urljoin prefixes site-relative hrefs with the host but leaves
        # already-absolute hrefs intact.
        page_url = urljoin('https://www.health.gov.au', href)
        file_link = get_file_links(page_url)
        links_with_text.append((text, page_url, file_link))

    return links_with_text

def download_file(link_with_text, timeout=60):
    """Download one report's data file into the ``./AU`` folder.

    Args:
        link_with_text: Sequence whose item 0 is the report title (used as
            the file name) and whose item 2 is the data file URL, i.e. one
            entry of the list returned by ``get_file_lists``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The path of the written file: ``./AU/<title><extension>``, where the
        extension is taken from the URL's path component.

    Raises:
        TypeError: If the data file URL is ``None``.
        requests.exceptions.RequestException: If the download fails
            (including when the timeout expires).
    """
    # Imported locally so the function works without changes to the
    # file-level import block.
    from urllib.parse import urlparse

    folder_path = './AU'
    file_name = link_with_text[0]
    url = link_with_text[2]
    if url is None:
        # Same exception type os.path.splitext(None) would raise, but with a
        # message that names the report that has no file.
        raise TypeError(
            'No downloadable file link for report: {!r}'.format(file_name)
        )

    # Take the extension from the URL *path* so a query string or fragment
    # ("...xlsx?download=1") cannot leak into the local file name.
    _, file_extension = os.path.splitext(urlparse(url).path)
    target_path = os.path.join(folder_path, file_name + file_extension)

    # exist_ok keeps this safe when several pool workers start at once.
    os.makedirs(folder_path, exist_ok=True)

    response = requests.get(url, timeout=timeout)
    with open(target_path, 'wb') as file:
        file.write(response.content)

    return target_path

In [23]:
# Collection page for the NNDSS fortnightly reports.
url = 'https://www.health.gov.au/resources/collections/nndss-fortnightly-reports'
# One (title, publication page URL, data file URL) tuple per report link
# found on the collection page.
links = get_file_lists(url)
# Bare expression: displays the list as the cell output.
links

[('National Notifiable Diseases Surveillance System (NNDSS) fortnightly reports – 5 February 2024 to 18 February 2024',
  'https://www.health.gov.au/resources/publications/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-5-february-2024-to-18-february-2024?language=en',
  'https://www.health.gov.au/sites/default/files/2024-03/nndss_fortnightly_table_-_5_february_2024_to_18_february_2024.xlsx'),
 ('National Notifiable Diseases Surveillance System (NNDSS) fortnightly reports – 22 January 2024 to 4 February 2024',
  'https://www.health.gov.au/resources/publications/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-22-january-2024-to-4-february-2024?language=en',
  'https://www.health.gov.au/sites/default/files/2024-03/nndss_fortnightly_table_-_22_january_2024_to_4_february_2024.xlsx'),
 ('National Notifiable Diseases Surveillance System (NNDSS) fortnightly reports – 08 January to 21 January 2024',
  'https://www.health.gov.au/resources/pu

In [25]:
# Use about 10% of the available cores, but never fewer than one worker:
# int(0.1 * cpu_count()) is 0 on machines with fewer than 10 cores, and
# Pool(processes=0) raises ValueError.
num_processes = max(1, int(0.1 * multiprocessing.cpu_count()))
with Pool(processes=num_processes) as pool:
    # pool.map preserves input order, so results_1[i] is the local path of
    # the file downloaded for links[i].
    results_1 = pool.map(download_file, links)

## Reading data

In [63]:
def clean_data(i):
    """Load the i-th downloaded report and reshape it to the common layout.

    Reads the module-level ``results_1`` (local file paths) and ``links``
    (report metadata) at position ``i``.

    Args:
        i: Index of the report in ``results_1`` / ``links``.

    Returns:
        A DataFrame with the columns ``Area``, ``Date``, ``Year``, ``Week``,
        ``Disease``, ``Cases`` and ``URL``, or ``None`` when the file cannot
        be read or does not have the expected layout (a message is printed
        in that case).
    """
    file_name = results_1[i]
    url = links[i][1]
    try:
        df = pd.read_excel(file_name, skiprows=2, skipfooter=3, header=0)
        # The first data row of 'This reporting period' is used as the
        # start date of the reporting period.
        start_date = df['This reporting period'][0]

        # Drop rows with no value in the second column (positional index 1).
        # NOTE(review): presumably this removes the date rows above the
        # disease table - confirm against the spreadsheet layout.
        df = df.dropna(subset=[df.columns[1]])
        df = df[['Disease name', 'This reporting period']]
        df = df.rename(columns={'This reporting period': 'Cases',
                                'Disease name': 'Disease'})

        # Area, Date, Year, Week, URL
        df['Area'] = 'AU'
        df['Date'] = start_date
        df['Year'] = start_date.year
        df['Week'] = ''
        df['URL'] = url

        df = df[['Area', 'Date', 'Year', 'Week', 'Disease', 'Cases', 'URL']]

        return df
    except Exception as exc:
        # Best effort: one unreadable or oddly formatted report must not
        # abort the whole batch. Unlike a bare ``except:`` this lets
        # KeyboardInterrupt / SystemExit through, and the cause is reported
        # instead of being discarded.
        print('Error reading file:', url)
        print('  cause:', repr(exc))
        return None


In [64]:
# Preview the cleaned table for the report at index 0 (shown as cell output).
clean_data(0)

Unnamed: 0,Area,Date,Year,Week,Disease,Cases,URL
2,AU,2024-02-05,2024,,Hepatitis B (newly acquired),5,https://www.health.gov.au/resources/publicatio...
3,AU,2024-02-05,2024,,Hepatitis B (unspecified),210,https://www.health.gov.au/resources/publicatio...
4,AU,2024-02-05,2024,,Hepatitis C (newly acquired),24,https://www.health.gov.au/resources/publicatio...
5,AU,2024-02-05,2024,,Hepatitis C (unspecified),272,https://www.health.gov.au/resources/publicatio...
6,AU,2024-02-05,2024,,Hepatitis D,6,https://www.health.gov.au/resources/publicatio...
...,...,...,...,...,...,...,...
69,AU,2024-02-05,2024,,Q fever,42,https://www.health.gov.au/resources/publicatio...
70,AU,2024-02-05,2024,,Rabies,0,https://www.health.gov.au/resources/publicatio...
71,AU,2024-02-05,2024,,Tularaemia,0,https://www.health.gov.au/resources/publicatio...
72,AU,2024-02-05,2024,,Invasive Group A Streptococcal disease (iGAS),80,https://www.health.gov.au/resources/publicatio...


In [65]:
from multiprocessing import Pool
import multiprocessing

# One index per downloaded file; clean_data looks up the path and URL itself.
id_range = range(len(results_1))

# Use about 10% of the available cores, but never fewer than one worker:
# int(0.1 * cpu_count()) is 0 on machines with fewer than 10 cores, and
# Pool(processes=0) raises ValueError.
num_processes = max(1, int(0.1 * multiprocessing.cpu_count()))
with Pool(processes=num_processes) as pool:
    alldata = pool.map(clean_data, id_range)

# clean_data returns None for files it could not parse; leave those out
# explicitly rather than relying on pd.concat silently dropping None.
final_df = pd.concat(
    [df for df in alldata if df is not None],
    ignore_index=True,
)

# save to csv
final_df.to_csv('./AU/AllData.csv', index=False)

Error reading file: https://www.health.gov.au/resources/publications/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-12-to-25-september-2020?language=en


In [72]:
# Keep only rows that have a disease name
pertussis_df = final_df.dropna(subset=['Disease'])
# Keep diseases whose name contains "pertussis" (case-insensitive)
pertussis_df = pertussis_df[pertussis_df['Disease'].str.contains('Pertussis', case=False)]
# Manually entered pertussis count for the 12-25 September 2020 report, which
# is added here as missing data.
pertussis_df_missing = {
    'Area': 'AU',
    'Date': datetime(2020, 9, 12),
    'Year': 2020,
    'Week': '',
    'Disease': 'Pertussis',
    'Cases': 24,
    'URL': 'https://www.health.gov.au/resources/publications/national-notifiable-diseases-surveillance-system-nndss-fortnightly-reports-12-to-25-september-2020?language=en'
}
pertussis_df_missing = pd.DataFrame(pertussis_df_missing, index=[0])
pertussis_df = pd.concat([pertussis_df_missing, pertussis_df])
# Sort by Date only after the manual row has been added, so that row lands in
# its chronological position instead of staying at the top of the table.
# mergesort is stable, which keeps the row order deterministic on ties.
pertussis_df = pertussis_df.sort_values(by=['Date'], kind='mergesort')
# reindex
pertussis_df = pertussis_df.reset_index(drop=True)

# save to csv
pertussis_df.to_csv('./AU/pertussis.csv', index=False)
# pertussis_df