## Get data

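- `https://www.data.gov.uk/dataset/6bf61328-a250-44fd-a787-481503f02865/notifiable-infectious-diseases-reports` contains reports on notifiable infectious diseases (NOIDs), described here as covering England and Wales from 2015 to 2019. Note: the download links scraped from this page below point to OpenDataNI (`opendatani.gov.uk`) and include reports for 2020, so the stated region and year range should be verified against the dataset page.
- `https://www.data.gov.uk/dataset/e37520b0-ddb4-4cfa-b53f-a9c50ef21965/notification-of-infectious-diseases` contains reports on notifiable infectious diseases (NOIDs), described here as covering England and Wales from 2020 to 2023. Note: the download links scraped from this page below also point to OpenDataNI (`opendatani.gov.uk`), so the stated region should be verified against the dataset page.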

In [51]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import os
from datetime import datetime
from multiprocessing import Pool
import multiprocessing

def get_file_links(url):
    """Scrape a web page and collect links to downloadable data files.

    Fetches ``url``, walks every ``<a>`` element that has an ``href`` and keeps
    those whose target ends in ``.csv``, ``.xlsx`` or ``.xls``.

    Parameters
    ----------
    url : str
        Address of the HTML page to scan. It is also the base against which
        relative links are resolved.

    Returns
    -------
    list of (str, str)
        ``(label, absolute_url)`` pairs in document order. The label is the
        anchor text with the word 'Download' removed, cut at the first comma
        and stripped of surrounding whitespace.

    Raises
    ------
    requests.HTTPError
        If the page request comes back with an error status.
    """
    # Local import keeps this definition self-contained.
    from urllib.parse import urljoin

    # Without a timeout a stalled server blocks the notebook indefinitely, and
    # without the status check an error page would silently yield no links.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    links_with_text = []

    for link in soup.find_all('a', href=True):
        href = link.get('href')

        # Only spreadsheet-like downloads are of interest.
        if not href.endswith(('.csv', '.xlsx', '.xls')):
            continue

        # Remove 'Download' from the text, keep the part before the first
        # comma, and always trim the whitespace left behind.
        text = link.get_text(strip=True).replace('Download', '')
        text = text.split(',')[0].strip()

        # urljoin leaves absolute links untouched and resolves relative ones
        # against the page address (plain concatenation breaks for hrefs such
        # as '/files/x.csv').
        links_with_text.append((text, urljoin(url, href)))

    return links_with_text


def download_file(link_with_text, folder_path='./UK'):
    """Download one file and store it under ``folder_path``.

    Parameters
    ----------
    link_with_text : tuple of (str, str)
        ``(label, url)`` pair. The label becomes the file name; the extension
        is taken from the URL.
    folder_path : str, optional
        Directory that receives the file. It is created when missing.
        Defaults to ``'./UK'``.

    Returns
    -------
    str
        Path of the written file (``folder_path`` joined with the file name).

    Raises
    ------
    requests.HTTPError
        If the download request comes back with an error status.

    Notes
    -----
    Two links with the same label and extension map to the same path, so the
    later write replaces the earlier one.
    """
    # Local import keeps this definition self-contained.
    from urllib.parse import urlparse

    file_name, url = link_with_text[0], link_with_text[1]

    # Use only the path component so a query string or fragment does not end
    # up inside the extension.
    _, file_extension = os.path.splitext(urlparse(url).path)

    # The label is free-form page text; path separators in it would otherwise
    # be interpreted as sub-directories.
    safe_name = file_name.replace('/', '-').replace('\\', '-')
    target_path = os.path.join(folder_path, safe_name + file_extension)

    # exist_ok makes this safe when several workers start at the same time.
    os.makedirs(folder_path, exist_ok=True)

    # Check the status before writing so an HTTP error page is never saved
    # as if it were data.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open(target_path, 'wb') as file:
        file.write(response.content)

    return target_path

In [52]:
# Page to scrape for the first batch of report files.
url_1 = 'https://www.data.gov.uk/dataset/e37520b0-ddb4-4cfa-b53f-a9c50ef21965/notification-of-infectious-diseases'
# Collect (label, url) pairs for every CSV/Excel link found on that page.
file_links_1 = get_file_links(url_1)
# Show the collected links for inspection.
file_links_1

[('NOIDS Report 2023 week 50-52',
  'https://admin.opendatani.gov.uk/dataset/e37520b0-ddb4-4cfa-b53f-a9c50ef21965/resource/cc970ee1-b2c9-42e1-99dc-625c4794d2eb/download/noids-week-52-csv.csv'),
 ('NOIDS Report 2023 week 46-49',
  'https://admin.opendatani.gov.uk/dataset/e37520b0-ddb4-4cfa-b53f-a9c50ef21965/resource/150819c8-8117-4198-ac15-e3ce7b8df113/download/noids-week-49-csv.csv'),
 ('NOIDS Report 2023 week 42-45',
  'https://admin.opendatani.gov.uk/dataset/e37520b0-ddb4-4cfa-b53f-a9c50ef21965/resource/41d4940c-7dc5-4717-9bbe-cd308c890cb6/download/noids-week-45-csv.csv'),
 ('NOIDS Report 2023 week 38-41',
  'https://admin.opendatani.gov.uk/dataset/e37520b0-ddb4-4cfa-b53f-a9c50ef21965/resource/5b4edeac-b821-4094-b652-f9d27c314ccc/download/noids-week-41-csv.csv'),
 ('NOIDS Report 2023 week 34-37',
  'https://admin.opendatani.gov.uk/dataset/e37520b0-ddb4-4cfa-b53f-a9c50ef21965/resource/be002f14-c3a6-4db1-a143-6fde74131e96/download/noids-week-37-csv.csv'),
 ('NOIDS Report 2023 week 29-3

In [53]:
# Use roughly 90% of the available cores, but never fewer than one worker:
# on a single-core machine int(0.9 * 1) is 0 and Pool(processes=0) raises ValueError.
num_processes = max(1, int(0.9 * multiprocessing.cpu_count()))
with Pool(processes=num_processes) as pool:
    # Download every file of the first batch in parallel; pool.map returns
    # the saved paths in the same order as file_links_1.
    results_1 = pool.map(download_file, file_links_1)

In [54]:
# Page to scrape for the second batch of report files.
url_2 = "https://www.data.gov.uk/dataset/6bf61328-a250-44fd-a787-481503f02865/notifiable-infectious-diseases-reports"
# Collect (label, url) pairs for every CSV/Excel link found on that page.
file_links_2 = get_file_links(url_2)
# Show the collected links for inspection.
file_links_2

[('NOIDs Report 2020 weeks 27-29',
  'https://www.opendatani.gov.uk/dataset/6bf61328-a250-44fd-a787-481503f02865/resource/a8cdd0f1-4d75-4ce3-b77a-7b87a5a63b7a/download/noids-report-week-27-to-week-29.xls'),
 ('NOIDs Report 2020 weeks 34-35',
  'https://www.opendatani.gov.uk/dataset/6bf61328-a250-44fd-a787-481503f02865/resource/05e3da58-18f4-4272-bc0b-ca5fdf361a0b/download/noids-week-34-and-35.xls'),
 ('NOIDs Report weeks 30-33 2020',
  'https://www.opendatani.gov.uk/dataset/6bf61328-a250-44fd-a787-481503f02865/resource/051e94e1-a2f4-4864-bcc1-cb1038741414/download/noids-week-30-to-33.xls'),
 ('NOIDs Report 2020 week 26',
  'https://www.opendatani.gov.uk/dataset/6bf61328-a250-44fd-a787-481503f02865/resource/64ca5134-91ad-402b-b958-906c086ddad4/download/noids-report-week-26-2020.csv'),
 ('NOIDs Report 2020 week 25',
  'https://www.opendatani.gov.uk/dataset/6bf61328-a250-44fd-a787-481503f02865/resource/c1cb6a28-15a3-43e2-ac6f-0406f3842f94/download/noids-report-week-25-2020.csv'),
 ('NOIDs

In [55]:
# Use roughly 90% of the available cores, but never fewer than one worker:
# on a single-core machine int(0.9 * 1) is 0 and Pool(processes=0) raises ValueError.
num_processes = max(1, int(0.9 * multiprocessing.cpu_count()))
with Pool(processes=num_processes) as pool:
    # Download every file of the second batch in parallel; pool.map returns
    # the saved paths in the same order as file_links_2.
    results_2 = pool.map(download_file, file_links_2)

In [56]:
# Build one new list holding the first batch of download results followed by
# the second batch; the two source lists are left unmodified.
merged_results = [*results_1, *results_2]
merged_results

['NOIDS Report 2023 week 50-52.csv',
 'NOIDS Report 2023 week 46-49.csv',
 'NOIDS Report 2023 week 42-45.csv',
 'NOIDS Report 2023 week 38-41.csv',
 'NOIDS Report 2023 week 34-37.csv',
 'NOIDS Report 2023 week 29-32.csv',
 'NOIDS Report 2023 week 30-33.csv',
 'NOIDS Report 2023 week 25-28.csv',
 'NOIDS Report 2023 week 21-24.csv',
 'NOIDS Report 2023 week 17-20.csv',
 'NOIDS Report 2023 week 13-16.csv',
 'NOIDS Report 2023 week 9-12.xls',
 'NOIDS Report 2023 week 1-4.xls',
 'NOIDS Report 2023 week 5-8.xls',
 'NOIDS Report 2022 week 48-52.csv',
 'NOIDS Report 2022 week 45-48.csv',
 'NOIDS Report 2022 week 44-47.csv',
 'Noids Report 2022 Week 37 -40.csv',
 'Noids Report 2022 Week 33 - 36.csv',
 'Noids Report 2022 Week 33 - 36.csv',
 'Noids Report  2022 Week 29 - 32.csv',
 'Noids Report 2022  Week 25 - 28.csv',
 'Noids Report 2022 Week 21- 24.csv',
 'Noids Report  2022 Week 17 - 20.csv',
 'Noids Report 2022 week 13 - 16.csv',
 'Noids Report 2022 Week 9 - 12.csv',
 'Noids Report Week  5- 8

## Reading data

In [57]:
# Read the first downloaded report.
# The download results may already carry the './UK' folder prefix; joining such
# an entry with './UK' again would produce './UK/./UK/<name>', which does not
# exist. Reducing the entry to its bare file name first works for both forms.
file_name = os.path.basename(merged_results[0])
df = pd.read_csv(os.path.join('./UK', file_name))
df

Unnamed: 0,Current weekly total of notifications of infectious diseases in the past 4 weeks along with the cumulative total for the current year compared with corresponding periods,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,of the two preceding years,,,,,,,
1,,,,,,,,
2,"Notifications of Infectious Diseases, Week 1 -...",,,,,,,
3,,,,,,,,
4,,,,,,,,
5,,,,,,Cumulative Totals,,
6,Disease,Week 52,Week 51,Week 50,Week 49,2023,2022,2021
7,,25/12 - 31/12,18/12 - 24/12,11/12 - 17/12,04/12 - 10/12,1-52,1-52,1-52
8,Acute Encephalitis/Meningitis Bacterial,0,0,1,1,38,14,6
9,Acute Encephalitis/Meningitis Viral,0,0,0,0,0,1,0
