In [None]:
# Install the scraping dependencies into the environment of the running kernel.
# The explicit %pip magic does not depend on IPython's automagic setting.
%pip install requests beautifulsoup4




In [None]:
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def download_csv_files(url, download_folder, timeout=30):
    """Download every CSV file linked from a web page into a local folder.

    The page at ``url`` is fetched and parsed, every ``<a href>`` whose href
    ends with ``.csv`` is collected, and each target is saved under
    ``download_folder`` using the last path segment of its URL as file name.
    Progress and failures are reported with ``print``.

    Args:
        url: Address of the HTML page that lists the CSV files.
        download_folder: Directory to save into; created if it does not exist.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. When the page itself cannot be retrieved (non-200 status) a
        message is printed and nothing is downloaded.
    """
    # Send HTTP request to the URL; the timeout stops a stalled server from
    # hanging the cell forever.
    response = requests.get(url, timeout=timeout)
    # Check if the request was successful
    if response.status_code != 200:
        print('Failed to retrieve webpage')
        return

    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect the CSV links in page order. urljoin resolves relative,
    # root-relative and protocol-relative hrefs against the page URL (plain
    # string concatenation does not) and leaves absolute URLs untouched.
    # Links that appear more than once on the page are fetched only once.
    csv_links = []
    seen_links = set()
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if not href.endswith('.csv'):
            continue
        link = urljoin(url, href)
        if link in seen_links:
            continue
        seen_links.add(link)
        csv_links.append(link)

    # Ensure the download folder exists
    os.makedirs(download_folder, exist_ok=True)

    # Download each CSV file found
    for link in csv_links:
        # Extract the filename from the URL
        filename = os.path.join(download_folder, link.split('/')[-1])

        # A network error on one file is reported and the batch carries on.
        try:
            file_response = requests.get(link, timeout=timeout)
        except requests.RequestException:
            print(f'Failed to download {link}')
            continue

        if file_response.status_code == 200:
            with open(filename, 'wb') as f:
                f.write(file_response.content)
            print(f'Downloaded {filename}')
        else:
            print(f'Failed to download {link}')

# Example usage: save every CSV linked from the dataset page into a local folder.
download_folder = 'downloaded_csvs'  # Folder where files will be saved
url = 'https://www.data.gov.uk/dataset/e37520b0-ddb4-4cfa-b53f-a9c50ef21965/notification-of-infectious-diseases'  # Replace with the actual URL
download_csv_files(url=url, download_folder=download_folder)


Downloaded downloaded_csvs/noids-week-1-4-2024-.csv
Downloaded downloaded_csvs/noids-week-52-csv.csv
Downloaded downloaded_csvs/noids-week-49-csv.csv
Downloaded downloaded_csvs/noids-week-45-csv.csv
Downloaded downloaded_csvs/noids-week-41-csv.csv
Downloaded downloaded_csvs/noids-week-37-csv.csv
Downloaded downloaded_csvs/noids-week-32-csv-.csv
Downloaded downloaded_csvs/noids-week-33-csv-.csv
Downloaded downloaded_csvs/noids-week-28-csv.csv
Downloaded downloaded_csvs/noids-week-24-csv.csv
Downloaded downloaded_csvs/noids-report-2023-week-17-20.csv
Downloaded downloaded_csvs/noids-report-2023-week-13-16.csv
Downloaded downloaded_csvs/copy-of-noids-week-52-csv.csv
Downloaded downloaded_csvs/copy-of-noids-week-48-csv.csv
Downloaded downloaded_csvs/noids-report-week-44-2022-csv.csv
Downloaded downloaded_csvs/noids-report-week-37--40-2022.csv
Downloaded downloaded_csvs/noids-report-week-33---36-2022.csv
Downloaded downloaded_csvs/noids-report-week-33---36-2022.csv
Downloaded downloaded_csv

In [None]:

# Recursively pack the downloaded CSV folder into a single zip archive.
# NOTE(review): the download cell writes to the relative path 'downloaded_csvs';
# this only lines up with /content/downloaded_csvs when the notebook's working
# directory is /content — confirm before running outside Colab.
!zip -r /content/downloaded_csv.zip /content/downloaded_csvs

  adding: content/downloaded_csvs/ (stored 0%)
  adding: content/downloaded_csvs/noids-report--week-21---24.csv (deflated 53%)
  adding: content/downloaded_csvs/weekly-noids-report-week-19-2015.csv (deflated 61%)
  adding: content/downloaded_csvs/noids-report-week-37--40-2022.csv (deflated 53%)
  adding: content/downloaded_csvs/noids-report-2021-weeks-44-47.csv (deflated 51%)
  adding: content/downloaded_csvs/noids-report-2015-week-30.csv (deflated 60%)
  adding: content/downloaded_csvs/noids-week-40---week-43.csv (deflated 52%)
  adding: content/downloaded_csvs/noids-week-28-csv.csv (deflated 54%)
  adding: content/downloaded_csvs/noids-week-41-csv.csv (deflated 53%)
  adding: content/downloaded_csvs/noids-report-2023-week-13-16.csv (deflated 54%)
  adding: content/downloaded_csvs/noids-week-45-csv.csv (deflated 53%)
  adding: content/downloaded_csvs/noids-report-2015-week-28.csv (deflated 60%)
  adding: content/downloaded_csvs/noids-week-24-csv.csv (deflated 54%)
  adding: content/do

In [None]:
# google.colab is only importable inside a Google Colab runtime.
from google.colab import files
# Hand the zip archive built in the previous cell to the browser as a download.
files.download("/content/downloaded_csv.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Install the reporting dependencies with %pip so they land in the kernel's own
# environment. PyPDF2 is pinned to 3.0.1, the release the recorded run installed.
%pip install pandas matplotlib reportlab PyPDF2==3.0.1

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import pandas as pd
import os
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, PageBreak
from reportlab.lib import colors
from PyPDF2 import PdfMerger

def csv_to_pdf(csv_file, pdf_file, chunk_size=25):
    """Render a CSV file as a PDF of styled tables, one table per page.

    The CSV is loaded with pandas and split into consecutive groups of
    ``chunk_size`` rows. Each group becomes a ReportLab table that repeats the
    column names as its first row, and tables are separated by page breaks.
    A CSV that has a header but no data rows still produces a single table
    containing just the header.

    Args:
        csv_file: Path of the CSV file to read.
        pdf_file: Path of the PDF file to write.
        chunk_size: Maximum number of data rows per table/page.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    # Load CSV data into a DataFrame
    df = pd.read_csv(csv_file)

    # Create a PDF file using ReportLab
    pdf = SimpleDocTemplate(pdf_file, pagesize=letter)

    # Define the table style; row 0 of every table is the header row.
    style = TableStyle([
        ('BACKGROUND', (0, 0), (-1, 0), colors.lightblue),
        ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
        ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
        ('GRID', (0, 0), (-1, -1), 1, colors.black)
    ])

    header = df.columns.to_list()
    elements = []

    # max(len(df), 1) guarantees one pass for a header-only CSV, so the PDF
    # always contains at least the column names.
    for start_row in range(0, max(len(df), 1), chunk_size):
        # Page breaks go between tables only, so none trails the last table.
        if elements:
            elements.append(PageBreak())
        rows = df.iloc[start_row:start_row + chunk_size].values.tolist()
        t = Table([header] + rows)
        t.setStyle(style)
        elements.append(t)

    # Build the PDF
    pdf.build(elements)

def merge_pdfs(pdf_files, output_path):
    """Concatenate several PDF files into one, in the order given.

    Args:
        pdf_files: Iterable of paths of the PDFs to append, first to last.
        output_path: Path of the merged PDF to write.
    """
    merger = PdfMerger()
    try:
        for pdf in pdf_files:
            merger.append(pdf)
        merger.write(output_path)
    finally:
        # Always release the merger's file handles, even when one of the
        # inputs cannot be read or the output cannot be written.
        merger.close()

# Directory setup
# NOTE(review): the download cell saves into 'downloaded_csvs', while this cell
# reads from '/content/csvs' — confirm the CSVs are copied there, or update the path.
csv_directory = '/content/csvs'  # Update this path
pdf_directory = '/content/pdfs'  # Update this path
final_pdf_path = os.path.join(pdf_directory, 'merged_output.pdf')

os.makedirs(pdf_directory, exist_ok=True)

pdf_files = []

# Convert each CSV to a PDF with a table. os.listdir returns names in
# arbitrary order, so sort them to make the merged page order reproducible.
for filename in sorted(os.listdir(csv_directory)):
    if not filename.endswith('.csv'):
        continue
    csv_path = os.path.join(csv_directory, filename)
    # Swap only the trailing extension; str.replace would also rewrite a
    # '.csv' that happens to occur earlier in the file name.
    pdf_name = filename[:-len('.csv')] + '.pdf'
    pdf_path = os.path.join(pdf_directory, pdf_name)
    csv_to_pdf(csv_path, pdf_path)
    pdf_files.append(pdf_path)
    print(f"PDF generated for {filename}")

# Merge all PDFs into one (skipped when there is nothing to merge)
if pdf_files:
    merge_pdfs(pdf_files, final_pdf_path)
    print(f"All PDFs have been merged into: {final_pdf_path}")
else:
    print(f"No CSV files found in {csv_directory}; nothing to merge")


PDF generated for noids-week-4-6-2021.csv
PDF generated for noids-report-2015-week-22.csv
PDF generated for noids-week-41-csv.csv
PDF generated for noids-report-2015-week-9.csv
PDF generated for noids-report-2021-weeks-32-35.csv
PDF generated for noids-week-24-csv.csv
PDF generated for noids-report-2015-week-28.csv
PDF generated for noids-week-1-3-2021.csv
PDF generated for noids-report-2015-week-31.csv
PDF generated for noids-report-2015-week-36.csv
PDF generated for copy-of-noids-week-48-csv.csv
PDF generated for noids-report-2023-week-17-20.csv
PDF generated for noids-report-2021-week-52.csv
PDF generated for noids-report-week-37--40-2022.csv
PDF generated for noids-report-2015-week-38.csv
PDF generated for noidss-report-2015-week-21.csv
PDF generated for noids-week-51-to-week-53-2020.csv
PDF generated for noids-report-2015-week-23.csv
PDF generated for noids-report-week--29---32--2022.csv
PDF generated for noids-week-44-to-week-46-2020.csv
PDF generated for noids-report-2015-week-6

In [None]:
import pandas as pd
import os
from glob import glob

# Define the path to your CSV files
csv_directory = '/content/csvs'

# This notebook writes its aggregate reports back into csv_directory. They must
# not be read again as inputs, otherwise every re-run counts the data twice.
generated_reports = {'aggregated_output.csv', 'aggregated_yearly_data.csv'}

# Load all input CSV files in the directory into DataFrames
# (sorted so that repeated runs process the files in the same order)
csv_files = sorted(
    path for path in glob(os.path.join(csv_directory, '*.csv'))
    if os.path.basename(path) not in generated_reports
)
dataframes = []

for file in csv_files:
    # Correct header processing, assuming the second row is the actual header
    df = pd.read_csv(file, header=1)
    # Convert every column after the first to numeric; errors='coerce' turns
    # non-numeric cells into NaN instead of raising
    for col in df.columns[1:]:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    dataframes.append(df)

# pd.concat fails with an unhelpful message on an empty list, so stop early
# with an explicit one.
if not dataframes:
    raise FileNotFoundError(f"No input CSV files found in {csv_directory}")

# Concatenate all DataFrames into one DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# Clean data: drop rows where 'Disease' is NaN (new frame, no in-place mutation)
combined_df = combined_df.dropna(subset=['Disease'])

# Aggregate data, here summing up all years and weeks per disease
aggregated_df = combined_df.groupby('Disease').sum()

# Export the aggregated data to a new CSV
output_file = os.path.join(csv_directory, 'aggregated_output.csv')
aggregated_df.to_csv(output_file, index=True)

print(f"Aggregated data has been saved to {output_file}")


Aggregated data has been saved to /content/csvs/aggregated_output.csv


In [None]:
import pandas as pd
import os
from glob import glob
import re

def get_year_from_column_name(column_name):
    """Return the first four-digit year of the form 20xx found in a column label.

    Args:
        column_name: Column label (a string) to search.

    Returns:
        The matched year as a string, or None when the label contains none.
    """
    hit = re.search(r'20\d{2}', column_name)
    if hit is None:
        return None
    return hit.group()

# Define the path to your CSV files
csv_directory = '/content/csvs'

# Aggregate reports written by this notebook live in csv_directory as well;
# they are outputs and must not be picked up again as inputs on a re-run.
generated_reports = {'aggregated_output.csv', 'aggregated_yearly_data.csv'}

# List of all input CSV files in the directory (sorted for reproducible runs)
csv_files = sorted(
    path for path in glob(os.path.join(csv_directory, '*.csv'))
    if os.path.basename(path) not in generated_reports
)
# Maps a year string to {'data': two-column frame, 'sum': column total}
all_yearly_data = {}

for file in csv_files:
    # Determine the header row by reading the file line by line until we find 'Disease'
    header_row = None
    with open(file, 'r') as f:
        for i, line in enumerate(f):
            if 'Disease' in line:
                header_row = i
                break

    # Without a header row there is nothing to read. Skip the file instead of
    # falling through with `df` undefined or left over from the previous file.
    if header_row is None:
        print(f"Skipping {file}: no header row containing 'Disease'")
        continue

    df = pd.read_csv(file, header=header_row)

    # The sniffed line only has to contain the text 'Disease'; make sure a
    # column with exactly that name exists before selecting it.
    if 'Disease' not in df.columns:
        print(f"Skipping {file}: no column named 'Disease'")
        continue

    # Look for a year in each of the last three column names
    for col in df.columns[-3:]:
        year = get_year_from_column_name(col)
        if not year:
            continue

        # Read the values straight from the original column. Renaming columns
        # to the bare year would create duplicate labels whenever two columns
        # of the same file mention the same year.
        values = pd.to_numeric(df[col], errors='coerce')
        year_total = values.sum()

        # Keep, per year, the column with the largest total seen so far
        if year not in all_yearly_data or year_total > all_yearly_data[year]['sum']:
            all_yearly_data[year] = {
                'data': pd.DataFrame({'Disease': df['Disease'], year: values}),
                'sum': year_total,
            }

# Combine all the dataframes with yearly data into one dataframe, with the
# year columns in chronological order regardless of file order
yearly_frames = [all_yearly_data[year]['data'] for year in sorted(all_yearly_data)]
final_df = yearly_frames[0] if yearly_frames else pd.DataFrame()
for yearly_df in yearly_frames[1:]:
    final_df = final_df.merge(yearly_df, on='Disease', how='outer')

# Replace NaN with zeros and reset the index. New frames are created so the
# per-year frames stored in all_yearly_data are not modified.
final_df = final_df.fillna(0).reset_index(drop=True)

# Save the final DataFrame to a CSV file
output_file = os.path.join(csv_directory, 'aggregated_yearly_data.csv')
final_df.to_csv(output_file, index=False)

print(f"Aggregated data has been saved to {output_file}")


Aggregated data has been saved to /content/csvs/aggregated_yearly_data.csv
