Data sources:
Car:
URL: https://data.gov.hk/sc-data/dataset/hk-td-wcms_11-first-reg-vehicle
Air quality:
URL: https://cd.epic.epd.gov.hk/EPICDI/air/station/?lang=en


In [6]:
import pandas as pd
import os

# Directory holding the vehicle registration files for every year
folder_path = 'hongkong car data/'

# Years of interest (2020 onwards)
years = list(range(2020, 2025))

# Map each year to the file names in the folder whose name contains that year
files = {}
for year in years:
    files[year] = [name for name in os.listdir(folder_path) if str(year) in name]

# Define a function to categorize engine size ranges
def categorize_engine_size(cc):
    """Return the engine-size band label for a cylinder capacity in c.c.

    The bands are contiguous: each check only tests the upper bound, so a
    fractional capacity such as 1000.5 falls into the next band up
    ('1001-1600cc') instead of slipping through every range.

    Args:
        cc: Engine capacity as a number (int or float).

    Returns:
        One of '1000cc and below', '1001-1600cc', '1601-2000cc',
        '2001-3000cc' or '3001cc and above'.

    Note:
        NaN fails every comparison and therefore yields '3001cc and above';
        drop missing values before calling this function.
    """
    if cc <= 1000:
        return '1000cc and below'
    if cc <= 1600:
        return '1001-1600cc'
    if cc <= 2000:
        return '1601-2000cc'
    if cc <= 3000:
        return '2001-3000cc'
    return '3001cc and above'

# Define output file
output_file = 'output_vehicle_statistics.csv'

# Name of the engine-capacity column in the source CSV files
cc_column = 'Cylinder Capacity Of Engine (c.c.)'

# Start the output file from scratch, containing only the header row
with open(output_file, 'w') as f:
    f.write('year,cc_rating,number\n')

# Aggregate one year at a time and append its counts to the output file
for year, file_list in files.items():
    # Collect the per-file frames and concatenate once, rather than
    # re-copying the accumulated data on every iteration
    monthly_frames = []

    for file in file_list:
        file_path = os.path.join(folder_path, file)

        # Read the file data
        df = pd.read_csv(file_path)

        # Non-numeric capacities become NaN so they can be dropped below
        df[cc_column] = pd.to_numeric(df[cc_column], errors='coerce')

        # Drop rows without a usable capacity; .copy() gives an independent
        # frame so the column assignments below do not act on a slice
        df = df.dropna(subset=[cc_column]).copy()

        # Label each row with its engine-size band and its year
        df['cc_rating'] = df[cc_column].apply(categorize_engine_size)
        df['year'] = year

        monthly_frames.append(df)

    # A year with no matching files has nothing to aggregate; grouping a
    # column-less frame by 'year'/'cc_rating' would raise KeyError
    if not monthly_frames:
        continue

    all_data = pd.concat(monthly_frames, ignore_index=True)

    # Count the rows per (year, band)
    annual_counts = all_data.groupby(['year', 'cc_rating']).size().reset_index(name='number')

    # Append without a header: the header row was written above
    annual_counts.to_csv(output_file, mode='a', header=False, index=False)

print(f'Statistics for 2020-2024 saved to {output_file}')




Statistics for 2020-2024 saved to output_vehicle_statistics.csv


In [8]:
import pandas as pd
import os

def process_file(file_path, output_file):
    """Reshape a wide CSV into a sorted long-format CSV.

    The input is read with pandas and must contain 'DATE' and 'POLLUTANT'
    columns; every other column is melted into a ('region', 'value') pair.
    'DATE' is parsed as day-month-year ('%d-%m-%Y'), rows are sorted by
    DATE and then region, and the result is written to ``output_file``
    with the header ``DATE,POLLUTANT,region,value``.

    Values are written as read; entries such as 'N.A.' are not filtered out.

    Args:
        file_path: Path of the wide-format input CSV.
        output_file: Path of the CSV to write. An existing file is
            overwritten.

    Raises:
        ValueError: If a 'DATE' value does not match '%d-%m-%Y'
            (raised by ``pd.to_datetime``).
    """
    # Read the CSV file
    df = pd.read_csv(file_path)

    # Convert data to long format: one row per (DATE, POLLUTANT, region)
    df_melted = df.melt(id_vars=['DATE', 'POLLUTANT'], var_name='region', value_name='value')

    # Parse DATE so sorting is chronological rather than lexical
    df_melted['DATE'] = pd.to_datetime(df_melted['DATE'], format='%d-%m-%Y')

    # Sort by DATE and region
    df_sorted = df_melted.sort_values(by=['DATE', 'region'])

    # A single write creates or truncates the file and emits the header;
    # melt guarantees the column order DATE, POLLUTANT, region, value
    df_sorted.to_csv(output_file, index=False)

if __name__ == "__main__":
    # Source CSV to reshape and destination CSV for the result;
    # replace both with your own paths
    input_file, output_file = 'air_daily.csv', 'fix_air_daily.csv'

    # Reshape and sort the single input file
    process_file(input_file, output_file)

    print(f'File processed and sorted. Results saved to {output_file}.')





File processed and sorted. Results saved to fix_air_daily.csv.
