In [11]:
import pandas as pd
import os

# Directory that holds the data files for several years
folder_path = 'hongkong car data/'

# Map every year from 2020 onward to the directory entries whose
# file name contains that year as a substring
years = [2020, 2021, 2022, 2023, 2024]
files = {
    year: list(filter(lambda name: str(year) in name, os.listdir(folder_path)))
    for year in years
}

# 定义排量区间分类函数
def categorize_engine_size(cc):
    """Return the displacement-bracket label for an engine capacity.

    The brackets are checked by their upper bound only, so every real
    number lands in exactly one bracket. In particular a fractional
    capacity such as 1000.5 is placed in '1001-1600cc' instead of
    slipping through the gap between two integer ranges.

    Args:
        cc: Engine capacity in c.c. as an int or float.

    Returns:
        One of '1000cc and below', '1001-1600cc', '1601-2000cc',
        '2001-3000cc' or '3001cc and above'.

    Note:
        NaN compares false against every bound and therefore yields
        '3001cc and above'; drop missing values before calling.
    """
    if cc <= 1000:
        return '1000cc and below'
    if cc <= 1600:
        return '1001-1600cc'
    if cc <= 2000:
        return '1601-2000cc'
    if cc <= 3000:
        return '2001-3000cc'
    return '3001cc and above'

# Destination CSV: one row per (year, cc_rating) pair with its vehicle count
output_file = 'output_vehicle_statistics.csv'

# Name of the raw column that holds the engine capacity
capacity_column = 'Cylinder Capacity Of Engine (c.c.)'

# Per-year count tables; combined and written once after the loop so that
# re-running the cell never appends duplicate rows to an existing file
annual_tables = []

# Walk through every year
for year, file_list in files.items():
    if not file_list:
        # Grouping an empty frame by missing columns would raise KeyError
        print(f'No data files found for {year}; skipping.')
        continue

    # Cleaned frames for this year, one per file
    year_frames = []

    # Walk through every file that belongs to this year
    for file in file_list:
        file_path = os.path.join(folder_path, file)

        # Load the raw records
        df = pd.read_csv(file_path)

        # Coerce the capacity column to numbers; unparsable entries become NaN
        df[capacity_column] = pd.to_numeric(df[capacity_column], errors='coerce')

        # Discard rows whose capacity could not be parsed
        df = df.dropna(subset=[capacity_column])

        # Label each row with its displacement bracket
        df['cc_rating'] = df[capacity_column].apply(categorize_engine_size)

        # Tag the rows with the year they belong to
        df['year'] = year

        year_frames.append(df)

    # Concatenate once per year instead of growing a frame inside the loop
    all_data = pd.concat(year_frames, ignore_index=True)

    # Count the vehicles in each displacement bracket for the whole year
    annual_counts = all_data.groupby(['year', 'cc_rating']).size().reset_index(name='number')
    annual_tables.append(annual_counts)

# Combined statistics for all processed years
if annual_tables:
    all_year_data = pd.concat(annual_tables, ignore_index=True)
else:
    all_year_data = pd.DataFrame(columns=['year', 'cc_rating', 'number'])

# Overwrite the output so the file always reflects exactly this run
all_year_data.to_csv(output_file, mode='w', header=True, index=False)

print(f'Statistics for 2020-2024 saved to {output_file}')


Statistics for 2020-2024 saved to output_vehicle_statistics.csv


In [19]:
import pandas as pd
import os

def process_file(file_path, output_file):
    """Reshape a wide CSV into long format, clean it, sort it and save it.

    The input must contain the columns 'DATE' (formatted as
    day-month-year, e.g. 31-12-2020) and 'POLLUTANT'; every other column
    is melted into a ('region', 'value') pair. Rows whose value is the
    literal string 'N.A.' are removed, and the result is ordered by
    DATE and then by region.

    Args:
        file_path: Path of the CSV file to read.
        output_file: Path of the CSV file to write; an existing file is
            overwritten.

    Raises:
        ValueError: If a DATE entry does not match the '%d-%m-%Y' format
            (raised by ``pd.to_datetime``).
    """
    raw = pd.read_csv(file_path)

    tidy = (
        raw
        # Wide -> long: one row per (DATE, POLLUTANT, region)
        .melt(id_vars=['DATE', 'POLLUTANT'], var_name='region', value_name='value')
        # Drop readings recorded as 'N.A.'
        .loc[lambda frame: frame['value'] != 'N.A.']
        # Parse dates so sorting is chronological rather than lexical;
        # .assign builds a new frame, so no assignment ever happens on a
        # filtered slice (which would emit SettingWithCopyWarning)
        .assign(DATE=lambda frame: pd.to_datetime(frame['DATE'], format='%d-%m-%Y'))
        .sort_values(by=['DATE', 'region'])
    )

    # Persist the cleaned and sorted table
    tidy.to_csv(output_file, mode='w', index=False)

if __name__ == "__main__":
    # Source CSV and destination CSV; change these to your own paths
    input_file, output_file = 'air_daily.csv', 'fix_air_daily.csv'

    # Run the clean-up on this one file, then report where the result went
    process_file(input_file, output_file)
    print(f'File processed and sorted. Results saved to {output_file}.')


File processed and sorted. Results saved to fix_air_daily.csv.
