In [3]:
import os
import pandas as pd 

def load_reference_columns(reference_file_path, column_name='指标名称'):
    """Read a single column out of a reference CSV file.

    Args:
        reference_file_path: Path of the CSV file, read with ``pd.read_csv``.
        column_name: Header of the column to extract.

    Returns:
        The requested column as a pandas Series, or ``None`` when the file
        contains no column with that header.
    """
    reference_frame = pd.read_csv(reference_file_path)
    if column_name not in reference_frame.columns:
        # Callers must be prepared for a missing column.
        return None
    return reference_frame[column_name]

def process_single_file(input_file_path, reference_series, output_dir):
    """Align one CSV file to a reference list of months and save the result.

    The file's '指标名称' column and the reference values are both normalised
    to ``YYYY-MM`` strings. The unique reference months are then left-joined
    with the file's rows, so every reference month appears in the output and
    months absent from the input get empty cells in the other columns.
    The result is written to ``output_dir`` under the input's base name.

    Args:
        input_file_path: Path of the CSV file to process.
        reference_series: pandas Series of ``YYYY-MM`` values, or ``None``
            when the reference file had no usable column.
        output_dir: Directory for the output CSV; created if missing.

    Returns:
        None. This is a best-effort step: any failure is reported on stdout
        and swallowed so that a batch run can continue with the next file.
    """
    column_name = '指标名称'

    # A missing reference column would otherwise surface as an opaque
    # "'NoneType' object has no attribute 'dt'" message.
    if reference_series is None:
        print(f"处理文件 {input_file_path} 时出错: 参考数据缺少 '{column_name}' 列")
        return

    try:
        df = pd.read_csv(input_file_path)

        # Without the key column nothing can match; fill it with NaN so the
        # join still yields one (empty) row per reference month.
        if column_name not in df.columns:
            df[column_name] = float('nan')

        # Normalise both sides to the same textual form so the join keys
        # compare equal. Unparseable values in the input become missing.
        reference_months = pd.to_datetime(
            reference_series, format='%Y-%m'
        ).dt.strftime('%Y-%m')
        df[column_name] = pd.to_datetime(
            df[column_name], format='%Y-%m', errors='coerce'
        ).dt.strftime('%Y-%m')

        # Left join from the full month list keeps every reference month.
        complete_dates = pd.DataFrame({column_name: reference_months.unique()})
        df_complete = pd.merge(complete_dates, df, on=column_name, how='left')

        # Do not depend on the caller having created the directory.
        os.makedirs(output_dir, exist_ok=True)
        output_file_path = os.path.join(
            output_dir, os.path.basename(input_file_path)
        )
        df_complete.to_csv(output_file_path, index=False)

        print(f"已处理并保存到 {output_file_path}")

    except Exception as e:
        print(f"处理文件 {input_file_path} 时出错: {e}")

def find_matching_active_file(base_dir, input_file_path):
    """Locate the ``*active_dates_and_times.csv`` file paired with a data file.

    The pairing key is the first two underscore-separated parts of the input
    file's base name. The directory containing the input file is walked
    recursively, and the first file whose name starts with that key and ends
    with ``active_dates_and_times.csv`` is returned.

    NOTE(review): matching is by plain prefix, so a key such as ``a_b`` also
    matches ``a_b-extra_active_dates_and_times.csv``. Confirm the naming
    scheme rules this out.

    Args:
        base_dir: Unused; kept so existing callers keep working.
        input_file_path: Path of the data file whose reference is wanted.

    Returns:
        Path of the matching reference file, or ``None`` when there is no
        match or the file name has fewer than two underscore-separated parts.
    """
    dir_path = os.path.dirname(input_file_path)
    name_parts = os.path.basename(input_file_path).split('_', 2)
    # A name without an underscore has no pairing key; report "no match"
    # instead of raising IndexError and aborting the caller's directory walk.
    if len(name_parts) < 2:
        return None
    file_prefix = '_'.join(name_parts[:2])

    for root, _dirs, files in os.walk(dir_path):
        for candidate in files:
            if candidate.startswith(file_prefix) and candidate.endswith(
                'active_dates_and_times.csv'
            ):
                return os.path.join(root, candidate)
    return None

def process_all_files(base_dir, output_dir):
    """Process every data CSV under ``base_dir`` against its reference file.

    Walks ``base_dir`` recursively. Each ``.csv`` file that is not itself an
    ``active_dates_and_times.csv`` reference is paired with its reference
    file and handed to ``process_single_file``. Files without a reference
    are reported with a warning and skipped.

    Args:
        base_dir: Root directory scanned for input CSV files.
        output_dir: Directory receiving the processed files; created when it
            does not exist.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    reference_suffix = 'active_dates_and_times.csv'
    for current_dir, _subdirs, file_names in os.walk(base_dir):
        for file_name in file_names:
            # Only data files are processed; reference files are inputs only.
            if not file_name.endswith('.csv') or file_name.endswith(reference_suffix):
                continue

            input_file_path = os.path.join(current_dir, file_name)
            matching_active_file = find_matching_active_file(base_dir, input_file_path)
            if not matching_active_file:
                print(f"警告: 没有找到与 {input_file_path} 对应的 active_dates_and_times.csv 文件")
                continue

            # Load the reference column, then align the data file to it.
            reference_series = load_reference_columns(matching_active_file)
            process_single_file(input_file_path, reference_series, output_dir)


# Root directory that is scanned recursively for input CSV files.
base_dir='./outputdata 原数据'
# Directory that receives the processed CSV files (created if missing).
output_dir='./add 仅填补日期 无补0'
# Run the batch: pair each data file with its reference file and write the result.
process_all_files(base_dir,output_dir)

已处理并保存到 ./add\AdguardTeam_AdguardFilters_contributor.csv
已处理并保存到 ./add\airbytehq_airbyte_contributor.csv
已处理并保存到 ./add\alibaba_nacos_contributor.csv
已处理并保存到 ./add\angular_angular_contributor.csv
已处理并保存到 ./add\angular_components_contributor.csv
已处理并保存到 ./add\ankidroid_Anki-Android_contributor.csv
已处理并保存到 ./add\ansible_ansible_contributor.csv
已处理并保存到 ./add\ant-design_ant-design_contributor.csv
已处理并保存到 ./add\apache_airflow_contributor.csv
已处理并保存到 ./add\apache_apisix_contributor.csv
已处理并保存到 ./add\apache_arrow_contributor.csv
已处理并保存到 ./add\apache_beam_contributor.csv
已处理并保存到 ./add\apache_dolphinscheduler_contributor.csv
已处理并保存到 ./add\apache_doris_contributor.csv
已处理并保存到 ./add\apache_flink_contributor.csv
已处理并保存到 ./add\apache_hudi_contributor.csv
已处理并保存到 ./add\apache_iceberg_contributor.csv
已处理并保存到 ./add\apache_pulsar_contributor.csv
已处理并保存到 ./add\apache_shardingsphere_contributor.csv
已处理并保存到 ./add\apache_spark_contributor.csv
已处理并保存到 ./add\apache_superset_contributor.csv
已处理并保存到 ./add\apach