In [None]:
import os
import pandas as pd

In [2]:

# Step 1: load the current split summary CSV.
csv_path = "../out/2025-12-15_15-08-48/split_summary_reasons.csv"
df = pd.read_csv(csv_path)

# Step 2: select the rows whose chromosome_reason mentions "error".
# The match is case-insensitive, and na=False makes missing values count as non-matching.
chromosome_reason_text = df['chromosome_reason'].astype(str)
has_error = chromosome_reason_text.str.contains("error", case=False, na=False)
error_df = df.loc[has_error]
target_accessions = error_df['accession'].tolist()

# Report how many genomes need repair, and which ones.
print(f"修復対象のゲノム数: {len(target_accessions)}")
print(f"対象Accession: {target_accessions}")

修復対象のゲノム数: 2
対象Accession: ['GB_GCA_018613055.1', 'RS_GCF_016495865.1']


In [None]:
# 1. Load the current split summary CSV.
csv_path = "../out/2025-12-15_15-08-48/split_summary_reasons.csv"
df = pd.read_csv(csv_path)

# 2. Flag rows where either reason column mentions "error" (case-insensitive).
# Does chromosome_reason contain an error?
is_chrom_error = df['chromosome_reason'].astype(str).str.contains("error", case=False, na=False)
# Does plasmid_reason contain an error?
is_plas_error = df['plasmid_reason'].astype(str).str.contains("error", case=False, na=False)

# A row is a repair target when at least one of the two checks matched (OR).
error_df = df[is_chrom_error | is_plas_error]
target_accessions = error_df['accession'].tolist()

print(f"修復対象のゲノム数: {len(target_accessions)}")
print(f"対象Accession: {target_accessions}")


def _delete_if_present(path):
    """Delete ``path`` if it is set and exists, printing the outcome.

    Args:
        path: A value taken from a ``*_path`` column. Missing values
            (NaN/None) and paths that do not exist are skipped silently.

    An ``OSError`` raised by ``os.remove`` is printed and swallowed so that
    one undeletable file does not stop the remaining deletions.
    """
    if pd.isna(path) or not os.path.exists(path):
        return
    try:
        os.remove(path)
        print(f"Deleted: {path}")
    except OSError as e:
        print(f"Error deleting {path}: {e}")


# --- Delete the output files of every flagged row ---
# Iterating an empty frame is a no-op, so no explicit "are there targets?" guard is needed.
for _, row in error_df.iterrows():
    # Chromosome file first, then plasmid file, for each row.
    for path_column in ('chromosome_path', 'plasmid_path'):
        _delete_if_present(row[path_column])

修復対象のゲノム数: 2
対象Accession: ['GB_GCA_018613055.1', 'RS_GCF_016495865.1']
Deleted: /home/yasutake/research/projects/metagenome/data/gtdb_split/226.0/genomic_files_reps/gtdb_genomes_reps_r226/database/GCA/018/613/055/GCA_018613055.1_genomic_chromosome.fna.gz
Deleted: /home/yasutake/research/projects/metagenome/data/gtdb_split/226.0/genomic_files_reps/gtdb_genomes_reps_r226/database/GCF/016/495/865/GCF_016495865.1_genomic_chromosome.fna.gz


In [None]:
# 3. Delete the corrupted files belonging to the rows in error_df.
# NOTE: relies on error_df having been built by an earlier cell.
for _, row in error_df.iterrows():
    # Chromosome file first, then plasmid file, for each flagged row.
    for path_column in ('chromosome_path', 'plasmid_path'):
        path = row[path_column]
        # Skip missing values and files that are not on disk.
        if pd.isna(path) or not os.path.exists(path):
            continue
        try:
            os.remove(path)
            print(f"Deleted corrupted file: {path}")
        except OSError as e:
            # Report and continue, so one undeletable file does not abort
            # the loop and leave the remaining corrupted files in place.
            print(f"Error deleting {path}: {e}")