In [1]:
import gzip
import re
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

In [2]:
# Location of the split summary CSV (one row per accession, with the paths of
# its chromosome and plasmid FASTA files). The path is relative, so it resolves
# against the kernel's current working directory.
split_summary_path = "../out/2025-12-15_15-08-48/split_summary.csv"
# NOTE(review): the preview shows an "Unnamed: 0" column, which suggests the CSV
# was written together with its index; confirm whether `index_col=0` is wanted.
df = pd.read_csv(split_summary_path)
# Bare last expression so the notebook renders a truncated preview of the frame.
df

Unnamed: 0,accession,chromosome_path,plasmid_path
0,RS_GCF_010645065.1,/home/yasutake/research/projects/metagenome/da...,
1,GB_GCA_036004205.1,/home/yasutake/research/projects/metagenome/da...,
2,GB_GCA_934196075.1,/home/yasutake/research/projects/metagenome/da...,
3,GB_GCA_018674025.1,/home/yasutake/research/projects/metagenome/da...,
4,GB_GCA_030430655.1,/home/yasutake/research/projects/metagenome/da...,
...,...,...,...
140277,GB_GCA_030668065.1,/home/yasutake/research/projects/metagenome/da...,
140278,GB_GCA_009889605.1,/home/yasutake/research/projects/metagenome/da...,
140279,GB_GCA_041305945.1,/home/yasutake/research/projects/metagenome/da...,
140280,GB_GCA_030699205.1,/home/yasutake/research/projects/metagenome/da...,


2025-12-21 (Modify)
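- Add `plasmid_reason` and `chromosome_reason` columns to the split summary (read from split_summary.csv, saved as split_summary_reasons.csv)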

In [3]:
def extract_reason_from_header(file_path):
    """
    Collect the `[class_reason=...]` tags found in the headers of a gzipped FASTA.

    Only lines starting with ">" are inspected, and at most one tag (the first
    match) is taken from each of them.

    Returns:
        None             -- `file_path` is missing (NaN/None), e.g. no plasmid file
        "error: <msg>"   -- the file could not be opened or read
        "unknown"        -- the file was read but no header carried a tag
        "a;b;..."        -- the distinct tag values, sorted and joined with ";"
    """
    # Missing path: there is nothing to read for this row.
    if pd.isna(file_path):
        return None

    tag_pattern = re.compile(r"\[class_reason=(.*?)\]")
    try:
        with gzip.open(file_path, 'rt') as handle:
            header_lines = (row for row in handle if row.startswith(">"))
            hits = (tag_pattern.search(header) for header in header_lines)
            # The set collapses tags repeated across contigs; it has to be built
            # inside the `with` block because the generators read lazily.
            found = {hit.group(1) for hit in hits if hit}
    except Exception as e:
        # Best-effort: report the failure in-band so a bulk apply keeps going.
        return f"error: {e}"

    # Several contigs may carry different reasons; emit them in a stable order.
    return ";".join(sorted(found)) if found else "unknown"

In [4]:
# Register `progress_apply` on pandas objects so each pass below shows a progress bar.
tqdm.pandas()

# Each pass opens one gzipped FASTA per row; rows without a path come back as None.
print("Extracting plasmid reasons...")
df['plasmid_reason'] = df['plasmid_path'].progress_apply(extract_reason_from_header)

print("Extracting chromosome reasons...")
df['chromosome_reason'] = df['chromosome_path'].progress_apply(extract_reason_from_header)

# extract_reason_from_header reports unreadable files in-band as "error: ..."
# strings. Surface them here so failures are not written to the CSV unnoticed.
for reason_col in ('plasmid_reason', 'chromosome_reason'):
    # astype(str) makes None/NaN safe to test; neither starts with "error:".
    failed = df[reason_col].astype(str).str.startswith("error:")
    if failed.any():
        print(f"WARNING: {int(failed.sum())} rows in '{reason_col}' could not be read")
        display(df.loc[failed, ['accession', reason_col]].head())

# Confirm result
display(df.head())

print("\n--- Rows with plasmids ---")
display(df[df['plasmid_path'].notna()].head())

# index=False: do not write the RangeIndex as an extra column.
out_path = "../out/2025-12-15_15-08-48/split_summary_reasons.csv"
df.to_csv(out_path, index=False)
print(f"Saved detailed summary to {out_path}")

Extracting plasmid reasons...


  0%|          | 0/140282 [00:00<?, ?it/s]

Extracting chromosome reasons...


  0%|          | 0/140282 [00:00<?, ?it/s]

Unnamed: 0,accession,chromosome_path,plasmid_path,plasmid_reason,chromosome_reason
0,RS_GCF_010645065.1,/home/yasutake/research/projects/metagenome/da...,,,no_match
1,GB_GCA_036004205.1,/home/yasutake/research/projects/metagenome/da...,,,no_match
2,GB_GCA_934196075.1,/home/yasutake/research/projects/metagenome/da...,,,no_match
3,GB_GCA_018674025.1,/home/yasutake/research/projects/metagenome/da...,,,no_match
4,GB_GCA_030430655.1,/home/yasutake/research/projects/metagenome/da...,,,no_match



--- Rows with plasmids ---


Unnamed: 0,accession,chromosome_path,plasmid_path,plasmid_reason,chromosome_reason
35,RS_GCF_040957575.1,/home/yasutake/research/projects/metagenome/da...,/home/yasutake/research/projects/metagenome/da...,match=plasmid,match=chromosome
43,RS_GCF_033052795.2,/home/yasutake/research/projects/metagenome/da...,/home/yasutake/research/projects/metagenome/da...,match=plasmid,no_match
238,RS_GCF_011045815.1,/home/yasutake/research/projects/metagenome/da...,/home/yasutake/research/projects/metagenome/da...,match=plasmid,match=chromosome
308,RS_GCF_036964665.1,/home/yasutake/research/projects/metagenome/da...,/home/yasutake/research/projects/metagenome/da...,match=plasmid,no_match
336,GB_GCA_002813895.1,/home/yasutake/research/projects/metagenome/da...,/home/yasutake/research/projects/metagenome/da...,match=plasmid,match=chromosome


Saved detailed summary to ../out/split_summary_reasons.csv
