In [None]:
from pathlib import Path
import yaml
import glob
import os
from pathlib import Path
from collections import defaultdict
import json

In [2]:
def parse_flagstat(flagstat_file):
    """Return the read count from the first "mapped (" line of a flagstat file.

    The file is scanned top to bottom; the first line containing the text
    ``"mapped ("`` is selected and its first whitespace-separated token is
    converted to ``int``.

    Args:
        flagstat_file: Path of the flagstat text file to read.

    Returns:
        The integer count, or ``None`` when no line contains ``"mapped ("``.

    Raises:
        ValueError: If the first token of the selected line is not an integer.
    """
    with open(flagstat_file) as handle:
        # Lazy scan: stop reading as soon as the first matching line is found.
        matching_rows = (row for row in handle if "mapped (" in row)
        first_hit = next(matching_rows, None)

    if first_hit is None:
        return None
    return int(first_hit.split()[0])

In [3]:
def generate_flagstat_yaml(flagstat_dirs: list[str], fastp_dir: str, output_file="../results/multiqc/flagstat_mqc.yaml"):
    """Generate a MultiQC custom-content YAML of per-sample read counts.

    Two kinds of metric are collected, each stored in millions of reads
    (raw count divided by 1_000_000):

    * ``reads_mapped_after_<step>``: the value returned by ``parse_flagstat``
      for every ``*.flagstat`` file found directly inside each directory of
      ``flagstat_dirs``. ``<step>`` is the name of the directory that holds
      the file. The sample name is the file's basename up to its first ``.``,
      further truncated at its first ``-``.
    * ``reads_before_fastp``: ``summary -> before_filtering -> total_reads``
      from every ``*.json`` file found recursively under ``fastp_dir``. The
      sample name is the name of the directory containing the JSON file.

    Args:
        flagstat_dirs: Directories searched (non-recursively) for
            ``*.flagstat`` files.
        fastp_dir: Directory searched recursively for ``*.json`` files.
        output_file: Path of the YAML file to write. Missing parent
            directories are created.

    Returns:
        None. The YAML file is written and a one-line summary is printed.

    Raises:
        KeyError: If a JSON file lacks the
            ``summary/before_filtering/total_reads`` entry.
    """
    data = defaultdict(dict)

    # glob returns paths in arbitrary order; sorting keeps the result
    # reproducible when two files resolve to the same sample and step
    # (the later file overwrites the earlier one).
    flagstat_files = []
    for flagstat_dir in flagstat_dirs:
        flagstat_files.extend(sorted(glob.glob(os.path.join(flagstat_dir, "*.flagstat"))))

    for flagstat_file in flagstat_files:
        # Keep only the text before the first "." and then before the first
        # "-", so any number of dotted suffixes is accepted.
        sample = os.path.basename(flagstat_file).split(".", 1)[0].split("-")[0]
        step = Path(flagstat_file).parent.name

        mapped = parse_flagstat(flagstat_file)
        # Compare against None explicitly: a count of 0 is a real measurement
        # and must be reported, only a missing "mapped (" line is skipped.
        if mapped is not None:
            data[sample][f"reads_mapped_after_{step}"] = mapped / 1_000_000

    for fastp_file in sorted(glob.glob(os.path.join(fastp_dir, "**", "*.json"), recursive=True)):
        sample = Path(fastp_file).parent.name
        # Context manager so the handle is closed promptly for every file.
        with open(fastp_file) as handle:
            fastp_json = json.load(handle)
        data[sample]["reads_before_fastp"] = fastp_json["summary"]["before_filtering"]["total_reads"] / 1_000_000

    # Headers exist only for the "filter" and "picard" steps; metrics from a
    # directory with any other name are written to "data" without a header.
    output = {
        "id": "user_defined",
        "description": "Additional QC metrics defined by the user",
        "plot_type": "generalstats",
        "headers": {
            "reads_mapped_after_filter": {
                "title": "Reads mapped (samtools filter)",
                "description": "Number of mapped reads after samtools filtering step",
                "format": "{:.1f} M",
            },
            "reads_mapped_after_picard": {
                "title": "Reads mapped (picard)",
                "description": "Number of mapped reads after Picard deduplication",
                "format": "{:.1f} M",
            },
            "reads_before_fastp": {
                "title": "Reads Before Filtering",
                "description": "Total reads before filtering (millions)",
                "format": "{:.1f} M",
            },
        },
        # Plain dict ordered by sample name (yaml.dump is told not to re-sort).
        "data": dict(sorted(data.items()))
    }

    # Make sure the destination directory exists before opening the file.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    with open(output_file, 'w') as f:
        yaml.dump(output, f, default_flow_style=False, sort_keys=False)

    print(f"Generated {output_file} with {len(data)} samples")

In [4]:
# Directories scanned for *.flagstat files; each directory's name becomes the
# <step> part of the reads_mapped_after_<step> metric.
flagstat_dirs = [
    "../results/filter",
    "../results/picard",
]
# Root directory searched recursively for *.json reports.
fastp_dir = "../results/processed"

generate_flagstat_yaml(flagstat_dirs=flagstat_dirs, fastp_dir=fastp_dir)

Generated ../results/multiqc/flagstat_mqc.yaml with 154 samples


In [5]:
# fastp_json["summary"]["before_filtering"]["total_reads"] = total_reads

In [None]:
# Write file_list.txt: one path per line for every result file matching the
# directory -> pattern table below. Paths are relative to the directory
# entered by os.chdir.
# NOTE: os.chdir changes the working directory of the whole kernel, so the
# "../results/..." relative paths used in earlier cells resolve differently
# once this cell has run.
os.chdir("/zata/zippy/ramirezc/atac-smk")

# Results directory -> glob pattern of the files to list from it.
analysis_dir = {
    'results/macs3_callpeak/0.05': '*.xls',
    'results/bowtie2_align': "*.flagstat",
    'results/picard': "*.txt",
    'results/processed': "*.json",
    'results/multiqc': "*_mqc.yaml",
}

with open("file_list.txt", "w") as f:
    for result_dir, pattern in analysis_dir.items():
        # recursive=True only has an effect on patterns containing "**"; none
        # of the patterns above do, so each directory is searched one level deep.
        # NOTE(review): generate_flagstat_yaml reads results/processed with
        # "**/*.json" and takes the sample name from the parent directory, so
        # "*.json" may match nothing here -- confirm the intended pattern.
        matches = sorted(glob.glob(os.path.join(result_dir, pattern), recursive=True))
        for path in matches:
            f.write(path + "\n")