In [11]:
from pysam import AlignmentFile
from glob import glob
from tqdm import tqdm

# Human genome alignment sanity check
- Reads were aligned to GRCh38 (human reference genome) using BWA-MEM
- After alignment, reads that mapped to human genome were removed
- Small percentage of microbial genes was left 
- Recalculation is done for each file 

In [16]:
def calculate_reads(bam):
    total_reads = 0
    unmapped_reads = 0
    with AlignmentFile(bam, "rb") as f:
        for read in f:
            total_reads += 1
            if read.is_unmapped:
                unmapped_reads += 1
    return total_reads, unmapped_reads

In [17]:
aligned_files = glob("/nfs/cds-peta/exports/biol_micro_cds_gr_sunagawa/scratch/vbezshapkin/sinuses_2023/data/metagenomics/processed/host_removed/*.bam")

In [18]:
print("<File>: <non-human reads>/<total reads>")

for file in sorted(aligned_files):
    total_reads, unmapped_reads = calculate_reads(file)
    file_id = file.split("/")[-1].split("_")[0]

    print(f"{file_id}: {unmapped_reads}/{total_reads} ({unmapped_reads/total_reads*100:.2f}%)")

<File>: <non-human reads>/<total reads>
DROKAM: 6723/19980195 (0.03%)
GOSTOM: 6960/23616162 (0.03%)
OCHDOR: 10300/21074568 (0.05%)
SALDOM: 4688/18118004 (0.03%)
SZAHEN: 7680/21998446 (0.03%)
SZAJOA: 146057/21972377 (0.66%)
SZAMAC: 24197/19929515 (0.12%)
TADMAL: 47165/21854193 (0.22%)
Undetermined: 10684815/15019603 (71.14%)
WOJFRA: 14756/20204406 (0.07%)
WROBEA: 10178/22839758 (0.04%)
