In [1]:
import numpy as np
import polars as pl
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import pysam
import util

In [2]:
# Input locations used throughout the notebook (absolute, machine-specific paths).
# CSV that is joined against read names further down (lhs-id / rhs-id / reason).
NO_MATCH_PATH = "/home/tbrekalo/dev/tb-ram/no-match.csv"
# FASTA file whose reference names are enumerated to build the id -> name table.
READS_PATH = "/storage2/tbrekalo/HG002-simulated/chr19-sample.fasta"
# PAF file handed to util.load_paf_df.
MINIMAP2_OVERLAPS = "/storage2/tbrekalo/HG002-simulated/chr19-sample-minimap2-ovlps.paf"
# CSV handed to util.load_origins_df. The name is spelled "ORIGNS" (sic); it is
# kept as-is because later cells reference it under this exact name.
ORIGNS_PATH = "/storage2/tbrekalo/HG002-simulated/chr19-read-origins.csv"

In [3]:
# Collect every reference (read) name from the FASTA, in file order, and close
# the handle before building the frame.
with pysam.FastaFile(READS_PATH) as fasta:
    read_names = list(fasta.references)

# Two-column lookup table: positional "index" -> "read-name".
# The generated index is cast to Int64 (presumably to match the integer ids it
# is joined against later; confirm against the CSV schema).
df_names = (
    pl.DataFrame({"read-name": read_names})
    .with_row_index()
    .with_columns(pl.col("index").cast(pl.Int64))
)

In [4]:
# Show the index -> read-name lookup table built from the FASTA references.
df_names

index,read-name
i64,str
0,"""S1_12271_chr19…"
1,"""S1_99432_chr19…"
2,"""S1_36937_chr19…"
3,"""S1_159_chr19"""
4,"""S1_94_chr19"""
5,"""S1_84586_chr19…"
6,"""S1_49868_chr19…"
7,"""S1_88719_chr19…"
8,"""S1_91600_chr19…"
9,"""S1_57697_chr19…"


In [8]:
# Per-side lookup tables: the same id -> name mapping, with columns renamed so
# each can be joined on its own id column without name clashes.
lhs_names = df_names.select(
    pl.col("index").alias("lhs-id"),
    pl.col("read-name").alias("lhs-name"),
)
rhs_names = df_names.select(
    pl.col("index").alias("rhs-id"),
    pl.col("read-name").alias("rhs-name"),
)

# Attach read names to both ids of every row in the no-match CSV, then expose
# them as query-name / target-name next to the recorded reason.
# Both joins use the default join strategy, so rows whose id has no entry in
# df_names are dropped.
df_discarded = (
    pl.read_csv(NO_MATCH_PATH)
    .join(lhs_names, on="lhs-id")
    .join(rhs_names, on="rhs-id")
    .select(
        "lhs-id",
        "rhs-id",
        pl.col("lhs-name").alias("query-name"),
        pl.col("rhs-name").alias("target-name"),
        "reason",
    )
)

In [9]:
# Load the two inputs through the project-local util helpers, then combine them.
# NOTE(review): util is not visible here; judging by the names, this annotates
# all-vs-all minimap2 overlaps with read-origin information. Confirm in util.
paf_overlaps = util.load_paf_df(MINIMAP2_OVERLAPS)
read_origins = util.load_origins_df(ORIGNS_PATH)

df_minimap2 = util.create_annotated_overlaps_from_ava(
    df_overlaps=paf_overlaps,
    df_origins=read_origins,
)

In [17]:
# Which label == 1 minimap2 pairs also appear in the discarded (no-match) set?
# NOTE(review): label == 1 presumably marks a true overlap; confirm in util.
# NOTE(review): the join is directional: it matches (query, target) only in
# that order, so a pair recorded with the sides swapped will not match.
(
    df_minimap2
    .filter(pl.col("label") == 1)
    .select("query-name", "target-name")
    .unique()
    # Drop self-pairs before looking them up among the discarded pairs.
    .filter(pl.col("query-name") != pl.col("target-name"))
    .join(df_discarded, on=["query-name", "target-name"])
)

query-name,target-name,lhs-id,rhs-id,reason
str,str,i64,i64,str
"""S1_57697_chr19…","""S1_66532_chr19…",9,729,"""large-distance…"
"""S1_34490_chr19…","""S1_85428_chr19…",113,42,"""empty-chain"""
"""S1_43491_chr19…","""S1_91600_chr19…",126,8,"""empty-chain"""
"""S1_26579_chr19…","""S1_7217_chr19""",116,261,"""empty-chain"""
"""S1_32063_chr19…","""S1_45652_chr19…",155,544,"""empty-chain"""
"""S1_44819_chr19…","""S1_83566_chr19…",259,233,"""empty-chain"""
"""S1_102_chr19""","""S1_90645_chr19…",363,607,"""large-distance…"
"""S1_54417_chr19…","""S1_96902_chr19…",737,209,"""empty-chain"""
