In [1]:
import polars as pl
import pandas as pd
import pysam

import util

In [2]:
# Directory shared by every path below; change it here to point the notebook
# at another copy of the data.
DATA_DIR = "/storage2/tbrekalo/HG002-simulated"

# Sample reads (FASTA).
SAMPLE_PATH = f"{DATA_DIR}/chr19-sample.fasta"

# minimap2 overlaps (PAF) and read origins (CSV).
MINIMAP2_OVLPS_PATH = f"{DATA_DIR}/chr19-sample-ovlps-minimap2.paf"
ORIGINS_PATH = f"{DATA_DIR}/chr19-read-origins.csv"

# Match tables (TSV); the second file name carries a "no-filter" suffix.
MATCHES_PATH = f"{DATA_DIR}/chr19-sample-ram-matches.tsv"
MATCHES_PATH_NO_FILTER = f"{DATA_DIR}/chr19-sample-ram-matches-no-filter.tsv"

# Chain tables (TSV); same naming scheme as the match tables.
CHAINS_PATH = f"{DATA_DIR}/chr19-sample-ram-chains.tsv"
CHAINS_PATH_NO_FILTER = f"{DATA_DIR}/chr19-sample-ram-chains-no-filter.tsv"

In [3]:
# Open the sample FASTA with pysam's FastaFile reader.
# NOTE(review): `reads` is not used by any cell visible in this part of the
# notebook and is never closed here — confirm it is still needed.
reads = pysam.FastaFile(SAMPLE_PATH)

In [4]:
# Load every input table through the project's `util` loaders:
# read origins (CSV), the two chain tables (TSV) and the minimap2 overlaps (PAF).
df_origins = util.load_origins_df(ORIGINS_PATH)
df_chains = util.load_chains_df(CHAINS_PATH)
df_chains_no_filter = util.load_chains_df(CHAINS_PATH_NO_FILTER)
df_overlaps_minimap2 = util.load_paf_df(MINIMAP2_OVLPS_PATH)

In [5]:
def create_chain_annotations(
    df_chains: pl.DataFrame,
    df_origins: pl.DataFrame,
    min_ratio: float = 0.875,
) -> pl.DataFrame:
    """Label every chain according to its origin-overlap score.

    The chains are turned into overlaps, expanded with origin information and
    scored by the project's ``util`` helpers. A chain is labelled positive
    when the resulting ``ratio`` is strictly greater than ``min_ratio`` and
    its ``matching-strands`` column is true.

    Args:
        df_chains: Chains table, e.g. the result of ``util.load_chains_df``.
        df_origins: Read-origins table, e.g. the result of
            ``util.load_origins_df``.
        min_ratio: Exclusive lower bound on ``ratio`` for a positive label.
            Defaults to 0.875, the threshold that used to be hard-coded here.

    Returns:
        A frame with the columns ``chain-id`` and ``label``.
    """
    df_ava = util.create_overlaps_from_chains(df_chains)
    df_ava_with_origins = util.expand_ava_with_origin_info(
        df_ava=df_ava,
        df_origins=df_origins,
    )
    df_scored = util.calc_ava_origin_overlap(df_ava_with_origins)

    # Both conditions must hold: a sufficient ratio and agreeing strands.
    is_positive = (pl.col("ratio") > min_ratio) & pl.col("matching-strands")
    return df_scored.select(
        pl.col("chain-id"),
        is_positive.alias("label"),
    )

In [6]:
# Label the chains loaded from CHAINS_PATH.
df_chains_annotations = create_chain_annotations(
    df_chains=df_chains,
    df_origins=df_origins,
)

In [7]:
# Label the chains loaded from CHAINS_PATH_NO_FILTER.
df_chains_no_filter_annotations = create_chain_annotations(
    df_chains=df_chains_no_filter,
    df_origins=df_origins,
)

In [8]:
# Annotate the minimap2 overlaps against the read origins.
# NOTE(review): in the recorded output further down, this frame's "label"
# column holds 0/1 values while the chain annotations hold true/false —
# confirm the two are meant to be compared directly.
df_overlaps_minimap2_annotated = util.create_annotated_overlaps_from_ava(df_overlaps_minimap2, df_origins)

In [11]:
# Tally the "label" values of each annotated frame, in the order:
# chains, chains without filter, minimap2 overlaps.
for annotated_frame in (
    df_chains_annotations,
    df_chains_no_filter_annotations,
    df_overlaps_minimap2_annotated,
):
    print(annotated_frame.select(pl.col("label").value_counts()))

shape: (2, 1)
┌────────────────┐
│ label          │
│ ---            │
│ struct[2]      │
╞════════════════╡
│ {true,203}     │
│ {false,125920} │
└────────────────┘
shape: (2, 1)
┌─────────────────┐
│ label           │
│ ---             │
│ struct[2]       │
╞═════════════════╡
│ {true,391}      │
│ {false,3252946} │
└─────────────────┘
shape: (2, 1)
┌────────────┐
│ label      │
│ ---        │
│ struct[2]  │
╞════════════╡
│ {0,296419} │
│ {1,11923}  │
└────────────┘


In [None]:
# Attach the per-chain "label" annotation to the overlaps built from
# `df_chains`, matching rows on "chain-id".
df_chains_overlaps = (
    util.create_overlaps_from_chains(df_chains)
    .join(df_chains_annotations, on="chain-id")
)

# Same join for the overlaps built from `df_chains_no_filter`.
df_chains_no_filter_overlaps = (
    util.create_overlaps_from_chains(df_chains_no_filter)
    .join(df_chains_no_filter_annotations, on="chain-id")
)

# NOTE(review): both joins used to be followed by a disabled
# `.filter(pl.col("is-true-positive"))`. The annotation frames joined here
# provide "label", not "is-true-positive", so unless the overlaps frame itself
# carries that column the filter needs updating before it is re-enabled.

In [None]:
# Show the (rows, columns) shape of each overlap table, one per line.
print(
    df_overlaps_minimap2.shape,
    df_chains_overlaps.shape,
    df_chains_no_filter_overlaps.shape,
    sep="\n",
)

In [None]:
def project_chain_ovlps(df: pl.DataFrame) -> pl.DataFrame:
    """Project overlaps onto their coordinate columns in a fixed row order.

    Args:
        df: Overlaps frame containing at least the columns ``query-name``,
            ``query-start``, ``query-end``, ``strand``, ``target-name``,
            ``target-start`` and ``target-end``.

    Returns:
        A frame with exactly those seven columns (``chain-id`` is not part of
        the projection), sorted by all of them in the listed order.
    """
    columns = [
        "query-name",
        "query-start",
        "query-end",
        "strand",
        "target-name",
        "target-start",
        "target-end",
    ]
    # Sort on every projected column. The previous `columns[1:]` slice left
    # "query-name" out of the sort key, so rows differing only in query name
    # had no defined relative order.
    return df.select(columns).sort(by=columns)

In [None]:
# Print the projected overlaps of `df_chains_overlaps` as CSV text
# (`write_csv()` called without a target returns the CSV as a string).
# NOTE(review): this emits the entire frame into the cell output.
print(project_chain_ovlps(df_chains_overlaps).write_csv())

In [None]:
# Print the projected overlaps of `df_chains_no_filter_overlaps` as CSV text
# (`write_csv()` called without a target returns the CSV as a string).
# NOTE(review): this emits the entire frame into the cell output.
print(project_chain_ovlps(df_chains_no_filter_overlaps).write_csv())