In [10]:
import numpy as np
import pandas as pd

def score_hit_naive_bayes(
    df: pd.DataFrame,
    feature_likelihoods: "dict | None" = None,
    prior_direct: float = 0.05,
    pseudocount: float = 1e-6,
    return_all: bool = True
) -> "pd.DataFrame | pd.Series":
    """
    Score each motif hit with a Naive Bayes model using dynamic logFC discretization.

    logFC is binned into "up" / "neutral" / "down" using the 0.33 and 0.66
    quantiles of the non-missing logFC values of ``df`` itself (fallback
    thresholds -1.0 / 1.0 when every logFC is missing).  For every feature the
    likelihood under the "not direct" class is taken as
    ``1 - P(value | direct)``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``logFC`` column.  Every other feature named in
        ``feature_likelihoods`` (by default M_prom, M_chip, M_atac) is read
        from the column of the same name; an absent column or a missing value
        is scored as 0 (or "neutral" for ``logFC_cat``).  The input frame is
        not modified.
    feature_likelihoods : dict, optional
        Format: {feature_name: {value: P(value | direct)}}.  The key
        ``"logFC_cat"`` refers to the discretized logFC column.  When None, a
        built-in default table is used.
    prior_direct : float
        Prior probability that a motif hit is regulatory; must lie in [0, 1].
    pseudocount : float
        Likelihood assigned to a feature value that has no entry in
        ``feature_likelihoods``; also added to the normalising denominator so
        the final division cannot be 0 / 0.
    return_all : bool
        If True, return a copy of ``df`` with added 'logFC_cat' and
        'P_regulatory' columns.  If False, return only the scores.

    Returns
    -------
    pd.DataFrame or pd.Series
        DataFrame when ``return_all`` is True, otherwise a Series named
        'P_regulatory' indexed like ``df``.

    Raises
    ------
    ValueError
        If ``prior_direct`` is not within [0, 1].
    KeyError
        If ``df`` has no ``logFC`` column.
    """
    # A prior outside [0, 1] (or NaN) would silently produce meaningless scores.
    if not 0.0 <= prior_direct <= 1.0:
        raise ValueError(
            f"prior_direct must be a probability in [0, 1], got {prior_direct!r}"
        )

    df = df.copy()

    # Compute dynamic thresholds based on logFC quantiles
    valid_logfc = df["logFC"].dropna()
    if valid_logfc.empty:
        q_low, q_high = -1.0, 1.0  # fallback
    else:
        q_low, q_high = valid_logfc.quantile([0.33, 0.66])

    def discretize_logfc(logfc):
        if pd.isna(logfc):
            return "neutral"
        # "up" is tested first so it wins when q_low == q_high.
        if logfc >= q_high:
            return "up"
        if logfc <= q_low:
            return "down"
        return "neutral"

    df["logFC_cat"] = df["logFC"].apply(discretize_logfc)

    # Default likelihoods if not provided
    if feature_likelihoods is None:
        feature_likelihoods = {
            "M_prom": {1: 0.75, 0: 0.25},
            "M_chip": {1: 0.90, 0: 0.10},
            "M_atac": {1: 0.85, 0: 0.30},
            "logFC_cat": {
                "up": 0.80,
                "neutral": 0.30,
                "down": 0.05
            }
        }

    # Running products, one entry per row: prior * prod(p) and (1 - prior) * prod(1 - p).
    num = pd.Series(prior_direct, index=df.index, dtype=float)
    denom = pd.Series(1 - prior_direct, index=df.index, dtype=float)

    # One pass per feature column instead of building a Series for every row.
    for feat, probs in feature_likelihoods.items():
        missing_default = "neutral" if feat == "logFC_cat" else 0
        if feat in df.columns:
            values = df[feat].tolist()
        else:
            # Absent column: every row is treated as a missing value.
            values = [None] * len(df)
        likelihood = pd.Series(
            [
                probs.get(missing_default if pd.isna(v) else v, pseudocount)
                for v in values
            ],
            index=df.index,
            dtype=float,
        )
        num = num * likelihood
        denom = denom * (1 - likelihood)

    scores = num / (num + denom + pseudocount)
    scores.name = "P_regulatory"

    return df.assign(P_regulatory=scores) if return_all else scores


In [11]:
# Load the motif-hit table and attach a P_regulatory score to every hit.
scored_df = pd.read_csv("../scored_df.csv", sep=",")
ranked_df = score_hit_naive_bayes(scored_df)

# Order hits from highest to lowest score, then keep the report columns
# for the first 50 rows.
report_columns = ["Peak_ID", "Motif", "P_regulatory", "M_prom", "M_chip", "M_atac", "logFC"]
ranked_by_score = ranked_df.sort_values("P_regulatory", ascending=False)
top_hits = ranked_by_score[report_columns].head(50)

# Write the shortlist as a tab-separated file without the index column.
top_hits.to_csv("top_predicted_regulatory_hits.tsv", sep="\t", index=False)


In [11]:
# 1) Read the gene list; header=None makes the first row data, and the
#    single column is then named "gene"
import numpy as np
import pandas as pd

df = pd.read_csv("../creba_noChange_genes.csv", header=None)
df.columns = ["gene"]

# 2) Drop duplicates while preserving first-seen order
#    (dict keys are unique and keep insertion order)
unique = list(dict.fromkeys(df["gene"]))

# 3) Generate dummy logFC values evenly spaced from -0.5 to 0
n = len(unique)
if n == 0:
    raise ValueError("No genes found after deduplication.")
logfc_values = np.linspace(-0.5, 0, n)


# 4) Build output DataFrame and write it without the index column
out = pd.DataFrame({
    "gene": unique,
    "logFC": logfc_values
})
out.to_csv("creba_noChange_genes_final.csv", index=False)
