# -*- coding: utf-8 -*-
__author__ = 'Jiahui Zhang'

In [2]:
import os
import pandas as pd

# === 1. Paths and file names ===
base_dir = "/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP"

files = {
    "ADHD": "ADHD_SNP.tsv",
    "ALZ": "ALZ_SNP.tsv",
    "ASD": "ASD_SNP.tsv",
    "EPILEPSY": "EPILE_SNP.tsv",
    "MEMORY": "MP_SNP.tsv",
}

# === 2. Load each table and collect its distinct SNP identifiers ===
snp_sets = {}
counts = {}
dfs = {}  # the DataFrames are kept because the cleaned files are written from them later

for label, tsv_name in files.items():
    table = pd.read_csv(os.path.join(base_dir, tsv_name), sep="\t")
    # SNPS is not always parsed as text, so coerce it to str in place
    table["SNPS"] = table["SNPS"].astype(str)

    dfs[label] = table
    snp_sets[label] = set(table["SNPS"])
    counts[label] = len(snp_sets[label])

# === 3. Report the number of distinct SNPs per file ===
print("===== 每个文件中不同 SNP 的数量 =====")
for label in files:
    print(f"{label}: {counts[label]} SNPs")

# === 4. Overlap between the ALZ SNP set and each of the other four sets ===
alz_snps = snp_sets["ALZ"]

print("\n===== ALZ 与其他疾病 SNP 的重合情况 =====")
for label in files:
    if label != "ALZ":
        other = snp_sets[label]
        shared = len(alz_snps & other)

        # Percentages (scaled by 100, printed with two decimals); empty sets give 0.0
        if len(alz_snps) > 0:
            share_of_alz = shared / len(alz_snps) * 100
        else:
            share_of_alz = 0.0
        if len(other) > 0:
            share_of_other = shared / len(other) * 100
        else:
            share_of_other = 0.0

        print(f"\nALZ vs {label}:")
        print(f"  重合 SNP 数: {shared}")
        print(f"  占 ALZ 的比例: {share_of_alz:.2f}%")
        print(f"  占 {label} 的比例: {share_of_other:.2f}%")



===== 每个文件中不同 SNP 的数量 =====
ADHD: 2317 SNPs
ALZ: 259 SNPs
ASD: 1260 SNPs
EPILEPSY: 330 SNPs
MEMORY: 4034 SNPs

===== ALZ 与其他疾病 SNP 的重合情况 =====

ALZ vs ADHD:
  重合 SNP 数: 0
  占 ALZ 的比例: 0.00%
  占 ADHD 的比例: 0.00%

ALZ vs ASD:
  重合 SNP 数: 0
  占 ALZ 的比例: 0.00%
  占 ASD 的比例: 0.00%

ALZ vs EPILEPSY:
  重合 SNP 数: 0
  占 ALZ 的比例: 0.00%
  占 EPILEPSY 的比例: 0.00%

ALZ vs MEMORY:
  重合 SNP 数: 5
  占 ALZ 的比例: 1.93%
  占 MEMORY 的比例: 0.12%


In [3]:
# === 5. Write "cleaned" files with every SNP shared with ALZ removed ===
for name, original_fname in files.items():
    if name != "ALZ":  # the ALZ table itself is left untouched
        table = dfs[name].copy()
        table["SNPS"] = table["SNPS"].astype(str)

        # SNPs this table has in common with ALZ
        overlap_snps = alz_snps & set(table["SNPS"])

        # Keep only the rows whose SNP is not shared
        is_shared = table["SNPS"].isin(overlap_snps)
        cleaned_df = table[~is_shared]

        # Output name: the original file name with a _cleaned suffix before the extension
        root, ext = os.path.splitext(original_fname)
        cleaned_fname = f"{root}_cleaned{ext}"
        cleaned_df.to_csv(os.path.join(base_dir, cleaned_fname), sep="\t", index=False)

        remaining = len(set(cleaned_df["SNPS"]))
        print(f"\n{name}: 已保存 cleaned 文件 -> {cleaned_fname}")
        print(f"  原始 SNP 数: {counts[name]}")
        print(f"  删除重合 SNP 数: {len(overlap_snps)}")
        print(f"  cleaned 剩余 SNP 数: {remaining}")



ADHD: 已保存 cleaned 文件 -> ADHD_SNP_cleaned.tsv
  原始 SNP 数: 2317
  删除重合 SNP 数: 0
  cleaned 剩余 SNP 数: 2317

ASD: 已保存 cleaned 文件 -> ASD_SNP_cleaned.tsv
  原始 SNP 数: 1260
  删除重合 SNP 数: 0
  cleaned 剩余 SNP 数: 1260

EPILEPSY: 已保存 cleaned 文件 -> EPILE_SNP_cleaned.tsv
  原始 SNP 数: 330
  删除重合 SNP 数: 0
  cleaned 剩余 SNP 数: 330

MEMORY: 已保存 cleaned 文件 -> MP_SNP_cleaned.tsv
  原始 SNP 数: 4034
  删除重合 SNP 数: 5
  cleaned 剩余 SNP 数: 4029


In [4]:
import os
import pandas as pd

# ===== Path settings =====
base_root = "/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data"

prepared_dir = os.path.join(base_root, "Prepared_data4")
related_snp_dir = os.path.join(base_root, "Related_SNP")

# SNP list that goes with C (non-exonic SNPs after cleaning)
nonexon_markers_path = os.path.join(prepared_dir, "nonexonMarkers_after_clean.txt")

# The four cleaned files
cleaned_files = {
    "ADHD":  "ADHD_SNP_cleaned.tsv",
    "ASD":   "ASD_SNP_cleaned.tsv",
    "EPILE": "EPILE_SNP_cleaned.tsv",
    "MEMORY": "MP_SNP_cleaned.tsv",
}

# ===== 1. Read the SNP list that goes with C =====
c_snps = pd.read_csv(nonexon_markers_path, header=None)[0].astype(str)
c_snp_set = set(c_snps)
print(f"C 中 SNP 总数（nonexonMarkers_after_clean）：{len(c_snp_set)}")

# ===== 2. Intersect with each of the four cleaned files in turn =====
print("\n===== C 的 SNP 与四个 cleaned GWAS 文件的重合情况 =====")
for name, fname in cleaned_files.items():
    table = pd.read_csv(os.path.join(related_snp_dir, fname), sep="\t")

    # The SNPS column must be present; it is compared as strings
    if "SNPS" not in table.columns:
        raise KeyError(f"{fname} 中没有 'SNPS' 这一列，请检查列名。")

    file_snps = set(table["SNPS"].astype(str))
    n_shared = len(c_snp_set & file_snps)

    # Optional ratio statistics; an empty denominator yields 0.0
    if len(c_snp_set) > 0:
        share_of_c = n_shared / len(c_snp_set) * 100
    else:
        share_of_c = 0.0
    if len(file_snps) > 0:
        share_of_file = n_shared / len(file_snps) * 100
    else:
        share_of_file = 0.0

    print(f"\n{name}:")
    print(f"  {fname} 中 SNP 总数: {len(file_snps)}")
    print(f"  与 C 重合 SNP 数: {n_shared}")
    print(f"  占 C 中 SNP 比例: {share_of_c:.4f}%")
    print(f"  占 {name} 文件 SNP 比例: {share_of_file:.4f}%")


C 中 SNP 总数（nonexonMarkers_after_clean）：71278

===== C 的 SNP 与四个 cleaned GWAS 文件的重合情况 =====

ADHD:
  ADHD_SNP_cleaned.tsv 中 SNP 总数: 2317
  与 C 重合 SNP 数: 56
  占 C 中 SNP 比例: 0.0786%
  占 ADHD 文件 SNP 比例: 2.4169%

ASD:
  ASD_SNP_cleaned.tsv 中 SNP 总数: 1260
  与 C 重合 SNP 数: 49
  占 C 中 SNP 比例: 0.0687%
  占 ASD 文件 SNP 比例: 3.8889%

EPILE:
  EPILE_SNP_cleaned.tsv 中 SNP 总数: 330
  与 C 重合 SNP 数: 13
  占 C 中 SNP 比例: 0.0182%
  占 EPILE 文件 SNP 比例: 3.9394%

MEMORY:
  MP_SNP_cleaned.tsv 中 SNP 总数: 4029
  与 C 重合 SNP 数: 39
  占 C 中 SNP 比例: 0.0547%
  占 MEMORY 文件 SNP 比例: 0.9680%


In [5]:
# -*- coding: utf-8 -*-
"""
使用 Prepared_data4 中的 C.npy + ids.txt + figure3_groups.csv，
对四个 cleaned 疾病 SNP 集合中与 C 重合的 SNP 做两组 t 检验（AD 组1 vs 组2）。

结果输出到：
  /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP
"""

from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

# ------------ Path settings ------------
ROOT = Path("/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data")
PREP = ROOT / "Prepared_data4"
SNP_DIR = ROOT / "Related_SNP"

C_PATH = PREP / "C.npy"
IDS_PATH = PREP / "ids.txt"
NONEXON_MARKERS_PATH = PREP / "nonexonMarkers_after_clean.txt"
GROUPS_PATH = PREP / "figure3_groups.csv"   # AD-only grouping produced by the Fig3 script

# The four cleaned GWAS SNP files
CLEANED_FILES = {
    "ADHD":   SNP_DIR / "ADHD_SNP_cleaned.tsv",
    "ASD":    SNP_DIR / "ASD_SNP_cleaned.tsv",
    "EPILE":  SNP_DIR / "EPILE_SNP_cleaned.tsv",
    "MEMORY": SNP_DIR / "MP_SNP_cleaned.tsv",
}

# ------------ 1. Load C, ids, and the SNP column order ------------
print("[INFO] Loading C, ids, and SNP markers ...")
C = np.load(C_PATH)                           # shape: (n_subjects, n_C_snps)
ids = np.loadtxt(IDS_PATH, dtype=str)        # length: n_subjects
assert C.shape[0] == len(ids), "C 的行数与 ids 数量不一致。"

# SNP name of each column of C (non-exonic markers); one name per line of the file
nonexon_snps = pd.read_csv(NONEXON_MARKERS_PATH, header=None)[0].astype(str).to_numpy()
assert C.shape[1] == len(nonexon_snps), "C 的列数与 nonexonMarkers_after_clean.txt 长度不一致。"

# SNP name -> column index in C (a repeated name keeps its last index)
snp_to_col = {snp: idx for idx, snp in enumerate(nonexon_snps)}

# ------------ 2. Load the AD grouping (figure3_groups.csv) ------------
print("[INFO] Loading AD groups from figure3_groups.csv ...")
grp_df = pd.read_csv(GROUPS_PATH)

# figure3_groups.csv is expected to have the columns "id" and "cluster"
if "id" not in grp_df.columns or "cluster" not in grp_df.columns:
    raise ValueError("figure3_groups.csv 应包含 'id' 和 'cluster' 两列。")

grp_df = grp_df[["id", "cluster"]].copy()
grp_df["id"] = grp_df["id"].astype(str)

# Map the two smallest cluster labels to {1, 2}, to stay consistent with the Fig4 script
# NOTE(review): any cluster label beyond the first two is absent from cluster_map, so
# .map() leaves NaN for it and the later to_numpy(int) would presumably fail — confirm
# the grouping file only ever contains two clusters.
uniq_clusters = sorted(pd.unique(grp_df["cluster"].astype(int)))
if len(uniq_clusters) < 2:
    raise ValueError("分组文件中只有一个簇，无法做两组比较。")
cluster_map = {uniq_clusters[0]: 1, uniq_clusters[1]: 2}
grp_df["group"] = grp_df["cluster"].astype(int).map(cluster_map)

# Keep only the grouped subjects whose id also appears in ids
id_to_idx = {sid: i for i, sid in enumerate(ids)}
grp_df = grp_df[grp_df["id"].isin(id_to_idx.keys())].copy()

# Row indices into C and the matching group labels, in grp_df order
ad_indices = np.array([id_to_idx[sid] for sid in grp_df["id"]], dtype=int)
groups = grp_df["group"].to_numpy(int)
assert ad_indices.shape[0] == groups.shape[0], "AD index 与 group 长度不一致。"

# Restrict C to the AD patient subset
C_ad = C[ad_indices, :]     # shape: (n_AD, n_C_snps)

print(f"[INFO] AD 病人数: {C_ad.shape[0]}, C 中 SNP 数: {C_ad.shape[1]}")

# ------------ 3. t-test the overlap SNPs of each of the four cleaned GWAS sets ------------
# NOTE(review): if the clusters in figure3_groups.csv were themselves derived from C,
# these group comparisons are not independent of the grouping — confirm how it was built.
for disease, path in CLEANED_FILES.items():
    print(f"\n[INFO] Processing disease: {disease}")
    df = pd.read_csv(path, sep="\t")

    if "SNPS" not in df.columns:
        raise KeyError(f"{path.name} 中缺少 'SNPS' 列。")

    snps_d = df["SNPS"].astype(str)
    snp_set_d = set(snps_d)

    # Overlap with the SNPs of C, sorted by name for a stable processing order
    overlap_snps = sorted(snp_set_d.intersection(snp_to_col.keys()))
    print(f"[INFO] {disease}: cleaned SNP 数={len(snp_set_d)}, 与 C 重合 SNP 数={len(overlap_snps)}")

    results = []

    for snp in overlap_snps:
        col_idx = snp_to_col.get(snp, None)
        if col_idx is None:
            continue  # should never happen, since overlap_snps comes from snp_to_col

        x = C_ad[:, col_idx].astype(float)

        g1 = x[groups == 1]
        g2 = x[groups == 2]

        # Safety check: each group needs at least two samples
        if len(g1) < 2 or len(g2) < 2:
            t_stat, p_val = np.nan, np.nan
        else:
            # Welch's t-test (unequal variances); NaN values are omitted
            t_stat, p_val = ttest_ind(g1, g2, equal_var=False, nan_policy="omit")

        # NOTE(review): np.mean propagates NaN while the t-test above omits it, and the
        # n_group counts include NaN entries — confirm C has no missing values.
        # Neither branch above produces None, so the `is not None` guards always pass.
        results.append({
            "SNP": snp,
            "C_col_index": col_idx,
            "n_group1": len(g1),
            "n_group2": len(g2),
            "mean_group1": float(np.mean(g1)) if len(g1) > 0 else np.nan,
            "mean_group2": float(np.mean(g2)) if len(g2) > 0 else np.nan,
            "t_stat": float(t_stat) if t_stat is not None else np.nan,
            "p_value": float(p_val) if p_val is not None else np.nan,
        })

    if not results:
        print(f"[WARN] {disease}: 没有与 C 重合的 SNP，跳过保存。")
        continue

    res_df = pd.DataFrame(results)
    # Sort by p-value (NaN last) to make later filtering easier
    res_df = res_df.sort_values("p_value", na_position="last")

    out_path = SNP_DIR / f"{disease}_C_overlap_ttest_results.tsv"
    res_df.to_csv(out_path, sep="\t", index=False)
    print(f"[INFO] {disease}: 结果已保存到 {out_path}")


[INFO] Loading C, ids, and SNP markers ...
[INFO] Loading AD groups from figure3_groups.csv ...
[INFO] AD 病人数: 251, C 中 SNP 数: 71278

[INFO] Processing disease: ADHD
[INFO] ADHD: cleaned SNP 数=2317, 与 C 重合 SNP 数=56
[INFO] ADHD: 结果已保存到 /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/ADHD_C_overlap_ttest_results.tsv

[INFO] Processing disease: ASD
[INFO] ASD: cleaned SNP 数=1260, 与 C 重合 SNP 数=49
[INFO] ASD: 结果已保存到 /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/ASD_C_overlap_ttest_results.tsv

[INFO] Processing disease: EPILE
[INFO] EPILE: cleaned SNP 数=330, 与 C 重合 SNP 数=13
[INFO] EPILE: 结果已保存到 /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/EPILE_C_overlap_ttest_results.tsv

[INFO] Processing disease: MEMORY
[INFO] MEMORY: cleaned SNP 数=4029, 与 C 重合 SNP 数=39
[INFO] MEMORY: 结果已保存到 /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/MEMORY_C_overlap_ttest_results.tsv


  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)
  res = hypotest_fun_out(*samples, **kwds)


In [6]:
# -*- coding: utf-8 -*-
"""
从四个 *_C_overlap_ttest_results.tsv 中筛选 p_value < 0.1 的 SNP，
并用 Prepared_data4/markers.tsv + gencode.v19.annotation.gtf 做基因注释。

输出：在 Related_SNP 目录下生成 4 个 CSV 文件，比如：
  ADHD_C_overlap_p0.1_annotated.csv
  ASD_C_overlap_p0.1_annotated.csv
  EPILE_C_overlap_p0.1_annotated.csv
  MEMORY_C_overlap_p0.1_annotated.csv
"""

from pathlib import Path
import pandas as pd
import numpy as np
import pyranges as pr

# ===== Paths =====
BASE = Path("/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data")
PREP = BASE.joinpath("Prepared_data4")
SNP_DIR = BASE.joinpath("Related_SNP")

GTF_PATH = BASE.joinpath("gencode.v19.annotation.gtf")
MARKERS_TSV = PREP.joinpath("markers.tsv")

# One t-test result file per disease, all named <disease>_C_overlap_ttest_results.tsv
TTEST_FILES = {
    label: SNP_DIR / f"{label}_C_overlap_ttest_results.tsv"
    for label in ("ADHD", "ASD", "EPILE", "MEMORY")
}

# ===== 公用小工具 =====
def ensure_chr_prefix(s):
    """Return ``str(s)`` with a leading "chr" added unless one is already present.

    The check for an existing prefix is case-insensitive; an existing prefix
    is kept exactly as written.
    """
    label = str(s)
    if label.lower().startswith("chr"):
        return label
    return "chr" + label

def pick_col(cols, cands):
    """Return the first candidate name found in ``cols``, or None.

    An exact match of any candidate wins over a case-insensitive match.
    If no candidate matches exactly, candidates are compared in lower case
    and the original column spelling is returned; when several columns share
    a lower-case form, the last one in ``cols`` is used.
    """
    exact = next((cand for cand in cands if cand in cols), None)
    if exact is not None:
        return exact

    by_lower = {}
    for col in cols:
        by_lower[col.lower()] = col

    for cand in cands:
        key = cand.lower()
        if key in by_lower:
            return by_lower[key]
    return None

def agg_join(series):
    """Join the distinct non-missing values of ``series`` with ";".

    Values are converted to str and kept in order of first appearance.
    A series with no non-missing values gives an empty string.
    """
    distinct = series.dropna().astype(str).unique().tolist()
    if not distinct:
        return ""
    return ";".join(distinct)

# ===== Pre-load markers.tsv and the GTF =====
print("[INFO] 读取 markers.tsv ...")
need_cols = ["Name", "Chromosome", "Start", "End"]
m = pd.read_csv(MARKERS_TSV, sep="\t")
missing = [col for col in need_cols if col not in m.columns]
if missing:
    raise ValueError(f"{MARKERS_TSV} 缺少列: {missing}")
# Keep only the coordinate columns, with the marker name as text
m = m[need_cols].astype({"Name": str})

print("[INFO] 读取 GTF ...")
gtf = pr.read_gtf(str(GTF_PATH))
# The feature column is looked up under either spelling
feat_col = next(
    (candidate for candidate in ("feature", "Feature") if candidate in gtf.df.columns),
    None,
)
if feat_col is None:
    raise KeyError(f"GTF 缺少 feature 列。实际列: {list(gtf.df.columns)}")
genes = gtf[gtf.df[feat_col] == "gene"]

# Second name for the same gene ranges, used by the nearest-gene lookup later
gr_genes = genes

# ===== Main loop: process each of the four diseases separately =====
for disease, ttest_path in TTEST_FILES.items():
    print(f"\n[INFO] 处理疾病: {disease}")
    if not ttest_path.exists():
        print(f"[WARN] {ttest_path} 不存在，跳过。")
        continue

    res = pd.read_csv(ttest_path, sep="\t")
    # The result table must have the "SNP" and "p_value" columns
    if "SNP" not in res.columns or "p_value" not in res.columns:
        raise ValueError(f"{ttest_path.name} 中必须包含 'SNP' 和 'p_value' 列。")

    # Keep rows with p_value < 0.1 (rows with a NaN p-value fail the comparison and drop out)
    sig = res[res["p_value"] < 0.1].copy()
    sig["SNP"] = sig["SNP"].astype(str)
    if sig.empty:
        print(f"[INFO] {disease}: p_value < 0.1 的 SNP 为空，将仍输出一个只有表头的文件。")
        # Write a header-only file so the output always exists
        out_empty = SNP_DIR / f"{disease}_C_overlap_p0.1_annotated.csv"
        sig.to_csv(out_empty, index=False)
        continue

    sig_snps = sig["SNP"].tolist()
    print(f"[INFO] {disease}: p < 0.1 的 SNP 数量 = {len(sig_snps)}")

    # ===== Look up the coordinates of these SNPs in markers.tsv =====
    m_sub = m[m["Name"].isin(sig_snps)].copy()
    if m_sub.empty:
        print(f"[WARN] {disease}: 在 markers.tsv 中找不到这些 SNP 的坐标，将仅输出 t-test 结果。")
        out_path = SNP_DIR / f"{disease}_C_overlap_p0.1_annotated.csv"
        sig.to_csv(out_path, index=False)
        continue

    # Reorder the marker rows to follow the order of sig_snps
    order_map = {s: i for i, s in enumerate(sig_snps)}
    m_sub["__order__"] = m_sub["Name"].map(order_map)
    m_sub = m_sub.sort_values("__order__").drop(columns="__order__")

    # Build the PyRanges input (chromosome names get a "chr" prefix if they lack one)
    snps_df = pd.DataFrame({
        "Chromosome": m_sub["Chromosome"].astype(str).map(ensure_chr_prefix),
        "Start": m_sub["Start"].astype(int),
        "End": m_sub["End"].astype(int),
        "SNP": m_sub["Name"].astype(str),
    })
    gr_snps = pr.PyRanges(snps_df)

    # ===== Overlap: SNPs that fall inside a gene interval =====
    ovl = gr_snps.join(genes).df

    # Locate the gene-related columns under any of their known spellings
    gene_id_col   = pick_col(ovl.columns, ["gene_id", "Gene_id", "geneID"])
    gene_name_col = pick_col(ovl.columns, ["gene_name", "Gene_name", "gene", "gene_symbol"])
    gene_type_col = pick_col(ovl.columns, ["gene_type", "Gene_type", "gene_biotype", "Gene_biotype", "biotype"])

    if len(ovl) > 0:
        # One row per SNP; all genes hit by the same SNP are joined with ";"
        # NOTE(review): when a gene column is not found its dict key below is None, and
        # several missing columns would collapse onto that single key — confirm this
        # path cannot occur with the GTF in use.
        ovl_grouped = (ovl
            .groupby("SNP", as_index=False)
            .agg({
                "Chromosome": "first",
                "Start":      "first",
                gene_id_col:   agg_join if gene_id_col   else (lambda s: ""),
                gene_name_col: agg_join if gene_name_col else (lambda s: ""),
                gene_type_col: agg_join if gene_type_col else (lambda s: ""),
            })
        )
    else:
        # No SNP overlapped any gene
        ovl_grouped = pd.DataFrame(columns=["SNP","Chromosome","Start"])

    # Rename to the standard output column names
    col_map = {}
    if gene_id_col:   col_map[gene_id_col]   = "Gene ID"
    if gene_name_col: col_map[gene_name_col] = "Gene Name"
    if gene_type_col: col_map[gene_type_col] = "Gene Type"
    ovl_grouped = ovl_grouped.rename(columns=col_map)
    ovl_grouped = ovl_grouped.rename(columns={"Start": "Location"})

    # Base table: every significant SNP with coordinates, even without a gene overlap
    # NOTE(review): the column selection below requires all three renamed gene columns
    # to exist in ovl_grouped whenever it is non-empty — confirm.
    base = snps_df.rename(columns={"Start": "Location"})[["SNP", "Chromosome", "Location"]]
    annot = base.merge(
        ovl_grouped[["SNP","Gene ID","Gene Name","Gene Type"]] if len(ovl_grouped) else base.assign(**{"Gene ID":np.nan,"Gene Name":np.nan,"Gene Type":np.nan}),
        on="SNP", how="left"
    )

    # ===== For SNPs with no gene annotation at all, fill in the nearest gene =====
    missing_mask = annot["Gene ID"].isna() & annot["Gene Name"].isna() & annot["Gene Type"].isna()
    if missing_mask.any():
        print(f"[INFO] {disease}: 有 {missing_mask.sum()} 个 SNP 未 overlap 到 gene，使用 nearest 填充。")
        # Select the unannotated rows of snps_df by position with the boolean mask;
        # this relies on annot having the same row order as snps_df
        gr_snps_missing = pr.PyRanges(snps_df[missing_mask.values])
        nearest_df = gr_snps_missing.nearest(gr_genes).df

        gid = pick_col(nearest_df.columns, ["gene_id", "Gene_id", "geneID"])
        gna = pick_col(nearest_df.columns, ["gene_name", "Gene_name", "gene", "gene_symbol"])
        gty = pick_col(nearest_df.columns, ["gene_type", "Gene_type", "gene_biotype", "Gene_biotype", "biotype"])

        nearest_slim = pd.DataFrame({
            "SNP": nearest_df["SNP"].astype(str),
            "Gene ID": nearest_df[gid].astype(str) if gid else "",
            "Gene Name": nearest_df[gna].astype(str) if gna else "",
            "Gene Type": nearest_df[gty].astype(str) if gty else "",
        })

        # Bring the nearest-gene values in as *_nearest columns, copy them into the
        # empty cells, then drop the helper columns
        annot = annot.merge(nearest_slim, on="SNP", how="left", suffixes=("", "_nearest"))
        for k in ["Gene ID","Gene Name","Gene Type"]:
            fill_mask = annot[k].isna() | (annot[k].astype(str).str.strip()=="")
            # NOTE(review): the *_nearest column is read here before the existence
            # check on the next statement, so that check cannot guard this read.
            annot.loc[fill_mask, k] = annot.loc[fill_mask, k + "_nearest"]
            if k + "_nearest" in annot.columns:
                annot.drop(columns=[k + "_nearest"], inplace=True)

    # ===== Merge the t-test results in =====
    sig_for_merge = sig.copy()
    sig_for_merge["SNP"] = sig_for_merge["SNP"].astype(str)

    final = annot.merge(sig_for_merge, on="SNP", how="left")

    # Sort by p_value (NaN last) for easier reading
    if "p_value" in final.columns:
        final = final.sort_values("p_value", na_position="last")

    # Write the annotated table
    out_path = SNP_DIR / f"{disease}_C_overlap_p0.1_annotated.csv"
    final.to_csv(out_path, index=False)
    print(f"[DONE] {disease}: 已输出 {out_path}")


[INFO] 读取 markers.tsv ...
[INFO] 读取 GTF ...

[INFO] 处理疾病: ADHD
[INFO] ADHD: p < 0.1 的 SNP 数量 = 48
[INFO] ADHD: 有 22 个 SNP 未 overlap 到 gene，使用 nearest 填充。


join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.
join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.
join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.


[DONE] ADHD: 已输出 /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/ADHD_C_overlap_p0.1_annotated.csv

[INFO] 处理疾病: ASD
[INFO] ASD: p < 0.1 的 SNP 数量 = 42
[INFO] ASD: 有 19 个 SNP 未 overlap 到 gene，使用 nearest 填充。
[DONE] ASD: 已输出 /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/ASD_C_overlap_p0.1_annotated.csv

[INFO] 处理疾病: EPILE
[INFO] EPILE: p < 0.1 的 SNP 数量 = 13
[INFO] EPILE: 有 3 个 SNP 未 overlap 到 gene，使用 nearest 填充。
[DONE] EPILE: 已输出 /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/EPILE_C_overlap_p0.1_annotated.csv

[INFO] 处理疾病: MEMORY
[INFO] MEMORY: p < 0.1 的 SNP 数量 = 32
[INFO] MEMORY: 有 18 个 SNP 未 overlap 到 gene，使用 nearest 填充。
[DONE] MEMORY: 已输出 /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/MEMORY_C_overlap_p0.1_annotated.csv


join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.
join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.
join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.
join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.
join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.


In [7]:
# -*- coding: utf-8 -*-
from pathlib import Path
import pandas as pd

# ===== Path settings =====
BASE = Path("/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data")
SNP_DIR = BASE / "Related_SNP"

# The four annotated files produced by the previous step
ANNOT_FILES = {
    "ADHD":   SNP_DIR / "ADHD_C_overlap_p0.1_annotated.csv",
    "ASD":    SNP_DIR / "ASD_C_overlap_p0.1_annotated.csv",
    "EPILE":  SNP_DIR / "EPILE_C_overlap_p0.1_annotated.csv",
    "MEMORY": SNP_DIR / "MEMORY_C_overlap_p0.1_annotated.csv",
}

# ALZ-related gene list (downloaded from GeneCards)
ALZ_GENE_PATH = SNP_DIR / "ALZ_GENE.csv"

# ===== 1. Load the ALZ-related gene list =====
alz_genes_df = pd.read_csv(ALZ_GENE_PATH)
if "Gene Symbol" not in alz_genes_df.columns:
    raise ValueError(f"{ALZ_GENE_PATH} 中找不到 'Gene Symbol' 列，实际列为: {alz_genes_df.columns.tolist()}")

# Missing symbols are dropped BEFORE the str cast: astype(str) would turn them
# into the literal text "nan", which dropna() no longer removes and which would
# then be counted as a gene symbol.
alz_gene_set = (
    alz_genes_df["Gene Symbol"]
    .dropna()
    .astype(str)
    .str.strip()
    .replace("", pd.NA)  # whitespace-only cells become missing and are dropped too
    .dropna()
    .unique()
)
alz_gene_set = set(alz_gene_set)

print(f"[INFO] 从 ALZ_GENE.csv 读入阿尔兹海默相关基因数: {len(alz_gene_set)}")

# 小工具：判断一行 Gene Name 是否包含 ALZ 相关基因
def has_alz_gene(gene_name: str) -> bool:
    """Return True if any gene listed in ``gene_name`` is in ``alz_gene_set``.

    ``gene_name`` may hold several symbols separated by ";" or ",". Each
    symbol is stripped of surrounding whitespace and compared exactly.
    Missing or blank input returns False.
    """
    if pd.isna(gene_name):
        return False
    symbols = (piece.strip() for piece in str(gene_name).replace(",", ";").split(";"))
    return any(symbol in alz_gene_set for symbol in symbols if symbol)

# ===== 2. Filter each of the four disease files =====
for disease, in_path in ANNOT_FILES.items():
    print(f"\n[INFO] 处理 {disease}: {in_path.name}")
    annotated = pd.read_csv(in_path)

    if "Gene Name" not in annotated.columns:
        raise ValueError(f"{in_path.name} 中找不到 'Gene Name' 列，实际列为: {annotated.columns.tolist()}")

    # Flag the rows whose Gene Name contains an ALZ-related gene
    is_alz_row = annotated["Gene Name"].apply(has_alz_gene)

    row_total = len(annotated)
    row_alz = is_alz_row.sum()

    print(f"  总行数: {row_total}")
    print(f"  含 ALZ 相关基因的行数: {row_alz}")
    print(f"  保留行数(去掉这些基因后): {row_total - row_alz}")

    # Output: the input file name with a _noALZgene suffix
    out_path = SNP_DIR / f"{in_path.stem}_noALZgene.csv"
    annotated[~is_alz_row].copy().to_csv(out_path, index=False)
    print(f"  已保存: {out_path}")


[INFO] 从 ALZ_GENE.csv 读入阿尔兹海默相关基因数: 17508

[INFO] 处理 ADHD: ADHD_C_overlap_p0.1_annotated.csv
  总行数: 48
  含 ALZ 相关基因的行数: 25
  保留行数(去掉这些基因后): 23
  已保存: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/ADHD_C_overlap_p0.1_annotated_noALZgene.csv

[INFO] 处理 ASD: ASD_C_overlap_p0.1_annotated.csv
  总行数: 42
  含 ALZ 相关基因的行数: 19
  保留行数(去掉这些基因后): 23
  已保存: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/ASD_C_overlap_p0.1_annotated_noALZgene.csv

[INFO] 处理 EPILE: EPILE_C_overlap_p0.1_annotated.csv
  总行数: 13
  含 ALZ 相关基因的行数: 5
  保留行数(去掉这些基因后): 8
  已保存: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/EPILE_C_overlap_p0.1_annotated_noALZgene.csv

[INFO] 处理 MEMORY: MEMORY_C_overlap_p0.1_annotated.csv
  总行数: 32
  含 ALZ 相关基因的行数: 14
  保留行数(去掉这些基因后): 18
  已保存: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/MEMORY_C_overlap_p0.1_annotated_noALZgene.csv


In [8]:
# -*- coding: utf-8 -*-
from pathlib import Path
import pandas as pd

BASE = Path("/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data")
SNP_DIR = BASE.joinpath("Related_SNP")

# 1) The four SNP files that already had the ALZ genes removed
ANNOT_NO_ALZ = {
    label: SNP_DIR / f"{label}_C_overlap_p0.1_annotated_noALZgene.csv"
    for label in ("ADHD", "ASD", "EPILE", "MEMORY")
}

# 2) Each disease's own GeneCards gene list
#    (the memory-performance list is stored under the "MP" prefix)
DISEASE_GENES = {
    label: SNP_DIR / f"{prefix}_GENE.csv"
    for label, prefix in (
        ("ADHD", "ADHD"),
        ("ASD", "ASD"),
        ("EPILE", "EPILE"),
        ("MEMORY", "MP"),
    )
}

def load_gene_set(path: Path) -> set:
    """Read the "Gene Symbol" column of a ``*_GENE.csv`` file into a set.

    Symbols are stripped of surrounding whitespace. Missing cells and cells
    that are empty after stripping are left out.

    Args:
        path: CSV file with a "Gene Symbol" column.

    Returns:
        The set of distinct gene symbols as strings.

    Raises:
        ValueError: If the file has no "Gene Symbol" column.
    """
    df = pd.read_csv(path)
    if "Gene Symbol" not in df.columns:
        raise ValueError(f"{path.name} 中找不到 'Gene Symbol' 列，实际列为: {df.columns.tolist()}")
    # Drop missing cells BEFORE casting: astype(str) would turn them into the
    # literal text "nan", which the later dropna() cannot remove and which
    # would end up in the set as a bogus gene symbol.
    genes = (
        df["Gene Symbol"]
        .dropna()
        .astype(str)
        .str.strip()
        .replace("", pd.NA)  # whitespace-only cells become missing
        .dropna()
        .unique()
    )
    return set(genes)

def gene_name_has_any(gene_name: str, gene_set: set) -> bool:
    """Return True if any gene listed in ``gene_name`` belongs to ``gene_set``.

    ``gene_name`` may hold several symbols separated by ";" or "," (for
    example "A;B;C"). Each symbol is stripped of surrounding whitespace and
    compared exactly. Missing or blank input returns False.
    """
    if pd.isna(gene_name):
        return False
    symbols = (piece.strip() for piece in str(gene_name).replace(",", ";").split(";"))
    return any(symbol in gene_set for symbol in symbols if symbol)

# Remove, from each ALZ-filtered file, the rows whose genes are already
# associated with that same disease.
for disease, annot_path in ANNOT_NO_ALZ.items():
    print(f"\n[INFO] 处理 {disease}: {annot_path.name}")

    gene_path = DISEASE_GENES.get(disease)
    if gene_path is None:
        print(f"[WARN] {disease} 没有对应的基因文件映射，跳过。")
        continue

    # Both inputs must exist; the annotation file is checked first
    absent = next((p for p in (annot_path, gene_path) if not p.exists()), None)
    if absent is not None:
        print(f"[WARN] {absent} 不存在，跳过。")
        continue

    # Gene symbols associated with this disease
    disease_genes = load_gene_set(gene_path)
    print(f"  {disease} 相关基因数: {len(disease_genes)}")

    # SNP annotation table that already had the ALZ genes removed
    annotated = pd.read_csv(annot_path)
    if "Gene Name" not in annotated.columns:
        raise ValueError(f"{annot_path.name} 中找不到 'Gene Name' 列，实际列为: {annotated.columns.tolist()}")

    # Flag the rows that mention a gene of this disease
    is_self_row = annotated["Gene Name"].apply(gene_name_has_any, args=(disease_genes,))

    row_total = len(annotated)
    row_self = is_self_row.sum()

    print(f"  原始行数: {row_total}")
    print(f"  含 {disease} 相关基因的行数: {row_self}")
    print(f"  保留行数（去掉这些后）: {row_total - row_self}")

    # Output name: the input file name plus _no{DISEASE}gene
    out_path = SNP_DIR / f"{annot_path.stem}_no{disease}gene.csv"
    annotated[~is_self_row].copy().to_csv(out_path, index=False)
    print(f"  已保存: {out_path}")



[INFO] 处理 ADHD: ADHD_C_overlap_p0.1_annotated_noALZgene.csv
  ADHD 相关基因数: 3051
  原始行数: 23
  含 ADHD 相关基因的行数: 0
  保留行数（去掉这些后）: 23
  已保存: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/ADHD_C_overlap_p0.1_annotated_noALZgene_noADHDgene.csv

[INFO] 处理 ASD: ASD_C_overlap_p0.1_annotated_noALZgene.csv
  ASD 相关基因数: 15128
  原始行数: 23
  含 ASD 相关基因的行数: 6
  保留行数（去掉这些后）: 17
  已保存: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/ASD_C_overlap_p0.1_annotated_noALZgene_noASDgene.csv

[INFO] 处理 EPILE: EPILE_C_overlap_p0.1_annotated_noALZgene.csv
  EPILE 相关基因数: 10045
  原始行数: 8
  含 EPILE 相关基因的行数: 3
  保留行数（去掉这些后）: 5
  已保存: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/EPILE_C_overlap_p0.1_annotated_noALZgene_noEPILEgene.csv

[INFO] 处理 MEMORY: MEMORY_C_overlap_p0.1_annotated_noALZgene.csv
  MEMORY 相关基因数: 16691
  原始行数: 18
  含 MEMORY 相关基因的行数: 3
  保留行数（去掉这些后）: 15
  已保存: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/MEMORY

Random SNP test for significance of EPILE

In [11]:
# -*- coding: utf-8 -*-
"""
在 Prepared_data4 的 C.npy 上随机抽 100 个 SNP，做 AD group1 vs group2 的 Welch t-test
重复 10 次：
  - 控制台输出每次有多少 SNP 的 p < 1e-6
  - 保存每次抽到的 100 个 SNP 及 t_stat/p_value 等到 10 个文件
输出目录：
  /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/randomTest
"""

from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

# ------------ Path settings ------------
ROOT = Path("/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data")
PREP = ROOT / "Prepared_data4"
SNP_DIR = ROOT / "Related_SNP"
OUT_DIR = SNP_DIR / "randomTest"

C_PATH = PREP / "C.npy"
IDS_PATH = PREP / "ids.txt"
NONEXON_MARKERS_PATH = PREP / "nonexonMarkers_after_clean.txt"
GROUPS_PATH = PREP / "figure3_groups.csv"   # AD-only grouping produced by the Fig3 script

# ------------ Parameters ------------
N_RANDOM_SNP = 100
N_REPEAT = 10
P_THRESH = 1e-6
BASE_SEED = 12345  # any integer works; it only fixes the random draws for reproducibility

# ------------ 0. Create the output directory ------------
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ------------ 1. Load C, ids, and the SNP column order ------------
print("[INFO] Loading C, ids, and SNP markers ...")
C = np.load(C_PATH)                           # shape: (n_subjects, n_C_snps)
ids = np.loadtxt(IDS_PATH, dtype=str)        # length: n_subjects
assert C.shape[0] == len(ids), "C 的行数与 ids 数量不一致。"

# SNP name of each column of C; one name per line of the file
nonexon_snps = pd.read_csv(NONEXON_MARKERS_PATH, header=None)[0].astype(str).to_numpy()
assert C.shape[1] == len(nonexon_snps), "C 的列数与 nonexonMarkers_after_clean.txt 长度不一致。"

# SNP name -> column index in C (not used further in this cell)
snp_to_col = {snp: idx for idx, snp in enumerate(nonexon_snps)}

# ------------ 2. Load the AD grouping (figure3_groups.csv) ------------
# NOTE(review): if these clusters were themselves derived from C, the group
# comparisons below are not independent of the grouping — confirm how
# figure3_groups.csv was produced before interpreting the p-value counts.
print("[INFO] Loading AD groups from figure3_groups.csv ...")
grp_df = pd.read_csv(GROUPS_PATH)

if "id" not in grp_df.columns or "cluster" not in grp_df.columns:
    raise ValueError("figure3_groups.csv 应包含 'id' 和 'cluster' 两列。")

grp_df = grp_df[["id", "cluster"]].copy()
grp_df["id"] = grp_df["id"].astype(str)

# Map the two smallest cluster labels to groups 1 and 2
# NOTE(review): any further cluster label is absent from cluster_map and becomes NaN,
# which the to_numpy(int) below would presumably reject — confirm there are only two.
uniq_clusters = sorted(pd.unique(grp_df["cluster"].astype(int)))
if len(uniq_clusters) < 2:
    raise ValueError("分组文件中只有一个簇，无法做两组比较。")
cluster_map = {uniq_clusters[0]: 1, uniq_clusters[1]: 2}
grp_df["group"] = grp_df["cluster"].astype(int).map(cluster_map)

# Keep only the grouped subjects whose id also appears in ids
id_to_idx = {sid: i for i, sid in enumerate(ids)}
grp_df = grp_df[grp_df["id"].isin(id_to_idx.keys())].copy()

# Row indices into C and the matching group labels, in grp_df order
ad_indices = np.array([id_to_idx[sid] for sid in grp_df["id"]], dtype=int)
groups = grp_df["group"].to_numpy(int)
assert ad_indices.shape[0] == groups.shape[0], "AD index 与 group 长度不一致。"

C_ad = C[ad_indices, :]  # shape: (n_AD, n_C_snps)

print(f"[INFO] AD 病人数: {C_ad.shape[0]}, C 中 SNP 数: {C_ad.shape[1]}")
print(f"[INFO] Output directory: {OUT_DIR}")

# ------------ 3. Random sampling + t-test, repeated N_REPEAT times ------------
n_total_snps = C_ad.shape[1]
if N_RANDOM_SNP > n_total_snps:
    raise ValueError(f"N_RANDOM_SNP={N_RANDOM_SNP} 大于 C 的 SNP 总数={n_total_snps}。")

for rep in range(1, N_REPEAT + 1):
    # A separate generator per repeat, seeded from BASE_SEED and the repeat number
    rng = np.random.default_rng(BASE_SEED + rep)

    # Draw N_RANDOM_SNP column indices without replacement, then sort them
    sampled_cols = rng.choice(n_total_snps, size=N_RANDOM_SNP, replace=False)
    sampled_cols = np.sort(sampled_cols)

    results = []
    for col_idx in sampled_cols:
        snp = str(nonexon_snps[col_idx])
        x = C_ad[:, col_idx].astype(float)

        g1 = x[groups == 1]
        g2 = x[groups == 2]

        # Welch t-test with NaN values omitted; needs at least two samples per group
        if len(g1) < 2 or len(g2) < 2:
            t_stat, p_val = np.nan, np.nan
        else:
            t_stat, p_val = ttest_ind(g1, g2, equal_var=False, nan_policy="omit")

        # n_group1/n_group2 are the group sizes including any NaN entries;
        # the means ignore NaN via nanmean
        results.append({
            "SNP": snp,
            "C_col_index": int(col_idx),
            "n_group1": int(np.sum(groups == 1)),
            "n_group2": int(np.sum(groups == 2)),
            "mean_group1": float(np.nanmean(g1)) if len(g1) > 0 else np.nan,
            "mean_group2": float(np.nanmean(g2)) if len(g2) > 0 else np.nan,
            "t_stat": float(t_stat) if t_stat is not None else np.nan,
            "p_value": float(p_val) if p_val is not None else np.nan,
        })

    res_df = pd.DataFrame(results)

    # Count the SNPs with p < P_THRESH (NaN p-values are excluded)
    sig_count = int(np.sum((res_df["p_value"].to_numpy(float) < P_THRESH) & (~res_df["p_value"].isna())))
    print(f"[RESULT] Repeat {rep:02d}/{N_REPEAT}: among {N_RANDOM_SNP} random SNPs, "
          f"count(p < {P_THRESH:.0e}) = {sig_count}")

    # Save this repeat's per-SNP results
    out_path = OUT_DIR / f"random_C_100snps_ttest_rep{rep:02d}.tsv"
    res_df.to_csv(out_path, sep="\t", index=False)


[INFO] Loading C, ids, and SNP markers ...
[INFO] Loading AD groups from figure3_groups.csv ...
[INFO] AD 病人数: 251, C 中 SNP 数: 71278
[INFO] Output directory: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/randomTest
[RESULT] Repeat 01/10: among 100 random SNPs, count(p < 1e-06) = 53
[RESULT] Repeat 02/10: among 100 random SNPs, count(p < 1e-06) = 51


  res = hypotest_fun_out(*samples, **kwds)


[RESULT] Repeat 03/10: among 100 random SNPs, count(p < 1e-06) = 45
[RESULT] Repeat 04/10: among 100 random SNPs, count(p < 1e-06) = 48
[RESULT] Repeat 05/10: among 100 random SNPs, count(p < 1e-06) = 44
[RESULT] Repeat 06/10: among 100 random SNPs, count(p < 1e-06) = 50
[RESULT] Repeat 07/10: among 100 random SNPs, count(p < 1e-06) = 47
[RESULT] Repeat 08/10: among 100 random SNPs, count(p < 1e-06) = 49
[RESULT] Repeat 09/10: among 100 random SNPs, count(p < 1e-06) = 54
[RESULT] Repeat 10/10: among 100 random SNPs, count(p < 1e-06) = 55


In [12]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
epile_enrichment_prep_10kb.py

Foreground: EPILE SNPs with p <= 1e-6 from EPILE_C_overlap_ttest_results.tsv
Background: all non-exonic SNPs (from nonExonMarkers_after_clean.txt) intersected with BIM
Mapping: SNP -> genes whose gene body overlaps SNP position within ±10kb (gene_start-10kb <= pos <= gene_end+10kb)

Outputs (to out_dir):
- EPILE_fg_snps_p1e-6.tsv
- EPILE_fg_snp2gene_10kb.tsv
- EPILE_fg_genes_10kb.txt
- C_background_genes_10kb.txt
- mapping_summary_10kb.txt
"""

import os
import re
import sys
from bisect import bisect_left, bisect_right
from collections import defaultdict
import pandas as pd

# ----------------------------
# User-specified paths
# ----------------------------
# Gene annotation (GTF); only its 'gene' records are used for SNP -> gene mapping.
GTF_PATH = "/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/gencode.v19.annotation.gtf"
# PLINK BIM file; supplies the chromosome and position of every SNP.
BIM_PATH = "/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/ADNI_GO2_GWAS_PLINK2/ADNI_GO2_GWAS_2nd_orig_BIN.bim"
# Tab-separated t-test table; must contain an rsID column and a p-value column.
EPILE_TTEST_PATH = "/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/EPILE_C_overlap_ttest_results.tsv"
# Text file with one non-exonic marker ID per line (defines the background SNP pool).
NONEXON_PATH = "/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Prepared_data4/nonExonMarkers_after_clean.txt"
# Directory that receives every output file (created if missing).
OUT_DIR = "/Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP"

# Padding, in base pairs, added on each side of a gene body before testing SNP overlap.
WINDOW_BP = 10_000
# SNPs with p-value <= P_THRESH form the foreground set.
P_THRESH = 1e-6

# ----------------------------
# Helpers
# ----------------------------
def die(msg: str, code: int = 1):
    """
    Report a fatal error on stderr and terminate the program.

    msg:  text shown after the '[ERROR] ' tag.
    code: process exit status carried by the raised SystemExit.
    """
    sys.stderr.write(f"[ERROR] {msg}\n")
    raise SystemExit(code)

def ensure_exists(path: str, label: str):
    """
    Abort via die() unless `path` exists on disk.

    path:  file or directory to check.
    label: human-readable name used in the error message.
    """
    if os.path.exists(path):
        return
    die(f"{label} not found: {path}")

def normalize_chr(ch):
    """
    Convert a chromosome label to a consistent string without the 'chr' prefix.

    Only a *leading* 'chr' is removed, and it is matched case-insensitively,
    so 'Chr1' and 'CHR1' normalize like 'chr1' while a 'chr' that appears
    later in a contig name is left untouched.

    Examples:
      'chr1' -> '1'
      '1'    -> '1'
      'X'    -> 'X'
      'chrX' -> 'X'
      'CHR2' -> '2'
      'MT'/'M'/'chrM' -> 'MT' (normalize)

    ch: chromosome label of any type; it is converted with str() first.
    Returns the normalized label as a string.
    """
    s = str(ch).strip()
    # Strip the prefix only; str.replace would also delete an embedded "chr".
    if s[:3].lower() == "chr":
        s = s[3:]
    # NOTE(review): PLINK .bim files may encode X/Y/XY/MT as 23/24/25/26;
    # those numeric codes are not translated here -- confirm against the BIM in use.
    if s in {"M", "MT"}:
        return "MT"
    return s

def parse_gtf_genes(gtf_path: str) -> pd.DataFrame:
    """
    Read a GTF file and collect its 'gene' records.

    Comment lines, rows with fewer than nine tab-separated fields, rows with
    non-integer coordinates and rows lacking a gene_name attribute are skipped.
    Aborts through die() when nothing could be parsed.

    Returns a de-duplicated DataFrame with columns: chr, start, end, gene.
    """
    gene_name_re = re.compile(r'gene_name "([^"]+)"')
    records = []

    # GTF fields: seqname, source, feature, start, end, score, strand, frame, attribute
    with open(gtf_path, "r", encoding="utf-8", errors="ignore") as handle:
        for raw in handle:
            if raw.startswith("#"):
                continue
            fields = raw.rstrip("\n").split("\t")
            if len(fields) < 9 or fields[2] != "gene":
                continue
            try:
                start, end = int(fields[3]), int(fields[4])
            except ValueError:
                continue
            match = gene_name_re.search(fields[8])
            if match is None:
                continue
            records.append((normalize_chr(fields[0]), start, end, match.group(1)))

    if not records:
        die("No genes parsed from GTF. Please confirm the GTF format and content.")

    gene_table = pd.DataFrame(records, columns=["chr", "start", "end", "gene"])
    return gene_table.drop_duplicates()

def load_bim(bim_path: str) -> pd.DataFrame:
    """
    Read a whitespace-delimited, header-less PLINK BIM file.

    The six columns are named chr, rsid, cm, pos, a1, a2, and the chr column
    is passed through normalize_chr().
    """
    # Insertion order of this mapping doubles as the column order of the file.
    column_types = {
        "chr": str,
        "rsid": str,
        "cm": float,
        "pos": int,
        "a1": str,
        "a2": str,
    }
    table = pd.read_csv(
        bim_path,
        sep=r"\s+",
        header=None,
        names=list(column_types),
        dtype=column_types,
    )
    table["chr"] = table["chr"].apply(normalize_chr)
    return table

def load_nonexon_snps(nonexon_path: str) -> set:
    """
    Read marker IDs, one per line, into a set.

    Each line is stripped of surrounding whitespace and blank lines are
    ignored. Aborts through die() when the file yields no IDs.
    """
    with open(nonexon_path, "r", encoding="utf-8", errors="ignore") as handle:
        markers = {token for token in map(str.strip, handle) if token}
    if not markers:
        die("nonExonMarkers_after_clean.txt appears empty.")
    return markers

def infer_columns_epile(df: pd.DataFrame) -> tuple[str, str]:
    """
    Guess which columns of the EPILE t-test table hold the rsID and the p-value.

    Column names are compared case-insensitively against two candidate lists;
    the earliest candidate present in the table wins. Aborts through die()
    when either column cannot be identified.

    Returns (rsid_column, pvalue_column) using the table's original spelling.
    """
    by_lower = {}
    for original in df.columns:
        by_lower[original.lower()] = original

    def first_present(candidates):
        # Candidate order, not table order, decides the match.
        return next((by_lower[key] for key in candidates if key in by_lower), None)

    rs_col = first_present(("rsid", "snp", "snps", "marker", "id"))
    p_col = first_present(("pvalue", "p_value", "p", "pval", "p-val", "p.value"))

    if rs_col is not None and p_col is not None:
        return rs_col, p_col

    die(
        "Cannot infer rsid/pvalue columns from EPILE_C_overlap_ttest_results.tsv. "
        f"Found columns: {list(df.columns)}. "
        "Please rename columns to include 'rsid' and 'pvalue' (recommended) or edit infer_columns_epile()."
    )
    return rs_col, p_col

def build_gene_index(genes_df: pd.DataFrame, window_bp: int):
    """
    Create a per-chromosome lookup of gene intervals padded by `window_bp`.

    For every chromosome the genes are widened by `window_bp` on both sides
    (the padded start is floored at 1) and ordered by padded start.

    Returns a dict: chr -> (intervals, starts), where `intervals` is a list of
    (start_w, end_w, gene) tuples and `starts` is the parallel list of start_w
    values used for binary search.
    """
    index = {}
    for chrom, chrom_genes in genes_df.groupby("chr"):
        padded = chrom_genes.assign(
            start_w=(chrom_genes["start"] - window_bp).clip(lower=1),
            end_w=chrom_genes["end"] + window_bp,
        ).sort_values("start_w")

        starts = padded["start_w"].tolist()
        intervals = list(zip(starts, padded["end_w"].tolist(), padded["gene"].tolist()))
        index[chrom] = (intervals, list(starts))
    return index

def map_snp_to_genes(chrom: str, pos: int, gene_index) -> list[str]:
    """
    Return the genes whose windowed interval covers (chrom, pos).

    chrom:      chromosome label, already normalized like the index keys.
    pos:        SNP position.
    gene_index: dict chr -> (intervals, starts); `intervals` is a list of
                (start_w, end_w, gene) tuples sorted by start_w and `starts`
                is the parallel list of start_w values.

    Returns gene names in ascending order of interval start; an empty list
    when the chromosome is unknown or no interval covers the position.
    """
    if chrom not in gene_index:
        return []
    intervals, starts = gene_index[chrom]

    # Every interval that can contain pos has start <= pos; binary search
    # gives how many such candidates there are.
    n_candidates = bisect_right(starts, pos)

    # The intervals are ordered by start only, so their ends are not monotonic:
    # a long gene that starts early may still cover pos even when a short gene
    # listed after it has already ended. All candidates must therefore be
    # checked; stopping at the first interval with end < pos would drop
    # enclosing (nested-gene) hits.
    return [gene for _, end_w, gene in intervals[:n_candidates] if end_w >= pos]

# ----------------------------
# Main
# ----------------------------
def main():
    """
    Prepare foreground and background gene lists for the epilepsy enrichment test.

    Steps:
      1. Check the inputs exist and create OUT_DIR.
      2. Parse GTF genes and build the windowed gene index.
      3. Restrict the BIM to the non-exonic marker list (background SNP pool).
      4. Select foreground SNPs (p <= P_THRESH) from the EPILE t-test table.
      5. Map foreground and background SNPs to genes and write all outputs
         plus a plain-text summary to OUT_DIR.

    Any unrecoverable condition (missing file, empty intersection, no
    significant SNP) ends the run through die().
    """
    ensure_exists(GTF_PATH, "GTF")
    ensure_exists(BIM_PATH, "BIM")
    ensure_exists(EPILE_TTEST_PATH, "EPILE ttest results")
    ensure_exists(NONEXON_PATH, "nonExonMarkers_after_clean")
    os.makedirs(OUT_DIR, exist_ok=True)

    print("[INFO] Loading GTF genes...")
    genes_df = parse_gtf_genes(GTF_PATH)
    print(f"[INFO] Parsed genes: {len(genes_df):,}")

    print("[INFO] Building gene index (±10kb)...")
    gene_index = build_gene_index(genes_df, WINDOW_BP)

    print("[INFO] Loading BIM...")
    bim = load_bim(BIM_PATH)
    print(f"[INFO] BIM SNPs: {len(bim):,}")

    print("[INFO] Loading non-exonic SNP list...")
    nonexon_snps = load_nonexon_snps(NONEXON_PATH)
    print(f"[INFO] non-exonic markers: {len(nonexon_snps):,}")

    print("[INFO] Restricting BIM to non-exonic SNPs (background SNP pool)...")
    bim_bg = bim[bim["rsid"].isin(nonexon_snps)].copy()
    print(f"[INFO] Background SNPs in BIM ∩ non-exonic: {len(bim_bg):,}")

    if bim_bg.empty:
        die("No overlap between BIM rsIDs and nonExonMarkers_after_clean.txt. Please verify matching rsID formats.")

    print("[INFO] Loading EPILE t-test results...")
    epile = pd.read_csv(EPILE_TTEST_PATH, sep="\t")
    rs_col, p_col = infer_columns_epile(epile)

    # Coerce p-values to numbers; unparsable entries become NaN and are dropped.
    epile[p_col] = pd.to_numeric(epile[p_col], errors="coerce")
    epile = epile.dropna(subset=[p_col, rs_col]).copy()

    epile_sig = epile[epile[p_col] <= P_THRESH].copy()
    if epile_sig.empty:
        die(f"No SNPs found with {p_col} <= {P_THRESH}. Please check threshold or column names.")
    epile_sig = epile_sig.sort_values(p_col)

    fg_out_path = os.path.join(OUT_DIR, "EPILE_fg_snps_p1e-6.tsv")
    epile_sig.to_csv(fg_out_path, sep="\t", index=False)
    print(f"[OK] Saved foreground SNP table: {fg_out_path} (n={len(epile_sig)})")

    fg_snps = set(epile_sig[rs_col].astype(str).tolist())

    print("[INFO] Extracting foreground SNP coordinates from BIM (restricted to non-exonic pool)...")
    fg_bim = bim_bg[bim_bg["rsid"].isin(fg_snps)].copy()

    missing_fg = sorted(fg_snps - set(fg_bim["rsid"]))
    if missing_fg:
        print("[WARN] Some foreground SNPs not found in BIM ∩ non-exonic list. They will be excluded.")
        print("[WARN] Missing rsIDs (first 20):", missing_fg[:20])

    if fg_bim.empty:
        die("No foreground SNPs found in BIM after restricting to non-exonic list. Check rsID matching or whether foreground SNPs are indeed non-exonic.")

    # Map foreground SNPs to genes: one output row per (SNP, gene) pair.
    print("[INFO] Mapping foreground SNPs to genes (±10kb)...")
    fg_rows = []
    for rsid, chrom, pos in zip(
        fg_bim["rsid"].tolist(), fg_bim["chr"].tolist(), fg_bim["pos"].tolist()
    ):
        genes = map_snp_to_genes(chrom, pos, gene_index)
        if genes:
            fg_rows.extend((rsid, chrom, pos, g) for g in genes)
        else:
            # Keep unmapped SNPs visible in the table with an empty gene field.
            fg_rows.append((rsid, chrom, pos, ""))

    fg_map_df = pd.DataFrame(fg_rows, columns=["rsid", "chr", "pos", "gene"])
    fg_map_path = os.path.join(OUT_DIR, "EPILE_fg_snp2gene_10kb.tsv")
    fg_map_df.to_csv(fg_map_path, sep="\t", index=False)
    print(f"[OK] Saved foreground SNP->gene mapping: {fg_map_path}")

    fg_genes = sorted({g for g in fg_map_df["gene"].tolist() if isinstance(g, str) and g.strip() != ""})
    fg_genes_path = os.path.join(OUT_DIR, "EPILE_fg_genes_10kb.txt")
    # Explicit UTF-8 so the output does not depend on the platform locale.
    with open(fg_genes_path, "w", encoding="utf-8") as f:
        f.writelines(g + "\n" for g in fg_genes)
    print(f"[OK] Saved foreground gene list: {fg_genes_path} (n={len(fg_genes)})")

    # Map background SNPs to genes to build the background gene universe.
    # Plain column iteration avoids building a Series per row as iterrows() does.
    print("[INFO] Mapping background SNPs to genes to construct background gene universe (may take some time)...")
    bg_genes_set = set()
    mapped_bg_snps = 0
    for chrom, pos in zip(bim_bg["chr"].tolist(), bim_bg["pos"].tolist()):
        genes = map_snp_to_genes(chrom, pos, gene_index)
        if genes:
            bg_genes_set.update(genes)
            mapped_bg_snps += 1

    bg_genes = sorted(bg_genes_set)
    bg_genes_path = os.path.join(OUT_DIR, "C_background_genes_10kb.txt")
    with open(bg_genes_path, "w", encoding="utf-8") as f:
        f.writelines(g + "\n" for g in bg_genes)
    print(f"[OK] Saved background gene universe: {bg_genes_path} (n={len(bg_genes)})")

    # Summary
    summary_path = os.path.join(OUT_DIR, "mapping_summary_10kb.txt")
    n_fg_snps_total = len(fg_snps)
    n_fg_snps_used = fg_bim["rsid"].nunique()
    n_fg_snps_mapped = fg_map_df[fg_map_df["gene"].astype(str).str.len() > 0]["rsid"].nunique()

    # The summary contains non-ASCII characters ("±", "∩"); with the default
    # locale encoding this write can raise UnicodeEncodeError, so pin UTF-8.
    with open(summary_path, "w", encoding="utf-8") as f:
        f.write("=== Epilepsy enrichment prep summary (±10kb mapping) ===\n")
        f.write(f"GTF: {GTF_PATH}\n")
        f.write(f"BIM: {BIM_PATH}\n")
        f.write(f"non-exonic SNP list: {NONEXON_PATH}\n")
        f.write(f"EPILE t-test table: {EPILE_TTEST_PATH}\n")
        f.write("\n")
        f.write(f"P-threshold (foreground): {P_THRESH}\n")
        f.write(f"Foreground SNPs in t-test table (p<=thr): {len(epile_sig)}\n")
        f.write(f"Foreground SNPs requested (unique): {n_fg_snps_total}\n")
        f.write(f"Foreground SNPs found in BIM ∩ non-exonic: {n_fg_snps_used}\n")
        f.write(f"Foreground SNPs mapped to >=1 gene: {n_fg_snps_mapped}\n")
        f.write(f"Foreground genes (unique): {len(fg_genes)}\n")
        f.write("\n")
        f.write(f"Background SNPs (BIM ∩ non-exonic): {len(bim_bg)}\n")
        f.write(f"Background SNPs mapped to >=1 gene: {mapped_bg_snps}\n")
        f.write(f"Background genes (unique): {len(bg_genes)}\n")

    print(f"[OK] Saved summary: {summary_path}")
    print("[DONE] All outputs written to:", OUT_DIR)

# Run the pipeline only when this module is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


[INFO] Loading GTF genes...
[INFO] Parsed genes: 57,819
[INFO] Building gene index (±10kb)...
[INFO] Loading BIM...
[INFO] BIM SNPs: 716,503
[INFO] Loading non-exonic SNP list...
[INFO] non-exonic markers: 71,278
[INFO] Restricting BIM to non-exonic SNPs (background SNP pool)...
[INFO] Background SNPs in BIM ∩ non-exonic: 71,278
[INFO] Loading EPILE t-test results...
[OK] Saved foreground SNP table: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/EPILE_fg_snps_p1e-6.tsv (n=9)
[INFO] Extracting foreground SNP coordinates from BIM (restricted to non-exonic pool)...
[INFO] Mapping foreground SNPs to genes (±10kb)...
[OK] Saved foreground SNP->gene mapping: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/EPILE_fg_snp2gene_10kb.tsv
[OK] Saved foreground gene list: /Users/zhangjiahui/Desktop/Haohan Research/Alz GWAS data/Related_SNP/EPILE_fg_genes_10kb.txt (n=10)
[INFO] Mapping background SNPs to genes to construct background gene universe (may take 