In [1]:
import pandas as pd
import os

# Relative paths inside the cloud workspace (adjust to the current directory layout).
output_dir = "./5.3的甲基化分析"
dma_file = "./5.3的甲基化分析/PRAD_DMA_Significant_Results.csv"
probe_map_file = "./5.3的甲基化分析/Probe_Map.csv"
promoter_file = "./5.3的甲基化分析/Promoter_Region.csv"
deg_file = "./5.3的甲基化分析/PRAD_DEG_Significant_Results.csv"

# Make sure the output folder exists before anything is read or written;
# exist_ok=True turns this into a no-op when the folder is already there.
os.makedirs(output_dir, exist_ok=True)

# ======= Load the four input tables (same order as the path variables) ========
dma_df, probe_df, promoter_df, deg_df = (
    pd.read_csv(csv_path)
    for csv_path in (dma_file, probe_map_file, promoter_file, deg_file)
)

# ======= Attach probe coordinates to the DMA results ========
# Left join: every DMA row is kept, whether or not its Probe_ID appears in the probe map.
dma_annotated = pd.merge(dma_df, probe_df, how="left", on="Probe_ID")

# ======= Find the promoter regions that fully contain each probe ========
# Column layout of the hit table. It is declared explicitly so that the frame
# (and the CSV written from it) keeps its header even when no probe matches;
# a column-less frame would make the later merge on "HGNC_Symbol" fail.
hit_columns = ["Probe_ID", "HGNC_Symbol", "Mean_Diff", "adj_p_value"]
matched = []

for _, row in dma_annotated.iterrows():
    # A promoter is a hit when it is on the probe's chromosome and its
    # [Promoter_Start, Promoter_End] interval encloses the whole probe interval.
    hits = promoter_df[
        (promoter_df['Promoter_Chr'] == row['Probe_Chr']) &
        (promoter_df['Promoter_Start'] <= row['Probe_Chrom_Start']) &
        (promoter_df['Promoter_End'] >= row['Probe_Chrom_End'])
    ]
    # One record per (probe, promoter) pair. Only the gene symbol is taken from
    # the promoter side, so iterate that column instead of whole rows.
    for symbol in hits["HGNC_Symbol"]:
        matched.append({
            "Probe_ID": row["Probe_ID"],
            "HGNC_Symbol": symbol,
            "Mean_Diff": row["Mean_Diff"],
            "adj_p_value": row["adj_p_value"]
        })

# Convert the collected records to a DataFrame with a fixed column order.
promoter_hits_df = pd.DataFrame(matched, columns=hit_columns)

# ======= Save every probe that falls inside a promoter ========
promoter_hits_path = os.path.join(output_dir, "PRAD_Methylation_Probes_in_Promoters.csv")
promoter_hits_df.to_csv(promoter_hits_path, index=False)
print(f"✅ 所有启动子区域的 probe 保存至：{promoter_hits_path}")

# ======= Join the DEG expression change onto the promoter hits ========
# Left join: hits whose gene has no DEG row get NaN in Log2_Fold_Change.
deg_filtered = deg_df.loc[:, ["HGNC_Symbol", "Log2_Fold_Change"]]
merged = pd.merge(promoter_hits_df, deg_filtered, how="left", on="HGNC_Symbol")

# ======= Promoter methylation up + expression down ========
hyper_methylated = merged["Mean_Diff"] > 0.15
expression_down = merged["Log2_Fold_Change"] < -0.5
# dropna() then removes any remaining row that has a missing value in any column.
up_meth_down_expr = merged.loc[hyper_methylated & expression_down].dropna()

up_meth_down_path = os.path.join(output_dir, "PRAD_Promoters_Methylation_Up_Expression_Down.csv")
up_meth_down_expr.to_csv(up_meth_down_path, index=False)
print(f"✅ 启动子甲基化↑ + 表达↓ 的基因保存至：{up_meth_down_path}")

# ======= Promoter methylation down + expression up ========
hypo_methylated = merged["Mean_Diff"] < -0.15
expression_up = merged["Log2_Fold_Change"] > 0.5
down_meth_up_expr = merged.loc[hypo_methylated & expression_up].dropna()

down_meth_up_path = os.path.join(output_dir, "PRAD_Promoters_Methylation_Down_Expression_Up.csv")
down_meth_up_expr.to_csv(down_meth_up_path, index=False)
print(f"✅ 启动子甲基化↓ + 表达↑ 的基因保存至：{down_meth_up_path}")


✅ 所有启动子区域的 probe 保存至：./5.3的甲基化分析/PRAD_Methylation_Probes_in_Promoters.csv
✅ 启动子甲基化↑ + 表达↓ 的基因保存至：./5.3的甲基化分析/PRAD_Promoters_Methylation_Up_Expression_Down.csv
✅ 启动子甲基化↓ + 表达↑ 的基因保存至：./5.3的甲基化分析/PRAD_Promoters_Methylation_Down_Expression_Up.csv


In [2]:
import pandas as pd
import os

# ======= Path configuration (Azure cloud workspace) =======
base_dir = "./5.3的甲基化分析"
output_dir = base_dir  # results are written into the same folder as the inputs

dma_file = os.path.join(base_dir, "PRAD_DMA_Significant_Results.csv")
probe_map_file = os.path.join(base_dir, "Probe_Map.csv")
promoter_file = os.path.join(base_dir, "Promoter_Region.csv")
deg_file = os.path.join(base_dir, "PRAD_DEG_Significant_Results.csv")
gene_region_file = os.path.join(base_dir, "Gene_Region.csv")

# exist_ok=True: nothing happens when the folder is already present.
os.makedirs(output_dir, exist_ok=True)

# ======= Load the input tables (same order as the path variables) =======
dma_df, probe_df, promoter_df, deg_df, gene_region_df = (
    pd.read_csv(csv_path)
    for csv_path in (dma_file, probe_map_file, promoter_file, deg_file, gene_region_file)
)

# Attach probe coordinates; the left join keeps every DMA row.
dma_annotated = pd.merge(dma_df, probe_df, how="left", on="Probe_ID")

# Collect the IDs of probes that lie entirely inside some promoter, then drop them.
promoter_probes = set()
for _, promoter in promoter_df.iterrows():
    on_same_chr = dma_annotated['Probe_Chr'] == promoter['Promoter_Chr']
    starts_inside = dma_annotated['Probe_Chrom_Start'] >= promoter['Promoter_Start']
    ends_inside = dma_annotated['Probe_Chrom_End'] <= promoter['Promoter_End']
    enclosed_ids = dma_annotated.loc[on_same_chr & starts_inside & ends_inside, 'Probe_ID']
    promoter_probes |= set(enclosed_ids.tolist())

# Keep only the probes that were not found inside any promoter.
in_promoter = dma_annotated['Probe_ID'].isin(promoter_probes)
non_promoter_df = dma_annotated[~in_promoter]

# Match the remaining (non-promoter) probes against gene regions.
# Column layout of the hit table. It is declared explicitly so that the frame
# (and any CSV written from it) keeps its header even when nothing matches;
# a column-less frame would make the later merge on "HGNC_Symbol" fail.
hit_columns = ["Probe_ID", "HGNC_Symbol", "Mean_Diff", "adj_p_value"]
matched = []
for _, row in non_promoter_df.iterrows():
    # A gene region is a hit when it is on the probe's chromosome and its
    # [Gene_Start, Gene_End] interval encloses the whole probe interval.
    hits = gene_region_df[
        (gene_region_df['Gene_Chr'] == row['Probe_Chr']) &
        (gene_region_df['Gene_Start'] <= row['Probe_Chrom_Start']) &
        (gene_region_df['Gene_End'] >= row['Probe_Chrom_End'])
    ]
    # One record per (probe, gene) pair. Only the gene symbol is taken from the
    # gene side, so iterate that column instead of whole rows.
    for symbol in hits["HGNC_Symbol"]:
        matched.append({
            "Probe_ID": row["Probe_ID"],
            "HGNC_Symbol": symbol,
            "Mean_Diff": row["Mean_Diff"],
            "adj_p_value": row["adj_p_value"]
        })

gene_body_hits_df = pd.DataFrame(matched, columns=hit_columns)

# Join the DEG expression change onto the gene-body hits.
# Left join: hits whose gene has no DEG row get NaN in Log2_Fold_Change.
deg_filtered = deg_df.loc[:, ["HGNC_Symbol", "Log2_Fold_Change"]]
merged = pd.merge(gene_body_hits_df, deg_filtered, how="left", on="HGNC_Symbol")

# Outside promoters: methylation down + expression up
hypo_methylated = merged["Mean_Diff"] < -0.15
expression_up = merged["Log2_Fold_Change"] > 0.5
hypo_up = merged.loc[hypo_methylated & expression_up].dropna()

# Outside promoters: methylation up + expression down
hyper_methylated = merged["Mean_Diff"] > 0.15
expression_down = merged["Log2_Fold_Change"] < -0.5
hyper_down = merged.loc[hyper_methylated & expression_down].dropna()

# Write the three result tables into the output folder.
for frame, csv_name in (
    (hypo_up, "PRAD_GeneBody_Methylation_Down_Expression_Up.csv"),
    (hyper_down, "PRAD_GeneBody_Methylation_Up_Expression_Down.csv"),
    (gene_body_hits_df, "PRAD_Methylation_Probes_in_GeneBody.csv"),
):
    frame.to_csv(f"{output_dir}/{csv_name}", index=False)

# Report what was written, one line per message.
print(
    "✅ 分析完成，结果已保存：",
    " - 启动子外甲基化下降 + 表达上升 → PRAD_GeneBody_Methylation_Down_Expression_Up.csv",
    " - 启动子外甲基化上升 + 表达下调 → PRAD_GeneBody_Methylation_Up_Expression_Down.csv",
    " - 所有命中基因体区域的 probe → PRAD_Methylation_Probes_in_GeneBody.csv",
    sep="\n",
)


✅ 分析完成，结果已保存：
 - 启动子外甲基化下降 + 表达上升 → PRAD_GeneBody_Methylation_Down_Expression_Up.csv
 - 启动子外甲基化上升 + 表达下调 → PRAD_GeneBody_Methylation_Up_Expression_Down.csv
 - 所有命中基因体区域的 probe → PRAD_Methylation_Probes_in_GeneBody.csv


In [3]:
"""
Author: Weilin He
Description: Reads and analyzes CSV files related to methylation and expression data in Azure environment.
"""

import pandas as pd
import numpy as np
import os

# Folder in the Azure cloud workspace that holds the result files
base_dir = "./5.3的甲基化分析"

# PRAD result files to summarise (written by the cells above into base_dir)
files = [
    "PRAD_GeneBody_Methylation_Down_Expression_Up.csv",
    "PRAD_GeneBody_Methylation_Up_Expression_Down.csv",
    "PRAD_Methylation_Probes_in_GeneBody.csv",
    "PRAD_Methylation_Probes_in_Promoters.csv",
    "PRAD_Promoters_Methylation_Down_Expression_Up.csv",
    "PRAD_Promoters_Methylation_Up_Expression_Down.csv",
]

# Per-file analysis results, keyed by file name; filled in further down.
results = dict()

# CSV analysis helpers
def _mean_diff_summary(mean_diff):
    """Return min / max / mean of a NaN-free Mean_Diff series as built-in floats."""
    # float() strips the NumPy scalar wrapper so the values print the same way
    # regardless of the NumPy version and can be serialised (e.g. to JSON).
    return {
        'min': float(mean_diff.min()),
        'max': float(mean_diff.max()),
        'avg': float(mean_diff.mean())
    }


def analyze_csv(file_path):
    """Read one result CSV and return a dictionary describing it.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file. Which extra statistics are computed is
        decided by substring checks against this path.

    Returns
    -------
    dict
        Always contains 'row_count', 'column_names' and 'sample_data' (the
        first three rows as records). Depending on the path it also contains:

        * 'mean_diff_stats' with min / max / avg of Mean_Diff for paths
          containing 'Methylation_Probes_in_GeneBody.csv';
        * 'mean_diff_stats' with min / max / avg plus counts and percentages
          of positive and negative Mean_Diff values for paths containing
          'Methylation_Probes_in_Promoters.csv';
        * 'expression_stats' with Log2_Fold_Change min / max / avg and the
          average Mean_Diff for paths containing
          'Promoters_Methylation_Down_Expression_Up.csv'.

        All statistics are built-in int / float / str values.

    Raises
    ------
    Whatever pandas.read_csv raises for an unreadable file, and KeyError when
    a column required by the selected statistics is missing.
    """
    df = pd.read_csv(file_path)
    # Substring checks need text; str() also lets path-like objects through.
    path_text = str(file_path)

    analysis = {
        'row_count': len(df),
        'column_names': df.columns.tolist(),
        'sample_data': df.head(3).to_dict(orient='records')
    }

    if 'Methylation_Probes_in_GeneBody.csv' in path_text:
        analysis['mean_diff_stats'] = _mean_diff_summary(df['Mean_Diff'].dropna())

    if 'Methylation_Probes_in_Promoters.csv' in path_text:
        mean_diff = df['Mean_Diff'].dropna()
        total = len(mean_diff)
        positive = int((mean_diff > 0).sum())
        negative = int((mean_diff < 0).sum())

        def _share(count):
            # With no usable values there is nothing to divide by; report
            # "nan%" instead of dividing by zero.
            if total == 0:
                return "nan%"
            return f"{count / total * 100:.2f}%"

        stats = _mean_diff_summary(mean_diff)
        stats['positive_count'] = positive
        stats['negative_count'] = negative
        stats['positive_percentage'] = _share(positive)
        stats['negative_percentage'] = _share(negative)
        analysis['mean_diff_stats'] = stats

    if 'Promoters_Methylation_Down_Expression_Up.csv' in path_text:
        log2fc = df['Log2_Fold_Change'].dropna()
        mean_diff = df['Mean_Diff'].dropna()
        analysis['expression_stats'] = {
            'min_log2fc': float(log2fc.min()),
            'max_log2fc': float(log2fc.max()),
            'avg_log2fc': float(log2fc.mean()),
            'avg_mean_diff': float(mean_diff.mean())
        }

    return analysis

# Analyse every listed file. A failure is recorded as an {'error': message}
# entry for that file instead of stopping the loop.
for file in files:
    file_path = os.path.join(base_dir, file)
    try:
        outcome = analyze_csv(file_path)
    except Exception as exc:
        outcome = {'error': str(exc)}
    results[file] = outcome

# Build the summary: one row count per result file.
# Maps each summary label to the file whose row count it reports.
summary_sources = {
    'total_promoter_probes': 'PRAD_Methylation_Probes_in_Promoters.csv',
    'total_gene_body_probes': 'PRAD_Methylation_Probes_in_GeneBody.csv',
    'promoter_methyl_down_expr_up': 'PRAD_Promoters_Methylation_Down_Expression_Up.csv',
    'promoter_methyl_up_expr_down': 'PRAD_Promoters_Methylation_Up_Expression_Down.csv',
    'gene_body_methyl_down_expr_up': 'PRAD_GeneBody_Methylation_Down_Expression_Up.csv',
    'gene_body_methyl_up_expr_down': 'PRAD_GeneBody_Methylation_Up_Expression_Down.csv',
}

# A file with no entry, or whose entry has no 'row_count', is counted as 0.
summary = {
    label: results.get(file_name, {}).get('row_count', 0)
    for label, file_name in summary_sources.items()
}

# Display the results
print("Summary of Analysis:", summary)

# An entry in `results` can be an {'error': ...} record without any of the
# analysis keys, so every lookup below goes through .get() with a default
# instead of indexing directly.
if 'PRAD_Promoters_Methylation_Down_Expression_Up.csv' in results:
    print("\nDetailed Expression Stats (Promoters Down Methylation & Expression Up):",
          results['PRAD_Promoters_Methylation_Down_Expression_Up.csv'].get('expression_stats', {}))

if 'PRAD_GeneBody_Methylation_Down_Expression_Up.csv' in results:
    gene_body_down_up = results['PRAD_GeneBody_Methylation_Down_Expression_Up.csv']
    print("\nGene Body Row Count:", gene_body_down_up.get('row_count', 0))
    print("Gene Body Sample Data:", gene_body_down_up.get('sample_data', []))

if 'PRAD_Methylation_Probes_in_Promoters.csv' in results:
    print("\nPromoter Region Methylation Distribution:",
          results['PRAD_Methylation_Probes_in_Promoters.csv'].get('mean_diff_stats', {}))

print("\nCount of Promoter Methylation Up & Expression Down Genes:",
      summary['promoter_methyl_up_expr_down'])

# Surface files that could not be analysed, so a 0 in the summary is not
# mistaken for an empty result. Nothing is printed when every file loaded.
failed_files = {name: entry['error'] for name, entry in results.items() if 'error' in entry}
if failed_files:
    print("\nFiles that could not be analyzed:", failed_files)


Summary of Analysis: {'total_promoter_probes': 1931, 'total_gene_body_probes': 45, 'promoter_methyl_down_expr_up': 343, 'promoter_methyl_up_expr_down': 0, 'gene_body_methyl_down_expr_up': 8, 'gene_body_methyl_up_expr_down': 0}

Detailed Expression Stats (Promoters Down Methylation & Expression Up): {'min_log2fc': 1.0299512413352487, 'max_log2fc': 1.808220990218901, 'avg_log2fc': 1.3119848767335553, 'avg_mean_diff': -0.21617629763037624}

Gene Body Row Count: 8
Gene Body Sample Data: [{'Probe_ID': 'cg12814550', 'HGNC_Symbol': 'KIAA1210', 'Mean_Diff': -0.2431334471653807, 'adj_p_value': 2.7082425394247728e-27, 'Log2_Fold_Change': 2.243803417132592}, {'Probe_ID': 'cg01016662', 'HGNC_Symbol': 'KCNA3', 'Mean_Diff': -0.2273195041932236, 'adj_p_value': 2.5784501323599824e-25, 'Log2_Fold_Change': 1.4617609664805569}, {'Probe_ID': 'cg02614217', 'HGNC_Symbol': 'IL21-AS1', 'Mean_Diff': -0.2035768319355921, 'adj_p_value': 3.4456000714228656e-25, 'Log2_Fold_Change': 1.7896684965611689}]

Promoter R