In [35]:
import os
import warnings
from itertools import combinations

import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import r2_score

In [2]:
# Summary-statistics CSV for each molecular QTL type, keyed by the QTL label
# used throughout the notebook. Insertion order is the order the tables are
# loaded and compared in later cells.
qtl_dict = dict([
    ('inosine-QTL', '/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.8.1/Iqtl/nano_merge_I_summary.csv'),
    ('puQTL', '/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.8.1/puqtl/nano_merge_promoter_summary.csv'),
    ('m6A-QTL', '/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.8.1/m6Aqtl/nano_merge_m6A_summary.csv'),
    ('pseU-QTL', '/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.8.1/pseUqtl/nano_merge_pseU_summary.csv'),
    ('m5C-QTL', '/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.8.1/m5Cqtl/nano_merge_m5C_summary.csv'),
    ('stQTL', '/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.8.1/stqtl/nano_merge_stability_summary.csv'),
    ('3aQTL', '/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.8.1/3aqtl/nano_merge_APA_summary.csv'),
    ('irQTL', '/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.8.1/irqtl/nano_merge_isoform_summary.csv'),
])

In [3]:
def load_and_filter_qtl(qtl_file, qtl_type, pvalue_threshold=0.05, bayes_factor_threshold=3):
    """Load one QTL summary table and keep the strongest association per SNP.

    Rows are first filtered for significance, then collapsed so that each
    (chrom, strand, snp_pos_1base, rsID) combination keeps a single row.

    * ``qtl_type == "stQTL"``: significance is ``pvalue < pvalue_threshold``
      and the row with the smallest ``pvalue`` is kept.
    * any other ``qtl_type``: significance is
      ``BayesFactor > bayes_factor_threshold`` and the row with the largest
      ``BayesFactor`` is kept.

    Parameters
    ----------
    qtl_file : str or path-like
        CSV file containing at least the columns ``chrom``, ``strand``,
        ``snp_pos_1base``, ``rsID``, ``Beta`` and the score column for the
        given ``qtl_type`` (``pvalue`` or ``BayesFactor``).
    qtl_type : str
        QTL label; also used as the prefix of the returned effect-size column.
    pvalue_threshold : float, default 0.05
        Exclusive upper bound on ``pvalue`` (only used for ``"stQTL"``).
    bayes_factor_threshold : float, default 3
        Exclusive lower bound on ``BayesFactor`` (all other QTL types).

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom``, ``strand``, ``snp_pos_1base``, ``rsID`` and
        ``f"{qtl_type}_Beta"``; one row per SNP key that passed the filter.
        Empty (with the same columns) when nothing passes.

    Notes
    -----
    ``DataFrame.groupby`` drops groups whose key contains a missing value by
    default, so rows with e.g. a missing ``rsID`` do not appear in the result.
    """
    key_cols = ["chrom", "strand", "snp_pos_1base", "rsID"]
    is_stability_qtl = qtl_type == "stQTL"
    score_col = "pvalue" if is_stability_qtl else "BayesFactor"

    qtl_data = pd.read_csv(qtl_file, usecols=key_cols + [score_col, "Beta"])

    if is_stability_qtl:
        significant = qtl_data[qtl_data[score_col] < pvalue_threshold]
    else:
        significant = qtl_data[qtl_data[score_col] > bayes_factor_threshold]

    if significant.empty:
        # Nothing passed the filter: skip the group-wise arg-min/max so the
        # result does not depend on how groupby behaves on an empty frame.
        best = significant
    else:
        grouped_score = significant.groupby(key_cols)[score_col]
        # Index label of the strongest row within each SNP group.
        best_rows = grouped_score.idxmin() if is_stability_qtl else grouped_score.idxmax()
        best = significant.loc[best_rows]

    # Keep the key columns plus the effect size, renamed so tables of
    # different QTL types can be merged side by side.
    best = best[key_cols + ["Beta"]]
    return best.rename(columns={"Beta": f"{qtl_type}_Beta"})

In [4]:
# Load the filtered, one-row-per-SNP table for every QTL type, in the order
# the types are listed in qtl_dict.
dfs = [load_and_filter_qtl(path, name) for name, path in qtl_dict.items()]

# Positional aliases kept so any other cell that still refers to df1..df8
# continues to work (same order as the qtl_dict entries).
df1, df2, df3, df4, df5, df6, df7, df8 = dfs

# Start the merge from the largest table. max() returns the first of several
# equally long tables and, unlike a manual "longer than 0" scan, still yields
# a DataFrame when every table is empty.
max_df = max(dfs, key=len)
max_len = len(max_df)

# Outer-join all remaining tables on the SNP key so each SNP keeps the Beta of
# every QTL type it was significant for (NaN where it was not).
merge_keys = ["chrom", "strand", "snp_pos_1base", "rsID"]
merged_df = max_df
for df in dfs:
    if df is not max_df:
        merged_df = pd.merge(merged_df, df, on=merge_keys, how='outer')


In [34]:
def calculate_pearson_correlation(df, qtl1, qtl2, close_figure=True):
    """Correlate the effect sizes of two QTL types and save a scatter plot.

    Only rows where both ``f"{qtl1}_Beta"`` and ``f"{qtl2}_Beta"`` are present
    are used. The scatter plot is written to ``f"{qtl1}_{qtl2}_ß.pdf"`` in the
    current working directory, with the Pearson r and p-value in the title.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the columns ``f"{qtl1}_Beta"`` and ``f"{qtl2}_Beta"``.
    qtl1, qtl2 : str
        QTL labels; select the columns and name the axes and the output file.
    close_figure : bool, default True
        Close the figure once it has been saved so that calling this function
        in a loop does not accumulate open figures. Pass ``False`` to leave it
        open (e.g. for inline display in the notebook).

    Returns
    -------
    tuple of (float, float)
        Pearson correlation coefficient and its p-value. ``(nan, nan)`` when
        fewer than two paired observations exist; in that case a warning is
        issued and no file is written.
    """
    x_col = f'{qtl1}_Beta'
    y_col = f'{qtl2}_Beta'
    paired = df[[x_col, y_col]].dropna()

    # pearsonr needs at least two observations; report and skip the pair
    # instead of letting one sparse pair abort a whole batch of comparisons.
    if len(paired) < 2:
        warnings.warn(
            f'{qtl1} vs {qtl2}: only {len(paired)} shared SNP(s); '
            'skipping correlation and plot'
        )
        return float('nan'), float('nan')

    x = paired[x_col]
    y = paired[y_col]
    correlation, p_value = pearsonr(x, y)

    fig, ax = plt.subplots(figsize=(5, 4))
    # Scatter style: black circular markers of size 20.
    ax.scatter(x, y, color='black', marker='o', s=20)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    # Put the correlation result in the title; general format keeps very small
    # p-values readable instead of rounding them to 0.000.
    ax.set_title(f'Pearson r = {correlation:.2f}, p = {p_value:.3g}')
    ax.grid(True)
    fig.tight_layout()
    # NOTE(review): the 'ß' in the file name is presumably meant as a beta
    # sign; kept unchanged so existing output paths stay valid.
    fig.savefig(f'{qtl1}_{qtl2}_ß.pdf')
    if close_figure:
        plt.close(fig)
    return correlation, p_value

In [None]:
# All PDFs produced below are written relative to this directory.
os.chdir("/mnt/hpc/home/xuxinran/DirectSeq/8_downsteam/beta_pearson")

# QTL types to compare.
qtl_names = [
    'inosine-QTL', 'puQTL', 'm6A-QTL', 'pseU-QTL',
    'm5C-QTL', 'stQTL', '3aQTL', 'irQTL',
]

# One correlation + scatter plot for every unordered pair of QTL types.
for first_qtl, second_qtl in combinations(qtl_names, 2):
    calculate_pearson_correlation(merged_df, first_qtl, second_qtl)