In [6]:
from coloc import coloc, print_coloc_result
import pandas as pd
from subprocess import call
import itertools
import numpy as np

In [31]:
ASE_file = "/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.7.2/ASE/nano_merge_ASE_SNP.csv"
m6A_file = "/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.7.2/m6A/nano_merge_m6A_SNP.csv"
pseU_file = "/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.7.2/pseU/nano_merge_pseU_SNP.csv"
stability_file = "/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.7.2/stability/nano_merge_stability_SNP.csv"
isoform_file = "/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.7.2/isoform/nano_merge_isoform_SNP.csv"
polyA_file = "/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.7.2/apa/nano_merge_polyA_SNP.csv"
promoter_file = "/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.7.2/promoter/nano_merge_promoter_SNP.csv"
tss_file = "/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.7.2/tss/nano_merge_tss_SNP.csv"

gene_bed_file = "/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.7.2/stability/gene_merge.bed"
outdir = "/mnt/hpc/home/xuxinran/DirectSeq/data/zhaolin_240206/240201-zhaolin-RNA-merge/v0.7.2/colco"

chrom = "chr1"
strand = "+"
min_qtl_count=25
n = 104

In [32]:
def calculate_abf(se, beta, maf, n):
    Z = np.abs(beta / se)
    r = 1 / (n * maf * (1 - maf) * se**2) # Calculate shrinkage factor
    abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)
    return abf


In [33]:
def read_file(n, chrom, strand, full_summury, trait, trait_col=''):
    df = pd.read_csv(full_summury)
    df = df[(df['chrom'] == chrom)&(df['strand'] == strand)]
    df = df[df['p_value']<1]
    df[f'abf_{trait}'] = df.apply(lambda row: calculate_abf(row['SE'], row['beta'], row['EAF'], n), axis=1)
    df = df.dropna(subset=[f'abf_{trait}'])
    df = df.rename(columns={'p_value': f'p_value_{trait}'})
    common_col = ['rsID', 'chrom', 'snp_pos_1base', 'strand', 'EAF', f'abf_{trait}', f'p_value_{trait}']
    if trait_col:
        common_col.append(trait_col)
    df = df[common_col]
    return df

In [34]:
## 读所有的文件
ASE_df = read_file(n, chrom, strand, ASE_file, "ASE")
m6A_df = read_file(n, chrom, strand, m6A_file, "m6A", "m6A_pos_1base")
pseU_df = read_file(n, chrom, strand, pseU_file, "pseU", "pseU_pos_1base")
stability_df = read_file(n, chrom, strand, stability_file, "stability")
isoform_df = read_file(n, chrom, strand, isoform_file, "isoform")
polyA_df = read_file(n, chrom, strand, polyA_file, "polyA")
promoter_df = read_file(n, chrom, strand, promoter_file, "promoter")
tss_df = read_file(n, chrom, strand, tss_file, "tss")

## 合并所有的 DataFrame
dfs = [ASE_df, stability_df, isoform_df, polyA_df, promoter_df, tss_df, m6A_df, pseU_df]
merged_df = dfs[0]
for df in dfs[1:]:
    merged_df = pd.merge(merged_df, df, on=['rsID', 'chrom', 'snp_pos_1base', 'strand', 'EAF'], how='outer')
merged_df['non_null_count'] = merged_df.notnull().sum(axis=1) # 统计每行的非空值数量
merged_df = merged_df[merged_df['non_null_count'] >= 10]  # 4列键 + 至少2种trait(6列)，保证这个SNP至少是两种QTL
merged_df = merged_df.drop(columns=['non_null_count'])

## gene注释
gene_df = merged_df[['chrom', 'snp_pos_1base', 'strand', 'rsID']]
gene_df = gene_df.drop_duplicates()
gene_df['snp_pos_0base'] = gene_df['snp_pos_1base'] - 1
gene_df['score'] = 0
gene_df = gene_df[['chrom', 'snp_pos_0base', 'snp_pos_1base', 'rsID', 'score', 'strand']]
gene_df.to_csv(f'{outdir}/rs_{chrom}_{strand}.bed', sep='\t', index=False, header=False)
call(f'bedtools intersect -wa -wb -s -a {outdir}/rs_{chrom}_{strand}.bed -b {gene_bed_file} > {outdir}/gene_rs_{chrom}_{strand}.bed',shell=True)
gene_df = pd.read_csv(f'{outdir}/gene_rs_{chrom}_{strand}.bed',header=None,sep='\t',usecols = [0,2,3,5,9],names=['chrom','snp_pos_1base','rsID','strand','geneID'])
merged_df = pd.merge(merged_df, gene_df[['chrom', 'snp_pos_1base', 'rsID', 'strand', 'geneID']], on=['chrom', 'snp_pos_1base', 'rsID', 'strand'], how='left')
call(f'rm {outdir}/rs_{chrom}_{strand}.bed {outdir}/gene_rs_{chrom}_{strand}.bed',shell=True)

## 分区进行colco
gene_groups = merged_df.groupby('geneID')

for gene_group in gene_groups:
    geneID = gene_group[0]
    gene_df = gene_group[1]
    # 筛选1:首先对gene整体筛选 保证gene内至少有min_qtl_count个重叠QTL
    if len(set(gene_df['snp_pos_1base'])) >= min_qtl_count:
        # 运行下面的函数
        break
    else:
        # print(f"{geneID} has less than {min_qtl_count} shared QTLs, skip")
        continue


  abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)
  abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)
  df = pd.read_csv(full_summury)
  abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)
  df = pd.read_csv(full_summury)
  abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)
  abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)
  abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)
  abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)
  abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)
  abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)
  abf = np.sqrt(1 - r) * np.exp((Z**2 / 2) * r)


In [35]:
trait_columns = ['tss', 'promoter', 'polyA', 'isoform', 'stability', 'm6A', 'pseU', 'ASE']
for col in trait_columns:
    gene_df[col] = gene_df[f'p_value_{col}'].notna()
## 下面用来生成所有两两trait组合的df
sub_dfs = {}
for trait1, trait2 in itertools.combinations(trait_columns, 2):
    sub_df = gene_df[(gene_df[trait1] != False) & (gene_df[trait2] != False)]
    if len(set(sub_df['snp_pos_1base'])) >= min_qtl_count :
        sub_dfs[f"{trait1}_{trait2}"] = sub_df # 筛选2：两两trait组合的SNP数量大于阈值

In [152]:
def gene_2trait_combiner(geneID, gene_df, min_qtl_count=50, p_threshold=0.05):
    trait_columns = ['tss', 'promoter', 'polyA', 'isoform', 'stability', 'm6A', 'pseU', 'ASE']
    for col in trait_columns:
        gene_df[col] = gene_df[f'p_value_{col}'].notna()
    ## 下面用来生成所有两两trait组合的df
    sub_dfs = {}
    for trait1, trait2 in itertools.combinations(trait_columns, 2):
        sub_df = gene_df[(gene_df[trait1] != False) & (gene_df[trait2] != False)]
        if len(set(sub_df['snp_pos_1base'])) >= min_qtl_count :
            sub_dfs[f"{trait1}_{trait2}"] = sub_df # 筛选2：两两trait组合的SNP数量大于阈值
    if len(sub_dfs) == 0:
        print(f'In {geneID},all trait combinations has less than {min_qtl_count} shared QTLs')
        return None
    else:
        # 运行snp_combiner

In [41]:
df = sub_dfs['m6A_pseU']
trait1 = 'm6A'
trait2 = 'pseU'
df = df[["chrom" , "snp_pos_1base", "rsID", "strand", "EAF", f"abf_{trait1}", f"p_value_{trait1}", f"abf_{trait2}", f"p_value_{trait2}"]]
df = df.drop_duplicates()
unique_snps = df[df.duplicated('snp_pos_1base', keep=False) == False]
duplicate_snps = df[df.duplicated('snp_pos_1base', keep=False)]
grouped = duplicate_snps.groupby('snp_pos_1base')
groups = [group.index.tolist() for _, group in grouped] # 提取每个重复 SNP 的所有行
for group in groups:
    duplicate_sub_df = duplicate_snps.loc[group]
    colco_sub_df = pd.concat([unique_snps, duplicate_sub_df])
    ## 筛选3: 保证每个SNP的两个trait中 至少有一个p_value<0.05
    if (colco_sub_df[f"p_value_{trait1}"] > 0.05).all() and (colco_sub_df[f"p_value_{trait2}"] > 0.05).all():
        continue
    else:
        ## 进行snp_combiner
        break

In [None]:
def snp_combiner(df, trait1, trait2, p_threshold=0.05):
    df = df[["chrom" , "snp_pos_1base", "rsID", "strand", "EAF", f"abf_{trait1}", f"p_value_{trait1}", f"abf_{trait2}", f"p_value_{trait2}"]]
    df = df.drop_duplicates()
    unique_snps = df[df.duplicated('snp_pos_1base', keep=False) == False]
    duplicate_snps = df[df.duplicated('snp_pos_1base', keep=False)]
    grouped = duplicate_snps.groupby('snp_pos_1base')
    groups = [group.index.tolist() for _, group in grouped] # 提取每个重复 SNP 的所有行
    for group in groups:
        duplicate_sub_df = duplicate_snps.loc[group]
        colco_sub_df = pd.concat([unique_snps, duplicate_sub_df])
        ## 筛选3: 保证每个SNP的两个trait中 至少有一个p_value<0.05
        if (colco_sub_df[f"p_value_{trait1}"] > p_threshold).all() and (colco_sub_df[f"p_value_{trait2}"] > p_threshold).all():
            continue
        else:
            ## 进行snp_combiner
            break


In [53]:
res = coloc(trait1_lnbfs=colco_sub_df['abf_m6A'],trait2_lnbfs=colco_sub_df['abf_pseU'],prior1=1e-3,prior2=1e-3,prior12=1e-4,)

In [54]:
list(res)

[0.8704316162265129,
 0.05636531124196337,
 0.05448740339774299,
 0.0033749567290424502,
 0.01534071240473844]

In [55]:
help(print_coloc_result)

Help on function print_coloc_result in module coloc.coloc:

print_coloc_result(title: str, pp0: float, pp1: float, pp2: float, pp3: float, pp4: float)
    Print an ascii representation of the colocalization test result
    
    Parameters
    ----------
    title
        a title for the ascii art
    pp0
        the posterior probability of H0: no association
    pp1
        the posterior probability of H1: association in trait 1 only
    pp2
        the posterior probability of H2: association in trait 2 only
    pp3
        the posterior probability of H3: independent associations
    pp4
        the posterior probability of H4: colocalized associations



In [49]:
print_coloc_result('not colocalized', *res)


not colocalized

PP0: [|||||||||||||||||   ] [ 0.8704316162265129 ]
PP1: [|                   ] [ 0.05636531124196337 ]
PP2: [|                   ] [ 0.05448740339774299 ]
PP3: [                    ] [ 0.0033749567290424502 ]
PP4: [                    ] [ 0.01534071240473844 ]



In [None]:
def run_coloc_analysis(colco_sub_df,prior1_set=1e-4 ,prior2_set=1e-4 ,prior12_set=1e-5):
    result = coloc.coloc(
    trait1_lnbfs=LBFs_trait1,
    trait2_lnbfs=LBFs_trait2,
    prior1=prior1_set,
    prior2=prior2_set,
    prior12=prior12_set,
)


