In [2]:
import sys
import os
import pandas as pd
import numpy as np

In [1]:
# Root directories of the 20231211_mwe project. Two spellings of the root are in use
# ("/lila/data/..." and "/data/...").
# NOTE(review): presumably both resolve to the same directory on the cluster - confirm
# before unifying them.
_lila_base_path = "/lila/data/deyk/fabiha/20231211_mwe"
base_path = "/data/deyk/fabiha/20231211_mwe"

# Where the generated shell scripts / task files go, and where the per-column
# cluster score files live.
clustering_scripts_path = f"{_lila_base_path}/clustering_scripts"
clusters_path = f"{_lila_base_path}/clusters"

# Directories substituted into the script templates as output/input locations.
post_results_path = f"{base_path}/post_results"
results_path = f"{_lila_base_path}/results"
annot_path = f"{_lila_base_path}/annot"
ldsc_results_path = f"{_lila_base_path}/ldsc_results"

In [3]:
# Path to the gene location table; it is read with pd.read_table and supplies the
# "Ensembl" and "HGNC" columns used later to relabel genes.
gene_loc="/lila/data/deyk/IGVF/IGVF_Cellular_Programs_Benchmark/PoPS_P2D_benchmark/supplemental_data/NCBI37.3.ensembl.gene.loc"

In [4]:
# Load the tab-separated gene location table into a DataFrame.
gene_nomenclature = pd.read_csv(gene_loc, sep="\t")

In [5]:
# Keep only the first row seen for each Ensembl ID so every ID maps to a single row.
# Author's caveat: not the best method (which duplicate survives depends on file order),
# but the IDs must be unique.
gene_nomenclature = gene_nomenclature.drop_duplicates(subset="Ensembl", keep="first")

In [6]:
# Display the gene table after de-duplication on the "Ensembl" column.
gene_nomenclature

Unnamed: 0,NCBI,chr,start,end,strand,HGNC,Ensembl
0,79501,1,69091,70008,+,OR4F5,ENSG00000186092
1,100996442,1,142447,174392,-,LOC100996442,ENSG00000238009
2,729759,1,367659,368597,+,OR4F29,ENSG00000284733
3,81399,1,621096,622034,-,OR4F16,ENSG00000284662
4,148398,1,859993,879961,+,SAMD11,ENSG00000187634
...,...,...,...,...,...,...,...
18311,442867,Y,26764151,26785354,+,BPY2B,ENSG00000183795
18312,57054,Y,26909216,26959639,-,DAZ3,ENSG00000187191
18313,57135,Y,26979967,27053187,+,DAZ4,ENSG00000205916
18314,442868,Y,27177048,27198251,-,BPY2C,ENSG00000185894


# scripts

## build_module_annotations

In [7]:
# Shell-script template for the module-annotation step.
# Placeholders replaced by build_module_annotations(): {$CLUSTERS_PATH}, {$RESULTS_PATH},
# {$TISSUE}, {$TASKFILE} and {$NAME}.
# The rendered script walks the first column of the task file; for every entry it makes
# sure the bed and gene-score directories exist, then, for each file in the gene-score
# directory (".txt" stripped), submits a `bsub` job running build_module_annotations.R
# unless $bed_dir/$annot_name/100kb.bed is already present.
st_build_module_annotations_sh = r'''genescore_cell={$CLUSTERS_PATH}
bed_cell={$RESULTS_PATH}
enhancer_tissue={$TISSUE}

# ALL BLD GI - on ibd case control data

# ALL BLD - on alzheimer case control data
# ALL BLD - on msc case control data
# ALL BLD LNG - on asthma case control data

module load gcc/10.2.0
module load R

IFS="
"

TASKFILE={$TASKFILE}

for line in `cat $TASKFILE | awk '{print $1}' | sort | uniq`;
do
   temp=`echo $line | awk '{print $1}'`
   bed_dir=$bed_cell/$temp
   if [ ! -d $bed_dir ]
   then
       mkdir $bed_dir
   fi
   genescore_dir=$genescore_cell/$temp
   if [ ! -d $genescore_dir ]
   then
       mkdir $genescore_dir
   fi
   for ll in `ls -1 $genescore_dir | sed 's/\.txt//g' | awk '{print $1}' | sort | uniq`;
   do
      annot_name=`echo $ll | awk '{print $1}'`
      echo $temp $annot_name
     if [ ! -f $bed_dir/$annot_name/100kb.bed ]
      then
	  cmd="Rscript build_module_annotations.R  $genescore_dir $bed_dir $annot_name $enhancer_tissue"
          bsub -W 90 -R "rusage[mem=20]" -e geneS2G_{$NAME}.err -o geneS2G_{$NAME}.out -n 1 "$cmd"
      fi
   done
done
'''

def build_module_annotations(name, tissue="ALL"):
    """Write the task file and shell script for the module-annotation step.

    Two files are written under ``clustering_scripts_path``:

    * ``{name}.txt`` - contains just ``name`` (no trailing newline);
    * ``build_module_annotations_{name}_{tissue}.sh`` - the
      ``st_build_module_annotations_sh`` template with its placeholders filled in.

    Nothing is executed or submitted here; the script is only generated.

    Parameters
    ----------
    name : str
        Run identifier; used in both file names, as the task-file content and
        as the ``{$NAME}`` substitution.
    tissue : str, default "ALL"
        Value substituted for ``{$TISSUE}`` and appended to the script name.
    """
    script_path = f"{clustering_scripts_path}/build_module_annotations_{name}_{tissue}.sh"
    taskfile = f"{clustering_scripts_path}/{name}.txt"

    with open(taskfile, "w") as task_handle:
        task_handle.write(f"{name}")

    with open(script_path, "w") as script_handle:
        # Placeholders are substituted one after another, in this order.
        substitutions = (
            ("{$CLUSTERS_PATH}", clusters_path),
            ("{$RESULTS_PATH}", results_path),
            ("{$TASKFILE}", taskfile),
            ("{$NAME}", name),
            ("{$TISSUE}", tissue),
        )
        rendered = st_build_module_annotations_sh
        for placeholder, value in substitutions:
            rendered = rendered.replace(placeholder, value)
        script_handle.write(rendered)


## clean_bedgraphs

Run this script on a node from the command line.

In [8]:
# Shell-script template for the bedgraph clean-up step.
# Placeholders replaced by clean_bedgraphs(): {$RESULTS} and {$TASKFILE}.
# For every task-file entry the rendered script lists the files in
# $bed_cell/<entry>, takes the part of each file name before the first ".", then runs
# `bedtools sort` followed by `bedtools merge -c 4 -o max` and overwrites the original
# <name>.bed with the merged result. The commands run inline (no `bsub` submission).
st_clean_bedgraphs_sh = r'''bedops_cell=/data/deyk/extras/BEDOPS/bin
bedtools_cell=/data/deyk/extras/bedtools2/bin
bed_cell={$RESULTS}

TASKFILE={$TASKFILE}

for line in `cat $TASKFILE | awk '{print $1}' | sort | uniq`;
do
   annot_name=`echo $line | awk '{print $1}'`
   input_cell=$bed_cell/$annot_name
   echo  $input_cell
   names=`ls $input_cell | cut -f 1 -d '.'`
   for name in $names
   do
       $bedtools_cell/bedtools sort -i $input_cell/$name.bed > $input_cell/$name.2.bed
       $bedtools_cell/bedtools merge -i $input_cell/$name.2.bed -c 4 -o max > $input_cell/$name.3.bed
       mv $input_cell/$name.3.bed $input_cell/$name.bed
       rm $input_cell/$name.2.bed
   done
done
'''

def clean_bedgraphs(name):
    """Write the folder task file and the bedgraph clean-up script for ``name``.

    Files written under ``clustering_scripts_path``:

    * ``folders_{name}.txt`` - one line per entry of ``{clusters_path}/{name}``,
      each truncated at its first ``"."``, followed by a final newline;
    * ``clean_bedgraphs_{name}.sh`` - the ``st_clean_bedgraphs_sh`` template
      with ``{$RESULTS}`` set to ``{results_path}/{name}`` and ``{$TASKFILE}``
      set to the task file.

    Parameters
    ----------
    name : str
        Run identifier (sub-folder name under ``clusters_path`` / ``results_path``).
    """
    script_path = f"{clustering_scripts_path}/clean_bedgraphs_{name}.sh"
    taskfile = f"{clustering_scripts_path}/folders_{name}.txt"
    results = f"{results_path}/{name}"

    with open(taskfile, "w") as task_handle:
        stems = [entry.split(".")[0] for entry in os.listdir(f"{clusters_path}/{name}")]
        task_handle.write("\n".join(stems) + "\n")

    with open(script_path, "w") as script_handle:
        # The template currently has no {$NAME} placeholder; that substitution is
        # kept so the rendered text stays identical to what was produced before.
        script_handle.write(
            st_clean_bedgraphs_sh
            .replace("{$RESULTS}", results)
            .replace("{$TASKFILE}", taskfile)
            .replace("{$NAME}", name)
        )


## create_annot_from_bedgraph

In [9]:
# Shell-script template for the annotation-building step.
# Placeholders replaced by create_annot_from_bedgraph(): {$RESULTS}, {$ANNOT},
# {$TASKFILE} and {$NAME}.
# For every task-file entry and every bed file found under $bedfile_path/<entry>, the
# rendered script creates the matching annotation directories and submits a `bsub` job
# running make_annot_combine_from_bedgraph.py, unless <bedname>.22.annot.gz already exists.
st_create_annot_from_bedgraph_sh = r'''module load gcc/10.2.0
module load conda
module load anaconda/anaconda3
source /lila/home/fabihat/anaconda3/etc/profile.d/conda.sh
conda activate v2g2p
export PATH=$PATH:/data/deyk/extras/bedtools2/bin
#source activate ldsc
bedfile_path={$RESULTS}
bimfile_path=/data/deyk/kushal/extras/BIMS_hg38
annot_path={$ANNOT}
#ldsc_path=/n/groups/price/kushal/LDSC/ldsc
IFS="
"

TASKFILE={$TASKFILE}
for line in `cat $TASKFILE | awk '{print $1}' | sort | uniq`;
do
    name=`echo $line | awk '{print $1}'`
    if [ ! -d $annot_path/$name ]
    then
        mkdir $annot_path/$name
    fi
    for bedline in `ls $bedfile_path/$name/ | cat | sort | uniq | awk -F .'bed' '{print $1}'`;
    do
        bedname=`echo $bedline | awk '{print $1}'`
        if [ ! -d $annot_path/$name/$bedname ]
        then
            mkdir $annot_path/$name/$bedname
        fi
        if [ ! -f $annot_path/$name/$bedname/$bedname.22.annot.gz ]
        then
            cmd="/lila/home/deyk/.conda/envs/ldsc/bin/python make_annot_combine_from_bedgraph.py --bedname $bedname --bedfile_path $bedfile_path/$name --bimfile_path $bimfile_path --annot_path $annot_path/$name/$bedname"
            bsub -W 300 -R "rusage[mem=20]" -e annot_{$NAME}.err -o annot_{$NAME}.out -n 1 "$cmd"
        fi
    done
done
'''

def create_annot_from_bedgraph(name):
    """Write the folder task file and annotation-building script, and create the annot dir.

    Files written under ``clustering_scripts_path``:

    * ``folders_{name}.txt`` - one line per entry of ``{clusters_path}/{name}``,
      each truncated at its first ``"."``, followed by a final newline;
    * ``create_annot_from_bedgraph_{name}.sh`` - the
      ``st_create_annot_from_bedgraph_sh`` template with ``{$RESULTS}``,
      ``{$TASKFILE}``, ``{$ANNOT}`` and ``{$NAME}`` filled in.

    Finally ``{annot_path}/{name}`` is created; the rendered script runs a plain
    ``mkdir`` for sub-directories below it.

    Parameters
    ----------
    name : str
        Run identifier (sub-folder name under ``clusters_path``, ``results_path``
        and ``annot_path``).
    """
    filename = f"{clustering_scripts_path}/create_annot_from_bedgraph_{name}.sh"
    taskfile = f"{clustering_scripts_path}/folders_{name}.txt"
    results = f"{results_path}/{name}"
    annot = f"{annot_path}/{name}"
    with open(taskfile, "w") as f:
        f.write("\n".join([i.split(".")[0] for i in os.listdir(f"{clusters_path}/{name}")])+"\n")
    with open(filename, "w") as f:
        temp = st_create_annot_from_bedgraph_sh.replace("{$RESULTS}", results)
        temp = temp.replace("{$TASKFILE}", taskfile)
        temp = temp.replace("{$ANNOT}", annot)
        temp = temp.replace("{$NAME}", name)
        f.write(temp)
    # os.makedirs replaces the IPython-only `!mkdir` shell escape: it is plain Python,
    # does not complain when the directory already exists (so the function can be
    # called again for the same run), and also creates missing parent directories.
    os.makedirs(annot, exist_ok=True)

## ldsc_mega

In [10]:
# Shell-script template for the LD-score computation step.
# Placeholders replaced by ldsc_mega(): {$ANNOT}, {$TASKFILE} and {$NAME}.
# For every task-file entry, every directory listed under $annot_cell/<entry> and every
# chromosome 1..22, the rendered script submits a `bsub` job running `ldsc.py --l2`
# unless <annot_dir>.<chrom>.l2.ldscore.gz already exists.
st_ldsc_mega_sh = r'''annot_cell={$ANNOT}
ldsc_path=/data/deyk/kushal/ldsc
bfile_path=/data/deyk/kushal/LDSC/1000G_EUR_Phase3_plink_hg38
hapmap_path=/data/deyk/kushal/LDSC/hapmap3_snps

IFS="
"

TASKFILE={$TASKFILE}

#module load conda2
#source activate ldsc
module load anaconda/anaconda3
source /lila/home/fabihat/anaconda3/etc/profile.d/conda.sh
conda activate v2g2p

for line in `cat $TASKFILE | awk '{print $1}' | sort | uniq`;
do
   annot_module=`echo $line | awk '{print $1}'`
   echo $annot_cell $annot_module
   for ll in `ls $annot_cell/$annot_module | awk '{print $1}' | sort | uniq`;
   do
       annot_dir=`echo $ll | awk '{print $1}'`
       echo $annot_dir
       if [ ! -d $annot_cell/$annot_module/$annot_dir ]
       then
	   mkdir $annot_cell/$annot_module/$annot_dir
       fi
       for chrom in {1..22}
       do
       if [ ! -f $annot_cell/$annot_module/$annot_dir/$annot_dir.$chrom.l2.ldscore.gz ]
       then
           cmd="/lila/home/deyk/.conda/envs/ldsc/bin/python $ldsc_path/ldsc.py --bfile $bfile_path/1000G.EUR.QC.$chrom --l2 --ld-wind-cm 1 --yes-really --annot $annot_cell/$annot_module/$annot_dir/$annot_dir.$chrom.annot.gz --print-snps $hapmap_path/hm.$chrom.snp --out $annot_cell/$annot_module/$annot_dir/$annot_dir.$chrom"
           bsub -W 300 -R "rusage[mem=20]" -e mega_{$NAME}.err -o mega_{$NAME}.out -n 1 "$cmd"
       fi
    done
  done
done
'''

def ldsc_mega(name):
    """Write the folder task file and the LD-score script for ``name``.

    Files written under ``clustering_scripts_path``:

    * ``folders_{name}.txt`` - one line per entry of ``{clusters_path}/{name}``,
      each truncated at its first ``"."``, followed by a final newline;
    * ``ldsc_mega_{name}.sh`` - the ``st_ldsc_mega_sh`` template with
      ``{$TASKFILE}``, ``{$ANNOT}`` (``{annot_path}/{name}``) and ``{$NAME}``
      filled in.

    Parameters
    ----------
    name : str
        Run identifier (sub-folder name under ``clusters_path`` / ``annot_path``).
    """
    script_path = f"{clustering_scripts_path}/ldsc_mega_{name}.sh"
    taskfile = f"{clustering_scripts_path}/folders_{name}.txt"
    annot = f"{annot_path}/{name}"

    with open(taskfile, "w") as task_handle:
        cluster_entries = os.listdir(f"{clusters_path}/{name}")
        task_lines = "\n".join(entry.split(".")[0] for entry in cluster_entries)
        task_handle.write(task_lines + "\n")

    with open(script_path, "w") as script_handle:
        rendered = st_ldsc_mega_sh
        for placeholder, value in (
            ("{$TASKFILE}", taskfile),
            ("{$ANNOT}", annot),
            ("{$NAME}", name),
        ):
            rendered = rendered.replace(placeholder, value)
        script_handle.write(rendered)


## ldsc_reg

In [11]:
# Shell-script template for the LDSC regression step.
# Placeholders replaced by ldsc_reg(): {$ANNOT}, {$LDSC_RESULTS}, {$TASKFILE} and {$NAME}.
# The rendered script creates the output directories, then for every annotation module
# (task-file entry), every annotation directory below it and every sumstats file listed
# in $sumstats_taskfile submits a `bsub` job running `ldsc.py --h2`, unless the matching
# <sumstats_file>.results already exists. A missing module directory, annotation
# directory or sumstats file writes a message to ldsc_logfile_{$NAME}.log and exits
# with status 100, 101 or 102 respectively.
st_ldsc_reg_sh = r'''annot_cell={$ANNOT}
baseline_cell=/data/deyk/kushal/ENCODE_Flagship2023/ANNOTATIONS/Baselines
baseline_version=baseline_Epi_hg38
#baseline_version=baselineLD_v2.2
ldsc_path=/data/deyk/kushal/ldsc/
weights_path=/data/deyk/kushal/extras/1000G_EUR_Phase3_hg38/weights
freq_path=/data/deyk/kushal/extras/1000G_EUR_Phase3_hg38/plink_files
#sumstats_cell=/n/groups/price/ldsc/sumstats_formatted
sumstats_cell=/data/deyk/kushal/ENCODE_Flagship2023/sumstats
output_cell_pre={$LDSC_RESULTS}
IFS="
"

#sumstats_taskfile=/n/groups/price/kushal/singlecellLDSC/data/traits_bio.txt
sumstats_taskfile=/data/deyk/kushal/LDSC/TASKFILES/sumstats_encode.txt
annot_taskfile={$TASKFILE}

module load anaconda/anaconda3
source /lila/home/fabihat/anaconda3/etc/profile.d/conda.sh
conda activate v2g2p
#module load conda2
#source activate ldsc

if [ ! -d $output_cell_pre ]
then
    mkdir $output_cell_pre
fi

output_cell=$output_cell_pre/$baseline_version

if [ ! -d $output_cell ]
then
    mkdir $output_cell
fi

echo $output_cell
for line in `cat $annot_taskfile | awk '{print $1}' | sort | uniq`;
do
    annot_module=`echo $line | awk '{print $1}'`
    echo $annot_cell $annot_module
    if [ ! -d $annot_cell/$annot_module ]
    then
        echo "Error: annotation module directory not found" > ldsc_logfile_{$NAME}.log
        exit 100
    fi
    if [ ! -d $output_cell/$annot_module ]
    then
        mkdir $output_cell/$annot_module
    fi
    for ll in `ls $annot_cell/$annot_module | awk '{print $1}' | sort | uniq`;
    do
        annot_dir=`echo $ll | awk '{print $1}'`
        echo $annot_dir
        if [ ! -d $annot_cell/$annot_module/$annot_dir ]
        then
            echo "Error: annotation module directory not found" > ldsc_logfile_{$NAME}.log
            exit 101
        fi
        if [ ! -d $output_cell/$annot_module/$annot_dir ]
        then
            mkdir $output_cell/$annot_module/$annot_dir
        fi
        for step in `cat $sumstats_taskfile | awk '{print $1}' | sort | uniq`;
        do
            sumstats_file=`echo $step | awk '{print $1}'`
            echo $sumstats_cell $sumstats_file
            if [ ! -f $sumstats_cell/$sumstats_file ]
            then
                echo "Error: sumstats file not found" > ldsc_logfile_{$NAME}.log
                exit 102
            fi
            if [ ! -f $output_cell/$annot_module/$annot_dir/$sumstats_file.results ]
            then
            cmd="/lila/home/deyk/.conda/envs/ldsc/bin/python $ldsc_path/ldsc.py  --h2 $sumstats_cell/$sumstats_file --ref-ld-chr $annot_cell/$annot_module/$annot_dir/$annot_dir.,$baseline_cell/$baseline_version/baselineLD.  --frqfile-chr $freq_path/1000G.EUR.hg38. --w-ld-chr $weights_path/weights.hm3_noMHC. --overlap-annot --print-coefficients --print-delete-vals --out $output_cell/$annot_module/$annot_dir/$sumstats_file"
            bsub -W 300 -R "rusage[mem=20]" -e reg_max_{$NAME}.err -o reg_max_{$NAME}.out -n 1 "$cmd"
            fi
        done
    done
done
'''

def ldsc_reg(name):
    """Write the folder task file and the LDSC regression script for ``name``.

    Files written under ``clustering_scripts_path``:

    * ``folders_{name}.txt`` - one line per entry of ``{clusters_path}/{name}``,
      each truncated at its first ``"."``, followed by a final newline;
    * ``ldsc_reg_{name}.sh`` - the ``st_ldsc_reg_sh`` template with
      ``{$TASKFILE}``, ``{$ANNOT}`` (``{annot_path}/{name}``),
      ``{$LDSC_RESULTS}`` (``{ldsc_results_path}/{name}``) and ``{$NAME}``
      filled in.

    Parameters
    ----------
    name : str
        Run identifier (sub-folder name under ``clusters_path``, ``annot_path``
        and ``ldsc_results_path``).
    """
    script_path = f"{clustering_scripts_path}/ldsc_reg_{name}.sh"
    ldsc_results = f"{ldsc_results_path}/{name}"
    taskfile = f"{clustering_scripts_path}/folders_{name}.txt"
    annot = f"{annot_path}/{name}"

    with open(taskfile, "w") as task_handle:
        stems = []
        for entry in os.listdir(f"{clusters_path}/{name}"):
            stems.append(entry.split(".")[0])
        task_handle.write("\n".join(stems) + "\n")

    with open(script_path, "w") as script_handle:
        script_handle.write(
            st_ldsc_reg_sh
            .replace("{$TASKFILE}", taskfile)
            .replace("{$ANNOT}", annot)
            .replace("{$LDSC_RESULTS}", ldsc_results)
            .replace("{$NAME}", name)
        )


## get_sd_annot

In [12]:
# Shell-script template for the get_sd_annot step.
# Placeholders replaced by get_sd_annot(): {$ANNOT} and {$NAME}.
# The rendered script submits a single `bsub` job running get_sd_annot.R with the
# annotation directory and index=1 as arguments.
st_get_sd_annot_sh = r'''cellname={$ANNOT}
index=1
module load gcc/10.2.0
module load R

IFS=""

cmd="Rscript get_sd_annot.R  $cellname $index"
bsub -W 450 -R "rusage[mem=20]" -e getsd_{$NAME}.err -o getsd_{$NAME}.out -n 2 "$cmd"
'''

def get_sd_annot(name):
    """Write ``get_sd_annot_{name}.sh`` under ``clustering_scripts_path``.

    The script is the ``st_get_sd_annot_sh`` template with ``{$ANNOT}`` set to
    ``{annot_path}/{name}`` and ``{$NAME}`` set to ``name``. It is only
    written, not executed.

    Parameters
    ----------
    name : str
        Run identifier (sub-folder name under ``annot_path``).
    """
    script_path = f"{clustering_scripts_path}/get_sd_annot_{name}.sh"
    annot = f"{annot_path}/{name}"
    with open(script_path, "w") as script_handle:
        script_handle.write(
            st_get_sd_annot_sh.replace("{$ANNOT}", annot).replace("{$NAME}", name)
        )


## ldsc_postprocess

In [13]:
# Shell-script template for the LDSC post-processing step.
# Placeholders replaced by ldsc_postprocess(): {$ANNOT}, {$LDSC_RESULTS} and {$NAME}.
# For every sumstats file listed in $sumstats_taskfile the rendered script counts the
# entries of $output_cell that lack <trait>_ldsc_postprocess.txt (trait = file name
# without a trailing ".sumstats") and, if at least one is missing, submits a `bsub` job
# running ldsc_postprocess.R.
st_ldsc_postprocess_sh = r'''annot_cell={$ANNOT}
output_cell_pre={$LDSC_RESULTS}
baseline_version=baseline_Epi_hg38
#baseline_version=baselineLD_v2.2
output_cell=$output_cell_pre/$baseline_version

sumstats_taskfile=/data/deyk/kushal/LDSC/TASKFILES/sumstats_encode.txt
#sumstats_taskfile=/n/groups/price/kushal/singlecellLDSC/data/traits_bio.txt

IFS="
"

module load gcc/10.2.0
module load R

flag=0
index_in_results=1 ## which annotation to choose from the .results file in case of multiple annotations


for step in `cat $sumstats_taskfile | awk '{print $1}' | sort | uniq`;
do
sumstats_file=`echo $step | awk '{print $1}'`
echo $sumstats_file
sumstats_file2=${sumstats_file%.sumstats}

counter1=0
for step2 in `ls $output_cell | awk '{print $1}' | sort | uniq`;
do
    annot_name=`echo $step2 | awk '{print $1}'`
    if [ ! -f $output_cell/$annot_name/${sumstats_file2}_ldsc_postprocess.txt ]
    then
	counter1=$(($counter1+1))
    fi
done

if (( $counter1 > 0 ))
then
    echo $sumstats_file2
    cmd="Rscript ldsc_postprocess.R  $annot_cell $output_cell $sumstats_file $flag $index_in_results"
    bsub -W 270 -R "rusage[mem=20]" -e ldsc_post_{$NAME}.err -o ldsc_post_{$NAME}.out -n 1 "$cmd"
    #sbatch --time=40:00 --mem=20000 --output=ldsc_post.out --error=ldsc_post.err -p short -c 1 --wrap="$cmd"
fi
done
'''

def ldsc_postprocess(name):
    """Write ``ldsc_postprocess_{name}.sh`` under ``clustering_scripts_path``.

    The script is the ``st_ldsc_postprocess_sh`` template with ``{$ANNOT}``
    set to ``{annot_path}/{name}``, ``{$LDSC_RESULTS}`` set to
    ``{ldsc_results_path}/{name}`` and ``{$NAME}`` set to ``name``. It is only
    written, not executed.

    Parameters
    ----------
    name : str
        Run identifier (sub-folder name under ``annot_path`` and
        ``ldsc_results_path``).
    """
    script_path = f"{clustering_scripts_path}/ldsc_postprocess_{name}.sh"
    ldsc_results = f"{ldsc_results_path}/{name}"
    annot = f"{annot_path}/{name}"
    with open(script_path, "w") as script_handle:
        rendered = st_ldsc_postprocess_sh
        for placeholder, value in (
            ("{$ANNOT}", annot),
            ("{$LDSC_RESULTS}", ldsc_results),
            ("{$NAME}", name),
        ):
            rendered = rendered.replace(placeholder, value)
        script_handle.write(rendered)


## marginal meta enrichment

In [14]:
# Shell-script template for the marginal meta-enrichment step.
# Placeholders replaced by marginal_meta_enrichment(): {$NAME}, {$ANNOT_PATH},
# {$LDSC_RESULTS_PATH} and {$POST_RESULTS_PATH}.
# The rendered script submits a single `bsub` job running marginal_meta_enrichment.R
# with those four values as arguments.
meta_enr_sh = r'''module load gcc/10.2.0
module load R

IFS=""

cmd="Rscript marginal_meta_enrichment.R {$NAME} {$ANNOT_PATH} {$LDSC_RESULTS_PATH} {$POST_RESULTS_PATH}"
bsub -W 450 -R "rusage[mem=20]" -e marginal_meta_enrichment_{$NAME}.err -o marginal_meta_enrichment_{$NAME}.out -n 2 "$cmd"
'''

def marginal_meta_enrichment(name):
    """Write ``marginal_meta_enrichment_{name}.sh`` under ``clustering_scripts_path``.

    The script is the ``meta_enr_sh`` template with ``{$ANNOT_PATH}``,
    ``{$LDSC_RESULTS_PATH}`` and ``{$POST_RESULTS_PATH}`` set to the
    module-level ``annot_path``, ``ldsc_results_path`` and
    ``post_results_path``, and ``{$NAME}`` set to ``name``. It is only
    written, not executed.

    Parameters
    ----------
    name : str
        Run identifier passed to the R script and used in the file names.
    """
    script_path = f"{clustering_scripts_path}/marginal_meta_enrichment_{name}.sh"
    with open(script_path, "w") as script_handle:
        script_handle.write(
            meta_enr_sh
            .replace("{$ANNOT_PATH}", annot_path)
            .replace("{$LDSC_RESULTS_PATH}", ldsc_results_path)
            .replace("{$POST_RESULTS_PATH}", post_results_path)
            .replace("{$NAME}", name)
        )


## marginal meta taustar

In [15]:
# Shell-script template for the marginal meta-taustar step.
# Placeholders replaced by marginal_meta_taustar(): {$NAME}, {$ANNOT_PATH},
# {$LDSC_RESULTS_PATH} and {$POST_RESULTS_PATH}.
# The rendered script submits a single `bsub` job running marginal_meta_taustar.R
# with those four values as arguments.
meta_tau_sh = r'''module load gcc/10.2.0
module load R

IFS=""

cmd="Rscript marginal_meta_taustar.R {$NAME} {$ANNOT_PATH} {$LDSC_RESULTS_PATH} {$POST_RESULTS_PATH}"
bsub -W 450 -R "rusage[mem=20]" -e marginal_meta_taustar_{$NAME}.err -o marginal_meta_taustar_{$NAME}.out -n 2 "$cmd"
'''

def marginal_meta_taustar(name):
    """Write ``marginal_meta_taustar_{name}.sh`` under ``clustering_scripts_path``.

    The script is the ``meta_tau_sh`` template with ``{$ANNOT_PATH}``,
    ``{$LDSC_RESULTS_PATH}`` and ``{$POST_RESULTS_PATH}`` set to the
    module-level ``annot_path``, ``ldsc_results_path`` and
    ``post_results_path``, and ``{$NAME}`` set to ``name``. It is only
    written, not executed.

    Parameters
    ----------
    name : str
        Run identifier passed to the R script and used in the file names.
    """
    script_path = f"{clustering_scripts_path}/marginal_meta_taustar_{name}.sh"
    with open(script_path, "w") as script_handle:
        rendered = meta_tau_sh
        for placeholder, value in (
            ("{$ANNOT_PATH}", annot_path),
            ("{$LDSC_RESULTS_PATH}", ldsc_results_path),
            ("{$POST_RESULTS_PATH}", post_results_path),
            ("{$NAME}", name),
        ):
            rendered = rendered.replace(placeholder, value)
        script_handle.write(rendered)


# runner

In [18]:
# Run identifier: it is the stem of the input CSV, the name of the cluster sub-folder,
# and the argument handed to every script-writing function below.
name = "transCCA_Z_Diag_k10_sumabsCV"

In [19]:
# Load the matrix for this run; the first CSV column becomes the index, which the
# following cells match against the "Ensembl" column of gene_nomenclature.
v = pd.read_csv(f"/lila/data/deyk/fabiha/20231221_xuewei_trans/yifei_eqtlgen/{name}.csv", index_col=0)

In [20]:
# Keep only the rows whose index label appears in the "Ensembl" column of the gene table.
v = v[v.index.isin(gene_nomenclature["Ensembl"])]

In [21]:
# Relabel the index from Ensembl IDs to HGNC symbols.
# rename() returns a new frame and leaves labels that are not keys of the mapping
# untouched, so re-running this cell is a no-op. Assigning a list built with
# ensembl_hgnc[...] to v.index raised KeyError on a second run, because by then the
# labels were already symbols rather than Ensembl IDs.
ensembl_hgnc = dict(zip(gene_nomenclature["Ensembl"], gene_nomenclature["HGNC"]))
v = v.rename(index=ensembl_hgnc)

In [22]:
# Display the matrix after its index has been relabelled with HGNC symbols.
v

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10
JAM3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PF4V1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
MAP3K7CL,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
VENTX,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
PARP8,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...
BIK,-0.000474,-0.001969,0.000000,-0.005485,-0.001342,0.000000,-0.007607,0.000000,0.000000,0.000000
SLAIN2,-0.003047,0.000124,-0.005790,-0.002246,-0.004931,-0.005208,-0.007557,0.006216,0.002605,0.000807
TRIM26,-0.005162,-0.008880,-0.004738,-0.014747,-0.002550,0.005638,0.000000,0.003993,0.002162,0.000738
FANCA,0.006001,-0.008233,0.000000,-0.005089,-0.001705,0.012942,0.002908,-0.002225,-0.002259,0.000000


In [23]:
# Create the cluster folder for this run. os.makedirs(exist_ok=True) replaces the
# IPython `!mkdir $temp` shell escape: it is plain Python, and re-running the cell
# no longer reports an error when the folder already exists.
temp = f"/lila/data/deyk/fabiha/20231211_mwe/clusters/{name}"
os.makedirs(temp, exist_ok=True)

In [24]:
# Write one tab-separated file without a header row per column of v, holding the
# absolute values of that column (index label first, value second).
for col in v.columns:
    abs_scores = v[[col]].abs()
    out_path = f"/lila/data/deyk/fabiha/20231211_mwe/clusters/{name}/{name}_{col}.txt"
    abs_scores.to_csv(out_path, header=None, sep="\t")

In [25]:
# Generate every shell script for this run, in the order listed. These calls write
# task files and scripts (create_annot_from_bedgraph also creates the annotation
# folder); none of them runs or submits a script.
pipeline_script_writers = (
    build_module_annotations,
    clean_bedgraphs,
    create_annot_from_bedgraph,
    ldsc_mega,
    ldsc_reg,
    get_sd_annot,
    ldsc_postprocess,
    marginal_meta_enrichment,
    marginal_meta_taustar,
)
for write_script in pipeline_script_writers:
    write_script(name)