In [1]:
import importlib
import nanotools
importlib.reload(nanotools) # reload nanotools module
import wiggle_functions as wf
import gzip
import sys
import pysam
import pyBigWig
import pandas as pd
import numpy as np



In [2]:
### Bed file configurations:
# Tunable inputs for selecting regions from the combined bed file. `sample_source`
# decides which of the three *_selected lists below becomes `selection`.
sample_source = "type" # "chr_type" or "type" or "chromosome"
chr_type_selected = ["X","Autosome"] # 'X' or "Autosome"
type_selected = ["her-1_FULL"]#,"intergenic_control"
#,"DPY27_chip_q2","DPY27_chip_q3","DPY27_chip_q4","intergenic_control"
#,"her-1_TSS","fem-1_TSS","fem-2_TSS","fem-3_TSS","sex-1_TSS"
# her-1_TSS/TES/FULL | TES_q1-4 | #TSS_q1-4 | strong/weak rex | whole_chr | 200kb_region | 50kb_region | center_DPY27_chip_albretton | gene_q1-q4 | MEX_motif | center_SDC2_chip_albretton | center_SDC3_chip_albretton |
# ATAC_seq_EXCL_dpy27_ol100 | ATAC_seq;DPY27_ol100
max_regions = 10 # max regions to consider; 0 = full set;
chromosome_selected = ["CHROMOSOME_I", "CHROMOSOME_II", "CHROMOSOME_III", "CHROMOSOME_IV","CHROMOSOME_V","CHROMOSOME_X"] #"CHROMOSOME_I", "CHROMOSOME_II", "CHROMOSOME_III", "CHROMOSOME_IV","CHROMOSOME_V",
strand_selected = ["+","-"] #+ and/or -
select_opp_strand = True #If you want to select both + and - strands for all regions set to True
down_sample_autosome = True # If you want to downsample autosome genes to match number of X genes set to True
# Downsampling is switched off when only X regions are selected.
if chr_type_selected == ["X"]:
    down_sample_autosome = False
bed_file = "/Data1/reference/tss_tes_rex_combined_v18_WS235.bed"
bed_window = 1000   # +/- around bed elements.
intergenic_window = 1000 # +/- around intergenic regions
mods = "a" # {A,CG,A+CG}

# Resolve `selection` through an explicit lookup so that a mistyped `sample_source`
# fails here, instead of leaving `selection` undefined (fresh kernel) or silently
# reusing the value from a previous run of this cell.
_selection_by_source = {
    "chr_type": chr_type_selected,
    "type": type_selected,
    "chromosome": chromosome_selected,
}
if sample_source not in _selection_by_source:
    raise ValueError(
        f"Unknown sample_source {sample_source!r}; expected one of {sorted(_selection_by_source)}"
    )
selection = _selection_by_source[sample_source]

In [3]:
### BAM Configurations
# Thresholds are configured as fractions and converted to the integer scale
# used in this notebook (fraction * 258, rounded to the nearest integer).
R9_m6A_thresh_percent = 0.85
R10_m6A_thresh_percent = 0.9
R10_5mC_thresh_percent = 0.9
# Reference points on that scale: default is 129 = 50%; 181=70%; 194=75%; 207 = 80%; 232 = 90%
R9_m6A_thresh = round(R9_m6A_thresh_percent * 258)
m6A_thresh = round(R10_m6A_thresh_percent * 258)
mC_thresh = round(R10_5mC_thresh_percent * 258)
# Echo the integer thresholds so the values in use are recorded in the cell output.
for _label, _value in (
    ("R9_m6A_thresh: ", R9_m6A_thresh),
    ("m6A_thresh: ", m6A_thresh),
    ("mC_thresh: ", mC_thresh),
):
    print(_label, _value)

# Path to the modkit binary; modkit aggregates methylation data from .bam files.
# https://nanoporetech.github.io/modkit/quick_start.html
modkit_path = "/Data1/software/modkit/modkit"

# Conditions to analyse; each entry is matched against the "conditions" column of the metadata table.
# Options: N2_fiber; SDC2_degron_fiber; SDC2_degron_bg; N2_bg; N2-DPY27_dimelo_pAHia5
analysis_cond = ["N2_old_fiber"]#,"51_dpy21null_fiber","52_dpy21jmjc_fiber"]
#analysis_cond = ["54_sdc2_3xmCNB_mChMCVIPI_GFPHia5","66_sdc2_3xGNB_GFPHia5_mChMCVIPI","50_dpy27-3xGNB_GFP-Hia5_mcvipi","N2-DPY27_dimelo_pAHia5","N2_fiber_mixed_R10","N2_bg_R10"] #N2_fiber","SDC2_degron_fiber"]#,"N2_young"] ,"N2_bg","N2_fiber"
# "66_sdc2_3xGNB_GFPHia5_mChMCVIPI","60_ama1_3xGNB_GFPHia5_mChMCVIPI", "50_dpy27-3xGNB_GFP-Hia5_mcvipi"

### IMPORT BAM FILES AND METADATA FROM CSV FILE
input_metadata = pd.read_csv("/Data1/git/meyer-nanopore/scripts/bam_input_metadata_2_21_2024.txt", sep="\t", header=0)
# Select the rows for the requested conditions once, then pull each column
# from that subset so the four lists stay index-aligned.
_bam_meta = input_metadata[input_metadata["conditions"].isin(analysis_cond)]
bam_files = _bam_meta["bam_files"].tolist()
conditions = _bam_meta["conditions"].tolist()
exp_ids = _bam_meta["exp_id_date"].tolist()
flowcells = _bam_meta["flowcell"].tolist()
# One subsampling fraction per BAM; 1 means the full .bam is used.
bam_fracs = [1] * len(bam_files)
sample_indices = list(range(len(bam_files)))

### Import CHIP SEQ / EXTERNAL DATA
# Targets to load from the bigWig metadata table; an empty list selects no external tracks.
# Available: h3_chip, sdc2_chip, sdc3_chip, dhs, gro, mnase, h4k20me1_chip, ama1_chip, "dpy27_chip","ama1_chip","mnase","gro"
ext_target = []
ext_metadata = pd.read_csv("/Data1/git/meyer-nanopore/scripts/bw_input_metadata_2_21_2024.txt", sep="\t", header=0)
# Filter once on the "target" column and read the three aligned columns from that subset.
_ext_meta = ext_metadata[ext_metadata["target"].isin(ext_target)]
ext_exp_ids = _ext_meta["exp_id_date"].tolist()
ext_files = _ext_meta["bw_files"].tolist()
ext_targets = _ext_meta["target"].tolist()

output_stem = "/Data1/seq_data/wig_files/"
# for dimelo: [181/258,194/258]
# Per-BAM threshold as a fraction: every entry starts at m6A_thresh/258, and
# entries whose flowcell label contains "R9" are replaced with R9_m6A_thresh/258.
thresh_list = [m6A_thresh/258] * len(bam_files)
for _idx, _flowcell in enumerate(flowcells):
    if "R9" in _flowcell:
        thresh_list[_idx] = R9_m6A_thresh/258

R9_m6A_thresh:  219
m6A_thresh:  232
mC_thresh:  232


In [4]:
# Filter input bed_file based on input parameters (e.g. chromosome, type, strand, etc.)
# Function saves a new filtered bed file to the same folder as the original bed file
# called temp_do_not_use_"type".bed
importlib.reload(nanotools)
new_bed_files=nanotools.filter_bed_file(
    bed_file,
    sample_source,
    selection,
    chromosome_selected,
    chr_type_selected,
    type_selected,
    strand_selected,
    max_regions,
    bed_window,
    intergenic_window
)

modkit_bed_name = "modkit_temp.bed"
modkit_bed_df = nanotools.generate_modkit_bed(new_bed_files, down_sample_autosome, select_opp_strand,modkit_bed_name)
nanotools.display_sample_rows(modkit_bed_df, 5)

# Subsample bam based on bam_frac, used to accelerate testing
# if bam_frac = 1 will use original bam files, otherwise will save new subsampled bam files to output_stem.
args_list = [(bam_file, condition, bam_frac, sample_index, output_stem) for bam_file, condition, bam_frac, sample_index in zip(bam_files,conditions,bam_fracs,sample_indices)]
new_bam_files=[]
new_bam_files = nanotools.parallel_subsample_bam(bam_files, conditions, bam_fracs, sample_indices, output_stem)

print("Program finished!")
print("new_bam_files: ", new_bam_files)
print("exp_ids: ", exp_ids)

# for each file in new_bed_files, convert to tabix
for bed_file in new_bed_files:
    ! tabix -f -p bed {bed_file}

Filtering bed file...
Configs: type ['her-1_FULL'] ['CHROMOSOME_I', 'CHROMOSOME_II', 'CHROMOSOME_III', 'CHROMOSOME_IV', 'CHROMOSOME_V', 'CHROMOSOME_X'] ['X', 'Autosome'] ['her-1_FULL'] ['+', '-'] 10 1000 1000
Chromosome ends:             chromosome  start       end strand       type  chr-type
2         CHROMOSOME_I      0  15072434      +  whole_chr  Autosome
17796    CHROMOSOME_II      0  15279421      +  whole_chr  Autosome
38585   CHROMOSOME_III      0  13783801      +  whole_chr  Autosome
55014    CHROMOSOME_IV      0  17493829      +  whole_chr  Autosome
74823     CHROMOSOME_V      0  20924180      +  whole_chr  Autosome
103825    CHROMOSOME_X      0  17718942      +  whole_chr         X
/Data1/reference/her-1_FULL.bed has been compressed successfully to /Data1/reference/her-1_FULL.bed.gz
Index created successfully for /Data1/reference/her-1_FULL.bed.gz.tbi
Saved the following bedfiles: ['/Data1/reference/her-1_FULL.bed.gz']
| modkit_bed_df | first 0 out of total 0 rows.


Unnamed: 0,0,1,2,3,4,5


Program finished!
new_bam_files:  ['/Data1/seq_data/AG_merged_N2_DPY21null_DPY21jmjc_11_30_24/mod_mappings_AG1_merged.sorted.bam']
exp_ids:  ['AG-22_11_30_23']


In [5]:
for each_bed in new_bed_files:
    for each_bam in new_bam_files:
        wig_filename = wf.modbam_to_wig(each_bam,each_bed,m6A_thresh,output_stem)
        print("wig_filename: ", wig_filename)
        # display header of wig file
        ! head -n 10 {wig_filename}

Loading BAM file: /Data1/seq_data/AG_merged_N2_DPY21null_DPY21jmjc_11_30_24/mod_mappings_AG1_merged.sorted.bam
BAM file /Data1/seq_data/AG_merged_N2_DPY21null_DPY21jmjc_11_30_24/mod_mappings_AG1_merged.sorted.bam has 2543536 reads
wig_filename:  None
head: cannot open 'None' for reading: No such file or directory
