In [3]:
# Purpose of this script is to take in a number of .bam files and a .bed file and return %m6A across all regions in the .bam files file
# in bin increments.
__author__ = "Yuri Malina"
__contact__ = "ymalina@berkeley.edu"
__copyright__ = "The Meyer Lab, UC Berkeley"
__credits__ = [""]
__date__ = "3/27/2023"
__deprecated__ = False
__status__ = "In development"
__version__ = "0.0.1"

In [1]:
import pandas as pd
import numpy as np
import random
import os

import plotly.express as px # Used for plotting
import plotly.graph_objects as go # Used for plotting
from plotly.subplots import make_subplots # Used for plotting

import pysam
from scipy import stats
from scipy import signal #used for smoothing in plots

import multiprocessing # used for parallel processing
from multiprocessing import Pool # used for parallel processing
lock1 = multiprocessing.Lock()

# install tabix with:
# apt-get install tabix 

In [2]:
### Configurations
m6A_thresh = 194 #default is 129 = 50%; 181=70%; 194=75%; 207 = 80%; 232 = 90%
#mC_thresh = 129 #default is 129

## Bed file configurations:
sample_source = "type" # "chr_type" or "type" or "chromosome"
sampleName = ["X","Autosome"] # "TES_q1" "strong_rex" "weak_rex" "type", "X", "Autosome"; Must be same number of unique values in selected bed rows.
chr_type_selected = ["X","Autosome"] # 'X' or "Autosome"
type_selected = ["TSS_q1","TSS_q2","TSS_q3","TSS_q4"] #TES_q1-4 | #TSS_q1-4 | strong/weak rex | whole_chr | 200kb_region | 50kb_region
max_regions = 0 # max regions to consider; 0 = full set;
chromosome_selected = ["CHROMOSOME_X","CHROMOSOME_I", "CHROMOSOME_II", "CHROMOSOME_III", "CHROMOSOME_IV", "CHROMOSOME_V"]
strand_selected = ["+","-"] #+ and/or -
bed_file = "/Data1/reference/tss_tes_rex_combined.bed"
bed_window = 1000 # +/- around bed elements.
mods = "A" # {A,CG,A+CG}
plotting_mov_avg = 30 # number of bins to use for moving average in plots
if sample_source == "chr_type":
    selection = chr_type_selected
if sample_source == "type":
    selection = type_selected
if sample_source == "chromosome":
    selection = chromosome_selected

## Bam file configurations
random.seed(10)

### Tube D
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/mod_mappings.sorted.bam"
#output_stem = "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/"
#condition = "N2; 2uM Hia5 30min"

### Tube 4
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam"
#output_stem = "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/"
#condition = "N2; 2uM Hia5 120min"

### Tube H
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam"
#output_stem = "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/"
#condition = "AID::SDC-2 + Auxin; 2uM Hia5 30min"

### Tube T
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/mod_mappings.sorted.m6Aonly.bam"
#output_stem = "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/"
#condition = "N2 Young; 2uM Hia5 30min"

#"/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/analysis/mod_mappings_No-Met-RG_0.1_200kb_region.sorted.bam"
#"No-Met-RG",

'''### Time Course
#bam_fracs = [0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1] # For full .bam set to = 1
bam_fracs = [1,1,1,1,1,1,1,1,1,1] # For full .bam set to = 1
bam_files = [
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_01.bam",
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_02.bam",
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_03.bam",
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_05.bam",
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_04.bam",
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_06.bam",
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_07.bam",
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_08.bam",
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_09.bam",
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_10.bam"
]
output_stem = "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/analysis/"
conditions = ["N2-No-Met","N2-3-min","N2-10-min","N2-30-min", "N2-120-min",
             "#021+Aux-No-Met","#021+Aux-3-min","#021+Aux-10-min","#021+Aux-30-min", "#021+Aux-120-min"]
conditions_min=[0,3,10,30,120,0,3,10,30,120]'''

### Time Course
#bam_fracs = [0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1] # For full .bam set to = 1
bam_fracs = [1,1,1,1] # For full .bam set to = 1
bam_files = [
    "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/mod_mappings.sorted.bam",
    "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam",
    "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam",
    "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/mod_mappings.sorted.m6Aonly.bam"
]
output_stem = "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/analysis/"
conditions = ["N2-2uM-Hia5-30min","AID-SDC-2Auxin-2uM-Hia5-30min",
              "N2-2uM-Hia5-120min","N2-Young-2uM-Hia5-30min"]
conditions_min=[30,30,120,30]
file_prefix = "dec_23_"

In [3]:
### Select bed file
full_bed = pd.read_csv(bed_file,sep='\t')
bed=[]

for each_type in selection:
# REGION CONFIGURATION
    if sample_source == "type":
        temp_bed = full_bed[full_bed["chromosome"].isin(chromosome_selected) &
                            full_bed["chr-type"].isin(chr_type_selected) &
                            full_bed["type"].str.contains(each_type) &
                            full_bed["strand"].isin(strand_selected)]
    if sample_source == "chr_type":
        temp_bed = full_bed[full_bed["chromosome"].isin(chromosome_selected) &
                            full_bed["chr-type"].str.contains(each_type) &
                            full_bed["type"].isin(type_selected) &
                            full_bed["strand"].isin(strand_selected)]
    if sample_source == "chromosome":
        temp_bed = full_bed[full_bed["chromosome"].str.contains(each_type) &
                            full_bed["chr-type"].isin(chr_type_selected) &
                            full_bed["type"].isin(type_selected) &
                            full_bed["strand"].isin(strand_selected)]

    # Drop random regions to match max_regions
    drop_count = len(temp_bed)-max_regions
    # If max regions > selected regions, do not drop any.
    if(drop_count<0):
        drop_count=0
    # If max_regions = 0, do not drop any.
    if (max_regions == 0):
        drop_count = 0

    drop_indices = np.random.choice(temp_bed.index, drop_count, replace=False)
    temp_bed.drop(drop_indices,inplace=True)
    temp_bed.sort_values(by=["chromosome","start"],ascending=True,inplace=True)
    temp_bed.reset_index(drop=True, inplace=True)

    # If feature is closer than bed window to chr start skip
    temp_bed["start"]=temp_bed["start"] - bed_window
    temp_bed["end"]=temp_bed["end"] + bed_window

    # Drop rows where start < 0
    temp_bed = temp_bed[temp_bed["start"]>0]
    # and reset index
    temp_bed.reset_index(drop=True, inplace=True)

    temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed"
    temp_bedfile_gz = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
    temp_bed.to_csv(temp_bedfile, sep="\t",header=False,index=False)
    
    # Create indexed tabix files
    ! bgzip -c {temp_bedfile} > {temp_bedfile_gz}
    ! tabix -f -p bed {temp_bedfile_gz}

    # For first iteration
    if bed == []:
        bed = [temp_bedfile]

    # Otherwise append region to temporary bed file.
    else:
        bed.append(temp_bedfile)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_bed.drop(drop_indices,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_bed.sort_values(by=["chromosome","start"],ascending=True,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_bed["start"]=temp_bed["start"] - bed_window
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docume

In [26]:
### Extract only reads in bam file that overlap selected regions, and subselect down using fraction
### NOT NECESSARY IF RUNNING WHOLE CHROMOSOMES
# Define function to generate redux .bam file
def extract_overlapping_reads(bed_file, bam_file, output_file):
    # Load the BAM file
    bam_ext = pysam.AlignmentFile(bam_file, "rb")

    # Load the BED file
    regions = pysam.TabixFile(bed_file)

    '''# Create the output BAM file
    out_bam = pysam.AlignmentFile(output_file, "wb", template=bam)

    # Iterate over the reads in the input BAM file
    for read in bam:
        # Check if the read overlaps a region in the BED file
        overlaps = [r for r in regions.fetch(read.reference_name, read.reference_start, read.reference_end)]
        if len(overlaps) > 0:
            # Write the read to the output BAM file if it overlaps a region in the BED file
            out_bam.write(read)

    # Close the input BAM file and the output BAM file
    bam.close()
    out_bam.close()'''

    # Create a set to store the reads
    seen = set()

    # Create the output .bam file
    with pysam.AlignmentFile(output_file, "wb", header=bam_ext.header) as out:
        # Iterate over the regions in the .bed file
        for region in regions.fetch(multiple_iterators=True):
            print(region)
            chrom, start, end, strand, region_type, chr_type = region.split()
            start, end = int(start), int(end)

            # Iterate over the reads in the current region
            for pileupcolumn in bam_ext.pileup(chrom, start, end):
                for pileupread in pileupcolumn.pileups:
                    # Check if the read has not been seen before
                    if pileupread.alignment.query_name not in seen:
                        # Add the read to the set of seen reads
                        seen.add(pileupread.alignment.query_name)
                        # Write the read to the output .bam file
                        out.write(pileupread.alignment)
                        
                        
def process_bam_file(bam_file, condition, bam_frac, selection, output_stem):
    with lock1:
        print("starting on: ",bam_file)
    for each_type in selection:
        temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
        output_bamfile = output_stem + "mod_mappings_" + condition + "_" + str(bam_frac)+"_"+each_type+".sorted.bam"
        extract_overlapping_reads(temp_bedfile, bam_file, output_bamfile)
        ### Subselect bam file using fraction
        ! samtools view -h -s {bam_frac} -L {temp_bedfile} {bam_file} | samtools view -h -b - > {output_bamfile}
        ! samtools index {output_bamfile}

# Parallelize for each bam file:
args_list = [(bam_file, condition, bam_frac, selection, output_stem) for bam_file, condition, bam_frac in zip(bam_files,conditions,bam_fracs)]
if __name__ == "__main__":
    with Pool() as pool: #processes=1
        result = pool.starmap(process_bam_file, args_list)
    print("Program finished!")
    
'''for bam_file, condition, bam_frac in zip(bam_files,conditions, bam_fracs):
    print("Starting on: ",bam_file," | chr type: ", condition," | with bam fraction: ",bam_frac)
    for each_type in selection:
        temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
        output_bamfile = output_stem + "mod_mappings_" + condition + "_" + str(bam_frac)+"_"+each_type+".sorted.bam"
        extract_overlapping_reads(temp_bedfile, bam_file, output_bamfile)
        ### Subselect bam file using fraction
        ! samtools view -h -s {bam_frac} -L {temp_bedfile} {bam_file} | samtools view -h -b - > {output_bamfile}
        ! samtools index {output_bamfile}'''

starting on:  /Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_01.bam
starting on:  /Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_02.bam
starting on:  /Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_03.bam
starting on:  /Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_05.bam
starting on:  /Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_04.bam
CHROMOSOME_I	26595	28595	-	TSS_q1	Autosome
starting on:  /Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_06.bam
starting on: CHROMOSOME_I	26595	28595	-	TSS_q1	Autosome
 /Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_07.bam
CHROMOSOME_I	26595	28595	-	TSS_q1	Autosome
starting o

'for bam_file, condition, bam_frac in zip(bam_files,conditions, bam_fracs):\n    print("Starting on: ",bam_file," | chr type: ", condition," | with bam fraction: ",bam_frac)\n    for each_type in selection:\n        temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"\n        output_bamfile = output_stem + "mod_mappings_" + condition + "_" + str(bam_frac)+"_"+each_type+".sorted.bam"\n        extract_overlapping_reads(temp_bedfile, bam_file, output_bamfile)\n        ### Subselect bam file using fraction\n        ! samtools view -h -s {bam_frac} -L {temp_bedfile} {bam_file} | samtools view -h -b - > {output_bamfile}\n        ! samtools index {output_bamfile}'

In [None]:
### Calculate average m6A at each position in bed_window/2 centered on selected regions
def extract_m6A_per_base(bam_file2, bed_file, threshold,condition):
    global bed_window
    global file_prefix
    # Load the BAM file
    bam_ext = pysam.AlignmentFile(bam_file2, "rb")

    # Load the BED file
    regions = pysam.TabixFile(bed_file)

    # Initialize dataframe to store results
    data_header = ["chr","start","end","strand","region_type","chr_type","condition","read","base_type"]
    data_header_len = len(data_header)
    methylation_status = pd.DataFrame(columns= data_header + list(map(str,list(range(bed_window*2)))))

    # For each region in input bed file
    for region in regions.fetch(multiple_iterators=True):
        # Split the region string into the chromosome, start, and end positions
        #print("Starting on region:",region)
        chromosome, start, end, strand, region_type, chr_type = region.split()
        start = int(start)
        end = int(end)
        
        # Iterate over the reads that overlap the region
        for read in bam_ext.fetch(chromosome, start, end,multiple_iterators=True):
            if not read.is_unmapped:
                # Get read name and positions on reference
                read_name = read.query_name
                ref_positions = read.get_reference_positions()  # Get the positions on the reference genome that the read covers

                # Set start and end index, if in region of interest
                start_index = ref_positions.index(start) if start in ref_positions else None
                end_index = ref_positions.index(end) if end in ref_positions else None

                # Note: Currently only handles reads that fully overlap region of interest.
                # consider adding handling if read does not overlap entire region
                if start_index is not None and end_index is not None:  # If the read covers both the start and end positions
                    #print("start_index_in_read: ", start_index," | end_index_in_read: ", end_index)
                    sequence = read.query_sequence[start_index:end_index]  # Extract the part of the read sequence that overlaps with the specified genomic region
                    #print("sequence:", sequence)

                    # Pre populate row with 0s
                    seq_row_A = [chromosome, start, end, strand,region_type, chr_type,condition,read_name,"A"] + [0] *(bed_window*2)
                    seq_row_m6A = [chromosome, start, end, strand,region_type, chr_type,condition,read_name,"m6A"] + [0] *(bed_window*2)

                    # Extract the modified base probabilities for the read
                    # and set strand and canonical base for denominator.
                    # Since reverse strand is stored as revC in pysam, we need to count 'T's
                    if read.is_forward == True:
                        try: m6A_dict = read.modified_bases[('A', 0, 'Y')]
                        except: m6A_dict = read.modified_bases[('A', 0, 'a')]
                        mod_can_base = "A"
                        strand = "+"

                    if read.is_forward == False:
                        try: m6A_dict = read.modified_bases[('A', 1, 'Y')]
                        except: m6A_dict = read.modified_bases[('A', 1, 'a')]
                        strand = "-"
                        mod_can_base = "T"

                    # Extract the modified base probabilities only in the region of interest and convert to dict
                    m6A_dict = [t for t in m6A_dict if t[0] >= start_index and t[0] <= end_index]
                    m6A_dict = dict(m6A_dict)

                    # For each position in the region of interest, set value to 1 in the A row if A/T present
                    # and set m6A value to 1 if m6A present.
                    for i, pos in enumerate(ref_positions[start_index:end_index]):  # Loop over the positions in the extracted sequence
                        #print("i+start_index: ",i+start_index," | pos:",pos)
                        #print("SEQUENCE[",pos,"]:", sequence[i])
                        if sequence[i] == mod_can_base:  # If the base is A (or T for reverse reads), increment the count for that position in the dictionary
                            seq_row_A[i+data_header_len] += 1
                            #print("i+start_index = ",i+start_index)
                            #print("m6A_dict[",i+start_index,"]",m6A_dict[i+start_index])
                            if m6A_dict.get(i+start_index,0) > threshold:  # If the base is m6A, increment the count for that position in the dictionary
                                seq_row_m6A[i+data_header_len] += 1

                    # Append new rows for each read. 1 for m6a and one for A.
                    A_row = pd.Series(seq_row_A,index=methylation_status.columns)
                    m6A_row = pd.Series(seq_row_m6A,index=methylation_status.columns)
                    methylation_status = pd.concat([methylation_status,A_row.to_frame().T,m6A_row.to_frame().T], ignore_index=True)
                    #print(methylation_status)
                    #print("Adding A row:",seq_row_A)
                    #print("Adding m6A row:",seq_row_m6A)
                    #sequence=[]
                    #seq_row_A=[]
                    #seq_row_m6A=[]
                    m6A_dict.clear()
    return methylation_status

def process_bam_file(bam_file, condition, bam_frac, selection, m6A_thresh, output_stem):
    global lock1
    for each_type in selection:
        bam = bam_file
        # Uncomment if you only want to process subset of bam file
        #bam = output_stem + "mod_mappings_" + condition + "_" + str(bam_frac)+"_"+each_type+".sorted.bam"
        with lock1:
            temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
            print("starting on: ",bam, " with :",temp_bedfile)
        output_df = extract_m6A_per_base(bam,temp_bedfile,m6A_thresh,condition)
        output_df.to_csv(
            output_stem + file_prefix + "m6A_frac_line_" + condition + "_" + str(m6A_thresh)+"_"+each_type+"_w"+str(bed_window)+".csv",
            index=False, mode='w')
        print(output_df)

# Parallelize for each bam file:
args_list = [(bam_file, condition, bam_frac, selection, m6A_thresh, output_stem) for bam_file, condition, bam_frac in zip(bam_files,conditions,bam_fracs)]
# Run with one process:
#process_bam_file(*args_list[0])

# Run with multiple processes
if __name__ == "__main__":
    with Pool() as pool: #processes=1
        result = pool.starmap(process_bam_file, args_list)
    print("Program finished!")

'''for each_type in selection:
    bam = output_stem + "mod_mappings_" + str(bam_frac)+"_"+each_type+".sorted.bam"
    temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
    output_df = extract_m6A_per_base(bam,temp_bedfile,m6A_thresh)
    #output_df.to_csv(output_stem + "m6A_frac" + str(bam_frac)+"_"+each_type+".csv", index=False, mode='w')
    print(output_df)'''

starting on:  /Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/mod_mappings.sorted.bam  with : /Data1/reference/temp_do_not_use_TSS_q1.bed.gz
starting on:  /Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam  with : /Data1/reference/temp_do_not_use_TSS_q1.bed.gz
starting on:  /Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam  with : /Data1/reference/temp_do_not_use_TSS_q1.bed.gz
starting on:  /Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/mod_mappings.sorted.m6Aonly.bam  with : /Data1/reference/temp_do_not_use_TSS_q1.bed.gz
               chr     start       end strand region_type  chr_type  \
0     CHROMOSOME_I    100213    102213      -      TSS_q1  Autosome   
1     CHROMOSOME_I    100213    102213      -      TSS_q1  Autosome   
2     CHROMOSOME_I    100213    102213      -      TSS_q1  Autosome   
3     CHROMOSOME_I    100213    102213      -      TSS_q1 

In [None]:
### Build dataframe for plotting
#filenames = ["/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/m6A_frac1_200kb_region.csv",
#             "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/m6A_frac1_200kb_region.csv",
#             "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/m6A_frac1_200kb_region.csv",
#            "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/m6A_frac1_200kb_region.csv"]

# If combined regions file already exists, read dataframe from csv
if os.path.exists(output_stem + file_prefix + "pm_combined_regions_"  + str(m6A_thresh) +".csv"):
    combined_regions = pd.read_csv(output_stem + file_prefix + "combined_regions_"  + str(m6A_thresh) +".csv")
    print("File: ",
          output_stem + file_prefix + "combined_regions_"  + str(m6A_thresh) +".csv",
          "already exists! Imported directly:")
    print(combined_regions)

else:
    print("Building combined regions file...")
    # Build list of filenames
    filenames = []
    for each_type in selection:
        for each_cond, each_frac in zip(conditions,bam_fracs):
            filenames.append(output_stem + file_prefix + "m6A_frac_line_" + each_cond + "_" + str(m6A_thresh)+"_"+each_type+"_pmw"+str(bed_window)+".csv")

    df_list = []
    combined_regions = []

    # Loop through the list of file names
    for filename in filenames:
        # Read each file into a dataframe
        df = pd.read_csv(filename)
        # Add the dataframe to the list of dataframes
        df_list.append(df)

    # Concatenate the list of dataframes into a single dataframe
    combined_regions = pd.concat(df_list)
    print(combined_regions)
    combined_regions.to_csv(output_stem + file_prefix + "combined_regions_"  + str(m6A_thresh) +".csv", index=False, mode='w')

In [8]:
summary_table = combined_regions.groupby(["region_type", "chr_type", "condition", "base_type"])[list(map(str, list(range(bed_window*2))))].agg(sum).reset_index()
#print("summary_table",summary_table)
# keep only rows where region_type == TSS_q1 or TSS_q2
summary_table = summary_table[summary_table['region_type'].isin(['TSS_q4'])].reset_index(drop=True)
#summary_table = summary_table[summary_table['region_type'] == 'TSS_q1' or summary_table['region_type'] == 'TSS_q2'].reset_index(drop=True)

# Reset index
#summary_table = summary_table.reset_index(drop=True)

# Convert all value values after column 4 to numeric
summary_table.iloc[:,4:] = summary_table.iloc[:,4:].apply(pd.to_numeric, errors='coerce')
summary_table = summary_table.fillna(0)
# Sort summary table by region_type, chr_type, condition and base_type
summary_table = summary_table.sort_values(by=['region_type', 'chr_type', 'condition', 'base_type']).reset_index(drop=True)
#print("summary_table",summary_table)

summary_table_A = summary_table[summary_table['base_type']=='A'].reset_index(drop=True)
summary_table_m6A = summary_table[summary_table['base_type']=='m6A'].reset_index(drop=True)
#print("summary_table_A",summary_table_A)
#print("summary_table_m6A",summary_table_m6A)

# select only the real number columns
summary_table_A_num = summary_table_A.iloc[:,1:].select_dtypes(include='number').reset_index(drop=True)
summary_table_m6A_num = summary_table_m6A.iloc[:,1:].select_dtypes(include='number').reset_index(drop=True)
#print("summary_table_A_num",summary_table_A_num)
#print("summary_table_m6A_num",summary_table_m6A_num)

# divide df1_num by df2_num
summary_table_div = summary_table_m6A_num.div(summary_table_A_num).reset_index(drop=True)
#print("summary_table_div",summary_table_div)

# merge the result
summary_table_div = pd.concat([summary_table_A[['region_type','chr_type','condition']].reset_index(drop=True), summary_table_div], axis=1)
# Calculate the rolling average across all columns for each group
summary_table_div = summary_table_div.T
# Rolling average skipping first 4 rows
summary_table_div.iloc[3:] = summary_table_div.iloc[3:].rolling(window=100, min_periods=50,center=True).mean()
summary_table_div = summary_table_div.T
summary_table_div = summary_table_div.groupby(['chr_type', 'condition']).agg('mean').reset_index()

### Create dataframe to calculat average autosome methylation
summary_table_div_A = summary_table_div[summary_table_div['chr_type']=='Autosome'].reset_index(drop=True)
summary_table_div_A_mean = summary_table_div_A
#print("summary_table_div_A_mean",summary_table_div_A_mean)
# Add a column for the mean of all columns after column 2 in summary_table_div_A_mean
summary_table_div_A_mean['mean'] = summary_table_div_A_mean.iloc[:,2:].mean(axis=1)

#summary_table_div_A_mean.iloc[:,2:] = summary_table_div_A.iloc[:,2:].mean(axis=1)
#summary_table_div_A_mean[2:] = summary_table_div_A[2:].mean(axis=1)
#print("summary_table_div_A_mean",summary_table_div_A_mean)

# Create summary_table_div_norm dataframe by dividing all summary_table_div column by the corresponding mean column in summary_table_div_A_mean using condition as the key
summary_table_div_norm = pd.merge(summary_table_div, summary_table_div_A_mean[['condition','mean']], on='condition')
summary_table_div_norm.iloc[:,2:-1] = summary_table_div_norm.iloc[:,2:-1].div(summary_table_div_norm['mean'], axis=0)
summary_table_div_norm = summary_table_div_norm.drop(columns=['mean'])
#print("summary_table_div_norm",summary_table_div_norm)

### Uncomment to normalize to average Autosome methylation
#summary_table_div = summary_table_div_norm
#print(summary_table_div)

# Plot the boxplot
marker_colors =["#c45746","#16415e"]

subtitle_list = [None]*(len(conditions))
print(subtitle_list)
print(subtitle_list[::2],conditions[:5])
subtitle_list[::2] = conditions[:5]

subtitle_list[1::2] = conditions[5:]
print(subtitle_list[1::2],conditions[:5])

fig = make_subplots(rows=len(conditions)-5, cols=2,
                y_title = "m6A/A",
                shared_yaxes=True,
                subplot_titles=subtitle_list)

for i in range(0,len(conditions)):
    # set tube_df to groupby of summary_table_div by region_type using mean
    tube_df = summary_table_div
    #print("tube_df:" , tube_df)
    tube_df = tube_df.loc[tube_df['condition']==conditions[i]]
    #print("tube_df:" , tube_df)
    # melt the DataFrame so that "condition" and "region_type" are variables
    df_plot = tube_df.loc[tube_df['chr_type']=="X"]
    df_plot = df_plot.melt(id_vars=['chr_type', 'condition'], var_name='position', value_name='value')
    #print("df_plot: " , df_plot)
    #df_plot=tube_df.sample(frac=17/100,replace=False,random_state=1)
    trace0 = go.Scatter(x=df_plot['position'],
                        y=df_plot['value'],
                        name='X',
                        line=dict(shape='spline', smoothing=1.3,color=marker_colors[0]))

    tube_df = summary_table_div
    tube_df = tube_df.loc[tube_df['condition']==conditions[i]]
    df_plot = tube_df.loc[tube_df['chr_type']=="Autosome"]
    df_plot = df_plot.melt(id_vars=['chr_type', 'condition'], var_name='position', value_name='value')
    #print("df_plot: " , df_plot)
    #df_plot=tube_df.sample(frac=17/100,replace=False,random_state=1)
    trace1 = go.Scatter(x=df_plot['position'],
                        y=df_plot['value'],
                        name='Autosome',
                        line=dict(shape='spline', smoothing=1.3,color=marker_colors[1]))

    if i < 5:
        fig.append_trace(trace0, row = i+1, col = 1)
        fig.append_trace(trace1, row = i+1, col = 1)
        # Add a vertical dashed line at 0 to row i+1 and col 1
        fig.add_vline(x=1000, line_width=1, line_dash="dash", row=i+1, col=1)
    else:
        fig.append_trace(trace0, row = i+1-5, col = 2)
        fig.append_trace(trace1, row = i+1-5, col = 2)
        fig.add_vline(x=1000, line_width=1, line_dash="dash", row=i+1-5, col=2)

fig['layout'].update(height = 2400)
fig['layout'].update(width = 800)
fig.update_layout(template="plotly_white")
# fig.update_yaxes(range=[0, 0.4])
# Display x-axis labels
#fig.update_xaxes(showticklabels=True)
# Remap x axis labels only to between -1000 and 1000

# Show xaxis tick labels in increments of 100 between min and max value
fig.update_xaxes(tick0=0, dtick=100)
# Remove grid lines
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.update_yaxes(tickformat="0.1%")



#fig.update_layout(boxmode='group', xaxis_tickangle=0)
fig.show()

#Export plotly figure to .svg
fig.write_image(output_stem + file_prefix + "combined_regions_"  + str(m6A_thresh) +".svg")
#Export plotly figure to .svg
fig.write_image(output_stem + file_prefix + "combined_regions_"  + str(m6A_thresh) +".png")


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



[None, None, None, None, None, None, None, None, None, None]
[None, None, None, None, None] ['N2-No-Met', 'N2-3-min', 'N2-10-min', 'N2-30-min', 'N2-120-min']
['#021+Aux-No-Met', '#021+Aux-3-min', '#021+Aux-10-min', '#021+Aux-30-min', '#021+Aux-120-min'] ['N2-No-Met', 'N2-3-min', 'N2-10-min', 'N2-30-min', 'N2-120-min']


In [None]:
summary_table_div_X = summary_table_div[summary_table_div['chr_type']=='X'].reset_index(drop=True)
summary_table_div_A = summary_table_div[summary_table_div['chr_type']=='Autosome'].reset_index(drop=True)
print("summary_table_div_X:",summary_table_div_X)

summary_table_div_X_num = summary_table_div_X.iloc[:,1:].select_dtypes(include='number').reset_index(drop=True)
summary_table_div_A_num = summary_table_div_A.iloc[:,1:].select_dtypes(include='number').reset_index(drop=True)
print("summary_table_div_X_num:",summary_table_div_X_num)

summary_table_sub = summary_table_div_X_num.sub(summary_table_div_A_num).reset_index(drop=True) #.div(summary_table_div_A_num).reset_index(drop=True)
print("summary_table_div_X_sub_num:",summary_table_sub)

summary_table_sub = pd.concat([summary_table_div_X[['condition']].reset_index(drop=True), summary_table_sub], axis=1)
print("summary_table_sub:",summary_table_sub)

# Plot the boxplot
marker_colors =["#c45746","#16415e"]

#my_own_order = [conditions[0],conditions[1],conditions[2],conditions[3],conditions[4],
#                conditions[5],conditions[6],conditions[7],conditions[8],conditions[9]]
my_own_order = conditions

fig = go.Figure()
df_plot = summary_table_sub.melt(id_vars=['condition'], var_name='position', value_name='value')
df_plot['condition_c'] = pd.Categorical(df_plot['condition'], my_own_order)
df_plot.sort_values("condition_c", inplace=True)

# sort df_plot in specific named condition order
print("df_plot:",df_plot)
# Set tube_df to df_plot where 'condition' is equal to one of the first 5 conditions
tube_df = df_plot[df_plot['condition'].isin(conditions[0:5])]

trace0 = go.Box(x=tube_df['condition']+" ", y=tube_df['value'],marker_color='slategrey')

tube_df = df_plot[df_plot['condition'].isin(conditions[5:10])]
trace1 = go.Box(x=tube_df['condition']+" ", y=tube_df['value'],marker_color='darkgrey')

fig.add_trace(trace0)
fig.add_trace(trace1)
fig.update_layout(template="plotly_white")
# Label y axis as "% chrX methylation relative to Autosome methylation"
fig.update_yaxes(title_text="% chrX methylation relative to Autosome methylation")
# Convert y axis to %
fig.update_yaxes(tickformat="1%")
#Set fig width to 800
#fig.update_layout(width=800)
fig.update_layout(width=1200)
# Show the figure
fig.show()

# Initate plotly figure
fig = go.Figure()
for i in [0,5]:
    # set tube_df to groupby of summary_table_div by region_type using mean
    tube_df = summary_table_sub
    #print("tube_df:" , tube_df)
    tube_df = tube_df.loc[tube_df['condition']==conditions[i]]

    trace0 = go.Scatter(x=tube_df.columns[1:],
                    y=tube_df.iloc[0,1:],
                    name=conditions[i],
                    line=dict(shape='spline', smoothing=1.3))

    # Add trace0 to fig
    fig.add_trace(trace0)
    fig.update_layout(template="plotly_white")

# Show the figure
fig.show()

fig0 = go.Figure()
for i in [1,6]:
    # set tube_df to groupby of summary_table_div by region_type using mean
    tube_df = summary_table_sub
    #print("tube_df:" , tube_df)
    tube_df = tube_df.loc[tube_df['condition']==conditions[i]]

    trace0 = go.Scatter(x=tube_df.columns[1:],
                    y=tube_df.iloc[0,1:],
                    name=conditions[i],
                    line=dict(shape='spline', smoothing=1.3))

# Add trace0 to fig
    fig0.add_trace(trace0)
    fig0.update_layout(template="plotly_white")

# Show the figure
fig0.show()

fig1 = go.Figure()
for i in [2,7]:
    # set tube_df to groupby of summary_table_div by region_type using mean
    tube_df = summary_table_sub
    #print("tube_df:" , tube_df)
    tube_df = tube_df.loc[tube_df['condition']==conditions[i]]

    trace1 = go.Scatter(x=tube_df.columns[1:],
                    y=tube_df.iloc[0,1:],
                    name=conditions[i],
                    line=dict(shape='spline', smoothing=1.3))

# Add trace0 to fig
    fig1.add_trace(trace1)
    fig1.update_layout(template="plotly_white")

# Show the figure
fig1.show()

fig2 = go.Figure()
for i in [3,8]:
    # set tube_df to groupby of summary_table_div by region_type using mean
    tube_df = summary_table_sub
    #print("tube_df:" , tube_df)
    tube_df = tube_df.loc[tube_df['condition']==conditions[i]]

    trace2 = go.Scatter(x=tube_df.columns[1:],
                    y=tube_df.iloc[0,1:],
                    name=conditions[i],
                    line=dict(shape='spline', smoothing=1.3))

# Add trace0 to fig
    fig2.add_trace(trace2)
    fig2.update_layout(template="plotly_white")

# Show the figure
fig2.show()

fig3 = go.Figure()
for i in [4,9]:
    # set tube_df to groupby of summary_table_div by region_type using mean
    tube_df = summary_table_sub
    #print("tube_df:" , tube_df)
    tube_df = tube_df.loc[tube_df['condition']==conditions[i]]

    trace3 = go.Scatter(x=tube_df.columns[1:],
                    y=tube_df.iloc[0,1:],
                    name=conditions[i],
                    line=dict(shape='spline', smoothing=1.3))

# Add trace0 to fig
    fig3.add_trace(trace3)
    fig3.update_layout(template="plotly_white")

# Show the figure
fig3.show()



'''fig1 = make_subplots(rows=2, cols=1,
                y_title = "m6A/A",
                shared_yaxes=True)

for i in range(0,4):
    # set tube_df to groupby of summary_table_div by region_type using mean
    tube_df = summary_table_sub[summary_table_sub['condition']==conditions[i]]
    #print("tube_df:" , tube_df)
    #print("tube_df:" , tube_df)
    # melt the DataFrame so that "condition" and "region_type" are variables
    df_plot = tube_df.melt(id_vars=['condition'], var_name='position', value_name='value')
    print("df_plot: " , df_plot)
    #df_plot=tube_df.sample(frac=17/100,replace=False,random_state=1)
    trace0 = go.Scatter(x=df_plot['position'],
                        y=df_plot['value'],
                        name=conditions[i],
                        line=dict(shape='spline', smoothing=1.3))
    fig1.append_trace(trace0, row = 1, col = 1)

for i in range(5,9):
    # set tube_df to groupby of summary_table_div by region_type using mean
    tube_df = summary_table_sub[summary_table_sub['condition']==conditions[i]]
    #print("tube_df:" , tube_df)
    #print("tube_df:" , tube_df)
    # melt the DataFrame so that "condition" and "region_type" are variables
    df_plot = tube_df.melt(id_vars=['condition'], var_name='position', value_name='value')
    print("df_plot: " , df_plot)
    #df_plot=tube_df.sample(frac=17/100,replace=False,random_state=1)
    trace1 = go.Scatter(x=df_plot['position'],
                        y=df_plot['value'],
                        name=conditions[i],
                        line=dict(shape='spline', smoothing=1.3))
    fig1.append_trace(trace1, row = 2, col = 1)

fig1['layout'].update(height = 1200)
fig1['layout'].update(width = 800)
fig1.update_layout(template="plotly_white")
# fig.update_yaxes(range=[0, 0.4])
# Display x-axis labels
fig1.update_xaxes(showticklabels=True)'''

#fig.update_layout(boxmode='group', xaxis_tickangle=0)
#fig1.show()