In [1]:
# Purpose of this script is to take in a number of .bam files and a .bed file and return %m6A across all regions in the .bam files file
# in bin increments.
__author__ = "Yuri Malina"
__contact__ = "ymalina@berkeley.edu"
__copyright__ = "The Meyer Lab, UC Berkeley"
__credits__ = [""]
__date__ = "3/27/2023"
__deprecated__ = False
__status__ = "In development"
__version__ = "0.0.1"

In [105]:
import pandas as pd
import numpy as np
import random

import plotly.express as px # Used for plotting
import plotly.graph_objects as go # Used for plotting
from plotly.subplots import make_subplots # Used for plotting

import pysam
from scipy import stats

import multiprocessing # used for parallel processing
from multiprocessing import Pool # used for parallel processing

# install tabix with:
# apt-get install tabix 

In [115]:
### Configurations
m6A_thresh = 129 #default is 129 = 50%; 181=70%; 194=75%; 207 = 80%; 232 = 90%
mC_thresh = 129 #default is 129
coreNum = 96 # cores to use

## Bed file configurations:
sample_source = "chr_type" # "chr_type" or "type" or "chromosome"
sampleName = ["CHROMOSOME_I", "CHROMOSOME_II", "CHROMOSOME_III", "CHROMOSOME_IV", "CHROMOSOME_V","CHROMOSOME_X"] # "TES_q1" "strong_rex" "weak_rex" "type", "X", "Autosome"; Must be same number of unique values in selected bed rows.
chr_type_selected = ["X","Autosome"] # 'X' or "Autosome"
type_selected = ["200kb_region"] #TES_q1-4 | #TSS_q1-4 | strong/weak rex | whole_chr | 200kb_region | 50kb_region
max_regions = 0 # max regions to consider; 0 = full set;
chromosome_selected = ["CHROMOSOME_I", "CHROMOSOME_II", "CHROMOSOME_III", "CHROMOSOME_IV", "CHROMOSOME_V","CHROMOSOME_X"]
strand_selected = ["+","-"] #+ and/or -
bed_file = "/Data1/reference/tss_tes_rex_combined.bed"
mods = "A" # {A,CG,A+CG}
if sample_source == "chr_type":
    selection = chr_type_selected
if sample_source == "type":
    selection = type_selected
if sample_source == "chromosome":
    selection = chromosome_selected

## Bam file configurations
random.seed(10)

### Tube D
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/mod_mappings.sorted.bam"
#output_stem = "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/"
#condition = "N2; 2uM Hia5 30min"

### Tube 4
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam"
#output_stem = "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/"
#condition = "N2; 2uM Hia5 120min"

### Tube H
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam"
#output_stem = "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/"
#condition = "AID::SDC-2 + Auxin; 2uM Hia5 30min"

### Tube T
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/mod_mappings.sorted.m6Aonly.bam"
#output_stem = "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/"
#condition = "N2 Young; 2uM Hia5 30min"

#"/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/analysis/mod_mappings_No-Met-RG_0.1_200kb_region.sorted.bam"
#"No-Met-RG",

### Tube Y9
bam_fracs = [1,1,1,1,1,1,1,1] # For full .bam set to = 1
bam_files = ["/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_01.bam",
            "/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_02.bam",
            "/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_03.bam",
            "/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_04.bam",
            "/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_05.bam",
            "/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_06.bam",
            "/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_07.bam",
            "/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_08.bam"]
output_stem = "/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/analysis/"
conditions = ["No-Met","1-min","3-min","5-min","10-min","15-min", "30-min", "120-min"]

In [116]:
### Select bed file
full_bed = pd.read_csv(bed_file,sep='\t')
bed=[]

for each_type in selection:
# REGION CONFIGURATION
    if sample_source == "type":
        temp_bed = full_bed[full_bed["chromosome"].isin(chromosome_selected) &
                            full_bed["chr-type"].isin(chr_type_selected) &
                            full_bed["type"].str.contains(each_type) &
                            full_bed["strand"].isin(strand_selected)]
    if sample_source == "chr_type":
        temp_bed = full_bed[full_bed["chromosome"].isin(chromosome_selected) &
                            full_bed["chr-type"].str.contains(each_type) &
                            full_bed["type"].isin(type_selected) &
                            full_bed["strand"].isin(strand_selected)]
    if sample_source == "chromosome":
        temp_bed = full_bed[full_bed["chromosome"].str.contains(each_type) &
                            full_bed["chr-type"].isin(chr_type_selected) &
                            full_bed["type"].isin(type_selected) &
                            full_bed["strand"].isin(strand_selected)]

    # Drop random regions to match max_regions
    drop_count = len(temp_bed)-max_regions
    # If max regions > selected regions, do not drop any.
    if(drop_count<0):
        drop_count=0
    # If max_regions = 0, do not drop any.
    if (max_regions == 0):
        drop_count = 0

    drop_indices = np.random.choice(temp_bed.index, drop_count, replace=False)
    temp_bed.drop(drop_indices,inplace=True)
    temp_bed.sort_values(by=["chromosome","start"],ascending=True,inplace=True)
    temp_bed.reset_index(drop=True, inplace=True)
    temp_bed["start"]=temp_bed["start"]
    temp_bed["end"]=temp_bed["end"]
    temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed"
    temp_bedfile_gz = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
    temp_bed.to_csv(temp_bedfile, sep="\t",header=False,index=False)
    
    # Create indexed tabix files
    ! bgzip -c {temp_bedfile} > {temp_bedfile_gz}
    ! tabix -f -p bed {temp_bedfile_gz}

    # For first iteration
    if bed == []:
        bed = [temp_bedfile]

    # Otherwise append region to temporary bed file.
    else:
        bed.append(temp_bedfile)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice

In [118]:
### Extract only reads in bam file that overlap selected regions, and subselect down using fraction
### NOT NECESSARY IF RUNNING WHOLE CHROMOSOMES
# Define function to generate redux .bam file
lock1 = multiprocessing.Lock()

def extract_overlapping_reads(bed_file, bam_file, output_file):
    # Load the BAM file
    bam_ext = pysam.AlignmentFile(bam_file, "rb")

    # Load the BED file
    regions = pysam.TabixFile(bed_file)

    '''# Create the output BAM file
    out_bam = pysam.AlignmentFile(output_file, "wb", template=bam)

    # Iterate over the reads in the input BAM file
    for read in bam:
        # Check if the read overlaps a region in the BED file
        overlaps = [r for r in regions.fetch(read.reference_name, read.reference_start, read.reference_end)]
        if len(overlaps) > 0:
            # Write the read to the output BAM file if it overlaps a region in the BED file
            out_bam.write(read)

    # Close the input BAM file and the output BAM file
    bam.close()
    out_bam.close()'''

    # Create a set to store the reads
    seen = set()

    # Create the output .bam file
    with pysam.AlignmentFile(output_file, "wb", header=bam_ext.header) as out:
        # Iterate over the regions in the .bed file
        for region in regions.fetch():
            print(region)
            chrom, start, end, strand, region_type, chr_type = region.split()
            start, end = int(start), int(end)

            # Iterate over the reads in the current region
            for pileupcolumn in bam_ext.pileup(chrom, start, end):
                for pileupread in pileupcolumn.pileups:
                    # Check if the read has not been seen before
                    if pileupread.alignment.query_name not in seen:
                        # Add the read to the set of seen reads
                        seen.add(pileupread.alignment.query_name)
                        # Write the read to the output .bam file
                        out.write(pileupread.alignment)
                        
                        
def process_bam_file(bam_file, condition, bam_frac, selection, m6A_thresh, output_stem):
    with lock1:
        print("starting on: ",bam_file)
    for each_type in selection:
        temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
        output_bamfile = output_stem + "mod_mappings_" + condition + "_" + str(bam_frac)+"_"+each_type+".sorted.bam"
        extract_overlapping_reads(temp_bedfile, bam_file, output_bamfile)
        ### Subselect bam file using fraction
        ! samtools view -h -s {bam_frac} -L {temp_bedfile} {bam_file} | samtools view -h -b - > {output_bamfile}
        ! samtools index {output_bamfile}

# Parallelize for each bam file:
args_list = [(bam_file, condition, bam_frac, selection, m6A_thresh, output_stem) for bam_file, condition, bam_frac in zip(bam_files,conditions,bam_fracs)]
if __name__ == "__main__":
    with Pool() as pool: #processes=1
        result = pool.starmap(process_bam_file, args_list)
    print("Program finished!")
    
'''for bam_file, condition, bam_frac in zip(bam_files,conditions, bam_fracs):
    print("Starting on: ",bam_file," | chr type: ", condition," | with bam fraction: ",bam_frac)
    for each_type in selection:
        temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
        output_bamfile = output_stem + "mod_mappings_" + condition + "_" + str(bam_frac)+"_"+each_type+".sorted.bam"
        extract_overlapping_reads(temp_bedfile, bam_file, output_bamfile)
        ### Subselect bam file using fraction
        ! samtools view -h -s {bam_frac} -L {temp_bedfile} {bam_file} | samtools view -h -b - > {output_bamfile}
        ! samtools index {output_bamfile}'''

starting on:  /Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_01.bam
starting on:  /Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_02.bamCHROMOSOME_X	0	199999	+	200kb_region	X

starting on:  /Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_03.bam
starting on:  /Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/basecalls/m6A/demux/mod_mappings_barcode_04.bam
CHROMOSOME_X	0	199999	+	200kb_region	XCHROMOSOME_X	0	199999	+	200kb_region	XCHROMOSOME_X	0	199999	+	200kb_region	X


CHROMOSOME_X	200000	399999	+	200kb_region	X
CHROMOSOME_X	200000	399999	+	200kb_region	X
CHROMOSOME_X	200000	399999	+	200kb_region	X
CHROMOSOME_X	400000	599999	+	200kb_region	X
CHROMOSOME_X	200000	399999	+	200kb_region	X
CHROMOSOME_X	400000	599999	+	200kb_region	X
CHROMOSOME_X	600000	799999	+	200kb_region	X
CHROMOSOME_X	400000	599999	+	200kb_region	X
CHROMOSOME_X	600000	

CHROMOSOME_I	5200000	5399999	+	200kb_region	Autosome
CHROMOSOME_X	14200000	14399999	+	200kb_region	X
CHROMOSOME_I	600000	799999	+	200kb_region	Autosome
CHROMOSOME_I	5400000	5599999	+	200kb_region	Autosome
CHROMOSOME_I	5600000	5799999	+	200kb_region	Autosome
CHROMOSOME_X	11800000	11999999	+	200kb_region	X
CHROMOSOME_I	800000	999999	+	200kb_region	Autosome
CHROMOSOME_X	14400000	14599999	+	200kb_region	X
CHROMOSOME_I	5800000	5999999	+	200kb_region	Autosome
CHROMOSOME_I	1000000	1199999	+	200kb_region	Autosome
CHROMOSOME_I	6000000	6199999	+	200kb_region	Autosome
CHROMOSOME_X	14600000	14799999	+	200kb_region	X
CHROMOSOME_X	12000000	12199999	+	200kb_region	X
CHROMOSOME_I	1200000	1399999	+	200kb_region	Autosome
CHROMOSOME_I	6200000	6399999	+	200kb_region	Autosome
CHROMOSOME_I	1400000	1599999	+	200kb_region	Autosome
CHROMOSOME_I	6400000	6599999	+	200kb_region	Autosome
CHROMOSOME_X	14800000	14999999	+	200kb_region	X
CHROMOSOME_I	6600000	6799999	+	200kb_region	Autosome
CHROMOSOME_I	1600000	179999

CHROMOSOME_I	3400000	3599999	+	200kb_region	Autosome
CHROMOSOME_II	4000000	4199999	+	200kb_region	Autosome
CHROMOSOME_I	8200000	8399999	+	200kb_region	Autosome
CHROMOSOME_II	12200000	12399999	+	200kb_region	Autosome
CHROMOSOME_II	4200000	4399999	+	200kb_region	Autosome
CHROMOSOME_II	12400000	12599999	+	200kb_region	Autosome
CHROMOSOME_I	8400000	8599999	+	200kb_region	Autosome
CHROMOSOME_II	4400000	4599999	+	200kb_region	Autosome
CHROMOSOME_I	3600000	3799999	+	200kb_region	Autosome
CHROMOSOME_II	12600000	12799999	+	200kb_region	Autosome
CHROMOSOME_I	8600000	8799999	+	200kb_region	Autosome
CHROMOSOME_II	4600000	4799999	+	200kb_region	Autosome
CHROMOSOME_II	12800000	12999999	+	200kb_region	Autosome
CHROMOSOME_II	13000000	13199999	+	200kb_region	Autosome
CHROMOSOME_II	4800000	4999999	+	200kb_region	Autosome
CHROMOSOME_I	3800000	3999999	+	200kb_region	Autosome
CHROMOSOME_I	8800000	8999999	+	200kb_region	Autosome
CHROMOSOME_II	13200000	13399999	+	200kb_region	Autosome
CHROMOSOME_II	5000000	5

CHROMOSOME_II	4200000	4399999	+	200kb_region	Autosome
CHROMOSOME_IV	4800000	4999999	+	200kb_region	Autosome
CHROMOSOME_I	12400000	12599999	+	200kb_region	Autosome
CHROMOSOME_III	6600000	6799999	+	200kb_region	Autosome
CHROMOSOME_IV	5000000	5199999	+	200kb_region	Autosome
CHROMOSOME_IV	5200000	5399999	+	200kb_region	Autosome
CHROMOSOME_III	6800000	6999999	+	200kb_region	Autosome
CHROMOSOME_II	4400000	4599999	+	200kb_region	Autosome
CHROMOSOME_I	12600000	12799999	+	200kb_region	Autosome
CHROMOSOME_IV	5400000	5599999	+	200kb_region	Autosome
CHROMOSOME_III	7000000	7199999	+	200kb_region	Autosome
CHROMOSOME_II	4600000	4799999	+	200kb_region	Autosome
CHROMOSOME_IV	5600000	5799999	+	200kb_region	Autosome
CHROMOSOME_I	12800000	12999999	+	200kb_region	Autosome
CHROMOSOME_III	7200000	7399999	+	200kb_region	Autosome
CHROMOSOME_IV	5800000	5999999	+	200kb_region	Autosome
CHROMOSOME_II	4800000	4999999	+	200kb_region	Autosome
CHROMOSOME_III	7400000	7599999	+	200kb_region	Autosome
CHROMOSOME_I	1300000

CHROMOSOME_V	7600000	7799999	+	200kb_region	Autosome
CHROMOSOME_II	6600000	6799999	+	200kb_region	Autosome
CHROMOSOME_III	1400000	1599999	+	200kb_region	Autosome
CHROMOSOME_IV	10400000	10599999	+	200kb_region	Autosome
CHROMOSOME_V	7800000	7999999	+	200kb_region	Autosome
CHROMOSOME_II	6800000	6999999	+	200kb_region	Autosome
CHROMOSOME_III	1600000	1799999	+	200kb_region	Autosome
CHROMOSOME_IV	10600000	10799999	+	200kb_region	Autosome
CHROMOSOME_V	8000000	8199999	+	200kb_region	Autosome
CHROMOSOME_IV	10800000	10999999	+	200kb_region	Autosome
CHROMOSOME_III	1800000	1999999	+	200kb_region	Autosome
CHROMOSOME_V	8200000	8399999	+	200kb_region	Autosome
CHROMOSOME_II	7000000	7199999	+	200kb_region	Autosome
CHROMOSOME_IV	11000000	11199999	+	200kb_region	Autosome
CHROMOSOME_III	2000000	2199999	+	200kb_region	Autosome
CHROMOSOME_V	8400000	8599999	+	200kb_region	Autosome
CHROMOSOME_IV	11200000	11399999	+	200kb_region	Autosome
CHROMOSOME_II	7200000	7399999	+	200kb_region	Autosome
CHROMOSOME_III	2200

CHROMOSOME_III	1200000	1399999	+	200kb_region	Autosome
CHROMOSOME_V	12800000	12999999	+	200kb_region	Autosome
CHROMOSOME_V	13000000	13199999	+	200kb_region	Autosome
CHROMOSOME_X	4200000	4399999	+	200kb_region	X
CHROMOSOME_III	1400000	1599999	+	200kb_region	Autosome
CHROMOSOME_IV	0	199999	+	200kb_region	Autosome
CHROMOSOME_V	13200000	13399999	+	200kb_region	Autosome
CHROMOSOME_X	4400000	4599999	+	200kb_region	X
CHROMOSOME_III	1600000	1799999	+	200kb_region	Autosome
CHROMOSOME_IV	200000	399999	+	200kb_region	Autosome
CHROMOSOME_V	13400000	13599999	+	200kb_region	Autosome
CHROMOSOME_III	1800000	1999999	+	200kb_region	Autosome
CHROMOSOME_X	4600000	4799999	+	200kb_region	X
CHROMOSOME_V	13600000	13799999	+	200kb_region	Autosome
CHROMOSOME_IV	400000	599999	+	200kb_region	Autosome
CHROMOSOME_X	4800000	4999999	+	200kb_region	X
CHROMOSOME_III	2000000	2199999	+	200kb_region	Autosome
CHROMOSOME_V	13800000	13999999	+	200kb_region	Autosome
CHROMOSOME_IV	600000	799999	+	200kb_region	Autosome
CHROMOSO

CHROMOSOME_III	13000000	13199999	+	200kb_region	Autosome
CHROMOSOME_I	5400000	5599999	+	200kb_region	Autosome
CHROMOSOME_IV	14000000	14199999	+	200kb_region	Autosome
CHROMOSOME_X	9800000	9999999	+	200kb_region	X
CHROMOSOME_I	5600000	5799999	+	200kb_region	Autosome
CHROMOSOME_III	13200000	13399999	+	200kb_region	Autosome
CHROMOSOME_IV	14200000	14399999	+	200kb_region	Autosome
CHROMOSOME_I	5800000	5999999	+	200kb_region	Autosome
CHROMOSOME_X	10000000	10199999	+	200kb_region	X
CHROMOSOME_IV	14400000	14599999	+	200kb_region	Autosome
CHROMOSOME_III	13400000	13599999	+	200kb_region	Autosome
CHROMOSOME_I	6000000	6199999	+	200kb_region	Autosome
CHROMOSOME_IV	14600000	14799999	+	200kb_region	Autosome
CHROMOSOME_X	10200000	10399999	+	200kb_region	X
CHROMOSOME_I	6200000	6399999	+	200kb_region	Autosome
CHROMOSOME_III	13600000	13783801	+	200kb_region	Autosome
CHROMOSOME_IV	14800000	14999999	+	200kb_region	Autosome
CHROMOSOME_I	6400000	6599999	+	200kb_region	Autosome
CHROMOSOME_X	10400000	10599999	+

CHROMOSOME_I	8000000	8199999	+	200kb_region	Autosome
CHROMOSOME_IV	11000000	11199999	+	200kb_region	Autosome
CHROMOSOME_II	9200000	9399999	+	200kb_region	Autosome
CHROMOSOME_I	8200000	8399999	+	200kb_region	Autosome
CHROMOSOME_IV	11200000	11399999	+	200kb_region	Autosome
CHROMOSOME_V	11400000	11599999	+	200kb_region	Autosome
CHROMOSOME_I	8400000	8599999	+	200kb_region	Autosome
CHROMOSOME_II	9400000	9599999	+	200kb_region	Autosome
CHROMOSOME_V	11600000	11799999	+	200kb_region	Autosome
CHROMOSOME_II	9600000	9799999	+	200kb_region	Autosome
CHROMOSOME_I	8600000	8799999	+	200kb_region	Autosome
CHROMOSOME_IV	11400000	11599999	+	200kb_region	Autosome
CHROMOSOME_V	11800000	11999999	+	200kb_region	Autosome
CHROMOSOME_IV	11600000	11799999	+	200kb_region	Autosome
CHROMOSOME_II	9800000	9999999	+	200kb_region	Autosome
CHROMOSOME_V	12000000	12199999	+	200kb_region	Autosome
CHROMOSOME_I	8800000	8999999	+	200kb_region	Autosome
CHROMOSOME_IV	11800000	11999999	+	200kb_region	Autosome
CHROMOSOME_V	122000

CHROMOSOME_II	10200000	10399999	+	200kb_region	Autosome
CHROMOSOME_X	3000000	3199999	+	200kb_region	X
CHROMOSOME_III	12200000	12399999	+	200kb_region	Autosome
CHROMOSOME_II	10400000	10599999	+	200kb_region	Autosome
CHROMOSOME_X	3200000	3399999	+	200kb_region	X
CHROMOSOME_III	12400000	12599999	+	200kb_region	Autosome
CHROMOSOME_V	4800000	4999999	+	200kb_region	Autosome
CHROMOSOME_II	10600000	10799999	+	200kb_region	Autosome
CHROMOSOME_X	3400000	3599999	+	200kb_region	X
CHROMOSOME_III	12600000	12799999	+	200kb_region	Autosome
CHROMOSOME_X	3600000	3799999	+	200kb_region	X
CHROMOSOME_II	10800000	10999999	+	200kb_region	Autosome
CHROMOSOME_III	12800000	12999999	+	200kb_region	Autosome
CHROMOSOME_V	5000000	5199999	+	200kb_region	Autosome
CHROMOSOME_X	3800000	3999999	+	200kb_region	X
CHROMOSOME_III	13000000	13199999	+	200kb_region	Autosome
CHROMOSOME_II	11000000	11199999	+	200kb_region	Autosome
CHROMOSOME_X	4000000	4199999	+	200kb_region	X
CHROMOSOME_III	13200000	13399999	+	200kb_region	Autos

CHROMOSOME_III	12000000	12199999	+	200kb_region	Autosome
CHROMOSOME_I	2000000	2199999	+	200kb_region	Autosome
CHROMOSOME_V	15000000	15199999	+	200kb_region	Autosome
CHROMOSOME_I	2200000	2399999	+	200kb_region	Autosome
CHROMOSOME_IV	16000000	16199999	+	200kb_region	Autosome
CHROMOSOME_III	12200000	12399999	+	200kb_region	Autosome
CHROMOSOME_I	2400000	2599999	+	200kb_region	Autosome
CHROMOSOME_V	15200000	15399999	+	200kb_region	Autosome
CHROMOSOME_I	2600000	2799999	+	200kb_region	Autosome
CHROMOSOME_IV	16200000	16399999	+	200kb_region	Autosome
CHROMOSOME_I	2800000	2999999	+	200kb_region	Autosome
CHROMOSOME_III	12400000	12599999	+	200kb_region	Autosome
CHROMOSOME_V	15400000	15599999	+	200kb_region	Autosome
CHROMOSOME_IV	16400000	16599999	+	200kb_region	Autosome
CHROMOSOME_I	3000000	3199999	+	200kb_region	Autosome
CHROMOSOME_IV	16600000	16799999	+	200kb_region	Autosome
CHROMOSOME_III	12600000	12799999	+	200kb_region	Autosome
CHROMOSOME_I	3200000	3399999	+	200kb_region	Autosome
CHROMOSOME_I

CHROMOSOME_X	1200000	1399999	+	200kb_region	X
CHROMOSOME_IV	15200000	15399999	+	200kb_region	Autosome
CHROMOSOME_II	5000000	5199999	+	200kb_region	Autosome
CHROMOSOME_V	16200000	16399999	+	200kb_region	Autosome
CHROMOSOME_IV	15400000	15599999	+	200kb_region	Autosome
CHROMOSOME_V	16400000	16599999	+	200kb_region	Autosome
CHROMOSOME_II	5200000	5399999	+	200kb_region	Autosome
CHROMOSOME_V	16600000	16799999	+	200kb_region	Autosome
CHROMOSOME_IV	15600000	15799999	+	200kb_region	Autosome
CHROMOSOME_X	1400000	1599999	+	200kb_region	X
CHROMOSOME_II	5400000	5599999	+	200kb_region	Autosome
CHROMOSOME_V	16800000	16999999	+	200kb_region	Autosome
CHROMOSOME_IV	15800000	15999999	+	200kb_region	Autosome
CHROMOSOME_II	5600000	5799999	+	200kb_region	Autosome
CHROMOSOME_X	1600000	1799999	+	200kb_region	X
CHROMOSOME_V	17000000	17199999	+	200kb_region	Autosome
CHROMOSOME_IV	16000000	16199999	+	200kb_region	Autosome
CHROMOSOME_II	5800000	5999999	+	200kb_region	Autosome
CHROMOSOME_IV	16200000	16399999	+	200

CHROMOSOME_V	20800000	20924180	+	200kb_region	Autosome
CHROMOSOME_III	13200000	13399999	+	200kb_region	Autosome
CHROMOSOME_III	13400000	13599999	+	200kb_region	Autosome
CHROMOSOME_III	13600000	13783801	+	200kb_region	Autosome
CHROMOSOME_IV	0	199999	+	200kb_region	Autosome
CHROMOSOME_IV	200000	399999	+	200kb_region	Autosome
CHROMOSOME_X	10800000	10999999	+	200kb_region	X
CHROMOSOME_IV	400000	599999	+	200kb_region	Autosome
CHROMOSOME_IV	600000	799999	+	200kb_region	Autosome
CHROMOSOME_IV	800000	999999	+	200kb_region	Autosome
CHROMOSOME_X	11000000	11199999	+	200kb_region	X
CHROMOSOME_IV	1000000	1199999	+	200kb_region	Autosome
CHROMOSOME_IV	1200000	1399999	+	200kb_region	Autosome
CHROMOSOME_IV	1400000	1599999	+	200kb_region	Autosome
CHROMOSOME_IV	1600000	1799999	+	200kb_region	Autosome
CHROMOSOME_IV	1800000	1999999	+	200kb_region	Autosome
CHROMOSOME_IV	2000000	2199999	+	200kb_region	Autosome
CHROMOSOME_X	11200000	11399999	+	200kb_region	X
CHROMOSOME_IV	2200000	2399999	+	200kb_region	Autoso

CHROMOSOME_I	14800000	14999999	+	200kb_region	Autosome
CHROMOSOME_I	15000000	15072434	+	200kb_region	Autosome
CHROMOSOME_II	0	199999	+	200kb_region	Autosome
CHROMOSOME_II	200000	399999	+	200kb_region	Autosome
CHROMOSOME_II	400000	599999	+	200kb_region	Autosome
CHROMOSOME_II	600000	799999	+	200kb_region	Autosome
CHROMOSOME_II	800000	999999	+	200kb_region	Autosome
CHROMOSOME_II	1000000	1199999	+	200kb_region	Autosome
CHROMOSOME_II	1200000	1399999	+	200kb_region	Autosome
CHROMOSOME_II	1400000	1599999	+	200kb_region	Autosome
CHROMOSOME_II	1600000	1799999	+	200kb_region	Autosome
CHROMOSOME_II	1800000	1999999	+	200kb_region	Autosome
CHROMOSOME_II	2000000	2199999	+	200kb_region	Autosome
CHROMOSOME_II	2200000	2399999	+	200kb_region	Autosome
CHROMOSOME_II	2400000	2599999	+	200kb_region	Autosome
CHROMOSOME_II	2600000	2799999	+	200kb_region	Autosome
CHROMOSOME_II	2800000	2999999	+	200kb_region	Autosome
CHROMOSOME_II	3000000	3199999	+	200kb_region	Autosome
CHROMOSOME_II	3200000	3399999	+	200kb_reg

CHROMOSOME_V	12800000	12999999	+	200kb_region	Autosome
CHROMOSOME_V	13000000	13199999	+	200kb_region	Autosome
CHROMOSOME_V	13200000	13399999	+	200kb_region	Autosome
CHROMOSOME_V	13400000	13599999	+	200kb_region	Autosome
CHROMOSOME_V	13600000	13799999	+	200kb_region	Autosome
CHROMOSOME_V	13800000	13999999	+	200kb_region	Autosome
CHROMOSOME_V	14000000	14199999	+	200kb_region	Autosome
CHROMOSOME_V	14200000	14399999	+	200kb_region	Autosome
CHROMOSOME_V	14400000	14599999	+	200kb_region	Autosome
CHROMOSOME_V	14600000	14799999	+	200kb_region	Autosome
CHROMOSOME_V	14800000	14999999	+	200kb_region	Autosome
CHROMOSOME_V	15000000	15199999	+	200kb_region	Autosome
CHROMOSOME_V	15200000	15399999	+	200kb_region	Autosome
CHROMOSOME_V	15400000	15599999	+	200kb_region	Autosome
CHROMOSOME_V	15600000	15799999	+	200kb_region	Autosome
CHROMOSOME_V	15800000	15999999	+	200kb_region	Autosome
CHROMOSOME_V	16000000	16199999	+	200kb_region	Autosome
CHROMOSOME_V	16200000	16399999	+	200kb_region	Autosome
CHROMOSOME

'for bam_file, condition, bam_frac in zip(bam_files,conditions, bam_fracs):\n    print("Starting on: ",bam_file," | chr type: ", condition," | with bam fraction: ",bam_frac)\n    for each_type in selection:\n        temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"\n        output_bamfile = output_stem + "mod_mappings_" + condition + "_" + str(bam_frac)+"_"+each_type+".sorted.bam"\n        extract_overlapping_reads(temp_bedfile, bam_file, output_bamfile)\n        ### Subselect bam file using fraction\n        ! samtools view -h -s {bam_frac} -L {temp_bedfile} {bam_file} | samtools view -h -b - > {output_bamfile}\n        ! samtools index {output_bamfile}'

In [126]:
lock1 = multiprocessing.Lock()

def extract_m6A_per_region(bam_file, bed_file, threshold,condition):
    # Load the BAM file
    bam_ext = pysam.AlignmentFile(bam_file, "rb")

    # Load the BED file
    regions = pysam.TabixFile(bed_file)

    # Initialize a list to store the results
    results = []
            
    # Iterate over the regions in the BED file
    for region in regions.fetch():
        # Split the region string into the chromosome, start, and end positions
        #with lock1:
        #print("Starting on region:",region)
        chromosome, start, end, strand, region_type, chr_type = region.split()
        start = int(start)
        end = int(end)

        # Initialize counters for the total number of bases and the total number of m6A
        total_bases = 0
        total_m6A = 0
        read_counter = 0

        # Iterate over the reads that overlap the region
        for read in bam_ext.fetch(chromosome, start, end):
            #print("starting on read:", read)
            # Count the total number of "A" bases in the read that overlap the region
            total_bases = total_bases + read.query_sequence.count("A")

            if read.is_forward == True:
                try: m6A_dict_values = [x[1] for x  in read.modified_bases_forward[('A', 0, 'Y')]]
                except: m6A_dict_values = [x[1] for x  in read.modified_bases_forward[('A', 0, 'a')]]
            else:
                try: m6A_dict_values = [x[1] for x  in read.modified_bases[('A', 0, 'Y')]]
                except:
                    pass
                try: m6A_dict_values = [x[1] for x  in read.modified_bases[('A', 0, 'a')]]
                except:
                    pass
                try: m6A_dict_values = [x[1] for x  in read.modified_bases[('A', 1, 'Y')]]
                except: m6A_dict_values = [x[1] for x  in read.modified_bases[('A', 1, 'a')]]
    
            total_m6A = sum(i > threshold for i in m6A_dict_values) + total_m6A
            read_counter = read_counter + 1

        # Add the region information to the results list
        results.append([chromosome, start, end, region_type, chr_type, total_bases, total_m6A, read_counter])
    # Close the BAM file
    bam_ext.close()
    del bam_ext

    # Convert the results list to a pandas dataframe
    df = pd.DataFrame(results, columns=["chromosome", "start", "end", "region_type", "chr_type", "total_bases", "total_m6A", "overlapping_reads"])
    df["m6A_frac"] = df["total_m6A"]/df["total_bases"]
    #df["norm_m6A_frac"] = df["m6A_frac"]/(df.loc[df['chr_type'] == 'Autosome', 'm6A_frac'].mean())
    #df["norm_m6A_frac_upd"] = df["m6A_frac"]/(
    #    df.loc[df['chr_type'] == 'Autosome', 'total_m6A'].sum()/(
    #        df.loc[df['chr_type'] == 'Autosome', 'total_bases'].sum()))
    df["condition"]= condition
    return df

def process_bam_file(bam_file, condition, bam_frac, selection, m6A_thresh, output_stem):
    
    for each_type in selection:
        bam = output_stem + "mod_mappings_"  + condition + "_" + str(bam_frac)+"_"+each_type+".sorted.bam"
        with lock1:
            print("starting on: ",bam)
        temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
        output_df = extract_m6A_per_region(bam,temp_bedfile,m6A_thresh,condition)
        output_df.to_csv(output_stem + "m6A_frac_" + condition + "_"  + str(m6A_thresh)+"_"+each_type+".csv", index=False, mode='w')
        print(output_df)

# Parallelize for each bam file:
args_list = [(bam_file, condition, bam_frac, selection, m6A_thresh, output_stem) for bam_file, condition, bam_frac in zip(bam_files,conditions,bam_fracs)]
if __name__ == "__main__":
    with Pool() as pool: #processes=1
        result = pool.starmap(process_bam_file, args_list)
    print("Program finished!")

'''for bam_file, condition, bam_frac in zip(bam_files,conditions,bam_fracs):
    print("starting on: ",bam_file)
    for each_type in selection:
        #bam = output_stem + "mod_mappings_"  + condition + "_" + str(bam_frac)+"_"+each_type+".sorted.bam"
        temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
        output_df = extract_m6A_per_region(bam_file,temp_bedfile,m6A_thresh)
        output_df.to_csv(output_stem + "m6A_frac_" + condition + "_"  + str(m6A_thresh)+"_"+each_type+".csv", index=False, mode='w')
        print(output_df)'''

starting on:  /Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/analysis/mod_mappings_No-Met_1_X.sorted.bam
starting on:  /Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/analysis/mod_mappings_1-min_1_X.sorted.bam
starting on:  /Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/analysis/mod_mappings_3-min_1_X.sorted.bam
starting on:  /Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/analysis/mod_mappings_5-min_1_X.sorted.bam
      chromosome     start       end   region_type chr_type  total_bases  \
0   CHROMOSOME_X         0    199999  200kb_region        X       187644   
1   CHROMOSOME_X    200000    399999  200kb_region        X       238238   
2   CHROMOSOME_X    400000    599999  200kb_region        X       183596   
3   CHROMOSOME_X    600000    799999  200kb_region        X       219953   
4   CHROMOSOME_X    800000    999999  200kb_region        X       168138   
..           ...       ...       ...           ...      ...          ...   
84  CHROMOSOME_

[415 rows x 10 columns]
starting on:  /Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/analysis/mod_mappings_10-min_1_X.sorted.bam
       chromosome     start       end   region_type  chr_type  total_bases  \
0    CHROMOSOME_I         0    199999  200kb_region  Autosome       153351   
1    CHROMOSOME_I    200000    399999  200kb_region  Autosome       168132   
2    CHROMOSOME_I    400000    599999  200kb_region  Autosome       193548   
3    CHROMOSOME_I    600000    799999  200kb_region  Autosome       173960   
4    CHROMOSOME_I    800000    999999  200kb_region  Autosome       198705   
..            ...       ...       ...           ...       ...          ...   
410  CHROMOSOME_V  20000000  20199999  200kb_region  Autosome       211193   
411  CHROMOSOME_V  20200000  20399999  200kb_region  Autosome       131899   
412  CHROMOSOME_V  20400000  20599999  200kb_region  Autosome       231746   
413  CHROMOSOME_V  20600000  20799999  200kb_region  Autosome       277741   
414 

[89 rows x 10 columns]
starting on:  /Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/analysis/mod_mappings_30-min_1_Autosome.sorted.bam
       chromosome     start       end   region_type  chr_type  total_bases  \
0    CHROMOSOME_I         0    199999  200kb_region  Autosome       349351   
1    CHROMOSOME_I    200000    399999  200kb_region  Autosome       348481   
2    CHROMOSOME_I    400000    599999  200kb_region  Autosome       344947   
3    CHROMOSOME_I    600000    799999  200kb_region  Autosome       373325   
4    CHROMOSOME_I    800000    999999  200kb_region  Autosome       310086   
..            ...       ...       ...           ...       ...          ...   
410  CHROMOSOME_V  20000000  20199999  200kb_region  Autosome       383209   
411  CHROMOSOME_V  20200000  20399999  200kb_region  Autosome       364615   
412  CHROMOSOME_V  20400000  20599999  200kb_region  Autosome       438976   
413  CHROMOSOME_V  20600000  20799999  200kb_region  Autosome       430915  

[415 rows x 10 columns]
       chromosome     start       end   region_type  chr_type  total_bases  \
0    CHROMOSOME_I         0    199999  200kb_region  Autosome       465186   
1    CHROMOSOME_I    200000    399999  200kb_region  Autosome       494108   
2    CHROMOSOME_I    400000    599999  200kb_region  Autosome       450083   
3    CHROMOSOME_I    600000    799999  200kb_region  Autosome       448005   
4    CHROMOSOME_I    800000    999999  200kb_region  Autosome       533186   
..            ...       ...       ...           ...       ...          ...   
410  CHROMOSOME_V  20000000  20199999  200kb_region  Autosome       458544   
411  CHROMOSOME_V  20200000  20399999  200kb_region  Autosome       396191   
412  CHROMOSOME_V  20400000  20599999  200kb_region  Autosome       568736   
413  CHROMOSOME_V  20600000  20799999  200kb_region  Autosome       485041   
414  CHROMOSOME_V  20800000  20924180  200kb_region  Autosome       324072   

     total_m6A  overlapping_reads  m6A_

'for bam_file, condition, bam_frac in zip(bam_files,conditions,bam_fracs):\n    print("starting on: ",bam_file)\n    for each_type in selection:\n        #bam = output_stem + "mod_mappings_"  + condition + "_" + str(bam_frac)+"_"+each_type+".sorted.bam"\n        temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"\n        output_df = extract_m6A_per_region(bam_file,temp_bedfile,m6A_thresh)\n        output_df.to_csv(output_stem + "m6A_frac_" + condition + "_"  + str(m6A_thresh)+"_"+each_type+".csv", index=False, mode=\'w\')\n        print(output_df)'

In [158]:
### Build dataframe for plotting
#filenames = ["/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/m6A_frac1_200kb_region.csv",
#             "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/m6A_frac1_200kb_region.csv",
#             "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/m6A_frac1_200kb_region.csv",
#            "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/m6A_frac1_200kb_region.csv"]

filenames = [] 
for each_type in selection:
    for each_cond, each_frac in zip(conditions,bam_fracs):
        filenames.append(output_stem + "m6A_frac_" + each_cond + "_"  + str(m6A_thresh)+"_"+each_type+".csv")

df_list = []
combined_regions = []

# Loop through the list of file names
for filename in filenames:
    # Read each file into a dataframe
    df = pd.read_csv(filename)
    # Add the dataframe to the list of dataframes
    df_list.append(df)

# Concatenate the list of dataframes into a single dataframe
combined_regions = pd.concat(df_list)
print(combined_regions)
combined_regions.to_csv(output_stem + "combined_regions_"  + str(m6A_thresh) +".csv", index=False, mode='w')

       chromosome     start       end   region_type  chr_type  total_bases  \
0    CHROMOSOME_X         0    199999  200kb_region         X       263574   
1    CHROMOSOME_X    200000    399999  200kb_region         X       235010   
2    CHROMOSOME_X    400000    599999  200kb_region         X       208330   
3    CHROMOSOME_X    600000    799999  200kb_region         X       232183   
4    CHROMOSOME_X    800000    999999  200kb_region         X       244005   
..            ...       ...       ...           ...       ...          ...   
410  CHROMOSOME_V  20000000  20199999  200kb_region  Autosome       458544   
411  CHROMOSOME_V  20200000  20399999  200kb_region  Autosome       396191   
412  CHROMOSOME_V  20400000  20599999  200kb_region  Autosome       568736   
413  CHROMOSOME_V  20600000  20799999  200kb_region  Autosome       485041   
414  CHROMOSOME_V  20800000  20924180  200kb_region  Autosome       324072   

     total_m6A  overlapping_reads  m6A_frac condition  
0      

In [121]:
#source: https://stackoverflow.com/questions/67505252/plotly-box-p-value-significant-annotation
def add_p_value_annotation(fig, array_columns, subplot=None, _format=dict(interline=0.07, text_height=1.07, color='black')):
    ''' Adds notations giving the p-value between two box plot data (t-test two-sided comparison)
    
    Parameters:
    ----------
    fig: figure
        plotly boxplot figure
    array_columns: np.array
        array of which columns to compare 
        e.g.: [[0,1], [1,2]] compares column 0 with 1 and 1 with 2
    subplot: None or int
        specifies if the figures has subplots and what subplot to add the notation to
    _format: dict
        format characteristics for the lines

    Returns:
    -------
    fig: figure
        figure with the added notation
    '''
    # Specify in what y_range to plot for each pair of columns
    y_range = np.zeros([len(array_columns), 2])
    for i in range(len(array_columns)):
        y_range[i] = [1.01+i*_format['interline'], 1.02+i*_format['interline']]

    # Get values from figure
    fig_dict = fig.to_dict()

    # Get indices if working with subplots
    if subplot:
        if subplot == 1:
            subplot_str = ''
        else:
            subplot_str =str(subplot)
        indices = [] #Change the box index to the indices of the data for that subplot
        for index, data in enumerate(fig_dict['data']):
            #print(index, data['xaxis'], 'x' + subplot_str)
            if data['xaxis'] == 'x' + subplot_str:
                indices = np.append(indices, index)
        indices = [int(i) for i in indices]
        print((indices))
    else:
        subplot_str = ''

    # Print the p-values
    for index, column_pair in enumerate(array_columns):
        if subplot:
            data_pair = [indices[column_pair[0]], indices[column_pair[1]]]
        else:
            data_pair = column_pair

        # Mare sure it is selecting the data and subplot you want
        #print('0:', fig_dict['data'][data_pair[0]]['name'], fig_dict['data'][data_pair[0]]['xaxis'])
        #print('1:', fig_dict['data'][data_pair[1]]['name'], fig_dict['data'][data_pair[1]]['xaxis'])

        # Get the p-value
        pvalue = stats.ttest_ind(
            fig_dict['data'][data_pair[0]]['y'],
            fig_dict['data'][data_pair[1]]['y'],
            equal_var=False,
        )[1]
        if pvalue >= 0.05:
            symbol = 'ns'
        elif pvalue >= 0.01: 
            symbol = '*'
        elif pvalue >= 0.001:
            symbol = '**'
        else:
            symbol = '***'
        # Vertical line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[0], y0=y_range[index][0], 
            x1=column_pair[0], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        # Horizontal line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[0], y0=y_range[index][1], 
            x1=column_pair[1], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        # Vertical line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[1], y0=y_range[index][0], 
            x1=column_pair[1], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        ## add text at the correct x, y coordinates
        ## for bars, there is a direct mapping from the bar number to 0, 1, 2...
        fig.add_annotation(dict(font=dict(color=_format['color'],size=14),
            x=(column_pair[0] + column_pair[1])/2,
            y=y_range[index][1]*_format['text_height'],
            showarrow=False,
            text=symbol,
            textangle=0,
            xref="x"+subplot_str,
            yref="y"+subplot_str+" domain"
        ))
    return fig

In [164]:
#import plotly.io as pio
#pio.renderers.default = "iframe"

# Plot the boxplot
marker_colors =["#c45746","#16415e"]

fig = make_subplots(rows=1, cols=len(conditions),
                y_title = "m6A/A",
                shared_yaxes=True,
                subplot_titles=(conditions))

def reindex_df(df, weight_col):
    """expand the dataframe to prepare for resampling
    result is 1 row per count per sample"""
    df.reset_index(drop=True, inplace=True)
    df = df.reindex(df.index.repeat(np.ceil(df[weight_col])/10000))
    df.reset_index(drop=True, inplace=True)
    return(df)

weighted_combined_regions = reindex_df(combined_regions,'total_bases')
print(weighted_combined_regions)

for i in range(0,len(conditions)):
    tube_df = weighted_combined_regions.loc[weighted_combined_regions['condition']==conditions[i]]
    chr_type = "Autosome"
    df_plot=tube_df.loc[tube_df['chr_type']==chr_type]
    #df_plot=tube_df.sample(frac=17/100,replace=False,random_state=1)
    trace0 = go.Box(x=df_plot['condition'], y=df_plot['m6A_frac'],
                         notched=True,
                         name=chr_type, marker_color =marker_colors[0])
    
    chr_type = "X"
    df_plot=tube_df.loc[tube_df['chr_type']==chr_type]
    trace1 = go.Box(x=df_plot['condition'], y=df_plot['m6A_frac'],
                         notched=True,
                         name=chr_type, marker_color=marker_colors[1])

    fig.append_trace(trace0, row = 1, col = i+1)
    fig.append_trace(trace1, row = 1, col = i+1)
    
fig['layout'].update(height = 800)
fig.update_layout(template="plotly_white")
fig.update_xaxes(showticklabels=False)
fig.update_yaxes(range=[0, 1])
    
#fig.update_layout(boxmode='group', xaxis_tickangle=0)
fig.show()

          chromosome     start       end   region_type  chr_type  total_bases  \
0       CHROMOSOME_X         0    199999  200kb_region         X       263574   
1       CHROMOSOME_X         0    199999  200kb_region         X       263574   
2       CHROMOSOME_X         0    199999  200kb_region         X       263574   
3       CHROMOSOME_X         0    199999  200kb_region         X       263574   
4       CHROMOSOME_X         0    199999  200kb_region         X       263574   
...              ...       ...       ...           ...       ...          ...   
117573  CHROMOSOME_V  20800000  20924180  200kb_region  Autosome       324072   
117574  CHROMOSOME_V  20800000  20924180  200kb_region  Autosome       324072   
117575  CHROMOSOME_V  20800000  20924180  200kb_region  Autosome       324072   
117576  CHROMOSOME_V  20800000  20924180  200kb_region  Autosome       324072   
117577  CHROMOSOME_V  20800000  20924180  200kb_region  Autosome       324072   

        total_m6A  overlapp

In [150]:
# Plot the boxplot
marker_colors =["#c45746","#16415e"]

fig = make_subplots(rows=1, cols=len(conditions),
                y_title = "Whole Chromosome m6A/A",
                shared_yaxes=True,
                subplot_titles=(conditions))

for i in range(0,len(conditions)):
    tube_df = combined_regions.loc[combined_regions['condition']==conditions[i]]
    chr_type = "Autosome"
    df_plot=tube_df.loc[tube_df['chr_type']==chr_type]
    df_plot=tube_df.sample(frac=17/100,replace=False,random_state=1)
    m6A_frac_tube = [df_plot['total_m6A'].sum()/df_plot['total_bases'].sum()]
    print(m6A_frac_tube)
    trace0 = go.Bar(x=df_plot['condition']+" ", y=m6A_frac_tube,
                         name=chr_type, marker_color =marker_colors[0])

    chr_type = "X"
    df_plot=tube_df.loc[tube_df['chr_type']==chr_type]
    m6A_frac_tube = [df_plot['total_m6A'].sum()/df_plot['total_bases'].sum()]
    trace1 = go.Bar(x=df_plot['condition'], y=m6A_frac_tube,
                         name=chr_type, marker_color =marker_colors[1])

    fig.append_trace(trace0, row = 1, col = i+1)
    fig.append_trace(trace1, row = 1, col = i+1)
    
fig['layout'].update(height = 800)
fig.update_layout(template="plotly_white")
fig.update_xaxes(showticklabels=False)
#fig.update_yaxes(range=[0.7, 1.3])
    
#fig.update_layout(boxmode='group', xaxis_tickangle=0)
fig.show()

[0.309136293865848]
[0.2988300695901791]
[0.3362762178136413]
[0.3402847723218995]
[0.3037240722553867]
[0.31369444192939855]
[0.3258847527773178]
[0.375989595077582]



invalid value encountered in long_scalars



In [68]:
# Plot the boxplot
marker_colors =["#fde725","#a0da39","#4ac16d","#1fa187","#277f8e","#365c8d","#46327e","#440154"]

fig = make_subplots(rows=1, cols=len(conditions),
                y_title = "Coverage",
                shared_yaxes=True,
                subplot_titles=(conditions))

print("Total MB aligned for ALL conditons: ",int(combined_regions['total_bases'].sum()/1000000),
     " | across ", int(combined_regions['overlapping_reads'].sum())," reads with avg. length of: ",
     int(combined_regions['total_bases'].sum()/combined_regions['overlapping_reads'].sum()))
for i in range(0,len(conditions)):
    tube_df = combined_regions.loc[combined_regions['condition']==conditions[i]]
    m6A_frac_tube = [tube_df['total_bases'].sum()/100000000]
    print("Total MB aligned for ",conditions[i],
          ": ",int(tube_df['total_bases'].sum()/1000000), 
          " | across ", int(tube_df['overlapping_reads'].sum()),
          " reads with avg. length of: ",
          int(tube_df['total_bases'].sum()/tube_df['overlapping_reads'].sum()))
    trace0 = go.Bar(x=tube_df['condition']+" ", y=m6A_frac_tube,
                         name=conditions[i], marker_color =marker_colors[i])

    fig.append_trace(trace0, row = 1, col = i+1)
    
fig['layout'].update(height = 800)
fig.update_layout(template="plotly_white")
fig.update_xaxes(showticklabels=False)
#fig.update_yaxes(range=[0.7, 1.3])
    
#fig.update_layout(boxmode='group', xaxis_tickangle=0)
fig.show()

Total MB aligned for ALL conditons:  1195  | across  750740  reads with avg. length of:  1592
Total MB aligned for  No-Met :  109  | across  60806  reads with avg. length of:  1807
Total MB aligned for  1-min :  91  | across  58683  reads with avg. length of:  1563
Total MB aligned for  3-min :  152  | across  77863  reads with avg. length of:  1960
Total MB aligned for  5-min :  191  | across  120035  reads with avg. length of:  1599
Total MB aligned for  10-min :  130  | across  106735  reads with avg. length of:  1222
Total MB aligned for  15-min :  129  | across  87757  reads with avg. length of:  1475
Total MB aligned for  30-min :  124  | across  88811  reads with avg. length of:  1405
Total MB aligned for  120-min :  264  | across  150050  reads with avg. length of:  1765
