In [6]:
# Purpose of this script is to take in a number of .bam files and a .bed file and return %m6A across all regions in the .bam files file
# in bin increments.
__author__ = "Yuri Malina"
__contact__ = "ymalina@berkeley.edu"
__copyright__ = "The Meyer Lab, UC Berkeley"
__credits__ = [""]
__date__ = "3/27/2023"
__deprecated__ = False
__status__ = "In development"
__version__ = "0.0.1"

In [8]:
import importlib
import pysam
import nanotools
import random
import pandas as pd
import numpy as np
import plotly.express as px # Used for plotting
import plotly.graph_objects as go # Used for plotting
from plotly.subplots import make_subplots # Used for plotting
from multiprocessing import Pool # used for parallel processing
importlib.reload(nanotools)
#pio.renderers.default = 'vscode'

# install tabix with:
# apt-get install tabix 

<module 'nanotools' from '/Data1/git/meyer-nanopore/scripts/Analysis/nanotools.py'>

In [24]:
### Configurations
m6A_thresh = 194 #default is 129 = 50%; 181=70%; 194=75%; 207 = 80%; 232 = 90%
mC_thresh = 129 #default is 129
coreNum = 96 # cores to use

## Bed file configurations:
sample_source = "chromosome" # "chr_type" or "type" or "chromosome"
sampleName = ["X","Autosome"] # "TES_q1" "strong_rex" "weak_rex" "type", "X", "Autosome"; Must be same number of unique values in selected bed rows.
chr_type_selected = ["X","Autosome"] # 'X' or "Autosome"
type_selected = ["whole_chr"] #TES_q1-4 | #TSS_q1-4 | strong/weak rex | whole_chr | 200kb_region | 50kb_region
max_regions = 0 # max regions to consider; 0 = full set;
chromosome_selected = ["CHROMOSOME_X","CHROMOSOME_I", "CHROMOSOME_II", "CHROMOSOME_III", "CHROMOSOME_IV", "CHROMOSOME_V"] #"CHROMOSOME_I", "CHROMOSOME_II", "CHROMOSOME_III", "CHROMOSOME_IV", "CHROMOSOME_V",
strand_selected = ["+","-"] #+ and/or -
bed_file = "/Data1/reference/tss_tes_rex_combined_v2.bed"
bed_window = 1000 # +/- around bed elements.
mods = "A" # {A,CG,A+CG}
if sample_source == "chr_type":
    selection = chr_type_selected
if sample_source == "type":
    selection = type_selected
if sample_source == "chromosome":
    selection = chromosome_selected

## Bam file configurations
random.seed(10)

### Tube D
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/mod_mappings.sorted.bam"
#output_stem = "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/"
#condition = "N2; 2uM Hia5 30min"

### Tube 4
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam"
#output_stem = "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/"
#condition = "N2; 2uM Hia5 120min"

### Tube H
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam"
#output_stem = "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/"
#condition = "AID::SDC-2 + Auxin; 2uM Hia5 30min"

### Tube T
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/mod_mappings.sorted.m6Aonly.bam"
#output_stem = "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/"
#condition = "N2 Young; 2uM Hia5 30min"

#"/Data1/seq_data/TubeY9B_N2_fiberseq_timec_mpx_3_21_2023/analysis/mod_mappings_No-Met-RG_0.1_200kb_region.sorted.bam"
#"No-Met-RG",

### Tube AD1
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeAD1_N2_fiberseq_6_13_23/mod_basecalls/mod_mappings.sorted.bam"
#output_stem = "/Data1/seq_data/TubeAD1_N2_fiberseq_6_13_23/mod_basecalls/"
#condition = "N2; 1uM Hia5 120min"

'''
### SUBSET OF COMBINED RUNS

bam_fracs = [1]#,1,1] # For full .bam set to = 1
bam_files = [
    #"/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam",
    #"/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_05.bam",
    "/Data1/seq_data/TubeAD1_N2_fiberseq_6_13_23/mod_basecalls/mod_mappings2.sorted.bam"
    #"/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam",
    #"/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_09.bam"
]
output_stem = "/Data1/seq_data/TubeAD1_N2_fiberseq_6_13_23/mod_basecalls/"
conditions = [#"Tube4 N2; 2uM Hia5 120min","TubeAB N2; 2uM Hia5 120min",
              "TubeAD1 N2; 1uM Hia5 120min"]
              #"TubeH1 AID::SDC-2 + Auxin; 2uM Hia5 30min","TubeAB AID::SDC-2 + Auxin; 2uM Hia5 30min"

conditions_min=[120]#,120,120]#,120,30,30]
file_prefix = "120min_allN2"

bam_fracs = [1,1,1,1,1] # For full .bam set to = 1
bam_files = [
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_01.bam",
    "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/mod_mappings.sorted.bam",
    "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam",
    "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam",
    "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/mod_mappings.sorted.m6Aonly.bam"
]
output_stem = "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/analysis/"
conditions = ["N2 mixed stage;<br>No Mtase","N2 mixed stage;<br>2uM Hia5 30min","N2 mixed stage;<br>2uM Hia5 120min","AID::SDC-2 + Auxin mixed stage;<br>2uM Hia5 30min","N2 11-cell median;<br>2uM Hia5 30min"]'''

'''
### N2 120min + 2/1uM HIa5
# bam_fracs equal to list of repeating 0.01 times length of bam_files
bam_files = [
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_05.bam",
    "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam",
    "/Data1/seq_data/TubeAD1_N2_fiberseq_6_13_23/mod_basecalls/mod_mappings.sorted.bam",
]
bam_fracs = len(bam_files)*[1] # For full .bam set to = 1
output_stem = "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/analysis/"
conditions = ["N2-2uM-120-min","N2; 2uM Hia5 120min","N2; 1uM Hia5 120min"]
conditions_min=[120,120,120]
file_prefix = "aug_3_wholechr_"
'''

"""
### Dimelo antidpy27 tube AI
# bam_fracs equal to list of repeating 0.01 times length of bam_files
bam_files = [
    "/Data1/seq_data/AI_N2_dimelo_antiDPY27_mpx_8_19_23/pod5_pass/barcode05/basecalls/barcode05.mod_mappings.sorted.bam",
    "/Data1/seq_data/AI_N2_dimelo_antiDPY27_mpx_8_19_23/pod5_pass/barcode06/basecalls/barcode06.mod_mappings.sorted.bam"
]
bam_fracs = len(bam_files)*[1] # For full .bam set to = 1
output_stem = "/Data1/seq_data/AI_N2_dimelo_antiDPY27_mpx_8_19_23/analysis/"
conditions = ["N2-DPY27_dimelo_pAHia5","N2-DPY27_dimelo_RbNbHia5"]
conditions_min=[120,120]
file_prefix = "dimelo_"
"""

### Auxin withdrawal experiments tube AH
# bam_fracs equal to list of repeating 0.01 times length of bam_files
bam_files = [
    "/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode07/basecalls/barcode07.mod_mappings.sorted.bam",
    "/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode08/basecalls/barcode08.mod_mappings.sorted.bam",
    "/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode09/basecalls/barcode09.mod_mappings.sorted.bam",
    "/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode10/basecalls/barcode10.mod_mappings.sorted.bam",
    "/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode11/basecalls/barcode11.mod_mappings.sorted.bam",
    "/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode12/basecalls/barcode12.mod_mappings.sorted.bam",
    "/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode13/basecalls/barcode13.mod_mappings.sorted.bam",
    "/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode14/basecalls/barcode14.mod_mappings.sorted.bam"
]
bam_fracs = len(bam_files)*[1] # For full .bam set to = 1
output_stem = "/Data1/seq_data/AI_N2_dimelo_antiDPY27_mpx_8_19_23/analysis/"
conditions = ["N2-2uMHia5-0min","N2-2uMHia5-300min","SDC2-AID+Aux-0min","SDC2-AID+Aux-10min","SDC2-AID+Aux-30min","SDC2-AID+Aux-60min","SDC2-AID+Aux-120min","SDC2-AID+Aux-300min"]
conditions_min=[0,0,0,10,30,60,120,300]
file_prefix = "fiber_"

"""
### SDC-2 AID + Auxin + 2/1uM HIa5
# bam_fracs equal to list of repeating 0.01 times length of bam_files
bam_files = [
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_10.bam",
    "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_09.bam",
    "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam"
]
bam_fracs = len(bam_files)*[1] # For full .bam set to = 1
output_stem = "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/analysis/"
conditions = ["SDC2_AID_120_min","SDC2_AID_30_min","AID_SDC_2 + Auxin; 2uM Hia5 30min"]
conditions_min=[120,120,120]
file_prefix = "aug_3_wholechr_"""

'''### Time Course
#bam_fracs = [0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1] # For full .bam set to = 1
bam_fracs = [1,1,1,1] # For full .bam set to = 1
bam_files = [
    "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/mod_mappings.sorted.bam",
    "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam",
    "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam",
    "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/mod_mappings.sorted.m6Aonly.bam"
]
output_stem = "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/analysis/"
conditions = ["N2; 2uM Hia5 30min","AID::SDC-2 + Auxin; 2uM Hia5 30min",
              "N2; 2uM Hia5 120min","N2 Young; 2uM Hia5 30min"]
conditions_min=[30,30,120,30]
'''

'### Time Course\n#bam_fracs = [0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1,0.1] # For full .bam set to = 1\nbam_fracs = [1,1,1,1] # For full .bam set to = 1\nbam_files = [\n    "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/mod_mappings.sorted.bam",\n    "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam",\n    "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam",\n    "/Data1/seq_data/TubeT_N2_Young_FiberSeq_02_14_23_v3/basecalls/m6A/mod_mappings.sorted.m6Aonly.bam"\n]\noutput_stem = "/Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/analysis/"\nconditions = ["N2; 2uM Hia5 30min","AID::SDC-2 + Auxin; 2uM Hia5 30min",\n              "N2; 2uM Hia5 120min","N2 Young; 2uM Hia5 30min"]\nconditions_min=[30,30,120,30]\n'

In [25]:
# Filter input bed_file based on input parameters (e.g. chromosome, type, strand, etc.)
# Function saves a new filtered bed file to the same folder as the original bed file
# called temp_do_not_use_"type".bed
importlib.reload(nanotools)
new_bed_files=nanotools.filter_bed_file(
    bed_file,
    sample_source,
    selection,
    chromosome_selected,
    chr_type_selected,
    type_selected,
    strand_selected,
    max_regions,
    bed_window)

print(new_bed_files)

Saving following bed file:       chromosome  start       end strand       type chr-type
0  CHROMOSOME_X      0  17719942      +  whole_chr        X
/Data1/reference/CHROMOSOME_X.bed has been compressed successfully to /Data1/reference/CHROMOSOME_X.bed.gz
Index created successfully for /Data1/reference/CHROMOSOME_X.bed.gz.tbi
Saving following bed file:       chromosome  start       end strand       type  chr-type
0  CHROMOSOME_I      0  15073434      +  whole_chr  Autosome
/Data1/reference/CHROMOSOME_I.bed has been compressed successfully to /Data1/reference/CHROMOSOME_I.bed.gz
Index created successfully for /Data1/reference/CHROMOSOME_I.bed.gz.tbi
Saving following bed file:        chromosome  start       end strand       type  chr-type
0  CHROMOSOME_II      0  15280421      +  whole_chr  Autosome
/Data1/reference/CHROMOSOME_II.bed has been compressed successfully to /Data1/reference/CHROMOSOME_II.bed.gz
Index created successfully for /Data1/reference/CHROMOSOME_II.bed.gz.tbi
Saving fol

In [26]:
importlib.reload(nanotools)
# Extract reads from bam file that overlap with the filtered bed file
# and save as a new bam file. This can be skipped in most instances.
# Parallelize for each bam file:
args_list = [(bam_file, condition, bam_frac, selection, output_stem) for bam_file, condition, bam_frac in zip(bam_files,conditions,bam_fracs)]
new_bam_files=[]
if __name__ == "__main__":
    with Pool() as pool: #processes=1
        new_bam_files = pool.starmap(nanotools.subsample_bam, args_list)
    print("Program finished!")

print(new_bam_files)

Program finished!
['/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode07/basecalls/barcode07.mod_mappings.sorted.bam', '/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode08/basecalls/barcode08.mod_mappings.sorted.bam', '/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode09/basecalls/barcode09.mod_mappings.sorted.bam', '/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode10/basecalls/barcode10.mod_mappings.sorted.bam', '/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode11/basecalls/barcode11.mod_mappings.sorted.bam', '/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode12/basecalls/barcode12.mod_mappings.sorted.bam', '/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode13/basecalls/barcode13.mod_mappings.sorted.bam', '/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode14/basecalls/barcode14.mod_mappings.sorted.bam']


In [29]:
### Extract m6A frac by region
importlib.reload(nanotools)
result_list=[]
result_df=pd.DataFrame()
# Parallelize for each bam file:
args_list = [(bam_file, condition, bam_frac,condition_min,file_prefix, selection, m6A_thresh, output_stem,new_bed_files) for bam_file, condition, bam_frac, condition_min in zip(new_bam_files,conditions,bam_fracs,conditions_min)]
print("Args list:",args_list)
if __name__ == "__main__":
    with Pool(processes=10) as pool: #processes=1
        # append results to pandas df 'result'
        result_list = pool.starmap(nanotools.extract_m6A_per_region_parellized, args_list)
        for result in result_list:
            result_df=pd.concat([result_df,result])
    print("Program finished!")

print(result_df)

Args list: [('/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode07/basecalls/barcode07.mod_mappings.sorted.bam', 'N2-2uMHia5-0min', 1, 0, 'fiber_', ['CHROMOSOME_X', 'CHROMOSOME_I', 'CHROMOSOME_II', 'CHROMOSOME_III', 'CHROMOSOME_IV', 'CHROMOSOME_V'], 194, '/Data1/seq_data/AI_N2_dimelo_antiDPY27_mpx_8_19_23/analysis/', ['/Data1/reference/CHROMOSOME_X.bed.gz', '/Data1/reference/CHROMOSOME_I.bed.gz', '/Data1/reference/CHROMOSOME_II.bed.gz', '/Data1/reference/CHROMOSOME_III.bed.gz', '/Data1/reference/CHROMOSOME_IV.bed.gz', '/Data1/reference/CHROMOSOME_V.bed.gz']), ('/Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/pod5_pass/barcode08/basecalls/barcode08.mod_mappings.sorted.bam', 'N2-2uMHia5-300min', 1, 0, 'fiber_', ['CHROMOSOME_X', 'CHROMOSOME_I', 'CHROMOSOME_II', 'CHROMOSOME_III', 'CHROMOSOME_IV', 'CHROMOSOME_V'], 194, '/Data1/seq_data/AI_N2_dimelo_antiDPY27_mpx_8_19_23/analysis/', ['/Data1/reference/CHROMOSOME_X.bed.gz', '/Data1/reference/CHROMOSOME_I.bed.gz', '/

In [30]:
### Build dataframe for plotting
def reindex_df(df, weight_col):
    """expand the dataframe to prepare for resampling
    result is 1 row per count per sample"""
    df.reset_index(drop=True, inplace=True)
    df = df.reindex(df.index.repeat(np.ceil(df[weight_col])/100000))
    df.reset_index(drop=True, inplace=True)
    return(df)

'''# If combined regions file already exists, read dataframe from csv
if os.path.exists(output_stem  + file_prefix + "weighted_combined_regions_"  + str(m6A_thresh) +".csv"):
    weighted_combined_regions = pd.read_csv(output_stem + file_prefix + "weighted_combined_regions_"  + str(m6A_thresh) +".csv")
    print("File: ",
          output_stem +  file_prefix+"combined_regions_"  + str(m6A_thresh) +".csv",
          "already exists! Imported directly:")
    print(weighted_combined_regions)

else:'''
print("Building combined regions file...")
# Initialize variables
filenames = []
df_list = []
combined_regions = []

# Create "filenames" list that includes the name of each file to be read
for each_type in selection:
    for each_cond, each_frac in zip(conditions,bam_fracs):
        filenames.append(output_stem + file_prefix+"m6A_frac_" + each_cond + "_"  + str(m6A_thresh)+"_"+each_type+".csv")

# Loop through the list of file names
for filename in filenames:
    # Read each file into a dataframe
    df = pd.read_csv(filename)
    # Add the dataframe to the list of dataframes
    df_list.append(df)

# Concatenate the list of dataframes into a single dataframe
combined_regions = pd.concat(df_list)

# Reindex the dataframe to have the number repeated rows based on total bases in the region
# This helps ensure plots are weighted correctly.
weighted_combined_regions = reindex_df(combined_regions,'total_bases')

# Add column equal to average of autosome m6A_frac column for each condition
weighted_combined_regions['mean_autosome_m6A_frac'] = weighted_combined_regions.groupby('condition')['m6A_frac'].transform('mean')

# Add column equal to m6A normalized by the condition's mean_autosome_m6A_frac
weighted_combined_regions['norm_m6A_frac'] = weighted_combined_regions['m6A_frac']/weighted_combined_regions['mean_autosome_m6A_frac']

# Save final dataframe to .csv file
print("Weighted combined:",weighted_combined_regions)
print("Outputting file:",output_stem  + file_prefix+"combined_regions_"  + str(m6A_thresh) +".csv")
weighted_combined_regions.to_csv(output_stem  + file_prefix+"weighted_combined_regions_"  + str(m6A_thresh) +".csv", index=False, mode='w')

# Extract average m6A/A across each chromosome for each condition from weighted_combined_regions
# This is used for plotting the average m6A/A across the chromosome
chromosome_m6A_frac = weighted_combined_regions.groupby(['condition','condition_min','chr_type'])['m6A_frac'].median().reset_index()
# split condition column with character "-" and keep only first column
chromosome_m6A_frac['genotype'] = chromosome_m6A_frac['condition'].str.split('-').str[0]

# sort by genotype, chr_type and condition_min
chromosome_m6A_frac.sort_values(by=['genotype','chr_type','condition_min'], inplace=True)

#Add column for increase in methylation from previous timepoint for each condition and each chr_type, where the first timepoint is 0
chromosome_m6A_frac['m6A_frac_diff'] = chromosome_m6A_frac.groupby(['genotype','chr_type'])['m6A_frac'].diff()
# Set all Nan values in m6A_frac_diff to 0
chromosome_m6A_frac['m6A_frac_diff'].fillna(0, inplace=True)

#reset index
chromosome_m6A_frac.reset_index(drop=True, inplace=True)

# normalize m6A_frac_diff by the first m6A_frac value for each genotype and chr_type
print("chromosome_m6A_frac.groupby(['genotype','chr_type'])['m6A_frac'].transform(lambda x: x/x.iloc[0]):",chromosome_m6A_frac.groupby(['genotype','chr_type'])['m6A_frac'].transform(lambda x: x.iloc[0]))

chromosome_m6A_frac['norm_m6A_frac_diff'] = chromosome_m6A_frac['m6A_frac_diff']/chromosome_m6A_frac.groupby(['genotype','chr_type'])['m6A_frac'].transform(lambda x: x.iloc[0])

chromosome_m6A_frac['m6A_frac_diff_from_first'] = chromosome_m6A_frac['m6A_frac']-chromosome_m6A_frac.groupby(['genotype','chr_type'])['m6A_frac'].transform(lambda x: x.iloc[0])

print(chromosome_m6A_frac)

Building combined regions file...
Weighted combined:         chromosome  start       end region_type  chr_type  total_bases  \
0     CHROMOSOME_X      0  17719942   whole_chr         X     10260850   
1     CHROMOSOME_X      0  17719942   whole_chr         X     10260850   
2     CHROMOSOME_X      0  17719942   whole_chr         X     10260850   
3     CHROMOSOME_X      0  17719942   whole_chr         X     10260850   
4     CHROMOSOME_X      0  17719942   whole_chr         X     10260850   
...            ...    ...       ...         ...       ...          ...   
8008  CHROMOSOME_V      0  20925180   whole_chr  Autosome     29817308   
8009  CHROMOSOME_V      0  20925180   whole_chr  Autosome     29817308   
8010  CHROMOSOME_V      0  20925180   whole_chr  Autosome     29817308   
8011  CHROMOSOME_V      0  20925180   whole_chr  Autosome     29817308   
8012  CHROMOSOME_V      0  20925180   whole_chr  Autosome     29817308   

      total_m6A  overlapping_reads  m6A_frac  norm_m6A_fra

In [31]:
# Plot average m6A/A across the chromosome for each condition in a time course
# Set px background to white
px.defaults.template = "plotly_white"

# list of samples to consider
considered_samples = [0]

# Plot title
#plot_title = "AID::SDC-2 + Auxin; 2uM Hia5 Timecourse; m6A thresh = 75%"
plot_title = "Mean m6A/A across entire chromosomes; m6A Threshold = " + str(round(m6A_thresh/254*100-1)) + "%"

# plot boxplot of norm_m6A_frac by chromosome
fig = px.box(result_df, x="condition", y="m6A_frac", color="chromosome", title=plot_title, points="all")
#Update background to white
fig.update_layout(plot_bgcolor='white')
fig.show()

# plot boxplot of norm_m6A_frac by chromosome
fig = px.box(result_df, x="condition", y="m6A_frac", color="chr_type", title=plot_title, points="all")
fig.update_layout(plot_bgcolor='white')
fig.show()

In [35]:
# Plot average m6A/A across the chromosome for each condition in a time course


# list of samples to consider
considered_samples = [0,1,2]

# Plot title
#plot_title = "AID::SDC-2 + Auxin; 2uM Hia5 Timecourse; m6A thresh = 75%"
plot_title = "Mean m6A/A on X Chromosome 200kb Regions;<br>3min 2uM Hia5 treatment; m6A thresh = " + str(m6A_thresh/254) + "%"

# Plot the boxplot
marker_colors =["#c45746","#16415e"]

plotly_conditions = conditions
#plotly_conditions = ["N2<br>No-Met","N2<br>3-min","N2<br>10-min","N2<br>30-min", "N2<br>120-min",
#"#021+Aux<br>No-Met","#021+Aux<br>3-min","#021+Aux<br>10-min","#021+Aux<br>30-min", "#021+Aux<br>120-min"]

fig = make_subplots(rows=1, cols=len(considered_samples),
                y_title = "m6A/A",
                shared_yaxes=True,
                subplot_titles=(list( plotly_conditions[i] for i in considered_samples )))

plot_iter=0
print("weighted_combined_regions ",weighted_combined_regions)
for i in considered_samples:
    tube_df = weighted_combined_regions.loc[weighted_combined_regions['condition']==conditions[i]]
    chr_type = "Autosome"
    df_plot=tube_df.loc[tube_df['chr_type']==chr_type]
    #df_plot=tube_df.sample(frac=17/100,replace=False,random_state=1)
    trace0 = go.Box(x=df_plot['condition']+" ", y=df_plot['m6A_frac'], #+ " " makes box plots not overlap
                         name=chr_type, marker_color =marker_colors[1],)
    chr_type = "X"
    df_plot=tube_df.loc[tube_df['chr_type']==chr_type]
    trace1 = go.Box(x=df_plot['condition'], y=df_plot['m6A_frac'],
                         name=chr_type, marker_color=marker_colors[0])#, #add scatter points
                            #boxpoints='all', jitter=0.4, pointpos=0) #jitter for SDC-2 degron and N2 only for 3min
    plot_iter += 1
    fig.append_trace(trace0, row = 1, col = plot_iter)
    fig.append_trace(trace1, row = 1, col = plot_iter)
    
# remove boxplot fill color
fig.update_traces(fillcolor='rgba(0,0,0,0)')
fig['layout'].update(height = 600,width = 1000)
fig.update_layout(template="plotly_white",title=plot_title)
fig.update_xaxes(showticklabels=False)
fig.update_annotations(font_size=12)
fig.update_traces(marker=dict(size=3))
'''fig = add_p_value_annotation(fig, [[0,1]], 1, _format=dict(interline=0.07, text_height=1.07, color='black'))
fig = add_p_value_annotation(fig, [[0,1]], 2, _format=dict(interline=0.07, text_height=1.07, color='black'))
fig = add_p_value_annotation(fig, [[0,1]], 3, _format=dict(interline=0.07, text_height=1.07, color='black'))
fig = add_p_value_annotation(fig, [[0,1]], 4, _format=dict(interline=0.07, text_height=1.07, color='black'))
fig = add_p_value_annotation(fig, [[0,1]], 5, _format=dict(interline=0.07, text_height=1.07, color='black'))'''
#fig.update_layout(boxmode='group', xaxis_tickangle=0)

for i in range(0,len([0,10])):
    fig.layout.annotations[i].update(y=-0.1)
fig.update_yaxes(tickformat="1%")
fig.show()
#Export plotly figure to .svg
fig.write_image(output_stem + "combined_regions_"  + str(m6A_thresh) +".svg")

weighted_combined_regions           chromosome  start       end region_type  chr_type  total_bases  \
0      CHROMOSOME_I      0  15073434   whole_chr  Autosome     39194324   
1      CHROMOSOME_I      0  15073434   whole_chr  Autosome     39194324   
2      CHROMOSOME_I      0  15073434   whole_chr  Autosome     39194324   
3      CHROMOSOME_I      0  15073434   whole_chr  Autosome     39194324   
4      CHROMOSOME_I      0  15073434   whole_chr  Autosome     39194324   
...             ...    ...       ...         ...       ...          ...   
12316  CHROMOSOME_X      0  17719942   whole_chr         X    139100269   
12317  CHROMOSOME_X      0  17719942   whole_chr         X    139100269   
12318  CHROMOSOME_X      0  17719942   whole_chr         X    139100269   
12319  CHROMOSOME_X      0  17719942   whole_chr         X    139100269   
12320  CHROMOSOME_X      0  17719942   whole_chr         X    139100269   

       total_m6A  overlapping_reads  m6A_frac  norm_m6A_frac  \
0       

In [36]:
### Plot average m6A by timepoint

# list of samples to consider
considered_samples = [0,1,2]

# Plot title
#plot_title = "AID::SDC-2 + Auxin; 2uM Hia5 Timecourse; m6A thresh = 75%"
plot_title = "Mean m6A/A across X Chromosome;<br> Varying 2uM Hia5 treatment; m6A thresh = 75%"

# Plot the boxplot
marker_colors =["#c45746","#16415e"]

color_map = {'#021+Aux': "#c45746", 'N2': "#16415e"}

# Initiate GO figure
fig = make_subplots(rows=1, cols=2,
                y_title = "Change in m6A/A",
                x_title = "Methylation Duration (min)",
                shared_yaxes=True,
                shared_xaxes=False,
                subplot_titles=(["Autosome","X Chromosome"]))

# Set tube_df dataframe to chromosome_m6A_frac table only where condition index is in considered_samples list
tube_df = chromosome_m6A_frac.loc[chromosome_m6A_frac['condition'].isin(list( conditions[i] for i in considered_samples ))]
# Convert condition_min column to integers
tube_df['condition_min'] = tube_df['condition_min'].astype(int)
#Sort tube_df by conditon_min then by chr_type
tube_df = tube_df.sort_values(by=['condition_min','chr_type'])
# Add column for subtracting chr_type X from Autosome for each condition and dividing by autosome value
tube_df['m6A_frac_diff_X_from_A'] = tube_df.groupby(['condition'])['m6A_frac'].apply(lambda x: x - x.iloc[0])
# Add column for 'm6A_frac_diff_X_from_A' divided by autosome value for each condition
tube_df['m6A_frac_diff_X_from_A_per'] = tube_df.groupby(['condition'])['m6A_frac_diff_X_from_A'].apply(lambda x: x / x.iloc[0])

print(tube_df)

chr_type = "Autosome"
df_plot=tube_df.loc[tube_df['chr_type']==chr_type]
#df_plot=tube_df.sample(frac=17/100,replace=False,random_state=1)
for genotype in df_plot['genotype'].unique():
    df_plot_2 = df_plot[df_plot['genotype'] == genotype]
    trace = go.Scatter(x=df_plot_2['condition_min'], y=df_plot_2['m6A_frac_diff_X_from_A'], mode='lines+markers', name=genotype,line=dict(dash='dash', color=color_map[genotype]))
    fig.append_trace(trace,row = 1, col = 1)

chr_type = "X"
df_plot=tube_df.loc[tube_df['chr_type']==chr_type]
#df_plot=tube_df.sample(frac=17/100,replace=False,random_state=1)
for genotype in df_plot['genotype'].unique():
    df_plot_2 = df_plot[df_plot['genotype'] == genotype]
    trace = go.Scatter(x=df_plot_2['condition_min'], y=df_plot_2['m6A_frac_diff_X_from_A'], mode='lines+markers', name=genotype,line=dict(dash='dash', color=color_map[genotype]))
    fig.append_trace(trace,row = 1, col = 2)

#fig.append_trace(trace0, row = 1, col = 1)
#fig.append_trace(trace1, row = 1, col = 2)

# remove boxplot fill color
# fig.update_traces(fillcolor='rgba(0,0,0,0)')
fig['layout'].update(height = 600,width = 1000)
fig.update_layout(template="plotly_white",title=plot_title)
fig.update_xaxes(showticklabels=True)
#fig.update_yaxes(range=[0.035, 0.08])
fig.update_annotations(font_size=14)
fig.update_traces(marker=dict(size=3))
# change line color to grey and marker color to based on genotype

'''fig = add_p_value_annotation(fig, [[0,1]], 1, _format=dict(interline=0.07, text_height=1.07, color='black'))
fig = add_p_value_annotation(fig, [[0,1]], 2, _format=dict(interline=0.07, text_height=1.07, color='black'))
fig = add_p_value_annotation(fig, [[0,1]], 3, _format=dict(interline=0.07, text_height=1.07, color='black'))
fig = add_p_value_annotation(fig, [[0,1]], 4, _format=dict(interline=0.07, text_height=1.07, color='black'))
fig = add_p_value_annotation(fig, [[0,1]], 5, _format=dict(interline=0.07, text_height=1.07, color='black'))'''
#fig.update_layout(boxmode='group', xaxis_tickangle=0)

#fig.layout.annotations[0].update(y=-0.1)
#fig.layout.annotations[1].update(y=-0.1)
#fig.layout.annotations[1].update(y=-0.1)
#fig.layout.annotations[3].update(y=-0.1)
#fig.layout.annotations[4].update(y=-0.1)

fig.show()
#Export plotly figure to .svg
fig.write_image(output_stem + "change_in_m6A_30min_"  + str(m6A_thresh) +".svg")


                           condition  condition_min  chr_type  m6A_frac  \
0  AID_SDC_2 + Auxin; 2uM Hia5 30min            120  Autosome  0.327751   
2                   SDC2_AID_120_min            120  Autosome  0.238307   
4                    SDC2_AID_30_min            120  Autosome  0.229125   
1  AID_SDC_2 + Auxin; 2uM Hia5 30min            120         X  0.329548   
3                   SDC2_AID_120_min            120         X  0.235286   
5                    SDC2_AID_30_min            120         X  0.225366   

                            genotype  m6A_frac_diff  norm_m6A_frac_diff  \
0  AID_SDC_2 + Auxin; 2uM Hia5 30min            0.0                 0.0   
2                   SDC2_AID_120_min            0.0                 0.0   
4                    SDC2_AID_30_min            0.0                 0.0   
1  AID_SDC_2 + Auxin; 2uM Hia5 30min            0.0                 0.0   
3                   SDC2_AID_120_min            0.0                 0.0   
5                    SDC


Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)


Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)



KeyError: 'AID_SDC_2 + Auxin; 2uM Hia5 30min'

In [8]:
#source: https://stackoverflow.com/questions/67505252/plotly-box-p-value-significant-annotation
def add_p_value_annotation(fig, array_columns, subplot=None, _format=dict(interline=0.07, text_height=1.07, color='black')):
    ''' Adds notations giving the p-value between two box plot data (t-test two-sided comparison)

    Parameters:
    ----------
    fig: figure
        plotly boxplot figure
    array_columns: np.array
        array of which columns to compare
        e.g.: [[0,1], [1,2]] compares column 0 with 1 and 1 with 2
    subplot: None or int
        specifies if the figures has subplots and what subplot to add the notation to
    _format: dict
        format characteristics for the lines

    Returns:
    -------
    fig: figure
        figure with the added notation
    '''
    # Specify in what y_range to plot for each pair of columns
    y_range = np.zeros([len(array_columns), 2])
    for i in range(len(array_columns)):
        y_range[i] = [1.01+i*_format['interline'], 1.02+i*_format['interline']]

    # Get values from figure
    fig_dict = fig.to_dict()

    # Get indices if working with subplots
    if subplot:
        if subplot == 1:
            subplot_str = ''
        else:
            subplot_str =str(subplot)
        indices = [] #Change the box index to the indices of the data for that subplot
        for index, data in enumerate(fig_dict['data']):
            #print(index, data['xaxis'], 'x' + subplot_str)
            if data['xaxis'] == 'x' + subplot_str:
                indices = np.append(indices, index)
        indices = [int(i) for i in indices]
        print((indices))
    else:
        subplot_str = ''

    # Print the p-values
    for index, column_pair in enumerate(array_columns):
        if subplot:
            data_pair = [indices[column_pair[0]], indices[column_pair[1]]]
        else:
            data_pair = column_pair

        # Mare sure it is selecting the data and subplot you want
        #print('0:', fig_dict['data'][data_pair[0]]['name'], fig_dict['data'][data_pair[0]]['xaxis'])
        #print('1:', fig_dict['data'][data_pair[1]]['name'], fig_dict['data'][data_pair[1]]['xaxis'])

        # Get the p-value
        pvalue = stats.ttest_ind(
            fig_dict['data'][data_pair[0]]['y'],
            fig_dict['data'][data_pair[1]]['y'],
            equal_var=False,
        )[1]
        if pvalue >= 0.05:
            symbol = 'ns'
        elif pvalue >= 0.01:
            symbol = '*'
        elif pvalue >= 0.001:
            symbol = '**'
        else:
            symbol = '***'
        # Vertical line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[0], y0=y_range[index][0],
            x1=column_pair[0], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        # Horizontal line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[0], y0=y_range[index][1],
            x1=column_pair[1], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        # Vertical line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[1], y0=y_range[index][0],
            x1=column_pair[1], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        ## add text at the correct x, y coordinates
        ## for bars, there is a direct mapping from the bar number to 0, 1, 2...
        fig.add_annotation(dict(font=dict(color=_format['color'],size=14),
            x=(column_pair[0] + column_pair[1])/2,
            y=y_range[index][1]*_format['text_height'],
            showarrow=False,
            text=symbol,
            textangle=0,
            xref="x"+subplot_str,
            yref="y"+subplot_str+" domain"
        ))
    return fig

In [27]:
# Plot the boxplot
marker_colors =["#c45746","#16415e"]

fig = make_subplots(rows=1, cols=len(conditions),
                y_title = "Whole Chromosome m6A/A",
                shared_yaxes=True,
                subplot_titles=(conditions))

for i in range(0,len(conditions)):
    tube_df = combined_regions.loc[combined_regions['condition']==conditions[i]]
    chr_type = "Autosome"
    df_plot=tube_df.loc[tube_df['chr_type']==chr_type]
    df_plot=tube_df.sample(frac=17/100,replace=False,random_state=1)
    m6A_frac_tube = [df_plot['total_m6A'].sum()/df_plot['total_bases'].sum()]
    print(m6A_frac_tube)
    trace0 = go.Bar(x=df_plot['condition']+" ", y=m6A_frac_tube,
                         name=chr_type, marker_color =marker_colors[0])

    chr_type = "X"
    df_plot=tube_df.loc[tube_df['chr_type']==chr_type]
    m6A_frac_tube = [df_plot['total_m6A'].sum()/df_plot['total_bases'].sum()]
    trace1 = go.Bar(x=df_plot['condition'], y=m6A_frac_tube,
                         name=chr_type, marker_color =marker_colors[1])

    fig.append_trace(trace0, row = 1, col = i+1)
    fig.append_trace(trace1, row = 1, col = i+1)
    
fig['layout'].update(height = 800)
fig.update_layout(template="plotly_white")
fig.update_xaxes(showticklabels=False)
#fig.update_yaxes(range=[0.7, 1.3])
    
#fig.update_layout(boxmode='group', xaxis_tickangle=0)
fig.show()

[0.06123028839323767]
[0.05535863219349458]


In [28]:
# Plot the boxplot
marker_colors =["#fde725","#a0da39","#4ac16d"]#,"#1fa187","#277f8e","#365c8d","#46327e","#440154","#c45746","#16415e"]

plotly_conditions = conditions

fig = make_subplots(rows=1, cols=len(conditions),
                y_title = "Coverage",
                shared_yaxes=True,
                subplot_titles=(plotly_conditions))

print("Total MB aligned for ALL conditons: ",int(combined_regions['total_bases'].sum()/1000000),
     " | across ", int(combined_regions['overlapping_reads'].sum())," reads with avg. length of: ",
     int(combined_regions['total_bases'].sum()/combined_regions['overlapping_reads'].sum()))
for i in range(0,len(conditions)):
    tube_df = combined_regions.loc[combined_regions['condition']==conditions[i]]
    m6A_frac_tube = [tube_df['total_bases'].sum()/100000000*3.125] #3.125 is the scaling factor for adenosines in c elegans genome.
    print("Total MB aligned for ",conditions[i],
          ": ",int(tube_df['total_bases'].sum()/1000000), 
          " | across ", int(tube_df['overlapping_reads'].sum()),
          " reads with avg. length of: ",
          int(tube_df['total_bases'].sum()/tube_df['overlapping_reads'].sum()))
    trace0 = go.Bar(x=tube_df['condition']+" ", y=m6A_frac_tube,
                         name=plotly_conditions[i], marker_color =marker_colors[i])

    fig.append_trace(trace0, row = 1, col = i+1)
    
fig['layout'].update(height = 800)
fig.update_layout(template="plotly_white")
fig.update_xaxes(showticklabels=False)
#fig.update_yaxes(range=[0.7, 1.3])
    
#fig.update_layout(boxmode='group', xaxis_tickangle=0)
fig.show()

Total MB aligned for ALL conditons:  0  | across  291  reads with avg. length of:  413
Total MB aligned for  N2-DPY27_dimelo_pAHia5 :  0  | across  130  reads with avg. length of:  414
Total MB aligned for  N2-DPY27_dimelo_RbNbHia5 :  0  | across  161  reads with avg. length of:  412
