In [1]:
# Purpose of this script is to take in a .bam file and a .bed file and determine single fiber nucleosome positioning
__author__ = "Yuri Malina"
__contact__ = "ymalina@berkeley.edu"
__copyright__ = "The Meyer Lab, UC Berkeley"
__credits__ = [""]
__date__ = "2/6/2023"
__deprecated__ = False
__status__ = "In development"
__version__ = "0.0.1"

In [1]:
import pandas as pd
#import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import pysam
import numpy
import random
import io
from scipy import stats
from IPython.display import clear_output
# install tabix with:
# apt-get install tabix 

# For DANPOS
import gzip
import sys  
sys.path.insert(0, '/Data1/software/DANPOS3')
from danpos import danpos
import os,sys
from time import time
from functions import danpos, div
from math import log10
from copy import deepcopy
from wig import Wig
from wigs import Wigs
import argparse,sys,os
from lib import positionSelectorByGreatTSS,positionDicMinMax,vioplot,positionDistance,positionSelectorByValue,positionSelectorByGeneStructure,batchOccAroundPoints,batchOccInRegions,occAroundPoints,plot,batchOccPSD,retrieve_positions_by_value,batchPositionDistanceDistribution,batchPositionValDistribution
from wiq import rawsort,refquantile,changevalue,qnorwig,wiq,wig2wiq
from rpy2.robjects import r

sys.path.insert(0, '/Data1/git/meyer-nanopore/scripts/Analysis/')
from invert_wiggle import norm_wig, invert_wig, fill_wig, write_wig


In [11]:
### Configurations
m6A_thresh = 129 #default is 129
mC_thresh = 129 #default is 129
coreNum = 96 # cores to use

## Bed file configurations:
window = 1000 #window to add on either side of region start / end (Necessary for TSS and TES)
sample_source = "type" # "chr_type" or "type" or "chromosome"
sampleName = ["CHROMOSOME_I", "CHROMOSOME_II", "CHROMOSOME_III", "CHROMOSOME_IV", "CHROMOSOME_V","CHROMOSOME_X"] # "TES_q1" "strong_rex" "weak_rex" "type", "X", "Autosome"; Must be same number of unique values in selected bed rows.
chr_type_selected = ["X","Autosome"] # 'X' or "Autosome"
type_selected = ["TSS_q1"] #TES_q1-4 | #TSS_q1-4 | strong/weak rex | whole_chr | 200kb_region | 50kb_region | mex_motif | mexII_motif
max_regions = 100 # max regions to consider; 0 = full set;
chromosome_selected = ["CHROMOSOME_I", "CHROMOSOME_II", "CHROMOSOME_III", "CHROMOSOME_IV", "CHROMOSOME_V","CHROMOSOME_X"]
strand_selected = ["+"] #+ and/or -
bed_file = "~/Data1/reference/tss_tes_rex_combined.bed"
mods = "A" # {A,CG,A+CG}
if sample_source == "chr_type":
    selection = chr_type_selected
if sample_source == "type":
    selection = type_selected
if sample_source == "chromosome":
    selection = chromosome_selected

## Bam file configurations
random.seed(10)

### Tube D
bam_frac = 1 # For full .bam set to = 1
bam_file = "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/mod_mappings.sorted.bam"
output_stem = "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/"
condition = "N2; 2uM Hia5 30min"

### Tube 4
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam"
#output_stem = "/Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/"
#condition = "N2; 2uM Hia5 120min"

### Tube H
#bam_frac = 1 # For full .bam set to = 1
#bam_file = "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/mod_mappings.sorted.bam"
#output_stem = "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/basecalls/m6A/"
#condition = "AID::SDC-2 + Auxin; 2uM Hia5 30min"

In [11]:
### Select bed file
full_bed = pd.read_csv(bed_file,sep='\t')
bed=[]

for each_type in selection:
# REGION CONFIGURATION
    if sample_source == "type":
        temp_bed = full_bed[full_bed["chromosome"].isin(chromosome_selected) &
                            full_bed["chr-type"].isin(chr_type_selected) &
                            full_bed["type"].str.contains(each_type) &
                            full_bed["strand"].isin(strand_selected)]
    if sample_source == "chr_type":
        temp_bed = full_bed[full_bed["chromosome"].isin(chromosome_selected) &
                            full_bed["chr-type"].str.contains(each_type) &
                            full_bed["type"].isin(type_selected) &
                            full_bed["strand"].isin(strand_selected)]
    if sample_source == "chromosome":
        temp_bed = full_bed[full_bed["chromosome"].str.contains(each_type) &
                            full_bed["chr-type"].isin(chr_type_selected) &
                            full_bed["type"].isin(type_selected) &
                            full_bed["strand"].isin(strand_selected)]

    # Drop random regions to match max_regions
    drop_count = len(temp_bed)-max_regions
    # If max regions > selected regions, do not drop any.
    if(drop_count<0):
        drop_count=0
    # If max_regions = 0, do not drop any.
    if (max_regions == 0):
        drop_count = 0

    drop_indices = numpy.random.choice(temp_bed.index, drop_count, replace=False)
    temp_bed.drop(drop_indices,inplace=True)
    temp_bed.sort_values(by=["chromosome","start"],ascending=True,inplace=True)
    temp_bed.reset_index(drop=True, inplace=True)
    temp_bed["start"]=temp_bed["start"] - window
    temp_bed["end"]=temp_bed["end"] + window
    temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed"
    temp_bedfile_gz = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
    temp_bed.to_csv(temp_bedfile, sep="\t",header=False,index=False)
    
    # Create indexed tabix files
    print(temp_bed)
    ! bgzip -c {temp_bedfile} > {temp_bedfile_gz}
    ! tabix -f -p bed {temp_bedfile_gz}

    # For first iteration
    if bed == []:
        bed = [temp_bedfile]

    # Otherwise append region to temporary bed file.
    else:
        bed.append(temp_bedfile)

      chromosome     start       end strand    type  chr-type
0   CHROMOSOME_I   1697180   1699180      +  TSS_q1  Autosome
1   CHROMOSOME_I   1873124   1875124      +  TSS_q1  Autosome
2   CHROMOSOME_I   4253239   4255239      +  TSS_q1  Autosome
3   CHROMOSOME_I   5356693   5358693      +  TSS_q1  Autosome
4   CHROMOSOME_I   6259448   6261448      +  TSS_q1  Autosome
..           ...       ...       ...    ...     ...       ...
95  CHROMOSOME_X  15705426  15707426      +  TSS_q1         X
96  CHROMOSOME_X  16349201  16351201      +  TSS_q1         X
97  CHROMOSOME_X  16726160  16728160      +  TSS_q1         X
98  CHROMOSOME_X  16960668  16962668      +  TSS_q1         X
99  CHROMOSOME_X  17433488  17435488      +  TSS_q1         X

[100 rows x 6 columns]




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [12]:
### Extract only reads in bam file that overlap selected regions, and subselect down using fraction
# Define function to generate redux .bam file
def extract_overlapping_reads(bed_file, bam_file, output_file):
    # Load the BAM file
    bam_ext = pysam.AlignmentFile(bam_file, "rb")

    # Load the BED file
    regions = pysam.TabixFile(bed_file)

    '''# Create the output BAM file
    out_bam = pysam.AlignmentFile(output_file, "wb", template=bam)

    # Iterate over the reads in the input BAM file
    for read in bam:
        # Check if the read overlaps a region in the BED file
        overlaps = [r for r in regions.fetch(read.reference_name, read.reference_start, read.reference_end)]
        if len(overlaps) > 0:
            # Write the read to the output BAM file if it overlaps a region in the BED file
            out_bam.write(read)

    # Close the input BAM file and the output BAM file
    bam.close()
    out_bam.close()'''

    # Create a set to store the reads
    seen = set()

    # Create the output .bam file
    with pysam.AlignmentFile(output_file, "wb", header=bam_ext.header) as out:
        # Iterate over the regions in the .bed file
        for region in regions.fetch():
            print(region)
            chrom, start, end, strand, region_type, chr_type = region.split()
            start, end = int(start), int(end)

            # Iterate over the reads in the current region
            for pileupcolumn in bam_ext.pileup(chrom, start, end):
                for pileupread in pileupcolumn.pileups:
                    # Check if the read has not been seen before
                    if pileupread.alignment.query_name not in seen:
                        # Add the read to the set of seen reads
                        seen.add(pileupread.alignment.query_name)
                        # Write the read to the output .bam file
                        out.write(pileupread.alignment)
    
for each_type in selection:
    temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
    output_bamfile = output_stem + "mod_mappings_" + str(bam_frac)+"_"+each_type+".sorted.bam"
    extract_overlapping_reads(temp_bedfile, bam_file, output_bamfile)
    ### Subselect bam file using fraction
    ! samtools view -h -s {bam_frac} -L {temp_bedfile} {bam_file} | samtools view -h -b - > {output_bamfile}
    ! samtools index {output_bamfile}

CHROMOSOME_I	1697180	1699180	+	TSS_q1	Autosome
CHROMOSOME_I	1873124	1875124	+	TSS_q1	Autosome
CHROMOSOME_I	4253239	4255239	+	TSS_q1	Autosome
CHROMOSOME_I	5356693	5358693	+	TSS_q1	Autosome
CHROMOSOME_I	6259448	6261448	+	TSS_q1	Autosome
CHROMOSOME_I	6316431	6318431	+	TSS_q1	Autosome
CHROMOSOME_I	6842598	6844598	+	TSS_q1	Autosome
CHROMOSOME_I	9721583	9723583	+	TSS_q1	Autosome
CHROMOSOME_I	9868399	9870399	+	TSS_q1	Autosome
CHROMOSOME_I	14003415	14005415	+	TSS_q1	Autosome
CHROMOSOME_II	867469	869469	+	TSS_q1	Autosome
CHROMOSOME_II	4903565	4905565	+	TSS_q1	Autosome
CHROMOSOME_II	5202258	5204258	+	TSS_q1	Autosome
CHROMOSOME_II	6369394	6371394	+	TSS_q1	Autosome
CHROMOSOME_II	6782099	6784099	+	TSS_q1	Autosome
CHROMOSOME_II	6925685	6927685	+	TSS_q1	Autosome
CHROMOSOME_II	7088532	7090532	+	TSS_q1	Autosome
CHROMOSOME_II	7536856	7538856	+	TSS_q1	Autosome
CHROMOSOME_II	7614657	7616657	+	TSS_q1	Autosome
CHROMOSOME_II	7838857	7840857	+	TSS_q1	Autosome
CHROMOSOME_II	11304884	11306884	+	TSS_q1	Autosome


In [13]:
### Caculate average m6A per region based on input bed and bam files
def extract_fiber_nucpos(bam_file, bed_file, threshold):
    # Load the BAM file
    bam_ext = pysam.AlignmentFile(bam_file, "rb")
    print(bam_ext.count())
    # Load the BED file
    regions = pysam.TabixFile(bed_file)

    # Initialize a list to store the results
    results_df = pd.DataFrame(columns=['chr', 'start', 'end', 'smt_pos', 'read_id'])

    # Iterate over the regions in the BED file
    #methylation_status = pd.DataFrame(columns=["chr","start","end","strand","region_type","chr_type",
    #                               "read","1","2","3","4","5","6","7","8","9","10","11","12"])
    region_count = 0
    for region in regions.fetch():
        region_count += 1
        # Split the region string into the chromosome, start, and end positions
        print("Starting on region:",region_count," out of ",max_regions)
        chromosome, start, end, strand, region_type, chr_type = region.split()
        start = int(start)
        end = int(end)
       
        for read in bam_ext.fetch(chromosome, start, end):
            if not read.is_unmapped and read.is_forward == True:
                #Load up the modified base probilities
                m6A_dict = read.modified_bases[('A', 0, 'Y')] #('A', 0, 'a')

                #Convert list of tuples to dataframe in .wig format
                read_wig = pd.DataFrame(m6A_dict, columns =['variableStep', 'chrom='+chromosome])
                read_wig['span=1']=""
                read_wig['variableStep']=read_wig['variableStep']+read.reference_start

                #Save temp .wig file
                read_wig.to_csv(output_stem+'temp.wig',sep='\t',index=False)

                #open .wig as file in memory
                wig_file_obj = open(os.path.join(output_stem, "temp.wig"),"r")
                wig_file = io.StringIO(wig_file_obj.read())
                wig_file.seek(0)

                #Normalize wig file (divides by 256 so value is between 0-1)
                wig_file_norm = norm_wig(wig_file)
                #print("wig_file_norm",wig_file_norm.readlines())
                #wig_file_norm.seek(0)

                #Invert wig file, since high methylation indicates no protein occupancy. Also puts on 0-10 scale.
                wig_file_inv = invert_wig(wig_file_norm,10)
                #print("wig_file_inv",wig_file_inv.readlines())
                #wig_file_inv.seek(0)

                #Output updated temp.wig file
                write_wig(wig_file_inv,os.path.join(output_stem, "temp.wig"))

                #read_wig.to_csv(output_stem+'temp.wig',sep='\t',index=False)
                temp_result = os.path.join(output_stem, "danpos_temp_height1")
                #print("Starting Danpos on:",wig_file_inv)
                
                #prevent from outputting verbose logs:
                old_stdout = sys.stdout # backup current stdout
                sys.stdout = open(os.devnull, "w")
                danpos(os.path.join(output_stem, "temp.wig"),height=1,opath=temp_result)
                sys.stdout = old_stdout # reset old stdout

                #read nucleosome position into file
                read_nucs = pd.read_csv(os.path.join(output_stem+"danpos_temp_height1/pooled/", "Data1_seq_data_TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22_basecalls_m6A_temp.smooth.positions.xls"), sep="\t")
                read_nucs = read_nucs.drop(columns=["smt_value","fuzziness_score"])
                read_nucs["read_id"] = read.query_name
                read_nucs["chr_type"] = chr_type
                read_nucs = read_nucs[((read_nucs['start']>start) & (read_nucs['end'] <= end))]
                read_nucs['smt_pos'] = read_nucs['smt_pos'] - (start+window)
                results_df = pd.concat([results_df, read_nucs])
                #print("RESULTS:",results_df)
                    
    return results_df

                    
'''                    seq_row = [chromosome, start, end, strand, region_type, chr_type,read_name]
                    total_A = 0

                    #print(m6A_dict)
                    
                    for (i, base) in enumerate(read.seq):
                        #print("start:",start," | read.reference_start + i",read.reference_start + i," | end:",end)
                        if base == "A":
                            total_A += 1
                        if start-1 <= read.reference_start + i < end:
                            #print("i,base",i," , ", base)
                            if base == "A":
                                #print("i: ",i)
                                try: base = [item for item in m6A_dict if item[0] == i][0][1]
                                except: base = 0
                            else:
                                base = numpy.NaN
                            seq_row.append(base)
                            #print("seq_row",seq_row)
                    #print("seq_row:",seq_row)
                    if len(seq_row) == len(methylation_status.columns):
                        methylation_status.loc[len(methylation_status)+1] = seq_row
    print("METH STATUS=",methylation_status)
    return methylation_status'''

for each_type in selection:
    bam = output_stem + "mod_mappings_" + str(bam_frac)+"_"+each_type+".sorted.bam"
    temp_bedfile = "/Data1/reference/temp_do_not_use_"+each_type+".bed.gz"
    output_df = extract_fiber_nucpos(bam,temp_bedfile,m6A_thresh)
    #output_df.to_csv(output_stem + "m6A_frac" + str(bam_frac)+"_"+each_type+".csv", index=False, mode='w')
    print(output_df)

14224
Starting on region: 1  out of  100
Progress: 10.00%
Progress: 20.00%
Progress: 29.99%
Progress: 39.99%
Progress: 49.99%
Progress: 59.99%
Progress: 69.98%
Progress: 79.98%
Progress: 89.98%
Progress: 99.98%
SAVING OUTPUT TO: /Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/temp.wig


FileNotFoundError: [Errno 2] No such file or directory: '/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/basecalls/m6A/danpos_temp_height1/pooled/Data1_seq_data_TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19_basecalls_m6A_temp.smooth.positions.xls'

In [8]:
#Convert output df to n+1 table
nuc_df = output_df
nuc_df['smt_pos']=pd.to_numeric(nuc_df['smt_pos'], errors='coerce')
nuc_df = nuc_df.sort_values(by=['read_id','smt_pos'])


nuc_df['n-5'] = nuc_df['smt_pos'].groupby(nuc_df['read_id']).transform(lambda x: 
                                                                       x[x < 0].nlargest(len(x[x < 0])).iloc[4] if 
                                                                       len(x[x < 0]) >= 5 else numpy.nan)
nuc_df['n-4'] = nuc_df['smt_pos'].groupby(nuc_df['read_id']).transform(lambda x: 
                                                                       x[x < 0].nlargest(len(x[x < 0])).iloc[3] if 
                                                                       len(x[x < 0]) >= 4 else numpy.nan)
nuc_df['n-3'] = nuc_df['smt_pos'].groupby(nuc_df['read_id']).transform(lambda x: 
                                                                       x[x < 0].nlargest(len(x[x < 0])).iloc[2] if 
                                                                       len(x[x < 0]) >= 3 else numpy.nan)
nuc_df['n-2'] = nuc_df['smt_pos'].groupby(nuc_df['read_id']).transform(lambda x: 
                                                                       x[x < 0].nlargest(len(x[x < 0])).iloc[1] if 
                                                                       len(x[x < 0]) >= 2 else numpy.nan)
nuc_df['n-1'] = nuc_df['smt_pos'].groupby(nuc_df['read_id']).transform(lambda x: 
                                                                       x[x < 0].nlargest(len(x[x < 0])).iloc[0] if 
                                                                       len(x[x < 0]) >= 1 else numpy.nan)
nuc_df['n+1'] = nuc_df['smt_pos'].groupby(nuc_df['read_id']).transform(lambda x: 
                                                                       x[x > 0].nsmallest(len(x[x > 0])).iloc[0] if 
                                                                       len(x[x > 0]) >= 1 else numpy.nan)
nuc_df['n+2'] = nuc_df['smt_pos'].groupby(nuc_df['read_id']).transform(lambda x: 
                                                                       x[x > 0].nsmallest(len(x[x > 0])).iloc[1] if 
                                                                       len(x[x > 0]) >= 2 else numpy.nan)
nuc_df['n+3'] = nuc_df['smt_pos'].groupby(nuc_df['read_id']).transform(lambda x: 
                                                                       x[x > 0].nsmallest(len(x[x > 0])).iloc[2] if 
                                                                       len(x[x > 0]) >= 3 else numpy.nan)
nuc_df['n+4'] = nuc_df['smt_pos'].groupby(nuc_df['read_id']).transform(lambda x: 
                                                                       x[x > 0].nsmallest(len(x[x > 0])).iloc[3] if 
                                                                       len(x[x > 0]) >= 4 else numpy.nan)
nuc_df['n+5'] = nuc_df['smt_pos'].groupby(nuc_df['read_id']).transform(lambda x: 
                                                                       x[x > 0].nsmallest(len(x[x > 0])).iloc[4] if 
                                                                       len(x[x > 0]) >= 5 else numpy.nan)

# create dataframe with single row for each read
nuc_pos_df = nuc_df[['chr_type','read_id','n-5','n-4','n-3','n-2','n-1','n+1','n+2','n+3','n+4','n+5']]
nuc_pos_df = nuc_pos_df.drop_duplicates()

# create dataframe with mean n+x nucleosome position by chr_type
nuc_pos_mean_df = nuc_pos_df.groupby(['chr_type']).mean().reset_index()
print(nuc_pos_mean_df)
lookup_m1 = nuc_pos_mean_df.set_index('chr_type')['n-1'].to_dict()
lookup_m2 = nuc_pos_mean_df.set_index('chr_type')['n-2'].to_dict()
lookup_m3 = nuc_pos_mean_df.set_index('chr_type')['n-3'].to_dict()
lookup_m4 = nuc_pos_mean_df.set_index('chr_type')['n-4'].to_dict()
lookup_m5 = nuc_pos_mean_df.set_index('chr_type')['n-5'].to_dict()
lookup_p1 = nuc_pos_mean_df.set_index('chr_type')['n+1'].to_dict()
lookup_p2 = nuc_pos_mean_df.set_index('chr_type')['n+2'].to_dict()
lookup_p3 = nuc_pos_mean_df.set_index('chr_type')['n+3'].to_dict()
lookup_p4 = nuc_pos_mean_df.set_index('chr_type')['n+4'].to_dict()
lookup_p5 = nuc_pos_mean_df.set_index('chr_type')['n+5'].to_dict()
print(nuc_pos_df)
print(nuc_pos_mean_df)

# Create dataframe with nucleosome offset for each read
nuc_offset_df = nuc_pos_df

#Add average offset for each nucleosome by looking up in previous dictionaries.
nuc_offset_df['n-5_mean'] = nuc_offset_df['chr_type'].map(lookup_m5)
nuc_offset_df['n-4_mean'] = nuc_offset_df['chr_type'].map(lookup_m4)
nuc_offset_df['n-3_mean'] = nuc_offset_df['chr_type'].map(lookup_m3)
nuc_offset_df['n-2_mean'] = nuc_offset_df['chr_type'].map(lookup_m2)
nuc_offset_df['n-1_mean'] = nuc_offset_df['chr_type'].map(lookup_m1)
nuc_offset_df['n+1_mean'] = nuc_offset_df['chr_type'].map(lookup_p1)
nuc_offset_df['n+2_mean'] = nuc_offset_df['chr_type'].map(lookup_p2)
nuc_offset_df['n+3_mean'] = nuc_offset_df['chr_type'].map(lookup_p3)
nuc_offset_df['n+4_mean'] = nuc_offset_df['chr_type'].map(lookup_p4)
nuc_offset_df['n+5_mean'] = nuc_offset_df['chr_type'].map(lookup_p5)

#update offset columns
nuc_offset_df['n-5_off']=abs(nuc_offset_df['n-5']-nuc_offset_df['n-5_mean'])
nuc_offset_df['n-4_off']=abs(nuc_offset_df['n-4']-nuc_offset_df['n-4_mean'])
nuc_offset_df['n-3_off']=abs(nuc_offset_df['n-3']-nuc_offset_df['n-3_mean'])
nuc_offset_df['n-2_off']=abs(nuc_offset_df['n-2']-nuc_offset_df['n-2_mean'])
nuc_offset_df['n-1_off']=abs(nuc_offset_df['n-1']-nuc_offset_df['n-1_mean'])
nuc_offset_df['n+1_off']=abs(nuc_offset_df['n+1']-nuc_offset_df['n+1_mean'])
nuc_offset_df['n+2_off']=abs(nuc_offset_df['n+2']-nuc_offset_df['n+2_mean'])
nuc_offset_df['n+3_off']=abs(nuc_offset_df['n+3']-nuc_offset_df['n+3_mean'])
nuc_offset_df['n+4_off']=abs(nuc_offset_df['n+4']-nuc_offset_df['n+4_mean'])
nuc_offset_df['n+5_off']=abs(nuc_offset_df['n+5']-nuc_offset_df['n+5_mean'])
print(nuc_offset_df)


   chr_type         n-5         n-4         n-3         n-2         n-1  \
0  Autosome -760.294118 -772.065891 -677.984424 -460.528977 -204.755030   
1         X -734.146341 -692.400000 -677.206897 -457.830645 -211.411765   

          n+1         n+2         n+3         n+4         n+5  
0  191.051103  450.218519  683.884101  780.079710  703.571429  
1  205.487544  446.793233  652.900000  716.421053  720.428571  
     chr_type                               read_id    n-5    n-4    n-3  \
15   Autosome  0005df47-49f7-4804-8ea6-52291d25e716    NaN    NaN -744.0   
11   Autosome  0086b2a8-3909-49ac-be20-888a11cdf522    NaN    NaN -832.0   
250  Autosome  009f51c6-85f8-431d-a589-1e71d4a7e920    NaN    NaN -893.0   
3    Autosome  00b6e6e5-f688-4aec-a3ab-783865ba5683    NaN    NaN -805.0   
44   Autosome  00c4be0c-a683-4208-a397-5d38199fecf7 -866.0 -676.0 -506.0   
..        ...                                   ...    ...    ...    ...   
26   Autosome  fefe3a99-27b0-410a-a5ed-7b66e10f59c


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



In [15]:
#source: https://stackoverflow.com/questions/67505252/plotly-box-p-value-significant-annotation
def add_p_value_annotation(fig, array_columns, subplot=None, _format=dict(interline=0.07, text_height=1.07, color='black')):
    ''' Adds notations giving the p-value between two box plot data (t-test two-sided comparison)
    
    Parameters:
    ----------
    fig: figure
        plotly boxplot figure
    array_columns: np.array
        array of which columns to compare 
        e.g.: [[0,1], [1,2]] compares column 0 with 1 and 1 with 2
    subplot: None or int
        specifies if the figures has subplots and what subplot to add the notation to
    _format: dict
        format characteristics for the lines

    Returns:
    -------
    fig: figure
        figure with the added notation
    '''
    # Specify in what y_range to plot for each pair of columns
    y_range = numpy.zeros([len(array_columns), 2])
    for i in range(len(array_columns)):
        y_range[i] = [1.01+i*_format['interline'], 1.02+i*_format['interline']]

    # Get values from figure
    fig_dict = fig.to_dict()

    # Get indices if working with subplots
    if subplot:
        if subplot == 1:
            subplot_str = ''
        else:
            subplot_str =str(subplot)
        indices = [] #Change the box index to the indices of the data for that subplot
        for index, data in enumerate(fig_dict['data']):
            #print(index, data['xaxis'], 'x' + subplot_str)
            if data['xaxis'] == 'x' + subplot_str:
                indices = numpy.append(indices, index)
        indices = [int(i) for i in indices]
        print((indices))
    else:
        subplot_str = ''

    # Print the p-values
    for index, column_pair in enumerate(array_columns):
        if subplot:
            data_pair = [indices[column_pair[0]], indices[column_pair[1]]]
        else:
            data_pair = column_pair

        # Mare sure it is selecting the data and subplot you want
        #print('0:', fig_dict['data'][data_pair[0]]['name'], fig_dict['data'][data_pair[0]]['xaxis'])
        #print('1:', fig_dict['data'][data_pair[1]]['name'], fig_dict['data'][data_pair[1]]['xaxis'])

        # Get the p-value
        pvalue = stats.ttest_ind(
            fig_dict['data'][data_pair[0]]['y'],
            fig_dict['data'][data_pair[1]]['y'],
            equal_var=False,
        )[1]
        if pvalue >= 0.05:
            symbol = 'ns'
        elif pvalue >= 0.01: 
            symbol = '*'
        elif pvalue >= 0.001:
            symbol = '**'
        else:
            symbol = '***'
        # Vertical line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[0], y0=y_range[index][0], 
            x1=column_pair[0], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        # Horizontal line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[0], y0=y_range[index][1], 
            x1=column_pair[1], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        # Vertical line
        fig.add_shape(type="line",
            xref="x"+subplot_str, yref="y"+subplot_str+" domain",
            x0=column_pair[1], y0=y_range[index][0], 
            x1=column_pair[1], y1=y_range[index][1],
            line=dict(color=_format['color'], width=2,)
        )
        ## add text at the correct x, y coordinates
        ## for bars, there is a direct mapping from the bar number to 0, 1, 2...
        fig.add_annotation(dict(font=dict(color=_format['color'],size=14),
            x=(column_pair[0] + column_pair[1])/2,
            y=y_range[index][1]*_format['text_height'],
            showarrow=False,
            text=symbol,
            textangle=0,
            xref="x"+subplot_str,
            yref="y"+subplot_str+" domain"
        ))
    return fig

In [10]:
# Plot the boxplot
marker_colors =["#c45746","#16415e"]
output_df_melt = nuc_offset_df.melt(id_vars=["chr_type"],value_vars=['n-4_off','n-3_off','n-2_off','n-1_off',
                                                                    'n+1_off','n+2_off','n+3_off','n+4_off']).dropna()
#print(output_df_melt)
fig = px.box(output_df_melt, x="variable", y="value", color="chr_type")
fig.update_layout(template="plotly_white")
fig.show()


In [None]:
'''#fig=go.Figure()
# Source for subplot schema: https://stackoverflow.com/questions/55698429/different-box-plot-series-traces-within-plotly-subplots
# Tube 4
chr_type = "Autosome"
df_plot=nuc_offset_df.loc[nuc_offset_df['chr_type']==chr_type]
trace0 = go.Box(x=df_plot['chr_type']+" ", y=df_plot['n-5_off'],
                     notched=True,
                     name=chr_type, marker_color =marker_colors[0])
print(df_plot)

chr_type = "X"
df_plot=nuc_offset_df.loc[nuc_offset_df['chr_type']==chr_type]
trace1 = go.Box(x=df_plot['chr_type'], y=df_plot['n-5_off'],
                     notched=True,
                     name=chr_type, marker_color =marker_colors[1])

print(df_plot)

# Tube D
chr_type = "Autosome"
df_plot=tubeD_df.loc[tubeD_df['chr_type']==chr_type]
trace2 = go.Box(x=df_plot['condition']+" ", y=df_plot['norm_m6A_frac'],
                     notched=True,
                     name=chr_type, marker_color =marker_colors[0])

chr_type = "X"
df_plot=tubeD_df.loc[tubeD_df['chr_type']==chr_type]
trace3 = go.Box(x=df_plot['condition'], y=df_plot['norm_m6A_frac'],
                     notched=True,
                     name=chr_type, marker_color =marker_colors[1])

# Tube H
chr_type = "Autosome"
df_plot=tubeH_df.loc[tubeH_df['chr_type']==chr_type]
trace4 = go.Box(x=df_plot['condition']+" ", y=df_plot['norm_m6A_frac'],
                     notched=True,
                     name=chr_type, marker_color =marker_colors[0])

chr_type = "X"
df_plot=tubeH_df.loc[tubeH_df['chr_type']==chr_type]
trace5 = go.Box(x=df_plot['condition'], y=df_plot['norm_m6A_frac'],
                     notched=True,
                     name=chr_type, marker_color =marker_colors[1])

fig = make_subplots(rows=1, cols=1,
                    y_title = "m6A/A (normalized to average Autosome)",
                    shared_yaxes=True,
                    subplot_titles=("N2; <br> 2uM Hia5 120min"))
                                    #"N2; <br> 2uM Hia5 30min", 
                                    #"AID::SDC-2 + Auxin; <br> 2uM Hia5 30min"))
fig.append_trace(trace0, row = 1, col = 1)
fig.append_trace(trace1, row = 1, col = 1)
fig.append_trace(trace2, row = 1, col = 2)
fig.append_trace(trace3, row = 1, col = 2)
fig.append_trace(trace4, row = 1, col = 3)
fig.append_trace(trace5, row = 1, col = 3)
fig = add_p_value_annotation(fig, [[0,1]],0)
fig = add_p_value_annotation(fig, [[0,1]],2)
fig = add_p_value_annotation(fig, [[0,1]],3)
fig.update_yaxes(dtick=0.05)
fig.layout.annotations[0].update(y=-0.1)
fig.layout.annotations[2].update(y=-0.1)
fig.layout.annotations[1].update(y=-0.1)
fig['layout'].update(height = 600)
fig.update_layout(template="plotly_white")
fig.update_xaxes(showticklabels=False)
    
fig.update_layout(boxmode='group', xaxis_tickangle=0)
fig.show()'''

In [119]:
#output_df_mean = output_df[["chr_type","1","2","3","4","5","6","7","8","9","10","11","12"]].groupby('chr_type',as_index=False).mean().transpose()
output_df_melt = output_df.melt(id_vars=["chr_type"],value_vars=["1","2","3","4","5","6","7","8","9","10","11","12"]).dropna()
#print(output_df_melt)
output_df_melt['m6A_binary']=numpy.where(output_df_melt['value'] > m6A_thresh, True, False)
#print(output_df_melt)
output_df_count = output_df_melt.groupby(['variable','chr_type']).size().reset_index(name='count')
output_df_count['variable']=output_df_count['variable'].astype(int)
output_df_count=output_df_count.sort_values(by=["chr_type","variable"],ascending=True)
output_df_count['variable']=output_df_count['variable'].astype(str)
#print(output_df_count)
output_df_mean = output_df_melt.groupby(['variable','chr_type']).mean().reset_index()
output_df_mean['variable']=output_df_mean['variable'].astype(int)
output_df_mean=output_df_mean.sort_values(by=["chr_type","variable"],ascending=True)
output_df_mean['variable']=output_df_mean['variable'].astype(str)
#print(output_df_mean)
#print(output_df_melt)

# Plot the boxplot
marker_colors =["#c45746","#16415e"]

fig = px.box(output_df_melt, x="variable", y="value", color="chr_type")
fig2 = px.bar(output_df_mean, x="variable", y="m6A_binary", color="chr_type",barmode="group")
fig3 = px.bar(output_df_count, x="variable", y="count", color="chr_type",barmode="group")

#fig.show()
fig2.show()
fig3.show()