In [None]:
# Reads in nucleosome positions and performs nucleosome repeat length (NRL) analysis.

In [None]:
import os
import nanotools
import importlib
import pandas as pd
import matplotlib.pyplot as plt
import numpy
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression # for linear regression analysis
from scipy.signal import find_peaks # For finding peaks in a distribution
#sns.set_style('darkgrid')


In [None]:
%%bash
# Run modbam2bed (6mA, region CHROMOSOME_X:10000000-10100000) over the five
# BAM files listed below against the WS235 reference, and redirect the output
# to a single combined BED file.
# The cell must be executed by a shell: as plain Python the command line is a
# SyntaxError, hence the %%bash cell magic on the first line.
modbam2bed --extended -m 6mA -r CHROMOSOME_X:10000000-10100000 -t 6 --threshold 0.5 \
    /Data1/reference/c_elegans.WS235.genomic.fa \
    /Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/combined_pod5/barcode07/basecalls/barcode07.mod_mappings.sorted.bam \
    /Data1/seq_data/AH_N2_SDC2aid_AuxRem_fiberseq_8_19_23/combined_pod5/barcode08/basecalls/barcode08.mod_mappings.sorted.bam \
    /Data1/seq_data/TubeAB_FiberSeq_TimeC_N2_021Aux_4_10_23/basecalls/m6A_full/demux/mod_mappings_barcode_05.bam \
    /Data1/seq_data/Tube4_b2_2uM-Hia5_fiber-seq_11_21_22/basecalls/mod_mappings.sorted.m6Aonly.bam \
    /Data1/seq_data/TubeAD1_N2_fiberseq_6_13_23/mod_basecalls/mod_mappings.sorted.bam \
    > /Data1/seq_data/combined_N2_fiber/combined_N2_fiber.chrX100kb.m6A.bed

In [None]:
### Import dataframes
# Nucleosome position tables for the two samples (nucleosome cutoff = 9).
# For the cutoff = 10 tables, replace the `result_m6A_9` directory with
# `result_m6a_10` in both paths below.
position_table_paths = {
    "D": "/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/wig/output/result_m6A_9/pooled/Data1_seq_data_TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22_wig_output_D_modified_bases.6mA.merged_strand_Hnorm_10SCALE.smooth.positions.xls",
    "H": "/Data1/seq_data/TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19/wig/output/result_m6A_9/pooled/Data1_seq_data_TubeH1_021_SDC2-AIDpAux_Hia5_MSssI_12_19_wig_output_H_modified_bases.6mA.merged_strand_NOnorm_10SCALE.smooth.positions.xls",
}
# Despite the .xls extension the files are read as tab-separated text.
nucleosome_positions_D = pd.read_csv(position_table_paths["D"], sep="\t")
nucleosome_positions_H = pd.read_csv(position_table_paths["H"], sep="\t")
display(nucleosome_positions_D.head(1))

# Feature annotations (BED); the "chr-type" column is dropped straight after loading.
bed_raw = pd.read_csv("/Data1/reference/tss_tes_rex_combined.bed", sep="\t").drop(columns=["chr-type"])
display('bed_raw:', bed_raw.head(1))

# Split the annotations by the prefix / suffix of their "type" label.
feature_type = bed_raw['type']
bed_tss = bed_raw[feature_type.str.startswith('TSS')]
display('bed_tss:', bed_tss.head(1))

bed_rex = bed_raw[feature_type.str.endswith('rex')]
display('bed_rex:', bed_rex.head(1))
display('nucleosome_positions_D:', nucleosome_positions_D.head(1))

In [None]:
### Compute Frequency Tables
# Re-import nanotools so edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(nanotools)

# One frequency table per sample, computed in the order D then H.
frequency_df_D, frequency_df_H = (
    nanotools.frequency_table(positions)
    for positions in (nucleosome_positions_D, nucleosome_positions_H)
)

display(frequency_df_D.tail(10))
display(frequency_df_H.head(10))

# Optional export of the D table (disabled):
#frequency_df_D.to_csv("/Data1/seq_data/TubeD1a_N2_Fiberseq_Hia5_MSssI_12_22_22/wig/output/result_m6a_10/pooled/D_nrl.csv",index=False,)

In [None]:
### Create data subsets for plotting
# Label each frequency table with its genotype, then stack both into `data`.
for freq_table, genotype_label in ((frequency_df_D, 'N2'), (frequency_df_H, 'AID-SDC2 + Aux')):
    freq_table['genotype'] = genotype_label
data = pd.concat([frequency_df_D, frequency_df_H])

# 'X' for CHROMOSOME_X, 'A' for every other chromosome name.
data['chr-type'] = data['chr'].apply(lambda chrom: 'X' if chrom == 'CHROMOSOME_X' else 'A')

# Splits by chromosome and by genotype.
data_x = data[data['chr'].eq('CHROMOSOME_X')]
data_a = data[data['chr'].ne('CHROMOSOME_X')]
data_D = data[data['genotype'].eq('N2')]
data_H = data[data['genotype'].ne('N2')]

# Keep only rows with dist below 2200 for the histograms.
data_D_crop = data_D[data_D['dist'].lt(2200)]
data_H_crop = data_H[data_H['dist'].lt(2200)]

### Histograms, break out into X and A for separate plotting
D_X = data_D_crop.loc[data_D_crop['chr-type'].eq('X')]
D_A = data_D_crop.loc[data_D_crop['chr-type'].eq('A')]
H_X = data_H_crop.loc[data_H_crop['chr-type'].eq('X')]
H_A = data_H_crop.loc[data_H_crop['chr-type'].eq('A')]

print(D_A)

In [None]:
### PLOT NRL DISTRIBUTION BY CHROMOSOME FOR TWO CONDITIONS (N2 and #021)
importlib.reload(nanotools)

size = (8,12)  # NOTE(review): not referenced anywhere in the visible notebook

x_color, autosome_color = "#c45746", "#16415e"
autosome_suffixes = ("I", "II", "III", "IV", "V")

# One entry per condition:
# (table key, X frame, autosome frame, X output prefix, autosome output-prefix stem)
# NOTE(review): the AID X prefix ("SDC2-AID-Aux_...") is spelled differently from
# the AID autosome prefixes ("AID-SDC2_..."); kept exactly as in the original.
conditions = [
    ("D", D_X, D_A, "N2_30min_NRL_X", "N2_30min_NRL_"),
    ("H", H_X, H_A, "SDC2-AID-Aux_30min_NRL_X", "AID-SDC2_30min_NRL_"),
]

nrl_tables = {}
for key, x_frame, autosome_frame, x_prefix, autosome_stem in conditions:
    # (frame, chromosome name, colour, output prefix) for X and then chromosomes I-V
    chromosome_data = [(x_frame, "CHROMOSOME_X", x_color, x_prefix)]
    chromosome_data += [
        (
            autosome_frame[autosome_frame['chr'] == f"CHROMOSOME_{suffix}"],
            f"CHROMOSOME_{suffix}",
            autosome_color,
            f"{autosome_stem}{suffix}",
        )
        for suffix in autosome_suffixes
    ]

    # The comprehension keeps its loop variables local, so the combined `data`
    # frame built in the subsetting cell is no longer overwritten here.
    peaks_list = [
        nanotools.plot_NRL_dist(chrom_frame, chromosome_name, color, output_prefix)
        for chrom_frame, chromosome_name, color, output_prefix in chromosome_data
    ]

    # Concatenate the per-chromosome peak tables side by side
    nrl_table = pd.concat(peaks_list, axis=1)

    # Rename the 'x' column immediately before each '<chrom>-peak-position'
    # column to '<chrom>-n-plus'
    cols = list(nrl_table.columns)
    for i, col in enumerate(cols):
        if 'peak-position' in col:
            chrom = col.split('-peak-position')[0]
            cols[i-1] = f'{chrom}-n-plus'
    nrl_table.columns = cols

    display(nrl_table)
    nrl_tables[key] = nrl_table

D_NRL, H_NRL = nrl_tables["D"], nrl_tables["H"]


In [None]:
### Calculate NRL Length
# For every chromosome, regress peak position on peak order (n-plus); the
# rounded slope is reported as that chromosome's NRL.

# List of chromosomes to compute the linear regression for
chromosomes = ['CHROMOSOME_X', 'CHROMOSOME_I', 'CHROMOSOME_II', 'CHROMOSOME_III', 'CHROMOSOME_IV', 'CHROMOSOME_V']
Dequations = []
Hequations = []

for nrl_table, equations in ((D_NRL, Dequations), (H_NRL, Hequations)):
    for chrom in chromosomes:
        n_col, pos_col = f'{chrom}-n-plus', f'{chrom}-peak-position'
        # The NRL tables are built with concat(axis=1), which leaves NaN where
        # the per-chromosome tables differ in length; LinearRegression rejects NaN.
        points = nrl_table[[n_col, pos_col]].dropna()
        # Instantiate here: no earlier cell defines `model`, so relying on a
        # leftover kernel variable fails on Restart & Run All.
        model = LinearRegression()
        model.fit(points[[n_col]].values, points[pos_col].values)
        slope = model.coef_[0].round(2)
        equations.append(f'{chrom} NRL: {slope}')
    print(equations)

In [None]:
importlib.reload(nanotools)
# Generate plots: one regression figure per condition (title, output prefix, NRL labels)
n2_plot_args = (D_NRL, 'N2 Nucleosome Repeat Length', 'N2_30min', Dequations)
aid_plot_args = (H_NRL, 'AID::SDC-2 Nucleosome Repeat Length', 'AID-SDC2_30min', Hequations)
nanotools.plot_NRL_regression(*n2_plot_args)
nanotools.plot_NRL_regression(*aid_plot_args)

In [None]:
### PLOT NRL DISTRIBUTION FOR X VS AUTOSOMES (A) FOR TWO CONDITIONS (N2 and #021)
importlib.reload(nanotools)

size = (8,12)  # NOTE(review): not referenced anywhere in the visible notebook

# Per condition: (frame, chromosome label, colour, output prefix) for X and for
# the pooled autosomes.
# NOTE(review): the AID X prefix ("SDC2-AID-Aux_...") is spelled differently from
# the AID A prefix ("AID-SDC2_..."); kept exactly as in the original.
conditions = {
    "D": [
        (D_X, "X", "#c45746", "N2_30min_NRL_X"),
        (D_A, "A", "#16415e", "N2_30min_NRL_A"),
    ],
    "H": [
        (H_X, "X", "#c45746", "SDC2-AID-Aux_30min_NRL_X"),
        (H_A, "A", "#16415e", "AID-SDC2_30min_NRL_A"),
    ],
}

nrl_tables = {}
for key, chromosome_data in conditions.items():
    # The comprehension keeps its loop variables local, so the combined `data`
    # frame built in the subsetting cell is no longer overwritten here.
    peaks_list = [
        nanotools.plot_NRL_dist(chrom_frame, chromosome_name, color, output_prefix)
        for chrom_frame, chromosome_name, color, output_prefix in chromosome_data
    ]

    # Concatenate the peak tables side by side
    nrl_table = pd.concat(peaks_list, axis=1)

    # Rename the 'x' column immediately before each '<chrom>-peak-position'
    # column to '<chrom>-n-plus'
    cols = list(nrl_table.columns)
    for i, col in enumerate(cols):
        if 'peak-position' in col:
            chrom = col.split('-peak-position')[0]
            cols[i-1] = f'{chrom}-n-plus'
    nrl_table.columns = cols

    display(nrl_table)
    nrl_tables[key] = nrl_table

D_NRL, H_NRL = nrl_tables["D"], nrl_tables["H"]


In [None]:
### Calculate NRL Length for X vs A
# Regress peak position on peak order (n-plus) for each group; the rounded
# slope is reported as that group's NRL.

# List of chromosomes to compute the linear regression for
chromosomes = ['X', 'A']
Dequations = []
Hequations = []

for nrl_table, equations in ((D_NRL, Dequations), (H_NRL, Hequations)):
    for chrom in chromosomes:
        n_col, pos_col = f'{chrom}-n-plus', f'{chrom}-peak-position'
        # The NRL tables are built with concat(axis=1), which leaves NaN where
        # the X and A tables differ in length; LinearRegression rejects NaN.
        points = nrl_table[[n_col, pos_col]].dropna()
        # Instantiate here: no earlier cell defines `model`, so relying on a
        # leftover kernel variable fails on Restart & Run All.
        model = LinearRegression()
        model.fit(points[[n_col]].values, points[pos_col].values)
        slope = model.coef_[0].round(2)
        equations.append(f'{chrom} NRL: {slope}')
    print(equations)

importlib.reload(nanotools)
# Generate plots
nanotools.plot_NRL_regression(D_NRL, 'N2 Nucleosome Repeat Length', 'N2_30min',Dequations)
nanotools.plot_NRL_regression(H_NRL, 'AID::SDC-2 Nucleosome Repeat Length', 'AID-SDC2_30min',Hequations)

In [None]:
importlib.reload(nanotools)

### Dataframes for TSS specific analysis
# Every X / A subset is passed through nanotools.filter_nucs_by_features with
# the TSS annotations and the same region cutoff.
region_cutoff = 3000
D_X_tss, D_A_tss, H_X_tss, H_A_tss = (
    nanotools.filter_nucs_by_features(nucleosomes, bed_tss, region_cutoff)
    for nucleosomes in (D_X, D_A, H_X, H_A)
)

In [None]:
### PLOT NRL DISTRIBUTION (TSS-FILTERED, X VS A) FOR TWO CONDITIONS (N2 and #021)
importlib.reload(nanotools)

size = (8,12)  # NOTE(review): not referenced anywhere in the visible notebook

# Use the _tss filtered dataframes.
# NOTE(review): these output prefixes are identical to the ones passed for the
# unfiltered X / A frames; if plot_NRL_dist writes files named after the prefix,
# the earlier outputs would be overwritten — confirm.
conditions = {
    "D": [
        (D_X_tss, "X", "#c45746", "N2_30min_NRL_X"),
        (D_A_tss, "A", "#16415e", "N2_30min_NRL_A"),
    ],
    "H": [
        (H_X_tss, "X", "#c45746", "SDC2-AID-Aux_30min_NRL_X"),
        (H_A_tss, "A", "#16415e", "AID-SDC2_30min_NRL_A"),
    ],
}

nrl_tables = {}
for key, chromosome_data in conditions.items():
    # The comprehension keeps its loop variables local, so the combined `data`
    # frame built in the subsetting cell is no longer overwritten here.
    peaks_list = [
        nanotools.plot_NRL_dist(chrom_frame, chromosome_name, color, output_prefix)
        for chrom_frame, chromosome_name, color, output_prefix in chromosome_data
    ]

    # Concatenate the peak tables side by side
    nrl_table = pd.concat(peaks_list, axis=1)

    # Rename the 'x' column immediately before each '<chrom>-peak-position'
    # column to '<chrom>-n-plus'
    cols = list(nrl_table.columns)
    for i, col in enumerate(cols):
        if 'peak-position' in col:
            chrom = col.split('-peak-position')[0]
            cols[i-1] = f'{chrom}-n-plus'
    nrl_table.columns = cols

    display(nrl_table)
    nrl_tables[key] = nrl_table

D_NRL, H_NRL = nrl_tables["D"], nrl_tables["H"]

In [None]:
### Calculate NRL Length for X vs A
# Regress peak position on peak order (n-plus) for each group; the rounded
# slope is reported as that group's NRL. Uses whichever D_NRL / H_NRL tables
# were built most recently.

# List of chromosomes to compute the linear regression for
chromosomes = ['X', 'A']
Dequations = []
Hequations = []

for nrl_table, equations in ((D_NRL, Dequations), (H_NRL, Hequations)):
    for chrom in chromosomes:
        n_col, pos_col = f'{chrom}-n-plus', f'{chrom}-peak-position'
        # The NRL tables are built with concat(axis=1), which leaves NaN where
        # the X and A tables differ in length; LinearRegression rejects NaN.
        points = nrl_table[[n_col, pos_col]].dropna()
        # Instantiate here: no earlier cell defines `model`, so relying on a
        # leftover kernel variable fails on Restart & Run All.
        model = LinearRegression()
        model.fit(points[[n_col]].values, points[pos_col].values)
        slope = model.coef_[0].round(2)
        equations.append(f'{chrom} NRL: {slope}')
    print(equations)

importlib.reload(nanotools)
# Generate plots
nanotools.plot_NRL_regression(D_NRL, 'N2 Nucleosome Repeat Length', 'N2_30min',Dequations)
nanotools.plot_NRL_regression(H_NRL, 'AID::SDC-2 Nucleosome Repeat Length', 'AID-SDC2_30min',Hequations)

In [None]:
# Fit peak position against peak number for X and A in both conditions.
# NOTE(review): the visible cells that build D_NRL / H_NRL rename columns to
# '<chrom>-n-plus'; whether a 'peak-number' column exists depends on what
# nanotools.plot_NRL_dist returns — confirm before relying on this cell.
def _fit_peak_position_slope(nrl_table, chrom, decimals):
    """Regress '<chrom>-peak-position' on 'peak-number' in `nrl_table`.

    Parameters:
        nrl_table: DataFrame holding a 'peak-number' column and a
            '<chrom>-peak-position' column.
        chrom: column prefix, e.g. 'X' or 'A'.
        decimals: number of decimals the slope and intercept are rounded to.

    Returns:
        (model, slope, intercept): the fitted LinearRegression and the rounded
        scalar slope and intercept.
    """
    pos_col = f'{chrom}-peak-position'
    # Rows with NaN in either column cannot be fitted, so they are dropped first.
    points = nrl_table[['peak-number', pos_col]].dropna()
    fitted = LinearRegression()
    # The target is passed as a 1-D Series so coef_[0] is a scalar; a 2-D
    # target makes it a one-element array and the label prints as "[163.4]".
    fitted.fit(points[['peak-number']], points[pos_col])
    return fitted, fitted.coef_[0].round(decimals), fitted.intercept_.round(decimals)


# NOTE(review): the first fit rounds to 2 decimals and the other three to 1,
# as in the original; confirm whether the difference is intended.
model, DslopeX, DinterceptX = _fit_peak_position_slope(D_NRL, 'X', 2)
DequationX = f'X NRL: {DslopeX}'
print("DequationX;",DequationX)

model, DslopeA, DinterceptA = _fit_peak_position_slope(D_NRL, 'A', 1)
DequationA = f'A NRL: {DslopeA}'
print("DequationA;",DequationA)

model, HslopeX, HinterceptX = _fit_peak_position_slope(H_NRL, 'X', 1)
HequationX = f'X NRL: {HslopeX}'
print("HequationX;",HequationX)

model, HslopeA, HinterceptA = _fit_peak_position_slope(H_NRL, 'A', 1)
HequationA = f'A NRL: {HslopeA}'
print("HequationA;",HequationA)

In [None]:
### OLD
def frequency_table(df, n_dist=10, max_rows=100):
    """Build a table of per-step distances between each nucleosome and its downstream neighbours.

    For each anchor row, the next `n_dist` rows of `df` are scanned. Every
    neighbour on the same chromosome increments a counter (`n-plus`), and the
    distance between the two "smt_pos" values divided by that counter is
    recorded (truncated to int).

    Parameters:
        df: DataFrame with "chr" and "smt_pos" columns. Rows are scanned in
            their existing order — presumably sorted by chromosome and
            position; verify against the caller.
        n_dist: how many following rows to inspect per anchor. The original
            referenced an undefined global of this name; the default of 10 is
            taken from its "next 10 rows" comment — TODO confirm.
        max_rows: number of leading rows used as anchors (the original
            hard-coded 100). Pass None to use every row.

    Returns:
        DataFrame with columns "chr", "dist" and "n-plus", one row per
        (anchor, same-chromosome neighbour) pair. Its last 20 rows are also
        displayed via IPython's `display`.
    """
    # Work on positional arrays: neighbours are always "the next rows of df"
    # (not of a global frame), whatever index labels `df` carries.
    chroms = df["chr"].to_numpy()
    positions = df["smt_pos"].to_numpy()
    n_rows = len(df)
    n_anchors = n_rows if max_rows is None else min(max_rows, n_rows)

    # Collect plain records and build the frame once; DataFrame.append was
    # removed in pandas 2.0 and was quadratic when used row by row.
    records = []
    for anchor in range(n_anchors):
        chrm = chroms[anchor]
        smt_pos = positions[anchor]
        n_counter = 0
        for neighbour in range(anchor + 1, min(anchor + n_dist + 1, n_rows)):
            # Only neighbours on the anchor's chromosome count towards n-plus
            if chroms[neighbour] == chrm:
                n_counter += 1
                # Distance to the n-th same-chromosome neighbour, per step
                difference = int((positions[neighbour] - smt_pos) / n_counter)
                records.append({"chr": chrm, "dist": difference, "n-plus": n_counter})

    frequency_df = pd.DataFrame(records, columns=["chr", "dist", "n-plus"])

    display(frequency_df.tail(20))
    return frequency_df


###
# Fit dist against n-plus for the X and A subsets of each genotype.
def _fit_dist_vs_nplus(table, chr_type, decimals):
    """Regress 'dist' on 'n-plus' for the rows of `table` with the given 'chr-type'.

    Parameters:
        table: DataFrame with 'chr-type', 'n-plus' and 'dist' columns.
        chr_type: value of 'chr-type' to select ('X' or 'A').
        decimals: number of decimals the slope and intercept are rounded to.

    Returns:
        (model, slope, intercept): the fitted LinearRegression and the rounded
        scalar slope and intercept.
    """
    subset = table.loc[table['chr-type'] == chr_type, ['n-plus', 'dist']]
    fitted = LinearRegression()
    # The target is passed as a 1-D Series so coef_[0] / intercept_ are scalars;
    # a 2-D target makes them arrays and the equation prints as "y = [1.2]x + [3.4]".
    fitted.fit(subset[['n-plus']], subset['dist'])
    return fitted, fitted.coef_[0].round(decimals), fitted.intercept_.round(decimals)


# The original first fit was written `data_D[.loc[...]]`, a SyntaxError that
# stopped the whole cell from parsing; it now selects rows like the other three.
# NOTE(review): the first fit rounds to 2 decimals and the other three to 1,
# as in the original; confirm whether the difference is intended.
model, DslopeX, DinterceptX = _fit_dist_vs_nplus(data_D, 'X', 2)
DequationX = f'X: y = {DslopeX}x + {DinterceptX}'
print("DequationX;",DequationX)

model, DslopeA, DinterceptA = _fit_dist_vs_nplus(data_D, 'A', 1)
DequationA = f'A: y = {DslopeA}x + {DinterceptA}'
print("DequationA;",DequationA)

model, HslopeX, HinterceptX = _fit_dist_vs_nplus(data_H, 'X', 1)
HequationX = f'X: y = {HslopeX}x + {HinterceptX}'
print("HequationX;",HequationX)

model, HslopeA, HinterceptA = _fit_dist_vs_nplus(data_H, 'A', 1)
HequationA = f'A: y = {HslopeA}x + {HinterceptA}'
print("HequationA;",HequationA)