In [1]:
import pickle
import os
import sys
import glob
# import torch
import numpy as np
import pandas as pd
import scipy.io
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
from scipy import signal
from scipy import stats
import seaborn as sns
import statsmodels.api as sm
from statistics import mean
from scipy.stats import chi2_contingency
from scipy.stats import fisher_exact
from scipy.stats import ranksums
from scipy.stats import pearsonr
from tqdm import tqdm
import plotly.express as px

# jupyter notebook display setting for all data structures
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
np.set_printoptions(threshold=np.inf)

In [2]:
location = '/home/waqaas/DNA_Breathing/waqaas_scratch2'

In [3]:
# load the correlation results from the pickle file
with open(f'{location}DNA-breathing/data/Chipseq_data/correlation_results_robust.pkl', 'rb') as f:
    correlation_results = pickle.load(f)

In [4]:
correlation_results

{'TCF12': {'MA1648.1': {'motif_flip': (-0.053, '1.13241e-33'),
   'motif_coord': (-0.045, '1.14874e-24'),
   'motif_coordsq': (-0.054, '4.76174e-35')}},
 'ELF1': {'MA0473.3': {'motif_flip': (-0.151, '0.00000e+00'),
   'motif_coord': (-0.148, '2.21934e-318'),
   'motif_coordsq': (-0.155, '0.00000e+00')}},
 'MAX': {'MA0058.3': {'motif_flip': (-0.121, '3.02992e-132'),
   'motif_coord': (-0.119, '1.78912e-128'),
   'motif_coordsq': (-0.125, '2.40868e-141')}},
 'E2F6': {'MA0471.2': {'motif_flip': (-0.2, '0.00000e+00'),
   'motif_coord': (-0.191, '0.00000e+00'),
   'motif_coordsq': (-0.203, '0.00000e+00')}},
 'RUNX3': {'MA0684.2': {'motif_flip': (0.053, '7.83515e-35'),
   'motif_coord': (0.072, '1.39526e-62'),
   'motif_coordsq': (0.044, '4.13134e-24')}},
 'BHLHE40': {'MA0464.2': {'motif_flip': (-0.071, '1.05203e-36'),
   'motif_coord': (-0.062, '3.63039e-28'),
   'motif_coordsq': (-0.074, '2.64074e-40')}},
 'FOXA1': {'MA0148.4': {'motif_flip': (-0.089, '7.15563e-105'),
   'motif_coord': (-0

In [5]:
# turn correlation_results into a dataframe
correlation_results_df = pd.DataFrame.from_dict({(i,j): correlation_results[i][j] 
                           for i in correlation_results.keys() 
                           for j in correlation_results[i].keys()},
                       orient='index')

In [6]:
# split each of motif_flip, motif_coord and motif_coordsq into correlation coefficient and p-value columns
correlation_results_df[['motif_flip_corr', 'motif_flip_pval']] = pd.DataFrame(correlation_results_df['motif_flip'].tolist(), index=correlation_results_df.index)
correlation_results_df[['motif_coord_corr', 'motif_coord_pval']] = pd.DataFrame(correlation_results_df['motif_coord'].tolist(), index=correlation_results_df.index)
correlation_results_df[['motif_coordsq_corr', 'motif_coordsq_pval']] = pd.DataFrame(correlation_results_df['motif_coordsq'].tolist(), index=correlation_results_df.index)

# drop the motif_flip, motif_coord and motif_coordsq columns
correlation_results_df.drop(columns=['motif_flip', 'motif_coord', 'motif_coordsq'], inplace=True)

In [7]:
correlation_results_df['motif_flip_corr']

TCF12    MA1648.1   -0.053
ELF1     MA0473.3   -0.151
MAX      MA0058.3   -0.121
E2F6     MA0471.2   -0.200
RUNX3    MA0684.2    0.053
BHLHE40  MA0464.2   -0.071
FOXA1    MA0148.4   -0.089
CTCF     MA0139.1   -0.112
         MA1930.1   -0.080
         MA1929.1   -0.078
ZNF143   MA0088.2   -0.075
GATA2    MA0036.3   -0.061
STAT3    MA0144.2    0.006
FOSL2    MA0478.1   -0.035
ZBTB7A   MA0750.2   -0.219
JUND     MA0492.1   -0.130
         MA0491.2   -0.115
PBX3     MA1114.1   -0.001
NR2F2    MA1111.1   -0.023
EGR1     MA0162.4   -0.026
ATF3     MA0605.2   -0.117
SP1      MA0079.5   -0.122
PAX5     MA0014.3   -0.020
RFX5     MA0510.2   -0.041
ETS1     MA0098.3   -0.213
FOSL1    MA0477.2   -0.111
FOXA2    MA0047.3   -0.067
SRF      MA0083.3   -0.046
MAFK     MA0496.3    0.207
CEBPB    MA0466.3    0.159
HNF4A    MA1494.1   -0.013
         MA0114.4   -0.005
MEF2A    MA0052.4   -0.091
BATF     MA1634.1    0.131
THAP1    MA0597.2   -0.076
REST     MA0138.2   -0.006
EBF1     MA0154.4    0.122
U

In [8]:

# round the correlation columns, and not the p-value columns, to 3 decimal places
correlation_results_df = correlation_results_df.round({'motif_flip_corr': 3, 'motif_coord_corr': 3, 'motif_coordsq_corr': 3})

In [12]:
# Create a new column 'significance' and initialize it with empty strings
correlation_results_df['significance'] = ''

# move this column into the first position
cols = correlation_results_df.columns.tolist()
cols = cols[-1:] + cols[:-1]
correlation_results_df = correlation_results_df[cols]

# If a p-value is less than 0.05 for any of the three correlation coefficients, then add an asterisk to the 'significance' column
correlation_results_df.loc[(correlation_results_df['motif_flip_pval'].astype(float) < 5e-10) | (correlation_results_df['motif_coord_pval'].astype(float) < 5e-10) | (correlation_results_df['motif_coordsq_pval'].astype(float) < 5e-10), 'significance'] += '*'

# If a p-value is less than 0.0000000005 for any of the three correlation coefficients, then add another asterisk to the 'significance' column
correlation_results_df.loc[(correlation_results_df['motif_flip_pval'].astype(float) < 5e-100) | (correlation_results_df['motif_coord_pval'].astype(float) < 5e-100) | (correlation_results_df['motif_coordsq_pval'].astype(float) < 5e-100), 'significance'] += '*'

# If a p-value is less than 5e-200 for any of the three correlation coefficients, then add another asterisk to the 'significance' column
correlation_results_df.loc[(correlation_results_df['motif_flip_pval'].astype(float) < 5e-200) | (correlation_results_df['motif_coord_pval'].astype(float) < 5e-200) | (correlation_results_df['motif_coordsq_pval'].astype(float) < 5e-200), 'significance'] += '*'

In [13]:
correlation_results_df = correlation_results_df.rename(columns={
    'motif_flip_corr': 'Motif Flip Correlation',
    'motif_flip_pval': 'Motif Flip P-value',
    'motif_coord_corr': 'Motif Coord Correlation',
    'motif_coord_pval': 'Motif Coord P-value',
    'motif_coordsq_corr': 'Motif Coord Squared Correlation',
    'motif_coordsq_pval': 'Motif Coord Squared P-value',
    'significance': 'Significance'
})

In [14]:
# reset the index
correlation_results_df.reset_index(inplace=True)

In [15]:
# sort the dataframe by the 'significance' column in descending order
correlation_results_df.sort_values(by='Significance', ascending=False, inplace=True)

# sort the dataframe by the 'Motif Flip Correlation' column in descending order
correlation_results_df.sort_values(by='Motif Flip Correlation', ascending=False, inplace=True)

# view the dataframe
correlation_results_df

Unnamed: 0,level_0,level_1,Motif Coord P-value,Motif Coord Squared Correlation,Motif Coord Squared P-value,Significance,Motif Flip Correlation,Motif Flip P-value,Motif Coord Correlation
28,MAFK,MA0496.3,0.0,0.214,0.0,***,0.207,0.0,0.227
39,NFYB,MA0502.2,0.0,0.178,0.0,***,0.18,0.0,0.175
29,CEBPB,MA0466.3,2.32767e-271,0.156,5.070589999999999e-239,***,0.159,1.09438e-248,0.166
45,ZBTB33,MA0527.1,2.2370899999999998e-134,0.154,2.7953e-118,**,0.158,2.77515e-124,0.164
33,BATF,MA1634.1,0.0,0.125,4.487e-320,***,0.131,0.0,0.13
36,EBF1,MA0154.4,1.09086e-203,0.123,1.92057e-249,***,0.122,2.1918299999999997e-247,0.111
4,RUNX3,MA0684.2,1.39526e-62,0.044,4.13134e-24,*,0.053,7.83515e-35,0.072
41,PRDM1,MA0508.3,9.84675e-06,0.017,8.1423e-05,,0.016,0.000218144,0.019
12,STAT3,MA0144.2,0.348607,0.005,0.116561,,0.006,0.0628939,0.003
17,PBX3,MA1114.1,0.0448875,-0.002,0.569692,,-0.001,0.842046,0.007


In [16]:
# sort the dataframe by the 'Motif Flip Correlation' column in descending order
correlation_results_df.sort_values(by='Motif Flip Correlation', ascending=False, inplace=True)
alt_sort_df = correlation_results_df.copy()
alt_sort_df.sort_values(by='Motif Flip Correlation', ascending=True, inplace=True)

subset_df = pd.concat([correlation_results_df.head(6), alt_sort_df.head(5)])

# view the dataframe
print(subset_df.to_markdown(index=False))

| level_0   | level_1   |   Motif Coord P-value |   Motif Coord Squared Correlation |   Motif Coord Squared P-value | Significance   |   Motif Flip Correlation |   Motif Flip P-value |   Motif Coord Correlation |
|:----------|:----------|----------------------:|----------------------------------:|------------------------------:|:---------------|-------------------------:|---------------------:|--------------------------:|
| MAFK      | MA0496.3  |          0            |                             0.214 |                  0            | ***            |                    0.207 |         0            |                     0.227 |
| NFYB      | MA0502.2  |          0            |                             0.178 |                  0            | ***            |                    0.18  |         0            |                     0.175 |
| CEBPB     | MA0466.3  |          2.32767e-271 |                             0.156 |                  5.07059e-239 | ***            |              

In [17]:
subset_df

Unnamed: 0,level_0,level_1,Motif Coord P-value,Motif Coord Squared Correlation,Motif Coord Squared P-value,Significance,Motif Flip Correlation,Motif Flip P-value,Motif Coord Correlation
28,MAFK,MA0496.3,0.0,0.214,0.0,***,0.207,0.0,0.227
39,NFYB,MA0502.2,0.0,0.178,0.0,***,0.18,0.0,0.175
29,CEBPB,MA0466.3,2.32767e-271,0.156,5.070589999999999e-239,***,0.159,1.09438e-248,0.166
45,ZBTB33,MA0527.1,2.2370899999999998e-134,0.154,2.7953e-118,**,0.158,2.77515e-124,0.164
33,BATF,MA1634.1,0.0,0.125,4.487e-320,***,0.131,0.0,0.13
36,EBF1,MA0154.4,1.09086e-203,0.123,1.92057e-249,***,0.122,2.1918299999999997e-247,0.111
14,ZBTB7A,MA0750.2,0.0,-0.221,0.0,***,-0.219,0.0,-0.222
24,ETS1,MA0098.3,0.0,-0.217,0.0,***,-0.213,0.0,-0.217
3,E2F6,MA0471.2,0.0,-0.203,0.0,***,-0.2,0.0,-0.191
49,POU2F2,MA0507.2,2.38861e-226,-0.164,1.0688e-261,***,-0.16,4.3675899999999996e-247,-0.153


In [18]:
# view the dataframe as a markdown table
print(correlation_results_df.to_markdown(index=False))

| level_0   | level_1   |   Motif Coord P-value |   Motif Coord Squared Correlation |   Motif Coord Squared P-value | Significance   |   Motif Flip Correlation |   Motif Flip P-value |   Motif Coord Correlation |
|:----------|:----------|----------------------:|----------------------------------:|------------------------------:|:---------------|-------------------------:|---------------------:|--------------------------:|
| MAFK      | MA0496.3  |          0            |                             0.214 |                  0            | ***            |                    0.207 |         0            |                     0.227 |
| NFYB      | MA0502.2  |          0            |                             0.178 |                  0            | ***            |                    0.18  |         0            |                     0.175 |
| CEBPB     | MA0466.3  |          2.32767e-271 |                             0.156 |                  5.07059e-239 | ***            |              

In [None]:
# load the positional_data pickle file
with open('/project2/xinhe/waqaas/DNA-breathing/data/Chipseq_data/positional_data.pkl', 'rb') as f:
    positional_data = pickle.load(f)

In [None]:
for tf in positional_data.keys():
    for matrix_id in positional_data[tf].keys():
        # only use the matrices if they correspond to values in the correlation_results_df dataframe with *** in the 'Significance' column
        if correlation_results_df.loc[(tf, matrix_id), 'Significance'] == '***':
            length = len(positional_data[tf][matrix_id])
            # make a bargraph for each feature, showing the correlation coefficient and p-value for each position in the motif
            for feature in ['motif_flip', 'motif_coord', 'motif_coordsq']:
                bar_graph = {}
                for i in range(length):
                    bar_graph[i] = {
                        feature: {
                            'correlation': positional_data[tf][matrix_id][i][feature + '_corr'],
                            'p_value': positional_data[tf][matrix_id][i][feature + '_pval'],
                        }
                    }
                positions = list(positional_data[tf][matrix_id].keys())
                correlations = [entry[feature]['correlation'] for entry in bar_graph.values()]
                p_values = [-np.log10(entry[feature]['p_value']) if entry[feature]['p_value'] != 0 else -np.log10(1e-300) for entry in bar_graph.values()]
                
                plt.figure(figsize=(15, 7.5))

                # add a super title
                plt.suptitle(f'{tf} - {matrix_id} - {feature}', fontsize=16)

                plt.subplot(121)
                plt.bar(positions, correlations)
                plt.title('Correlation')
                plt.grid(axis='y')
                plt.xticks(positions, positions)


                plt.subplot(122)
                plt.bar(positions, p_values)
                plt.title('-log10(P-Value)')
                plt.grid(axis='y')
                plt.xticks(positions, positions)

                plt.tight_layout()
                # save the bargraph as a png file in /project2/xinhe/waqaas/DNA-breathing-1/figs/phase4/top_sig_feat/
                #plt.savefig(f'/project2/xinhe/waqaas/DNA-breathing-1/figs/phase4/top_sig_feat/{tf}_{matrix_id}_{feature}.png')
                plt.show()
                

In [None]:
for tf in positional_data.keys():
    for matrix_id in positional_data[tf].keys():
        # only use the matrices if they correspond to values in the correlation_results_df dataframe with *** in the 'Significance' column
        if tf == 'PAX5':
            length = len(positional_data[tf][matrix_id])
            # make a bargraph for each feature, showing the correlation coefficient and p-value for each position in the motif
            for feature in ['motif_flip', 'motif_coord', 'motif_coordsq']:
                bar_graph = {}
                for i in range(length):
                    bar_graph[i] = {
                        feature: {
                            'correlation': positional_data[tf][matrix_id][i][feature + '_corr'],
                            'p_value': positional_data[tf][matrix_id][i][feature + '_pval'],
                        }
                    }
                positions = list(positional_data[tf][matrix_id].keys())
                correlations = [entry[feature]['correlation'] for entry in bar_graph.values()]
                p_values = [-np.log10(entry[feature]['p_value']) if entry[feature]['p_value'] != 0 else -np.log10(1e-300) for entry in bar_graph.values()]
                
                plt.figure(figsize=(15, 7.5))

                # add a super title
                plt.suptitle(f'{tf} - {matrix_id} - {feature}', fontsize=16)

                plt.subplot(121)
                plt.bar(positions, correlations)
                plt.title('Correlation')
                plt.grid(axis='y')
                plt.xticks(positions, positions)


                plt.subplot(122)
                plt.bar(positions, p_values)
                plt.title('-log10(P-Value)')
                plt.grid(axis='y')
                plt.xticks(positions, positions)

                plt.tight_layout()
                # save the bargraph as a png file in /project2/xinhe/waqaas/DNA-breathing-1/figs/phase4/top_sig_feat/
                #plt.savefig(f'/project2/xinhe/waqaas/DNA-breathing-1/figs/phase4/top_sig_feat/{tf}_{matrix_id}_{feature}.png')
                plt.show()
                

In [None]:
selected_tfs = ['MAFK', 'NFYB', 'CEBPB', 'CTCF', 'BATF', 'ZBTB7A', 'ETS1', 'E2F6', 'POU2F2', 'GABPA']
matrix_ids = ['MA0496.3', 'MA0502.2', 'MA0466.3', 'MA1929.1', 'MA1634.1', 'MA0750.2', 'MA0098.3', 'MA0471.2', 'MA0507.2', 'MA0062.3']

In [None]:
for tf, matrix_id in zip(selected_tfs, matrix_ids):
    # only use the matrices if they correspond to values in the correlation_results_df dataframe with *** in the 'Significance' column
        length = len(positional_data[tf][matrix_id])
        # make a bargraph for each feature, showing the correlation coefficient and p-value for each position in the motif
        for feature in ['motif_flip', 'motif_coord', 'motif_coordsq']:
            bar_graph = {}
            for i in range(length):
                bar_graph[i] = {
                    feature: {
                        'correlation': positional_data[tf][matrix_id][i][feature + '_corr'],
                        'p_value': positional_data[tf][matrix_id][i][feature + '_pval'],
                    }
                }
            positions = list(positional_data[tf][matrix_id].keys())
            correlations = [entry[feature]['correlation'] for entry in bar_graph.values()]
            p_values = [-np.log10(entry[feature]['p_value']) if entry[feature]['p_value'] != 0 else -np.log10(1e-300) for entry in bar_graph.values()]
            
            plt.figure(figsize=(15, 7.5))

            # add a super title
            plt.suptitle(f'{tf} - {matrix_id} - {feature}', fontsize=16)

            plt.subplot(121)
            plt.bar(positions, correlations)
            plt.title('Correlation')
            plt.grid(axis='y')
            plt.xticks(positions, positions)


            plt.subplot(122)
            plt.bar(positions, p_values)
            plt.title('-log10(P-Value)')
            plt.grid(axis='y')
            plt.xticks(positions, positions)

            plt.tight_layout()
            # save the bargraph as a png file in /project2/xinhe/waqaas/DNA-breathing-1/figs/phase4/top_sig_feat/
            plt.savefig(f'/scratch/midway3/waqaas/DNA-breathing-1/figs/phase4/top_sig_feat/{tf}_{matrix_id}_{feature}.png')
            plt.show()
            