In [1]:
import argparse
from pathlib import Path
import numpy as np
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole


In [2]:
import pandas as pd
from tqdm import tqdm
import prolif as plf
import MDAnalysis as mda
from MDAnalysis.topology.guessers import guess_types
from pathlib import Path

In [3]:
# Register tqdm's progress-aware helpers (e.g. progress_apply) on pandas objects.
tqdm.pandas()

# Ask RDKit's notebook integration to draw molecules as SVG.
IPythonConsole.ipython_useSVG = True

In [4]:
# Load the processed interaction table and report its (rows, columns) size.
final_results_df = pd.read_csv(filepath_or_buffer='processed.csv')
final_results_df.shape

(34107, 13)

In [5]:
# Preview the first five rows of the loaded table.
final_results_df.head(n=5)

Unnamed: 0,Residue,Hydrophobic.distance,VdWContact.distance,HBDonor.distance,HBDonor.DHA_angle,Cationic.distance,HBAcceptor.distance,HBAcceptor.DHA_angle,PiStacking.distance,PiStacking.plane_angle,PiStacking.normal_to_centroid_angle,PiStacking.intersect_distance,PDB_File
0,VAL60.A,4.475821,,,,,,,,,,,0
1,ASP61.A,,3.103711,,,,,,,,,,0
2,PHE64.A,4.00642,,,,,,,,,,,0
3,PHE64.A,,3.348581,,,,,,,,,,0
4,PHE64.A,4.00642,,,,,,,,,,,0


In [28]:
# Preview the first 40 rows.
# NOTE(review): the saved output of this cell shows a 'normalized' column that
# no visible cell creates, and the cell's execution count is out of order --
# confirm the notebook still reproduces under Restart Kernel -> Run All.
final_results_df.head(n=40)

Unnamed: 0,Residue,Hydrophobic.distance,VdWContact.distance,HBDonor.distance,HBDonor.DHA_angle,Cationic.distance,HBAcceptor.distance,HBAcceptor.DHA_angle,PiStacking.distance,PiStacking.plane_angle,PiStacking.normal_to_centroid_angle,PiStacking.intersect_distance,PDB_File,normalized
0,VAL60.A,4.475821,,,,,,,,,,,0,
1,ASP61.A,,3.103711,,,,,,,,,,0,9.1e-05
2,PHE64.A,4.00642,,,,,,,,,,,0,
3,PHE64.A,,3.348581,,,,,,,,,,0,9.8e-05
4,PHE64.A,4.00642,,,,,,,,,,,0,
5,PHE64.A,,3.348581,,,,,,,,,,0,9.8e-05
6,MET71.A,4.23332,,,,,,,,,,,0,
7,GLN74.A,4.269494,,,,,,,,,,,0,
8,LEU75.A,4.473456,,,,,,,,,,,0,
9,THR92.A,4.377967,,,,,,,,,,,0,


In [6]:
import plotly.express as px
import plotly.graph_objects as go
from ipywidgets import interact, widgets


### Viewing normalized distributions for 501 PDB files

##### Select a metric from the dropdown to view its distribution

In [18]:
def plot_normalized_distributions(df, columns, bins=50):
    """Show an interactive, normalized histogram for one column of ``df``.

    A dropdown (built by ``ipywidgets.interact``) lets the reader pick one of
    ``columns``; the selected column is pooled over all rows, NaNs are dropped,
    and the histogram is drawn with plotly.

    Bar heights are the *fraction of non-missing observations* falling in each
    bin, so they sum to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the metric columns.
    columns : sequence of str
        Column names offered in the dropdown; each must hold numeric data.
    bins : int or sequence of scalars, default 50
        Passed straight to ``numpy.histogram``.

    Returns
    -------
    None
        The figure is displayed as a side effect of the widget callback.
    """
    @interact(column=columns)
    def make_plot(column):
        # Pool the metric across every PDB file, ignoring missing entries.
        all_data = df[column].dropna()

        # Nothing to draw for a column that is entirely missing.
        if all_data.empty:
            print(f"No data available for column {column}")
            return

        # Raw counts per bin. `density=True` must NOT be combined with a
        # division by N: that would normalize twice and yield values that are
        # neither a density nor a fraction.
        counts, edges = np.histogram(all_data, bins=bins)
        normalized_counts = counts / len(all_data)

        # Plotly centres each bar on its x value, so place bars at bin centres
        # and give them the bin width; using left edges shifts every bar by
        # half a bin.
        centers = (edges[:-1] + edges[1:]) / 2
        widths = np.diff(edges)

        fig = go.Figure()

        fig.add_trace(go.Bar(
            x=centers,
            y=normalized_counts,
            width=widths,
            name='All PDB Files',
            opacity=0.75,
            marker_color='blue'
        ))

        fig.update_layout(
            title=f'Normalized Distribution of {column} across All PDB Files',
            xaxis_title=column,
            yaxis_title='Normalized Count',
            barmode='overlay'
        )

        fig.show()

In [19]:
# Offer every column except the grouping key and the residue label.
columns = [
    name
    for name in final_results_df.columns
    if name not in {'PDB_File', 'Residue'}
]
plot_normalized_distributions(final_results_df, columns)


interactive(children=(Dropdown(description='column', options=('Hydrophobic.distance', 'VdWContact.distance', '…

#### Mean and confidence interval (mean ± 1 standard error, per PDB file)
##### Smoothed with a rolling window of 25 

In [23]:
def plot_mean_with_confidence_intervals(df, numeric_columns, window_size=5, min_periods=1):
    """Plot the smoothed per-PDB-file mean of a metric with a +/- 1 SEM band.

    A dropdown (built by ``ipywidgets.interact``) selects one of
    ``numeric_columns``. Rows are grouped by ``'PDB_File'``; the group mean and
    the standard error of the mean (SEM) are each smoothed with a centred
    rolling average, and the band is drawn as ``smoothed mean +/- smoothed SEM``.

    The traces are labelled "Confidence Interval", but the band is one standard
    error wide on each side, not a 95% interval.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'PDB_File'`` column and the metric columns.
    numeric_columns : sequence of str
        Column names offered in the dropdown.
    window_size : int, default 5
        Number of consecutive PDB files in the rolling window.
    min_periods : int, default 1
        Minimum number of non-NaN values a window needs to yield a result.
        With 1, the ends of the series and windows containing files that lack
        the metric are still plotted instead of becoming NaN.

    Returns
    -------
    None
        The figure is displayed as a side effect of the widget callback.
    """
    @interact(column=numeric_columns)
    def make_plot(column):
        grouped = df.groupby('PDB_File')[column]
        means = grouped.mean()
        sems = grouped.sem()  # NaN for files with fewer than two observations

        def smooth(series):
            # Centred moving average; NaN entries are skipped inside a window.
            return series.rolling(
                window=window_size, center=True, min_periods=min_periods
            ).mean()

        # Smooth mean and SEM separately so the band stays centred on the line
        # even when the SEM is missing for some files in a window.
        smoothed_means = smooth(means)
        smoothed_sems = smooth(sems)
        smoothed_upper = smoothed_means + smoothed_sems
        smoothed_lower = smoothed_means - smoothed_sems

        fig = go.Figure()

        # Smoothed mean line
        fig.add_trace(go.Scatter(
            x=smoothed_means.index,
            y=smoothed_means.values,
            mode='lines+markers',
            name='Mean (Smoothed)',
            line=dict(color='blue')
        ))

        # Upper edge of the band (drawn first so the lower edge can fill to it)
        fig.add_trace(go.Scatter(
            x=smoothed_upper.index,
            y=smoothed_upper.values,
            fill=None,
            mode='lines',
            line=dict(color='lightblue'),
            showlegend=True,
            name='Confidence Interval Upper (Smoothed)'
        ))

        # Lower edge of the band
        fig.add_trace(go.Scatter(
            x=smoothed_lower.index,
            y=smoothed_lower.values,
            fill='tonexty',  # Fill the area between this trace and the previous one
            mode='lines',
            line=dict(color='lightblue'),
            name='Confidence Interval Lower (Smoothed)'
        ))

        fig.update_layout(
            title=f'Mean and Confidence Interval of {column} across PDB Files (Smoothed)',
            xaxis_title='PDB File',
            yaxis_title=column,
            legend_title='Legend'
        )

        fig.show()

In [26]:
# Keep only columns with a numeric dtype, leaving out the grouping key and the
# residue label.
numeric_columns = [
    name
    for name, dtype in final_results_df.dtypes.items()
    if name not in {'PDB_File', 'Residue'} and pd.api.types.is_numeric_dtype(dtype)
]

# Smooth with a rolling window of 25 (overrides the function's default of 5).
plot_mean_with_confidence_intervals(final_results_df, numeric_columns, window_size=25)

interactive(children=(Dropdown(description='column', options=('Hydrophobic.distance', 'VdWContact.distance', '…