In [30]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
import os
import itertools
import tqdm
import gc

# Load and filter interaction data

This first section will load the raw interaction energy data files which is stored in 'energyDataFiles'

if you are running this notebook on the cloud, you will need to download the data from github by cloning the repository. This can be accomplished using the cell below (be sure to uncomment it of course)

The data will be filtered by discarding any interaction pairs which exhibit a mean total interaction energy of less than some specified cutoff (currently this is somewhat arbitrarily set to be thermal energy of the simulations)

Since we will ultimately want to compare differences between the wild type and mutant networks, we need to make sure that we retain interaction pairs over all systems if there was a significant interaction in even a single system.

To complicate things further, however, this protein exists as a homo-hexamer. This means we need to take account of the symmetry as well. E.g. if chains 1 and 2 interact, this would also be equivalent having chains 2 and 3, or 3 and 4, ... etc. For this reason the data is organized by 'ChainDelta' which is equal to the difference in chain ids (clockwise) of the interacting pair. So when we filter interaction pairs we will do so using the intrachain sequence id and chain delta for the pairs rather than the individual residue id's.

This section only needs to be completed once. If it is already run, and you just want to visualize results, head to the visualization section.

In [None]:
#!git clone https://github.com/wesleymsmith/cx26_energy_network.git
#!ln -s cx26_energy_network/energyDataFiles ./

In [18]:
dataDir='energyDataFiles'

energyFilePattern='ChainDelta'

dataFiles=[dataFile for dataFile in os.listdir(dataDir) \
           if energyFilePattern in dataFile]

chainDeltas=np.unique(list(map(lambda x: x.split('.')[3],
                               dataFiles)))
chainDeltas=[chainDelta for chainDelta in chainDeltas if 'Chain' in chainDelta]
chainDeltas

['ChainDelta_0',
 'ChainDelta_1',
 'ChainDelta_2',
 'ChainDelta_3',
 'ChainDelta_4',
 'ChainDelta_5']

In [125]:
filteredDataTables=[]
kb=0.0019872041 #kcal/mol
Tsim=310.15
eCut=1.0*kb*Tsim #energy cutoff at 1.0 kT (one half thermal energy)

sigPairTags=[]
with tqdm.tqdm_notebook(chainDeltas,desc="chain delta:") as deltaIter:
    with tqdm.tqdm_notebook(total=None,desc="loading tables") as loadIter:
        with tqdm.tqdm_notebook(total=None,desc="filtering data") as filterIter:
            for chainDelta in deltaIter:
                deltaIter.set_description(chainDelta)
                deltaFileList=[filename for filename in dataFiles if \
                               chainDelta in filename]
                #print(deltaFileList)
                loadIter.total=len(deltaFileList)
                loadIter.reset()
                deltaTables=[]
                for deltaFile in deltaFileList:
                    loadIter.set_description_str(deltaFile)
                    filepath='/'.join([dataDir,deltaFile])
                    tempData=pd.read_csv(filepath)
                    tempData['TestVal']=tempData['TOTAL.Avg'].abs() - \
                        tempData['TOTAL.Std_Err_of_Mean']
                    tempData.drop(columns=[colName for colName in tempData.columns\
                                           if 'Unnamed' in colName])
                    fileNameTokens=deltaFile.split('.')
                    tempData['System']=fileNameTokens[0]
                    tempData['Variant']=fileNameTokens[1]
                    tempData['Rep']=fileNameTokens[2]
                    tempData['Pair_Tag']=tempData[
                            ['Seqid_1','Seqid_2','Chain_Delta']
                        ].apply(lambda x: '_'.join(list(map(str,x))),axis=1)
                    deltaTables.append(tempData.copy())
                    loadIter.update()
                
                deltaTable=pd.concat(deltaTables)
                
                pairGroups=tempData.groupby('Pair_Tag')
                
                filterIter.total=len(pairGroups)
                filterIter.reset()
                for pairName,pairData in pairGroups:
                    filterIter.set_description_str(pairName)
                    maxTestVal=pairData['TestVal'].max()
                    if maxTestVal > eCut:
                        sigPairTags.append(pairName)
                    filterIter.update()
                deltaTable=deltaTable[deltaTable['Pair_Tag'].isin(sigPairTags)]
                filteredDataTables.append(deltaTable.copy())
                gc.collect()
energyData=pd.concat(filteredDataTables)
filteredDataTables=[]
energyData.head()

HBox(children=(IntProgress(value=0, description='chain delta:', max=6, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=1, bar_style='info', description='loading tables', max=1, style=ProgressStyle…

HBox(children=(IntProgress(value=1, bar_style='info', description='filtering data', max=1, style=ProgressStyle…

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceed

Unnamed: 0.1,Unnamed: 0,Resid_1,ResName_1,Chain_1,Seqid_1,Resid_2,ResName_2,Chain_2,Seqid_2,TOTAL.Avg,TOTAL.Std_Dev,TOTAL.Std_Err_of_Mean,Chain_Delta,TestVal,System,Variant,Rep,Pair_Tag
0,0,1,MET,1,1,1,MET,1,1,-24.06473,1.788517,0.177964,0,23.886766,n14k2,acetyl,rep1,1_1_0
1,1,1,MET,1,1,2,ASP,1,2,-27.276465,0.659607,0.065633,0,27.210831,n14k2,acetyl,rep1,1_2_0
2,2,1,MET,1,1,3,TRP,1,3,-3.058341,0.3312,0.032956,0,3.025386,n14k2,acetyl,rep1,1_3_0
4,4,1,MET,1,1,5,THR,1,5,-3.156847,1.469387,0.14621,0,3.010638,n14k2,acetyl,rep1,1_5_0
5,5,1,MET,1,1,6,LEU,1,6,-0.124479,0.081879,0.008147,0,0.116331,n14k2,acetyl,rep1,1_6_0


In [126]:
len(sigPairTags)

3081

In [127]:
print(energyData.shape)
energyData.head()

(369720, 18)


Unnamed: 0.1,Unnamed: 0,Resid_1,ResName_1,Chain_1,Seqid_1,Resid_2,ResName_2,Chain_2,Seqid_2,TOTAL.Avg,TOTAL.Std_Dev,TOTAL.Std_Err_of_Mean,Chain_Delta,TestVal,System,Variant,Rep,Pair_Tag
0,0,1,MET,1,1,1,MET,1,1,-24.06473,1.788517,0.177964,0,23.886766,n14k2,acetyl,rep1,1_1_0
1,1,1,MET,1,1,2,ASP,1,2,-27.276465,0.659607,0.065633,0,27.210831,n14k2,acetyl,rep1,1_2_0
2,2,1,MET,1,1,3,TRP,1,3,-3.058341,0.3312,0.032956,0,3.025386,n14k2,acetyl,rep1,1_3_0
4,4,1,MET,1,1,5,THR,1,5,-3.156847,1.469387,0.14621,0,3.010638,n14k2,acetyl,rep1,1_5_0
5,5,1,MET,1,1,6,LEU,1,6,-0.124479,0.081879,0.008147,0,0.116331,n14k2,acetyl,rep1,1_6_0


In [128]:
energyData=energyData.drop(columns=['Unnamed: 0', 'TestVal', 'Pair_Tag'])

In [129]:
energyData.shape

(369720, 15)

In [142]:
energyData.to_csv('/'.join([dataDir,'energyNetworkDataTable.csv']),
                  index=False)

# Visualize Interaction Energy Networks
The following cells allow you to visualize the individual interaction energy networks generated above.

The first method is to view the networks in matrix form using the interactive plotting package 'bokeh'. This will require downloading the 'correlation_data_utilities.py' script, which can be obtained using the cell below.

The bokeh package is used here instead of matplotlib or seaborn because it allows support for 'tooltips'. This means that you can hover your mouse over any data point of interest in the matrix heatmap and it will display a 'tooltip' which will present all the information contained in the corresponding data table row.

The second method will be to display the network as an overlay onto a rendering of the 3-dimensional structure of the protein.

<I>Note: this methods will be added shortly</I>
There are two options for doing so. The first method uses the package 'nglview'. This provides support for tooltips much like the bokeh method. Unfortunately, some platforms do not support this (e.g. it will cause notebooks running in google colab to crash).

The second option is to use py3dmol. The result will look much the same except that it will not allow use of tooltips.

In [None]:
#!git clone https://github.com/LynaLuo-Lab/network_analysis_scripts.git
#!ln -s network_analysis_scripts/python_version/correlation_data_utilities.py ./

In [114]:
import correlation_data_utilities
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from IPython.display import display

In [144]:
#Load interaction data.
dataDir='energyDataFiles'
energyData=pd.read_csv('/'.join([dataDir,'energyNetworkDataTable.csv']))
energyData.head()

Unnamed: 0,Resid_1,ResName_1,Chain_1,Seqid_1,Resid_2,ResName_2,Chain_2,Seqid_2,TOTAL.Avg,TOTAL.Std_Dev,TOTAL.Std_Err_of_Mean,Chain_Delta,System,Variant,Rep
0,1,MET,1,1,1,MET,1,1,-24.06473,1.788517,0.177964,0,n14k2,acetyl,rep1
1,1,MET,1,1,2,ASP,1,2,-27.276465,0.659607,0.065633,0,n14k2,acetyl,rep1
2,1,MET,1,1,3,TRP,1,3,-3.058341,0.3312,0.032956,0,n14k2,acetyl,rep1
3,1,MET,1,1,5,THR,1,5,-3.156847,1.469387,0.14621,0,n14k2,acetyl,rep1
4,1,MET,1,1,6,LEU,1,6,-0.124479,0.081879,0.008147,0,n14k2,acetyl,rep1


In [141]:
@interact_manual
def show_mat_plot(system=energyData['System'].unique(),
                  variant=energyData['Variant'].unique(),
                  rep=energyData['Rep'].unique(),
                  vRange=widgets.FloatRangeSlider(min=-25,max=25,value=(-10,10))):
    vMin,vMax=vRange
    plotData=energyData[
        (energyData['System']==system) & \
        (energyData['Variant']==variant) & \
        (energyData['Rep']==rep)].copy()
    if len(plotData)>0:
        display(correlation_data_utilities.bokeh_dataTable_heatMap(
            plotData,Xcol='Resid_1',Ycol='Resid_2',dataCol='TOTAL.Avg',
            width=640,height=640,
            rectheight=1,rectwidth=1,
            colorMap=sns.color_palette("coolwarm", n_colors=256).as_hex(),
            title="Total GB interaction energy\n(kcal/mol)",
            xlabel=None,
            ylabel=None,
            axisFontSize="14pt",
            vmin=vMin,#-plotData['TOTAL.Avg'].abs().max(),
            vmax=vMax))#plotData['TOTAL.Avg'].abs().max()))
    else:
        print("No data present for %s"%('.'.join([system,variant,rep])))

interactive(children=(Dropdown(description='system', options=('n14k2', 'n14y2', 'wt2'), value='n14k2'), Dropdo…