In [None]:
import py3Dmol
import pandas as pd
import numpy as np
import plotly.express as px
import requests
import os
import re
import glob
import time

# Lookup from three-letter residue names to one-letter amino-acid codes.
# Only the 20 standard amino acids are listed.
amino_acid_map = dict(
    ALA='A', ARG='R', ASN='N', ASP='D', CYS='C',
    GLN='Q', GLU='E', GLY='G', HIS='H', ILE='I',
    LEU='L', LYS='K', MET='M', PHE='F', PRO='P',
    SER='S', THR='T', TRP='W', TYR='Y', VAL='V',
)

def parse_pdb(file_path):
    """Parse the ATOM/HETATM records of a PDB file into a DataFrame.

    Fields are read from fixed character positions on each record line.
    Lines that do not start with ``ATOM`` or ``HETATM`` are ignored.

    Args:
        file_path: Path of the PDB file to read.

    Returns:
        pandas.DataFrame with one row per atom record and the columns
        ``record_type, atom_number, atom_name, alt_loc, residue_name,
        chain_id, residue_number, insertion, x, y, z, occupancy,
        temp_factor, element, charge``, sorted by ``residue_number``.
        Atoms that share a residue number keep their order from the file.

    Raises:
        ValueError: If a numeric field of an atom record cannot be parsed.
        IndexError: If an atom record is too short to hold the alt-loc,
            chain or insertion column.
    """
    columns = ['record_type', 'atom_number', 'atom_name', 'alt_loc', 'residue_name', 'chain_id',
               'residue_number', 'insertion', 'x', 'y', 'z', 'occupancy', 'temp_factor', 'element', 'charge']
    rows = []
    with open(file_path, 'r') as handle:
        for line in handle:
            if not line.startswith(('ATOM', 'HETATM')):
                continue
            rows.append([
                line[0:6].strip(),     # record_type
                int(line[6:11]),       # atom_number
                line[12:16].strip(),   # atom_name
                line[16].strip(),      # alt_loc
                line[17:20].strip(),   # residue_name
                line[21].strip(),      # chain_id
                int(line[22:26]),      # residue_number
                line[26].strip(),      # insertion
                float(line[30:38]),    # x
                float(line[38:46]),    # y
                float(line[46:54]),    # z
                float(line[54:60]),    # occupancy
                float(line[60:66]),    # temp_factor
                line[76:78].strip(),   # element
                line[78:80].strip(),   # charge
            ])

    # pandas' default sort (quicksort) is not stable; a stable sort keeps the
    # atoms of each residue in the order in which they appear in the file.
    return pd.DataFrame(rows, columns=columns).sort_values('residue_number', kind='stable')

def parse_pdbqt(file_path):
    """Parse the ATOM/HETATM records of a PDBQT file into a DataFrame.

    Fields are read from fixed character positions on each record line.
    Lines that do not start with ``ATOM`` or ``HETATM`` are ignored.

    Args:
        file_path: Path of the PDBQT file to read.

    Returns:
        pandas.DataFrame with one row per atom record and the columns
        ``record_type, atom_number, atom_name, alt_loc, residue_name,
        chain_id, residue_number, insertion, x, y, z, occupancy,
        temp_factor, partial_charge, atom_type``, sorted by
        ``residue_number``. Atoms that share a residue number keep their
        order from the file.

    Raises:
        ValueError: If a numeric field of an atom record cannot be parsed.
        IndexError: If an atom record is too short to hold the alt-loc,
            chain or insertion column.
    """
    columns = ['record_type', 'atom_number', 'atom_name', 'alt_loc', 'residue_name', 'chain_id',
               'residue_number', 'insertion', 'x', 'y', 'z', 'occupancy', 'temp_factor', 'partial_charge', 'atom_type']
    rows = []
    with open(file_path, 'r') as handle:
        for line in handle:
            if not line.startswith(('ATOM', 'HETATM')):
                continue
            rows.append([
                line[0:6].strip(),     # record_type
                int(line[6:11]),       # atom_number
                line[12:16].strip(),   # atom_name
                line[16].strip(),      # alt_loc
                line[17:20].strip(),   # residue_name
                line[21].strip(),      # chain_id
                int(line[22:26]),      # residue_number
                line[26].strip(),      # insertion
                float(line[30:38]),    # x
                float(line[38:46]),    # y
                float(line[46:54]),    # z
                float(line[54:60]),    # occupancy
                float(line[60:66]),    # temp_factor
                float(line[70:76]),    # partial_charge
                line[77:79].strip(),   # atom_type
            ])

    # pandas' default sort (quicksort) is not stable; a stable sort keeps the
    # atoms of each residue in the order in which they appear in the file.
    return pd.DataFrame(rows, columns=columns).sort_values('residue_number', kind='stable')

def pdb_to_pandas(file_path):
    """Load a structure file into a DataFrame, choosing the parser by extension.

    The extension check is case-insensitive. Paths ending in ``.pdb`` are
    handed to ``parse_pdb`` and paths ending in ``.pdbqt`` to ``parse_pdbqt``.

    Raises:
        ValueError: If the path ends in neither ``.pdb`` nor ``.pdbqt``.
    """
    lowered_path = file_path.lower()
    if lowered_path.endswith('.pdbqt'):
        return parse_pdbqt(file_path)
    if lowered_path.endswith('.pdb'):
        return parse_pdb(file_path)
    raise ValueError("Unsupported file format. Please provide a .pdb or .pdbqt file.")

def view_pymol(receptor, ligand, docked, receptor_highlight=None, sticks=False):
    """Build a py3Dmol view of a receptor, a reference ligand and docked poses.

    Three models are added in order: the receptor, the ligand, and the docked
    poses (added with ``addModelsAsFrames``). The ligand is styled through
    model index 1 and the docked poses through model index 2.

    Args:
        receptor: Path of the receptor structure file.
        ligand: Path of the reference ligand file.
        docked: Path of the file holding the docked poses.
        receptor_highlight: Optional integer. When truthy, the atoms selected
            by ``serial`` for each value in
            ``range(receptor_highlight - 3, receptor_highlight + 3)`` are drawn
            in yellow.
        sticks: When True the receptor is drawn as cartoon plus thin sticks,
            otherwise as cartoon only.

    Returns:
        The configured ``py3Dmol.view`` object.
    """
    def _read_text(path):
        # Read through a context manager so the file handle is closed at once
        # rather than whenever the garbage collector gets to it.
        with open(path) as handle:
            return handle.read()

    v = py3Dmol.view()
    v.addModel(_read_text(receptor))
    if sticks:
        v.setStyle({'cartoon':{},'stick':{'radius':.1}})
    else:
        v.setStyle({'cartoon':{}})
    if receptor_highlight:
        # NOTE(review): the window is asymmetric (3 below, 2 above) and selects
        # by atom 'serial'; confirm whether a residue selection was intended.
        for i in range(receptor_highlight-3, receptor_highlight+3):
            v.setStyle({'model': -1, 'serial': i}, {"cartoon": {'color': 'yellow'}, 'stick':{'radius':.3, 'color':'yellow'}})
    v.addModel(_read_text(ligand))
    v.setStyle({'model':1},{'stick':{'colorscheme':'dimgrayCarbon','radius':.125}})
    v.addModelsAsFrames(_read_text(docked))
    v.setStyle({'model':2},{'stick':{'colorscheme':'greenCarbon'}})
    v.zoomTo({'model':1})
    v.rotate(90)
    v.animate({'interval':5000})
    return v

from rdkit import Chem
from rdkit.Chem import AllChem

def find_ester_bond(mol):
    """Return the first single C-O bond whose carbon also bears a C=O.

    Bonds are scanned in the molecule's own bond order. A bond qualifies when
    it is a single bond between a carbon and an oxygen, and the same carbon is
    double-bonded to a different oxygen. The carbonyl C=O bond itself is never
    returned, and a carbon with two singly-bonded oxygens (e.g. an acetal)
    does not qualify.

    The oxygen of the returned bond is not checked for a second carbon
    substituent, so the C-OH bond of a carboxylic acid also qualifies.

    Args:
        mol: An RDKit molecule (anything exposing ``GetBonds`` and
            ``GetBondBetweenAtoms``).

    Returns:
        The matching bond object, or ``None`` when no bond qualifies.
    """
    for bond in mol.GetBonds():
        # The ester linkage is the single C-O bond, not the carbonyl.
        if bond.GetBondTypeAsDouble() != 1.0:
            continue

        begin_atom, end_atom = bond.GetBeginAtom(), bond.GetEndAtom()
        if begin_atom.GetAtomicNum() == 6 and end_atom.GetAtomicNum() == 8:
            c_atom, o_atom = begin_atom, end_atom
        elif begin_atom.GetAtomicNum() == 8 and end_atom.GetAtomicNum() == 6:
            c_atom, o_atom = end_atom, begin_atom
        else:
            continue

        # The carbon must carry a second oxygen, attached by a double bond.
        for neighbor in c_atom.GetNeighbors():
            if neighbor.GetAtomicNum() != 8 or neighbor.GetIdx() == o_atom.GetIdx():
                continue
            carbonyl = mol.GetBondBetweenAtoms(c_atom.GetIdx(), neighbor.GetIdx())
            if carbonyl is not None and carbonyl.GetBondTypeAsDouble() == 2.0:
                return bond
    return None

def pdb_to_residue_number(receptor_path, target_motif, target_codon_in_motif, target_molecule):
    """Locate a sequence motif in a receptor and return one residue's number.

    The receptor is loaded with ``pdb_to_pandas`` and reduced to one row per
    distinct (residue_name, residue_number) pair. Those residues are turned
    into a one-letter sequence, the motif regex is searched in it, and the
    residue number stored in the file for the chosen motif position is
    returned.

    Args:
        receptor_path: Path of the receptor ``.pdb`` / ``.pdbqt`` file.
        target_motif: Regular expression applied to the one-letter sequence.
        target_codon_in_motif: 0-based offset of the wanted residue from the
            start of the motif match.
        target_molecule: Unused here; kept so existing callers keep working.

    Returns:
        int: Residue number of the target residue. When the motif occurs more
        than once, the last occurrence is used.

    Raises:
        ValueError: If the motif is not found, or if the offset points outside
            the residues read from the file.
    """
    df_pdb = pdb_to_pandas(receptor_path)

    # One row per residue, in the residue-number order produced by the parser.
    residues = df_pdb[['residue_name', 'residue_number']].drop_duplicates()

    # Exactly one character per residue, so that an offset into the string is
    # also a row offset into `residues`; unrecognised residue names become 'X'.
    aa_sequence = ''.join(amino_acid_map.get(name, 'X') for name in residues['residue_name'])

    motif_matches = list(re.finditer(target_motif, aa_sequence))
    if not motif_matches:
        raise ValueError(f"Motif {target_motif!r} not found in the sequence of {receptor_path}")

    # Several hits are possible; the last one wins.
    target_pos = motif_matches[-1].start() + target_codon_in_motif
    if not 0 <= target_pos < len(residues):
        raise ValueError(
            f"Offset {target_codon_in_motif} from motif {target_motif!r} "
            f"falls outside the residues of {receptor_path}"
        )

    # Read the residue number from the file instead of assuming that
    # numbering starts at 1 and has no gaps.
    return int(residues['residue_number'].iloc[target_pos])

def calculate_distances(receptor_path, ligand_path, residue_number, atom_name):
    """Measure how far each ligand pose's ester bond lies from a receptor atom.

    Args:
        receptor_path: Receptor structure file, loaded with ``pdb_to_pandas``.
        ligand_path: SDF file of ligand poses (read with hydrogens kept).
        residue_number: Residue number of the receptor atom of interest.
        atom_name: Atom name of the receptor atom of interest.

    Returns:
        list: One entry per SDF record, in file order. Each entry is the
        distance between the receptor atom and the midpoint of the bond
        returned by ``find_ester_bond``, or ``None`` when the record could not
        be read or no such bond was found.

    Raises:
        ValueError: If the receptor has no atom with that residue number and
            atom name.
    """
    receptor_df = pdb_to_pandas(receptor_path)

    # Rows of the receptor table that describe the requested atom.
    is_target = (receptor_df['residue_number'] == residue_number) & \
                (receptor_df['atom_name'] == atom_name)
    target_atom = receptor_df[is_target]
    if target_atom.empty:
        raise ValueError(f"Atom {atom_name} not found in residue {residue_number}")

    # The first matching row supplies the reference coordinates.
    target_coords = np.array([target_atom[axis].values[0] for axis in ('x', 'y', 'z')])

    def _distance_for(pose):
        """Distance from the reference atom to one pose's ester-bond midpoint."""
        if pose is None:
            return None
        ester_bond = find_ester_bond(pose)
        if ester_bond is None:
            return None
        conformer = pose.GetConformer()
        begin = conformer.GetAtomPosition(ester_bond.GetBeginAtomIdx())
        end = conformer.GetAtomPosition(ester_bond.GetEndAtomIdx())
        midpoint = np.array([(begin.x + end.x) / 2, (begin.y + end.y) / 2, (begin.z + end.z) / 2])
        return np.linalg.norm(target_coords - midpoint)

    return [_distance_for(pose) for pose in Chem.SDMolSupplier(ligand_path, removeHs=False)]

def load_docking_results(receptor_path, ligand_path, gnina_log_file, target_molecule="OG", target_codon_in_motif=4, target_motif="[LIV].G.S.G"):
    """Combine the score table of a docking log with per-pose distances.

    The target residue is found with ``pdb_to_residue_number`` and the distance
    of every pose to that residue's ``target_molecule`` atom is computed with
    ``calculate_distances``. The log file is then scanned for lines made of an
    integer followed by four numbers, which become the score columns.

    Args:
        receptor_path: Receptor structure file.
        ligand_path: SDF file with the docked poses.
        gnina_log_file: Text log that contains the score table.
        target_molecule: Atom name inside the target residue.
        target_codon_in_motif: Offset of the target residue inside the motif.
        target_motif: Regular expression locating the motif in the sequence.

    Returns:
        pandas.DataFrame with columns ``Mode, Affinity, Intramol, CNN_score,
        CNN_affinity, Distance``. Distances are attached by row position.
    """
    residue_number = pdb_to_residue_number(
        receptor_path=receptor_path,
        target_molecule=target_molecule,
        target_codon_in_motif=target_codon_in_motif,
        target_motif=target_motif,
    )
    distances = calculate_distances(receptor_path, ligand_path, residue_number, target_molecule)

    with open(gnina_log_file, 'r') as log_handle:
        log_content = log_handle.read()

    # A table row: one integer column followed by four numeric columns.
    row_pattern = r"^\s*(\d+)\s+([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s+([-\d.]+)\s*$"
    score_columns = ['Mode', 'Affinity', 'Intramol', 'CNN_score', 'CNN_affinity']
    column_types = {'Mode': int, 'Affinity': float, 'Intramol': float, 'CNN_score': float, 'CNN_affinity': float}

    table_rows = re.findall(row_pattern, log_content, re.MULTILINE)
    df = pd.DataFrame(table_rows, columns=score_columns).astype(column_types)

    # Aligned on the default integer index, i.e. the n-th distance goes to the n-th row.
    df['Distance'] = pd.Series(distances)

    return df[score_columns + ['Distance']]

In [None]:
from rdkit.Chem import rdFMCS, AllChem
from scipy.cluster.hierarchy import linkage
import plotly.graph_objects as go
import plotly.figure_factory as ff

def load_poses(sdf_file):
    """Read the poses of an SDF file and name each after its record number.

    Records that the supplier yields as ``None`` are dropped. The ``_Name``
    property of every kept pose is set to its 1-based position in the file,
    so numbering still counts the dropped records.

    Returns:
        list: The readable poses, in file order.
    """
    kept_poses = []
    for record_number, pose in enumerate(Chem.SDMolSupplier(sdf_file), start=1):
        if pose is None:
            continue
        pose.SetProp('_Name', str(record_number))
        kept_poses.append(pose)
    return kept_poses

def calculate_rmsd_matrix(poses):
    """Compute the pairwise RMSD matrix of a list of poses.

    For every pair of poses the maximum common substructure (MCS) is found,
    both poses are matched against it, and the RMSD is taken over the matched
    atom pairs without any prior alignment of the coordinates.

    Each unordered pair is evaluated once and the value is mirrored. The
    diagonal is left at zero because a pose has zero RMSD to itself. Mirroring
    assumes the MCS atom mapping does not depend on the order of the two
    molecules, which holds for poses of one ligand sharing an atom ordering.

    Args:
        poses: Sequence of RDKit molecules, each with a conformer.

    Returns:
        numpy.ndarray of shape ``(len(poses), len(poses))``, symmetric.

    Raises:
        ValueError: If two poses share no common substructure, so that no atom
            pairs are available for the RMSD.
    """
    size = len(poses)
    rmsd_matrix = np.zeros((size, size))

    # Coordinates are needed repeatedly; fetch them once per pose.
    coords = [np.asarray(pose.GetConformer().GetPositions()) for pose in poses]

    for i in range(size):
        for j in range(i + 1, size):
            # MCS between reference pose i and target pose j.
            mcs = rdFMCS.FindMCS([poses[i], poses[j]])
            pattern = Chem.MolFromSmarts(mcs.smartsString)
            ref_atoms = poses[i].GetSubstructMatch(pattern)
            target_atoms = poses[j].GetSubstructMatch(pattern)
            if not ref_atoms or not target_atoms:
                raise ValueError(
                    f"No common substructure between pose {i + 1} and pose {j + 1}; "
                    "RMSD is undefined"
                )

            # zip() pairs atoms up to the shorter match, as an atom map would.
            atom_map = list(zip(ref_atoms, target_atoms))
            ref_xyz = coords[i][[a for a, _ in atom_map]]
            target_xyz = coords[j][[b for _, b in atom_map]]

            # RMSD = sqrt(mean of squared per-atom distances).
            squared = np.sum((ref_xyz - target_xyz) ** 2, axis=1)
            rmsd = float(np.sqrt(squared.mean()))

            rmsd_matrix[i, j] = rmsd
            rmsd_matrix[j, i] = rmsd

    return rmsd_matrix

def perform_clustering(rmsd_matrix):
    """Run complete-linkage hierarchical clustering on pairwise RMSDs.

    ``scipy.cluster.hierarchy.linkage`` treats a 2-D array as a set of
    observation vectors, not as a distance matrix. A square RMSD matrix is
    therefore converted to condensed form first, so that clustering uses the
    RMSD values themselves.

    Args:
        rmsd_matrix: Square ``(n, n)`` distance matrix, or an already
            condensed 1-D distance vector.

    Returns:
        numpy.ndarray: The SciPy linkage matrix of shape ``(n - 1, 4)``.
    """
    from scipy.spatial.distance import squareform

    distances = np.asarray(rmsd_matrix, dtype=float)
    if distances.ndim == 2:
        # checks=False skips the symmetry and zero-diagonal checks,
        # tolerating float round-off.
        distances = squareform(distances, checks=False)
    return linkage(distances, method='complete')

def plot_dendrogram(
    dist,
    linkage_method,
    leaf_data,
    width,
    height,
    title=None,
    count_sort=True,
    distance_sort=False,
    line_width=0.5,
    line_color='black',
    marker_size=15,
    leaf_color='CNN_score',
    render_mode='svg',
    leaf_y=0,
    leaf_color_discrete_map=None,
    leaf_category_orders=None,
    template='simple_white',
):
    """Draw a clustering dendrogram with one coloured marker per leaf.

    Args:
        dist: Distances handed to ``scipy.cluster.hierarchy.linkage``.
        linkage_method: Linkage method name passed to ``linkage``.
        leaf_data: DataFrame with one row per leaf, in the same order as the
            observations behind ``dist``. Must contain a ``Mode`` column,
            which is used as the hover name.
        width, height: Figure size passed to the layout.
        title: Optional figure title.
        count_sort, distance_sort: Forwarded to ``scipy`` ``dendrogram``.
        line_width, line_color: Styling of the branch lines.
        marker_size: Size of the leaf markers.
        leaf_color: Column of ``leaf_data`` used to colour the leaf markers.
        render_mode: Plotly Express render mode for both traces.
        leaf_y: Y coordinate at which the leaf markers are drawn.
        leaf_color_discrete_map, leaf_category_orders: Forwarded to
            ``px.scatter``.
        template: Plotly template name.

    Returns:
        tuple: ``(fig, leaf_data)`` where ``leaf_data`` has been reordered to
        follow the left-to-right leaf order of the dendrogram.
    """
    import plotly.express as px
    import scipy.cluster.hierarchy as sch

    # Cluster, then ask scipy for the dendrogram geometry without drawing it.
    tree = sch.dendrogram(
        sch.linkage(dist, method=linkage_method),
        count_sort=count_sort,
        distance_sort=distance_sort,
        no_plot=True,
    )

    # Flatten all branch segments into two coordinate lists; a None after each
    # segment makes plotly lift the pen between branches.
    branch_x = []
    branch_y = []
    for xs, ys in zip(tree["icoord"], tree["dcoord"]):
        branch_x.extend(xs)
        branch_x.append(None)
        branch_y.extend(ys)
        branch_y.append(None)
    branches = pd.DataFrame({"x": branch_x, "y": branch_y})

    # Rescale X so leaves sit on consecutive integer positions
    # (scipy multiplies coordinates by 10).
    branches["x"] = (branches["x"] - 5) / 10

    fig = px.line(
        branches,
        x="x",
        y="y",
        render_mode=render_mode,
        template=template,
    )

    # Put the leaf table into dendrogram order.
    leaf_order = tree["leaves"]
    leaf_count = len(leaf_order)
    leaf_data = leaf_data.iloc[leaf_order]

    # One marker per leaf, coloured by the chosen column.
    leaf_markers = px.scatter(
        data_frame=leaf_data,
        x=np.arange(leaf_count),
        y=np.repeat(leaf_y, leaf_count),
        color=leaf_color,
        render_mode=render_mode,
        hover_name='Mode',
        hover_data=leaf_data.columns.to_list(),
        template=template,
        color_discrete_map=leaf_color_discrete_map,
        category_orders=leaf_category_orders,
    )
    fig.add_traces(list(leaf_markers.select_traces()))

    # Line and marker styling is applied to every trace of the figure.
    fig.update_traces(
        line=dict(width=line_width, color=line_color),
        marker=dict(size=marker_size),
    )

    fig.update_layout(
        width=width,
        height=height,
        title=title,
        autosize=True,
        hovermode="closest",
        showlegend=False,
    )

    return fig, leaf_data

def plot_heatmap(df_rmsd, leaf_data):
    """Build a Plotly heatmap figure of an RMSD table.

    Both axes run over the integer positions ``0 .. n-1``, where ``n`` is the
    number of entries in the ``Mode`` column of ``leaf_data``. The values of
    ``df_rmsd`` are drawn as given, with the Viridis colour scale and no
    colour bar.

    Returns:
        plotly.graph_objects.Figure holding a single heatmap trace.
    """
    # Only the number of poses is needed from the Mode column.
    n_poses = len(leaf_data['Mode'].to_numpy() - 1)
    axis_positions = np.arange(n_poses)

    heatmap_trace = go.Heatmap(
        z=df_rmsd,
        x=axis_positions,
        y=axis_positions,
        colorscale='Viridis',
        showscale=False,
    )
    return go.Figure(data=heatmap_trace)

def _concat_subplots(
    figures,
    width,
    height,
):
    """Stack figures or traces vertically in one figure with a shared X axis.

    Args:
        figures: Iterable of ``go.Figure`` objects and/or single traces. The
            n-th item is placed in the n-th subplot row. All traces of a
            figure go into the same row.
        width, height: Size of the combined figure.

    Returns:
        plotly.graph_objects.Figure with at least two rows (more when more
        than two items are given), hidden X axes and a white plot background.
    """
    from plotly.subplots import make_subplots  # type: ignore

    figures = list(figures)
    fig = make_subplots(
        # Two rows as before; grow when more items are supplied so that a
        # third item no longer fails on a missing row.
        rows=max(2, len(figures)),
        cols=1,
        vertical_spacing=0.05,
        shared_xaxes=True,
    )

    for row, figure in enumerate(figures, start=1):
        if isinstance(figure, go.Figure):
            # This is a figure, add each of its traces.
            for trace in figure.data:
                # add_trace replaces the deprecated append_trace.
                fig.add_trace(trace, row=row, col=1)
        else:
            # Assume this is a trace, add directly.
            fig.add_trace(figure, row=row, col=1)

    fig.update_xaxes(visible=False)
    fig.update_layout(
        width=width,
        height=height,
        hovermode="closest",
        plot_bgcolor="white",
    )
    fig.update(layout_coloraxis_showscale=False)

    return fig

def analyse_poses(receptor_path, sdf_path, docking_log_path, width=1000, height=500):
    """Cluster docked poses by RMSD and plot a dendrogram above a heatmap.

    Args:
        receptor_path: Receptor structure file.
        sdf_path: SDF file with the docked poses.
        docking_log_path: Docking log holding the score table.
        width, height: Size of the combined figure.

    Returns:
        tuple: ``(fig, leaf_data)``. ``fig`` is the combined figure and
        ``leaf_data`` is the docking-result table in dendrogram leaf order.
    """
    from scipy.spatial.distance import squareform

    poses = load_poses(sdf_path)
    results_df = load_docking_results(receptor_path, sdf_path, docking_log_path)

    rmsd_matrix = calculate_rmsd_matrix(poses)

    # linkage expects the condensed form of the distance matrix.
    dendrogram, leaf_data = plot_dendrogram(
        dist=squareform(rmsd_matrix, checks=False),
        linkage_method='complete',
        leaf_data=results_df,
        width=500,
        height=500,
    )

    # Reorder rows and columns of the RMSD matrix to follow the leaves;
    # Mode is 1-based, matrix positions are 0-based.
    pose_order = leaf_data['Mode'].to_numpy() - 1
    rmsd_in_leaf_order = pd.DataFrame(rmsd_matrix).iloc[pose_order, pose_order]
    heatmap = plot_heatmap(rmsd_in_leaf_order, leaf_data)

    combined = _concat_subplots(figures=[dendrogram, heatmap], width=width, height=height)
    return combined, leaf_data

In [None]:
# Example: score table plus distances for a single receptor/ligand pair.
gene_id = 'AGAP006228'
ligand = 'deltamethrin3d'

# Input files are located by gene id and ligand name.
receptor_path = f"../receptors/{gene_id}.pdbqt"
sdf_path = f"../docking/{gene_id}_{ligand}.sdf"
docking_log_path = f"../docking/{gene_id}_{ligand}.log"

df_docking = load_docking_results(
    receptor_path=receptor_path,
    ligand_path=sdf_path,
    gnina_log_file=docking_log_path,
)
print(df_docking)

In [None]:
# Receptors and ligands to summarise; every ligand is paired with every gene.
genes = ['AGAP006227', 'AGAP006228', 'AGAP006723', 'AGAP006724', 'AGAP006725', 'AGAP006726', 'AGAP006727']
ligands = ['4-nitrophenyl-butyrate-ester', 'deltamethrin3d']#, 'pirimiphos-methyl-oxon3d', 'malathion3d', 'cis-permethrin3d','transfluthrin3d']

dfs = []
for ligand in ligands:
    # Label used in the output table: the ligand name without a trailing "3d".
    ligand_label = ligand[:-2] if ligand.endswith("3d") else ligand
    for gene_id in genes:

        receptor_path = f"../receptors/{gene_id}.pdbqt"
        sdf_path = f"../docking/{gene_id}_{ligand}.sdf"
        docking_log_path = f"../docking/{gene_id}_{ligand}.log"

        df_docking = load_docking_results(
            receptor_path=receptor_path,
            ligand_path=sdf_path,
            gnina_log_file=docking_log_path,
        )
        print(ligand, gene_id)
        # Best CNN score first. The ligand and gene columns keep every row
        # traceable once the per-pair tables are combined.
        df = (
            df_docking
            .sort_values('CNN_score', ascending=False)
            .assign(ligand=ligand_label, gene=gene_id)
        )
        dfs.append(df)
        print(df)
        print("\n\n\n")

In [None]:
# Stack all collected result tables and write them to a single CSV file
# (the row index is written as the first column).
supplementary_table = pd.concat(dfs)
supplementary_table.to_csv("Supplementary-table-XXX.csv")