## Import the necessary libraries and data

In [91]:
import os
import pandas as pd
import numpy as np
import re
import seaborn as sb

In [92]:
# Column labels for the three fixed-width INDEX_refined_*.2020 files.
# NOTE: the year column is spelled 'release_year' in the data table but
# 'release year' in the other two; later cells rely on these exact labels.
col_names_data = [
    'pdb_code', 'resolution', 'release_year', 'log(value)', 'type', 'value',
]
col_names_name = [
    'pdb_code', 'release year', 'uniprot_id', 'protein_name',
]
col_names_set = [
    'pdb_code', 'resolution', 'release year', 'type', 'value',
]

# Each file is parsed by character position; the first 6 lines are skipped
# (presumably a comment header - verify against the raw files).
df_data = pd.read_fwf(
    'INDEX_refined_data.2020',
    colspecs=[(0, 5), (6, 10), (11, 16), (16, 23), (24, 27), (28, 60)],
    names=col_names_data,
    skiprows=6,
)
df_name = pd.read_fwf(
    'INDEX_refined_name.2020',
    colspecs=[(0, 5), (6, 10), (11, 18), (18, 60)],
    names=col_names_name,
    skiprows=6,
)
df_set = pd.read_fwf(
    'INDEX_refined_set.2020',
    colspecs=[(0, 5), (6, 10), (11, 16), (17, 20), (21, 60)],
    names=col_names_set,
    skiprows=6,
)

## First look

In [93]:
# Preview the first rows of df_data, ordered by PDB code.
df_data.sort_values(by='pdb_code').head()

Unnamed: 0,pdb_code,resolution,release_year,log(value),type,value
2615,10gs,2.2,1998,6.4,Ki,0.4uM // 10gs.pdf (VWW)
1158,184l,1.8,1995,4.72,Kd,19uM // 184l.pdf (I4B)
399,185l,1.8,1995,3.54,Kd,290uM // 185l.pdf (IND)
1252,186l,1.8,1995,4.85,Kd,14uM // 186l.pdf (N4B)
325,187l,1.8,1995,3.37,Kd,422uM // 187l.pdf (PXY)


In [94]:
# Preview the first rows of df_name, ordered by PDB code.
df_name.sort_values(by='pdb_code').head()

Unnamed: 0,pdb_code,release year,uniprot_id,protein_name
3967,10gs,1998,P09211,GLUTATHIONE S-TRANSFERASE P1-1
48,184l,1995,P00720,T4 LYSOZYME
43,185l,1995,P00720,T4 LYSOZYME
49,186l,1995,P00720,T4 LYSOZYME
41,187l,1995,P00720,T4 LYSOZYME


In [95]:
# Preview the first rows of df_set, ordered by PDB code.
df_set.sort_values(by='pdb_code').head()

Unnamed: 0,pdb_code,resolution,release year,type,value
250,10gs,2.2,1998,Ki,0.4uM // 10gs.pdf (VWW)
88,184l,1.8,1995,Kd,19uM // 184l.pdf (I4B) ligand is
90,185l,1.8,1995,Kd,290uM // 185l.pdf (IND) ligand is
86,186l,1.8,1995,Kd,14uM // 186l.pdf (N4B) ligand is
93,187l,1.8,1995,Kd,422uM // 187l.pdf (PXY) ligand is


## Merge of datasets

Merge all datasets by identity column ("pdb_code") 

In [96]:
# Join the three index tables on the shared 'pdb_code' key (pandas' default
# inner join). Non-key columns that share a name receive the default
# '_x' / '_y' suffixes.
data = df_data.merge(df_name, on='pdb_code')
data = data.merge(df_set, on='pdb_code')
data.head()

Unnamed: 0,pdb_code,resolution_x,release_year,log(value),type_x,value_x,release year_x,uniprot_id,protein_name,resolution_y,release year_y,type_y,value_y
0,2r58,2.0,2007,2.0,Kd,10mM // 2r58.pdf (MLY),2007,Q9VHA0,POLYCOMB PROTEIN SCM,2.0,2007,Kd,10mM // 2r58.pdf (MLY)
1,3c2f,2.35,2008,2.0,Kd,10.1mM // 3c2f.pdf (PRP),2008,P43619,NICOTINATE-NUCLEOTIDE PYROPHOSPHORYLASE,2.35,2008,Kd,10.1mM // 3c2f.pdf (PRP) Kd=10.1+/-
2,3g2y,1.31,2009,2.0,Ki,10mM // 3g2y.pdf (GF4),2009,Q9L5C8,BETA-LACTAMASE CTX-M-9A,1.31,2009,Ki,10mM // 3g2y.pdf (GF4)
3,3pce,2.06,1998,2.0,Ki,10mM // 3pce.pdf (3HP),1998,P00436,"PROTOCATECHUATE 3,4-DIOXYGENASE",2.06,1998,Ki,10mM // 3pce.pdf (3HP)
4,4qsu,1.9,2014,2.0,Kd,10mM // 4qsu.pdf (TDR),2014,Q6PL18,ATPASE FAMILY AAA DOMAIN-CONTAINING PROT,1.9,2014,Kd,10mM // 4qsu.pdf (TDR) ligand is


In [97]:
# List the merged column names; overlapping columns carry _x / _y suffixes.
data.columns

Index(['pdb_code', 'resolution_x', 'release_year', 'log(value)', 'type_x',
       'value_x', 'release year_x', 'uniprot_id', 'protein_name',
       'resolution_y', 'release year_y', 'type_y', 'value_y'],
      dtype='object')

Remove uninformative and duplicated columns

In [98]:
# Columns duplicated by the merge (_x / _y pairs) plus metadata that the
# following cells do not use.
redundant_columns = [
    'resolution_y',
    'resolution_x',
    'release_year',
    'uniprot_id',
    'release year_x',
    'release year_y',
    'value_y',
    'type_y',
]
data = data.drop(columns=redundant_columns)

In [99]:
# Inspect up to the first 20 rows after dropping the redundant columns.
data.head(20)

Unnamed: 0,pdb_code,log(value),type_x,value_x,protein_name
0,2r58,2.0,Kd,10mM // 2r58.pdf (MLY),POLYCOMB PROTEIN SCM
1,3c2f,2.0,Kd,10.1mM // 3c2f.pdf (PRP),NICOTINATE-NUCLEOTIDE PYROPHOSPHORYLASE
2,3g2y,2.0,Ki,10mM // 3g2y.pdf (GF4),BETA-LACTAMASE CTX-M-9A
3,3pce,2.0,Ki,10mM // 3pce.pdf (3HP),"PROTOCATECHUATE 3,4-DIOXYGENASE"
4,4qsu,2.0,Kd,10mM // 4qsu.pdf (TDR),ATPASE FAMILY AAA DOMAIN-CONTAINING PROT
5,4qsv,2.0,Kd,10mM // 4qsv.pdf (THM),ATPASE FAMILY AAA DOMAIN-CONTAINING PROT
6,4u54,2.06,Kd,8.7mM // 4u54.pdf (3C5),IMPORTIN SUBUNIT ALPHA-1
7,3ao4,2.07,Kd,8.5mM // 3ao1.pdf (833),HIV-1 INTEGRASE
8,4cs9,2.1,Kd,8mM // 4cs8.pdf (AMP),M2-1
9,2w8w,2.12,Kd,7.5mM // 2w8j.pdf (PLS),SERINE PALMITOYLTRANSFERASE


## Data correction

In [100]:
# Only one 'type' column remains after the drop, so remove the merge suffix.
data = data.rename(columns={'type_x': 'type'}) 

Split `value_x` into a numeric value and its unit for each complex

In [101]:
# value_x looks like "10mM // 2r58.pdf (MLY)": the first run of letters is the
# unit and the first run of digits/dots is the magnitude.
raw_affinity = data['value_x']
data['units'] = raw_affinity.apply(
    lambda text: re.search(r'[a-zA-Z]+', text).group(0)
)
magnitude_text = raw_affinity.apply(
    lambda text: re.search(r'[0123456789.]+', text).group(0)
)
data['value'] = magnitude_text.astype(float)

In [102]:
# The raw string is no longer needed once 'units' and 'value' are extracted.
data = data.drop(columns=['value_x'])

In [103]:
# Inspect up to the first 20 rows with the separated 'units' and 'value' columns.
data.head(20)

Unnamed: 0,pdb_code,log(value),type,protein_name,units,value
0,2r58,2.0,Kd,POLYCOMB PROTEIN SCM,mM,10.0
1,3c2f,2.0,Kd,NICOTINATE-NUCLEOTIDE PYROPHOSPHORYLASE,mM,10.1
2,3g2y,2.0,Ki,BETA-LACTAMASE CTX-M-9A,mM,10.0
3,3pce,2.0,Ki,"PROTOCATECHUATE 3,4-DIOXYGENASE",mM,10.0
4,4qsu,2.0,Kd,ATPASE FAMILY AAA DOMAIN-CONTAINING PROT,mM,10.0
5,4qsv,2.0,Kd,ATPASE FAMILY AAA DOMAIN-CONTAINING PROT,mM,10.0
6,4u54,2.06,Kd,IMPORTIN SUBUNIT ALPHA-1,mM,8.7
7,3ao4,2.07,Kd,HIV-1 INTEGRASE,mM,8.5
8,4cs9,2.1,Kd,M2-1,mM,8.0
9,2w8w,2.12,Kd,SERINE PALMITOYLTRANSFERASE,mM,7.5


## Protein selection:

We cannot utilize all proteins for our subsequent model due to the following reasons:

- Proteins have diverse structures and functions. Different protein systems may possess distinct binding sites, interaction mechanisms, and chemical environments. Consequently, a model trained on one protein system may not generalize well to other systems due to these structural variations.

- Ligand-protein interactions are highly specific. Ligands bind to proteins based on complementary shapes, electrostatic interactions, hydrogen bonding, and hydrophobic interactions. Therefore, a model trained on one protein-ligand interaction may not accurately predict interactions with other proteins due to differences in binding site architecture and chemical properties.

- Models trained on diverse datasets tend to generalize better. However, if the dataset consists of bioactivity data from various protein systems, the model may struggle to learn meaningful patterns that are applicable across different systems. This may compromise the model's predictive performance.

To address this, let's select the most frequently occurring protein from our sample:

In [104]:
# Ten most frequent protein names in the merged table, with their row counts.
data.value_counts('protein_name').head(10)

protein_name
HIV-1 PROTEASE                      303
CARBONIC ANHYDRASE 2                291
HEAT SHOCK PROTEIN HSP90-ALPHA       94
BROMODOMAIN-CONTAINING PROTEIN 4     69
TRYPSIN                              64
BETA-SECRETASE 1                     48
COAGULATION FACTOR XA                46
CASEIN KINASE II, ALPHA SUBUNIT      43
THROMBIN                             38
TRANSTHYRETIN                        37
Name: count, dtype: int64

In [105]:
# Keep only the HIV-1 protease rows, order them by PDB code and renumber the
# index from 0.
is_hiv_protease = data['protein_name'] == 'HIV-1 PROTEASE'
data_hiv = (
    data.loc[is_hiv_protease]
    .sort_values(by='pdb_code')
    .reset_index(drop=True)
)

## Units

The units include uM, nM, and pM. Convert all of them to a unified format, nM.

In [106]:
# Which constant types occur among the HIV-1 protease rows?
data_hiv['type'].unique()

array(['Ki', 'Kd'], dtype=object)

In [107]:
# Which concentration units occur among the HIV-1 protease rows?
data_hiv['units'].unique()

array(['uM', 'nM', 'pM'], dtype=object)

In [108]:
# Multiplicative factor that turns a value expressed in the keyed unit into nM.
conv_dict = {'pM': 0.001, 'nM': 1, 'uM': 1000}


def convertor_to_nM(units, values):
    """Convert concentrations to nanomolar (nM).

    Parameters
    ----------
    units : sequence of str
        Unit label of each value; every label must be a key of ``conv_dict``
        ('pM', 'nM' or 'uM'). May be a list or a pandas Series.
    values : sequence of numbers
        Magnitudes, in the same order as ``units``.

    Returns
    -------
    list
        ``values[k] * conv_dict[units[k]]`` for each position ``k``.

    Raises
    ------
    ValueError
        If ``units`` and ``values`` differ in length.
    KeyError
        If a unit label is not present in ``conv_dict``.
    """
    if len(units) != len(values):
        raise ValueError(
            f"units and values must have the same length "
            f"(got {len(units)} and {len(values)})"
        )
    # Pair the inputs by position. Indexing a pandas Series with an integer
    # (units[i] / values[i]) is a label lookup, so it only works when the
    # index is exactly 0..n-1 and fails on filtered or re-indexed Series.
    return [value * conv_dict[unit] for unit, value in zip(units, values)]

In [109]:
# Express every value in nM, then discard the original unit/value pair.
values_in_nM = convertor_to_nM(data_hiv['units'], data_hiv['value'])
data_hiv = (
    data_hiv
    .assign(value_nM=values_in_nM)
    .drop(columns=['units', 'value'])
)

In [110]:
# Preview the first 10 rows with the unified value_nM column.
data_hiv.head(10)

Unnamed: 0,pdb_code,log(value),type,protein_name,value_nM
0,1a30,4.3,Ki,HIV-1 PROTEASE,50000.0
1,1a94,7.85,Ki,HIV-1 PROTEASE,14.0
2,1a9m,6.92,Ki,HIV-1 PROTEASE,119.0
3,1aaq,8.4,Ki,HIV-1 PROTEASE,4.0
4,1aid,4.82,Ki,HIV-1 PROTEASE,15000.0
5,1ajv,7.72,Ki,HIV-1 PROTEASE,19.1
6,1ajx,7.91,Ki,HIV-1 PROTEASE,12.2
7,1b6j,7.92,Ki,HIV-1 PROTEASE,12.0
8,1b6k,8.74,Ki,HIV-1 PROTEASE,1.8
9,1b6l,8.3,Ki,HIV-1 PROTEASE,5.0


## Establishment of target constant

Using a mix of Ki (inhibitory constant) and Kd (dissociation constant) values as a single target in predictive modeling for bioactivity can be problematic for several reasons:

 - Variability in Measurement: Ki and Kd values are measured differently and may have varying levels of experimental uncertainty. Combining these values into a single target variable could obscure the true relationship between ligand properties and bioactivity, leading to a less reliable predictive model.

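 - Different Interpretations: Ki and Kd values have different interpretations in terms of ligand-receptor interactions. Ki is the equilibrium dissociation constant of the enzyme-inhibitor complex and is typically derived from enzyme inhibition kinetics (it should not be confused with IC50, the inhibitor concentration that reduces enzyme activity by 50% under specific assay conditions), while Kd values represent the concentration of ligand required to occupy 50% of receptor binding sites at equilibrium, usually measured in direct binding assays. Attempting to treat them interchangeably as a single target may oversimplify the underlying biology and lead to misleading conclusions.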

 - Model Performance: Combining Ki and Kd values as a single target may result in a model that performs poorly in predicting bioactivity. Since Ki and Kd values measure different aspects of ligand-receptor interactions, a model trained on a mixed dataset may struggle to capture the nuanced relationships between ligand features and bioactivity accurately.

Overall, it is generally preferable to use a consistent and well-defined target variable in predictive modeling for bioactivity prediction.
As in the case of selecting a protein, let's select the most frequently occurring constant.

In [111]:
# Number of HIV-1 protease rows per constant type.
data_hiv.value_counts('type')

type
Ki    260
Kd     43
Name: count, dtype: int64

In [112]:
# Retain only the Ki measurements (the more frequent type in the counts above)
# and report the resulting shape.
is_ki = data_hiv['type'] == 'Ki'
data_hiv = data_hiv.loc[is_ki]
data_hiv.shape

(260, 5)

## Save to csv

In [113]:
# Write the Ki-only HIV-1 protease table to disk. index=False is not passed,
# so the DataFrame index (no longer contiguous after the Ki filter) is written
# as the first, unnamed CSV column.
data_hiv.to_csv('target_ki_hiv.csv')