**Predicting Drug-Target Interactions for COVID-19 Proteins**

The traditional drug discovery process is expensive and time-consuming, so accelerating it is crucial in responding to emerging health threats like SARS-CoV-2. This project aims to predict docking scores of candidate molecules against SARS-CoV-2 protein targets using machine learning, enabling high-throughput virtual screening and prioritization of the most promising compounds for experimental testing.

In [1]:
# imports
import pandas as pd
import numpy as np
# run once to install RDKit; comment this line out if it is already installed
!pip install rdkit
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator,DataStructs
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

Collecting rdkit
  Downloading rdkit-2025.9.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.2 kB)
Downloading rdkit-2025.9.3-cp311-cp311-manylinux_2_28_x86_64.whl (36.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 36.4/36.4 MB 59.4 MB/s eta 0:00:00
Installing collected packages: rdkit
Successfully installed rdkit-2025.9.3


In [2]:
# read the competition's train and test tables from the Kaggle input directory
train_data = pd.read_csv(
    "/kaggle/input/covid-19-bioinformatics-drug-target-challenge/train/train.csv"
)
test_data = pd.read_csv(
    "/kaggle/input/covid-19-bioinformatics-drug-target-challenge/test/test.csv"
)

In [3]:
# draw fixed-seed random subsets: 100 training rows and 50 test rows
train_data_small = train_data.sample(random_state=42, n=100)
test_data_small = test_data.sample(random_state=42, n=50)

In [4]:
# write both subsets to CSV files in the working directory, without the index column
train_data_small.to_csv("small_train_data.csv", index=False)
test_data_small.to_csv("small_test_data.csv", index=False)

In [5]:
# Morgan generators are reusable, so one is built per (radius, n_bits) pair and
# kept here instead of being reconstructed for every molecule.
_MORGAN_GENERATORS = {}


def _get_morgan_generator(radius, n_bits):
    """Return the cached Morgan fingerprint generator for (radius, n_bits)."""
    key = (radius, n_bits)
    if key not in _MORGAN_GENERATORS:
        _MORGAN_GENERATORS[key] = rdFingerprintGenerator.GetMorganGenerator(
            radius=radius, fpSize=n_bits
        )
    return _MORGAN_GENERATORS[key]


def smiles_to_morgan(smiles, radius=2, n_bits=1024):
    """Convert a SMILES string into a binary Morgan fingerprint vector.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule. Non-string values (for
        example NaN or None coming from an empty CSV cell) are treated
        like an unparsable SMILES.
    radius : int, default 2
        Morgan radius passed to the fingerprint generator.
    n_bits : int, default 1024
        Length of the fingerprint (the generator's ``fpSize``).

    Returns
    -------
    numpy.ndarray
        1-D integer array of length ``n_bits`` holding 1 at every bit that
        is set in the fingerprint and 0 elsewhere. An all-zero array is
        returned when the input is missing or cannot be parsed.
    """
    arr = np.zeros((n_bits,), dtype=int)

    # Missing values would make MolFromSmiles raise; give them the same
    # all-zero result as an unparsable SMILES.
    if not isinstance(smiles, str):
        return arr

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return arr

    fp = _get_morgan_generator(radius, n_bits).GetFingerprint(mol)  # bit vector

    # Set all "on" positions with a single indexed assignment.
    on_bits = np.fromiter(fp.GetOnBits(), dtype=np.intp)
    arr[on_bits] = 1
    return arr

In [6]:
# names of the 18 docking-score target columns (one per pocket)
target_cols = [
    '3CLPro_pocket1',
    'ADRP-ADPR_pocket1',
    'ADRP-ADPR_pocket5',
    'ADRP_pocket1',
    'ADRP_pocket12',
    'ADRP_pocket13',
    'COV_pocket1',
    'COV_pocket2',
    'COV_pocket8',
    'COV_pocket10',
    'NSP9_pocket2',
    'NSP9_pocket7',
    'NSP15_pocket1',
    'ORF7A_pocket2',
    'PLPro_chainA_pocket3',
    'PLPro_chainA_pocket23',
    'PLPro_pocket6',
    'PLPro_pocket50',
]

In [7]:
# fingerprint every sampled SMILES string; targets are taken as a float32 matrix
X_train = np.array(list(map(smiles_to_morgan, train_data_small["SMILES"])))
y_train = train_data_small[target_cols].to_numpy(dtype=np.float32)
X_test = np.array(list(map(smiles_to_morgan, test_data_small["SMILES"])))

In [8]:
# sanity check: show the first 32 bits of one fingerprint and the shape of the training matrix
first_fp = smiles_to_morgan(train_data["SMILES"].iloc[0])
print("Example fingerprint:\n", first_fp[:32])
print("Shape:", X_train.shape)

Example fingerprint:
 [0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Shape: (100, 1024)


In [9]:
# hold out 20% of the sampled training rows as a quick validation set
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [10]:
# wrap a 100-tree random forest in MultiOutputRegressor and fit it on the training split
rf = MultiOutputRegressor(
    RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
)
rf.fit(X_tr, y_tr)

In [11]:
# score the held-out rows with mean absolute error
y_val_pred = rf.predict(X_val)
val_mae = mean_absolute_error(y_true=y_val, y_pred=y_val_pred)
print("Validation MAE:", val_mae)

Validation MAE: 0.7373375058306588


In [12]:
# run the fitted model on the fingerprints of the sampled test molecules
y_test_pred = rf.predict(X_test)

In [13]:
# tabulate the predictions: SMILES first, then one column per pocket
pred_df = pd.DataFrame(data=y_test_pred, columns=target_cols)
pred_df.insert(loc=0, column="SMILES", value=test_data_small["SMILES"].to_numpy())

# show the resulting table
print(pred_df)

                                               SMILES  3CLPro_pocket1  \
0                             NC(c1ccc(cc1)Cl)CCC(C)C         -4.8974   
1                   O=C(N1CCOCC1)N1CCN(CC1)c1ccncc1Cl         -5.3440   
2                   CCC(C(=O)Nc1ccc(nc1)n1nc(cc1C)C)C         -5.1889   
3          N#Cc1cccc(c1)C(=O)N1CCN(CC1)C(=O)Cc1ccccn1         -5.8907   
4          O=C(C(=O)NCCc1onc(n1)C(C)C)Nc1ccc(c(c1)F)F         -5.0349   
5    OC(CN1C(=O)NC(C1=O)(C)c1ccc2c(c1)CCC2)COCc1ccco1         -6.4515   
6            CCN(CC1CCN(C1)C(=O)NCc1cccc(c1)n1cncn1)C         -6.4984   
7   N#CCCn1nc(c(c1C)CCC(=O)NCCNS(=O)(=O)c1ccc(cc1)C)C         -5.3405   
8     N#Cc1cc(ccc1NCCCN1CCN(CC1)c1ncccn1)[N+](=O)[O-]         -5.6417   
9          FC(COc1nccc(c1)c1noc(n1)C1CCN(S1(=O)=O)C)F         -5.3706   
10            CCOCn1c(=O)[nH]c(=O)c(c1SC1CCCCC1)C(C)C         -4.9690   
11    COc1ccc(nn1)n1ncc(c1C(F)(F)F)C(=O)Nc1cccc(c1)Cl         -5.6727   
12            COCC1(CCCN1Cc1cccc(c1)C(=O)N(C)C)C(=O

In [14]:
# order molecules by their predicted 3CLPro_pocket1 score, lowest first
pred_df_sorted = pred_df.sort_values(by="3CLPro_pocket1")
# ten lowest-scoring molecules for this pocket
print(pred_df_sorted.loc[:, ["SMILES", "3CLPro_pocket1"]].head(10))

                                               SMILES  3CLPro_pocket1
36   O=C(N1CCCc2c1ccc(c2)C(F)(F)F)CN1C(=O)CSc2c1cccc2         -6.7654
41                  COCCNC(=O)c1cccc(c1)N1CCCS1(=O)=O         -6.7215
44  O=C(Nc1cc(nn1c1ccccc1)c1ccc2c(c1)CCC2)CN1CCN(C...         -6.5256
6            CCN(CC1CCN(C1)C(=O)NCc1cccc(c1)n1cncn1)C         -6.4984
19    CC(c1ccc(cc1)c1nnc(s1)NC(=O)c1cccc(c1)n1cnnn1)C         -6.4889
5    OC(CN1C(=O)NC(C1=O)(C)c1ccc2c(c1)CCC2)COCc1ccco1         -6.4515
27       N#Cc1c(NCc2ccc(cc2)n2ncnc2)n(C)c(=O)n(c1=O)C         -6.4255
24  CN1CCN(CC1)C(=O)c1ccc2c(c1)C(=O)N(C2=O)c1ccc(c...         -6.3383
34               O=C(c1onc(c1)c1ccccc1)NC(C(=O)N)(C)C         -5.9550
3          N#Cc1cccc(c1)C(=O)N1CCN(CC1)C(=O)Cc1ccccn1         -5.8907


In [15]:
# average the predicted scores over all pockets and list the ten lowest means
pred_df["mean_score"] = pred_df.loc[:, target_cols].mean(axis="columns")
pred_df_sorted_overall = pred_df.sort_values(by="mean_score")
print(pred_df_sorted_overall.loc[:, ["SMILES", "mean_score"]].head(10))

                                               SMILES  mean_score
3          N#Cc1cccc(c1)C(=O)N1CCN(CC1)C(=O)Cc1ccccn1   -6.755117
39  CNc1ccc(cc1C(=O)NCC(=O)Nc1ccc(cc1C)Br)[N+](=O)...   -6.718522
18             O=C(C(=O)Nc1cccc(c1)c1cnco1)NC1CCCCCC1   -6.702556
35  CC/C=C\c1cc2c(cc1O)CC[C@@H]1[C@@H]2CC[C@]2([C@...   -6.680317
28         N#Cc1cccc(c1)C(=O)N1CCN(CC1)CC(=O)N1CCOCC1   -6.667728
24  CN1CCN(CC1)C(=O)c1ccc2c(c1)C(=O)N(C2=O)c1ccc(c...   -6.662939
47             O=C(CCc1c(C)n[nH]c1C)NCc1ccc2c(c1)OCO2   -6.601583
13  C=CCn1c(SCc2ccc(cc2)[N+](=O)[O-])nc2c(c1=O)c(c...   -6.596733
21  CCOC(=O)c1sc2c(c1)cc(cc2)NC(=O)N(Cc1cccnc1)CC1...   -6.595178
45               O=C(Nc1cccc(c1)C(=O)N)COc1ccc(cc1)Br   -6.586522
