# Classification experiments with molecular kernels

In [None]:
# Local imports 
from mols.mol_kernels import MolGraphKernel, MolDistanceKernel
from mols.molecule import Molecule
from dist.ot_dist_computer import OTChemDistanceComputer

# 3rd party
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:
# Load the MUV dataset from the CSV file shipped with the repository
# (path is relative to the notebook's working directory).
MUV_CSV_PATH = "../datasets/muv/muv.csv"
df = pd.read_csv(MUV_CSV_PATH)

In [None]:
# Preview the first rows of the table to inspect the column layout.
df.head()

In [None]:
# Every column except the last two is treated as a task (label) column.
# NOTE(review): presumably the last two columns are a molecule id and
# 'smiles' -- confirm against the CSV header.
tasks = df.columns[:-2].tolist()

def get_data_for_task(df, task):
    """Extract the labelled examples for a single task.

    Keeps only the ``task`` column and the ``'smiles'`` column (in that
    order), drops every row with a missing value in either of them, and
    prints the mean of the remaining ``task`` values.

    Args:
        df: DataFrame containing at least the columns ``task`` and ``'smiles'``.
        task: name of the label column to extract.

    Returns:
        A DataFrame with columns ``[task, 'smiles']`` and no missing values.
    """
    task_frame = df[[task, 'smiles']].dropna()
    positive_rate = task_frame[task].mean()
    print("Mean positive: {:.5f}".format(positive_rate))
    return task_frame

In [None]:
# Print, for every task, its name followed by the mean label value reported
# by get_data_for_task. After the loop, `task` and `data` refer to the last
# task processed.
for task in tasks:
    print(task)
    data = get_data_for_task(df, task)

In [None]:
# Restrict the experiment to the MUV-600 task for now.

data = get_data_for_task(df, "MUV-600")
# get_data_for_task returns the columns in the order [task, 'smiles']:
# column 0 holds the labels, column 1 the SMILES strings.
y = data.iloc[:, 0]
X = data.iloc[:, 1]

In [None]:
def subsample(X, y, rate):
    """Downsample the zero-labelled examples while keeping all others.

    A fraction ``rate`` of the rows whose label equals 0 is drawn at random
    *without replacement*; every row with a non-zero label is kept. The mean
    of the resulting labels is printed.

    Args:
        X: pandas object (e.g. a Series of SMILES strings) indexed like ``y``.
        y: pandas Series of labels aligned with ``X``.
        rate: fraction of zero-labelled rows to keep, in ``[0, 1]``;
            ``int(n_zeros * rate)`` rows are drawn.

    Returns:
        Tuple ``(X, y)`` restricted to the non-zero rows followed by the
        sampled zero rows.

    Raises:
        ValueError: if ``rate`` lies outside ``[0, 1]``.
    """
    if not 0 <= rate <= 1:
        raise ValueError("rate must be in [0, 1], got {!r}".format(rate))

    index_zeros = X[y == 0].index
    # np.random.choice samples WITH replacement by default, which would
    # duplicate some negatives and silently drop others; a subsample must
    # contain each selected row exactly once.
    index_zeros = np.random.choice(
        index_zeros, size=int(len(index_zeros) * rate), replace=False
    )
    index_nonzeros = X[y != 0].index
    index_all = list(index_nonzeros) + list(index_zeros)
    X = X.loc[index_all]
    y = y.loc[index_all]
    print(y.mean())
    return X, y

In [None]:
# Keep every non-zero-labelled example and a random 10% of the zero-labelled
# ones. X and y are rebound here, so re-running this cell subsamples the
# already-subsampled data again.
X, y = subsample(X, y, 0.1)

In [None]:
# Molecule sizes (atom counts) of the subsampled set, used to gauge the cost
# of the OT distance computation.
lens = [Molecule(smiles).to_rdkit().GetNumAtoms() for smiles in X]
np.mean(lens), np.std(lens)

In [None]:
def get_func_on_smiles(kernel_obj):
    """Adapt a kernel that works on ``Molecule`` objects to SMILES strings.

    Args:
        kernel_obj: object exposing ``_child_evaluate(mols_a, mols_b)``.
            NOTE(review): presumably returns the kernel matrix between the
            two molecule lists -- confirm against the kernel class.

    Returns:
        A function ``inner(xs, ys)`` that wraps every element of the two
        iterables in ``Molecule`` and forwards the resulting lists to
        ``kernel_obj._child_evaluate``.
    """
    # Looked up once, up front, so a missing method fails at wrap time.
    evaluate_on_mols = kernel_obj._child_evaluate

    def _to_molecules(smiles_seq):
        return [Molecule(smiles) for smiles in smiles_seq]

    def inner(xs, ys):
        return evaluate_on_mols(_to_molecules(xs), _to_molecules(ys))

    return inner

In [None]:
# "wl_kernel" graph kernel and its SMILES-level wrapper.
# NOTE(review): the second argument (2) is presumably the number of WL
# iterations -- confirm against MolGraphKernel.
kernel_wl = MolGraphKernel("wl_kernel", 2)
kern_func_wl = get_func_on_smiles(kernel_wl)

# Distance-based kernel built on the OT chemical distance, and its wrapper.
kernel_ot = MolDistanceKernel(
    "distance_kernel_expsum",
    dist_computer=OTChemDistanceComputer(),
    betas=[1, 1, 1, 1],
)
kern_func_ot = get_func_on_smiles(kernel_ot)

# One SVM per kernel; both are fed precomputed kernel matrices.
clf_wl, clf_ot = (SVC(gamma='auto', kernel="precomputed") for _ in range(2))

In [None]:
# 80/20 train/validation split. The labels are heavily imbalanced, so the
# split is stratified on y: an unstratified shuffle can leave the validation
# fold with very few (or no) positives, which makes roc_auc_score fail or
# become meaningless.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.8, shuffle=True, stratify=y
)

In [None]:
# Train and evaluate the SVM on the "wl_kernel" graph kernel.
# With kernel="precomputed", fit takes the train-vs-train kernel matrix and
# predict / decision_function take the validation-vs-train matrix.
G = kern_func_wl(X_train, X_train)
clf_wl.fit(G, y_train)
G_val = kern_func_wl(X_val, X_train)
p = clf_wl.predict(G_val)
# ROC AUC needs a continuous ranking score; feeding it hard 0/1 predictions
# collapses the ROC curve to a single operating point.
scores = clf_wl.decision_function(G_val)
roc_auc_score(y_val, scores), accuracy_score(y_val, p)

In [None]:
# Train and evaluate the SVM on the OT-distance kernel.
# With kernel="precomputed", fit takes the train-vs-train kernel matrix and
# predict / decision_function take the validation-vs-train matrix.
G = kern_func_ot(X_train, X_train)
clf_ot.fit(G, y_train)
G_val = kern_func_ot(X_val, X_train)
p = clf_ot.predict(G_val)
# ROC AUC needs a continuous ranking score; feeding it hard 0/1 predictions
# collapses the ROC curve to a single operating point.
scores = clf_ot.decision_function(G_val)
roc_auc_score(y_val, scores), accuracy_score(y_val, p)