# ML Model Comparison Using lazypredict

In [1]:
! pip install lazypredict



In [2]:
! pip install lightgbm



In [3]:
import lightgbm

In [4]:
import lazypredict
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split

In [5]:
# Load the descriptor table from a CSV path relative to the notebook's working directory.
# The displayed frame has PubchemFP* columns followed by a pIC50 column.
dataset = 'csv/serotonin_dataset_pIC50_pubchem.csv'
df = pd.read_csv(dataset)
# Bare last expression: show the frame as this cell's rich output.
df

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,pIC50
0,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.94
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.38
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.48
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.56
4,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,8.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2829,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.37
2830,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.87
2831,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,6.06
2832,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,7.89


In [6]:
# Target vector: the pIC50 column; feature matrix: every remaining column.
y = df['pIC50']
x = df.drop(columns='pIC50')

# Cap both at the first 3000 rows (positional slice), keeping x and y aligned.
x, y = x.iloc[:3000], y.iloc[:3000]

# Data Pre-Processing

In [7]:
# Show the feature matrix dimensions as (rows, columns) before feature selection.
x.shape

(2834, 881)

In [8]:
# Remove low-variance features.
from sklearn.feature_selection import VarianceThreshold

# Cut-off is p * (1 - p) with p = 0.8; columns whose variance does not exceed it are dropped.
selection = VarianceThreshold(threshold=0.8 * (1 - 0.8))

# fit_transform learns the per-column variances and returns only the retained columns.
x = selection.fit_transform(x)

# Show the reduced dimensions as (rows, columns).
x.shape

(2834, 145)

In [9]:
# Data split (80:20 ratio): hold out 20% of the rows for testing.
# The fixed random_state keeps the split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Number of rows assigned to the training split.
len(x_train)

2267

In [None]:
# Defines and builds the LazyRegressor.
reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)

# LazyRegressor.fit(X_train, X_test, y_train, y_test) returns a pair whose first
# element is the per-model metrics table, scored on the data passed in the
# X_test / y_test positions. One call therefore cannot produce separate training
# and test tables, so fit once per evaluation set and keep only the metrics table.
# NOTE: this runs the full model suite twice, roughly doubling the cell's runtime.

# Training-set performance: score on the same 80% subset used for fitting.
train = reg.fit(x_train, x_train, y_train, y_train)[0]

# Test-set performance: score on the held-out 20% subset.
test = reg.fit(x_train, x_test, y_train, y_test)[0]

 64%|███████████████████████████▋               | 27/42 [00:11<00:08,  1.83it/s]

In [None]:
# Performance table of the training set (80% subset)
# `train` comes from the LazyRegressor fit cell; as the bare last expression
# it is rendered as this cell's rich output.
train

In [None]:
# Performance table of the test set (20% subset)
# `test` comes from the LazyRegressor fit cell; as the bare last expression
# it is rendered as this cell's rich output.
test

# Data Visualization of Model Performance

In [None]:
# Bar plot of R-squared values: one horizontal bar per row (model) of `train`.

# Apply the theme before creating the figure so the figure and its axes both use it.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(5, 10))
sns.barplot(y=train.index, x="R-Squared", data=train, ax=ax)

# The axis is limited to [0, 1]; any negative R-squared value falls outside the
# visible range and shows no bar.
# The trailing semicolon suppresses the repr of ax.set(...) in the cell output.
ax.set(xlim=(0, 1), xlabel="R-Squared", ylabel="Model", title="Model comparison: R-squared");

In [None]:
# Bar plot of RMSE values: one horizontal bar per row (model) of `train`.

# Apply the theme before creating the figure so the figure and its axes both use it.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(5, 10))
sns.barplot(y=train.index, x="RMSE", data=train, ax=ax)

# The axis is limited to [0, 10]; bars for larger RMSE values are cut off at the edge.
# The trailing semicolon suppresses the repr of ax.set(...) in the cell output.
ax.set(xlim=(0, 10), xlabel="RMSE", ylabel="Model", title="Model comparison: RMSE");

In [None]:

# Bar plot of calculation time: one horizontal bar per row (model) of `train`.

# Apply the theme before creating the figure so the figure and its axes both use it.
sns.set_theme(style="whitegrid")
fig, ax = plt.subplots(figsize=(5, 10))
sns.barplot(y=train.index, x="Time Taken", data=train, ax=ax)

# The axis is limited to [0, 10]; bars for slower models are cut off at the edge.
# NOTE(review): "Time Taken" is presumably in seconds - confirm against lazypredict.
# The trailing semicolon suppresses the repr of ax.set(...) in the cell output.
ax.set(xlim=(0, 10), xlabel="Time Taken", ylabel="Model", title="Model comparison: calculation time");