In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

!pip install pip==24.0

# Read the bioactivity table from disk and discard the ChEMBL identifier
# column; it is not among the feature columns selected later in this file.
data_path = 'bioactivity_dataset.csv'
df = pd.read_csv(data_path).drop(columns=["molecule_chembl_id"])
df.head()  # notebook-style preview of the first rows

# Scaling the attributes: standardise the four numeric descriptors in place.
from sklearn.preprocessing import StandardScaler

columns_to_standardize = ['MW', 'LogP', 'NumHDonors', 'NumHAcceptors']

# NOTE(review): the scaler is fitted on every row of `df`, i.e. before the
# train/test split performed later in this file, so held-out rows contribute
# to the scaling statistics.
scaler = StandardScaler().fit(df[columns_to_standardize])
df[columns_to_standardize] = scaler.transform(df[columns_to_standardize])

#labeling class
# Replace the textual bioactivity class with integer codes.

from sklearn.preprocessing import LabelEncoder

# Rows without a class label have to be removed *before* encoding. Otherwise
# the missing value either makes the encoder fail on mixed str/float input or
# is turned into an extra integer class (depending on the scikit-learn
# version); once encoded, the column holds no NaN any more, so the generic
# `dropna()` further down can no longer remove those rows.
df.dropna(subset=["bioactivity_class"], inplace=True)

le = LabelEncoder()
df["bioactivity_class"] = le.fit_transform(df["bioactivity_class"])

# Minority oversampling: balance the classes with SMOTENC, which accepts a
# mix of numeric features and features flagged as categorical.
#
# NOTE(review): resampling is applied to the whole dataset here, and the
# train/test/validation split later in this file is taken from the resampled
# rows, so synthetic samples can end up in the evaluation sets.

import pandas as pd
from imblearn.over_sampling import SMOTENC

# Discard every row that still has a missing value in any column.
df = df.dropna()

feature_columns = ['MW', 'LogP', 'NumHDonors', 'NumHAcceptors', 'canonical_smiles']
X = df[feature_columns]
y = df['bioactivity_class']

# SMOTENC is told which feature positions are categorical; the SMILES string
# is the only one.
categorical_features = [feature_columns.index('canonical_smiles')]

smote_nc = SMOTENC(categorical_features=categorical_features, random_state=42)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)

# Reassemble features and target into a single frame for the next step.
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled_df = pd.DataFrame(y_resampled, columns=['bioactivity_class'])
df_resampled = pd.concat([X_resampled_df, y_resampled_df], axis="columns")
df_resampled.head()

#implementation of the biot5 model

!pip install simplet5
!pip install transformers


# Turn every resampled row into a text-to-text example for T5:
#   source_text = "<SMILES> <MW> <NumHDonors> <LogP> <NumHAcceptors>"
#   target_text = the encoded bioactivity class, as a string
#
# All records are collected first and the frame is built once. Appending with
# `t5_df.loc[len(t5_df)] = ...` enlarges the frame on every iteration, which
# makes the loop quadratic in the number of rows.
t5_source_fields = ("canonical_smiles", "MW", "NumHDonors", "LogP", "NumHAcceptors")

t5_records = [
    {
        "source_text": " ".join(str(row[field]) for field in t5_source_fields),
        "target_text": str(row["bioactivity_class"]),
    }
    for _, row in df_resampled.iterrows()
]

# Passing `columns` keeps both columns present even when there are no rows.
t5_df = pd.DataFrame(t5_records, columns=["source_text", "target_text"])


from sklearn.model_selection import train_test_split

# First cut: 80 % of the examples for training, 20 % held back.
train_df, remaining_df = train_test_split(
    t5_df,
    test_size=0.2,
    random_state=42,
)

# Second cut: the held-back part is halved into test and validation sets.
test_df, valid_df = train_test_split(
    remaining_df,
    test_size=0.5,
    random_state=42,
)

# Report the (rows, columns) shape of each partition on one line.
print(*(part.shape for part in (train_df, test_df, valid_df)))


from simplet5 import SimpleT5
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Hugging Face identifier of the pre-trained checkpoint used as the starting
# point; shared by the tokenizer and the SimpleT5 wrapper below.
pretrained_name = "QizhiPei/biot5-base-dti-human"

# Create the SimpleT5 wrapper that drives training and prediction.
model = SimpleT5()

# NOTE(review): this tokenizer is not referenced again in the visible part of
# the file; SimpleT5 is only given the checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(pretrained_name, model_max_length=512)

# Load the pre-trained weights into the SimpleT5 wrapper.
model.from_pretrained(model_type="t5", model_name=pretrained_name)


# Train the model using SimpleT5's train method

# Fine-tune with SimpleT5's train method: train_df for fitting, valid_df for
# evaluation during training.
training_options = dict(
    source_max_token_len=128,
    target_max_token_len=50,
    batch_size=16,
    max_epochs=5,
    use_gpu=True,
)
model.train(train_df=train_df, eval_df=valid_df, **training_options)


from simplet5 import SimpleT5

# Reload a checkpoint saved during training into `model` before predicting.
# model_path can be found in the outputs folder in colab runtime/local machine
# after training is completed. Pick the best 'val-loss-xxx' (lower is better).
# NOTE(review): the folder name below embeds the epoch and loss values of one
# particular run, so it has to be updated by hand after every retraining.

model_path = "/content/outputs/simplet5-epoch-2-train-loss-0.0346-val-loss-0.075"
model.load_model("t5",model_path, use_gpu=True)





# Evaluate on the held-out test set: generate one prediction per source text
# and compare it with the reference label.
preds = []
for source_text in test_df["source_text"]:
    generated = model.predict(source_text)
    # SimpleT5.predict returns a list with one decoded string per returned
    # sequence. Keep only the first string so `preds` is a flat list of labels
    # comparable with `original`, rather than a list of one-element lists.
    if isinstance(generated, (list, tuple)):
        generated = generated[0]
    preds.append(generated)

# Reference labels, in the same row order as the predictions.
original = test_df["target_text"].tolist()

from sklearn.metrics import classification_report
print(classification_report(original, preds))
