In [None]:
# Cell 1: Imports and setup
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split


In [None]:
# Cell 2: Load data
# Path is relative to the directory the notebook kernel is started from.
file_path = r"data/raw/drug_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows as the cell output.
df.head(n=5)


In [None]:
# Cell 3: Explore the dataset
# Text summary: size, columns, per-column missing counts, and label balance.
missing_per_column = df.isnull().sum()
activity_counts = df['activity'].value_counts()

print("Dataset shape:", df.shape)
print("\nColumn names:", df.columns)
print("\nMissing values:\n", missing_per_column)
print("\nActivity distribution:\n", activity_counts)

# Bar chart of how many rows fall into each activity class.
sns.countplot(x='activity', data=df).set_title("Activity Class Distribution")
plt.show()


In [None]:
# Cell 4: Basic featurization using RDKit (e.g., Molecular Weight, LogP)
def featurize(smiles):
    """Compute four RDKit descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of one molecule. Any non-string value
        (e.g. ``None`` or the float NaN pandas uses for an empty CSV cell)
        is treated the same way as an unparseable SMILES.

    Returns
    -------
    list
        ``[MolWt, MolLogP, NumHDonors, NumHAcceptors]`` for the molecule, or
        four NaNs when the input is missing or RDKit cannot parse it.
    """
    # Missing cells reach us as float NaN / None; MolFromSmiles expects a
    # string, so short-circuit instead of letting one blank cell abort the
    # whole .apply().
    if not isinstance(smiles, str):
        return [np.nan] * 4

    mol = Chem.MolFromSmiles(smiles)
    # MolFromSmiles signals a parse failure by returning None.
    if mol is None:
        return [np.nan] * 4

    return [
        Descriptors.MolWt(mol),
        Descriptors.MolLogP(mol),
        Descriptors.NumHDonors(mol),
        Descriptors.NumHAcceptors(mol)
    ]

# Descriptor column names, in the order featurize() returns the values.
feature_cols = ['MolWt', 'LogP', 'HDonors', 'HAcceptors']

features = df['smiles'].apply(featurize)
# Reuse df's index so the column-wise concat lines rows up by label even if
# df was filtered or re-indexed before this cell.
features_df = pd.DataFrame(features.tolist(), columns=feature_cols, index=df.index)
# Drop only the rows that are unusable for modelling (failed featurization or
# a missing label); a NaN in an unrelated column should not discard the row.
df_features = pd.concat([df, features_df], axis=1).dropna(subset=feature_cols + ['activity'])
df_features.head()


In [None]:
# Cell 5: Split the data
X = df_features[['MolWt', 'LogP', 'HDonors', 'HAcceptors']]
y = df_features['activity']

# Stratify so train and test keep the same class proportions as the full set;
# an unstratified split can leave a minority class out of the test set.
# Stratification needs at least two samples per class, so fall back to the
# plain random split when the rarest class has fewer than that.
stratify_labels = y if y.value_counts().min() >= 2 else None

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_labels
)


In [None]:
# Cell 6: Train RandomForest model
# fit() returns the estimator itself, so construction and training can be chained.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Report overall accuracy followed by the per-class metrics on the held-out split.
print(
    "Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)


In [None]:
# Cell 7: Confusion Matrix
# Pin rows/columns to the classes the model was trained on so the layout does
# not depend on which labels happen to appear in the test split, and so the
# ticks show class names rather than positional indices 0..n-1.
class_labels = list(clf.classes_)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)

fig, ax = plt.subplots()
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
# Cell 8: Save the model (optional)
model_path = 'models/drug_model.pkl'
# joblib.dump does not create missing directories; make sure the target
# folder exists so a fresh checkout can run this cell without a
# FileNotFoundError.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, model_path)
