In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install rdkit scikit-learn tensorflow gradio

Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m36.2/36.2 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


In [3]:
import gradio as gr
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random

In [12]:
from rdkit import Chem
import random
import pandas as pd

def generate_patterned_smiles(n=100):
    """Generate molecules with simple structure-activity patterns."""
    bases = ["C", "CC", "CCC", "CCCC", "CCCCC", "C1CCCCC1"]  # Carbon skeletons
    functional_groups = ["O", "N", "Cl", "Br", "CO", "CN", "C=O", "C#N", "CCO", "CCN"]
    smiles = []
    activity = []

    for _ in range(n):
        base = random.choice(bases)
        fg = random.choice(functional_groups)
        smi = base + fg

        # Activity rule: if molecule has O or N ‚Üí more likely active
        if any(atom in smi for atom in ["O", "N"]):
            label = random.choices([1, 0], weights=[0.8, 0.2])[0]
        else:
            label = random.choices([0, 1], weights=[0.8, 0.2])[0]

        smiles.append(smi)
        activity.append(label)

    df = pd.DataFrame({"smiles": smiles, "activity": activity})
    return df

# Generate dataset
df = generate_patterned_smiles(200)  # You can change 200 ‚Üí 500 for even better performance
df.to_csv("patterned_molecules.csv", index=False)
print(f"‚úÖ Generated {len(df)} patterned molecules")
df#.head()


‚úÖ Generated 200 patterned molecules


Unnamed: 0,smiles,activity
0,CCl,0
1,CCO,1
2,CC=O,1
3,CCCCO,1
4,CCCCO,1
...,...,...
195,CCCO,1
196,CCO,1
197,CCCCCCCN,1
198,CCCN,1


In [13]:
from rdkit import Chem
from rdkit.Chem.rdFingerprintGenerator import GetMorganGenerator
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np

# --- 1Ô∏è‚É£ Clean invalid SMILES ---
def clean_smiles(smiles_list):
    valid = []
    for s in smiles_list:
        mol = Chem.MolFromSmiles(s)
        if mol is not None:  # RDKit returns None if SMILES is invalid
            valid.append(s)
    return valid

# Filter dataset
valid_smiles = clean_smiles(df["smiles"])
df = df[df["smiles"].isin(valid_smiles)]
print(f"‚úÖ Cleaned dataset ‚Äî {len(df)} valid molecules remain out of {len(valid_smiles)}")

# --- 2Ô∏è‚É£ Generate molecular fingerprints ---
morgan = GetMorganGenerator(radius=2, fpSize=1024)

def smiles_to_fp(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol:
        return np.array(morgan.GetFingerprint(mol))
    return None

fps = []
activities = []

for s, act in zip(df.smiles, df.activity):
    fp = smiles_to_fp(s)
    if fp is not None:
        fps.append(fp)
        activities.append(act)

X = np.array(fps)
y = np.array(activities)

print(f"Fingerprint matrix shape: {X.shape}")

# --- 3Ô∏è‚É£ Split data ---
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# --- 4Ô∏è‚É£ Train Random Forest QSAR model ---
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# --- 5Ô∏è‚É£ Evaluate performance ---
acc = rf_model.score(X_test, y_test)
print(f"‚úÖ QSAR Model trained successfully!\nModel accuracy on test set: {acc:.2f}")


‚úÖ Cleaned dataset ‚Äî 200 valid molecules remain out of 200
Fingerprint matrix shape: (200, 1024)
‚úÖ QSAR Model trained successfully!
Model accuracy on test set: 0.72


In [15]:
smiles_list = df.smiles.tolist()
chars = sorted(list(set(''.join(smiles_list))))
char_to_idx = {c: i+1 for i, c in enumerate(chars)}
idx_to_char = {i+1: c for i, c in enumerate(chars)}
vocab_size = len(chars) + 1
max_len = max(len(s) for s in smiles_list)

X_seq, y_seq = [], []
for s in smiles_list:
    for i in range(1, len(s)):
        seq_in = s[:i]
        seq_out = s[i]
        X_seq.append([char_to_idx[c] for c in seq_in])
        y_seq.append(char_to_idx[seq_out])

X_seq = pad_sequences(X_seq, maxlen=max_len, padding='pre')
y_seq = to_categorical(y_seq, num_classes=vocab_size)

gen_model = Sequential([
    Embedding(vocab_size, 64),
    LSTM(128),
    Dense(vocab_size, activation='softmax')
])
gen_model.compile(loss='categorical_crossentropy', optimizer='adam')
gen_model.fit(X_seq, y_seq, epochs=50, verbose=0)

<keras.src.callbacks.history.History at 0x7ecf1f7de5a0>

In [16]:
def generate_smiles(seed="C", length=10):
    result = seed
    for _ in range(length):
        encoded = [char_to_idx.get(c, 0) for c in result]
        encoded = pad_sequences([encoded], maxlen=max_len, padding='pre')
        pred = gen_model.predict(encoded, verbose=0)
        next_idx = np.argmax(pred)
        next_char = idx_to_char.get(next_idx, "")
        if next_char == "":
            break
        result += next_char
    return result

def predict_activity(smiles):
    fp = smiles_to_fp(smiles)
    if fp is None:
        return "Invalid SMILES"
    prob = rf_model.predict_proba([fp])[0, 1]
    return f"Predicted activity: {prob:.2f}"


In [17]:
def ai_drug_discovery(seed_smiles):
    new_smiles = generate_smiles(seed_smiles)
    activity = predict_activity(new_smiles)
    return f"üß¨ Generated Molecule: {new_smiles}\nüîπ {activity}"

demo = gr.Interface(
    fn=ai_drug_discovery,
    inputs=gr.Textbox(label="Enter a seed SMILES"),
    outputs=gr.Textbox(label="Generated Molecule and Predicted Activity"),
    title="AI-Driven Drug Discovery (De Novo Molecule Design)",
    description="This app uses a Generative LSTM to create new molecules and a QSAR model (Random Forest) to predict their biological activity."
)

demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8cb69e1eb8050eecce.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


