In [4]:
# Attach Google Drive to the Colab filesystem so later cells can read and write under it.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import autokeras as ak
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score


# -------------------------------
# 1. Load Full Dataset
# -------------------------------
# Read the descriptor table from the mounted Drive folder.
data_path = '/content/drive/MyDrive/BTP_3/mol_3d_descriptors_final.csv'
df = pd.read_csv(data_path)
print(f"Full dataset loaded with shape: {df.shape}")

# Label column from which the binary target is derived further down.
target = "Potency_Change_Label"

# Columns that must not be used as model inputs.
non_feature_cols = [
    "canonical_smiles_1",
    "canonical_smiles_2",
    "Potency_Change",
    "Potency_Change_Category",
    "Potency_Change_Label",
]

# Every remaining column, in file order, is a candidate feature.
features = df.columns[~df.columns.isin(non_feature_cols)].tolist()

# -------------------------------
# 2. Apply Variance Threshold
# -------------------------------
# Coerce every candidate feature to a number (unparseable entries become NaN),
# then discard columns that contain no numeric value at all.
X_all_numeric = df[features].apply(pd.to_numeric, errors='coerce')
X_all_numeric = X_all_numeric.dropna(axis=1, how='all')

# Per-column variance; only columns whose variance exceeds 0.8 are retained.
variances = X_all_numeric.var()
high_variance_mask = variances > 0.8
var_thresh_features = variances[high_variance_mask].index.tolist()
print(f"{len(var_thresh_features)} features have variance > 0.8.")

# Keep only names that also appear in the original feature list (order preserved).
known_features = set(features)
final_features = [name for name in var_thresh_features if name in known_features]
print(f"Using {len(final_features)} features after applying variance threshold.")

# -------------------------------
# 3. Create Binary Target
# -------------------------------
# Original encoding of the label column:
#   0 → Large Negative Change,    1 → Large Positive Change,
#   2 → Moderate Negative Change, 3 → Moderate Positive Change,
#   4 → Small Negative Change,    5 → Small Positive Change
# Negative changes (0, 2, 4) map to 0 (improved potency).
# Positive changes (1, 3, 5) map to 1 (reduced potency).
# The even/odd pattern of the codes lets the mapping be built from the parity.
# Codes outside 0-5 are not in the mapping, so .map() turns them into NaN.
binary_mapping = {code: code % 2 for code in range(6)}
df['binary_label'] = df[target].map(binary_mapping)

print("Binary target value counts:")
print(df['binary_label'].value_counts())

# -------------------------------
# 4. Subset Data to Selected Features and Binary Target
# -------------------------------
# Independent copy holding only the retained features plus the binary label.
subset_columns = [*final_features, 'binary_label']
df_subset = df.loc[:, subset_columns].copy()
print(f"Subset data shape: {df_subset.shape}")

# Persist the subset to the mounted Drive folder (no index column is written).
subset_csv_path = "/content/drive/MyDrive/BTP_3/df_subset.csv"
df_subset.to_csv(subset_csv_path, index=False)
print("Saved df_subset to /content/drive/MyDrive/BTP_3/df_subset.csv")




# -------------------------------
# 5. Split Data for Training and Testing
# -------------------------------
X = df_subset[final_features]
y = df_subset['binary_label']

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training set: {X_train.shape[0]} rows; Testing set: {X_test.shape[0]} rows.")


def _numeric_complete_rows(frame):
    """Coerce all columns to numeric and keep only the rows without any NaN."""
    return frame.apply(pd.to_numeric, errors='coerce').dropna()


X_train_clean = _numeric_complete_rows(X_train)
X_test_clean = _numeric_complete_rows(X_test)

# Keep only the labels whose feature rows survived the NaN filter.
y_train_clean = y_train.loc[X_train_clean.index]
y_test_clean = y_test.loc[X_test_clean.index]

# AutoKeras is fed plain NumPy arrays.
X_train_np, X_test_np = X_train_clean.to_numpy(), X_test_clean.to_numpy()
y_train_np, y_test_np = y_train_clean.to_numpy(), y_test_clean.to_numpy()

# -------------------------------
# Build IO pipeline for AutoKeras
# -------------------------------
# Minimal search graph: a generic input node wired straight into a classification head.
input_node = ak.Input()
output_node = ak.ClassificationHead()(input_node)

clf = ak.AutoModel(
    inputs=input_node,
    outputs=output_node,
    overwrite=True,   # start a fresh search instead of reloading an existing project
    max_trials=10,    # upper bound on the number of candidate models tried
    seed=42,          # fixed seed (same value as the split) so reruns are comparable
)

clf.fit(X_train_np, y_train_np, epochs=100)

# Evaluate the exported Keras model on the held-out rows.
model = clf.export_model()
y_pred_proba = model.predict(X_test_np)

# Class prediction
if y_pred_proba.ndim > 1 and y_pred_proba.shape[1] > 1:
    # One column per class: pick the most likely class, score AUC on column 1.
    y_pred = np.argmax(y_pred_proba, axis=1)
    auc_score = roc_auc_score(y_test_np, y_pred_proba[:, 1])
else:
    # Single-column output: flatten to 1-D so predictions and scores have the
    # same shape as y_test_np, then threshold at 0.5.
    positive_scores = np.ravel(y_pred_proba)
    y_pred = (positive_scores > 0.5).astype(int)
    auc_score = roc_auc_score(y_test_np, positive_scores)

# Accuracy
acc = accuracy_score(y_test_np, y_pred)

print(f"Test accuracy: {acc:.4f}")
# The AUC was computed but never reported; print it like the later training cell does.
print(f"ROC AUC score: {auc_score:.4f}")


# Export best model
model.save("autokeras_io_model.keras")
print("Saved model to autokeras_io_model.keras")

Trial 10 Complete [00h 00m 10s]
val_loss: 87.6564712524414

Best val_loss So Far: 81.5947265625
Total elapsed time: 00h 01m 11s
Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.5753 - loss: 1915.0156 
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.6004 - loss: 549.3397
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.4247 - loss: 1013.1636
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.4247 - loss: 1240.9646
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.4247 - loss: 614.7989
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5753 - loss: 307.3719
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - accuracy: 0.5753 - loss: 626.3760
Epoch 8/100
[1m2/2[0m [32

  saveable.load_own_variables(weights_store.get(inner_path))
Expected: ['keras_tensor']
Received: inputs=Tensor(shape=(25, 1332))


Test accuracy: 0.6400
Saved model to autokeras_io_model.keras


In [None]:
pip uninstall autokeras tensorflow keras -y

Found existing installation: autokeras 1.0.20
Uninstalling autokeras-1.0.20:
  Successfully uninstalled autokeras-1.0.20
Found existing installation: tensorflow 2.18.1
Uninstalling tensorflow-2.18.1:
  Successfully uninstalled tensorflow-2.18.1
Found existing installation: keras 3.8.0
Uninstalling keras-3.8.0:
  Successfully uninstalled keras-3.8.0


In [None]:
pip install autokeras==1.0.20 tensorflow==2.12 keras==2.12


Collecting tensorflow==2.12
  Downloading tensorflow-2.12.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting keras==2.12
  Using cached keras-2.12.0-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting gast<=0.4.0,>=0.2.1 (from tensorflow==2.12)
  Downloading gast-0.4.0-py3-none-any.whl.metadata (1.1 kB)
Collecting numpy<1.24,>=1.22 (from tensorflow==2.12)
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting tensorboard<2.13,>=2.12 (from tensorflow==2.12)
  Downloading tensorboard-2.12.3-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-estimator<2.13,>=2.12.0 (from tensorflow==2.12)
  Downloading tensorflow_estimator-2.12.0-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow==2.12)
  Downloading wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting ml_dtypes>=0.4

In [2]:
pip install autokeras --upgrade

Collecting autokeras
  Downloading autokeras-2.0.0-py3-none-any.whl.metadata (5.8 kB)
Collecting keras-tuner>=1.4.0 (from autokeras)
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner>=1.4.0->autokeras)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading autokeras-2.0.0-py3-none-any.whl (122 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.7/122.7 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner, autokeras
Successfully installed autokeras-2.0.0 keras-tuner-1.4.7 kt-legacy-1.0.5


In [None]:
# Sanity check: show which AutoKeras release the kernel actually imports.
import autokeras

installed_autokeras_version = autokeras.__version__
print(installed_autokeras_version)


2.0.0


In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
import autokeras as ak


# -------------------------------
# Split Data for Training and Testing
# -------------------------------
# Reload the feature subset that was written to Drive earlier in the notebook.
df_subset = pd.read_csv("/content/drive/MyDrive/BTP_3/df_subset.csv")
print(f"Subset data shape: {df_subset.shape}")

# Everything except the label column is a feature.
label_column = 'binary_label'
final_features = df_subset.columns.drop(label_column)
X, y = df_subset[final_features], df_subset[label_column]

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Training set: {X_train.shape[0]} rows; Testing set: {X_test.shape[0]} rows.")


from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Convert X to numeric; entries that cannot be parsed become NaN.
# Rows with NaN features are deliberately NOT dropped here: dropping them first
# would leave nothing for the imputer below to fill and would throw away samples.
X_train_numeric = X_train.apply(pd.to_numeric, errors='coerce')
X_test_numeric = X_test.apply(pd.to_numeric, errors='coerce')

# Rows without a label cannot be used for training or scoring, so only those are removed.
train_has_label = y_train.notna()
test_has_label = y_test.notna()
X_train_clean = X_train_numeric.loc[train_has_label]
X_test_clean = X_test_numeric.loc[test_has_label]

# Align y
y_train_clean = y_train.loc[train_has_label]
y_test_clean = y_test.loc[test_has_label]

# Impute missing feature values with the per-column mean learned on the training rows only.
# NOTE: with default settings SimpleImputer omits columns that are entirely NaN in the
# training rows; the same columns are omitted from the test matrix by transform().
imputer = SimpleImputer(strategy="mean")
X_train_imputed = imputer.fit_transform(X_train_clean)
X_test_imputed = imputer.transform(X_test_clean)

# Scale: statistics are fitted on the training rows and reused for the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)


# The scaler already returns NumPy arrays; only the labels need converting.
X_train_np = X_train_scaled
X_test_np = X_test_scaled
y_train_np = y_train_clean.to_numpy()
y_test_np = y_test_clean.to_numpy()

# -------------------------------
# Build IO pipeline for AutoKeras
# -------------------------------
# Generic input node. The data passed to fit() below is a NumPy array (the output of
# StandardScaler), not a DataFrame.
input_node = ak.Input()
# ak.ClassificationHead() creates the output block for a classification task.
output_node = ak.ClassificationHead()(input_node)

clf = ak.AutoModel(
    inputs=input_node,
    outputs=output_node,
    overwrite=True,   # start a fresh search instead of reloading an existing project
    max_trials=50,    # upper bound on the number of candidate models tried
    seed=42,          # fixed seed (same value as the split) so reruns are comparable
)



clf.fit(X_train_np, y_train_np, epochs=100)


# Evaluate the exported Keras model on the held-out rows.
model = clf.export_model()
y_pred_proba = model.predict(X_test_np)

# Class prediction
if y_pred_proba.ndim > 1 and y_pred_proba.shape[1] > 1:
    # One column per class: pick the most likely class, score AUC on column 1.
    y_pred = np.argmax(y_pred_proba, axis=1)
    auc_score = roc_auc_score(y_test_np, y_pred_proba[:, 1])
else:
    # Single-column output: flatten to 1-D so predictions and scores have the
    # same shape as y_test_np, then threshold at 0.5.
    positive_scores = np.ravel(y_pred_proba)
    y_pred = (positive_scores > 0.5).astype(int)
    auc_score = roc_auc_score(y_test_np, positive_scores)

# Accuracy
acc = accuracy_score(y_test_np, y_pred)

print(f"Test accuracy: {acc:.4f}")
print(f"ROC AUC score: {auc_score:.4f}")


# Export best model
model.save("autokeras_io_model.keras")
print("Saved model to autokeras_io_model.keras")


Trial 24 Complete [00h 00m 03s]
val_loss: 0.918127715587616

Best val_loss So Far: 0.636214554309845
Total elapsed time: 00h 02m 47s
Epoch 1/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 29ms/step - accuracy: 0.4707 - loss: 0.8668 
Epoch 2/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5167 - loss: 0.8757
Epoch 3/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.4811 - loss: 0.8672
Epoch 4/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.5271 - loss: 0.7882
Epoch 5/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.4686 - loss: 0.8861
Epoch 6/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.4100 - loss: 0.8238 
Epoch 7/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.4790 - loss: 0.8109 
Epoch 8/100
[1m2/2[0m [32m━━━━━━━━━

Expected: ['keras_tensor']
Received: inputs=Tensor(shape=(25, 1332))
