In [2]:
%pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp310-cp310-win_amd64.whl.metadata (4.6 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-win_amd64.whl.metadata (5.3 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Downloading termcolor-


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy.stats import pearsonr
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

# ============================================
# 1. LOAD AND SPLIT DATA
# ============================================
print("Loading data...")

# Dataset location. The default is the path this notebook was originally run with;
# set the AUTOPHARMA_DATA_PATH environment variable to point at the CSV on another
# machine instead of editing the code.
data_path = os.environ.get(
    "AUTOPHARMA_DATA_PATH",
    "C:\\Users\\vinay\\OneDrive\\Desktop\\hackathon\\autopharma\\dataset\\merged_data.csv",
)
merged_data = pd.read_csv(data_path)

# LN_IC50 is the regression target; TCGA_DESC drives the stratification of both splits
target = merged_data['LN_IC50']
features = merged_data.drop(columns=['LN_IC50'])
stratify_col = merged_data['TCGA_DESC']

print(f"Data shape: {merged_data.shape}")

# Split into training (60%) and temporary (40%)
# n_splits=1 yields exactly one (train, test) index pair, so take it with next()
# rather than looping over a one-element generator.
split1 = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
train_index, temp_index = next(split1.split(features, stratify_col))
X_train, X_temp = features.iloc[train_index], features.iloc[temp_index]
y_train, y_temp = target.iloc[train_index], target.iloc[temp_index]
stratify_temp = stratify_col.iloc[temp_index]

# Split temporary into validation (20%) and test (20%)
split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
val_index, test_index = next(split2.split(X_temp, stratify_temp))
X_val, X_test = X_temp.iloc[val_index], X_temp.iloc[test_index]
y_val, y_test = y_temp.iloc[val_index], y_temp.iloc[test_index]

# The three splits must partition the dataset exactly
assert len(X_train) + len(X_val) + len(X_test) == len(merged_data)

print(f"\nTrain: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

# ============================================
# 2. PREPROCESSING
# ============================================
print("\nPreprocessing...")

# Select numeric features only
# NOTE(review): every numeric column except LN_IC50 becomes a model input. If the CSV
# contains identifier columns or other read-outs derived from the same assay as
# LN_IC50, they would leak target information or act as meaningless continuous
# inputs — confirm the selected column list before trusting the metrics.
numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
if not numeric_cols:
    raise ValueError("No numeric feature columns found in the training split")
X_train_num = X_train[numeric_cols]
X_val_num = X_val[numeric_cols]
X_test_num = X_test[numeric_cols]

# Standardize: statistics are fitted on the training split only and then reused
# for validation and test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_num)
X_val_scaled = scaler.transform(X_val_num)
X_test_scaled = scaler.transform(X_test_num)

# StandardScaler keeps NaN values as NaN, which would silently turn the training
# loss into NaN. Fail here with a clear message instead.
for split_name, scaled_split in (
    ("train", X_train_scaled),
    ("validation", X_val_scaled),
    ("test", X_test_scaled),
):
    if np.isnan(scaled_split).any():
        raise ValueError(
            f"Missing values found in the {split_name} features; "
            "impute or drop them before training"
        )

print(f"Number of features: {X_train_scaled.shape[1]}")

# ============================================
# 3. BUILD ANN MODEL
# ============================================
print("\nBuilding ANN model...")

def build_ann(input_dim):
    """Assemble and compile the fully connected regression network.

    The stack is four ReLU hidden layers (256, 128, 64 and 32 units). The first
    three are each followed by batch normalization, and all four are followed by
    dropout (0.3, 0.3, 0.2, 0.2). A single-unit Dense layer with no activation
    argument produces the output.

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A ``keras.Sequential`` model compiled with Adam (learning rate 0.001),
        MSE loss and an MAE metric.
    """
    # (units, add batch normalization?, dropout rate) for each hidden block, in order
    hidden_spec = (
        (256, True, 0.3),
        (128, True, 0.3),
        (64, True, 0.2),
        (32, False, 0.2),
    )

    stack = [layers.Input(shape=(input_dim,))]
    for units, use_batch_norm, drop_rate in hidden_spec:
        stack.append(layers.Dense(units, activation='relu'))
        if use_batch_norm:
            stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(drop_rate))

    # Output layer
    stack.append(layers.Dense(1))

    net = keras.Sequential(stack)
    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae'],
    )
    return net

# Build model
# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so that
# weight initialisation, batch shuffling and dropout repeat across runs. The data
# split is already fixed with random_state=42; without this call the reported
# metrics still change on every run.
# NOTE(review): bit-exact results on GPU may additionally require
# tf.config.experimental.enable_op_determinism() — confirm if that matters here.
keras.utils.set_random_seed(42)
ann_model = build_ann(X_train_scaled.shape[1])
ann_model.summary()

# ============================================
# 4. TRAIN MODEL
# ============================================
print("\nTraining ANN...")

# Callbacks
# Stop once val_loss has not improved for 20 epochs and restore the best weights
early_stop = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=20,
    restore_best_weights=True,
    verbose=1
)

# Halve the learning rate after 10 epochs without val_loss improvement (floor 1e-5)
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=10,
    min_lr=0.00001,
    verbose=1
)

# Train
history = ann_model.fit(
    X_train_scaled, y_train,
    validation_data=(X_val_scaled, y_val),
    epochs=200,
    batch_size=32,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

# ============================================
# 5. EVALUATE MODEL
# ============================================
print("\n" + "="*50)
print("EVALUATION")
print("="*50)

def evaluate_ann(model, X, y, set_name):
    """Score ``model`` on one data split, print a report and return the metrics.

    Args:
        model: Object whose ``predict(X, verbose=0)`` result can be flattened to 1-D.
        X: Feature matrix passed to ``model.predict``.
        y: True target values for ``X``.
        set_name: Label used in the printed report header.

    Returns:
        dict with keys 'RMSE', 'MAE', 'R2' and 'Pearson'. The Pearson p-value is
        printed but not returned.
    """
    predictions = model.predict(X, verbose=0).flatten()

    scores = {
        'RMSE': np.sqrt(mean_squared_error(y, predictions)),
        'MAE': mean_absolute_error(y, predictions),
        'R2': r2_score(y, predictions),
    }
    scores['Pearson'], p_value = pearsonr(y, predictions)

    # Emit the whole report with a single print call
    report_lines = [
        f"\n{set_name} Metrics:",
        f"  RMSE:    {scores['RMSE']:.4f}",
        f"  MAE:     {scores['MAE']:.4f}",
        f"  R²:      {scores['R2']:.4f}",
        f"  Pearson: {scores['Pearson']:.4f} (p={p_value:.2e})",
    ]
    print("\n".join(report_lines))

    return scores

# Evaluate on all sets
# Score the splits in order: train, then validation, then test
train_metrics, val_metrics, test_metrics = [
    evaluate_ann(ann_model, split_X, split_y, split_label)
    for split_X, split_y, split_label in (
        (X_train_scaled, y_train, "Train"),
        (X_val_scaled, y_val, "Validation"),
        (X_test_scaled, y_test, "Test"),
    )
]

# ============================================
# 6. SAVE MODEL AND ARTIFACTS
# ============================================
print("\n" + "="*50)
print("SAVING MODEL")
print("="*50)

# Output directory. The default is the path this notebook was originally run with;
# set the AUTOPHARMA_MODEL_DIR environment variable to write the artifacts elsewhere
# without editing the code.
save_dir = os.environ.get(
    "AUTOPHARMA_MODEL_DIR",
    "C:\\Users\\vinay\\OneDrive\\Desktop\\hackathon\\autopharma\\trained_models\\ann_model",
)
os.makedirs(save_dir, exist_ok=True)

# Save Keras model
ann_model_path = os.path.join(save_dir, 'ann_model.keras')
ann_model.save(ann_model_path)
print(f"✅ ANN model saved: {ann_model_path}")

# Save as H5 (alternative format)
ann_h5_path = os.path.join(save_dir, 'ann_model.h5')
ann_model.save(ann_h5_path)
print(f"✅ ANN model (H5) saved: {ann_h5_path}")

# Save scaler
scaler_path = os.path.join(save_dir, 'ann_scaler.pkl')
with open(scaler_path, 'wb') as f:
    pickle.dump(scaler, f)
print(f"✅ Scaler saved: {scaler_path}")

# Save metadata
# Metric values and best_epoch are converted to builtin float/int so the pickled
# metadata holds plain Python numbers rather than NumPy scalar objects.
metadata = {
    'model_type': 'Artificial Neural Network',
    'architecture': {
        'layers': [256, 128, 64, 32, 1],
        'activation': 'relu',
        'dropout': [0.3, 0.3, 0.2, 0.2]
    },
    'feature_columns': numeric_cols,
    'n_features': len(numeric_cols),
    'training_samples': len(X_train),
    'val_samples': len(X_val),
    'test_samples': len(X_test),
    'metrics': {
        'train': {name: float(value) for name, value in train_metrics.items()},
        'validation': {name: float(value) for name, value in val_metrics.items()},
        'test': {name: float(value) for name, value in test_metrics.items()}
    },
    'training_date': pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S'),
    'epochs_trained': len(history.history['loss']),
    # 1-based index of the epoch with the lowest validation loss
    'best_epoch': int(np.argmin(history.history['val_loss'])) + 1
}

metadata_path = os.path.join(save_dir, 'ann_metadata.pkl')
with open(metadata_path, 'wb') as f:
    pickle.dump(metadata, f)
print(f"✅ Metadata saved: {metadata_path}")

# Save training history
history_path = os.path.join(save_dir, 'ann_training_history.pkl')
with open(history_path, 'wb') as f:
    pickle.dump(history.history, f)
print(f"✅ Training history saved: {history_path}")

# ============================================
# 7. RESULTS SUMMARY
# ============================================
print("\n" + "="*50, "FINAL RESULTS", "="*50, sep="\n")

# One row per split, columns in the order Set / RMSE / MAE / R² / Pearson
results_df = pd.DataFrame(
    [
        {
            'Set': split_label,
            'RMSE': split_scores['RMSE'],
            'MAE': split_scores['MAE'],
            'R²': split_scores['R2'],
            'Pearson': split_scores['Pearson'],
        }
        for split_label, split_scores in (
            ('Train', train_metrics),
            ('Validation', val_metrics),
            ('Test', test_metrics),
        )
    ]
)

print("\n", results_df.to_string(index=False))

print("\n" + "="*50, f"ALL FILES SAVED TO: {save_dir}", "="*50, sep="\n")
print(
    "\nFiles created:",
    "  - ann_model.keras (TensorFlow/Keras model)",
    "  - ann_model.h5 (H5 format)",
    "  - ann_scaler.pkl (Feature scaler)",
    "  - ann_metadata.pkl (Model metadata)",
    "  - ann_training_history.pkl (Training history)",
    sep="\n",
)

# ============================================
# 8. EXAMPLE: HOW TO LOAD AND USE
# ============================================
print("\n" + "="*50, "TO LOAD AND USE THE MODEL:", "="*50, sep="\n")
# The snippet below is printed verbatim as usage instructions; it is not executed here
print("""
import pickle
from tensorflow import keras

# Load model
model = keras.models.load_model('ann_model.keras')

# Load scaler
with open('ann_scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

# Load metadata
with open('ann_metadata.pkl', 'rb') as f:
    metadata = pickle.load(f)

# Make predictions
# X_new_scaled = scaler.transform(X_new)
# predictions = model.predict(X_new_scaled)
""")

Loading data...
Data shape: (166644, 32)

Train: 99986, Val: 33329, Test: 33329

Preprocessing...
Number of features: 5

Building ANN model...



Training ANN...
Epoch 1/200
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - loss: 3.4062 - mae: 1.3450 - val_loss: 2.4269 - val_mae: 1.1201 - learning_rate: 0.0010
Epoch 2/200
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - loss: 2.8439 - mae: 1.2086 - val_loss: 2.3262 - val_mae: 1.0300 - learning_rate: 0.0010
Epoch 3/200
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - loss: 2.7072 - mae: 1.1657 - val_loss: 2.3271 - val_mae: 1.0442 - learning_rate: 0.0010
Epoch 4/200
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - loss: 2.6307 - mae: 1.1436 - val_loss: 2.3814 - val_mae: 1.0849 - learning_rate: 0.0010
Epoch 5/200
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 4ms/step - loss: 2.5938 - mae: 1.1318 - val_loss: 2.3189 - val_mae: 1.0667 - learning_rate: 0.0010
Epoch 6/200
[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5m



✅ ANN model saved: C:\Users\vinay\OneDrive\Desktop\hackathon\autopharma\trained_models\ann_model\ann_model.keras
✅ ANN model (H5) saved: C:\Users\vinay\OneDrive\Desktop\hackathon\autopharma\trained_models\ann_model\ann_model.h5
✅ Scaler saved: C:\Users\vinay\OneDrive\Desktop\hackathon\autopharma\trained_models\ann_model\ann_scaler.pkl
✅ Metadata saved: C:\Users\vinay\OneDrive\Desktop\hackathon\autopharma\trained_models\ann_model\ann_metadata.pkl
✅ Training history saved: C:\Users\vinay\OneDrive\Desktop\hackathon\autopharma\trained_models\ann_model\ann_training_history.pkl

FINAL RESULTS

        Set     RMSE      MAE       R²  Pearson
     Train 1.339622 0.907632 0.774992 0.881035
Validation 1.355137 0.912879 0.767366 0.876514
      Test 1.368438 0.925978 0.768589 0.877299

ALL FILES SAVED TO: C:\Users\vinay\OneDrive\Desktop\hackathon\autopharma\trained_models\ann_model

Files created:
  - ann_model.keras (TensorFlow/Keras model)
  - ann_model.h5 (H5 format)
  - ann_scaler.pkl (Feature