In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Paclitaxel Dose Optimization - Model Training\n",
    "\n",
    "This notebook covers:\n",
    "1. Loading enhanced features from previous step\n",
    "2. Baseline model training\n",
    "3. Multiple algorithm comparison\n",
    "4. Model performance evaluation\n",
    "5. Feature importance analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split, cross_val_score\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
    "from sklearn.linear_model import LinearRegression\n",
    "from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error\n",
    "from sklearn.neural_network import MLPRegressor\n",
    "import xgboost as xgb\n",
    "import pickle\n",
    "import json\n",
    "import os\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "print('Libraries imported for model training!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the enhanced feature matrix and target written by the previous step\n",
    "X_enhanced = pd.read_csv('../data/processed/X_enhanced.csv')\n",
    "# squeeze() yields a Series when the target file holds a single column\n",
    "y = pd.read_csv('../data/processed/y_enhanced.csv').squeeze()\n",
    "\n",
    "# Load feature list, skipping blank lines (e.g. a trailing newline) so that\n",
    "# no empty feature name ends up in the list\n",
    "with open('../data/processed/enhanced_features.txt', 'r') as f:\n",
    "    enhanced_features = [line.strip() for line in f if line.strip()]\n",
    "\n",
    "# The list is used later to label and count model features, so report a\n",
    "# mismatch with the actual columns here instead of failing further down\n",
    "if list(X_enhanced.columns) != enhanced_features:\n",
    "    print('WARNING: enhanced_features.txt does not match the columns of X_enhanced.csv')\n",
    "\n",
    "print('Enhanced dataset loaded:')\n",
    "print(f'Features shape: {X_enhanced.shape}')\n",
    "print(f'Target shape: {y.shape}')\n",
    "print(f'Enhanced features: {enhanced_features}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Baseline Model with Original Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: a 100-tree random forest restricted to the two original features\n",
    "baseline_columns = ['log_dose', 'cell_line_encoded']\n",
    "X_simple = X_enhanced[baseline_columns]\n",
    "\n",
    "# Hold out 20% of the rows for evaluation\n",
    "baseline_split = train_test_split(X_simple, y, random_state=42, test_size=0.2)\n",
    "X_train_simple, X_test_simple, y_train, y_test = baseline_split\n",
    "\n",
    "# Fit on the training rows, predict the held-out rows\n",
    "baseline_model = RandomForestRegressor(random_state=42, n_estimators=100)\n",
    "baseline_model.fit(X_train_simple, y_train)\n",
    "y_pred_baseline = baseline_model.predict(X_test_simple)\n",
    "\n",
    "# Held-out metrics; RMSE is the square root of the mean squared error\n",
    "baseline_mse = mean_squared_error(y_test, y_pred_baseline)\n",
    "baseline_rmse = np.sqrt(baseline_mse)\n",
    "baseline_r2 = r2_score(y_test, y_pred_baseline)\n",
    "\n",
    "print('Baseline Model Performance (Original Features):')\n",
    "for report_line in (\n",
    "    f'   R² Score: {baseline_r2:.4f}',\n",
    "    f'   RMSE: {baseline_rmse:.4f}',\n",
    "    f'   Features used: {X_simple.columns.tolist()}',\n",
    "):\n",
    "    print(report_line)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Enhanced Model Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 20% of the enhanced feature matrix (same seed as the baseline split)\n",
    "enhanced_split = train_test_split(X_enhanced, y, random_state=42, test_size=0.2)\n",
    "X_train, X_test, y_train_enh, y_test_enh = enhanced_split\n",
    "\n",
    "# Standardise features for the neural network; the scaler learns its\n",
    "# statistics from the training rows only and is then applied to both splits\n",
    "scaler = StandardScaler()\n",
    "scaler.fit(X_train)\n",
    "X_train_scaled = scaler.transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "print('Enhanced dataset split:')\n",
    "for label, value in (\n",
    "    ('Training set', X_train.shape),\n",
    "    ('Test set', X_test.shape),\n",
    "    ('Features', len(enhanced_features)),\n",
    "):\n",
    "    print(f'{label}: {value}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Multi-Algorithm Comparison"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build each candidate estimator, then register it under its display name.\n",
    "# Insertion order of the dict is the order in which models are trained.\n",
    "linear_reg = LinearRegression()\n",
    "\n",
    "forest = RandomForestRegressor(\n",
    "    n_estimators=200,\n",
    "    max_depth=12,\n",
    "    min_samples_split=5,\n",
    "    min_samples_leaf=2,\n",
    "    random_state=42,\n",
    "    n_jobs=-1,\n",
    ")\n",
    "\n",
    "# Larger and deeper forest with looser split/leaf limits\n",
    "forest_enhanced = RandomForestRegressor(\n",
    "    n_estimators=500,\n",
    "    max_depth=15,\n",
    "    min_samples_split=3,\n",
    "    min_samples_leaf=1,\n",
    "    random_state=42,\n",
    "    n_jobs=-1,\n",
    ")\n",
    "\n",
    "grad_boost = GradientBoostingRegressor(\n",
    "    n_estimators=200,\n",
    "    learning_rate=0.1,\n",
    "    max_depth=6,\n",
    "    min_samples_split=4,\n",
    "    random_state=42,\n",
    ")\n",
    "\n",
    "xgb_reg = xgb.XGBRegressor(\n",
    "    n_estimators=200,\n",
    "    learning_rate=0.1,\n",
    "    max_depth=6,\n",
    "    min_child_weight=1,\n",
    "    random_state=42,\n",
    ")\n",
    "\n",
    "# Two hidden layers (100 and 50 units) with early stopping enabled\n",
    "neural_net = MLPRegressor(\n",
    "    hidden_layer_sizes=(100, 50),\n",
    "    activation='relu',\n",
    "    solver='adam',\n",
    "    max_iter=1000,\n",
    "    random_state=42,\n",
    "    early_stopping=True,\n",
    ")\n",
    "\n",
    "models = {\n",
    "    'Linear Regression': linear_reg,\n",
    "    'Random Forest': forest,\n",
    "    'Enhanced Random Forest': forest_enhanced,\n",
    "    'Gradient Boosting': grad_boost,\n",
    "    'XGBoost': xgb_reg,\n",
    "    'Neural Network': neural_net,\n",
    "}\n",
    "\n",
    "print(f'Training {len(models)} different algorithms...')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train and evaluate every candidate model on the enhanced features.\n",
    "# model_results maps model name -> metric dict; trained_models keeps the fitted estimators.\n",
    "model_results = {}\n",
    "trained_models = {}\n",
    "\n",
    "for name, model in models.items():\n",
    "    print(f'\\nTraining {name}...')\n",
    "\n",
    "    # Only the neural network is trained on the standardised matrices\n",
    "    if name == 'Neural Network':\n",
    "        X_fit, X_eval = X_train_scaled, X_test_scaled\n",
    "    else:\n",
    "        X_fit, X_eval = X_train, X_test\n",
    "\n",
    "    try:\n",
    "        # Train model\n",
    "        model.fit(X_fit, y_train_enh)\n",
    "        y_pred_train = model.predict(X_fit)\n",
    "        y_pred_test = model.predict(X_eval)\n",
    "\n",
    "        # Calculate metrics\n",
    "        train_r2 = r2_score(y_train_enh, y_pred_train)\n",
    "        test_r2 = r2_score(y_test_enh, y_pred_test)\n",
    "        train_rmse = np.sqrt(mean_squared_error(y_train_enh, y_pred_train))\n",
    "        test_rmse = np.sqrt(mean_squared_error(y_test_enh, y_pred_test))\n",
    "\n",
    "        # Cross-validation on the training split. A failure is reported and\n",
    "        # recorded as NaN rather than silently passing the test R² off as a CV score.\n",
    "        # NOTE(review): for the neural network the scaler was fitted on the whole\n",
    "        # training split, so its CV folds share scaling statistics.\n",
    "        try:\n",
    "            cv_scores = cross_val_score(model, X_fit, y_train_enh, cv=3, scoring='r2')\n",
    "        except Exception as cv_error:\n",
    "            print(f'   Cross-validation failed for {name}: {cv_error}')\n",
    "            cv_scores = np.array([np.nan])\n",
    "\n",
    "        model_results[name] = {\n",
    "            'Train R²': train_r2,\n",
    "            'Test R²': test_r2,\n",
    "            'Train RMSE': train_rmse,\n",
    "            'Test RMSE': test_rmse,\n",
    "            'CV R² Mean': cv_scores.mean(),\n",
    "            'CV R² Std': cv_scores.std()\n",
    "        }\n",
    "\n",
    "        trained_models[name] = model\n",
    "\n",
    "        # Relative change versus the baseline. Dividing by |baseline| keeps the\n",
    "        # sign meaningful when the baseline R² is negative; with a baseline of\n",
    "        # exactly 0 a percentage is undefined, so none is printed.\n",
    "        if baseline_r2 != 0:\n",
    "            improvement = (test_r2 - baseline_r2) / abs(baseline_r2) * 100\n",
    "            print(f'{name} - Test R²: {test_r2:.4f}, Improvement: {improvement:+.1f}%')\n",
    "        else:\n",
    "            print(f'{name} - Test R²: {test_r2:.4f}, Improvement: n/a (baseline R² is 0)')\n",
    "\n",
    "    except Exception as e:\n",
    "        print(f'{name} failed: {e}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display comprehensive results (one row per trained model)\n",
    "results_df = pd.DataFrame(model_results).T\n",
    "\n",
    "# Baseline row for comparison. Train and CV figures are computed from the\n",
    "# baseline model itself instead of repeating its test metrics in every column.\n",
    "y_fit_baseline = baseline_model.predict(X_train_simple)\n",
    "baseline_cv = cross_val_score(baseline_model, X_train_simple, y_train, cv=3, scoring='r2')\n",
    "baseline_row = pd.DataFrame({\n",
    "    'Train R²': [r2_score(y_train, y_fit_baseline)],\n",
    "    'Test R²': [baseline_r2],\n",
    "    'Train RMSE': [np.sqrt(mean_squared_error(y_train, y_fit_baseline))],\n",
    "    'Test RMSE': [baseline_rmse],\n",
    "    'CV R² Mean': [baseline_cv.mean()],\n",
    "    'CV R² Std': [baseline_cv.std()]\n",
    "}, index=['Baseline (Original)'])\n",
    "\n",
    "comparison_df = pd.concat([baseline_row, results_df])\n",
    "\n",
    "print('\\nComplete Model Comparison:')\n",
    "print(comparison_df.round(4))\n",
    "\n",
    "# Find best model\n",
    "if len(trained_models) > 0:\n",
    "    # NOTE(review): ranking on test R² reuses the test split for model selection,\n",
    "    # so best_r2 is an optimistic estimate of the chosen model's performance.\n",
    "    best_model_name = results_df['Test R²'].idxmax()\n",
    "    best_model = trained_models[best_model_name]\n",
    "    best_r2 = results_df.loc[best_model_name, 'Test R²']\n",
    "\n",
    "    print(f'\\nBest Model: {best_model_name}')\n",
    "    print(f'Test R²: {best_r2:.4f}')\n",
    "\n",
    "    # Percentage is relative to |baseline| so its sign stays meaningful for a\n",
    "    # negative baseline; it is omitted when the baseline R² is exactly 0.\n",
    "    r2_gain = best_r2 - baseline_r2\n",
    "    if baseline_r2 != 0:\n",
    "        print(f'Improvement over baseline: {r2_gain:.4f} ({r2_gain / abs(baseline_r2) * 100:.1f}%)')\n",
    "    else:\n",
    "        print(f'Improvement over baseline: {r2_gain:.4f}')\n",
    "else:\n",
    "    print('No models were successfully trained')\n",
    "    best_model_name = 'None'\n",
    "    best_model = None\n",
    "    best_r2 = 0"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Performance Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Four-panel comparison figure built from comparison_df and results_df\n",
    "fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n",
    "ax_r2, ax_rmse, ax_fit, ax_cv = axes.ravel()\n",
    "\n",
    "# Top-left: test R² per model, baseline bar in red plus a dashed reference line\n",
    "r2_scores = comparison_df['Test R²']\n",
    "r2_ticks = range(len(r2_scores))\n",
    "colors = ['red' if 'Baseline' in label else 'skyblue' for label in r2_scores.index]\n",
    "ax_r2.bar(r2_ticks, r2_scores.values, color=colors)\n",
    "ax_r2.set_title('Model R² Score Comparison')\n",
    "ax_r2.set_ylabel('R² Score')\n",
    "ax_r2.set_xticks(r2_ticks)\n",
    "ax_r2.set_xticklabels(r2_scores.index, rotation=45, ha='right')\n",
    "ax_r2.axhline(y=baseline_r2, color='red', linestyle='--', label='Baseline')\n",
    "ax_r2.legend()\n",
    "\n",
    "# Top-right: test RMSE per model, same colour coding\n",
    "rmse_scores = comparison_df['Test RMSE']\n",
    "rmse_ticks = range(len(rmse_scores))\n",
    "ax_rmse.bar(rmse_ticks, rmse_scores.values, color=colors)\n",
    "ax_rmse.set_title('Model RMSE Comparison')\n",
    "ax_rmse.set_ylabel('RMSE')\n",
    "ax_rmse.set_xticks(rmse_ticks)\n",
    "ax_rmse.set_xticklabels(rmse_scores.index, rotation=45, ha='right')\n",
    "\n",
    "# The bottom row excludes the baseline and needs at least one trained model\n",
    "if len(results_df) > 0:\n",
    "    # Bottom-left: one point per model, training R² against test R²\n",
    "    for model_name, scores in results_df.iterrows():\n",
    "        ax_fit.scatter(scores['Train R²'], scores['Test R²'], s=100, label=model_name, alpha=0.7)\n",
    "\n",
    "    # Dashed diagonal marks equal train and test R²\n",
    "    train_col = results_df['Train R²']\n",
    "    test_col = results_df['Test R²']\n",
    "    min_r2 = min(train_col.min(), test_col.min())\n",
    "    max_r2 = max(train_col.max(), test_col.max())\n",
    "    diagonal = [min_r2, max_r2]\n",
    "    ax_fit.plot(diagonal, diagonal, 'k--', alpha=0.5)\n",
    "    ax_fit.set_title('Training vs Test R² (Overfitting Check)')\n",
    "    ax_fit.set_xlabel('Training R²')\n",
    "    ax_fit.set_ylabel('Test R²')\n",
    "    ax_fit.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n",
    "\n",
    "    # Bottom-right: mean CV R² with the fold standard deviation as error bars\n",
    "    cv_means = results_df['CV R² Mean']\n",
    "    cv_stds = results_df['CV R² Std']\n",
    "    cv_ticks = range(len(cv_means))\n",
    "    ax_cv.bar(cv_ticks, cv_means.values, yerr=cv_stds.values, capsize=5, alpha=0.7)\n",
    "    ax_cv.set_title('Cross-Validation R² Scores')\n",
    "    ax_cv.set_ylabel('CV R² Score')\n",
    "    ax_cv.set_xticks(cv_ticks)\n",
    "    ax_cv.set_xticklabels(cv_means.index, rotation=45, ha='right')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Importance Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature importance for tree-based models (estimators exposing feature_importances_)\n",
    "feature_importance = None\n",
    "TOP_N = 12  # maximum number of features shown in the chart\n",
    "\n",
    "if best_model is not None and hasattr(best_model, 'feature_importances_'):\n",
    "    # Importances follow the column order of the matrix the model was fitted on,\n",
    "    # so label them with X_train's columns rather than the separately loaded\n",
    "    # feature list, which could differ in length or order.\n",
    "    feature_importance = pd.DataFrame({\n",
    "        'feature': list(X_train.columns),\n",
    "        'importance': best_model.feature_importances_\n",
    "    }).sort_values('importance', ascending=False)\n",
    "\n",
    "    print(f'Feature Importance Analysis ({best_model_name}):')\n",
    "    print(feature_importance)\n",
    "\n",
    "    # Plot feature importance; the title reports how many features are actually shown\n",
    "    plt.figure(figsize=(12, 8))\n",
    "    top_features = feature_importance.head(TOP_N)\n",
    "    plt.barh(range(len(top_features)), top_features['importance'])\n",
    "    plt.yticks(range(len(top_features)), top_features['feature'])\n",
    "    plt.xlabel('Feature Importance')\n",
    "    plt.title(f'Top {len(top_features)} Feature Importances ({best_model_name})')\n",
    "    # Most important feature at the top of the chart\n",
    "    plt.gca().invert_yaxis()\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "else:\n",
    "    print(f'Feature importance not available for {best_model_name}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prediction Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Diagnostic plots for the selected model on the held-out rows\n",
    "if best_model is None:\n",
    "    print('No valid model for prediction analysis')\n",
    "else:\n",
    "    # The neural network was trained on standardised inputs, so predict on those\n",
    "    test_inputs = X_test_scaled if best_model_name == 'Neural Network' else X_test\n",
    "    y_pred_best = best_model.predict(test_inputs)\n",
    "\n",
    "    # Predicted against actual, with the identity line as reference\n",
    "    actual_range = [y_test_enh.min(), y_test_enh.max()]\n",
    "    fig_pred, ax_pred = plt.subplots(figsize=(10, 8))\n",
    "    ax_pred.scatter(y_test_enh, y_pred_best, alpha=0.6, s=20)\n",
    "    ax_pred.plot(actual_range, actual_range, 'r--', lw=2)\n",
    "    ax_pred.set_xlabel('Actual Viability')\n",
    "    ax_pred.set_ylabel('Predicted Viability')\n",
    "    ax_pred.set_title(f'Prediction vs Actual ({best_model_name})\\nR² = {best_r2:.4f}')\n",
    "    ax_pred.grid(True, alpha=0.3)\n",
    "    plt.show()\n",
    "\n",
    "    # Residuals (actual minus predicted) against the prediction\n",
    "    residuals = y_test_enh - y_pred_best\n",
    "    fig_resid, ax_resid = plt.subplots(figsize=(10, 6))\n",
    "    ax_resid.scatter(y_pred_best, residuals, alpha=0.6, s=20)\n",
    "    ax_resid.axhline(y=0, color='r', linestyle='--')\n",
    "    ax_resid.set_xlabel('Predicted Viability')\n",
    "    ax_resid.set_ylabel('Residuals')\n",
    "    ax_resid.set_title(f'Residuals Plot ({best_model_name})')\n",
    "    ax_resid.grid(True, alpha=0.3)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save Best Model and Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create output directories\n",
    "os.makedirs('../models', exist_ok=True)\n",
    "os.makedirs('../data/processed', exist_ok=True)\n",
    "\n",
    "if best_model is not None:\n",
    "    # Absolute and relative gain over the baseline. The percentage is taken\n",
    "    # relative to |baseline| so its sign stays meaningful for a negative baseline;\n",
    "    # with a baseline of exactly 0 it is undefined and stored as None (JSON null)\n",
    "    # instead of raising or writing a non-standard Infinity value.\n",
    "    r2_gain = float(best_r2 - baseline_r2)\n",
    "    if baseline_r2 != 0:\n",
    "        r2_gain_percent = r2_gain / abs(float(baseline_r2)) * 100\n",
    "        r2_gain_percent_text = f'{r2_gain_percent:.1f}%'\n",
    "    else:\n",
    "        r2_gain_percent = None\n",
    "        r2_gain_percent_text = 'n/a'\n",
    "\n",
    "    # Save best model; the file name is derived from the model's display name\n",
    "    model_filename = best_model_name.lower().replace(' ', '_') + '_model.pkl'\n",
    "    with open(f'../models/{model_filename}', 'wb') as f:\n",
    "        pickle.dump(best_model, f)\n",
    "\n",
    "    # Save scaler (fitted on the training split)\n",
    "    with open('../models/scaler.pkl', 'wb') as f:\n",
    "        pickle.dump(scaler, f)\n",
    "\n",
    "    # Save results\n",
    "    comparison_df.to_csv('../data/processed/model_comparison_results.csv')\n",
    "\n",
    "    if feature_importance is not None:\n",
    "        feature_importance.to_csv('../data/processed/feature_importance.csv', index=False)\n",
    "\n",
    "    # Save model info\n",
    "    model_info = {\n",
    "        'best_model': best_model_name,\n",
    "        'best_r2': float(best_r2),\n",
    "        'baseline_r2': float(baseline_r2),\n",
    "        'improvement': r2_gain,\n",
    "        'improvement_percent': r2_gain_percent,\n",
    "        'training_samples': len(X_train),\n",
    "        'test_samples': len(X_test),\n",
    "        'enhanced_features': enhanced_features\n",
    "    }\n",
    "\n",
    "    with open('../data/processed/model_info.json', 'w') as f:\n",
    "        json.dump(model_info, f, indent=2)\n",
    "\n",
    "    print('Best model and results saved!')\n",
    "    print('Files saved:')\n",
    "    print(f'  - {model_filename} (best model)')\n",
    "    print('  - scaler.pkl (feature scaler)')\n",
    "    print('  - model_comparison_results.csv (all model results)')\n",
    "    if feature_importance is not None:\n",
    "        print('  - feature_importance.csv (feature importance)')\n",
    "    print('  - model_info.json (model metadata)')\n",
    "\n",
    "    print('\\nTRAINING SUMMARY:')\n",
    "    print(f'   Best Model: {best_model_name}')\n",
    "    print(f'   Baseline R²: {baseline_r2:.4f}')\n",
    "    print(f'   Best R²: {best_r2:.4f}')\n",
    "    # Signed format avoids printing '+-0.0123' when the best model is below the baseline\n",
    "    print(f'   Improvement: {r2_gain:+.4f} ({r2_gain_percent_text})')\n",
    "    print(f'   Features: {len(enhanced_features)} enhanced features')\n",
    "else:\n",
    "    print('No valid model to save')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}