In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Paclitaxel Dose Optimization - Model Improvement\n",
    "\n",
    "This notebook covers:\n",
    "1. Loading best model from previous step\n",
    "2. Hyperparameter optimization\n",
    "3. Ensemble methods\n",
    "4. Advanced model techniques\n",
    "5. Final model selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, BaggingRegressor\n",
    "from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error\n",
    "from sklearn.neural_network import MLPRegressor\n",
    "import xgboost as xgb\n",
    "import pickle\n",
    "import json\n",
    "import os\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "print('Libraries imported for model improvement!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the engineered feature matrix and the target column from the processed-data folder\n",
    "X_enhanced = pd.read_csv('../data/processed/X_enhanced.csv')\n",
    "y = pd.read_csv('../data/processed/y_enhanced.csv').squeeze()\n",
    "\n",
    "# Metrics stored in model_info.json (presumably written by the previous notebook)\n",
    "with open('../data/processed/model_info.json', 'r') as f:\n",
    "    previous_results = json.load(f)\n",
    "\n",
    "# Unpack the three reference values that later cells compare against\n",
    "baseline_r2, best_previous_r2, best_previous_model = (\n",
    "    previous_results[key] for key in ('baseline_r2', 'best_r2', 'best_model')\n",
    ")\n",
    "\n",
    "# Emit the four summary lines with a single call\n",
    "print(\n",
    "    'Previous Results:',\n",
    "    f'Baseline R²: {baseline_r2:.4f}',\n",
    "    f'Best Previous R²: {best_previous_r2:.4f} ({best_previous_model})',\n",
    "    f'Dataset shape: {X_enhanced.shape}',\n",
    "    sep='\\n'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible\n",
    "X_train, X_test, y_train, y_test = train_test_split(X_enhanced, y, test_size=0.2, random_state=42)\n",
    "\n",
    "# Standardise features: statistics are learned from the training rows only,\n",
    "# then applied unchanged to both partitions\n",
    "scaler = StandardScaler().fit(X_train)\n",
    "X_train_scaled = scaler.transform(X_train)\n",
    "X_test_scaled = scaler.transform(X_test)\n",
    "\n",
    "print(f'Data split completed: {X_train.shape} train, {X_test.shape} test')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hyperparameter Optimization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random Forest: randomized search over tree count, depth and split/leaf sizes\n",
    "print('Optimizing Random Forest...')\n",
    "\n",
    "# Candidate values the search samples from\n",
    "rf_param_grid = dict(\n",
    "    n_estimators=[300, 500, 800],\n",
    "    max_depth=[10, 15, 20, None],\n",
    "    min_samples_split=[2, 3, 5],\n",
    "    min_samples_leaf=[1, 2, 3],\n",
    ")\n",
    "\n",
    "# 20 sampled configurations, each scored by 3-fold cross-validated R²\n",
    "rf_grid = RandomizedSearchCV(\n",
    "    estimator=RandomForestRegressor(random_state=42, n_jobs=-1),\n",
    "    param_distributions=rf_param_grid,\n",
    "    n_iter=20, cv=3, scoring='r2',\n",
    "    random_state=42, n_jobs=-1,\n",
    ")\n",
    "rf_grid.fit(X_train, y_train)\n",
    "\n",
    "# Score the refit winner on the held-out rows\n",
    "best_rf = rf_grid.best_estimator_\n",
    "rf_test_pred = best_rf.predict(X_test)\n",
    "rf_score = r2_score(y_test, rf_test_pred)\n",
    "\n",
    "print(f'Best RF parameters: {rf_grid.best_params_}')\n",
    "print(f'Optimized RF R²: {rf_score:.4f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# XGBoost: randomized search over boosting rounds, step size, tree shape and row sampling\n",
    "print('\\nOptimizing XGBoost...')\n",
    "\n",
    "# Candidate values the search samples from\n",
    "xgb_param_grid = dict(\n",
    "    n_estimators=[200, 300, 500],\n",
    "    learning_rate=[0.05, 0.1, 0.15],\n",
    "    max_depth=[4, 6, 8],\n",
    "    min_child_weight=[1, 3, 5],\n",
    "    subsample=[0.8, 0.9, 1.0],\n",
    ")\n",
    "\n",
    "# 15 sampled configurations, each scored by 3-fold cross-validated R²\n",
    "xgb_grid = RandomizedSearchCV(\n",
    "    estimator=xgb.XGBRegressor(random_state=42),\n",
    "    param_distributions=xgb_param_grid,\n",
    "    n_iter=15, cv=3, scoring='r2',\n",
    "    random_state=42,\n",
    ")\n",
    "xgb_grid.fit(X_train, y_train)\n",
    "\n",
    "# Score the refit winner on the held-out rows\n",
    "best_xgb = xgb_grid.best_estimator_\n",
    "xgb_test_pred = best_xgb.predict(X_test)\n",
    "xgb_score = r2_score(y_test, xgb_test_pred)\n",
    "\n",
    "print(f'Best XGBoost parameters: {xgb_grid.best_params_}')\n",
    "print(f'Optimized XGBoost R²: {xgb_score:.4f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gradient Boosting: randomized search over stage count, step size, depth and sampling\n",
    "print('\\nOptimizing Gradient Boosting...')\n",
    "\n",
    "# Candidate values the search samples from\n",
    "gb_param_grid = dict(\n",
    "    n_estimators=[200, 300, 400],\n",
    "    learning_rate=[0.05, 0.1, 0.15],\n",
    "    max_depth=[4, 6, 8],\n",
    "    min_samples_split=[2, 4, 6],\n",
    "    subsample=[0.8, 0.9, 1.0],\n",
    ")\n",
    "\n",
    "# 15 sampled configurations, each scored by 3-fold cross-validated R²\n",
    "gb_grid = RandomizedSearchCV(\n",
    "    estimator=GradientBoostingRegressor(random_state=42),\n",
    "    param_distributions=gb_param_grid,\n",
    "    n_iter=15, cv=3, scoring='r2',\n",
    "    random_state=42,\n",
    ")\n",
    "gb_grid.fit(X_train, y_train)\n",
    "\n",
    "# Score the refit winner on the held-out rows\n",
    "best_gb = gb_grid.best_estimator_\n",
    "gb_test_pred = best_gb.predict(X_test)\n",
    "gb_score = r2_score(y_test, gb_test_pred)\n",
    "\n",
    "print(f'Best GB parameters: {gb_grid.best_params_}')\n",
    "print(f'Optimized GB R²: {gb_score:.4f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Three-hidden-layer MLP trained on the standardised features\n",
    "print('\\nTraining Advanced Neural Network...')\n",
    "\n",
    "# Architecture, optimiser and regularisation settings gathered in one place\n",
    "nn_settings = dict(\n",
    "    hidden_layer_sizes=(200, 100, 50),\n",
    "    activation='relu',\n",
    "    solver='adam',\n",
    "    max_iter=2000,\n",
    "    early_stopping=True,       # stop once the internal validation score stalls\n",
    "    validation_fraction=0.1,   # share of the training rows held back for that check\n",
    "    learning_rate_init=0.001,\n",
    "    alpha=0.01,\n",
    "    random_state=42,\n",
    ")\n",
    "advanced_nn = MLPRegressor(**nn_settings)\n",
    "\n",
    "# Fit and evaluate on the scaled matrices (the tree models use the raw ones)\n",
    "advanced_nn.fit(X_train_scaled, y_train)\n",
    "nn_test_pred = advanced_nn.predict(X_test_scaled)\n",
    "nn_score = r2_score(y_test, nn_test_pred)\n",
    "\n",
    "print(f'Advanced Neural Network R²: {nn_score:.4f}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ensemble Methods"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather each tuned model together with its hold-out R² as (model, score) pairs\n",
    "optimized_models = {\n",
    "    'Optimized Random Forest': (best_rf, rf_score),\n",
    "    'Optimized XGBoost': (best_xgb, xgb_score),\n",
    "    'Optimized Gradient Boosting': (best_gb, gb_score),\n",
    "    'Advanced Neural Network': (advanced_nn, nn_score)\n",
    "}\n",
    "\n",
    "print('Optimized Model Performance:')\n",
    "for name, (model, score) in optimized_models.items():\n",
    "    # Relative change versus the baseline R², in percent\n",
    "    improvement = (score - baseline_r2) / baseline_r2 * 100\n",
    "    # The '+' format flag prints the real sign, so a model that is worse than\n",
    "    # the baseline shows as '-x.x%' instead of the misleading '+-x.x%'\n",
    "    print(f'{name}: R² = {score:.4f} ({improvement:+.1f}%)')\n",
    "\n",
    "# Rank by hold-out R² (best first) and keep the top three as ensemble candidates\n",
    "sorted_models = sorted(optimized_models.items(), key=lambda x: x[1][1], reverse=True)\n",
    "top_3_models = sorted_models[:3]\n",
    "\n",
    "print('\\nTop 3 models for ensemble:')\n",
    "for name, (model, score) in top_3_models:\n",
    "    print(f'- {name}: {score:.4f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average the predictions of the best-ranked models that share the raw feature matrix\n",
    "print('\\nCreating Voting Ensemble...')\n",
    "\n",
    "# The neural network was trained on scaled inputs, so it is left out of this ensemble;\n",
    "# the remaining names are turned into lowercase, underscore-separated estimator labels\n",
    "ensemble_models = [\n",
    "    (name.replace(' ', '_').lower(), model)\n",
    "    for name, (model, score) in top_3_models\n",
    "    if 'Neural Network' not in name\n",
    "]\n",
    "\n",
    "# Defaults used when no ensemble can be built\n",
    "voting_ensemble, voting_score = None, 0\n",
    "\n",
    "if len(ensemble_models) >= 2:\n",
    "    voting_ensemble = VotingRegressor(ensemble_models)\n",
    "    voting_ensemble.fit(X_train, y_train)\n",
    "    voting_pred = voting_ensemble.predict(X_test)\n",
    "    voting_score = r2_score(y_test, voting_pred)\n",
    "\n",
    "    print(f'Voting Ensemble R²: {voting_score:.4f}')\n",
    "    print(f'Improvement over baseline: {(voting_score - baseline_r2)/baseline_r2*100:.1f}%')\n",
    "else:\n",
    "    print('Not enough compatible models for voting ensemble')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag the single best tuned model: 10 copies fitted on bootstrap resamples\n",
    "print('\\nCreating Bagging Ensemble...')\n",
    "\n",
    "# Pick the entry with the highest hold-out R²\n",
    "best_single_model = max(optimized_models.items(), key=lambda x: x[1][1])\n",
    "best_name, (best_model, best_score) = best_single_model\n",
    "\n",
    "# Defaults used when the ensemble is skipped\n",
    "bagging_ensemble = None\n",
    "bagging_score = 0\n",
    "\n",
    "# The neural network expects scaled inputs, so it is not bagged on the raw features\n",
    "if 'Neural Network' not in best_name:\n",
    "    # The wrapped estimator is passed positionally on purpose: scikit-learn renamed\n",
    "    # the keyword from base_estimator to estimator in 1.2 and removed the old name\n",
    "    # in 1.4, so the positional form works on both old and new releases\n",
    "    bagging_ensemble = BaggingRegressor(\n",
    "        best_model,\n",
    "        n_estimators=10,\n",
    "        random_state=42,\n",
    "        n_jobs=-1\n",
    "    )\n",
    "\n",
    "    bagging_ensemble.fit(X_train, y_train)\n",
    "    bagging_score = r2_score(y_test, bagging_ensemble.predict(X_test))\n",
    "\n",
    "    print(f'Bagging Ensemble R²: {bagging_score:.4f}')\n",
    "    print(f'Based on: {best_name}')\n",
    "else:\n",
    "    print('Cannot create bagging ensemble with neural network')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Final Model Selection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare every candidate (tuned single models plus any ensembles) and pick the winner.\n",
    "# NOTE(review): each r2_score below was measured on the same test split that is used\n",
    "# here to choose the best model, so the reported final R² is optimistically biased;\n",
    "# consider selecting on cross-validation scores and keeping the test split for reporting.\n",
    "all_results = {}\n",
    "\n",
    "# Tuned individual models\n",
    "for name, (model, score) in optimized_models.items():\n",
    "    all_results[name] = {\n",
    "        'model': model,\n",
    "        'r2_score': score,\n",
    "        'improvement': (score - baseline_r2) / baseline_r2 * 100\n",
    "    }\n",
    "\n",
    "# Ensembles are included only when the earlier cells actually built them\n",
    "if voting_ensemble is not None:\n",
    "    all_results['Voting Ensemble'] = {\n",
    "        'model': voting_ensemble,\n",
    "        'r2_score': voting_score,\n",
    "        'improvement': (voting_score - baseline_r2) / baseline_r2 * 100\n",
    "    }\n",
    "\n",
    "if bagging_ensemble is not None:\n",
    "    all_results['Bagging Ensemble'] = {\n",
    "        'model': bagging_ensemble,\n",
    "        'r2_score': bagging_score,\n",
    "        'improvement': (bagging_score - baseline_r2) / baseline_r2 * 100\n",
    "    }\n",
    "\n",
    "# Winner = highest hold-out R²; the fallback values keep later cells runnable\n",
    "# if the comparison table is ever empty\n",
    "if all_results:\n",
    "    final_best = max(all_results.items(), key=lambda x: x[1]['r2_score'])\n",
    "    final_model_name, final_model_info = final_best\n",
    "    final_model = final_model_info['model']\n",
    "    final_r2 = final_model_info['r2_score']\n",
    "else:\n",
    "    final_model_name = 'None'\n",
    "    final_model = None\n",
    "    final_r2 = 0\n",
    "\n",
    "print('\\nFINAL MODEL COMPARISON:')\n",
    "print('-' * 60)\n",
    "for name, info in sorted(all_results.items(), key=lambda x: x[1]['r2_score'], reverse=True):\n",
    "    # The '+' format flag prints the real sign, so a model below the baseline\n",
    "    # shows as '-x.x%' rather than '+-x.x%'; width 6 keeps the column aligned\n",
    "    print(f'{name:25} | R²: {info[\"r2_score\"]:.4f} | {info[\"improvement\"]:+6.1f}%')\n",
    "\n",
    "print('-' * 60)\n",
    "print(f'FINAL BEST MODEL: {final_model_name}')\n",
    "print(f'Final R²: {final_r2:.4f}')\n",
    "print(f'Total improvement over baseline: {(final_r2 - baseline_r2):.4f} ({(final_r2 - baseline_r2)/baseline_r2*100:.1f}%)')\n",
    "print(f'Improvement over previous best: {(final_r2 - best_previous_r2):.4f} ({(final_r2 - best_previous_r2)/best_previous_r2*100:.1f}%)')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Performance Visualization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Four-panel summary: R² per model, gain over baseline, predicted-vs-actual, residuals\n",
    "if all_results:\n",
    "    fig, axes = plt.subplots(2, 2, figsize=(16, 12))\n",
    "    (ax_scores, ax_gain), (ax_fit, ax_resid) = axes\n",
    "\n",
    "    # Series shared by the two bar panels; the selected model is drawn in gold\n",
    "    model_names = list(all_results)\n",
    "    r2_scores = [entry['r2_score'] for entry in all_results.values()]\n",
    "    improvements = [entry['improvement'] for entry in all_results.values()]\n",
    "    colors = ['skyblue' if label != final_model_name else 'gold' for label in model_names]\n",
    "    tick_positions = range(len(model_names))\n",
    "\n",
    "    # Top-left: R² per model with the baseline and previous-best reference lines\n",
    "    ax_scores.bar(tick_positions, r2_scores, color=colors)\n",
    "    ax_scores.set_title('Final Model Performance Comparison')\n",
    "    ax_scores.set_ylabel('R² Score')\n",
    "    ax_scores.axhline(y=baseline_r2, color='red', linestyle='--', label='Baseline')\n",
    "    ax_scores.axhline(y=best_previous_r2, color='orange', linestyle='--', label='Previous Best')\n",
    "    ax_scores.legend()\n",
    "\n",
    "    # Top-right: percentage gain over the baseline\n",
    "    ax_gain.bar(tick_positions, improvements, color=colors)\n",
    "    ax_gain.set_title('Improvement over Baseline (%)')\n",
    "    ax_gain.set_ylabel('Improvement (%)')\n",
    "\n",
    "    # Both bar panels carry the same rotated model-name tick labels\n",
    "    for bar_ax in (ax_scores, ax_gain):\n",
    "        bar_ax.set_xticks(tick_positions)\n",
    "        bar_ax.set_xticklabels(model_names, rotation=45, ha='right')\n",
    "\n",
    "    if final_model is not None:\n",
    "        # The neural network was fit on scaled inputs; every other model on raw features\n",
    "        test_inputs = X_test_scaled if 'Neural Network' in final_model_name else X_test\n",
    "        y_pred_final = final_model.predict(test_inputs)\n",
    "\n",
    "        # Bottom-left: predictions against truth, with the identity line for reference\n",
    "        lo, hi = y_test.min(), y_test.max()\n",
    "        ax_fit.scatter(y_test, y_pred_final, alpha=0.6, s=20, color='green')\n",
    "        ax_fit.plot([lo, hi], [lo, hi], 'r--', lw=2)\n",
    "        ax_fit.set_xlabel('Actual Viability')\n",
    "        ax_fit.set_ylabel('Predicted Viability')\n",
    "        ax_fit.set_title(f'Final Model: {final_model_name}\\nR² = {final_r2:.4f}')\n",
    "        ax_fit.grid(True, alpha=0.3)\n",
    "\n",
    "        # Bottom-right: residuals (actual minus predicted) against the prediction\n",
    "        residuals_final = y_test - y_pred_final\n",
    "        ax_resid.scatter(y_pred_final, residuals_final, alpha=0.6, s=20, color='purple')\n",
    "        ax_resid.axhline(y=0, color='r', linestyle='--')\n",
    "        ax_resid.set_xlabel('Predicted Viability')\n",
    "        ax_resid.set_ylabel('Residuals')\n",
    "        ax_resid.set_title('Final Model Residuals')\n",
    "        ax_resid.grid(True, alpha=0.3)\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "else:\n",
    "    print('No results to visualize')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cross-Validation Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 5-fold cross-validation of the selected model on the training partition\n",
    "cv_scores = None\n",
    "\n",
    "if final_model is None:\n",
    "    print('No final model available for cross-validation')\n",
    "    cv_scores = np.array([0])\n",
    "else:\n",
    "    print('Performing cross-validation on final model...')\n",
    "\n",
    "    # The neural network is validated on the scaled matrix, all other models on raw features.\n",
    "    # NOTE(review): the scaler was fitted on the whole training set, so the scaled\n",
    "    # folds are not fully independent of each other\n",
    "    cv_features = X_train_scaled if 'Neural Network' in final_model_name else X_train\n",
    "    cv_scores = cross_val_score(final_model, cv_features, y_train, cv=5, scoring='r2')\n",
    "    cv_mean = cv_scores.mean()\n",
    "\n",
    "    print(f'\\nCross-Validation Results ({final_model_name}):')\n",
    "    print(f'Mean CV R²: {cv_mean:.4f}')\n",
    "    print(f'Std CV R²: {cv_scores.std():.4f}')\n",
    "    print(f'CV Scores: {cv_scores.round(4)}')\n",
    "\n",
    "    # One bar per fold, with the fold mean and the baseline R² as reference lines\n",
    "    fold_numbers = range(1, len(cv_scores) + 1)\n",
    "    plt.figure(figsize=(10, 6))\n",
    "    plt.bar(fold_numbers, cv_scores, alpha=0.7, color='lightblue')\n",
    "    plt.axhline(y=cv_mean, color='red', linestyle='-', label=f'Mean: {cv_mean:.4f}')\n",
    "    plt.axhline(y=baseline_r2, color='orange', linestyle='--', label=f'Baseline: {baseline_r2:.4f}')\n",
    "    plt.xlabel('CV Fold')\n",
    "    plt.ylabel('R² Score')\n",
    "    plt.title(f'Cross-Validation Scores - {final_model_name}')\n",
    "    plt.legend()\n",
    "    plt.grid(True, alpha=0.3)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save Final Improved Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the selected model, the fitted scaler and a JSON summary of all results\n",
    "os.makedirs('../models', exist_ok=True)\n",
    "os.makedirs('../data/processed', exist_ok=True)\n",
    "\n",
    "if final_model is not None and cv_scores is not None:\n",
    "    # Pickle the winning estimator\n",
    "    final_model_filename = 'final_optimized_model.pkl'\n",
    "    with open(f'../models/{final_model_filename}', 'wb') as f:\n",
    "        pickle.dump(final_model, f)\n",
    "\n",
    "    # Pickle the scaler as well (the neural network is the model that uses scaled inputs)\n",
    "    with open('../models/final_scaler.pkl', 'wb') as f:\n",
    "        pickle.dump(scaler, f)\n",
    "\n",
    "    # Compute each gain once so the JSON summary and the printed report cannot disagree\n",
    "    total_gain = final_r2 - baseline_r2\n",
    "    total_gain_pct = total_gain / baseline_r2 * 100\n",
    "    extra_gain = final_r2 - best_previous_r2\n",
    "    extra_gain_pct = extra_gain / best_previous_r2 * 100\n",
    "    cv_mean = cv_scores.mean()\n",
    "    cv_std = cv_scores.std()\n",
    "\n",
    "    # Headline numbers; float() converts numpy scalars into JSON-serialisable values\n",
    "    final_model_info = {\n",
    "        'final_model_name': final_model_name,\n",
    "        'final_r2': float(final_r2),\n",
    "        'baseline_r2': float(baseline_r2),\n",
    "        'previous_best_r2': float(best_previous_r2),\n",
    "        'total_improvement': float(total_gain),\n",
    "        'total_improvement_percent': float(total_gain_pct),\n",
    "        'additional_improvement': float(extra_gain),\n",
    "        'additional_improvement_percent': float(extra_gain_pct),\n",
    "        'cv_mean': float(cv_mean),\n",
    "        'cv_std': float(cv_std),\n",
    "        'training_samples': len(X_train),\n",
    "        'test_samples': len(X_test),\n",
    "        'hyperparameter_optimization': True,\n",
    "        'ensemble_methods_used': voting_ensemble is not None or bagging_ensemble is not None\n",
    "    }\n",
    "\n",
    "    # Per-model scores, without the (non-serialisable) estimator objects\n",
    "    all_model_results = {\n",
    "        name: {\n",
    "            'r2_score': float(info['r2_score']),\n",
    "            'improvement_percent': float(info['improvement'])\n",
    "        }\n",
    "        for name, info in all_results.items()\n",
    "    }\n",
    "\n",
    "    final_results = {\n",
    "        'final_model_info': final_model_info,\n",
    "        'all_model_results': all_model_results,\n",
    "        'optimization_summary': {\n",
    "            'random_forest_optimized': True,\n",
    "            'xgboost_optimized': True,\n",
    "            'gradient_boosting_optimized': True,\n",
    "            'neural_network_advanced': True,\n",
    "            'voting_ensemble': voting_ensemble is not None,\n",
    "            'bagging_ensemble': bagging_ensemble is not None\n",
    "        }\n",
    "    }\n",
    "\n",
    "    with open('../data/processed/final_model_results.json', 'w') as f:\n",
    "        json.dump(final_results, f, indent=2)\n",
    "\n",
    "    print('Final optimized model and results saved!')\n",
    "    print('\\nFiles saved:')\n",
    "    print('- final_optimized_model.pkl (final model)')\n",
    "    print('- final_scaler.pkl (scaler for final model)')\n",
    "    print('- final_model_results.json (comprehensive results)')\n",
    "\n",
    "    print('\\nFINAL OPTIMIZATION SUMMARY:')\n",
    "    print('=' * 40)\n",
    "    print(f'Baseline R²:          {baseline_r2:.4f}')\n",
    "    print(f'Previous Best R²:     {best_previous_r2:.4f} ({best_previous_model})')\n",
    "    print(f'Final Optimized R²:   {final_r2:.4f} ({final_model_name})')\n",
    "    print('-' * 40)\n",
    "    # The '+' format flag prints the real sign, so a regression shows as '-0.0123'\n",
    "    # instead of the misleading '+-0.0123' produced by a hard-coded plus sign\n",
    "    print(f'Total Improvement:    {total_gain:+.4f} ({total_gain_pct:.1f}%)')\n",
    "    print(f'Additional Improvement: {extra_gain:+.4f} ({extra_gain_pct:.1f}%)')\n",
    "    print(f'Cross-Validation:     {cv_mean:.4f} ± {cv_std:.4f}')\n",
    "    print('=' * 40)\n",
    "else:\n",
    "    print('No valid final model to save')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}