In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Paclitaxel Dose Optimization - Feature Engineering\n",
    "\n",
    "This notebook covers:\n",
    "1. Loading clean data from previous step\n",
    "2. Advanced feature engineering (16 engineered features selected for modeling)\n",
    "3. Feature validation and analysis\n",
    "4. Preparing data for modeling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import required libraries\n",
    "import os\n",
    "import pickle\n",
    "import warnings\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "print('Libraries imported for feature engineering!')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the cleaned dataset written by the previous notebook step\n",
    "df_paclitaxel = pd.read_csv('../data/processed/paclitaxel_clean.csv')\n",
    "\n",
    "# Fail fast, with a clear message, if a column used by the later\n",
    "# feature-engineering cells is absent from the file\n",
    "required_columns = ['ARXSPAN_ID', 'dose', 'viability']\n",
    "missing_columns = [col for col in required_columns if col not in df_paclitaxel.columns]\n",
    "if missing_columns:\n",
    "    raise KeyError(f'paclitaxel_clean.csv is missing required columns: {missing_columns}')\n",
    "\n",
    "print(f'Loaded clean dataset: {df_paclitaxel.shape}')\n",
    "print(f'Original columns: {df_paclitaxel.columns.tolist()}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Phase 1: Mathematical Transformations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1. Log transformations\n",
    "# log10 is only defined for strictly positive doses: a zero dose would give -inf\n",
    "# (not flagged by isnull(), and rejected by pd.cut with integer bins) and a negative\n",
    "# dose gives NaN. Mask non-positive doses to NaN so every log-derived feature is NaN\n",
    "# for those rows and they are removed by the missing-value filter later on.\n",
    "n_non_positive = int((df_paclitaxel['dose'] <= 0).sum())\n",
    "if n_non_positive:\n",
    "    print(f'Warning: {n_non_positive} rows have a non-positive dose; log features set to NaN')\n",
    "positive_dose = df_paclitaxel['dose'].where(df_paclitaxel['dose'] > 0)\n",
    "df_paclitaxel['log_dose'] = np.log10(positive_dose)\n",
    "# Small offset keeps log10 finite when viability is exactly 0\n",
    "df_paclitaxel['log_viability'] = np.log10(df_paclitaxel['viability'] + 1e-10)\n",
    "\n",
    "# 2. Polynomial features\n",
    "df_paclitaxel['dose_squared'] = df_paclitaxel['dose'] ** 2\n",
    "df_paclitaxel['log_dose_squared'] = df_paclitaxel['log_dose'] ** 2\n",
    "\n",
    "# 3. Reciprocal features\n",
    "# The 1e-10 offset avoids division by zero; note that it makes reciprocal_log_dose\n",
    "# about 1e10 when log_dose is 0 (dose == 1)\n",
    "df_paclitaxel['reciprocal_dose'] = 1 / (df_paclitaxel['dose'] + 1e-10)\n",
    "df_paclitaxel['reciprocal_log_dose'] = 1 / (np.abs(df_paclitaxel['log_dose']) + 1e-10)\n",
    "\n",
    "print('Mathematical transformations completed')\n",
    "print('New features: log_dose, log_viability, dose_squared, log_dose_squared, reciprocal_dose, reciprocal_log_dose')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Phase 2: Categorical Encoding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map every ARXSPAN_ID to an integer code; the fitted encoder stays in scope as label_encoder\n",
    "label_encoder = LabelEncoder()\n",
    "cell_line_codes = label_encoder.fit_transform(df_paclitaxel['ARXSPAN_ID'])\n",
    "df_paclitaxel['cell_line_encoded'] = cell_line_codes\n",
    "\n",
    "# Interaction term: log-dose multiplied by the integer cell-line code\n",
    "df_paclitaxel['dose_cell_interaction'] = df_paclitaxel['log_dose'].mul(df_paclitaxel['cell_line_encoded'])\n",
    "\n",
    "print('Categorical encoding completed')\n",
    "print(f'Cell lines encoded: {df_paclitaxel[\"ARXSPAN_ID\"].nunique()} unique cell lines')\n",
    "print('New features: cell_line_encoded, dose_cell_interaction')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Phase 3: Statistical Features per Cell Line"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-cell-line summary statistics of viability (one row per ARXSPAN_ID).\n",
    "# NOTE(review): these aggregates are computed from the 'viability' column over every\n",
    "# row; if viability is also the prediction target, feeding them to a model leaks\n",
    "# target information - consider computing them on training folds only.\n",
    "cell_line_stats = (\n",
    "    df_paclitaxel.groupby('ARXSPAN_ID')['viability']\n",
    "    .agg(\n",
    "        cell_viability_mean='mean',\n",
    "        cell_viability_std='std',\n",
    "        cell_viability_median='median',\n",
    "        cell_viability_min='min',\n",
    "        cell_viability_max='max',\n",
    "        cell_sample_count='count',\n",
    "    )\n",
    "    .reset_index()\n",
    ")\n",
    "\n",
    "# Fill NaN std with 0 (for cell lines with only one sample)\n",
    "cell_line_stats['cell_viability_std'] = cell_line_stats['cell_viability_std'].fillna(0)\n",
    "\n",
    "# Drop any copies of these columns left by an earlier run of this cell, so that\n",
    "# re-running it does not create suffixed duplicates (cell_viability_mean_x / _y).\n",
    "# validate='many_to_one' asserts the stats table has exactly one row per cell line.\n",
    "stat_columns = [col for col in cell_line_stats.columns if col != 'ARXSPAN_ID']\n",
    "df_paclitaxel = df_paclitaxel.drop(columns=stat_columns, errors='ignore').merge(\n",
    "    cell_line_stats, on='ARXSPAN_ID', how='left', validate='many_to_one'\n",
    ")\n",
    "\n",
    "print('Statistical features per cell line added')\n",
    "print('New features: cell_viability_mean, cell_viability_std, cell_viability_median, cell_viability_min, cell_viability_max, cell_sample_count')\n",
    "print('Sample statistics:')\n",
    "print(cell_line_stats.head())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Phase 4: Dose-Related Features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rank each dose within its own cell line: dense rank and fractional (percentile) rank\n",
    "dose_by_cell_line = df_paclitaxel.groupby('ARXSPAN_ID')['dose']\n",
    "df_paclitaxel['dose_rank'] = dose_by_cell_line.rank(method='dense')\n",
    "df_paclitaxel['dose_percentile'] = dose_by_cell_line.rank(pct=True)\n",
    "\n",
    "# Discretise dose and log-dose into 10 bins each, stored as integer bin indices\n",
    "for source_col, bin_col in (('dose', 'dose_bin'), ('log_dose', 'log_dose_bin')):\n",
    "    df_paclitaxel[bin_col] = pd.cut(df_paclitaxel[source_col], bins=10, labels=False)\n",
    "\n",
    "# Flag doses above the overall 75th percentile / below the overall 25th percentile\n",
    "dose_q75 = df_paclitaxel['dose'].quantile(0.75)\n",
    "dose_q25 = df_paclitaxel['dose'].quantile(0.25)\n",
    "df_paclitaxel['is_high_dose'] = df_paclitaxel['dose'].gt(dose_q75).astype(int)\n",
    "df_paclitaxel['is_low_dose'] = df_paclitaxel['dose'].lt(dose_q25).astype(int)\n",
    "\n",
    "print('Dose-related features added')\n",
    "print('New features: dose_rank, dose_percentile, dose_bin, log_dose_bin, is_high_dose, is_low_dose')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Phase 5: Cell Line Sensitivity Categories"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean viability per cell line, split into three bins.\n",
    "# pd.cut assigns labels in ascending bin order, so the lowest mean viability is\n",
    "# labelled 'High' sensitivity and the highest mean viability 'Low'.\n",
    "cell_line_sensitivity = df_paclitaxel.groupby('ARXSPAN_ID')['viability'].mean().reset_index()\n",
    "cell_line_sensitivity['sensitivity_category'] = pd.cut(\n",
    "    cell_line_sensitivity['viability'],\n",
    "    bins=3,\n",
    "    labels=['High', 'Medium', 'Low']\n",
    ")\n",
    "\n",
    "# Encode sensitivity category\n",
    "# NOTE(review): LabelEncoder sorts its classes, so the integer codes follow the\n",
    "# alphabetical label order (High, Low, Medium) rather than the bin order - confirm\n",
    "# that downstream models do not treat sensitivity_encoded as ordinal.\n",
    "sensitivity_encoder = LabelEncoder()\n",
    "cell_line_sensitivity['sensitivity_encoded'] = sensitivity_encoder.fit_transform(\n",
    "    cell_line_sensitivity['sensitivity_category']\n",
    ")\n",
    "\n",
    "# Merge back to main dataframe. Any sensitivity_encoded column left by an earlier run\n",
    "# of this cell is dropped first so re-running does not create _x / _y duplicates;\n",
    "# validate='many_to_one' asserts one sensitivity row per cell line.\n",
    "sensitivity_merge = cell_line_sensitivity[['ARXSPAN_ID', 'sensitivity_encoded']]\n",
    "df_paclitaxel = df_paclitaxel.drop(columns=['sensitivity_encoded'], errors='ignore').merge(\n",
    "    sensitivity_merge, on='ARXSPAN_ID', how='left', validate='many_to_one'\n",
    ")\n",
    "\n",
    "print('Sensitivity categories added')\n",
    "print('New feature: sensitivity_encoded')\n",
    "print('Sensitivity distribution:')\n",
    "print(cell_line_sensitivity['sensitivity_category'].value_counts())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Engineering Summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features selected as model inputs\n",
    "enhanced_features = [\n",
    "    'log_dose', 'cell_line_encoded', 'dose_squared', 'log_dose_squared',\n",
    "    'dose_cell_interaction', 'cell_viability_mean', 'cell_viability_std',\n",
    "    'dose_rank', 'dose_percentile', 'dose_bin', 'log_dose_bin',\n",
    "    'sensitivity_encoded', 'is_high_dose', 'is_low_dose',\n",
    "    'reciprocal_dose', 'reciprocal_log_dose'\n",
    "]\n",
    "\n",
    "# Every selected feature must already exist as a column; fail here with a clear\n",
    "# message rather than with a KeyError when the features are selected later\n",
    "missing_features = [name for name in enhanced_features if name not in df_paclitaxel.columns]\n",
    "assert not missing_features, f'Features missing from df_paclitaxel: {missing_features}'\n",
    "\n",
    "print('Feature Engineering Summary:')\n",
    "# The dataframe already contains the engineered columns at this point, so this is\n",
    "# the post-engineering shape, not the shape of the originally loaded data\n",
    "print(f'Dataset shape after feature engineering: {df_paclitaxel.shape}')\n",
    "print(f'Enhanced features: {len(enhanced_features)}')\n",
    "print(f'Feature list: {enhanced_features}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check for missing values in enhanced features.\n",
    "# +/-inf is converted to NaN first because isnull() does not flag infinities.\n",
    "X_enhanced = df_paclitaxel[enhanced_features].replace([np.inf, -np.inf], np.nan)\n",
    "missing_check = X_enhanced.isnull().sum()\n",
    "print('Missing values in enhanced features:')\n",
    "print(missing_check[missing_check > 0])\n",
    "\n",
    "# Keep only rows where every feature and the target are present\n",
    "y = df_paclitaxel['viability']\n",
    "mask = X_enhanced.notnull().all(axis=1) & y.notnull()\n",
    "X_enhanced_clean = X_enhanced[mask]\n",
    "y_clean = y[mask]\n",
    "\n",
    "print('Clean dataset after feature engineering:')\n",
    "print(f'Rows dropped: {int((~mask).sum())}')\n",
    "print(f'Shape: {X_enhanced_clean.shape}')\n",
    "print(f'Target variable shape: {y_clean.shape}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize feature distributions\n",
    "# Size the grid from the number of features so that none is silently skipped\n",
    "# (16 features give the same 4x4 grid and 20x16 figure as before)\n",
    "n_cols = 4\n",
    "n_rows = max(1, int(np.ceil(len(enhanced_features) / n_cols)))\n",
    "# squeeze=False keeps axes a 2-D array even for a single row\n",
    "fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False)\n",
    "axes = axes.ravel()\n",
    "\n",
    "for ax, feature in zip(axes, enhanced_features):\n",
    "    ax.hist(X_enhanced_clean[feature], bins=30, alpha=0.7)\n",
    "    ax.set_title(f'{feature} Distribution')\n",
    "    ax.set_xlabel(feature)\n",
    "    ax.set_ylabel('Frequency')\n",
    "\n",
    "# Hide panels left over when the feature count is not a multiple of n_cols\n",
    "for ax in axes[len(enhanced_features):]:\n",
    "    ax.set_visible(False)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairwise correlations between the selected features\n",
    "correlation_matrix = X_enhanced_clean.corr()\n",
    "\n",
    "plt.figure(figsize=(14, 12))\n",
    "sns.heatmap(\n",
    "    correlation_matrix,\n",
    "    annot=True,\n",
    "    fmt='.2f',\n",
    "    cmap='coolwarm',\n",
    "    center=0,\n",
    "    square=True,\n",
    "    cbar_kws={'shrink': 0.8},\n",
    ")\n",
    "plt.title('Enhanced Features Correlation Matrix')\n",
    "plt.tight_layout()\n",
    "plt.show()\n",
    "\n",
    "# Collect every feature pair above the diagonal whose absolute correlation exceeds 0.8\n",
    "feature_names = correlation_matrix.columns\n",
    "n_features = len(feature_names)\n",
    "high_corr_pairs = [\n",
    "    (feature_names[row], feature_names[col], correlation_matrix.iloc[row, col])\n",
    "    for row in range(n_features)\n",
    "    for col in range(row + 1, n_features)\n",
    "    if abs(correlation_matrix.iloc[row, col]) > 0.8\n",
    "]\n",
    "\n",
    "print('Highly correlated feature pairs (>0.8):')\n",
    "for feature1, feature2, corr in high_corr_pairs:\n",
    "    print(f'{feature1} - {feature2}: {corr:.3f}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save enhanced dataset and encoders\n",
    "# (os and pickle are imported in the notebook's import cell)\n",
    "\n",
    "# Create directories if they don't exist\n",
    "os.makedirs('../data/processed', exist_ok=True)\n",
    "os.makedirs('../models', exist_ok=True)\n",
    "\n",
    "# Save enhanced data: all columns, restricted to the rows kept by the missing-value mask\n",
    "enhanced_data = df_paclitaxel[mask].copy()\n",
    "enhanced_data.to_csv('../data/processed/paclitaxel_enhanced.csv', index=False)\n",
    "\n",
    "# Save feature matrix and target (same rows, same order as enhanced_data)\n",
    "X_enhanced_clean.to_csv('../data/processed/X_enhanced.csv', index=False)\n",
    "y_clean.to_csv('../data/processed/y_enhanced.csv', index=False)\n",
    "\n",
    "# Save the fitted encoders\n",
    "with open('../models/label_encoder.pkl', 'wb') as f:\n",
    "    pickle.dump(label_encoder, f)\n",
    "\n",
    "with open('../models/sensitivity_encoder.pkl', 'wb') as f:\n",
    "    pickle.dump(sensitivity_encoder, f)\n",
    "\n",
    "# Save feature list, one feature name per line\n",
    "with open('../data/processed/enhanced_features.txt', 'w') as f:\n",
    "    f.writelines(f'{feature}\\n' for feature in enhanced_features)\n",
    "\n",
    "print('Enhanced dataset and encoders saved!')\n",
    "print('Files saved:')\n",
    "print('  - paclitaxel_enhanced.csv (full enhanced dataset)')\n",
    "print('  - X_enhanced.csv (feature matrix)')\n",
    "print('  - y_enhanced.csv (target variable)')\n",
    "print('  - label_encoder.pkl (cell line encoder)')\n",
    "print('  - sensitivity_encoder.pkl (sensitivity encoder)')\n",
    "print('  - enhanced_features.txt (feature list)')\n",
    "print(f'Ready for model training with {len(enhanced_features)} enhanced features!')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}