In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Flatten the directory walk into a single lazy stream of full paths, then
# print one path per line (same traversal order as the nested-loop form).
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
# ============================================================
# INSPECT GATEWAY IMPLEMENTATION
# ============================================================


/kaggle/input/aimo-3-utils/wheels.tar.gz
/kaggle/input/aimo-3-utils/__results__.html
/kaggle/input/aimo-3-utils/__notebook__.ipynb
/kaggle/input/aimo-3-utils/__output__.json
/kaggle/input/aimo-3-utils/custom.css
/kaggle/input/40-50-gpt-oss-120b-tir-dynamictime-kernelpool/reference.csv
/kaggle/input/40-50-gpt-oss-120b-tir-dynamictime-kernelpool/submission.parquet
/kaggle/input/40-50-gpt-oss-120b-tir-dynamictime-kernelpool/__results__.html
/kaggle/input/40-50-gpt-oss-120b-tir-dynamictime-kernelpool/vllm_server.log
/kaggle/input/40-50-gpt-oss-120b-tir-dynamictime-kernelpool/__notebook__.ipynb
/kaggle/input/40-50-gpt-oss-120b-tir-dynamictime-kernelpool/__output__.json
/kaggle/input/40-50-gpt-oss-120b-tir-dynamictime-kernelpool/custom.css
/kaggle/input/aimo-3-submission-demo/submission.parquet
/kaggle/input/aimo-3-submission-demo/__results__.html
/kaggle/input/aimo-3-submission-demo/__notebook__.ipynb
/kaggle/input/aimo-3-submission-demo/__output__.json
/kaggle/input/aimo-3-submission-demo/

In [2]:
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# AIMO3 Dataset Analysis - Complete Overview\n",
        "\n",
        "This notebook contains a comprehensive analysis of the AIMO3 (AI Mathematical Olympiad) dataset, including:\n",
        "- Data loading and exploration\n",
        "- Answer variable analysis (AIMO3: 0-99999 range) with visualizations\n",
        "- Feature analysis\n",
        "- Linear Discriminant Analysis (Classification)\n",
        "- Regression analysis for answer prediction\n",
        "- All necessary graphs and visualizations"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Install required packages if not already installed\n",
        "import importlib\n",
        "import importlib.util\n",
        "import subprocess\n",
        "import sys\n",
        "\n",
        "def install_package(package, import_name=None):\n",
        "    \"\"\"Make sure a package is importable, installing it with pip if it is missing.\n",
        "\n",
        "    Args:\n",
        "        package: Name passed to `pip install`.\n",
        "        import_name: Module name to probe for; defaults to `package`. Needed when\n",
        "            the two differ (e.g. package 'scikit-learn', module 'sklearn').\n",
        "\n",
        "    Raises:\n",
        "        subprocess.CalledProcessError: If the pip command exits with a non-zero status.\n",
        "    \"\"\"\n",
        "    module_name = import_name or package\n",
        "    # find_spec only locates the module. The previous __import__ probe executed the\n",
        "    # whole package and treated any ImportError raised inside an installed package\n",
        "    # as \"not installed\", which triggered a pointless reinstall.\n",
        "    if importlib.util.find_spec(module_name) is not None:\n",
        "        return\n",
        "    print(f\"Installing {package}...\")\n",
        "    subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", package])\n",
        "    # Let the import system notice the files pip just wrote\n",
        "    importlib.invalidate_caches()\n",
        "    print(f\"{package} installed successfully!\")\n",
        "\n",
        "# Install seaborn if needed\n",
        "install_package(\"seaborn\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.linear_model import LinearRegression\n",
        "from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, classification_report, roc_curve, auc\n",
        "from scipy import stats\n",
        "import warnings\n",
        "warnings.filterwarnings('ignore')\n",
        "\n",
        "# Set style for better-looking plots.\n",
        "# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'; older\n",
        "# releases only know 'seaborn-darkgrid'. Use whichever name this install provides\n",
        "# rather than failing with OSError on an unknown style name.\n",
        "for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):\n",
        "    if style_name in plt.style.available:\n",
        "        plt.style.use(style_name)\n",
        "        break\n",
        "sns.set_palette(\"husl\")\n",
        "\n",
        "# Initialize train so later cells can test `train is not None` even if loading fails\n",
        "train = None\n",
        "\n",
        "print(\"=\" * 80)\n",
        "print(\"Dataset Analysis - Data Analysis\")\n",
        "print(\"=\" * 80)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 1. Data Loading"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Update this path to your AIMO3 reference.csv file location\n",
        "train_path = '../input/ai-mathematical-olympiad-progress-prize-3/reference.csv'\n",
        "# Alternative: train_path = 'reference.csv'  # if file is in current directory\n",
        "\n",
        "# Load the reference problems. On failure train is left as None so that every\n",
        "# later cell's `train is not None` guard skips cleanly instead of raising.\n",
        "try:\n",
        "    train = pd.read_csv(train_path)\n",
        "    print(f\"Data loaded: {len(train)} rows, {len(train.columns)} columns\")\n",
        "    print(f\"Memory usage: {train.memory_usage(deep=True).sum() / 1024**2:.2f} MB\")\n",
        "except FileNotFoundError:\n",
        "    print(f\"Warning: Could not find {train_path}\")\n",
        "    print(\"Please update train_path to point at your reference.csv file\")\n",
        "    train = None\n",
        "except Exception as e:\n",
        "    # Any other read or parse failure\n",
        "    print(f\"Error loading data: {e}\")\n",
        "    train = None"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 2. Basic Data Exploration"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Print a quick structural summary of the loaded frame\n",
        "if train is not None:\n",
        "    print(f\"Shape: {train.shape}\")\n",
        "    print(f\"Columns: {list(train.columns[:10])}... (showing first 10)\")\n",
        "    print(f\"\\nData types:\\n{train.dtypes.value_counts()}\")\n",
        "    print(f\"\\nMissing values:\\n{train.isnull().sum().sum()} total missing values\")\n",
        "\n",
        "# Jupyter only renders the value of a cell's last top-level expression, so the\n",
        "# preview must sit outside the if-block (inside it, train.head() was silently discarded).\n",
        "train.head() if train is not None else None"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 3. Answer Variable Analysis (AIMO3)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Summary statistics for the answer column, followed by a binary target\n",
        "# (1 when the answer is positive, 0 otherwise) stored in train['target'].\n",
        "if train is None:\n",
        "    pass  # nothing was loaded, so there is nothing to summarise\n",
        "elif 'answer' not in train.columns:\n",
        "    print(\"Note: 'answer' column not found. Available columns:\", list(train.columns))\n",
        "else:\n",
        "    answer = train['answer']\n",
        "\n",
        "    # Location, spread and shape of the distribution\n",
        "    print(f\"The minimum value for answer is: {answer.min()}\")\n",
        "    print(f\"The maximum value for answer is: {answer.max()}\")\n",
        "    print(f\"The mean value for answer is: {answer.mean():.2f}\")\n",
        "    print(f\"The median value for answer is: {answer.median():.2f}\")\n",
        "    print(f\"Skew of answer is: {answer.skew():.2f}\")\n",
        "    print(f\"Kurtosis of answer is: {answer.kurtosis():.2f}\")\n",
        "    print(f\"Standard deviation: {answer.std():.2f}\")\n",
        "\n",
        "    # How many answers fall at or below the AIME ceiling of 999\n",
        "    print(f\"\\nAnswer range: 0 to 99999\")\n",
        "    in_aime_range = (answer <= 999).sum()\n",
        "    aime_share = in_aime_range / len(answer) * 100\n",
        "    print(f\"Answers in AIME range (0-999): {in_aime_range} ({aime_share:.1f}%)\")\n",
        "    print(f\"Unique answers: {answer.nunique()}\")\n",
        "\n",
        "    # Binary classification target: positive answer vs. zero\n",
        "    is_positive = train['answer'] > 0\n",
        "    train['target'] = is_positive.astype(int)\n",
        "    target_column = train['target']\n",
        "    print(f\"\\nTarget distribution (classification):\")\n",
        "    print(target_column.value_counts())\n",
        "    print(f\"Positive class percentage: {target_column.mean()*100:.2f}%\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 4. Answer Variable Visualizations (AIMO3)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Four-panel overview of the answer column: histogram, box plot, Q-Q plot and\n",
        "# answer value against row position.\n",
        "if train is None:\n",
        "    pass  # no data loaded, nothing to draw\n",
        "elif 'answer' not in train.columns:\n",
        "    print(\"Note: 'answer' column not found. Available columns:\", list(train.columns))\n",
        "else:\n",
        "    fig, axes = plt.subplots(2, 2, figsize=(15, 12))\n",
        "    (ax_hist, ax_box), (ax_qq, ax_scatter) = axes\n",
        "    fig.suptitle('Answer Variable Analysis (AIMO3)', fontsize=16, fontweight='bold')\n",
        "\n",
        "    answer = train['answer']\n",
        "\n",
        "    # Top-left: histogram with mean and median markers\n",
        "    ax_hist.hist(answer, bins=min(50, answer.nunique()), edgecolor='black', alpha=0.7)\n",
        "    answer_mean = answer.mean()\n",
        "    ax_hist.axvline(answer_mean, color='red', linestyle='--', linewidth=2, label=f'Mean: {answer_mean:.0f}')\n",
        "    answer_median = answer.median()\n",
        "    ax_hist.axvline(answer_median, color='green', linestyle='--', linewidth=2, label=f'Median: {answer_median:.0f}')\n",
        "    ax_hist.set(xlabel='Answer Value', ylabel='Frequency', title='Distribution of Answer Values')\n",
        "    ax_hist.legend()\n",
        "\n",
        "    # Top-right: box plot\n",
        "    ax_box.boxplot(answer, vert=True)\n",
        "    ax_box.set(ylabel='Answer Value', title='Box Plot of Answer Values')\n",
        "\n",
        "    # Bottom-left: Q-Q plot against a normal distribution\n",
        "    stats.probplot(answer, dist=\"norm\", plot=ax_qq)\n",
        "    ax_qq.set_title('Q-Q Plot (Normality Check)')\n",
        "\n",
        "    # Bottom-right: answer value per row, with the AIME ceiling marked\n",
        "    ax_scatter.scatter(range(len(answer)), answer.values, alpha=0.6, s=50)\n",
        "    ax_scatter.axhline(999, color='orange', linestyle='--', linewidth=2, label='AIME Range (0-999)')\n",
        "    ax_scatter.set(xlabel='Problem Index', ylabel='Answer Value', title='Answer Values by Problem')\n",
        "    ax_scatter.legend()\n",
        "\n",
        "    # Same light grid on every panel\n",
        "    for panel in axes.flat:\n",
        "        panel.grid(True, alpha=0.3)\n",
        "\n",
        "    plt.tight_layout()\n",
        "    plt.show()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 5. Weight Analysis (Not applicable for AIMO3)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Placeholder kept from the original template, which analysed a weight column\n",
        "# in Jane Street data. AIMO3 has no such column (it is about predicting answers\n",
        "# in the 0-99999 range), so this cell only reports that the step is skipped.\n",
        "print(\"Weight Analysis skipped - not applicable for AIMO3 dataset\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 6. Daily Ratio Analysis (Not applicable for AIMO3)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Placeholder kept from the original template, which relied on the date, weight\n",
        "# and resp columns of Jane Street data. AIMO3 provides an 'answer' column\n",
        "# (0-99999 range) instead, so this cell only reports that the step is skipped.\n",
        "print(\"Daily Ratio Analysis skipped - not applicable for AIMO3 dataset\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 7. Feature Analysis"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Defined before the guard so later cells can safely test len(continuous_features)\n",
        "# even when no data was loaded or the dataset has no feature_* columns.\n",
        "feature_cols = []\n",
        "continuous_features = []\n",
        "\n",
        "if train is not None:\n",
        "    # Identify feature columns\n",
        "    feature_cols = [col for col in train.columns if col.startswith('feature_')]\n",
        "    print(f\"Found {len(feature_cols)} feature columns\")\n",
        "\n",
        "    if len(feature_cols) > 0:\n",
        "        # Continuous = float64/int64 column with more than 10% unique values\n",
        "        continuous_features = [\n",
        "            col for col in feature_cols\n",
        "            if train[col].dtype in ['float64', 'int64']\n",
        "            and train[col].nunique() / len(train) > 0.1\n",
        "        ]\n",
        "        print(f\"Found {len(continuous_features)} continuous features\")\n",
        "        print(\"Showing only top 10 of continuous features\")\n",
        "\n",
        "    # Feature correlation with target (needs at least one continuous feature)\n",
        "    if len(continuous_features) > 0 and 'target' in train.columns:\n",
        "        # Remove the target's self-correlation by label, not by position: when the\n",
        "        # target is constant its self-correlation is NaN and it does not sort first.\n",
        "        correlations = (\n",
        "            train[continuous_features + ['target']]\n",
        "            .corr()['target']\n",
        "            .drop('target')\n",
        "            .abs()\n",
        "            .sort_values(ascending=False)\n",
        "        )\n",
        "        top_corr = correlations.head(10)\n",
        "        top_features = top_corr.index\n",
        "\n",
        "        print(f\"\\nTop 10 features correlated with target:\")\n",
        "        for feat, corr in top_corr.items():\n",
        "            print(f\"  {feat}: {corr:.4f}\")\n",
        "\n",
        "        # Feature importance visualization\n",
        "        fig, axes = plt.subplots(2, 2, figsize=(18, 12))\n",
        "        fig.suptitle('Feature Analysis', fontsize=16, fontweight='bold')\n",
        "\n",
        "        # Top-left panel: top correlated features\n",
        "        axes[0, 0].barh(range(len(top_corr)), top_corr.values)\n",
        "        axes[0, 0].set_yticks(range(len(top_corr)))\n",
        "        axes[0, 0].set_yticklabels([f.replace('feature_', 'f_') for f in top_corr.index])\n",
        "        axes[0, 0].set_xlabel('Absolute Correlation with Target')\n",
        "        axes[0, 0].set_title('Top 10 Features Correlated with Target')\n",
        "        axes[0, 0].grid(True, alpha=0.3, axis='x')\n",
        "\n",
        "        # The three remaining panels show the distributions of the strongest features.\n",
        "        # (The earlier row/col arithmetic skipped every second feature and never used\n",
        "        # the bottom-left panel.)\n",
        "        hist_axes = [axes[0, 1], axes[1, 0], axes[1, 1]]\n",
        "        hist_feats = list(top_corr.index[:len(hist_axes)])\n",
        "        for ax, feat in zip(hist_axes, hist_feats):\n",
        "            short_name = feat.replace('feature_', 'f_')\n",
        "            ax.hist(train[feat].dropna(), bins=50, edgecolor='black', alpha=0.7)\n",
        "            ax.set_xlabel(short_name)\n",
        "            ax.set_ylabel('Frequency')\n",
        "            ax.set_title(f'Distribution of {short_name}')\n",
        "            ax.grid(True, alpha=0.3)\n",
        "        # Hide panels for which there was no feature to plot\n",
        "        for ax in hist_axes[len(hist_feats):]:\n",
        "            ax.set_visible(False)\n",
        "\n",
        "        plt.tight_layout()\n",
        "        plt.show()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 8. Linear Discriminant Analysis - Classification"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# continuous_features comes from the feature-analysis cell, which may not have\n",
        "# assigned it; default it here so the guard below cannot raise NameError.\n",
        "if 'continuous_features' not in globals():\n",
        "    continuous_features = []\n",
        "\n",
        "if train is not None and 'target' in train.columns and len(continuous_features) > 0:\n",
        "    print(\"Linear Discriminant Analysis for AIMO3 Classification (answer > 0 vs answer = 0)\")\n",
        "\n",
        "    # Prepare data\n",
        "    X = train[continuous_features].fillna(0)\n",
        "    y = train['target']\n",
        "\n",
        "    class_counts = y.value_counts()\n",
        "    if len(class_counts) < 2 or class_counts.min() < 2:\n",
        "        # LDA cannot separate a single class, and the stratified split needs at\n",
        "        # least two rows of every class.\n",
        "        print(f\"Skipping LDA: need at least 2 samples in each of 2 classes, found {class_counts.to_dict()}\")\n",
        "    else:\n",
        "        # Split data, keeping the class ratio the same in both parts\n",
        "        X_train, X_test, y_train, y_test = train_test_split(\n",
        "            X, y, test_size=0.2, random_state=42, stratify=y\n",
        "        )\n",
        "\n",
        "        # Scale features (statistics are fitted on the training part only)\n",
        "        scaler = StandardScaler()\n",
        "        X_train_scaled = scaler.fit_transform(X_train)\n",
        "        X_test_scaled = scaler.transform(X_test)\n",
        "\n",
        "        # Train LDA\n",
        "        lda = LinearDiscriminantAnalysis()\n",
        "        lda.fit(X_train_scaled, y_train)\n",
        "\n",
        "        # Evaluate\n",
        "        train_score = lda.score(X_train_scaled, y_train)\n",
        "        test_score = lda.score(X_test_scaled, y_test)\n",
        "\n",
        "        print(f\"Linear Discriminant Analysis training set score: {train_score:.3f}\")\n",
        "        print(f\"Linear Discriminant Analysis test set score: {test_score:.3f}\")\n",
        "\n",
        "        # Predictions (column 1 of predict_proba is the positive class)\n",
        "        y_pred = lda.predict(X_test_scaled)\n",
        "        y_pred_proba = lda.predict_proba(X_test_scaled)[:, 1]\n",
        "\n",
        "        # Confusion matrix visualization\n",
        "        cm = confusion_matrix(y_test, y_pred)\n",
        "\n",
        "        fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
        "        fig.suptitle('Linear Discriminant Analysis - AIMO3 Classification Results', fontsize=16, fontweight='bold')\n",
        "\n",
        "        # Confusion matrix\n",
        "        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0])\n",
        "        axes[0].set_xlabel('Predicted')\n",
        "        axes[0].set_ylabel('Actual')\n",
        "        axes[0].set_title('Confusion Matrix')\n",
        "\n",
        "        # ROC curve\n",
        "        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)\n",
        "        roc_auc = auc(fpr, tpr)\n",
        "\n",
        "        axes[1].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.3f})')\n",
        "        axes[1].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')\n",
        "        axes[1].set_xlim([0.0, 1.0])\n",
        "        axes[1].set_ylim([0.0, 1.05])\n",
        "        axes[1].set_xlabel('False Positive Rate')\n",
        "        axes[1].set_ylabel('True Positive Rate')\n",
        "        axes[1].set_title('ROC Curve')\n",
        "        axes[1].legend(loc=\"lower right\")\n",
        "        axes[1].grid(True, alpha=0.3)\n",
        "\n",
        "        plt.tight_layout()\n",
        "        plt.show()\n",
        "\n",
        "        print(\"\\nClassification Report:\")\n",
        "        print(classification_report(y_test, y_pred))"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## 9. Regression Analysis (AIMO3 Answer Prediction)"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# ============================================================================\n",
        "# PHENOMENAL AIMO3 REGRESSION ANALYSIS - ALL-IN-ONE CELL\n",
        "# ============================================================================\n",
        "# Comprehensive regression analysis with beautiful visualizations\n",
        "# Based on AIMO3 answer prediction (0-99999 range)\n",
        "\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "import seaborn as sns\n",
        "from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet\n",
        "from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor\n",
        "from sklearn.preprocessing import StandardScaler\n",
        "from sklearn.model_selection import train_test_split\n",
        "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
        "import warnings\n",
        "warnings.filterwarnings('ignore')\n",
        "\n",
        "# Set beautiful style\n",
        "plt.style.use('seaborn-v0_8-darkgrid')\n",
        "sns.set_palette(\"husl\")\n",
        "plt.rcParams['figure.facecolor'] = 'white'\n",
        "plt.rcParams['axes.facecolor'] = 'white'\n",
        "\n",
        "# Check if data is loaded\n",
        "if train is not None and 'answer' in train.columns:\n",
        "    # Prepare data\n",
        "    feature_cols = [col for col in train.columns if col.startswith('feature_')]\n",
        "    if len(feature_cols) > 0:\n",
        "        # Identify continuous features\n",
        "        continuous_features = []\n",
        "        for col in feature_cols:\n",
        "            if train[col].dtype in ['float64', 'int64']:\n",
        "                unique_ratio = train[col].nunique() / len(train)\n",
        "                if unique_ratio > 0.1:\n",
        "                    continuous_features.append(col)\n",
        "        \n",
        "        if len(continuous_features) > 0:\n",
        "            X = train[continuous_features].fillna(0)\n",
        "            y = train['answer']\n",
        "            \n",
        "            # Split data\n",
        "            X_train, X_test, y_train, y_test = train_test_split(\n",
        "                X, y, test_size=0.2, random_state=42\n",
        "            )\n",
        "            \n",
        "            # Scale features\n",
        "            scaler = StandardScaler()\n",
        "            X_train_scaled = scaler.fit_transform(X_train)\n",
        "            X_test_scaled = scaler.transform(X_test)\n",
        "            \n",
        "            # Train multiple models for comparison\n",
        "            models = {\n",
        "                'Linear Regression': LinearRegression(),\n",
        "                'Ridge (L2)': Ridge(alpha=1.0),\n",
        "                'Lasso (L1)': Lasso(alpha=0.1),\n",
        "                'Elastic Net': ElasticNet(alpha=0.1, l1_ratio=0.5),\n",
        "                'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10),\n",
        "                'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=5)\n",
        "            }\n",
        "            \n",
        "            results = {}\n",
        "            predictions = {}\n",
        "            \n",
        "            print(\"=\" * 80)\n",
        "            print(\"PHENOMENAL AIMO3 REGRESSION ANALYSIS\")\n",
        "            print(\"=\" * 80)\n",
        "            print(f\"\\nTraining {len(models)} models on {len(continuous_features)} features...\")\n",
        "            print(f\"Training samples: {len(X_train)}, Test samples: {len(X_test)}\")\n",
        "            print(f\"Answer range: {y.min()} to {y.max()}\")\n",
        "            print(\"\\n\" + \"-\" * 80)\n",
        "            \n",
        "            # Train all models\n",
        "            for name, model in models.items():\n",
        "                model.fit(X_train_scaled, y_train)\n",
        "                y_pred = model.predict(X_test_scaled)\n",
        "                predictions[name] = y_pred\n",
        "                \n",
        "                mse = mean_squared_error(y_test, y_pred)\n",
        "                rmse = np.sqrt(mse)\n",
        "                mae = mean_absolute_error(y_test, y_pred)\n",
        "                r2 = r2_score(y_test, y_pred)\n",
        "                \n",
        "                results[name] = {\n",
        "                    'MSE': mse,\n",
        "                    'RMSE': rmse,\n",
        "                    'MAE': mae,\n",
        "                    'R¬≤': r2\n",
        "                }\n",
        "                \n",
        "                print(f\"{name:25s} | R¬≤: {r2:7.4f} | RMSE: {rmse:10.2f} | MAE: {mae:10.2f}\")\n",
        "            \n",
        "            print(\"-\" * 80)\n",
        "            \n",
        "            # Select best model\n",
        "            best_model_name = max(results, key=lambda x: results[x]['R¬≤'])\n",
        "            best_pred = predictions[best_model_name]\n",
        "            best_r2 = results[best_model_name]['R¬≤']\n",
        "            best_rmse = results[best_model_name]['RMSE']\n",
        "            \n",
        "            print(f\"\\nüèÜ BEST MODEL: {best_model_name}\")\n",
        "            print(f\"   R¬≤ Score: {best_r2:.4f}\")\n",
        "            print(f\"   RMSE: {best_rmse:.2f}\")\n",
        "            print(\"=\" * 80)\n",
        "            \n",
        "            # ========================================================================\n",
        "            # PHENOMENAL VISUALIZATIONS\n",
        "            # ========================================================================\n",
        "            \n",
        "            # Create a comprehensive figure with multiple subplots\n",
        "            fig = plt.figure(figsize=(20, 16))\n",
        "            gs = fig.add_gridspec(4, 3, hspace=0.3, wspace=0.3)\n",
        "            \n",
        "            # 1. PREDICTED VS ACTUAL (Top Left - Large)\n",
        "            ax1 = fig.add_subplot(gs[0:2, 0:2])\n",
        "            ax1.scatter(y_test, best_pred, alpha=0.6, s=30, c=y_test, cmap='viridis', edgecolors='black', linewidth=0.5)\n",
        "            ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], \n",
        "                    'r--', lw=3, label='Perfect Prediction', zorder=5)\n",
        "            ax1.set_xlabel('Actual Answer', fontsize=14, fontweight='bold')\n",
        "            ax1.set_ylabel('Predicted Answer', fontsize=14, fontweight='bold')\n",
        "            ax1.set_title(f'Predicted vs Actual Answer\\n{best_model_name} (R¬≤ = {best_r2:.4f})', \n",
        "                         fontsize=16, fontweight='bold', pad=15)\n",
        "            ax1.legend(fontsize=12, loc='upper left')\n",
        "            ax1.grid(True, alpha=0.3, linestyle='--')\n",
        "            ax1.set_facecolor('#f8f9fa')\n",
        "            \n",
        "            # Add statistics text box\n",
        "            stats_text = f'RMSE: {best_rmse:.2f}\\nMAE: {results[best_model_name][\"MAE\"]:.2f}'\n",
        "            ax1.text(0.02, 0.98, stats_text, transform=ax1.transAxes, \n",
        "                    fontsize=11, verticalalignment='top', \n",
        "                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))\n",
        "            \n",
        "            # 2. RESIDUAL PLOT (Top Right)\n",
        "            ax2 = fig.add_subplot(gs[0:2, 2])\n",
        "            residuals = y_test - best_pred\n",
        "            ax2.scatter(best_pred, residuals, alpha=0.6, s=30, c=residuals, \n",
        "                       cmap='RdBu_r', edgecolors='black', linewidth=0.5)\n",
        "            ax2.axhline(y=0, color='r', linestyle='--', linewidth=3, label='Zero Residual')\n",
        "            ax2.axhline(y=residuals.std() * 2, color='orange', linestyle=':', linewidth=2, alpha=0.7, label='¬±2œÉ')\n",
        "            ax2.axhline(y=-residuals.std() * 2, color='orange', linestyle=':', linewidth=2, alpha=0.7)\n",
        "            ax2.set_xlabel('Predicted Answer', fontsize=12, fontweight='bold')\n",
        "            ax2.set_ylabel('Residuals', fontsize=12, fontweight='bold')\n",
        "            ax2.set_title('Residual Plot', fontsize=14, fontweight='bold', pad=10)\n",
        "            ax2.legend(fontsize=10)\n",
        "            ax2.grid(True, alpha=0.3, linestyle='--')\n",
        "            ax2.set_facecolor('#f8f9fa')\n",
        "            \n",
        "            # 3. RESIDUAL DISTRIBUTION (Middle Left)\n",
        "            ax3 = fig.add_subplot(gs[2, 0])\n",
        "            ax3.hist(residuals, bins=50, edgecolor='black', alpha=0.7, color='steelblue')\n",
        "            ax3.axvline(0, color='red', linestyle='--', linewidth=2, label='Zero')\n",
        "            ax3.axvline(residuals.mean(), color='green', linestyle='--', linewidth=2, label=f'Mean: {residuals.mean():.2f}')\n",
        "            ax3.set_xlabel('Residuals', fontsize=11, fontweight='bold')\n",
        "            ax3.set_ylabel('Frequency', fontsize=11, fontweight='bold')\n",
        "            ax3.set_title('Residual Distribution', fontsize=12, fontweight='bold')\n",
        "            ax3.legend(fontsize=9)\n",
        "            ax3.grid(True, alpha=0.3, axis='y')\n",
        "            \n",
        "            # 4. Q-Q PLOT FOR RESIDUALS (Middle Center)\n",
        "            from scipy import stats\n",
        "            ax4 = fig.add_subplot(gs[2, 1])\n",
        "            stats.probplot(residuals, dist=\"norm\", plot=ax4)\n",
        "            ax4.set_title('Q-Q Plot (Normality Check)', fontsize=12, fontweight='bold')\n",
        "            ax4.grid(True, alpha=0.3)\n",
        "            \n",
        "            # 5. MODEL COMPARISON (Middle Right)\n",
        "            ax5 = fig.add_subplot(gs[2, 2])\n",
        "            model_names = list(results.keys())\n",
        "            r2_scores = [results[m]['R¬≤'] for m in model_names]\n",
        "            colors = plt.cm.RdYlGn(np.linspace(0.3, 0.9, len(model_names)))\n",
        "            bars = ax5.barh(model_names, r2_scores, color=colors, edgecolor='black', linewidth=1.5)\n",
        "            ax5.set_xlabel('R¬≤ Score', fontsize=11, fontweight='bold')\n",
        "            ax5.set_title('Model Comparison (R¬≤)', fontsize=12, fontweight='bold')\n",
        "            ax5.grid(True, alpha=0.3, axis='x')\n",
        "            ax5.set_xlim([min(r2_scores) - 0.1, max(r2_scores) + 0.1])\n",
        "            \n",
        "            # Highlight best model\n",
        "            best_idx = model_names.index(best_model_name)\n",
        "            bars[best_idx].set_edgecolor('gold')\n",
        "            bars[best_idx].set_linewidth(3)\n",
        "            \n",
        "            # 6. ERROR METRICS COMPARISON (Bottom Left)\n",
        "            ax6 = fig.add_subplot(gs[3, 0])\n",
        "            metrics = ['RMSE', 'MAE']\n",
        "            x_pos = np.arange(len(model_names))\n",
        "            width = 0.35\n",
        "            \n",
        "            rmse_vals = [results[m]['RMSE'] for m in model_names]\n",
        "            mae_vals = [results[m]['MAE'] for m in model_names]\n",
        "            \n",
        "            # Normalize for better visualization\n",
        "            rmse_norm = [v / max(rmse_vals) for v in rmse_vals]\n",
        "            mae_norm = [v / max(mae_vals) for v in mae_vals]\n",
        "            \n",
        "            ax6.bar(x_pos - width/2, rmse_norm, width, label='RMSE (norm)', alpha=0.8, color='coral')\n",
        "            ax6.bar(x_pos + width/2, mae_norm, width, label='MAE (norm)', alpha=0.8, color='skyblue')\n",
        "            ax6.set_ylabel('Normalized Error', fontsize=11, fontweight='bold')\n",
        "            ax6.set_title('Error Metrics Comparison', fontsize=12, fontweight='bold')\n",
        "            ax6.set_xticks(x_pos)\n",
        "            ax6.set_xticklabels([m[:10] for m in model_names], rotation=45, ha='right', fontsize=8)\n",
        "            ax6.legend(fontsize=9)\n",
        "            ax6.grid(True, alpha=0.3, axis='y')\n",
        "            \n",
        "            # 7. PREDICTION ERROR DISTRIBUTION (Bottom Center)\n",
        "            ax7 = fig.add_subplot(gs[3, 1])\n",
        "            abs_errors = np.abs(residuals)\n",
        "            ax7.hist(abs_errors, bins=50, edgecolor='black', alpha=0.7, color='purple')\n",
        "            ax7.axvline(abs_errors.mean(), color='red', linestyle='--', linewidth=2, \n",
        "                       label=f'Mean: {abs_errors.mean():.2f}')\n",
        "            ax7.axvline(abs_errors.median(), color='green', linestyle='--', linewidth=2, \n",
        "                       label=f'Median: {abs_errors.median():.2f}')\n",
        "            ax7.set_xlabel('Absolute Error', fontsize=11, fontweight='bold')\n",
        "            ax7.set_ylabel('Frequency', fontsize=11, fontweight='bold')\n",
        "            ax7.set_title('Absolute Error Distribution', fontsize=12, fontweight='bold')\n",
        "            ax7.legend(fontsize=9)\n",
        "            ax7.grid(True, alpha=0.3, axis='y')\n",
        "            \n",
        "            # 8. ACTUAL VS PREDICTED DISTRIBUTION (Bottom Right)\n",
        "            ax8 = fig.add_subplot(gs[3, 2])\n",
        "            ax8.hist(y_test, bins=50, alpha=0.5, label='Actual', color='blue', edgecolor='black')\n",
        "            ax8.hist(best_pred, bins=50, alpha=0.5, label='Predicted', color='orange', edgecolor='black')\n",
        "            ax8.set_xlabel('Answer Value', fontsize=11, fontweight='bold')\n",
        "            ax8.set_ylabel('Frequency', fontsize=11, fontweight='bold')\n",
        "            ax8.set_title('Distribution Comparison', fontsize=12, fontweight='bold')\n",
        "            ax8.legend(fontsize=10)\n",
        "            ax8.grid(True, alpha=0.3, axis='y')\n",
        "            \n",
        "            # Overall title\n",
        "            fig.suptitle('PHENOMENAL AIMO3 REGRESSION ANALYSIS - COMPREHENSIVE VISUALIZATION', \n",
        "                        fontsize=18, fontweight='bold', y=0.995)\n",
        "            \n",
        "            plt.tight_layout(rect=[0, 0, 1, 0.99])\n",
        "            plt.show()\n",
        "            \n",
        "            print(\"\\n‚úÖ All visualizations generated successfully!\")\n",
        "            print(f\"üìä Best model ({best_model_name}) achieved R¬≤ = {best_r2:.4f}\")\n",
        "            \n",
        "        else:\n",
        "            print(\"‚ùå No continuous features found for regression analysis.\")\n",
        "    else:\n",
        "        print(\"‚ùå No feature columns found in the dataset.\")\n",
        "else:\n",
        "    print(\"‚ùå Data not loaded or 'answer' column not found.\")\n",
        "    print(\"   Please ensure 'train' DataFrame is loaded with 'answer' column.\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## AIMO3 Inference Server (GPT-OSS-120B)\n",
        "\n",
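        "**Note:** This section contains the inference code for solving AIMO3 problems with the GPT-OSS-120B model. It requires specific Kaggle inputs (the model weights, the optional compile cache, and the packages under `/kaggle/usr/lib`) and the matching environment setup."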
      ]
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "%%time\n",
        "# Read every file under /kaggle/usr/lib once and discard the bytes (32 parallel `cat`\n",
        "# processes, up to 500 files each); presumably this warms the OS page cache for later imports.\n",
        "!find /kaggle/usr/lib -type f -print0 | xargs -0 -P 32 -n 500 cat > /dev/null\n",
        "\n",
        "def cache_model(path, exts=(\".bin\", \".pt\", \".safetensors\"), num_workers=None, chunk_mb=256):\n",
        "    \"\"\"Pre-read model weight files into OS page cache.\"\"\"\n",
        "    import os\n",
        "    import multiprocessing\n",
        "    import time\n",
        "    from concurrent.futures import ThreadPoolExecutor, as_completed\n",
        "    \n",
        "    def warmup_file(fpath):\n",
        "        chunk_size = chunk_mb * 1024 * 1024\n",
        "        total = 0\n",
        "        with open(fpath, \"rb\") as f:\n",
        "            while True:\n",
        "                data = f.read(chunk_size)\n",
        "                if not data:\n",
        "                    break\n",
        "                total += len(data)\n",
        "        return fpath, total\n",
        "    \n",
        "    if os.path.isdir(path):\n",
        "        files = [\n",
        "            os.path.join(root, name)\n",
        "            for root, _, names in os.walk(path)\n",
        "            for name in names\n",
        "            if name.endswith(exts)\n",
        "        ]\n",
        "        files.sort()\n",
        "    else:\n",
        "        files = [path]\n",
        "    \n",
        "    if not files:\n",
        "        raise ValueError(f\"No model files found under: {path}\")\n",
        "    \n",
        "    if num_workers is None:\n",
        "        try:\n",
        "            num_workers = min(multiprocessing.cpu_count(), 8)\n",
        "        except Exception:\n",
        "            num_workers = 4\n",
        "    \n",
        "    print(f\"[cache_model] {len(files)} file(s), {num_workers} worker(s)\")\n",
        "    t0 = time.time()\n",
        "    total_bytes = 0\n",
        "    \n",
        "    with ThreadPoolExecutor(max_workers=num_workers) as pool:\n",
        "        futures = {pool.submit(warmup_file, f): f for f in files}\n",
        "        for i, fut in enumerate(as_completed(futures), 1):\n",
        "            fpath, n = fut.result()\n",
        "            total_bytes += n\n",
        "            print(f\"[{i}/{len(files)}] cached {os.path.basename(fpath)}\")\n",
        "    \n",
        "    elapsed = time.time() - t0\n",
        "    gb = total_bytes / 1024**3\n",
        "    print(f\"[cache_model] total read ‚âà {gb:.2f} GB in {elapsed:.2f}s\")\n",
        "    return total_bytes\n",
        "\n",
        "# Warm up the model weights into cache (faster loading)\n",
        "# Same directory that the vLLM server is later started with; 16 reader threads, 1024 MiB per read call.\n",
        "cache_model(\"/kaggle/input/gpt-oss-120b/transformers/default/1\", num_workers=16, chunk_mb=1024)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "%%time\n",
        "# Copy vLLM compile cache if available\n",
        "import os\n",
        "if os.path.exists(\"/kaggle/input/gpt-oss-120b-cache-compile/torch_compile_cache\"):\n",
        "    !mkdir -p /root/.cache/vllm/\n",
        "    !cp -r /kaggle/input/gpt-oss-120b-cache-compile/torch_compile_cache /root/.cache/vllm/\n",
        "\n",
        "# NOTE(review): `uninstall_proc` and `subprocess` are not defined in this cell; they must come from an\n",
        "# earlier cell (presumably a background process started with subprocess.Popen) - confirm this cell\n",
        "# still works under Restart & Run All.\n",
        "uninstall_proc.wait()\n",
        "# Lists the tiktoken encodings directory; presumably a sanity check that the files are in place.\n",
        "subprocess.run([\"ls\", \"/kaggle/usr/lib/pip_install_aimo3_1/tiktoken_encodings\"])\n",
        "\n",
        "# Process-wide environment variables, set here so that code started later in this kernel (and the\n",
        "# child processes it spawns) inherits them. TIKTOKEN_ENCODINGS_BASE points at the directory listed above.\n",
        "os.environ[\"TRANSFORMERS_NO_TF\"] = \"1\"\n",
        "os.environ[\"TRANSFORMERS_NO_FLAX\"] = \"1\"\n",
        "os.environ[\"TRITON_PTXAS_PATH\"] = \"/usr/local/cuda/bin/ptxas\"\n",
        "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n",
        "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
        "os.environ[\"TIKTOKEN_ENCODINGS_BASE\"] = \"/kaggle/usr/lib/pip_install_aimo3_1/tiktoken_encodings\""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "%%writefile local_python_tool.py\n",
        "\"\"\"Python tool using Jupyter kernel for stateful execution.\"\"\"\n",
        "import os\n",
        "import queue\n",
        "import threading\n",
        "from abc import ABC, abstractmethod\n",
        "from typing import AsyncIterator, Any\n",
        "from uuid import UUID, uuid4\n",
        "from openai_harmony import (\n",
        "    Author,\n",
        "    Content,\n",
        "    Message,\n",
        "    Role,\n",
        "    TextContent,\n",
        "    ToolNamespaceConfig,\n",
        ")\n",
        "\n",
        "def add_libs(code: str) -> str:\n",
        "    \"\"\"Add common math libraries to code.\"\"\"\n",
        "    return \"import math\\nimport numpy as np\\nimport sympy as sp\\nfrom sympy import *\\n\" + code\n",
        "\n",
        "def ensure_last_print(code: str) -> str:\n",
        "    \"\"\"Ensure the last expression is printed.\"\"\"\n",
        "    lines = code.strip().split(\"\\n\")\n",
        "    if lines and \"print(\" not in lines[-1] and \"import\" not in lines[-1]:\n",
        "        if \"#\" in lines[-1]:\n",
        "            lines[-1] = lines[-1].split(\"#\")[0]\n",
        "        lines[-1] = \"print(\" + lines[-1] + \")\"\n",
        "    return \"\\n\".join(lines)\n",
        "\n",
        "class LocalJupyterSession:\n",
        "    \"\"\"Stateful Jupyter kernel session for code execution.\"\"\"\n",
        "    # Class-level lock and port counter to avoid port conflicts\n",
        "    _port_lock = threading.Lock()\n",
        "    _next_port = 50000\n",
        "    \n",
        "    @classmethod\n",
        "    def _get_next_ports(cls, count: int = 5) -> list[int]:\n",
        "        \"\"\"Get next available ports for kernel connection.\"\"\"\n",
        "        with cls._port_lock:\n",
        "            ports = list(range(cls._next_port, cls._next_port + count))\n",
        "            cls._next_port += count\n",
        "        return ports\n",
        "    \n",
        "    def __init__(self, connection_file: str | None = None, *, timeout: float = 120.0):\n",
        "        try:\n",
        "            from jupyter_client import BlockingKernelClient, KernelManager\n",
        "        except ImportError as exc:\n",
        "            raise RuntimeError(\"jupyter_client package required\") from exc\n",
        "        \n",
        "        self._default_timeout = timeout\n",
        "        self._owns_kernel = False\n",
        "        self._client: BlockingKernelClient\n",
        "        self._km: KernelManager | None = None\n",
        "        \n",
        "        if connection_file:\n",
        "            from pathlib import Path\n",
        "            connection_path = Path(connection_file).expanduser()\n",
        "            if not connection_path.exists():\n",
        "                raise FileNotFoundError(f\"Connection file not found: {connection_path}\")\n",
        "            client = BlockingKernelClient()\n",
        "            client.load_connection_file(str(connection_path))\n",
        "            client.start_channels()\n",
        "            client.wait_for_ready(timeout=self._default_timeout)\n",
        "            self._client = client\n",
        "        else:\n",
        "            # Allocate unique ports to avoid conflicts when running multiple kernels\n",
        "            ports = self._get_next_ports(5)\n",
        "            km = KernelManager()\n",
        "            km.shell_port = ports[0]\n",
        "            km.iopub_port = ports[1]\n",
        "            km.stdin_port = ports[2]\n",
        "            km.hb_port = ports[3]\n",
        "            km.control_port = ports[4]\n",
        "            km.start_kernel()\n",
        "            client = km.blocking_client()\n",
        "            client.start_channels()\n",
        "            client.wait_for_ready(timeout=self._default_timeout)\n",
        "            self._client = client\n",
        "            self._km = km\n",
        "            self._owns_kernel = True\n",
        "    \n",
        "    def execute(self, code: str, *, timeout: float | None = None) -> str:\n",
        "        \"\"\"Execute code and return combined stdout/stderr.\"\"\"\n",
        "        client = self._client\n",
        "        effective_timeout = timeout or self._default_timeout\n",
        "        msg_id = client.execute(code, store_history=True, allow_stdin=False, stop_on_error=False)\n",
        "        \n",
        "        stdout_parts: list[str] = []\n",
        "        stderr_parts: list[str] = []\n",
        "        \n",
        "        while True:\n",
        "            try:\n",
        "                msg = client.get_iopub_msg(timeout=effective_timeout)\n",
        "            except queue.Empty as exc:\n",
        "                raise TimeoutError(\"Timed out waiting for kernel output.\") from exc\n",
        "            \n",
        "            if msg.get(\"parent_header\", {}).get(\"msg_id\") != msg_id:\n",
        "                continue\n",
        "            \n",
        "            msg_type = msg.get(\"msg_type\")\n",
        "            content = msg.get(\"content\", {})\n",
        "            \n",
        "            if msg_type == \"stream\":\n",
        "                text = content.get(\"text\", \"\")\n",
        "                if content.get(\"name\") == \"stdout\":\n",
        "                    stdout_parts.append(text)\n",
        "                else:\n",
        "                    stderr_parts.append(text)\n",
        "            elif msg_type == \"error\":\n",
        "                traceback_data = content.get(\"traceback\")\n",
        "                if traceback_data:\n",
        "                    stderr_parts.append(\"\\n\".join(traceback_data))\n",
        "                else:\n",
        "                    ename = content.get(\"ename\", \"\")\n",
        "                    evalue = content.get(\"evalue\", \"\")\n",
        "                    stderr_parts.append(f\"{ename}: {evalue}\".strip())\n",
        "            elif msg_type in {\"execute_result\", \"display_data\"}:\n",
        "                data = content.get(\"data\", {})\n",
        "                text = data.get(\"text/plain\")\n",
        "                if text:\n",
        "                    stdout_parts.append(text if text.endswith(\"\\n\") else f\"{text}\\n\")\n",
        "            elif msg_type == \"status\" and content.get(\"execution_state\") == \"idle\":\n",
        "                break\n",
        "        \n",
        "        # Drain shell channel\n",
        "        while True:\n",
        "            try:\n",
        "                reply = client.get_shell_msg(timeout=effective_timeout)\n",
        "            except queue.Empty as exc:\n",
        "                raise TimeoutError(\"Timed out waiting for execution reply.\") from exc\n",
        "            \n",
        "            if reply.get(\"parent_header\", {}).get(\"msg_id\") != msg_id:\n",
        "                continue\n",
        "            \n",
        "            reply_content = reply.get(\"content\", {})\n",
        "            if reply_content.get(\"status\") == \"error\":\n",
        "                traceback_data = reply_content.get(\"traceback\")\n",
        "                if traceback_data:\n",
        "                    stderr_parts.append(\"\\n\".join(traceback_data))\n",
        "                else:\n",
        "                    ename = reply_content.get(\"ename\", \"\")\n",
        "                    evalue = reply_content.get(\"evalue\", \"\")\n",
        "                    stderr_parts.append(f\"{ename}: {evalue}\".strip())\n",
        "            break\n",
        "        \n",
        "        stdout = \"\".join(stdout_parts)\n",
        "        stderr = \"\".join(stderr_parts)\n",
        "        if stderr:\n",
        "            stdout = f\"{stdout.rstrip()}\\n{stderr}\" if stdout else stderr\n",
        "        if not stdout.strip():\n",
        "            stdout = \"[WARN] No output. Use print() to see results.\"\n",
        "        return stdout\n",
        "    \n",
        "    def close(self):\n",
        "        import contextlib\n",
        "        with contextlib.suppress(Exception):\n",
        "            self._client.stop_channels()\n",
        "        if self._owns_kernel and self._km is not None:\n",
        "            with contextlib.suppress(Exception):\n",
        "                self._km.shutdown_kernel(now=True)\n",
        "    \n",
        "    def __del__(self):\n",
        "        self.close()\n",
        "\n",
        "class PythonTool:\n",
        "    \"\"\"Python execution tool using Jupyter kernel.\n",
        "\n",
        "    The kernel session is created lazily on first use and is replaced by\n",
        "    reset() when an execution times out.\n",
        "    \"\"\"\n",
        "    \n",
        "    def __init__(self, execution_backend: str | None = None, local_jupyter_timeout: float = 60.0):\n",
        "        \"\"\"Create the tool without starting a kernel.\n",
        "\n",
        "        Args:\n",
        "            execution_backend: Accepted for call-site compatibility; not used here.\n",
        "            local_jupyter_timeout: Timeout in seconds handed to LocalJupyterSession.\n",
        "        \"\"\"\n",
        "        self._local_jupyter_timeout = local_jupyter_timeout\n",
        "        # Re-entrant on purpose: process_sync_plus() calls reset() while it already\n",
        "        # holds this lock, which deadlocks with a plain threading.Lock.\n",
        "        self._execution_lock = threading.RLock()\n",
        "        self._jupyter_session: LocalJupyterSession | None = None\n",
        "        # Lazy initialization to avoid port conflicts during object creation\n",
        "        self._init_lock = threading.Lock()\n",
        "    \n",
        "    def _ensure_session(self):\n",
        "        \"\"\"Lazily initialize the Jupyter session.\"\"\"\n",
        "        if self._jupyter_session is None:\n",
        "            with self._init_lock:\n",
        "                # Re-check under the lock so only one thread starts a kernel.\n",
        "                if self._jupyter_session is None:\n",
        "                    self._jupyter_session = LocalJupyterSession(timeout=self._local_jupyter_timeout)\n",
        "    \n",
        "    @classmethod\n",
        "    def get_tool_name(cls) -> str:\n",
        "        \"\"\"Name under which the tool is exposed to the model.\"\"\"\n",
        "        return \"python\"\n",
        "    \n",
        "    @property\n",
        "    def name(self) -> str:\n",
        "        return self.get_tool_name()\n",
        "    \n",
        "    @property\n",
        "    def instruction(self) -> str:\n",
        "        return \"\"\"Use this tool to execute Python code. The code runs in a stateful Jupyter notebook. Use print() to see output.\"\"\"\n",
        "    \n",
        "    @property\n",
        "    def tool_config(self) -> ToolNamespaceConfig:\n",
        "        \"\"\"Harmony tool namespace built from the tool name and instruction text.\"\"\"\n",
        "        return ToolNamespaceConfig(\n",
        "            name=self.get_tool_name(),\n",
        "            description=self.instruction,\n",
        "            tools=[]\n",
        "        )\n",
        "    \n",
        "    def _make_response(self, output: str, channel: str | None = None) -> Message:\n",
        "        \"\"\"Wrap tool output in a tool-authored message addressed to the assistant.\"\"\"\n",
        "        content = TextContent(text=output)\n",
        "        author = Author(role=Role.TOOL, name=self.get_tool_name())\n",
        "        message = Message(author=author, content=[content]).with_recipient(\"assistant\")\n",
        "        if channel:\n",
        "            message = message.with_channel(channel)\n",
        "        return message\n",
        "    \n",
        "    def process_sync_plus(self, message: Message) -> list[Message]:\n",
        "        \"\"\"Execute code from message using Jupyter kernel.\n",
        "\n",
        "        The script is the text of the first content item of ``message``. On a\n",
        "        kernel timeout the kernel is replaced and an ``[ERROR] ...`` string is\n",
        "        returned as the tool output instead of raising.\n",
        "\n",
        "        Returns:\n",
        "            A one-element list holding the tool response message.\n",
        "        \"\"\"\n",
        "        script = message.content[0].text\n",
        "        \n",
        "        with self._execution_lock:\n",
        "            # Inside the lock so the session cannot be swapped between check and use.\n",
        "            self._ensure_session()\n",
        "            try:\n",
        "                output = self._jupyter_session.execute(script)\n",
        "            except TimeoutError as exc:\n",
        "                # Timeout -> reset kernel so it won't stay stuck for next calls\n",
        "                try:\n",
        "                    self.reset()\n",
        "                except Exception:\n",
        "                    # Best effort: if the restart fails the session stays None and\n",
        "                    # _ensure_session() tries again on the next call.\n",
        "                    pass\n",
        "                output = f\"[ERROR] {exc}\"\n",
        "        \n",
        "        return [self._make_response(output, channel=message.channel)]\n",
        "    \n",
        "    #====================\n",
        "    def reset(self):\n",
        "        \"\"\"Hard reset: kill current kernel and start a fresh one.\"\"\"\n",
        "        with self._execution_lock:\n",
        "            if self._jupyter_session is not None:\n",
        "                self._jupyter_session.close()\n",
        "                self._jupyter_session = None\n",
        "            self._ensure_session()\n",
        "    \n",
        "    def interrupt(self):\n",
        "        \"\"\"Try a soft interrupt if possible; fallback to hard reset.\"\"\"\n",
        "        self._ensure_session()\n",
        "        try:\n",
        "            km = getattr(self._jupyter_session, \"_km\", None)\n",
        "            if km is not None:\n",
        "                km.interrupt_kernel()\n",
        "                return\n",
        "        except Exception:\n",
        "            # Interrupt failed; fall through to the hard reset below.\n",
        "            pass\n",
        "        self.reset()\n",
        "    #====================\n",
        "    \n",
        "    def close(self):\n",
        "        \"\"\"Close the kernel session, if one was started.\"\"\"\n",
        "        # getattr keeps this safe when called from __del__ on a half-built instance.\n",
        "        session = getattr(self, \"_jupyter_session\", None)\n",
        "        if session is not None:\n",
        "            session.close()\n",
        "            self._jupyter_session = None\n",
        "    \n",
        "    def __del__(self):\n",
        "        self.close()"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "import warnings\n",
        "warnings.simplefilter('ignore')\n",
        "import re\n",
        "import math\n",
        "import threading\n",
        "import time\n",
        "import subprocess\n",
        "from collections import Counter\n",
        "from concurrent.futures import ThreadPoolExecutor, as_completed\n",
        "from typing import List\n",
        "import pandas as pd\n",
        "import polars as pl\n",
        "from openai import OpenAI\n",
        "from transformers import set_seed, AutoTokenizer\n",
        "from openai_harmony import (\n",
        "    HarmonyEncodingName,\n",
        "    load_harmony_encoding,\n",
        "    Conversation,\n",
        "    Message,\n",
        "    Role,\n",
        "    SystemContent,\n",
        "    ReasoningEffort,\n",
        "    RenderConversationConfig,\n",
        ")\n",
        "from local_python_tool import PythonTool\n",
        "\n",
        "# Load Harmony encoding for GPT-OSS\n",
        "encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)\n",
        "\n",
        "# Constants\n",
        "# SEED is applied globally via set_seed() and is the default seed of HarmonyTIRInferencer.\n",
        "SEED = 42\n",
        "set_seed(SEED)\n",
        "# Passed to the vLLM server as --max-model-len.\n",
        "MAX_LEN = 64 * 1024\n",
        "# When False, get_num_samples() always returns K and inference() does not update the time budget.\n",
        "USE_BUDGET = False  # forced to be false\n",
        "# Also the size of the PythonTool pool created below.\n",
        "K = 8  # Number of parallel samples\n",
        "\n",
        "# Inference parameters\n",
        "# Defaults for the sampling arguments of HarmonyTIRInferencer.__init__.\n",
        "TEMPERATURE = 1.0\n",
        "TOP_P = 1.0\n",
        "MIN_P = 0.02"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "def start_vllm_server() -> subprocess.Popen:\n",
        "    \"\"\"Launch the vLLM OpenAI-compatible API server as a background process.\n",
        "\n",
        "    The server runs in its own session; its stdout and stderr both go to\n",
        "    ``./vllm.log``.\n",
        "\n",
        "    Returns:\n",
        "        The ``subprocess.Popen`` handle of the server process.\n",
        "    \"\"\"\n",
        "    # Flag -> value, in the order they are placed on the command line.\n",
        "    server_options = {\n",
        "        \"--model\": \"/kaggle/input/gpt-oss-120b/transformers/default/1\",\n",
        "        \"--served-model-name\": \"gpt-oss\",\n",
        "        \"--tensor-parallel-size\": \"1\",\n",
        "        \"--max-num-seqs\": \"64\",\n",
        "        \"--gpu-memory-utilization\": \"0.96\",\n",
        "        \"--host\": \"0.0.0.0\",\n",
        "        \"--port\": \"8000\",\n",
        "        \"--dtype\": \"auto\",\n",
        "        \"--max-model-len\": str(MAX_LEN),\n",
        "        \"--stream-interval\": \"20\",\n",
        "    }\n",
        "    argv = [\"python\", \"-m\", \"vllm.entrypoints.openai.api_server\"]\n",
        "    for flag, value in server_options.items():\n",
        "        argv.extend((flag, value))\n",
        "\n",
        "    # The child keeps its own copy of the log handle, so ours can be closed right away.\n",
        "    with open(\"./vllm.log\", \"w\") as log_handle:\n",
        "        server = subprocess.Popen(\n",
        "            argv,\n",
        "            stdout=log_handle,\n",
        "            stderr=subprocess.STDOUT,\n",
        "            start_new_session=True,\n",
        "        )\n",
        "    print(\"vLLM server started. Logs: ./vllm.log\")\n",
        "    return server\n",
        "\n",
        "# Popen returns immediately, so the server starts up while the code below runs;\n",
        "# HarmonyTIRInferencer.wait_server() polls until it answers.\n",
        "vllm_process = start_vllm_server()\n",
        "\n",
        "# TIR Prompts\n",
        "TIR_PROMPT_SIMPLE = \"\"\"Please reason step by step and use the python tool to solve the math problem. Finally, Return only the verified final answer in \\\\boxed{}, where the answer is an integer in [0, 99999]. Never guess.\"\"\"\n",
        "\n",
        "TIR_PROMPT_ENHANCED = \"\"\"Please reason step by step and use the python tool to solve the math problem. For extremely large numbers, find patterns from small cases instead of direct computation. Finally, Return only the verified final answer in \\\\boxed{}, where the answer is an integer in [0, 99999]. Never guess.\"\"\"\n",
        "\n",
        "# Only the simple prompt is active; TIR_PROMPT_ENHANCED is defined but not part of the list.\n",
        "TIR_PROMPTS = [TIR_PROMPT_SIMPLE]\n",
        "\n",
        "# Create Python tool pool\n",
        "# K tools, each with its Jupyter kernel started eagerly via _ensure_session().\n",
        "import queue\n",
        "python_pool = queue.Queue(maxsize=K)\n",
        "for _ in range(K):\n",
        "    t = PythonTool(execution_backend=\"jupyter\", local_jupyter_timeout=60.0)\n",
        "    t._ensure_session()\n",
        "    python_pool.put(t)\n",
        "print(\"Pool created!\")\n",
        "\n",
        "import gc\n",
        "# Code snippet stored as text (it is only defined here, not executed): it deletes every global\n",
        "# whose name is not listed in `_keep` and does not start with an underscore, then runs gc.collect().\n",
        "CLEANUP_CODE = r\"\"\"\n",
        "import gc\n",
        "_keep = {\n",
        "    \"__builtins__\",\n",
        "    \"__name__\",\n",
        "    \"__doc__\",\n",
        "    \"__package__\",\n",
        "    \"__loader__\",\n",
        "    \"__spec__\",\n",
        "    \"np\",\n",
        "    \"sp\",\n",
        "    \"math\",\n",
        "}\n",
        "g = globals()\n",
        "for k in list(g.keys()):\n",
        "    if k in _keep or k.startswith(\"_\"):\n",
        "        continue\n",
        "    try:\n",
        "        del g[k]\n",
        "    except Exception:\n",
        "        pass\n",
        "gc.collect()\n",
        "\"\"\"\n",
        "# NOTE(review): looks like a leftover debugging marker - confirm whether it is still wanted.\n",
        "print(\"yes\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "class HarmonyTIRInferencer:\n",
        "    \"\"\"Inferencer using Harmony protocol with Tool-Integrated Reasoning (TIR).\"\"\"\n",
        "    \n",
        "    def __init__(\n",
        "        self,\n",
        "        model_path: str,\n",
        "        max_model_len: int = MAX_LEN,\n",
        "        temperature: float = TEMPERATURE,\n",
        "        top_p: float = TOP_P,\n",
        "        min_p: float = MIN_P,\n",
        "        seed: int = SEED,\n",
        "        k: int = K,\n",
        "        use_budget: bool = USE_BUDGET,\n",
        "        max_iter: int = 100,\n",
        "    ):\n",
        "        \"\"\"Store the sampling configuration and create the client for the local vLLM server.\n",
        "\n",
        "        Args:\n",
        "            model_path: Directory handed to ``AutoTokenizer.from_pretrained``.\n",
        "            max_model_len: Stored as ``self.max_model_len``.\n",
        "            temperature: Stored as ``self.temperature``.\n",
        "            top_p: Stored as ``self.top_p``.\n",
        "            min_p: Stored as ``self.min_p``.\n",
        "            seed: Stored as ``self.seed``.\n",
        "            k: Upper bound on the number of samples per problem (see ``get_num_samples``).\n",
        "            use_budget: Whether the time budget drives the sample count and is\n",
        "                updated after each problem.\n",
        "            max_iter: Stored as ``self.max_iter``.\n",
        "        \"\"\"\n",
        "        self.model_path = model_path\n",
        "        # Name the server was started with (--served-model-name).\n",
        "        self.model = \"gpt-oss\"\n",
        "        self.max_model_len = max_model_len\n",
        "        self.temperature = temperature\n",
        "        self.top_p = top_p\n",
        "        self.min_p = min_p\n",
        "        self.seed = seed\n",
        "        self.k = k\n",
        "        self.use_budget = use_budget\n",
        "        self.max_iter = max_iter\n",
        "        self.base_budget = 60 * 5.5  # 5.5 minutes base per problem\n",
        "        self.budget = 370  # initial budget in seconds (~6.1 min for first problem)\n",
        "        # Set per problem by inference().\n",
        "        self.deadline = None\n",
        "        \n",
        "        # Initialize the OpenAI-compatible client pointing to local vLLM server\n",
        "        self.client = OpenAI(\n",
        "            base_url=\"http://127.0.0.1:8000/v1\",\n",
        "            api_key=\"sk-local\",\n",
        "            timeout=360,\n",
        "        )\n",
        "        self.stop_token_ids = encoding.stop_tokens_for_assistant_actions()\n",
        "        # NOTE(review): trust_remote_code=True lets the model directory supply Python code that is\n",
        "        # executed on load; keep it only for trusted model files.\n",
        "        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
        "    \n",
        "    def wait_server(self):\n",
        "        \"\"\"Poll the server once per second until it answers a model-list request.\n",
        "\n",
        "        Prints the model list on success.\n",
        "\n",
        "        Raises:\n",
        "            RuntimeError: If none of the 900 attempts (15 * 60) succeeds.\n",
        "        \"\"\"\n",
        "        attempts_left = 15 * 60\n",
        "        while attempts_left > 0:\n",
        "            attempts_left -= 1\n",
        "            time.sleep(1)\n",
        "            try:\n",
        "                # Listing the models doubles as the readiness probe.\n",
        "                available_models = self.client.models.list()\n",
        "                print(available_models)\n",
        "            except Exception:\n",
        "                pass\n",
        "            else:\n",
        "                return\n",
        "        raise RuntimeError(\"vLLM server failed to start\")\n",
        "    \n",
        "    def get_num_samples(self) -> int:\n",
        "        \"\"\"Determine number of parallel samples to generate based on remaining budget.\"\"\"\n",
        "        if not self.use_budget:\n",
        "            print(f\"Budget disabled -> N: {self.k}\")\n",
        "            return self.k\n",
        "        estimated = (self.budget - 190) / 90\n",
        "        ret = min(self.k, math.floor(estimated))\n",
        "        print(f\"Budget: {self.budget} -> N: {ret}\")\n",
        "        return max(4, ret)\n",
        "    \n",
        "    def apply_chat_template(self, prompt: str, python_tool: PythonTool) -> list[Message]:\n",
        "        \"\"\"Build the two-message Harmony conversation (system, then user) for one prompt.\n",
        "\n",
        "        The system message requests high reasoning effort and advertises the\n",
        "        tool configuration of ``python_tool``.\n",
        "        \"\"\"\n",
        "        system_content = SystemContent.new().with_reasoning_effort(reasoning_effort=ReasoningEffort.HIGH)\n",
        "        system_content = system_content.with_tools(python_tool.tool_config)\n",
        "        system_message = Message.from_role_and_content(Role.SYSTEM, system_content)\n",
        "        user_message = Message.from_role_and_content(Role.USER, prompt)\n",
        "        return [system_message, user_message]\n",
        "    \n",
        "    def format_prompts(self, problem: str) -> list[str]:\n",
        "        \"\"\"Create the per-sample prompts for one problem.\n",
        "\n",
        "        One prompt is produced per sample returned by ``get_num_samples()``; the\n",
        "        instruction suffix cycles through ``TIR_PROMPTS``.\n",
        "        \"\"\"\n",
        "        num_samples = self.get_num_samples()\n",
        "        # Alternate between the prompt templates for diversity\n",
        "        return [\n",
        "            problem + \"\\n\\n\" + TIR_PROMPTS[i % len(TIR_PROMPTS)]\n",
        "            for i in range(num_samples)\n",
        "        ]\n",
        "    \n",
        "    def inference(self, problem: str, deadline: float) -> tuple[int, float]:\n",
        "        \"\"\"Run the multi-sample inference for a single problem and return the final answer and saved time.\"\"\"\n",
        "        self.deadline = deadline\n",
        "        start_time = time.time()\n",
        "        prompts = self.format_prompts(problem)\n",
        "        responses = self._inference_parallel(prompts)\n",
        "        duration = time.time() - start_time\n",
        "        saved_time = max(0.0, deadline - time.time())\n",
        "        \n",
        "        print(f\"[Budget]: {(deadline - start_time):.2f}s\")\n",
        "        print(f\"[inference] Took {duration:.2f}s\")\n",
        "        print(f\"[Saved time]: {saved_time:.2f}s\")\n",
        "        \n",
        "        if self.use_budget:\n",
        "            budget_left = max(0, self.budget - duration)\n",
        "            self.budget = self.base_budget + budget_left\n",
        "            print(f\"[inference] Updated budget: {self.budget:.2f}s\")\n",
        "        \n",
        "        return self.parse_responses(responses), saved_time\n",
        "    \n",
        "    def extract_boxed_text(self, text: str) -> int | None:\n",
        "        \"\"\"Extract a numeric answer from '\\\\boxed{}' or 'final answer is ...' in the text.\"\"\"\n",
        "        # Pattern for \\boxed{NUMBER}\n",
        "        pattern = r'oxed{(.*?)}'\n",
        "        matches = re.findall(pattern, str(text))\n",
        "        if matches:\n",
        "            for match in reversed(matches):\n",
        "                if match:\n",
        "                    try:\n",
        "                        # Remove commas/spaces and parse as number\n",
        "                        clean_match = match.strip().replace(',', '').replace(' ', '')\n",
        "                        val = int(float(clean_match[:20]))\n",
        "                        if 0 <= val <= 99999:\n",
        "                            return val\n",
        "                    except Exception:\n",
        "                        pass\n",
        "        \n",
        "        # Pattern for \"final answer is X\" or \"Final Answer: X\"\n",
        "        pattern = r'(?i)final\\s+answer\\s*(?:is|:)?\\s*(\\d+)'\n",
        "        matches = re.findall(pattern, text)\n",
        "        if matches:\n",
        "            for match in reversed(matches):\n",
        "                if match:\n",
        "                    try:\n",
        "                        val = int(match)\n",
        "                        if 0 <= val <= 99999:\n",
        "                            return val\n",
        "                    except Exception:\n",
        "                        pass\n",
        "        \n",
        "        return None\n",
        "    \n",
        "    def parse_responses(self, responses: list[str]) -> int:\n",
        "        \"\"\"Decide on the final answer from all responses by majority vote (with tie-break).\"\"\"\n",
        "        answers = [self.extract_boxed_text(r) for r in responses]\n",
        "        # Filter out any None values\n",
        "        valid_answers = [a for a in answers if a is not None]\n",
        "        \n",
        "        if not valid_answers:\n",
        "            print(\"No valid answers found\")\n",
        "            return 8687\n",
        "        \n",
        "        counter = Counter(valid_answers)\n",
        "        print(f\"Answers: {counter}\")\n",
        "        \n",
        "        # Majority vote: pick the most common answer; break ties by choosing the largest answer\n",
        "        most_common_list = counter.most_common(2)\n",
        "        if len(most_common_list) > 1 and most_common_list[0][1] == most_common_list[1][1]:\n",
        "            tied_answers = [ans for ans, cnt in counter.items() if cnt == most_common_list[0][1]]\n",
        "            answer = max(tied_answers)\n",
        "        else:\n",
        "            answer = most_common_list[0][0]\n",
        "        \n",
        "        return answer"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "    def single_generate_tir(self, prompt: str, stop_event: threading.Event, seed_offset: int = 0) -> str:\n",
        "        \"\"\"Generate a single reasoning trace (with tool use) for the prompt.\"\"\"\n",
        "        python_tool = None\n",
        "        borrowed_from_pool = False\n",
        "        \n",
        "        def _get_pool():\n",
        "            return getattr(self, \"python_pool\", None) or globals().get(\"python_pool\", None)\n",
        "        \n",
        "        def _recreate_tool(close_old: bool = True):\n",
        "            nonlocal python_tool\n",
        "            if close_old and python_tool is not None:\n",
        "                try:\n",
        "                    python_tool.close()\n",
        "                except Exception:\n",
        "                    pass\n",
        "            python_tool = PythonTool(execution_backend=\"jupyter\", local_jupyter_timeout=60.0)\n",
        "            python_tool._ensure_session()\n",
        "        \n",
        "        def _time_left():\n",
        "            if not self.deadline:\n",
        "                return None\n",
        "            return self.deadline - time.time()\n",
        "        \n",
        "        try:\n",
        "            # Borrow PythonTool from pool if exists\n",
        "            pool = _get_pool()\n",
        "            if isinstance(pool, queue.Queue):\n",
        "                print(\"pool.qsize=\", pool.qsize())\n",
        "                python_tool = pool.get(timeout=5)\n",
        "                borrowed_from_pool = True\n",
        "            else:\n",
        "                python_tool = PythonTool(execution_backend=\"jupyter\")\n",
        "            \n",
        "            # Ensure session exists\n",
        "            try:\n",
        "                python_tool._ensure_session()\n",
        "            except Exception as e:\n",
        "                print(f\"‚ö†Ô∏è ensure_session failed: {e} -> recreate kernel now\")\n",
        "                _recreate_tool(close_old=True)\n",
        "            \n",
        "            # Cleanup at start\n",
        "            try:\n",
        "                python_tool._jupyter_session.execute(CLEANUP_CODE, timeout=5)\n",
        "            except Exception as e:\n",
        "                print(f\"‚ö†Ô∏è Cleanup failed: {e} -> recreate kernel now\")\n",
        "                _recreate_tool(close_old=True)\n",
        "            \n",
        "            messages = self.apply_chat_template(prompt, python_tool)\n",
        "            final_answer_found = \"\"\n",
        "            \n",
        "            for iteration in range(self.max_iter):\n",
        "                # Termination checks\n",
        "                if self.deadline and time.time() >= self.deadline:\n",
        "                    print(\"‚è∞ Deadline reached\")\n",
        "                    break\n",
        "                if final_answer_found:\n",
        "                    print(\"‚úÖ Final answer found at iteration:\", iteration)\n",
        "                    break\n",
        "                if stop_event and stop_event.is_set():\n",
        "                    break\n",
        "                \n",
        "                # Render conversation to token IDs\n",
        "                prompt_ids = encoding.render_conversation_for_completion(\n",
        "                    Conversation.from_messages(messages),\n",
        "                    Role.ASSISTANT\n",
        "                )\n",
        "                max_tokens = self.max_model_len - len(prompt_ids)\n",
        "                if max_tokens < 1:\n",
        "                    print(\"‚ö†Ô∏è Context full!\")\n",
        "                    break\n",
        "                \n",
        "                token_buffer = []\n",
        "                token_buffer_str = \"\"\n",
        "                breaking = False\n",
        "                \n",
        "                # Stream tokens from the model\n",
        "                stream = self.client.completions.create(\n",
        "                    model=self.model,\n",
        "                    prompt=prompt_ids,\n",
        "                    max_tokens=max_tokens,\n",
        "                    temperature=self.temperature,\n",
        "                    top_p=self.top_p,\n",
        "                    seed=self.seed + seed_offset,\n",
        "                    stream=True,\n",
        "                    extra_body=dict(\n",
        "                        min_p=self.min_p,\n",
        "                        stop_token_ids=self.stop_token_ids,\n",
        "                        return_token_ids=True,\n",
        "                    ),\n",
        "                    timeout=360,\n",
        "                )\n",
        "                \n",
        "                try:\n",
        "                    for chunk in stream:\n",
        "                        if stop_event and stop_event.is_set():\n",
        "                            breaking = True\n",
        "                            break\n",
        "                        \n",
        "                        token_chunk = chunk.choices[0].token_ids\n",
        "                        text_chunk = chunk.choices[0].text\n",
        "                        \n",
        "                        if token_chunk:\n",
        "                            token_buffer.extend(token_chunk)\n",
        "                            token_buffer_str += text_chunk\n",
        "                        \n",
        "                        if self.deadline and time.time() >= self.deadline:\n",
        "                            breaking = True\n",
        "                            break\n",
        "                        \n",
        "                        if len(token_buffer) > 60_000:\n",
        "                            print(\"‚ö†Ô∏è Token limit\")\n",
        "                            breaking = True\n",
        "                            break\n",
        "                        \n",
        "                        # Early stop if boxed detected\n",
        "                        if \"}\" in text_chunk and self.extract_boxed_text(token_buffer_str) is not None:\n",
        "                            final_answer_found = token_buffer_str\n",
        "                            breaking = True\n",
        "                            break\n",
        "                finally:\n",
        "                    try:\n",
        "                        stream.close()\n",
        "                    except Exception:\n",
        "                        pass\n",
        "                \n",
        "                if breaking:\n",
        "                    break\n",
        "                \n",
        "                # Parse any full assistant messages generated\n",
        "                if token_buffer:\n",
        "                    new_messages = encoding.parse_messages_from_completion_tokens(\n",
        "                        token_buffer, Role.ASSISTANT\n",
        "                    )\n",
        "                    messages.extend(new_messages)\n",
        "                \n",
        "                last_message = messages[-1]\n",
        "                if last_message.channel == \"final\" or (token_buffer and token_buffer[-1] == 200002):\n",
        "                    break\n",
        "                \n",
        "                # Python tool execution (deadline-aware)\n",
        "                if last_message.recipient == \"python\":\n",
        "                    tl = _time_left()\n",
        "                    if tl is not None and tl <= 0:\n",
        "                        print(\"‚è∞ Deadline reached before python call\")\n",
        "                        break\n",
        "                    \n",
        "                    if tl is None:\n",
        "                        tool_timeout = None\n",
        "                    else:\n",
        "                        tool_timeout = max(1.0, min(20.0, tl - 1.0))\n",
        "                    \n",
        "                    print(f\"üêç Executing Python code... (timeout={tool_timeout})\")\n",
        "                    try:\n",
        "                        python_tool._ensure_session()\n",
        "                        out = python_tool._jupyter_session.execute(last_message.content[0].text, timeout=tool_timeout)\n",
        "                        response_msgs = [python_tool._make_response(out, channel=last_message.channel)]\n",
        "                        messages.extend(response_msgs)\n",
        "                    except TimeoutError as e:\n",
        "                        print(f\"‚ö†Ô∏è Python timed out: {e} -> reset/recreate kernel now\")\n",
        "                        try:\n",
        "                            python_tool.reset()\n",
        "                        except Exception:\n",
        "                            _recreate_tool(close_old=True)\n",
        "                        break\n",
        "                    except Exception as e:\n",
        "                        print(f\"‚ö†Ô∏è Python tool execution failed: {e} -> recreate kernel now\")\n",
        "                        _recreate_tool(close_old=True)\n",
        "                        break\n",
        "                \n",
        "                # If a final boxed answer was found during streaming, return that text\n",
        "                if final_answer_found:\n",
        "                    return final_answer_found\n",
        "            \n",
        "            # Otherwise, return the entire conversation as text\n",
        "            return encoding.decode_utf8(\n",
        "                encoding.render_conversation_for_training(\n",
        "                    Conversation.from_messages(messages),\n",
        "                    RenderConversationConfig(auto_drop_analysis=False)\n",
        "                )\n",
        "            )\n",
        "        except Exception as e:\n",
        "            print(f\"Error in generation: {e}\")\n",
        "            return \"\"\n",
        "        finally:\n",
        "            # Return to pool OR close\n",
        "            if python_tool is not None:\n",
        "                if borrowed_from_pool:\n",
        "                    try:\n",
        "                        pool = _get_pool()\n",
        "                        if isinstance(pool, queue.Queue):\n",
        "                            pool.put(python_tool)\n",
        "                        else:\n",
        "                            python_tool.close()\n",
        "                    except Exception:\n",
        "                        try:\n",
        "                            python_tool.close()\n",
        "                        except Exception:\n",
        "                            pass\n",
        "                else:\n",
        "                    try:\n",
        "                        python_tool.close()\n",
        "                    except Exception:\n",
        "                        pass"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "    def _inference_parallel(self, prompts: list[str]) -> list[str]:\n",
        "        \"\"\"Run multiple single_generate_tir in parallel and return all raw responses.\"\"\"\n",
        "        from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED\n",
        "        \n",
        "        stop_event = threading.Event()\n",
        "        answers_collected: List[int] = []\n",
        "        raw_responses = [\"\"] * len(prompts)\n",
        "        majority_threshold = len(prompts) / 2\n",
        "        \n",
        "        def time_left() -> float:\n",
        "            dl = getattr(self, \"deadline\", None)\n",
        "            return float(\"inf\") if dl is None else (dl - time.time())\n",
        "        \n",
        "        print(f\"üöÄ Sampling {len(prompts)} times (threshold: > {majority_threshold})...\")\n",
        "        executor = ThreadPoolExecutor(max_workers=min(self.k, len(prompts)))\n",
        "        futures = []\n",
        "        fut2idx = {}\n",
        "        \n",
        "        try:\n",
        "            for i, p in enumerate(prompts):\n",
        "                fut = executor.submit(self.single_generate_tir, p, stop_event, i)\n",
        "                futures.append(fut)\n",
        "                fut2idx[fut] = i\n",
        "            \n",
        "            pending = set(futures)\n",
        "            majority_reached = False\n",
        "            \n",
        "            while pending:\n",
        "                if time_left() <= 0:\n",
        "                    stop_event.set()\n",
        "                    break\n",
        "                \n",
        "                timeout = min(0.5, max(0.0, time_left()))\n",
        "                done, pending = wait(pending, timeout=timeout, return_when=FIRST_COMPLETED)\n",
        "                \n",
        "                if not done:\n",
        "                    continue\n",
        "                \n",
        "                for fut in done:\n",
        "                    idx = fut2idx[fut]\n",
        "                    try:\n",
        "                        result_text = fut.result()\n",
        "                    except Exception as e:\n",
        "                        print(f\"Task exception: {e}\")\n",
        "                        result_text = \"\"\n",
        "                    \n",
        "                    raw_responses[idx] = result_text\n",
        "                    ans = self.extract_boxed_text(result_text)\n",
        "                    if ans is not None:\n",
        "                        answers_collected.append(ans)\n",
        "                \n",
        "                counts = Counter(answers_collected)\n",
        "                if len(counts) > 0:\n",
        "                    most_common_ans, count = counts.most_common(1)[0]\n",
        "                    if count > majority_threshold:\n",
        "                        print(f\"üéØ Majority reached! {most_common_ans} appeared {count} times\")\n",
        "                        stop_event.set()\n",
        "                        majority_reached = True\n",
        "                        break\n",
        "                \n",
        "                if majority_reached:\n",
        "                    break\n",
        "            \n",
        "            if majority_reached and time_left() > 0:\n",
        "                grace = min(2.0, max(0.0, time_left()))\n",
        "                wait(pending, timeout=grace)\n",
        "        finally:\n",
        "            executor.shutdown(wait=False, cancel_futures=True)\n",
        "        \n",
        "        return raw_responses"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Initialize the inferencer\n",
        "inferencer = HarmonyTIRInferencer(\n",
        "    \"/kaggle/input/gpt-oss-120b/transformers/default/1\",\n",
        "    use_budget=USE_BUDGET,\n",
        "    k=K,\n",
        ")\n",
        "\n",
        "# Wait for the vLLM server to be ready\n",
        "inferencer.wait_server()\n",
        "\n",
        "init_time = time.time()\n",
        "final_cutoff_time = init_time + 3600  # 1 hour from start\n",
        "cutoff_times = [int(x) for x in np.linspace(final_cutoff_time, init_time, 50 + 1)]\n",
        "cutoff_times.pop()  # remove the last element to get exactly 50 cutoff deadlines"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "def predict(id_: pl.DataFrame, question: pl.DataFrame) -> pl.DataFrame | pd.DataFrame:\n",
        "    \"\"\"Make a prediction.\"\"\"\n",
        "    global correct_count, total_count, predictions, cutoff_times\n",
        "    \n",
        "    question_id = id_.item(0)\n",
        "    question_text = question.item(0)\n",
        "    \n",
        "    print(\"------\")\n",
        "    print(f\"ID: {question_id}\")\n",
        "    print(f\"Question: {question_text[:200]}...\")\n",
        "    \n",
        "    current_deadline = cutoff_times[-1]\n",
        "    answer, saved_time = inferencer.inference(question_text, deadline=current_deadline)\n",
        "    cutoff_times.pop()\n",
        "    \n",
        "    # Dynamically recompute cutoff_times and distribute saved_time\n",
        "    if len(cutoff_times) > 0:\n",
        "        now = time.time()\n",
        "        num_remaining = len(cutoff_times)\n",
        "        base_times = np.linspace(final_cutoff_time, now, num_remaining + 1)\n",
        "        base_times = base_times[:-1]  # keep only N timestamps\n",
        "        extra = saved_time / num_remaining\n",
        "        cutoff_times = [int(t + extra) for t in base_times]\n",
        "    \n",
        "    # Store prediction\n",
        "    predictions[question_id] = answer\n",
        "    \n",
        "    # Check accuracy if ground truth available\n",
        "    total_count += 1\n",
        "    if question_id in ground_truth:\n",
        "        gt = ground_truth[question_id]\n",
        "        is_correct = (answer == gt)\n",
        "        if is_correct:\n",
        "            correct_count += 1\n",
        "        status = \"‚úÖ\" if is_correct else \"‚ùå\"\n",
        "        print(f\"Answer: {answer} | Ground Truth: {gt} | {status}\")\n",
        "        print(f\"üìä Running Accuracy: {correct_count}/{total_count} ({100*correct_count/total_count:.1f}%)\")\n",
        "    else:\n",
        "        print(f\"Answer: {answer}\")\n",
        "    \n",
        "    print(\"------\\n\")\n",
        "    return pl.DataFrame({\"id\": question_id, \"answer\": answer})"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "# Load reference data and keep ground truth for accuracy calculation\n",
        "df = pd.read_csv(\n",
        "    \"/kaggle/input/ai-mathematical-olympiad-progress-prize-3/reference.csv\"\n",
        ")\n",
        "\n",
        "# Store ground truth answers for accuracy calculation (only in local mode)\n",
        "ground_truth = dict(zip(df[\"id\"], df[\"answer\"])) if \"answer\" in df.columns else {}\n",
        "\n",
        "# Create input file without answers\n",
        "df.drop(\"answer\", axis=1, errors=\"ignore\").to_csv(\"reference.csv\", index=False)\n",
        "\n",
        "# Track predictions for accuracy calculation\n",
        "predictions = {}\n",
        "correct_count = 0\n",
        "total_count = 0\n",
        "\n",
        "import kaggle_evaluation.aimo_3_inference_server\n",
        "inference_server = kaggle_evaluation.aimo_3_inference_server.AIMO3InferenceServer(predict)\n",
        "\n",
        "if os.getenv(\"KAGGLE_IS_COMPETITION_RERUN\"):\n",
        "    inference_server.serve()\n",
        "else:\n",
        "    inference_server.run_local_gateway((\"reference.csv\",))\n",
        "\n",
        "# Print final accuracy summary\n",
        "if ground_truth and total_count > 0:\n",
        "    print(\"\\n\" + \"=\" * 50)\n",
        "    print(\"üìä FINAL ACCURACY SUMMARY\")\n",
        "    print(\"=\" * 50)\n",
        "    print(f\"Correct: {correct_count}/{total_count}\")\n",
        "    print(f\"Accuracy: {100*correct_count/total_count:.1f}%\")\n",
        "    print(\"=\" * 50)\n",
        "    \n",
        "    # Show details\n",
        "    print(\"\\nDetails:\")\n",
        "    for qid, pred in predictions.items():\n",
        "        if qid in ground_truth:\n",
        "            gt = ground_truth[qid]\n",
        "            status = \"‚úÖ\" if pred == gt else \"‚ùå\"\n",
        "            print(f\"  {qid}: pred={pred}, gt={gt} {status}\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {},
      "source": [
        "if train is not None and 'resp' in train.columns and len(continuous_features) > 0:\n",
        "    print(\"Target looks like regression\")\n",
        "    \n",
        "    # Prepare data for regression\n",
        "    X_reg = train[continuous_features].fillna(0)\n",
        "    y_reg = train['resp']\n",
        "    \n",
        "    # Use weights if available\n",
        "    sample_weight_reg = None\n",
        "    if 'weight' in train.columns:\n",
        "        sample_weight_reg = train['weight'].values\n",
        "    \n",
        "    # Split data\n",
        "    X_train_reg, X_test_reg, y_train_reg, y_test_reg, sw_train_reg, sw_test_reg = train_test_split(\n",
        "        X_reg, y_reg, sample_weight_reg, test_size=0.2, random_state=42\n",
        "    )\n",
        "    \n",
        "    # Scale features\n",
        "    scaler_reg = StandardScaler()\n",
        "    X_train_reg_scaled = scaler_reg.fit_transform(X_train_reg)\n",
        "    X_test_reg_scaled = scaler_reg.transform(X_test_reg)\n",
        "    \n",
        "    # Train linear regression\n",
        "    lr = LinearRegression()\n",
        "    lr.fit(X_train_reg_scaled, y_train_reg, sample_weight=sw_train_reg)\n",
        "    \n",
        "    # Predictions\n",
        "    y_pred_reg = lr.predict(X_test_reg_scaled)\n",
        "    \n",
        "    # Evaluate\n",
        "    mse = mean_squared_error(y_test_reg, y_pred_reg, sample_weight=sw_test_reg)\n",
        "    r2 = r2_score(y_test_reg, y_pred_reg, sample_weight=sw_test_reg)\n",
        "    \n",
        "    print(f\"Mean Squared Error: {mse:.6f}\")\n",
        "    print(f\"R¬≤ Score: {r2:.4f}\")\n",
        "    print(f\"RMSE: {np.sqrt(mse):.6f}\")\n",
        "    \n",
        "    # Regression visualization\n",
        "    fig, axes = plt.subplots(1, 2, figsize=(15, 5))\n",
        "    fig.suptitle('Regression Analysis Results', fontsize=16, fontweight='bold')\n",
        "    \n",
        "    # Predicted vs Actual\n",
        "    axes[0].scatter(y_test_reg, y_pred_reg, alpha=0.5, s=1)\n",
        "    axes[0].plot([y_test_reg.min(), y_test_reg.max()], \n",
        "                 [y_test_reg.min(), y_test_reg.max()], \n",
        "                 'r--', lw=2, label='Perfect Prediction')\n",
        "    axes[0].set_xlabel('Actual Response')\n",
        "    axes[0].set_ylabel('Predicted Response')\n",
        "    axes[0].set_title(f'Predicted vs Actual (R¬≤ = {r2:.4f})')\n",
        "    axes[0].legend()\n",
        "    axes[0].grid(True, alpha=0.3)\n",
        "    \n",
        "    # Residuals\n",
        "    residuals = y_test_reg - y_pred_reg\n",
        "    axes[1].scatter(y_pred_reg, residuals, alpha=0.5, s=1)\n",
        "    axes[1].axhline(y=0, color='r', linestyle='--', linewidth=2)\n",
        "    axes[1].set_xlabel('Predicted Response')\n",
        "    axes[1].set_ylabel('Residuals')\n",
        "    axes[1].set_title('Residual Plot')\n",
        "    axes[1].grid(True, alpha=0.3)\n",
        "    \n",
        "    plt.tight_layout()\n",
        "    plt.show()"
      ],
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.8.0"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}

NameError: name 'null' is not defined