In [1]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LLM Failure-Oriented Evaluation Analysis\n",
    "\n",
    "This notebook analyzes the results from our failure-oriented evaluation suite."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('..')\n",
    "\n",
    "import json\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from glob import glob\n",
    "import os\n",
    "\n",
    "# Set style\n",
    "sns.set_style('whitegrid')\n",
    "plt.rcParams['figure.figsize'] = (12, 6)\n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Load Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many of the most recently modified result files to analyse\n",
    "MAX_RESULT_FILES = 5\n",
    "\n",
    "# Collect every metrics file, newest first by modification time\n",
    "results_files = sorted(glob('../results/metrics/*.json'), key=os.path.getmtime, reverse=True)\n",
    "\n",
    "print(f\"Found {len(results_files)} result files\")\n",
    "\n",
    "# Parse the latest runs. Files are opened explicitly as UTF-8 so parsing\n",
    "# does not depend on the platform's default text encoding.\n",
    "all_results = []\n",
    "for path in results_files[:MAX_RESULT_FILES]:\n",
    "    with open(path, 'r', encoding='utf-8') as f:\n",
    "        all_results.append(json.load(f))\n",
    "\n",
    "if not all_results:\n",
    "    # Later cells index summary columns that only exist when at least one result was loaded\n",
    "    print(\"WARNING: no result files found in ../results/metrics/\")\n",
    "\n",
    "print(\"\\nLoaded results for models:\")\n",
    "for r in all_results:\n",
    "    print(f\"  - {r['model']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Summary Statistics"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten each result's nested summary into one table row per loaded result.\n",
    "# Each entry is (column label, summary section, metric key within that section).\n",
    "summary_columns = [\n",
    "    ('Avg Consistency', 'contradictory_contexts', 'avg_consistency_opposing_contexts'),\n",
    "    ('Hedging Rate', 'contradictory_contexts', 'hedging_rate'),\n",
    "    ('Avg Variance', 'response_variance', 'avg_variance'),\n",
    "    ('Avg Similarity', 'response_variance', 'avg_mean_similarity'),\n",
    "    ('Flip-Flop Rate', 'response_variance', 'flip_flop_rate'),\n",
    "]\n",
    "\n",
    "summary_data = []\n",
    "for result in all_results:\n",
    "    # Keep only the text after the final '/' of the model identifier\n",
    "    row = {'Model': result['model'].split('/')[-1]}\n",
    "    summary = result['summary']\n",
    "    for label, section, key in summary_columns:\n",
    "        row[label] = summary[section][key]\n",
    "    summary_data.append(row)\n",
    "\n",
    "summary_df = pd.DataFrame(summary_data)\n",
    "summary_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Visualization: Contradictory Contexts and Response Variance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each panel contrasts two summary metrics per model: (title, left bars, right bars)\n",
    "panels = [\n",
    "    ('Contradictory Context: Consistency vs Hedging', 'Avg Consistency', 'Hedging Rate'),\n",
    "    ('Response Variance: Similarity vs Flip-Flops', 'Avg Similarity', 'Flip-Flop Rate'),\n",
    "]\n",
    "\n",
    "fig, axes = plt.subplots(1, len(panels), figsize=(14, 5))\n",
    "x = np.arange(len(summary_df))  # one bar group per row of the summary table\n",
    "width = 0.35\n",
    "\n",
    "for ax, (title, left_metric, right_metric) in zip(axes, panels):\n",
    "    # Paired bars sit either side of each model's tick position\n",
    "    ax.bar(x - width/2, summary_df[left_metric], width, label=left_metric, alpha=0.8)\n",
    "    ax.bar(x + width/2, summary_df[right_metric], width, label=right_metric, alpha=0.8)\n",
    "\n",
    "    ax.set_xlabel('Model')\n",
    "    ax.set_ylabel('Score')\n",
    "    ax.set_title(title)\n",
    "    ax.set_xticks(x)\n",
    "    ax.set_xticklabels(summary_df['Model'], rotation=45, ha='right')\n",
    "    ax.legend()\n",
    "    ax.grid(axis='y', alpha=0.3)\n",
    "\n",
    "plt.tight_layout()\n",
    "\n",
    "# savefig does not create missing directories, so make sure the target folder exists\n",
    "output_path = '../results/visualizations/summary_comparison.png'\n",
    "os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
    "plt.savefig(output_path, dpi=300, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Detailed Analysis: Individual Test Cases"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the per-case contradictory-context results for the first loaded result file\n",
    "PREVIEW_CHARS = 100  # maximum number of response characters shown per case\n",
    "\n",
    "if all_results:\n",
    "    first_model = all_results[0]\n",
    "\n",
    "    print(f\"Detailed Analysis for: {first_model['model']}\\n\")\n",
    "    print(\"=\"*70)\n",
    "\n",
    "    # Contradictory contexts\n",
    "    print(\"\\n📊 CONTRADICTORY CONTEXT CASES:\\n\")\n",
    "    for idx, case in enumerate(first_model['contradictory_contexts'], 1):\n",
    "        hedging = case['contradiction_analysis']['hedging']\n",
    "        response = case['response_contradictory']\n",
    "        # Only mark the preview as truncated when text was actually cut off\n",
    "        preview = response[:PREVIEW_CHARS] + ('...' if len(response) > PREVIEW_CHARS else '')\n",
    "\n",
    "        print(f\"Case {idx}: {case['question']}\")\n",
    "        print(f\"  Consistency (A vs B): {case['consistency_a_vs_b']:.3f}\")\n",
    "        print(f\"  Hedging: {hedging['has_hedging']}\")\n",
    "        if hedging['has_hedging']:\n",
    "            print(f\"    Words: {hedging['hedging_words_found']}\")\n",
    "        print(f\"  Response (contradictory): {preview}\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Heatmap: Model Comparison Matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create comparison heatmap: one row per metric, one column per model\n",
    "metrics = ['Avg Consistency', 'Hedging Rate', 'Avg Similarity', 'Flip-Flop Rate']\n",
    "heatmap_data = summary_df[metrics].T\n",
    "heatmap_data.columns = summary_df['Model']\n",
    "\n",
    "# NOTE(review): the reversed colormap paints high values red on every row,\n",
    "# including metrics where a high score may be desirable - confirm this is intended.\n",
    "fig, ax = plt.subplots(figsize=(10, 6))\n",
    "sns.heatmap(heatmap_data, annot=True, fmt='.3f', cmap='RdYlGn_r',\n",
    "            cbar_kws={'label': 'Score'}, linewidths=0.5, ax=ax)\n",
    "ax.set_title('Model Performance Heatmap', fontsize=14, fontweight='bold')\n",
    "ax.set_ylabel('Metric', fontsize=12)\n",
    "ax.set_xlabel('Model', fontsize=12)\n",
    "fig.tight_layout()\n",
    "\n",
    "# savefig does not create missing directories, so make sure the target folder exists\n",
    "output_path = '../results/visualizations/performance_heatmap.png'\n",
    "os.makedirs(os.path.dirname(output_path), exist_ok=True)\n",
    "fig.savefig(output_path, dpi=300, bbox_inches='tight')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Key Insights & Findings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"🔍 KEY FINDINGS:\\n\")\n",
    "print(\"=\"*70)\n",
    "\n",
    "# Summary-table row holding the highest value of each metric\n",
    "# (idxmax returns the first matching row label when values tie)\n",
    "best_consistency = summary_df.loc[summary_df['Avg Consistency'].idxmax()]\n",
    "worst_variance = summary_df.loc[summary_df['Avg Variance'].idxmax()]\n",
    "most_hedging = summary_df.loc[summary_df['Hedging Rate'].idxmax()]\n",
    "\n",
    "print(f\"\\n✅ Most Consistent (opposing contexts): {best_consistency['Model']}\")\n",
    "print(f\"   Score: {best_consistency['Avg Consistency']:.3f}\")\n",
    "\n",
    "print(f\"\\n⚠️  Highest Variance: {worst_variance['Model']}\")\n",
    "print(f\"   Variance: {worst_variance['Avg Variance']:.4f}\")\n",
    "\n",
    "print(f\"\\n🤔 Most Hedging: {most_hedging['Model']}\")\n",
    "print(f\"   Hedging Rate: {most_hedging['Hedging Rate']:.2%}\")\n",
    "\n",
    "print(\"\\n\" + \"=\"*70)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Export Summary Report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save summary to CSV\n",
    "summary_df.to_csv('../results/metrics/summary_comparison.csv', index=False)\n",
    "print(\"✅ Summary exported to: results/metrics/summary_comparison.csv\")\n",
    "\n",
    "# Create markdown report.\n",
    "# Uses best_consistency / worst_variance / most_hedging computed in the Key Findings cell.\n",
    "# DataFrame.to_markdown needs the optional 'tabulate' package to be installed.\n",
    "report = f\"\"\"# Failure-Oriented Evaluation Report\n",
    "\n",
    "**Generated:** {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
    "\n",
    "## Summary Statistics\n",
    "\n",
    "{summary_df.to_markdown(index=False)}\n",
    "\n",
    "## Key Findings\n",
    "\n",
    "- **Most Consistent:** {best_consistency['Model']} ({best_consistency['Avg Consistency']:.3f})\n",
    "- **Highest Variance:** {worst_variance['Model']} ({worst_variance['Avg Variance']:.4f})\n",
    "- **Most Hedging:** {most_hedging['Model']} ({most_hedging['Hedging Rate']:.2%})\n",
    "\n",
    "## Insights\n",
    "\n",
    "1. Models show varying levels of consistency when faced with contradictory information\n",
    "2. Response variance indicates stability across multiple runs with same prompt\n",
    "3. Hedging behavior reveals how models handle uncertainty\n",
    "\"\"\"\n",
    "\n",
    "# Write as UTF-8 explicitly so the output does not depend on the platform default encoding\n",
    "with open('../results/EVALUATION_REPORT.md', 'w', encoding='utf-8') as f:\n",
    "    f.write(report)\n",
    "\n",
    "print(\"✅ Report exported to: results/EVALUATION_REPORT.md\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.9.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

NameError: name 'null' is not defined