In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Game Matchup Estimator - Data Exploration\n",
    "## Initial exploration of Clash Royale battle data\n",
    "\n",
    "This notebook explores the dataset collected from the Clash Royale API, which contains over 150,000 labeled game matchups."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import warnings\n",
    "# Silence only deprecation-style noise from the plotting/data libraries. Other categories\n",
    "# (RuntimeWarning, UserWarning, ...) stay visible because they usually point at real problems.\n",
    "warnings.filterwarnings('ignore', category=FutureWarning)\n",
    "warnings.filterwarnings('ignore', category=DeprecationWarning)\n",
    "\n",
    "# Set style for better visualizations.\n",
    "# 'seaborn-v0_8' only exists in matplotlib >= 3.6; older releases ship the same style as 'seaborn'.\n",
    "plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')\n",
    "sns.set_palette(\"husl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the processed training data from disk\n",
    "df = pd.read_csv('../data/processed/training_data.csv')\n",
    "\n",
    "# A single print with a newline separator emits the overview lines in one call\n",
    "print(\n",
    "    \"Dataset Overview:\",\n",
    "    f\"Shape: {df.shape}\",\n",
    "    f\"Columns: {df.columns.tolist()}\",\n",
    "    \"\\nFirst few rows:\",\n",
    "    sep=\"\\n\",\n",
    ")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Structural overview first: DataFrame.info() writes its report straight to stdout\n",
    "print(\"Dataset Info:\")\n",
    "df.info()\n",
    "\n",
    "# Then the summary statistics; as the cell's last expression the table is rendered by the notebook\n",
    "print(\"\\nDescriptive Statistics:\")\n",
    "summary_stats = df.describe()\n",
    "summary_stats"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count nulls per column and keep only the columns that actually contain some\n",
    "missing_data = df.isnull().sum()\n",
    "columns_with_missing = missing_data[missing_data > 0]\n",
    "print(\"Missing Values:\")\n",
    "if columns_with_missing.empty:\n",
    "    # An empty Series would print as 'Series([], dtype: int64)', which reads like an error\n",
    "    print(\"None - no column contains missing values\")\n",
    "else:\n",
    "    print(columns_with_missing)\n",
    "\n",
    "# Check target distribution\n",
    "print(\"\\nTarget Distribution (Winner):\")\n",
    "print(df['winner'].value_counts())\n",
    "# mean() equals the win rate only if `winner` is encoded 1=win / 0=loss - TODO confirm\n",
    "print(f\"\\nWin Rate: {df['winner'].mean():.2%}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize target distribution on two side-by-side axes\n",
    "fig, (ax_pie, ax_count) = plt.subplots(1, 2, figsize=(10, 6))\n",
    "\n",
    "# sort_index() ties the slice order (and therefore the colors) to the label values.\n",
    "# Plain value_counts() orders by frequency, so the color of each outcome would swap\n",
    "# whenever the majority class changes.\n",
    "outcome_counts = df['winner'].value_counts().sort_index()\n",
    "outcome_counts.plot.pie(ax=ax_pie, autopct='%1.1f%%', colors=['lightcoral', 'lightblue'])\n",
    "ax_pie.set_title('Match Outcome Distribution')\n",
    "ax_pie.set_ylabel('')  # drop the y-axis label pandas attaches to pie charts\n",
    "\n",
    "sns.countplot(data=df, x='winner', ax=ax_count)\n",
    "ax_count.set_title('Win/Loss Count')\n",
    "ax_count.set_xlabel('Winner (1=Win, 0=Loss)')\n",
    "\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlation heatmap over the numeric columns\n",
    "fig, ax = plt.subplots(figsize=(12, 10))\n",
    "\n",
    "# Non-numeric columns cannot be correlated, so restrict the frame first\n",
    "numeric_cols = df.select_dtypes(include=[np.number]).columns\n",
    "correlation_matrix = df[numeric_cols].corr()\n",
    "\n",
    "# Boolean mask covering the diagonal and everything above it, so each pair is drawn once\n",
    "mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))\n",
    "\n",
    "sns.heatmap(\n",
    "    correlation_matrix,\n",
    "    mask=mask,\n",
    "    annot=True,\n",
    "    fmt='.2f',\n",
    "    cmap='coolwarm',\n",
    "    center=0,\n",
    "    square=True,\n",
    "    linewidths=0.5,\n",
    "    ax=ax,\n",
    ")\n",
    "ax.set_title('Feature Correlation Matrix')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature distributions by outcome\n",
    "features_to_plot = ['player_trophies', 'opponent_trophies', 'trophy_diff',\n",
    "                    'player_avg_elixir', 'opponent_avg_elixir']\n",
    "\n",
    "# Resolve up front which requested features exist, so the plots fill the grid without gaps\n",
    "available_features = [feature for feature in features_to_plot if feature in df.columns]\n",
    "\n",
    "fig, axes = plt.subplots(2, 3, figsize=(15, 10))\n",
    "axes = axes.ravel()\n",
    "\n",
    "for ax, feature in zip(axes, available_features):\n",
    "    # One box per value of 'winner'\n",
    "    df.boxplot(column=feature, by='winner', ax=ax)\n",
    "    ax.set_title(f'{feature} by Outcome')\n",
    "    ax.set_xlabel('Winner (1=Win, 0=Loss)')\n",
    "\n",
    "# Drop every axis that received no plot. Slicing by the number of plotted features (rather\n",
    "# than by len(features_to_plot)) also removes the panels of features missing from df.\n",
    "for unused_ax in axes[len(available_features):]:\n",
    "    fig.delaxes(unused_ax)\n",
    "\n",
    "fig.suptitle('Feature Distributions by Match Outcome')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pairplot for key features\n",
    "key_features = ['player_trophies', 'opponent_trophies', 'trophy_diff', 'winner']\n",
    "\n",
    "# Scatter + KDE panels over every row are very slow and heavily overplotted on a dataset of\n",
    "# this size, so draw a random sample instead; the fixed seed keeps the figure reproducible.\n",
    "PAIRPLOT_MAX_ROWS = 5000\n",
    "pairplot_sample = df[key_features].sample(n=min(len(df), PAIRPLOT_MAX_ROWS), random_state=42)\n",
    "\n",
    "sns.pairplot(pairplot_sample, hue='winner', diag_kind='kde', palette='viridis')\n",
    "# State the sample size in the title so the figure is not mistaken for the full dataset\n",
    "plt.suptitle(f'Pairplot of Key Features by Outcome (random sample, n={len(pairplot_sample):,})', y=1.02)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Key Insights from Data Exploration\n",
    "\n",
    "1. **Dataset Size**: Successfully collected 150,000+ labeled data points\n",
    "2. **Class Balance**: Relatively balanced dataset with approximately equal win/loss distribution\n",
    "3. **Feature Quality**: Strong correlations between certain features and match outcomes\n",
    "4. **Data Quality**: Minimal missing values, clean dataset ready for modeling\n",
    "\n",
    "The data shows promising patterns for predicting game outcomes using machine learning."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}