In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploratory Data Analysis for Animal Classification Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from PIL import Image\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "%matplotlib inline\n",
    "# The bare 'seaborn' style name was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and\n",
    "# removed in 3.8; prefer the new name and fall back to the old one on older releases.\n",
    "plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Loading the Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# We assume a 'dataset' folder that contains one sub-folder per animal class.\n",
    "data_dir = 'dataset'\n",
    "# os.listdir also returns stray files and hidden folders (.DS_Store,\n",
    "# .ipynb_checkpoints, ...); keep only real class directories and sort them so\n",
    "# the class order is deterministic across machines.\n",
    "classes = sorted(\n",
    "    name for name in os.listdir(data_dir)\n",
    "    if not name.startswith('.') and os.path.isdir(os.path.join(data_dir, name))\n",
    ")\n",
    "print(f\"Classes: {classes}\")\n",
    "\n",
    "# Build a DataFrame with the image paths and their class labels.\n",
    "data = []\n",
    "for cls in classes:\n",
    "    class_dir = os.path.join(data_dir, cls)\n",
    "    for img_name in sorted(os.listdir(class_dir)):\n",
    "        img_path = os.path.join(class_dir, img_name)\n",
    "        # Skip hidden files and nested folders so later cells only open regular files.\n",
    "        if img_name.startswith('.') or not os.path.isfile(img_path):\n",
    "            continue\n",
    "        data.append({'path': img_path, 'class': cls})\n",
    "\n",
    "# Explicit columns keep 'path' and 'class' available even if no images were found.\n",
    "df = pd.DataFrame(data, columns=['path', 'class'])\n",
    "print(f\"Total images: {len(df)}\")\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Class Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Number of images per class, most frequent class first.\n",
    "class_distribution = df['class'].value_counts()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "sns.barplot(x=class_distribution.index, y=class_distribution.values, ax=ax)\n",
    "ax.set_title('Distribution of Animal Classes')\n",
    "ax.set_xlabel('Animal Class')\n",
    "ax.set_ylabel('Number of Images')\n",
    "# Tilt the class names so long labels do not overlap.\n",
    "plt.xticks(rotation=45)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Image Size Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def get_image_size(path):\n",
    "    \"\"\"Return the (width, height) tuple PIL reports for the image at ``path``.\"\"\"\n",
    "    with Image.open(path) as image:\n",
    "        dimensions = image.size\n",
    "    return dimensions\n",
    "\n",
    "# Store the raw size tuple, then split it into separate width / height columns.\n",
    "df['size'] = df['path'].apply(get_image_size)\n",
    "df['width'] = df['size'].apply(lambda dims: dims[0])\n",
    "df['height'] = df['size'].apply(lambda dims: dims[1])\n",
    "\n",
    "# One point per image: x is the width, y is the height.\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "ax.scatter(df['width'], df['height'], alpha=0.5)\n",
    "ax.set_title('Image Dimensions')\n",
    "ax.set_xlabel('Width')\n",
    "ax.set_ylabel('Height')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. Sample Images"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "def plot_sample_images(df, n_samples=5, random_state=42):\n",
    "    \"\"\"Show a grid with up to ``n_samples`` random images for every class in ``df``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : pandas.DataFrame\n",
    "        Must contain a 'path' column (image file path) and a 'class' column.\n",
    "    n_samples : int, default 5\n",
    "        Number of image columns in the grid. Classes with fewer images leave the\n",
    "        remaining cells blank instead of repeating images.\n",
    "    random_state : int or None, default 42\n",
    "        Seed for the per-class sampling so the figure is reproducible.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``df`` contains no classes or ``n_samples`` is smaller than 1.\n",
    "    \"\"\"\n",
    "    # Use the classes actually present in df instead of the notebook-level\n",
    "    # `classes` list, so the function also works on filtered frames.\n",
    "    class_names = sorted(df['class'].unique())\n",
    "    if not class_names or n_samples < 1:\n",
    "        raise ValueError('plot_sample_images needs at least one class and n_samples >= 1')\n",
    "\n",
    "    # squeeze=False keeps `axes` two-dimensional even for one class or n_samples=1.\n",
    "    fig, axes = plt.subplots(len(class_names), n_samples,\n",
    "                             figsize=(15, 3 * len(class_names)), squeeze=False)\n",
    "    for row, cls in enumerate(class_names):\n",
    "        class_paths = df.loc[df['class'] == cls, 'path']\n",
    "        # Sample without replacement so the same image is never shown twice.\n",
    "        chosen = class_paths.sample(n=min(n_samples, len(class_paths)),\n",
    "                                    random_state=random_state)\n",
    "        for ax in axes[row]:\n",
    "            ax.axis('off')\n",
    "        for ax, img_path in zip(axes[row], chosen):\n",
    "            # The context manager closes the file handle once the image is drawn.\n",
    "            with Image.open(img_path) as img:\n",
    "                ax.imshow(img)\n",
    "        # axis('off') hides axis labels, so show the class name as a title instead.\n",
    "        axes[row, 0].set_title(cls, loc='left')\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "plot_sample_images(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Train-Test Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['class'], random_state=42)\n",
    "print(f\"Training set size: {len(train_df)}\")\n",
    "print(f\"Test set size: {len(test_df)}\")\n",
    "\n",
    "# Check the class distribution in the training and test sets.\n",
    "train_dist = train_df['class'].value_counts(normalize=True)\n",
    "test_dist = test_df['class'].value_counts(normalize=True)\n",
    "\n",
    "# value_counts orders each Series by its own frequencies, and bar plots place bars\n",
    "# by position rather than by label. Combining both Series into one frame aligns\n",
    "# them on the class name and gives each split its own colour.\n",
    "split_dist = pd.DataFrame({'Train': train_dist, 'Test': test_dist}).fillna(0).sort_index()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(12, 6))\n",
    "split_dist.plot(kind='bar', width=0.8, alpha=0.8, ax=ax)\n",
    "ax.set_title('Class Distribution in Train and Test Sets')\n",
    "ax.set_xlabel('Animal Class')\n",
    "ax.set_ylabel('Proportion')\n",
    "ax.legend()\n",
    "plt.xticks(rotation=45)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. Conclusions\n",
    "\n",
    "> **TODO:** the values in brackets below (and X / Y) are placeholders; replace them with the actual results after running the notebook.\n",
    "\n",
    "Based on our exploratory data analysis, we can conclude:\n",
    "\n",
    "1. The dataset contains X classes of animals with Y total images.\n",
    "2. The class distribution is [balanced/imbalanced], with [class] having the most samples and [class] having the least.\n",
    "3. Image sizes vary, with the majority of images having dimensions between [X] and [Y].\n",
    "4. The train-test split is stratified by class, so both sets preserve the overall class distribution.\n",
    "\n",
    "Next steps for preprocessing and model development:\n",
    "1. Implement data augmentation to address class imbalance (if any).\n",
    "2. Standardize image sizes for input to the neural network.\n",
    "3. Consider using transfer learning with a pre-trained model due to the diverse nature of the dataset.\n",
    "4. Implement cross-validation to ensure robust model performance across all classes."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}


: 