In [None]:
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sleep Quality Analysis - Data Exploration\n",
    "\n",
    "This notebook explores the sleep dataset to understand its structure, contents, and potential insights for analysis."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Import necessary libraries\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import sys\n",
    "import os\n",
    "\n",
    "# Add the project root directory to the Python path.\n",
    "# An absolute path keeps `src` importable if the working directory changes later,\n",
    "# and the membership check stops re-runs of this cell from appending duplicates.\n",
    "PROJECT_ROOT = os.path.abspath('..')\n",
    "if PROJECT_ROOT not in sys.path:\n",
    "    sys.path.append(PROJECT_ROOT)\n",
    "\n",
    "# Import project modules\n",
    "from src import data_processing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the Dataset\n",
    "\n",
    "Let's load the sleep health and lifestyle dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Locations of the data folder and the raw CSV, relative to the working directory\n",
    "DATA_DIR = os.path.join('..', 'data')\n",
    "RAW_DATA_PATH = os.path.join(\n",
    "    DATA_DIR, 'raw', 'Sleep_health_and_lifestyle_dataset.csv'\n",
    ")\n",
    "\n",
    "# Load the raw file with the project's loader, then preview the first rows\n",
    "sleep_data = data_processing.load_sleep_data(RAW_DATA_PATH)\n",
    "sleep_data.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore Dataset Structure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Overview of the dataset: dimensions, column dtypes, summary statistics\n",
    "print(f\"Dataset Shape: {sleep_data.shape}\")\n",
    "print(\"\\nData Types:\", sleep_data.dtypes, sep=\"\\n\")\n",
    "print(\"\\nBasic Statistics:\")\n",
    "\n",
    "# Left as the final expression so the notebook renders it as the cell output\n",
    "summary_stats = sleep_data.describe()\n",
    "summary_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Check for Missing Values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Count the missing entries in every column\n",
    "missing_values = sleep_data.isnull().sum()\n",
    "\n",
    "# Keep only the columns that actually contain missing entries\n",
    "columns_with_missing = missing_values[missing_values > 0]\n",
    "\n",
    "print(\"Missing Values per Column:\")\n",
    "if columns_with_missing.empty:\n",
    "    print(\"No missing values found\")\n",
    "else:\n",
    "    print(columns_with_missing)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore Categorical Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Treat every object-dtype column as categorical\n",
    "categorical_cols = sleep_data.select_dtypes(include=['object']).columns.tolist()\n",
    "print(f\"Categorical columns: {categorical_cols}\")\n",
    "\n",
    "# For each categorical column: print the category frequencies, then plot them\n",
    "for col in categorical_cols:\n",
    "    # Computed once; reused for the printout and for the bar ordering\n",
    "    category_counts = sleep_data[col].value_counts()\n",
    "    print(f\"\\nDistribution of {col}:\")\n",
    "    print(category_counts)\n",
    "\n",
    "    # Horizontal bars, ordered the same way as the printed frequencies\n",
    "    fig, ax = plt.subplots(figsize=(10, 6))\n",
    "    sns.countplot(y=col, data=sleep_data, order=category_counts.index, ax=ax)\n",
    "    ax.set_title(f\"Distribution of {col}\")\n",
    "    fig.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore Numerical Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Identify numerical columns. 'number' matches every numeric dtype (int32,\n",
    "# float32, ...), not only int64/float64, so no numeric column is silently skipped.\n",
    "numerical_cols = sleep_data.select_dtypes(include='number').columns.tolist()\n",
    "print(f\"Numerical columns: {numerical_cols}\")\n",
    "\n",
    "# For each numerical column, show its histogram (with KDE) next to its box plot\n",
    "for col in numerical_cols:\n",
    "    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))\n",
    "\n",
    "    # Left panel: histogram\n",
    "    sns.histplot(sleep_data[col], kde=True, ax=hist_ax)\n",
    "    hist_ax.set_title(f\"Distribution of {col}\")\n",
    "\n",
    "    # Right panel: box plot\n",
    "    sns.boxplot(y=sleep_data[col], ax=box_ax)\n",
    "    box_ax.set_title(f\"Box Plot of {col}\")\n",
    "\n",
    "    fig.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Explore Relationships Between Sleep Quality and Other Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Choose the column that measures sleep quality.\n",
    "# The exact name 'Quality of Sleep' is checked first: it also contains 'quality',\n",
    "# so placing this check after the keyword scan would make it unreachable.\n",
    "if 'Quality of Sleep' in sleep_data.columns:\n",
    "    sleep_quality_col = 'Quality of Sleep'\n",
    "else:\n",
    "    # Fallback: first column whose name mentions 'quality' or 'score'.\n",
    "    # str(c) keeps the scan from failing on non-string column labels.\n",
    "    sleep_quality_col = next(\n",
    "        (c for c in sleep_data.columns\n",
    "         if 'quality' in str(c).lower() or 'score' in str(c).lower()),\n",
    "        None,\n",
    "    )\n",
    "\n",
    "if sleep_quality_col is not None:\n",
    "    print(f\"Using '{sleep_quality_col}' as the sleep quality measure\")\n",
    "\n",
    "    # Scatter plot of every other numerical column against sleep quality\n",
    "    for col in [c for c in numerical_cols if c != sleep_quality_col]:\n",
    "        fig, ax = plt.subplots(figsize=(10, 6))\n",
    "        sns.scatterplot(x=col, y=sleep_quality_col, data=sleep_data, ax=ax)\n",
    "        ax.set_title(f\"{col} vs {sleep_quality_col}\")\n",
    "        fig.tight_layout()\n",
    "        plt.show()\n",
    "\n",
    "    # Box plot of sleep quality within each category of every categorical column\n",
    "    for col in categorical_cols:\n",
    "        fig, ax = plt.subplots(figsize=(12, 6))\n",
    "        sns.boxplot(x=col, y=sleep_quality_col, data=sleep_data, ax=ax)\n",
    "        ax.set_title(f\"{sleep_quality_col} by {col}\")\n",
    "        ax.tick_params(axis='x', labelrotation=45)\n",
    "        fig.tight_layout()\n",
    "        plt.show()\n",
    "else:\n",
    "    print(\"No sleep quality column found. Creating a proxy metric using sleep duration.\")\n",
    "    # Proxy: first column whose name mentions 'duration' or 'hours'\n",
    "    duration_col = next(\n",
    "        (c for c in sleep_data.columns\n",
    "         if 'duration' in str(c).lower() or 'hours' in str(c).lower()),\n",
    "        None,\n",
    "    )\n",
    "\n",
    "    if duration_col is not None:\n",
    "        print(f\"Using '{duration_col}' as proxy for sleep quality\")\n",
    "\n",
    "        # Scatter plot of every other numerical column against the proxy\n",
    "        for col in [c for c in numerical_cols if c != duration_col]:\n",
    "            fig, ax = plt.subplots(figsize=(10, 6))\n",
    "            sns.scatterplot(x=col, y=duration_col, data=sleep_data, ax=ax)\n",
    "            ax.set_title(f\"{col} vs {duration_col}\")\n",
    "            fig.tight_layout()\n",
    "            plt.show()\n",
    "    else:\n",
    "        print(\"No suitable sleep metric found for relationship analysis\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Correlation Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Pairwise correlations between the numerical columns\n",
    "correlation_matrix = sleep_data[numerical_cols].corr()\n",
    "\n",
    "# Boolean mask that hides the upper triangle (diagonal included): the matrix is\n",
    "# symmetric, so those cells only repeat the lower triangle. dtype=bool makes the\n",
    "# mask an explicit boolean array instead of relying on a float-to-bool cast.\n",
    "mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))\n",
    "\n",
    "# Create correlation heatmap on a fixed -1..1 colour scale\n",
    "fig, ax = plt.subplots(figsize=(14, 10))\n",
    "sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt='.2f',\n",
    "            cmap='coolwarm', vmin=-1, vmax=1, ax=ax)\n",
    "ax.set_title('Correlation Heatmap')\n",
    "fig.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Summary of Initial Findings"
   ]
  },
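  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Key observations from the exploratory data analysis** (TODO: replace the bracketed placeholders below with the actual findings after running the notebook):\n",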
    "\n",
    "1. Dataset Overview:\n",
    "   - The dataset contains [X] records and [Y] features\n",
    "   - [Discuss any missing values or data quality issues]\n",
    "\n",
    "2. Sleep Quality Insights:\n",
    "   - [Discuss distribution of sleep quality or duration]\n",
    "   - [Highlight any interesting patterns observed]\n",
    "\n",
    "3. Potential Factors Affecting Sleep:\n",
    "   - [List the top variables that show correlation with sleep quality]\n",
    "   - [Note any surprising relationships]\n",
    "\n",
    "4. Next Steps:\n",
    "   - Clean the dataset and handle any data quality issues\n",
    "   - Create additional sleep-related features for deeper analysis\n",
    "   - Explore how lifestyle factors affect sleep quality\n",
    "   - Develop models to predict sleep quality based on daily habits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "source": [
    "# Save cleaned data for the next analysis steps\n",
    "PROCESSED_DATA_PATH = os.path.join(DATA_DIR, 'processed', 'clean_sleep_data.csv')\n",
    "\n",
    "# Clean the data using the module\n",
    "clean_sleep_data = data_processing.clean_sleep_data(sleep_data)\n",
    "\n",
    "# Make sure the target folder exists before writing. Whether\n",
    "# save_processed_data creates it is not visible from this notebook;\n",
    "# exist_ok=True makes this a no-op when the folder is already there.\n",
    "os.makedirs(os.path.dirname(PROCESSED_DATA_PATH), exist_ok=True)\n",
    "\n",
    "# Save the cleaned data\n",
    "data_processing.save_processed_data(clean_sleep_data, PROCESSED_DATA_PATH)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}