# Diabetes Dataset - Exploratory Data Analysis (EDA)

This notebook performs comprehensive exploratory data analysis on the Pima Indians Diabetes Dataset.

## Dataset Overview
The dataset contains diagnostic measurements for diabetes prediction:
- **Pregnancies**: Number of times pregnant
- **Glucose**: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
- **BloodPressure**: Diastolic blood pressure (mm Hg)
- **SkinThickness**: Triceps skin fold thickness (mm)
- **Insulin**: 2-Hour serum insulin (mu U/ml)
- **BMI**: Body mass index (weight in kg / (height in m)^2)
- **DiabetesPedigreeFunction**: Diabetes pedigree function (a score summarizing family history of diabetes)
- **Age**: Age in years
- **Outcome**: Class variable (0: No diabetes, 1: Diabetes)

## 1. Import Required Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Statistics
from scipy import stats

# Warnings
import warnings
warnings.filterwarnings('ignore')

# Plot styling shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Figures and the summary report are saved under this directory, so make
# sure it exists before any later cell writes into it
import os
os.makedirs('../results/eda', exist_ok=True)

print("Libraries imported successfully!")

## 2. Load the Dataset

In [None]:
# Read the raw CSV into `df`, the DataFrame every later cell works on
df = pd.read_csv('../data/raw/diabetes.csv')

print("Dataset loaded successfully!")
print("Shape:", df.shape)
print("\nFirst few rows:")
# Final expression of the cell: the notebook renders the first five rows
df.head()

## 3. Dataset Information and Structure

In [None]:
# Structural overview of the DataFrame: per-column dtypes and non-null
# counts, followed by the overall dimensions and column names
info_rule = "=" * 50

print("Dataset Information:")
print(info_rule)
df.info()

row_count, column_count = df.shape
column_names = list(df.columns)

print("\n" + info_rule)
print(f"Number of rows: {row_count}")
print(f"Number of columns: {column_count}")
print(f"\nColumn names: {column_names}")

## 4. Statistical Summary

In [None]:
# Descriptive statistics for every column, transposed so that each
# feature occupies one row
print("Statistical Summary:")
print("=" * 80)
# Final expression of the cell: rendered as a table by the notebook
df.describe().transpose()

In [None]:
# Additional statistics that describe() does not report: the median,
# skewness and kurtosis of every column
extra_statistics = (
    ("Median Values", df.median()),
    ("Skewness", df.skew()),
    ("Kurtosis", df.kurtosis()),
)

for section_title, section_values in extra_statistics:
    print(f"\n{section_title}:")
    print("=" * 50)
    print(section_values)

## 5. Missing Value Analysis

In [None]:
# Count explicit missing values (NaN/None) per column and express each
# count as a percentage of the number of rows
print("Missing Values:")
print("="*50)
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_values,
    'Percentage': missing_percentage
})

# Report only the affected columns. When nothing is missing, print the
# plain message instead of an "Empty DataFrame" dump followed by it.
columns_with_missing = missing_df[missing_df['Missing Count'] > 0]
if columns_with_missing.empty:
    print("\nNo missing values found in the dataset!")
else:
    print(columns_with_missing)

In [None]:
# Count exact zeros per column. The dataset has no NaNs, so a zero in a
# physiological measurement is the likely stand-in for a missing reading.
# Columns such as Pregnancies and Outcome legitimately contain zeros and
# are still listed here, since every column is checked.
print("\nZero Values Analysis:")
print("=" * 50)
print("Note: Zero values in certain columns (Glucose, BloodPressure, etc.) are biologically impossible")
print("and likely represent missing data.\n")

zero_counts = df.eq(0).sum()
zero_percentage = (zero_counts / len(df)) * 100
zero_df = pd.DataFrame({'Zero Count': zero_counts, 'Percentage': zero_percentage})

has_zeros = zero_df['Zero Count'] > 0
print(zero_df[has_zeros])

## 6. Data Type Checking

In [None]:
# One row per column: its dtype next to the number of non-null entries
print("Data Types:")
print("=" * 50)
dtypes_df = df.dtypes.to_frame(name='Data Type')
dtypes_df['Non-Null Count'] = df.count()
print(dtypes_df)

## 7. Target Variable Distribution

In [None]:
# Analyze target variable distribution
print("Target Variable (Outcome) Distribution:")
print("="*50)
# value_counts() orders its result by frequency, not by class label.
# Sorting by label keeps the table rows, the bar annotations and the
# countplot bars (drawn in label order) aligned even if class 1 is the
# majority class.
outcome_counts = df['Outcome'].value_counts().sort_index()
outcome_percentages = df['Outcome'].value_counts(normalize=True).sort_index() * 100

# Rename rows by label rather than overwriting the index positionally,
# so a label can never be attached to the wrong class
outcome_df = pd.DataFrame({
    'Count': outcome_counts,
    'Percentage': outcome_percentages
}).rename(index={0: 'No Diabetes (0)', 1: 'Diabetes (1)'})
print(outcome_df)

# Create count plot
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='Outcome', palette='Set2')
plt.title('Distribution of Target Variable (Outcome)', fontsize=16, fontweight='bold')
plt.xlabel('Outcome (0: No Diabetes, 1: Diabetes)', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks([0, 1], ['No Diabetes', 'Diabetes'])

# Add count labels on bars; outcome_counts is in label order, which is
# the order the bars are drawn in
for bar_position, class_count in enumerate(outcome_counts):
    plt.text(bar_position, class_count + 5, str(class_count), ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig('../results/eda/target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Ratio of class 0 rows to class 1 rows, looked up by class label
print(f"\nClass Imbalance Ratio: {outcome_counts[0]/outcome_counts[1]:.2f}:1")

## 8. Feature Distributions - Histograms

In [None]:
# One histogram per predictor on a 4x2 grid; `features` (all columns
# except the target) is reused by the later plotting and analysis cells
features = df.columns.drop('Outcome')

fig, axes = plt.subplots(4, 2, figsize=(15, 16))
axes = axes.ravel()

hist_style = dict(bins=30, edgecolor='black', alpha=0.7, color='steelblue')

for idx, feature in enumerate(features):
    ax = axes[idx]
    ax.hist(df[feature], **hist_style)
    ax.set_title(f'Distribution of {feature}', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/eda/feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. Feature Distributions by Outcome (KDE Plots)

In [None]:
# Overlay the density of each feature for the two outcome groups so that
# shifts between the groups are visible at a glance
fig, axes = plt.subplots(4, 2, figsize=(15, 16))
axes = axes.ravel()

# (outcome value, legend label, line colour), drawn in this order
outcome_groups = (
    (0, 'No Diabetes', 'green'),
    (1, 'Diabetes', 'red'),
)

for idx, feature in enumerate(features):
    ax = axes[idx]
    for outcome_value, group_label, line_color in outcome_groups:
        group_values = df.loc[df['Outcome'] == outcome_value, feature]
        group_values.plot(kind='kde', ax=ax, label=group_label,
                          color=line_color, linewidth=2)

    ax.set_title(f'{feature} Distribution by Outcome', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Density', fontsize=10)
    ax.legend()
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig('../results/eda/feature_distributions_by_outcome.png', dpi=300, bbox_inches='tight')
plt.show()

## 10. Box Plots to Identify Outliers

In [None]:
# One box plot per predictor on a 4x2 grid, to eyeball spread and outliers
fig, axes = plt.subplots(4, 2, figsize=(15, 16))
axes = axes.ravel()

for idx, feature in enumerate(features):
    ax = axes[idx]
    sns.boxplot(data=df, y=feature, ax=ax, color='lightblue')
    ax.set_title(f'Box Plot of {feature}', fontsize=12, fontweight='bold')
    ax.set_ylabel(feature, fontsize=10)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/eda/boxplots_outliers.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Box plot of every predictor split by outcome group, on a 4x2 grid
fig, axes = plt.subplots(4, 2, figsize=(15, 16))
axes = axes.ravel()

group_tick_labels = ['No Diabetes', 'Diabetes']

for idx, feature in enumerate(features):
    ax = axes[idx]
    sns.boxplot(data=df, x='Outcome', y=feature, ax=ax, palette='Set2')
    ax.set_title(f'{feature} by Outcome', fontsize=12, fontweight='bold')
    ax.set_xlabel('Outcome', fontsize=10)
    ax.set_ylabel(feature, fontsize=10)
    # Replace the numeric 0/1 tick labels with readable group names
    ax.set_xticklabels(group_tick_labels)
    ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/eda/boxplots_by_outcome.png', dpi=300, bbox_inches='tight')
plt.show()

## 11. Outlier Detection using IQR Method

In [None]:
# Flag values lying more than 1.5 * IQR below the first quartile or above
# the third quartile, and tabulate the count and bounds per feature
print("Outlier Detection using IQR Method:")
print("=" * 80)

outlier_summary = []
total_rows = len(df)

for feature in features:
    column = df[feature]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    iqr = third_quartile - first_quartile

    lower_bound = first_quartile - 1.5 * iqr
    upper_bound = third_quartile + 1.5 * iqr

    # Strict comparisons: values exactly on a bound are not outliers
    is_outlier = (column < lower_bound) | (column > upper_bound)
    outlier_count = int(is_outlier.sum())
    outlier_percentage = (outlier_count / total_rows) * 100

    outlier_summary.append({
        'Feature': feature,
        'Outlier Count': outlier_count,
        'Percentage': f"{outlier_percentage:.2f}%",
        'Lower Bound': f"{lower_bound:.2f}",
        'Upper Bound': f"{upper_bound:.2f}",
    })

outlier_df = pd.DataFrame(outlier_summary)
print(outlier_df.to_string(index=False))

## 12. Correlation Analysis

In [None]:
# Pairwise correlation of all columns; reused by the heatmap, insights
# and report cells
correlation_matrix = df.corr()

# Rank every column by its correlation with the target, strongest
# positive first (Outcome itself appears with a correlation of 1)
outcome_correlations = correlation_matrix['Outcome'].sort_values(ascending=False)

print("Correlation with Outcome (Target Variable):")
print("=" * 50)
print(outcome_correlations)

In [None]:
# Annotated heatmap of the correlation matrix, with the colour scale
# centred on zero
heatmap_options = dict(
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
)

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('../results/eda/correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

## 13. Pairplot - Relationships between Features

In [None]:
# Pairwise scatter plots for a handful of features only; plotting every
# column would make the grid too dense to read
key_features = ['Glucose', 'BMI', 'Age', 'Pregnancies', 'Outcome']

print("Creating pairplot for key features...")
print("This may take a moment...")

pairplot_data = df[key_features]
pairplot = sns.pairplot(
    pairplot_data,
    hue='Outcome',
    palette='Set1',
    diag_kind='kde',
    plot_kws={'alpha': 0.6},
    height=2.5,
)
# y slightly above 1 places the title above the top row of panels
pairplot.fig.suptitle('Pairplot of Key Features by Outcome', y=1.02, fontsize=16, fontweight='bold')
plt.savefig('../results/eda/pairplot_key_features.png', dpi=300, bbox_inches='tight')
plt.show()

print("Pairplot saved successfully!")

## 14. Statistical Tests

In [None]:
# Independent two-sample t-test and Cohen's d for every feature,
# comparing the no-diabetes group (0) with the diabetes group (1)
print("Statistical Tests: Comparing features between Diabetes and No Diabetes groups")
print("=" * 80)

test_results = []
is_no_diabetes = df['Outcome'] == 0
is_diabetes = df['Outcome'] == 1

for feature in features:
    group_0 = df.loc[is_no_diabetes, feature]
    group_1 = df.loc[is_diabetes, feature]

    # The statistic is computed for group_0 versus group_1, whereas Cohen's d
    # below is group_1 minus group_0, so the two carry opposite signs
    t_stat, p_value = stats.ttest_ind(group_0, group_1)

    # Cohen's d: mean difference divided by the pooled standard deviation
    size_0, size_1 = len(group_0), len(group_1)
    pooled_variance = (
        (size_0 - 1) * group_0.std() ** 2 + (size_1 - 1) * group_1.std() ** 2
    ) / (size_0 + size_1 - 2)
    cohens_d = (group_1.mean() - group_0.mean()) / np.sqrt(pooled_variance)

    significance = 'Yes' if p_value < 0.05 else 'No'
    test_results.append({
        'Feature': feature,
        'Mean (No Diabetes)': f"{group_0.mean():.2f}",
        'Mean (Diabetes)': f"{group_1.mean():.2f}",
        'T-Statistic': f"{t_stat:.3f}",
        'P-Value': f"{p_value:.4f}",
        'Significant': significance,
        "Cohen's d": f"{cohens_d:.3f}",
    })

test_df = pd.DataFrame(test_results)
print(test_df.to_string(index=False))

print("\nNote: P-value < 0.05 indicates statistically significant difference between groups")

## 15. Summary Statistics by Outcome

In [None]:
# Descriptive statistics computed separately for each outcome group
summary_rule = "=" * 80
outcome_sections = (
    ("No Diabetes (Outcome = 0)", 0),
    ("Diabetes (Outcome = 1)", 1),
)

print("Summary Statistics by Outcome:")
print(summary_rule)

for section_number, (section_heading, outcome_value) in enumerate(outcome_sections):
    # A rule separates the sections; the first follows the header rule
    if section_number > 0:
        print("\n" + summary_rule)
    print(f"\n{section_heading}:")
    print(df.loc[df['Outcome'] == outcome_value].describe().T)

## 16. Key Insights and Findings

In [None]:
# Print the headline findings and recommended next steps. `insights` and
# `recommendations` are also written to the summary report file.
print("="*80)
print("KEY INSIGHTS FROM EDA")
print("="*80)

# The Outcome column is the target, not a feature, so exclude it from the
# feature count reported in the first insight
feature_count = df.shape[1] - 1

insights = [
    f"1. Dataset Size: {df.shape[0]} samples with {feature_count} features",
    f"2. Class Distribution: {outcome_counts[0]} No Diabetes vs {outcome_counts[1]} Diabetes",
    f"3. Class Imbalance: {outcome_percentages[0]:.1f}% vs {outcome_percentages[1]:.1f}% (imbalanced dataset)",
    "4. Missing Values: No explicit missing values, but zeros in biological measurements likely represent missing data",
    "5. Key Predictors (highest correlation with Outcome):",
]

# Top three features by correlation with the target. Outcome is dropped
# explicitly instead of assuming it sorts into first position.
top_corr = (
    correlation_matrix['Outcome']
    .drop('Outcome')
    .sort_values(ascending=False)
    .head(3)
)
for i, (feature, corr) in enumerate(top_corr.items(), 1):
    insights.append(f"   {i}. {feature}: {corr:.3f}")

insights.extend([
    "6. Outliers detected in multiple features - need handling during preprocessing",
    "7. Feature distributions show clear differences between diabetes and non-diabetes groups",
    "8. Several features show skewed distributions - may benefit from transformation",
    "9. Class imbalance suggests need for SMOTE or similar techniques",
    "10. Feature engineering opportunities identified (BMI categories, glucose levels, age groups)"
])

for insight in insights:
    print(insight)

print("\n" + "="*80)
print("RECOMMENDATIONS FOR NEXT STEPS")
print("="*80)
recommendations = [
    "1. Handle zero values in biological measurements (replace with median/mean)",
    "2. Address outliers using IQR method or capping",
    "3. Create engineered features (BMI categories, glucose levels, risk scores)",
    "4. Apply feature scaling (StandardScaler or MinMaxScaler)",
    "5. Handle class imbalance using SMOTE",
    "6. Consider feature selection based on correlation and statistical tests",
    "7. Split data with stratification to maintain class distribution"
]

for rec in recommendations:
    print(rec)

print("\n" + "="*80)
print("EDA COMPLETED SUCCESSFULLY!")
print("All visualizations saved to: ../results/eda/")
print("="*80)

## 17. Save EDA Summary Report

In [None]:
# Write a plain-text summary of the analysis: dataset shape, class balance,
# feature/target correlations, and the insights and recommendations lists.
# The encoding is pinned to UTF-8 so the file does not depend on the
# platform's default locale encoding.
with open('../results/eda/eda_summary_report.txt', 'w', encoding='utf-8') as f:
    f.write("DIABETES DATASET - EDA SUMMARY REPORT\n")
    f.write("="*80 + "\n\n")

    f.write(f"Dataset Shape: {df.shape}\n")
    # Every column except the Outcome target counts as a feature
    f.write(f"Number of Features: {df.shape[1] - 1}\n")
    f.write("Target Variable: Outcome (0: No Diabetes, 1: Diabetes)\n\n")

    f.write("CLASS DISTRIBUTION:\n")
    f.write("-"*40 + "\n")
    f.write(f"No Diabetes: {outcome_counts[0]} ({outcome_percentages[0]:.2f}%)\n")
    f.write(f"Diabetes: {outcome_counts[1]} ({outcome_percentages[1]:.2f}%)\n\n")

    f.write("CORRELATION WITH OUTCOME:\n")
    f.write("-"*40 + "\n")
    # Skip the target's trivial correlation with itself
    feature_correlations = (
        correlation_matrix['Outcome']
        .drop('Outcome')
        .sort_values(ascending=False)
    )
    for feature, corr in feature_correlations.items():
        f.write(f"{feature}: {corr:.3f}\n")

    f.write("\nKEY INSIGHTS:\n")
    f.write("-"*40 + "\n")
    for insight in insights:
        f.write(insight + "\n")

    f.write("\nRECOMMENDATIONS:\n")
    f.write("-"*40 + "\n")
    for rec in recommendations:
        f.write(rec + "\n")

print("EDA Summary Report saved to: ../results/eda/eda_summary_report.txt")