# Descriptive Statistics

Summarize and understand your data.

## Key Concepts
- **Central Tendency:** Mean, Median, Mode
- **Dispersion:** Range, Variance, Standard Deviation
- **Shape:** Skewness, Kurtosis
- **Position:** Quartiles, Percentiles

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Seed NumPy's global random generator so the random draws are reproducible.
np.random.seed(42)

# Apply seaborn's white-grid theme to subsequent plots.
sns.set_style("whitegrid")

## 1. Measures of Central Tendency

In [None]:
# Sample salary data; the last value is a deliberate extreme outlier.
salaries = np.array([
    45000, 50000, 55000, 60000, 65000,
    70000, 75000, 80000, 250000,  # Outlier
])

mean_val = np.mean(salaries)
median_val = np.median(salaries)

# Most frequent value together with how often it occurs. np.unique returns
# the distinct values in ascending order and argmax picks the first maximum,
# so ties resolve to the smallest value.
unique_vals, unique_counts = np.unique(salaries, return_counts=True)
mode_index = np.argmax(unique_counts)
mode_val = unique_vals[mode_index]
mode_count = unique_counts[mode_index]

print(f"Mean:   ${mean_val:,.2f}")
print(f"Median: ${median_val:,.2f}")
if mode_count > 1:
    print(f"Mode:   ${mode_val:,.2f}")
else:
    # Every value occurs exactly once, so presenting the smallest value as
    # "the mode" would be misleading.
    print("Mode:   none (no value repeats)")

print("\nNote: Mean is sensitive to outliers!")

## 2. Measures of Dispersion (Spread)

In [None]:
# Spread of the salary sample. ddof=1 divides by (n - 1), i.e. the sample
# (rather than population) variance and standard deviation.
range_val = salaries.max() - salaries.min()
variance = salaries.var(ddof=1)
std_dev = salaries.std(ddof=1)

# Variance is in squared units, so it is printed without a dollar sign.
print(
    f"Range:    ${range_val:,.2f}",
    f"Variance: {variance:,.2f}",
    f"Std Dev:  ${std_dev:,.2f}",
    sep="\n",
)

## 3. Quartiles and IQR

In [None]:
# First and third quartiles in a single call; the IQR is their difference.
q1, q3 = np.percentile(salaries, [25, 75])
iqr = q3 - q1

print(
    f"Q1 (25th): ${q1:,.2f}",
    f"Q3 (75th): ${q3:,.2f}",
    f"IQR:       ${iqr:,.2f}",
    sep="\n",
)

# 1.5 * IQR rule: values beyond these bounds are treated as outliers.
fence_width = 1.5 * iqr
lower_bound = q1 - fence_width
upper_bound = q3 + fence_width

print(f"\nOutliers are < {lower_bound:,.0f} or > {upper_bound:,.0f}")

## 4. Visualizing Distribution

In [None]:
# Draw 1,000 samples from a normal distribution with mean 100 and std 15.
data = np.random.normal(100, 15, 1000)

# One row, two panels: histogram on the left, box plot on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
hist_ax, box_ax = axes

# Histogram with a KDE overlay shows the overall shape of the distribution.
sns.histplot(data, kde=True, ax=hist_ax)
hist_ax.set_title('Histogram (Shape)')

# Box plot summarises the quartiles and marks points beyond the whiskers.
sns.boxplot(x=data, ax=box_ax)
box_ax.set_title('Box Plot (Outliers)')

fig.tight_layout()
plt.show()

## 5. Skewness and Kurtosis

In [None]:
# Shape statistics for the simulated sample.
# stats.kurtosis uses Fisher's definition by default (excess kurtosis), so a
# normal distribution scores about 0 rather than 3.
skew = stats.skew(data)
kurt = stats.kurtosis(data)

print(f"Skewness: {skew:.4f}")
print(f"Kurtosis: {kurt:.4f}")

print("\nInterpretation:")
# The original literal contained mojibake ("â‰ˆ"); the intended symbol is "≈".
print("- Skew ≈ 0: Symmetric")
print("- Positive Skew: Tail on right")
print("- Negative Skew: Tail on left")

## Practice Exercise
Calculate descriptive statistics for the `age` column of the Titanic dataset.

In [None]:
# Load seaborn's Titanic example dataset and keep the non-missing ages.
df = sns.load_dataset("titanic")
age = df["age"].dropna()

# Exercise, using `age`:
#   1. Calculate mean, median, std, min, max
#   2. Visualize with histogram and boxplot
# Your code here