# NumPy Statistics and Aggregations

## 📚 Learning Objectives
- Calculate descriptive statistics
- Understand axis-based operations
- Work with percentiles and quantiles
- Compute correlations and covariance
- Apply statistical functions to real datasets

---

In [None]:
import numpy as np

# Seed NumPy's global random generator so the np.random calls in the cells
# below give repeatable values (assuming the cells are run in order).
np.random.seed(42)

## 1. Descriptive Statistics

Basic statistical measures to understand data distribution.

In [None]:
# Create sample data
data = np.array([23, 45, 67, 12, 89, 34, 56, 78, 90, 45, 23, 67, 89, 12, 34])

print("Data:", data)
print("\n--- Measures of Central Tendency ---")
print(f"Mean: {np.mean(data):.2f}")
print(f"Median: {np.median(data):.2f}")
# A dataset can have several modes. np.bincount(data).argmax() would report
# only the smallest of the tied values (and only works for non-negative
# integers), so count every distinct value and keep all that reach the
# highest count.
distinct_values, frequencies = np.unique(data, return_counts=True)
modes = distinct_values[frequencies == frequencies.max()]
print(f"Mode (most frequent): {modes}")

print("\n--- Measures of Spread ---")
# np.std / np.var default to ddof=0, i.e. the population formulas
print(f"Standard Deviation: {np.std(data):.2f}")
print(f"Variance: {np.var(data):.2f}")
print(f"Min: {np.min(data)}")
print(f"Max: {np.max(data)}")
print(f"Range (ptp): {np.ptp(data)}")

print("\n--- Other Statistics ---")
print(f"Sum: {np.sum(data)}")
print(f"Product: {np.prod(data[:5])}")  # First 5 elements (product gets large!)
print(f"Count: {len(data)}")

## 2. Percentiles and Quantiles

Understand data distribution through percentiles.

In [None]:
# Draw 100 random integers from [0, 100) to summarise
data = np.random.randint(0, 100, size=100)

print("Data sample:", data[:20])
print("\n--- Percentiles ---")
# (label, percentile) pairs, reported in ascending order
percentile_labels = (
    ("25th percentile (Q1)", 25),
    ("50th percentile (Median/Q2)", 50),
    ("75th percentile (Q3)", 75),
    ("90th percentile", 90),
    ("95th percentile", 95),
)
for label, pct in percentile_labels:
    print(f"{label}: {np.percentile(data, pct):.2f}")

# Interquartile Range (IQR): the width of the middle 50% of the data
q1, q3 = (np.percentile(data, pct) for pct in (25, 75))
iqr = q3 - q1
print(f"\nInterquartile Range (IQR): {iqr:.2f}")

In [None]:
# Quantiles (alternative to percentiles, expressed on a 0-1 scale)
data = np.random.normal(100, 15, 1000)  # Normal distribution

print("--- Quantiles ---")
quantiles = np.quantile(data, [0.25, 0.5, 0.75])
print(f"Quartiles: {quantiles}")

# Deciles (10 equal parts -> 9 cut points at 0.1, 0.2, ..., 0.9)
# np.linspace states the number of points explicitly; np.arange with a float
# step relies on floating-point rounding to decide how many points it returns.
deciles = np.quantile(data, np.linspace(0.1, 0.9, 9))
print(f"\nDeciles: {deciles}")

## 3. Axis-based Operations

Apply statistical functions along specific axes in multi-dimensional arrays.

In [None]:
# Grade table: one row per student, one column per subject
# Column order: Math, Science, English
grades = np.array([
    [85, 90, 78],  # Student 1
    [92, 88, 95],  # Student 2
    [78, 85, 82],  # Student 3
    [95, 92, 88],  # Student 4
    [88, 86, 90]   # Student 5
])

print("Grades (Students x Subjects):\n", grades)
print("Shape:", grades.shape)

# No axis argument: every element is reduced to a single number
print("\n--- Overall Statistics ---")
print(f"Overall mean: {np.mean(grades):.2f}")
print(f"Overall std: {np.std(grades):.2f}")

# The same four reductions, first with axis=0 (one value per subject column)
# and then with axis=1 (one value per student row)
for heading, label, axis in (
    ("\n--- Statistics per Subject (axis=0, down columns) ---", "Subject", 0),
    ("\n--- Statistics per Student (axis=1, across rows) ---", "Student", 1),
):
    print(heading)
    print(f"{label} means: {np.mean(grades, axis=axis)}")
    print(f"{label} std: {np.std(grades, axis=axis)}")
    print(f"{label} max: {np.max(grades, axis=axis)}")
    print(f"{label} min: {np.min(grades, axis=axis)}")

In [None]:
# 3D example: Multiple classes
# Shape: (classes, students, subjects)
all_grades = np.random.randint(70, 100, size=(3, 5, 3))

print("Shape (classes, students, subjects):", all_grades.shape)
print("\nClass 1 grades:\n", all_grades[0])

print("\n--- Statistics across different axes ---")
# Collapse students and subjects -> one mean per class
mean_per_class = np.mean(all_grades, axis=(1, 2))
# Collapse classes and students -> one mean per subject
mean_per_subject = np.mean(all_grades, axis=(0, 1))
# Collapse only the subject axis -> one mean per (class, student) pair.
# Printing the whole matrix keeps the label accurate; indexing [0] would
# show class 1 only.
mean_per_student = np.mean(all_grades, axis=2)

print(f"Mean per class: {mean_per_class}")
print(f"Mean per subject (all classes): {mean_per_subject}")
print(f"Mean per student (all subjects), one row per class:\n{mean_per_student}")

## 4. Correlation and Covariance

Measure relationships between variables.

In [None]:
# Build two related variables: scores are 8x the hours plus integer noise
np.random.seed(42)
study_hours = np.random.randint(1, 10, size=20)
# randint excludes the upper bound, so the noise lies in [-5, 4]
noise = np.random.randint(-5, 5, size=20)
test_scores = 8 * study_hours + noise

print("Study Hours:", study_hours)
print("Test Scores:", test_scores)

# Correlation: 2x2 matrix; the off-diagonal entry is the hours/scores coefficient
correlation = np.corrcoef(study_hours, test_scores)
hours_vs_scores_corr = correlation[0, 1]
print("\nCorrelation Matrix:\n", correlation)
print(f"\nCorrelation coefficient: {hours_vs_scores_corr:.3f}")

# Covariance: same layout; the diagonal holds each variable's variance
covariance = np.cov(study_hours, test_scores)
hours_vs_scores_cov = covariance[0, 1]
print("\nCovariance Matrix:\n", covariance)
print(f"Covariance: {hours_vs_scores_cov:.3f}")

In [None]:
# Correlation between several variables at once
# One measurement array per variable, eight observations each
height = np.array([165, 170, 180, 175, 160, 185, 172, 168])
weight = np.array([60, 70, 85, 75, 55, 90, 72, 65])
age = np.array([25, 30, 35, 28, 22, 40, 32, 27])

# Stack so that each ROW is one variable (the layout np.corrcoef uses by default)
data = np.vstack((height, weight, age))
print("Data shape:", data.shape)

# Pairwise correlation of every row with every other row
corr_matrix = np.corrcoef(data)
print("\nCorrelation Matrix:")
print("         Height  Weight  Age")
# Row labels are padded to a fixed width so the columns line up
for row_label, row in zip(("Height   ", "Weight   ", "Age      "), corr_matrix):
    print(row_label + "  ".join(f"{coef:.3f}" for coef in row))

## 5. Sorting and Ordering

In [None]:
# Sorting a 1D array
arr = np.array([64, 34, 25, 12, 22, 11, 90])
ascending = np.sort(arr)  # np.sort returns a sorted copy; arr itself is unchanged
print("Original:", arr)
print("Sorted:", ascending)
print("Sorted (descending):", ascending[::-1])

# argsort returns the positions that would put arr in ascending order
indices = np.argsort(arr)
print("\nSort indices:", indices)
print("Sorted using indices:", arr[indices])

In [None]:
# Sorting 2D arrays
arr_2d = np.array([[3, 2, 1],
                   [6, 5, 4],
                   [9, 8, 7]])

sorted_rows = np.sort(arr_2d, axis=1)  # axis=1: order the values within each row
sorted_cols = np.sort(arr_2d, axis=0)  # axis=0: order the values within each column

print("Original:\n", arr_2d)
# Every matrix starts on its own line so that its rows line up
print("\nSort each row:\n", sorted_rows)
print("\nSort each column:\n", sorted_cols)

In [None]:
# Partition - find k smallest/largest elements without a full sort
arr = np.array([64, 34, 25, 12, 22, 11, 90, 88, 45, 50])
print("Array:", arr)

# Number of elements to pick from each end; the labels below follow this value
k = 3

# Find the k smallest elements: np.partition(arr, k) puts the element of
# sorted rank k at index k with everything smaller before it, so the first
# k entries are the k smallest (in no particular order).
partitioned = np.partition(arr, k)
print(f"\nPartitioned at {k}:", partitioned)
print(f"{k} smallest: {partitioned[:k]}")

# Find the k largest elements: partitioning at -k leaves the k largest
# values in the last k slots (again unordered).
partitioned = np.partition(arr, -k)
print(f"{k} largest: {partitioned[-k:]}")

## 6. Unique Values and Counts

In [None]:
# Distinct values in an array
arr = np.array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5])
print("Array:", arr)

unique_values = np.unique(arr)
print("\nUnique values:", unique_values)

# return_counts=True also reports how often each distinct value occurs
unique, counts = np.unique(arr, return_counts=True)
print("\nValue\tCount")
print("\n".join(f"{val}\t{count}" for val, count in zip(unique, counts)))

In [None]:
# Bincount - tally non-negative integers; counts[i] is how many times i occurs
arr = np.array([0, 1, 1, 2, 2, 2, 3, 3, 3, 3])
counts = np.bincount(arr)
print("Array:", arr)
print("Counts:", counts)
print("\nIndex\tCount")
# Pair each possible value 0..max(arr) with its tally
for index, occurrences in zip(np.arange(counts.size), counts):
    print(f"{index}\t{occurrences}")

## 7. Histogram and Binning

In [None]:
# Create histogram
data = np.random.normal(100, 15, 1000)

# Create 10 equal-width bins spanning [data.min(), data.max()]
hist, bin_edges = np.histogram(data, bins=10)

print("Histogram:")
print("Bin Range\t\tCount")
# np.histogram bins are half-open [lo, hi) except the last one, which also
# includes its right edge (that is where data.max() is counted).
last_bin = len(hist) - 1
bin_labels = []
for i, (lo, hi) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
    closing = "]" if i == last_bin else ")"
    bin_labels.append(f"[{lo:.1f}, {hi:.1f}{closing}")

for label, count in zip(bin_labels, hist):
    print(f"{label}\t{count}")

print(f"\nTotal count: {hist.sum()}")

In [None]:
# Digitize - assign values to bins
data = np.array([1.2, 2.5, 3.7, 4.1, 5.8, 6.3, 7.9, 8.2])
bins = np.array([0, 3, 6, 9])  # Bins: [0-3), [3-6), [6-9)

# np.digitize returns i such that bins[i-1] <= x < bins[i]:
# 0 means "below the first edge", len(bins) means "at or above the last edge".
bin_indices = np.digitize(data, bins)
print("Data:", data)
print("Bins:", bins)
print("Bin indices:", bin_indices)


def describe_bin(bin_idx, bins):
    """Return the interval label for an index produced by np.digitize.

    Parameters
    ----------
    bin_idx : int
        Index returned by ``np.digitize(x, bins)`` for increasing ``bins``.
    bins : sequence of numbers
        The bin edges that were passed to ``np.digitize``.

    Returns
    -------
    str
        ``"[lo, hi)"`` for an interior bin, ``"[< first)"`` when the value is
        below the first edge, ``"[>= last)"`` when it is at or above the last.
    """
    if bin_idx == 0:
        # Without this case bins[bin_idx-1] would wrap around to the LAST edge
        return f"[< {bins[0]})"
    if bin_idx < len(bins):
        return f"[{bins[bin_idx-1]}, {bins[bin_idx]})"
    return f"[>= {bins[-1]})"


# Show which bin each value belongs to
for val, bin_idx in zip(data, bin_indices):
    print(f"{val:.1f} -> Bin {bin_idx} {describe_bin(bin_idx, bins)}")

## 8. Weighted Statistics

In [None]:
# Weighted average
values = np.array([85, 90, 78, 92])
# np.average divides by the sum of the weights, so they are not required to
# sum to 1 (these happen to)
weights = np.array([0.2, 0.3, 0.25, 0.25])

for title, series in (("Values", values), ("Weights", weights)):
    print(f"{title}:", series)

weighted_avg = np.average(values, weights=weights)
print(f"\nWeighted average: {weighted_avg:.2f}")

# Unweighted mean of the same values, for comparison
regular_avg = values.mean()
print(f"Regular average: {regular_avg:.2f}")

## 9. Practical Examples

In [None]:
# Example 1: Sales Analysis
np.random.seed(42)
daily_sales = np.random.normal(1000, 200, 30)  # 30 days of sales

# Centre and spread, reused for both the report and the outlier test
mean = np.mean(daily_sales)
std = np.std(daily_sales)

print("Daily Sales (first 10 days):", daily_sales[:10].astype(int))
print("\n--- Sales Statistics ---")
print(f"Mean daily sales: ${mean:.2f}")
print(f"Median daily sales: ${np.median(daily_sales):.2f}")
print(f"Std deviation: ${std:.2f}")
print(f"Best day: ${daily_sales.max():.2f}")
print(f"Worst day: ${daily_sales.min():.2f}")
print(f"Total monthly sales: ${daily_sales.sum():.2f}")

# Outliers: days more than two standard deviations away from the mean
lower_cutoff = mean - 2*std
upper_cutoff = mean + 2*std
outliers = daily_sales[(daily_sales > upper_cutoff) | (daily_sales < lower_cutoff)]
print(f"\nOutlier days: {len(outliers)}")

In [None]:
# Example 2: Student Performance Analysis
# One row per student; columns follow the order of `subjects` below
student_scores = np.array([
    [85, 90, 78, 88, 92],  # Student 1
    [92, 88, 95, 85, 90],  # Student 2
    [78, 85, 82, 90, 88],  # Student 3
    [95, 92, 88, 92, 85],  # Student 4
    [88, 86, 90, 87, 91],  # Student 5
])

subjects = ['Math', 'Science', 'English', 'History', 'Art']

print("--- Subject Performance ---")
# The transpose yields one row per subject, so each column of scores can be
# paired directly with its subject name
for subject, scores in zip(subjects, student_scores.T):
    print(f"\n{subject}:")
    print(f"  Mean: {np.mean(scores):.1f}")
    print(f"  Median: {np.median(scores):.1f}")
    print(f"  Std: {np.std(scores):.1f}")
    print(f"  Range: {np.min(scores)} - {np.max(scores)}")

print("\n--- Student Performance ---")
# Students are numbered from 1 in the report
for student_no, row in enumerate(student_scores, start=1):
    print(f"Student {student_no} average: {np.mean(row):.1f}")

# Subject with the highest class-wide mean
subject_means = np.mean(student_scores, axis=0)
best_subject_idx = np.argmax(subject_means)
best_name = subjects[best_subject_idx]
best_mean = subject_means[best_subject_idx]
print(f"\nBest performing subject: {best_name} ({best_mean:.1f})")

In [None]:
# Example 3: Z-score Normalization
data = np.random.normal(100, 15, 100)

# Centre and scale of the sample (np.std default: population formula)
sample_mean = np.mean(data)
sample_std = np.std(data)

print("Original data (first 10):", data[:10])
print(f"Mean: {sample_mean:.2f}, Std: {sample_std:.2f}")

# z = (x - mean) / std expresses each value in standard deviations from the mean
z_scores = (data - sample_mean) / sample_std

print("\nZ-scores (first 10):", z_scores[:10])
print(f"Z-score mean: {z_scores.mean():.10f}")  # Should be ~0
print(f"Z-score std: {z_scores.std():.10f}")    # Should be ~1

# Flag values lying more than two standard deviations from the mean
extreme = np.abs(z_scores) > 2
print(f"\nExtreme values (|z| > 2): {np.count_nonzero(extreme)} out of {data.size}")

## 🎯 Key Takeaways

1. **Descriptive statistics**: mean, median, std, var, min, max
2. **Percentiles**: Use `np.percentile()` or `np.quantile()` to understand distribution
3. **Axis parameter**: Control which dimension to aggregate (`axis=0` collapses the rows, giving one result per column; `axis=1` collapses the columns, giving one result per row)
4. **Correlation**: Measure relationships with `np.corrcoef()` and `np.cov()`
5. **Sorting**: Use `np.sort()` and `np.argsort()` for ordering data
6. **Unique values**: `np.unique()` with `return_counts=True` for frequency analysis
7. **Histograms**: `np.histogram()` for data distribution

## 📝 Practice Exercises

1. Calculate the 5-number summary (min, Q1, median, Q3, max) for a dataset
2. Identify and remove outliers using the IQR method
3. Normalize a dataset using min-max scaling (0 to 1)
4. Calculate moving averages for time series data
5. Find the most correlated pair of variables in a multi-variable dataset

In [None]:
# Your practice code here
