# Advanced Statistical Functions with SciPy

## Introduction

This notebook explores advanced SciPy features for probability and statistics:
- Combinatorics (combinations and permutations)
- Fitting distributions to data
- Monte Carlo methods
- Confidence intervals
- Agricultural applications

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy import stats
from scipy.special import comb, perm, factorial

# Notebook-wide plotting defaults: matplotlib style sheet and seaborn palette.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Seed NumPy's global RNG once up front so random draws in later cells repeat.
np.random.seed(42)

# The check mark was stored as mis-decoded UTF-8 ("âœ“"); restored to the real glyph.
print("✓ Libraries imported!")

## 1. Combinatorics

### Combinations: Order doesn't matter
**Example**: Selecting 3 fields from 10 for inspection

In [None]:
# Counting ways to pick k fields out of n for inspection.
#   Unordered (combinations): C(n, k) = n! / (k! * (n-k)!)
#   Ordered   (permutations): P(n, k) = n! / (n-k)!
n_fields, k_select = 10, 3

# exact=True returns arbitrary-precision Python ints rather than floats.
n_combinations = comb(n_fields, k_select, exact=True)
n_permutations = perm(n_fields, k_select, exact=True)

# Combinations report
print(
    f"Combinations: Selecting {k_select} fields from {n_fields}\n"
    f"C({n_fields}, {k_select}) = {n_combinations} ways\n"
    f"\nProbability any specific selection: 1/{n_combinations} = {1/n_combinations:.4%}"
)

# Permutations report; each unordered selection can be arranged k! ways,
# which is exactly the ratio P/C printed on the last line.
print(
    f"\nPermutations: Ordered selection\n"
    f"P({n_fields}, {k_select}) = {n_permutations} ways\n"
    f"\nDifference: Order matters for permutations!\n"
    f"Ratio: P/C = {n_permutations/n_combinations:.0f} = {k_select}! (ways to arrange {k_select} items)"
)

### Application: Sampling Strategy

In [None]:
# Problem: sample 5 plots from 20 to estimate average yield.
# With 3 diseased plots, what is P(sample contains at least 1 diseased plot)?

total_plots = 20
diseased_plots = 3
healthy_plots = total_plots - diseased_plots
sample_size = 5

# Total ways to select 5 from 20
total_ways = comb(total_plots, sample_size, exact=True)

# Ways to draw the whole sample from healthy plots only (the sample misses the disease)
ways_all_healthy = comb(healthy_plots, sample_size, exact=True)

# Complement rule: P(at least 1 diseased) = 1 - P(all healthy)
p_all_healthy = ways_all_healthy / total_ways
p_at_least_one_diseased = 1 - p_all_healthy

# Sanity check: the count of diseased plots in the sample is hypergeometric,
# so the survival function at 0 must give the same probability.
assert np.isclose(
    p_at_least_one_diseased,
    stats.hypergeom.sf(0, total_plots, diseased_plots, sample_size),
), "combinatorial result disagrees with the hypergeometric distribution"

print(f"Sampling Strategy: {sample_size} plots from {total_plots} ({diseased_plots} diseased)")
print("="*60)
print(f"Total sampling combinations: {total_ways:,}")
print(f"Combinations with no disease: {ways_all_healthy:,}")
print(f"\nP(all healthy) = {p_all_healthy:.4f}")
print(f"P(at least 1 diseased) = {p_at_least_one_diseased:.4f} = {p_at_least_one_diseased:.1%}")
# The light-bulb emoji was stored as mis-decoded UTF-8; restored to the real glyph.
print(f"\n💡 Good chance ({p_at_least_one_diseased:.0%}) of detecting disease with this sample size!")

## 2. Fitting Distributions to Data

Given agricultural data, estimate the parameters of a candidate probability distribution (here, a normal) and check how well it fits.

In [None]:
# Generate synthetic crop yield data (re-seeded so this cell is repeatable on its own)
np.random.seed(42)
true_mean, true_std = 75, 12
yield_data = np.random.normal(true_mean, true_std, size=200)

# ddof=1 gives the unbiased sample standard deviation, matching the
# confidence-interval section of this notebook.
sample_std = yield_data.std(ddof=1)

print("Fitting Distribution to Crop Yield Data")
print("="*60)
print(f"Data: {len(yield_data)} observations")
print(f"Sample mean: {yield_data.mean():.2f}")
print(f"Sample std: {sample_std:.2f}")

# Fit normal distribution. norm.fit returns maximum-likelihood estimates, so
# sigma is the ddof=0 standard deviation and is slightly smaller than sample_std.
mu, sigma = stats.norm.fit(yield_data)
print(f"\nFitted Normal Distribution:")
print(f"  μ (mu) = {mu:.2f}")
print(f"  σ (sigma) = {sigma:.2f}")

# Goodness of fit test (Kolmogorov-Smirnov).
# Caveat: mu and sigma were estimated from the same data being tested, which
# makes the standard KS p-value too large (the test becomes conservative).
# Treat it as a rough check; a Lilliefors-type correction is needed for an
# exact significance level.
ks_stat, ks_pvalue = stats.kstest(yield_data, stats.norm(mu, sigma).cdf)
print(f"\nKolmogorov-Smirnov Test:")
print(f"  Statistic: {ks_stat:.4f}")
print(f"  P-value: {ks_pvalue:.4f}")
if ks_pvalue > 0.05:
    print(f"  ✓ Normal distribution is a good fit (p > 0.05)")
else:
    # Previously a rejected fit printed nothing at all.
    print(f"  ✗ Normal distribution is a poor fit (p <= 0.05)")

# Visualize the fit: histogram + fitted density on the left, Q-Q plot on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Histogram with fitted PDF (density=True puts the bars on the same scale as the PDF)
ax1.hist(yield_data, bins=30, density=True, alpha=0.7, color='skyblue', edgecolor='black', label='Data')
x = np.linspace(yield_data.min(), yield_data.max(), 100)
# Greek letters in the legend were stored as mis-decoded UTF-8; restored to μ and σ.
ax1.plot(x, stats.norm(mu, sigma).pdf(x), 'r-', linewidth=2, label=f'Fitted Normal\nμ={mu:.1f}, σ={sigma:.1f}')
ax1.set_xlabel('Yield (bushels/acre)')
ax1.set_ylabel('Probability Density')
ax1.set_title('Histogram with Fitted Distribution')
ax1.legend()

# Q-Q plot (quantile-quantile): sample quantiles against theoretical normal quantiles
stats.probplot(yield_data, dist="norm", plot=ax2)
ax2.set_title('Q-Q Plot: Checking Normality')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n💡 Q-Q Plot: Points on diagonal indicate good fit to normal distribution")

## 3. Monte Carlo Simulation: Complex Risk Assessment

In [None]:
# Scenario: should we invest in irrigation?
# Yield and price are both uncertain, so the 10-year outcome is simulated
# many times with and without irrigation and the two are compared.

print("Monte Carlo: Irrigation Investment Decision")
print("="*60)

# Re-seed here so the cell gives the same numbers no matter which cells ran
# before it (scipy's .rvs() draws from NumPy's global RNG by default).
np.random.seed(42)

n_sim = 10000
irrigation_cost = 50000  # upfront
years = 10

# NOTE(review): these two constants were unlabeled literals (100 and 250) in
# the original. Presumably 100 is the farm size in acres and 250 the operating
# cost per acre per year — confirm with the author.
size_multiplier = 100
cost_per_unit_year = 250
operating_cost = cost_per_unit_year * years * size_multiplier

# Decision rule: invest only if this share of simulations shows a positive benefit
min_success_prob = 0.7

# Without irrigation. The revenue_* names are kept for compatibility, but the
# values are net of operating cost, i.e. profit (as the report below says).
yield_no_irr = stats.norm(65, 15).rvs((n_sim, years))
price_no_irr = stats.norm(5.5, 0.8).rvs((n_sim, years))
revenue_no_irr = (yield_no_irr * price_no_irr * size_multiplier).sum(axis=1) - operating_cost

# With irrigation: higher and more consistent yield, plus the upfront cost.
# NOTE(review): prices are drawn independently for the two scenarios; sharing
# one price draw would give a paired comparison — confirm which is intended.
yield_with_irr = stats.norm(85, 8).rvs((n_sim, years))
price_with_irr = stats.norm(5.5, 0.8).rvs((n_sim, years))
revenue_with_irr = (
    (yield_with_irr * price_with_irr * size_multiplier).sum(axis=1)
    - operating_cost
    - irrigation_cost
)

# Analyze: per-simulation difference between the two scenarios
net_benefit = revenue_with_irr - revenue_no_irr
p_positive = (net_benefit > 0).mean()

print(f"\nResults from {n_sim:,} simulations:")
print(f"\nWithout Irrigation:")
print(f"  Mean profit: ${revenue_no_irr.mean():,.0f}")
print(f"  Std dev: ${revenue_no_irr.std():,.0f}")
print(f"\nWith Irrigation:")
print(f"  Mean profit: ${revenue_with_irr.mean():,.0f}")
print(f"  Std dev: ${revenue_with_irr.std():,.0f}")
print(f"\nNet Benefit of Irrigation:")
print(f"  Mean: ${net_benefit.mean():,.0f}")
print(f"  P(positive benefit) = {p_positive:.1%}")
print(f"  P(benefit > $50k) = {(net_benefit > 50000).mean():.1%}")

# Decision (check/cross marks were stored as mis-decoded UTF-8; restored)
if net_benefit.mean() > 0 and p_positive > min_success_prob:
    print(f"\n✓ INVEST: High probability of positive returns")
else:
    print(f"\n✗ DON'T INVEST: Too risky")

# Visualize the simulation results in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Outcome distributions. The plotted values are net of costs (the report
# prints them as "profit"), so the panel is labelled as profit, not revenue.
axes[0,0].hist([revenue_no_irr, revenue_with_irr], bins=50, label=['No Irrigation', 'With Irrigation'],
              alpha=0.6, edgecolor='black')
axes[0,0].set_xlabel('Total Profit ($)')
axes[0,0].set_ylabel('Frequency')
axes[0,0].set_title('Profit Distributions')
axes[0,0].legend()

# Net benefit, with break-even and mean marked
axes[0,1].hist(net_benefit, bins=50, color='green', alpha=0.7, edgecolor='black')
axes[0,1].axvline(0, color='red', linestyle='--', linewidth=2, label='Break-even')
axes[0,1].axvline(net_benefit.mean(), color='blue', linestyle='--', linewidth=2, label=f'Mean: ${net_benefit.mean():,.0f}')
axes[0,1].set_xlabel('Net Benefit ($)')
axes[0,1].set_ylabel('Frequency')
axes[0,1].set_title('Net Benefit Distribution')
axes[0,1].legend()

# Empirical CDF: the height at x is the share of simulations with benefit <= x.
# The old title ("Probability of Achieving Benefit Level") described the
# complement of what is plotted, so the title now names the curve itself.
sorted_benefit = np.sort(net_benefit)
cumulative = np.arange(1, len(sorted_benefit)+1) / len(sorted_benefit)
axes[1,0].plot(sorted_benefit, cumulative, 'g-', linewidth=2)
axes[1,0].axvline(0, color='red', linestyle='--', linewidth=2)
axes[1,0].set_xlabel('Net Benefit ($)')
axes[1,0].set_ylabel('Cumulative Probability')
axes[1,0].set_title('Cumulative Distribution of Net Benefit')
axes[1,0].grid(True, alpha=0.3)

# Risk-return scatter: one point per scenario (std dev on x, mean on y)
axes[1,1].scatter([revenue_no_irr.std()], [revenue_no_irr.mean()], 
                 s=200, c='orange', edgecolor='black', linewidth=2, label='No Irrigation', zorder=3)
axes[1,1].scatter([revenue_with_irr.std()], [revenue_with_irr.mean()], 
                 s=200, c='blue', edgecolor='black', linewidth=2, label='With Irrigation', zorder=3)
axes[1,1].set_xlabel('Risk (Std Dev)')
axes[1,1].set_ylabel('Return (Mean)')
axes[1,1].set_title('Risk-Return Trade-off')
axes[1,1].legend()
axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Confidence Intervals

In [None]:
# Sample data: observed yields from 30 fields
np.random.seed(42)
sample_yields = np.random.normal(75, 12, size=30)


def t_confidence_interval(data, confidence):
    """Return the two-sided t-based confidence interval for the mean of `data`.

    Parameters
    ----------
    data : array-like of float
        Observations; at least two are needed to estimate the standard error.
    confidence : float
        Confidence level in (0, 1), e.g. 0.95.

    Returns
    -------
    tuple of float
        (lower, upper) bounds from a t-distribution with len(data) - 1 degrees
        of freedom, centred on the sample mean and scaled by the standard error.

    Raises
    ------
    ValueError
        If `data` has fewer than two observations.
    """
    values = np.asarray(data, dtype=float)
    if values.size < 2:
        raise ValueError("need at least two observations for a confidence interval")
    # The level is passed positionally: its keyword name differs across SciPy versions.
    return stats.t.interval(confidence, values.size - 1,
                            loc=values.mean(),
                            scale=stats.sem(values))


print("Confidence Intervals for Mean Yield")
print("="*60)
print(f"Sample size: {len(sample_yields)}")
print(f"Sample mean: {sample_yields.mean():.2f}")
print(f"Sample std: {sample_yields.std(ddof=1):.2f}")

# Calculate every interval once and reuse it below
confidence_levels = [0.90, 0.95, 0.99]
intervals = {conf: t_confidence_interval(sample_yields, conf) for conf in confidence_levels}

print(f"\nConfidence Intervals:")
for conf, ci in intervals.items():
    width = ci[1] - ci[0]
    print(f"  {conf:.0%}: [{ci[0]:.2f}, {ci[1]:.2f}] (width: {width:.2f})")

# Reuse the 95% interval already computed instead of repeating the call.
ci_95 = intervals[0.95]
print(f"\n💡 Interpretation: We're 95% confident the true mean yield is between")
print(f"   {ci_95[0]:.2f} and {ci_95[1]:.2f} bushels/acre")

## Summary

### Advanced SciPy Features

**Combinatorics:**
- `comb(n, k)`: Combinations
- `perm(n, k)`: Permutations
- Applications in sampling strategies

**Distribution Fitting:**
- `.fit()`: Estimate parameters from data
- `kstest()`: Goodness of fit
- Q-Q plots: Visual assessment

**Monte Carlo:**
- Simulate complex scenarios
- Risk-return analysis
- Probability estimation for rare events

**Confidence Intervals:**
- Quantify uncertainty in estimates
- t-distribution for small samples
- Trade-off: confidence vs. precision

### Next Steps

Apply all concepts to real agricultural problems in Section 4: Agricultural Applications!