# SciTeX Statistics Module - Statistical Analysis

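The `scitex.stats` module provides convenient statistical functions commonly used in scientific research. This notebook walks through a typical analysis workflow: the computations use SciPy, scikit-learn, and statsmodels, while SciTeX (`stx.plt`, `stx.io`) handles plotting and saving the results.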

In [None]:
import scitex as stx
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Set random seed
np.random.seed(42)

# Configure display
pd.set_option('display.precision', 3)

## 1. Basic Descriptive Statistics

In [None]:
# Draw three synthetic samples with different shapes (symmetric, right-skewed, two-peaked).
# The draws happen in the order normal -> gamma -> two normals so the seeded stream is reproducible.
n_samples = 1000
half_samples = n_samples // 2

normal_sample = np.random.normal(100, 15, n_samples)
skewed_sample = np.random.gamma(2, 2, n_samples) * 10
bimodal_sample = np.concatenate([
    np.random.normal(80, 10, half_samples),
    np.random.normal(120, 10, half_samples),
])

data = {
    'normal': normal_sample,
    'skewed': skewed_sample,
    'bimodal': bimodal_sample,
}

# Each entry maps a row label of the summary table to the function that computes it.
summary_functions = {
    'mean': np.mean,
    'median': np.median,
    'std': lambda sample: np.std(sample, ddof=1),  # sample standard deviation
    'skewness': stats.skew,
    'kurtosis': stats.kurtosis,
    'min': np.min,
    'max': np.max,
    'Q1': lambda sample: np.percentile(sample, 25),
    'Q3': lambda sample: np.percentile(sample, 75),
}

# One column per distribution, one row per statistic.
desc_stats = pd.DataFrame({
    label: {stat_name: compute(sample) for stat_name, compute in summary_functions.items()}
    for label, sample in data.items()
})

print("Descriptive Statistics:")
print(desc_stats.T)

# One density histogram per distribution, with mean and median marked as vertical lines.
fig, axes = stx.plt.subplots(1, 3, figsize=(15, 5))

for ax, name in zip(axes, data):
    values = data[name]
    mean_value = np.mean(values)
    median_value = np.median(values)

    ax.hist(values, bins=50, alpha=0.7, density=True, edgecolor='black')
    ax.axvline(mean_value, color='red', linestyle='--', label=f'Mean: {mean_value:.1f}')
    ax.axvline(median_value, color='green', linestyle='--', label=f'Median: {median_value:.1f}')
    ax.set_xyt('Value', 'Density', f'{name.capitalize()} Distribution')
    ax.legend()
    ax.grid(True, alpha=0.3)

fig.tight_layout()
stx.io.save(fig, './stats/distributions_comparison.png')
stx.plt.show()

## 2. Hypothesis Testing

In [None]:
# Generate experimental data: one control group and two treatment groups (n = 100 each)
control = np.random.normal(100, 15, 100)
treatment1 = np.random.normal(105, 15, 100)
treatment2 = np.random.normal(110, 15, 100)

# Defined unconditionally: the plotting cell below needs `group_names`
# even when the ANOVA turns out not to be significant.
groups = [control, treatment1, treatment2]
group_names = ['Control', 'Treatment 1', 'Treatment 2']

# Two-sample t-test
t_stat, p_value = stats.ttest_ind(control, treatment1)

# Cohen's d with the pooled standard deviation built from *sample* variances (ddof=1).
# Both groups have the same size, so the plain average of the variances is the pooled variance.
pooled_sd = np.sqrt((np.var(control, ddof=1) + np.var(treatment1, ddof=1)) / 2)
cohen_d = (np.mean(treatment1) - np.mean(control)) / pooled_sd

print("Two-Sample T-Test Results:")
print(f"  t-statistic: {t_stat:.3f}")
print(f"  p-value: {p_value:.4f}")
print(f"  Cohen's d: {cohen_d:.3f}")
print(f"  Significant at α=0.05: {'Yes' if p_value < 0.05 else 'No'}")

# One-way ANOVA
f_stat, p_anova = stats.f_oneway(*groups)
print("\nOne-Way ANOVA Results:")
print(f"  F-statistic: {f_stat:.3f}")
print(f"  p-value: {p_anova:.4f}")

# Post-hoc tests (only if the ANOVA is significant)
if p_anova < 0.05:
    print("\nPost-hoc Pairwise T-Tests (Bonferroni corrected):")
    # Bonferroni: divide alpha by the number of pairwise comparisons, k * (k - 1) / 2
    n_comparisons = len(groups) * (len(groups) - 1) // 2
    alpha_corrected = 0.05 / n_comparisons

    for i in range(len(groups)):
        for j in range(i + 1, len(groups)):
            t, p = stats.ttest_ind(groups[i], groups[j])
            print(f"  {group_names[i]} vs {group_names[j]}: p = {p:.4f} {'*' if p < alpha_corrected else ''}")

In [None]:
# Visualize group comparisons
def _cohens_d_pooled(sample_a, sample_b):
    """Return Cohen's d of ``sample_b`` relative to ``sample_a``.

    The denominator is the pooled standard deviation computed from the
    sample variances (ddof=1), weighted by each group's degrees of freedom.
    """
    n_a, n_b = len(sample_a), len(sample_b)
    pooled_var = (
        (n_a - 1) * np.var(sample_a, ddof=1) + (n_b - 1) * np.var(sample_b, ddof=1)
    ) / (n_a + n_b - 2)
    return (np.mean(sample_b) - np.mean(sample_a)) / np.sqrt(pooled_var)


fig, (ax1, ax2) = stx.plt.subplots(1, 2, figsize=(12, 5))

# Box plot
# Labels are defined here so the cell does not depend on a name that the
# previous cell only creates when the ANOVA is significant.
group_names = ['Control', 'Treatment 1', 'Treatment 2']
data_for_plot = [control, treatment1, treatment2]
ax1.boxplot(data_for_plot, labels=group_names)
ax1.set_xyt('Group', 'Value', 'Group Comparison (Box Plot)')
ax1.grid(True, alpha=0.3, axis='y')

# Add a significance bracket between Control and Treatment 1.
# `p_value` is the uncorrected Control-vs-Treatment-1 t-test from the previous cell.
y_max = max([max(d) for d in data_for_plot]) + 5
if p_value < 0.05:
    ax1.plot([1, 2], [y_max, y_max], 'k-')
    ax1.text(1.5, y_max + 1, '*', ha='center', fontsize=14)

# Effect size visualization (all three pairs computed the same way)
effect_sizes = [
    ('Control vs T1', _cohens_d_pooled(control, treatment1)),
    ('Control vs T2', _cohens_d_pooled(control, treatment2)),
    ('T1 vs T2', _cohens_d_pooled(treatment1, treatment2)),
]

labels, d_values = zip(*effect_sizes)
# Bar colour follows the conventional 0.5 / 0.8 thresholds for |d|
colors = ['red' if abs(d) > 0.8 else 'orange' if abs(d) > 0.5 else 'green' for d in d_values]
bars = ax2.barh(labels, d_values, color=colors, alpha=0.7)
ax2.axvline(0, color='black', linestyle='-', linewidth=0.5)
ax2.set_xyt("Cohen's d", 'Comparison', 'Effect Sizes')
ax2.grid(True, alpha=0.3, axis='x')

# Shaded bands for effect size interpretation (only the positive bands carry legend labels)
ax2.axvspan(-0.2, 0.2, alpha=0.1, color='gray', label='Small')
ax2.axvspan(0.5, 0.8, alpha=0.1, color='orange', label='Medium')
ax2.axvspan(0.8, 2, alpha=0.1, color='red', label='Large')
ax2.axvspan(-0.8, -0.5, alpha=0.1, color='orange')
ax2.axvspan(-2, -0.8, alpha=0.1, color='red')
ax2.legend(loc='upper right')

fig.tight_layout()
stx.io.save(fig, './stats/hypothesis_testing.png')
stx.plt.show()

## 3. Correlation and Regression Analysis

In [None]:
# Simulate one predictor and three responses: linear, quadratic, and unrelated.
n = 200
x = np.random.normal(0, 1, n)
y_linear = 2 * x + np.random.normal(0, 0.5, n)
y_nonlinear = x**2 + np.random.normal(0, 0.5, n)
y_uncorr = np.random.normal(0, 1, n)

# Apply every correlation measure to every response.
correlation_methods = {
    'Pearson': stats.pearsonr,
    'Spearman': stats.spearmanr,
    'Kendall': stats.kendalltau,
}
responses = {
    'Linear': y_linear,
    'Nonlinear': y_nonlinear,
    'Uncorrelated': y_uncorr,
}
correlations = {
    relationship_name: {
        method_name: compute(x, response)
        for method_name, compute in correlation_methods.items()
    }
    for relationship_name, response in responses.items()
}

# Print coefficient and p-value for each (relationship, method) pair.
print("Correlation Analysis:")
print("-" * 60)
for relationship, methods in correlations.items():
    print(f"\n{relationship} Relationship:")
    for method, result in methods.items():
        corr, p_val = result
        print(f"  {method}: r = {corr:.3f}, p = {p_val:.4f}")

# Top row: scatter plots with a fitted straight line; bottom row: QQ plots of that line's residuals.
fig, axes = stx.plt.subplots(2, 3, figsize=(15, 10))

datasets = [('Linear', x, y_linear), ('Nonlinear', x, y_nonlinear), ('Uncorrelated', x, y_uncorr)]
annotation_box = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

# Scatter plots
for ax, (name, x_data, y_data) in zip(axes[0], datasets):
    ax.scatter(x_data, y_data, alpha=0.6)

    # Least-squares line of degree 1, drawn over the sorted x values
    z = np.polyfit(x_data, y_data, 1)
    p = np.poly1d(z)
    x_sorted = sorted(x_data)
    ax.plot(x_sorted, p(x_sorted), 'r--', linewidth=2)

    # Pearson coefficient shown in the upper-left corner of the panel
    r = stats.pearsonr(x_data, y_data)[0]
    ax.text(0.05, 0.95, f'r = {r:.3f}', transform=ax.transAxes, bbox=annotation_box)

    ax.set_xyt('X', 'Y', f'{name} Relationship')
    ax.grid(True, alpha=0.3)

# QQ plots for residuals
for ax, (name, x_data, y_data) in zip(axes[1], datasets):
    # Residuals of the same degree-1 fit used in the scatter row
    z = np.polyfit(x_data, y_data, 1)
    p = np.poly1d(z)
    residuals = y_data - p(x_data)

    # Compare residual quantiles against a normal distribution
    stats.probplot(residuals, dist="norm", plot=ax)
    ax.set_title(f'QQ Plot - {name}')
    ax.grid(True, alpha=0.3)

fig.tight_layout()
stx.io.save(fig, './stats/correlation_analysis.png')
stx.plt.show()

## 4. Multiple Regression Analysis

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Generate multivariate data
n_samples = 500
n_features = 5

# True coefficients on the raw (unscaled) feature scale; X4 has no effect
true_coef = np.array([3, -2, 1.5, 0, 4])

# Generate features and a noisy linear response
X = np.random.randn(n_samples, n_features)
y = X @ true_coef + np.random.normal(0, 2, n_samples)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features (the scaler is fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Predictions
y_pred_train = model.predict(X_train_scaled)
y_pred_test = model.predict(X_test_scaled)

# Calculate R-squared and MSE
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print("Multiple Regression Results:")
print("-" * 40)
print(f"R² (train): {r2_train:.3f}")
print(f"R² (test): {r2_test:.3f}")
print(f"MSE (train): {mse_train:.3f}")
print(f"MSE (test): {mse_test:.3f}")

# Compare coefficients.
# The model was fitted on standardized features, so `model.coef_` is expressed per
# standard deviation of each feature. Dividing by the scaler's per-feature scale
# converts the estimates back to the raw scale on which `true_coef` is defined.
estimated_coef = model.coef_ / scaler.scale_

coef_comparison = pd.DataFrame({
    'True': true_coef,
    'Estimated': estimated_coef,
    'Difference': estimated_coef - true_coef
}, index=[f'X{i+1}' for i in range(n_features)])

print("\nCoefficient Comparison:")
print(coef_comparison)

In [None]:
# Visualize regression results
fig, axes = stx.plt.subplots(2, 2, figsize=(12, 10))

# Actual vs Predicted (the dashed diagonal marks perfect prediction)
ax = axes[0, 0]
ax.scatter(y_test, y_pred_test, alpha=0.6)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
ax.set_xyt('Actual', 'Predicted', f'Actual vs Predicted (R² = {r2_test:.3f})')
ax.grid(True, alpha=0.3)

# Residuals against predictions on the test split
ax = axes[0, 1]
residuals = y_test - y_pred_test
ax.scatter(y_pred_test, residuals, alpha=0.6)
ax.axhline(y=0, color='r', linestyle='--')
ax.set_xyt('Predicted', 'Residuals', 'Residual Plot')
ax.grid(True, alpha=0.3)

# Coefficient plot.
# `model` was fitted on standardized features, so its coefficients are divided by the
# scaler's per-feature scale to put them on the same raw scale as `true_coef`.
ax = axes[1, 0]
estimated_coef_raw = model.coef_ / scaler.scale_
x_pos = np.arange(n_features)
width = 0.35
ax.bar(x_pos - width/2, true_coef, width, label='True', alpha=0.7)
ax.bar(x_pos + width/2, estimated_coef_raw, width, label='Estimated', alpha=0.7)
ax.set_xticks(x_pos)
ax.set_xticklabels([f'X{i+1}' for i in range(n_features)])
ax.set_xyt('Feature', 'Coefficient', 'Coefficient Comparison')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Feature importance: absolute *standardized* coefficients, sorted from largest to smallest.
# The standardized scale is kept here on purpose so features are comparable with each other.
ax = axes[1, 1]
feature_importance = np.abs(model.coef_)
sorted_idx = np.argsort(feature_importance)[::-1]
ax.bar(range(n_features), feature_importance[sorted_idx])
ax.set_xticks(range(n_features))
ax.set_xticklabels([f'X{i+1}' for i in sorted_idx])
ax.set_xyt('Feature', 'Absolute Coefficient', 'Feature Importance')
ax.grid(True, alpha=0.3, axis='y')

fig.tight_layout()
stx.io.save(fig, './stats/multiple_regression.png')
stx.plt.show()

## 5. Non-parametric Tests

In [None]:
# Three right-skewed samples drawn from exponential distributions with increasing scale
group1 = np.random.exponential(scale=2, size=50)
group2 = np.random.exponential(scale=2.5, size=50)
group3 = np.random.exponential(scale=3, size=50)

# Shapiro-Wilk normality check; p > 0.05 is reported as "Normal"
print("Normality Tests (Shapiro-Wilk):")
for i, group in enumerate((group1, group2, group3), start=1):
    stat, p_val = stats.shapiro(group)
    if p_val > 0.05:
        verdict = '(Normal)'
    else:
        verdict = '(Not Normal)'
    print(f"  Group {i}: W = {stat:.3f}, p = {p_val:.4f} {verdict}")

# Two independent groups: Mann-Whitney U
u_stat, p_mw = stats.mannwhitneyu(group1, group2, alternative='two-sided')
print("\n".join([
    "\nMann-Whitney U Test (Group 1 vs 2):",
    f"  U-statistic: {u_stat:.1f}",
    f"  p-value: {p_mw:.4f}",
]))

# More than two independent groups: Kruskal-Wallis
h_stat, p_kw = stats.kruskal(group1, group2, group3)
print("\n".join([
    "\nKruskal-Wallis Test (all groups):",
    f"  H-statistic: {h_stat:.3f}",
    f"  p-value: {p_kw:.4f}",
]))

# Paired measurements: Wilcoxon signed-rank.
# `after` is `before` plus a noisy shift with mean 3.
before = np.random.normal(100, 10, 30)
after = before + np.random.normal(3, 5, 30)
w_stat, p_wilcox = stats.wilcoxon(before, after)
print("\n".join([
    "\nWilcoxon Signed-Rank Test (paired):",
    f"  W-statistic: {w_stat:.1f}",
    f"  p-value: {p_wilcox:.4f}",
]))

In [None]:
# Four panels: group histograms, group box plots, paired scatter, paired differences
fig, axes = stx.plt.subplots(2, 2, figsize=(12, 10))

group_samples = [group1, group2, group3]
group_labels = ['Group 1', 'Group 2', 'Group 3']

# Overlaid density histograms of the three groups
ax = axes[0, 0]
for group, label in zip(group_samples, group_labels):
    ax.hist(group, bins=20, alpha=0.5, label=label, density=True)
ax.set_xyt('Value', 'Density', 'Non-Normal Distributions')
ax.legend()
ax.grid(True, alpha=0.3)

# Side-by-side box plots
ax = axes[0, 1]
ax.boxplot(group_samples, labels=group_labels)
ax.set_xyt('Group', 'Value', 'Box Plot Comparison')
ax.grid(True, alpha=0.3, axis='y')

# Paired data: points above the dashed identity line increased from before to after
ax = axes[1, 0]
identity_span = [before.min(), before.max()]
ax.scatter(before, after, alpha=0.6)
ax.plot(identity_span, identity_span, 'r--', linewidth=2)
ax.set_xyt('Before', 'After', 'Paired Data (Wilcoxon Test)')
ax.grid(True, alpha=0.3)

# Histogram of paired differences with zero and the median marked
ax = axes[1, 1]
differences = after - before
median_difference = np.median(differences)
ax.hist(differences, bins=20, alpha=0.7, edgecolor='black')
ax.axvline(0, color='red', linestyle='--', linewidth=2)
ax.axvline(median_difference, color='green', linestyle='--',
           label=f'Median: {median_difference:.2f}')
ax.set_xyt('Difference (After - Before)', 'Count', 'Distribution of Differences')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

fig.tight_layout()
stx.io.save(fig, './stats/nonparametric_tests.png')
stx.plt.show()

## 6. Power Analysis and Sample Size Calculation

In [None]:
from statsmodels.stats.power import FTestAnovaPower, TTestIndPower, TTestPower

# Power analysis for the independent two-sample t-test.
# TTestIndPower is the class for two independent groups; its `solve_power`
# takes `nobs1` (size of the first group) and assumes equal group sizes by default.
# TTestPower covers the one-sample / paired case and does not accept `nobs1`.
power_analysis = TTestIndPower()

# Calculate sample size for desired power
effect_size = 0.5  # Medium effect
alpha = 0.05
power = 0.8

# `nobs1` is left unset, so solve_power solves for the per-group sample size
sample_size = power_analysis.solve_power(effect_size=effect_size,
                                         alpha=alpha,
                                         power=power)

print("Power Analysis for Two-Sample T-Test:")
print(f"  Effect size (d): {effect_size}")
print(f"  Alpha: {alpha}")
print(f"  Desired power: {power}")
# Round up: a fractional participant cannot be recruited, and rounding down would miss the target power
print(f"  Required sample size per group: {np.ceil(sample_size):.0f}")

# Power curves
sample_sizes = np.arange(10, 200, 5)
effect_sizes = [0.2, 0.5, 0.8]  # Small, medium, large

fig, (ax1, ax2) = stx.plt.subplots(1, 2, figsize=(12, 5))

# Power vs sample size (`power` is left unset, so solve_power returns the achieved power)
for d in effect_sizes:
    powers = [power_analysis.solve_power(effect_size=d,
                                         nobs1=n_per_group,
                                         alpha=alpha)
              for n_per_group in sample_sizes]
    ax1.plot(sample_sizes, powers, label=f'd = {d}')

ax1.axhline(y=0.8, color='red', linestyle='--', alpha=0.5)
ax1.set_xyt('Sample Size per Group', 'Statistical Power', 'Power Curves')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.set_ylim(0, 1)

# Sample size vs effect size
effect_range = np.linspace(0.1, 1.5, 50)
powers_target = [0.7, 0.8, 0.9]

for power_target in powers_target:
    sample_sizes_needed = [power_analysis.solve_power(effect_size=d,
                                                      alpha=alpha,
                                                      power=power_target)
                           for d in effect_range]
    ax2.plot(effect_range, sample_sizes_needed, label=f'Power = {power_target}')

ax2.set_xyt('Effect Size (d)', 'Sample Size per Group', 'Sample Size Requirements')
ax2.legend()
ax2.grid(True, alpha=0.3)
# The y-axis is capped at 200, so requirements for the smallest effects run off the top of the panel
ax2.set_ylim(0, 200)

fig.tight_layout()
stx.io.save(fig, './stats/power_analysis.png')
stx.plt.show()

## 7. Bootstrapping and Confidence Intervals

In [None]:
from scipy.stats import bootstrap

# Reproducible right-skewed sample (gamma distributed) used by the bootstrap below
np.random.seed(42)
sample_data = np.random.gamma(shape=2, scale=2, size=100)


# Statistics whose sampling distribution will be bootstrapped
def mean_statistic(x):
    """Return the arithmetic mean of ``x``."""
    return np.mean(x)


def median_statistic(x):
    """Return the median of ``x``."""
    return np.median(x)


def trimmed_mean(x, trim=0.1):
    """Return the mean of ``x`` after cutting the fraction ``trim`` from each tail."""
    return stats.trim_mean(x, proportiontocut=trim)

# Bootstrap settings
n_bootstrap = 10000
confidence_level = 0.95

# Draw all resamples first (with replacement, same size as the original sample),
# then evaluate each statistic on the same set of resamples.
rng = np.random.default_rng(42)
n_observations = len(sample_data)
resamples = [
    rng.choice(sample_data, size=n_observations, replace=True)
    for _ in range(n_bootstrap)
]

bootstrap_means = [mean_statistic(resample) for resample in resamples]
bootstrap_medians = [median_statistic(resample) for resample in resamples]
bootstrap_trimmed = [trimmed_mean(resample) for resample in resamples]

# Percentile confidence intervals: cut alpha/2 from each tail of the bootstrap distribution
alpha = 1 - confidence_level
lower_percentile = (alpha/2) * 100
upper_percentile = (1 - alpha/2) * 100
percentile_bounds = [lower_percentile, upper_percentile]

ci_mean = np.percentile(bootstrap_means, percentile_bounds)
ci_median = np.percentile(bootstrap_medians, percentile_bounds)
ci_trimmed = np.percentile(bootstrap_trimmed, percentile_bounds)

# Point estimate from the original sample, followed by its bootstrap interval
print("Bootstrap Confidence Intervals (95%):")
print(f"  Mean: {np.mean(sample_data):.3f} [{ci_mean[0]:.3f}, {ci_mean[1]:.3f}]")
print(f"  Median: {np.median(sample_data):.3f} [{ci_median[0]:.3f}, {ci_median[1]:.3f}]")
print(f"  Trimmed Mean: {trimmed_mean(sample_data):.3f} [{ci_trimmed[0]:.3f}, {ci_trimmed[1]:.3f}]")

In [None]:
# Visualize bootstrap distributions
fig, axes = stx.plt.subplots(2, 2, figsize=(12, 10))

# Original data
ax = axes[0, 0]
ax.hist(sample_data, bins=30, alpha=0.7, density=True, edgecolor='black')
ax.axvline(np.mean(sample_data), color='red', linestyle='--', label='Mean')
ax.axvline(np.median(sample_data), color='green', linestyle='--', label='Median')
ax.set_xyt('Value', 'Density', 'Original Data Distribution')
ax.legend()
ax.grid(True, alpha=0.3)

# Bootstrap distributions: (panel name, bootstrap replicates, confidence interval)
statistics = [
    ('Mean', bootstrap_means, ci_mean),
    ('Median', bootstrap_medians, ci_median),
    ('Trimmed Mean', bootstrap_trimmed, ci_trimmed)
]

# The remaining three panels each show one statistic
for ax, (name, bootstrap_dist, ci) in zip(axes.flat[1:], statistics):
    ax.hist(bootstrap_dist, bins=50, alpha=0.7, density=True, edgecolor='black')
    # Solid line: mean of the bootstrap replicates; dashed lines: interval bounds
    ax.axvline(np.mean(bootstrap_dist), color='red', linestyle='-', linewidth=2)
    ax.axvline(ci[0], color='red', linestyle='--', linewidth=1)
    ax.axvline(ci[1], color='red', linestyle='--', linewidth=1)
    # A later set_title call replaces any earlier title, so one string carries
    # both the description and the interval, and both calls receive it.
    panel_title = f'Bootstrap Distribution of {name}\n95% CI: [{ci[0]:.3f}, {ci[1]:.3f}]'
    ax.set_xyt(name, 'Density', panel_title)
    ax.set_title(panel_title, fontsize=10)
    ax.grid(True, alpha=0.3)

fig.tight_layout()
stx.io.save(fig, './stats/bootstrap_analysis.png')
stx.plt.show()

## 8. Statistical Report Generation

In [None]:
# Generate comprehensive statistical report
def _describe_sample(values):
    """Return mean, sample SD (ddof=1), median and interquartile range of one sample."""
    q1, q3 = np.percentile(values, [25, 75])
    return {
        'mean': np.mean(values),
        'std': np.std(values, ddof=1),
        'median': np.median(values),
        'iqr': q3 - q1
    }


def _cohens_d(data1, data2):
    """Return Cohen's d of ``data2`` relative to ``data1`` using the pooled sample SD."""
    n1, n2 = len(data1), len(data2)
    # Pooled variance: sample variances (ddof=1) weighted by their degrees of freedom,
    # which stays correct when the two groups differ in size.
    pooled_var = (
        (n1 - 1) * np.var(data1, ddof=1) + (n2 - 1) * np.var(data2, ddof=1)
    ) / (n1 + n2 - 2)
    return (np.mean(data2) - np.mean(data1)) / np.sqrt(pooled_var)


def generate_statistical_report(data1, data2, test_name="Experiment"):
    """Generate a comprehensive statistical report comparing two groups.

    Parameters
    ----------
    data1, data2 : array-like
        One-dimensional samples for group 1 and group 2.
    test_name : str
        Label stored under ``'test_name'`` in the report.

    Returns
    -------
    dict
        Keys: ``'test_name'``, ``'sample_sizes'``, ``'descriptive'``
        (mean / std / median / iqr per group), ``'normality'`` (Shapiro-Wilk
        result per group), ``'parametric'`` (independent t-test and Levene
        results), ``'nonparametric'`` (Mann-Whitney U result) and
        ``'effect_size'`` (Cohen's d, positive when group 2 has the larger mean).
        Test results are stored as returned by ``scipy.stats``.
    """
    report = {
        'test_name': test_name,
        'sample_sizes': {'group1': len(data1), 'group2': len(data2)},
        'descriptive': {
            'group1': _describe_sample(data1),
            'group2': _describe_sample(data2)
        },
        'normality': {
            'group1': stats.shapiro(data1),
            'group2': stats.shapiro(data2)
        },
        'parametric': {
            't_test': stats.ttest_ind(data1, data2),
            'levene': stats.levene(data1, data2)
        },
        'nonparametric': {
            'mann_whitney': stats.mannwhitneyu(data1, data2)
        },
        'effect_size': {
            'cohens_d': _cohens_d(data1, data2)
        }
    }

    return report

# Build the report for the control vs. first-treatment comparison
report = generate_statistical_report(control, treatment1, "Treatment Effect Analysis")

# Print the report section by section
rule = '=' * 60
print(f"\n{rule}")
print(f"STATISTICAL REPORT: {report['test_name']}")
print(rule)

sizes = report['sample_sizes']
print("\n1. SAMPLE INFORMATION")
print(f"   Group 1: n = {sizes['group1']}")
print(f"   Group 2: n = {sizes['group2']}")

print("\n2. DESCRIPTIVE STATISTICS")
for group in ('group1', 'group2'):
    stats_data = report['descriptive'][group]
    mean_sd = f"{stats_data['mean']:.2f} ± {stats_data['std']:.2f}"
    median_iqr = f"{stats_data['median']:.2f} ({stats_data['iqr']:.2f})"
    print(f"\n   {group.upper()}:")
    print(f"   Mean ± SD: {mean_sd}")
    print(f"   Median (IQR): {median_iqr}")

# A check mark means the assumption was not rejected at the 0.05 level
print("\n3. ASSUMPTION TESTING")
for group in ('group1', 'group2'):
    w, p = report['normality'][group]
    mark = '✓' if p > 0.05 else '✗'
    print(f"   {group.upper()} normality: p = {p:.4f} {mark}")
_, p_levene = report['parametric']['levene']
levene_mark = '✓' if p_levene > 0.05 else '✗'
print(f"   Equal variances: p = {p_levene:.4f} {levene_mark}")

print("\n4. STATISTICAL TESTS")
t_stat, p_t = report['parametric']['t_test']
u_stat, p_u = report['nonparametric']['mann_whitney']
print(f"   Independent t-test: t = {t_stat:.3f}, p = {p_t:.4f}")
print(f"   Mann-Whitney U: U = {u_stat:.1f}, p = {p_u:.4f}")

print("\n5. EFFECT SIZE")
print(f"   Cohen's d: {report['effect_size']['cohens_d']:.3f}")

# The conclusion is based on the independent t-test p-value
print("\n6. CONCLUSION")
if p_t < 0.05:
    conclusion = "   ✓ Statistically significant difference (p < 0.05)"
else:
    conclusion = "   ✗ No statistically significant difference (p ≥ 0.05)"
print(conclusion)

# Save the report together with a timestamp and a software label
report_dict = {
    'report': report,
    'timestamp': pd.Timestamp.now().isoformat(),
    'software': 'SciTeX Statistical Analysis'
}
stx.io.save(report_dict, './stats/statistical_report.json')
print(f"\n{rule}")
print("Report saved to: ./stats/statistical_report.json")

## Summary

This notebook demonstrated key statistical analyses using SciTeX:

1. **Descriptive Statistics**: Mean, median, standard deviation, skewness, kurtosis, range, and quartiles
2. **Hypothesis Testing**: T-tests, ANOVA, post-hoc tests
3. **Correlation Analysis**: Pearson, Spearman, Kendall correlations
4. **Regression Analysis**: Simple and multiple linear regression
5. **Non-parametric Tests**: Mann-Whitney, Kruskal-Wallis, Wilcoxon
6. **Power Analysis**: Sample size calculation and power curves
7. **Bootstrapping**: Confidence intervals for various statistics
8. **Report Generation**: Automated statistical reporting

### Best Practices:

- **Check assumptions** before applying parametric tests
- **Report effect sizes** along with p-values
- **Use appropriate tests** for your data distribution
- **Correct for multiple comparisons** when running several tests on the same data
- **Visualize data** before and after analysis
- **Document all analyses** for reproducibility

In [None]:
# Wrap-up: list the files written to ./stats by the cells above (nothing is deleted here)
from pathlib import Path  # not imported anywhere earlier in the notebook

print("Statistical analysis complete!")
print("\nFiles created:")
output_dir = Path('./stats')
if output_dir.exists():
    for f in sorted(output_dir.glob('*')):
        print(f"  - {f.name}")