# 05 - SciTeX Statistics Module Tutorial

This comprehensive notebook demonstrates the capabilities of the SciTeX statistics module for scientific data analysis.

## Features Covered

### Statistical Testing
* Correlation analysis with multiple corrections
* Wrapper functions for common statistical tests
* P-value to significance star conversion
* Partial correlation analysis

### Data Description
* Enhanced descriptive statistics
* NaN-aware statistical functions
* Comprehensive data summaries

### Multiple Comparisons
* Bonferroni correction
* False Discovery Rate (FDR) control
* Holm-Bonferroni method
* Custom correction procedures

In [None]:
import sys
sys.path.insert(0, '../src')
import scitex as stx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from scipy import stats

# Seed NumPy's global random generator so the simulated data is repeatable
np.random.seed(42)

# Folder that receives every figure and report written by this tutorial
output_dir = Path("./stats_examples")
output_dir.mkdir(exist_ok=True)

print("SciTeX Statistics Tutorial - Scientific Data Analysis")
print(f"Output directory: {output_dir}")

## Part 1: Correlation Analysis

### 1.1 Basic Correlation Testing

In [None]:
# Simulate four variables whose relationship to the first one is known by construction
n_samples = 100
x1 = np.random.randn(n_samples)
noise_b = np.random.randn(n_samples)
x2 = 0.7 * x1 + 0.3 * noise_b  # mostly x1 plus a little noise: strong positive correlation
noise_c = np.random.randn(n_samples)
x3 = -0.5 * x1 + 0.8 * noise_c  # sign-flipped x1 plus more noise: moderate negative correlation
x4 = np.random.randn(n_samples)  # drawn without reference to x1: independent

# Assemble the four series into one table, columns in A-D order
column_names = ['Variable_A', 'Variable_B', 'Variable_C', 'Variable_D']
data = pd.DataFrame(dict(zip(column_names, (x1, x2, x3, x4))))

print("Sample Data:")
print(data.head())
print(f"\nData shape: {data.shape}")

# Pairwise correlation between every pair of columns (pandas default method)
corr_matrix = data.corr()
print("\nCorrelation Matrix:")
print(corr_matrix.round(3))

In [None]:
# Test correlations with SciTeX enhanced functions
try:
    # Prefer the SciTeX helper when the installed scitex exposes it
    corr_results = stx.stats.corr_test_multi(data)
    print("Enhanced correlation analysis completed")
except AttributeError:
    # Helper not available: run a Pearson test on every pair of columns by hand
    print("Using manual correlation testing...")

    variables = data.columns
    correlations = []
    for i, var1 in enumerate(variables):
        # Pair var1 only with the columns after it, so each unordered pair
        # is tested exactly once (upper triangle of the matrix)
        for var2 in variables[i + 1:]:
            r, p = stats.pearsonr(data[var1], data[var2])
            correlations.append({
                'var1': var1,
                'var2': var2,
                'correlation': r,
                'p_value': p
            })

    corr_df = pd.DataFrame(correlations)
    print("\nCorrelation Test Results:")
    print(corr_df.round(4))

# Visualize correlation matrix as a heatmap on a fixed [-1, 1] colour scale
fig, ax = stx.plt.subplots(figsize=(10, 8))
im = ax.imshow(corr_matrix, cmap='RdBu_r', vmin=-1, vmax=1)
tick_positions = range(len(corr_matrix.columns))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(corr_matrix.columns, rotation=45)
ax.set_yticklabels(corr_matrix.columns)

# Write each coefficient into its cell; white text on the strongly coloured
# cells (|r| > 0.5), black text elsewhere, to keep the numbers readable
n_vars = len(corr_matrix)
for i in range(n_vars):
    for j in range(n_vars):
        value = corr_matrix.iloc[i, j]
        ax.text(j, i, f'{value:.2f}',
                ha='center', va='center', fontsize=12,
                color='white' if abs(value) > 0.5 else 'black')

# NOTE(review): set_xyt comes from the SciTeX axis wrapper; presumably it sets
# the x-label, y-label and title in one call -- confirm against scitex docs
ax.set_xyt('Variables', 'Variables', 'Correlation Matrix')
plt.colorbar(im, ax=ax, label='Correlation Coefficient')
plt.tight_layout()
stx.io.save(fig, output_dir / 'correlation_matrix.png')
plt.show()

print("✅ Correlation analysis completed")

## Part 2: Statistical Testing and P-value Corrections

### 2.1 Multiple Comparisons Correction

In [None]:
# Simulate one sample per experimental group, each centred on its own effect size
n_groups = 5
n_per_group = 20
group_effects = [0, 0.5, 1.0, 0.3, 1.5]  # Effect sizes

experimental_data = []
group_labels = []

for group_number, shift in enumerate(group_effects, start=1):
    # Unit-variance normal draws whose mean is this group's effect size
    experimental_data += list(np.random.normal(shift, 1.0, n_per_group))
    group_labels += [f'Group_{group_number}'] * n_per_group

# Long-format table: one row per observation
exp_df = pd.DataFrame({'value': experimental_data, 'group': group_labels})

# Independent-samples t-test for every unordered pair of groups
from itertools import combinations

groups = exp_df['group'].unique()
# Look up each group's observations once instead of once per comparison
samples = {name: exp_df.loc[exp_df['group'] == name, 'value'] for name in groups}

pairwise_results = []
for group1, group2 in combinations(groups, 2):
    t_stat, p_val = stats.ttest_ind(samples[group1], samples[group2])
    pairwise_results.append(dict(
        comparison=f'{group1} vs {group2}',
        group1=group1,
        group2=group2,
        t_statistic=t_stat,
        p_value=p_val,
    ))

pairwise_df = pd.DataFrame(pairwise_results)
print("Pairwise T-test Results (Uncorrected):")
print(pairwise_df[['comparison', 't_statistic', 'p_value']].round(4))

In [None]:
# Adjust the pairwise p-values for the number of comparisons performed
from statsmodels.stats.multitest import multipletests

# Raw p-values, in the same row order as pairwise_df
p_values = pairwise_df['p_value'].values

# Run every correction procedure on the same p-values, keyed by method name
correction_methods = ('bonferroni', 'holm', 'fdr_bh', 'fdr_by')
corrections = {
    method: multipletests(p_values, method=method)
    for method in correction_methods
}

# Side-by-side table: the raw p-value plus, for each method, the adjusted
# p-value and the reject/keep decision
correction_summary = pairwise_df[['comparison', 'p_value']].copy()

for method, outcome in corrections.items():
    # multipletests returns (reject flags, corrected p-values, ...);
    # only the first two entries are used here
    rejected, p_corrected = outcome[0], outcome[1]
    correction_summary[f'{method}_corrected'] = p_corrected
    correction_summary[f'{method}_significant'] = rejected

print("\nMultiple Comparison Corrections:")
print(correction_summary.round(4))

# Convert p-values to significance stars
def p_to_stars(p):
    """Map a p-value to its significance marker ('***', '**', '*' or 'ns')."""
    # Cutoffs are checked from strictest to loosest; the first one that p
    # falls strictly below decides the marker
    cutoffs = ((0.001, '***'), (0.01, '**'), (0.05, '*'))
    for cutoff, marker in cutoffs:
        if p < cutoff:
            return marker
    return 'ns'

# Translate the raw, Bonferroni and FDR-BH p-values into star markers.
# Maps each new star column to the p-value column it is derived from.
star_sources = {
    'uncorrected_stars': 'p_value',
    'bonferroni_stars': 'bonferroni_corrected',
    'fdr_bh_stars': 'fdr_bh_corrected',
}
for star_column, p_column in star_sources.items():
    correction_summary[star_column] = correction_summary[p_column].apply(p_to_stars)

print("\nSignificance Summary:")
print(correction_summary[['comparison', *star_sources]])

print("\n*** p < 0.001, ** p < 0.01, * p < 0.05, ns = not significant")

## Part 3: Descriptive Statistics and Data Summary

### 3.1 Enhanced Descriptive Statistics

In [None]:
# Create comprehensive descriptive statistics
import scipy.stats as scipy_stats

def comprehensive_describe(data):
    """Summarise every numeric column of a DataFrame.

    Missing values are dropped per column before computing: count, mean,
    sample standard deviation (ddof=1), min, 25th percentile, median,
    75th percentile, max, standard error of the mean, skewness, kurtosis
    (SciPy's default, i.e. excess kurtosis) and the coefficient of variation
    (std / mean, NaN when the mean is exactly zero). Columns with at least
    8 remaining values also get a Shapiro-Wilk p-value (``shapiro_p``) and
    an ``is_normal`` flag (p > 0.05).

    A numeric column with no non-missing values reports ``count`` 0 and NaN
    for every other statistic, rather than feeding an empty array to the
    NumPy/SciPy reductions.

    Parameters
    ----------
    data : pandas.DataFrame
        Input table; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column and one column per statistic.
    """
    # Statistics reported as NaN for a column that has no usable values
    stat_names = ('mean', 'std', 'min', 'q25', 'median', 'q75', 'max',
                  'sem', 'skewness', 'kurtosis', 'cv')
    results = {}

    for column in data.select_dtypes(include=[np.number]).columns:
        series = data[column].dropna()

        if series.empty:
            # Nothing to summarise: empty-array reductions give warnings or
            # errors depending on the NumPy/SciPy version, so answer directly
            results[column] = {'count': 0, **dict.fromkeys(stat_names, np.nan)}
            continue

        # Computed once; both the summary and the CV need them
        mean = np.mean(series)
        std = np.std(series, ddof=1)
        q25, q75 = np.percentile(series, [25, 75])

        # Basic statistics
        basic_stats = {
            'count': len(series),
            'mean': mean,
            'std': std,
            'min': np.min(series),
            'q25': q25,
            'median': np.median(series),
            'q75': q75,
            'max': np.max(series),
        }

        # Additional statistics
        additional_stats = {
            'sem': scipy_stats.sem(series),
            'skewness': scipy_stats.skew(series),
            'kurtosis': scipy_stats.kurtosis(series),
            # CV is undefined for a zero mean
            'cv': std / mean if mean != 0 else np.nan
        }

        # Normality test. SciPy accepts as few as 3 observations; the
        # threshold of 8 is this tutorial's own, more conservative, choice.
        if len(series) >= 8:
            shapiro_stat, shapiro_p = scipy_stats.shapiro(series)
            additional_stats['shapiro_p'] = shapiro_p
            additional_stats['is_normal'] = shapiro_p > 0.05

        results[column] = {**basic_stats, **additional_stats}

    # results is keyed by column name; transpose so each column becomes a row
    return pd.DataFrame(results).T

# Per-group descriptive statistics for the experimental data
# (computed inline here; comprehensive_describe is not used for this table)
group_stats = []
for group in exp_df['group'].unique():
    group_data = exp_df[exp_df['group'] == group]['value']
    q25, q75 = np.percentile(group_data, [25, 75])
    stats_dict = {
        'group': group,
        'n': len(group_data),
        'mean': np.mean(group_data),
        'std': np.std(group_data, ddof=1),
        'sem': scipy_stats.sem(group_data),
        'median': np.median(group_data),
        'iqr': q75 - q25,
        'skewness': scipy_stats.skew(group_data),
        'kurtosis': scipy_stats.kurtosis(group_data)
    }
    group_stats.append(stats_dict)

stats_df = pd.DataFrame(group_stats)
print("Comprehensive Group Statistics:")
print(stats_df.round(3))

# Visualize group comparisons in a 2x2 grid of panels
fig, axes = stx.plt.subplots(2, 2, figsize=(14, 10))

# Box plots: one box per group
exp_df.boxplot(column='value', by='group', ax=axes[0, 0])
axes[0, 0].set_title('Group Comparisons (Box Plots)')
axes[0, 0].set_xlabel('Group')
axes[0, 0].set_ylabel('Value')

# Mean with error bars (bar height = group mean, whisker = SEM)
x_pos = range(len(stats_df))
axes[0, 1].bar(x_pos, stats_df['mean'], yerr=stats_df['sem'],
               capsize=5, alpha=0.7, color='skyblue', edgecolor='navy')
axes[0, 1].set_xticks(x_pos)
axes[0, 1].set_xticklabels(stats_df['group'])
axes[0, 1].set_title('Group Means ± SEM')
axes[0, 1].set_ylabel('Value')
axes[0, 1].grid(True, alpha=0.3)

# Distribution shapes: overlaid semi-transparent histograms
for group in stats_df['group']:
    group_data = exp_df[exp_df['group'] == group]['value']
    axes[1, 0].hist(group_data, alpha=0.6, label=group, bins=10)
axes[1, 0].set_title('Distribution Shapes')
axes[1, 0].set_xlabel('Value')
axes[1, 0].set_ylabel('Frequency')
axes[1, 0].legend()

# Effect sizes of every other group against the first group (the control).
# Cohen's d standardises the mean difference by the pooled standard deviation
# of the two groups; dividing by the control SD alone would be Glass's delta
# and would not match the axis label or the 0.2 / 0.5 / 0.8 reference lines.
control_n = stats_df.iloc[0]['n']
control_mean = stats_df.iloc[0]['mean']
control_std = stats_df.iloc[0]['std']
effect_sizes = []
for _, row in stats_df.iloc[1:].iterrows():
    pooled_var = (
        (control_n - 1) * control_std ** 2 + (row['n'] - 1) * row['std'] ** 2
    ) / (control_n + row['n'] - 2)
    effect_sizes.append((row['mean'] - control_mean) / np.sqrt(pooled_var))

axes[1, 1].bar(range(len(effect_sizes)), effect_sizes,
               alpha=0.7, color='lightcoral', edgecolor='darkred')
axes[1, 1].axhline(y=0, color='black', linestyle='-', linewidth=1)
# Dashed guides at the 0.2 / 0.5 / 0.8 levels labelled small / medium / large
axes[1, 1].axhline(y=0.2, color='gray', linestyle='--', alpha=0.7, label='Small')
axes[1, 1].axhline(y=0.5, color='gray', linestyle='--', alpha=0.7, label='Medium')
axes[1, 1].axhline(y=0.8, color='gray', linestyle='--', alpha=0.7, label='Large')
axes[1, 1].set_xticks(range(len(effect_sizes)))
axes[1, 1].set_xticklabels(stats_df['group'].iloc[1:])
axes[1, 1].set_title('Effect Sizes vs Control')
axes[1, 1].set_ylabel("Cohen's d")
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
stx.io.save(fig, output_dir / 'statistical_summary.png')
plt.show()

print("✅ Enhanced descriptive statistics completed")

## Part 4: Advanced Statistical Analysis

### 4.1 ANOVA and Post-hoc Analysis

In [None]:
# Perform one-way ANOVA across all groups
group_data = [exp_df[exp_df['group'] == group]['value'].values for group in groups]
f_stat, p_anova = stats.f_oneway(*group_data)
# Comparing a NumPy float yields a NumPy bool; convert once to a plain bool so
# it is written to JSON as true/false instead of being stringified
anova_significant = bool(p_anova < 0.05)

print("One-way ANOVA Results:")
print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_anova:.6f}")
print(f"Significant: {'Yes' if anova_significant else 'No'}")

# Effect size (eta-squared) = SS_between / SS_total
grand_mean = exp_df['value'].mean()
# Group means are taken from the same arrays used for the ANOVA, so this does
# not depend on stats_df rows being in the same order as `groups`
ss_between = sum(len(values) * (values.mean() - grand_mean) ** 2 for values in group_data)
ss_total = ((exp_df['value'] - grand_mean) ** 2).sum()
eta_squared = float(ss_between / ss_total)

print("\nEffect Size:")
print(f"Eta-squared (η²): {eta_squared:.4f}")

# Interpret effect size using the 0.01 / 0.06 / 0.14 cutoffs
if eta_squared < 0.01:
    effect_interpretation = "Very small"
elif eta_squared < 0.06:
    effect_interpretation = "Small"
elif eta_squared < 0.14:
    effect_interpretation = "Medium"
else:
    effect_interpretation = "Large"

print(f"Effect size interpretation: {effect_interpretation}")

# Create comprehensive statistical report (native Python scalars only, so the
# JSON keeps numbers as numbers and booleans as booleans)
statistical_report = {
    'analysis_type': 'One-way ANOVA with post-hoc comparisons',
    'sample_size': len(exp_df),
    'n_groups': len(groups),
    'anova_results': {
        'f_statistic': float(f_stat),
        'p_value': float(p_anova),
        'significant': anova_significant,
        'eta_squared': eta_squared,
        'effect_size_interpretation': effect_interpretation
    },
    'group_statistics': stats_df.to_dict('records'),
    'pairwise_comparisons': correction_summary.to_dict('records'),
    'corrections_applied': list(corrections.keys())
}

# Save statistical report
import json
report_path = output_dir / 'statistical_analysis_report.json'
with open(report_path, 'w', encoding='utf-8') as f:
    # default=str remains as a last-resort fallback for any value the encoder
    # still cannot serialise
    json.dump(statistical_report, f, indent=2, default=str)

n_bonferroni = int(correction_summary['bonferroni_significant'].sum())
n_fdr = int(correction_summary['fdr_bh_significant'].sum())

print(f"\n✅ Statistical analysis report saved to: {report_path}")
print("\n📊 Analysis Summary:")
print(f"  • ANOVA: F({len(groups)-1}, {len(exp_df)-len(groups)}) = {f_stat:.2f}, p = {p_anova:.4f}")
print(f"  • Effect size: η² = {eta_squared:.3f} ({effect_interpretation})")
print(f"  • Significant pairwise comparisons (Bonferroni): {n_bonferroni}")
print(f"  • Significant pairwise comparisons (FDR): {n_fdr}")

## Summary

This tutorial demonstrated a complete statistical analysis workflow built around the SciTeX stats module, with SciPy and statsmodels supplying the underlying tests and corrections:

### Key Features:
1. **Correlation analysis** with multiple testing corrections
2. **Statistical testing** with proper p-value adjustments
3. **Descriptive statistics** with enhanced summaries
4. **Effect size calculations** for practical significance
5. **ANOVA and post-hoc testing** for group comparisons
6. **Comprehensive reporting** with automated documentation

### Best Practices:
- Always apply multiple comparison corrections when testing multiple hypotheses
- Report effect sizes alongside p-values for practical significance
- Use appropriate statistical tests based on data distribution
- Document analysis methods and assumptions
- Provide comprehensive descriptive statistics

The SciTeX stats module provides a robust foundation for scientific statistical analysis!