# SciTeX Stats Module - Statistical Analysis Made Simple

The `scitex.stats` module provides comprehensive statistical analysis tools with automatic result saving and visualization.

## Key Features
- **Comprehensive Tests**: t-tests, ANOVA, correlations, and more
- **Automatic Reporting**: Results saved as structured data
- **Effect Sizes**: Automatic calculation of effect sizes
- **Visualization**: Integrated plotting for statistical results

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-07-02 07:48:00 (ywatanabe)"
# File: ./examples/02_scitex_stats.ipynb
# ----------------------------------------
import os
__FILE__ = "./examples/02_scitex_stats.ipynb"
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

import scitex as stx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

# Announce the demo and report which SciTeX version is installed, so saved
# outputs can be traced back to the library version that produced them.
print("SciTeX Stats Module Demo")
print(f"SciTeX version: {stx.__version__}")

## 1. Basic Statistical Tests

Perform common statistical tests with automatic result formatting.

In [None]:
# Draw three synthetic samples of 30 observations each from normal
# distributions. The seed is fixed so every run produces the same numbers.
np.random.seed(42)
group1 = np.random.normal(loc=100, scale=15, size=30)  # Control group
group2 = np.random.normal(loc=110, scale=15, size=30)  # Treatment group
group3 = np.random.normal(loc=105, scale=12, size=30)  # Another treatment

# One summary line per group: size, mean and (numpy-default) standard deviation
print("Sample data generated:")
for group_label, group_values in (
    ("Group 1 (Control)", group1),
    ("Group 2 (Treatment)", group2),
    ("Group 3 (Treatment 2)", group3),
):
    print(
        f"{group_label}: n={len(group_values)}, "
        f"mean={np.mean(group_values):.2f}, std={np.std(group_values):.2f}"
    )

In [None]:
# Independent-samples t-test: control (group1) vs. treatment (group2).
# The result is indexed below by 'statistic', 'pvalue' and 'effect_size'.
t_result = stx.stats.ttest_ind(group1, group2)

print("Independent t-test results:")
for caption, result_key in (
    ("t-statistic", 'statistic'),
    ("p-value", 'pvalue'),
    ("Effect size (Cohen's d)", 'effect_size'),
):
    print(f"{caption}: {t_result[result_key]:.4f}")

# Significance is judged against the conventional alpha = 0.05 threshold
verdict = 'Yes' if t_result['pvalue'] < 0.05 else 'No'
print(f"Significance: {verdict}")

# Persist the t-test result
stx.io.save(t_result, './results/ttest_results.json')

## 2. ANOVA and Multiple Comparisons

In [None]:
# One-way ANOVA across the three groups.
# The result is indexed below by 'F_statistic', 'pvalue' and 'eta_squared'.
anova_result = stx.stats.anova_oneway(group1, group2, group3)
print("One-way ANOVA results:")
print(f"F-statistic: {anova_result['F_statistic']:.4f}")
print(f"p-value: {anova_result['pvalue']:.4f}")
print(f"Effect size (eta-squared): {anova_result['eta_squared']:.4f}")

# Save the omnibus result unconditionally. Previously this save lived inside
# the significance check, so a non-significant ANOVA was never written to
# ./results/ even though every other test in this notebook is always saved.
stx.io.save(anova_result, './results/anova_results.json')

# Post-hoc tests: only run (and saved) when the omnibus test is significant
# at alpha = 0.05.
if anova_result['pvalue'] < 0.05:
    posthoc_result = stx.stats.posthoc_tukey(
        [group1, group2, group3],
        labels=['Control', 'Treatment1', 'Treatment2'],
    )
    print("\nPost-hoc Tukey HSD results:")
    print(posthoc_result['summary'])

    # Save the pairwise comparison details
    stx.io.save(posthoc_result, './results/posthoc_results.json')

## 3. Correlation Analysis

In [None]:
# Generate correlated data
#
# x and both noise terms are independent standard-normal draws. For
# v = a*x + b*noise the population correlation with x is a / sqrt(a**2 + b**2),
# so the noise is scaled by b = sqrt(1 - a**2) to make that correlation equal
# to a. (The earlier noise scales 0.3 and 0.8 gave r ≈ 0.92 and r ≈ -0.53,
# which did not match the targets stated in the comments.)
n_samples = 100
x = np.random.randn(n_samples)
y = 0.7 * x + np.sqrt(1 - 0.7**2) * np.random.randn(n_samples)  # r ≈ 0.7
z = -0.5 * x + np.sqrt(1 - 0.5**2) * np.random.randn(n_samples)  # r ≈ -0.5

# Correlation matrix: stack the three variables as columns and analyse them.
# The result is indexed below by 'correlation_matrix' and 'p_values'.
variable_labels = ['Variable X', 'Variable Y', 'Variable Z']
data_matrix = np.column_stack([x, y, z])
corr_result = stx.stats.correlation_matrix(data_matrix, labels=variable_labels)

print("Correlation Analysis:")
print("Correlation matrix:")
print(corr_result['correlation_matrix'])
print("\nP-values:")
print(corr_result['p_values'])

# Visualize correlations: one scatter panel per variable pair
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

variable_series = [x, y, z]
# (row index, column index, marker colour, title prefix) for each panel
panel_specs = [
    (0, 1, 'blue', 'X vs Y'),
    (0, 2, 'red', 'X vs Z'),
    (1, 2, 'green', 'Y vs Z'),
]
for panel_ax, (row_idx, col_idx, colour, pair_name) in zip(axes, panel_specs):
    panel_ax.scatter(
        variable_series[row_idx], variable_series[col_idx], alpha=0.6, color=colour
    )
    panel_ax.set_xlabel(variable_labels[row_idx])
    panel_ax.set_ylabel(variable_labels[col_idx])
    # The title shows the coefficient for this pair, read from the matrix
    pair_r = corr_result["correlation_matrix"][row_idx, col_idx]
    panel_ax.set_title(f'{pair_name} (r = {pair_r:.3f})')

plt.tight_layout()
stx.io.save(fig, './figures/correlation_plots.png')
plt.show()

# Save correlation results
stx.io.save(corr_result, './results/correlation_analysis.json')

## 4. Descriptive Statistics

In [None]:
# Pool all three groups into one sample and compute descriptive statistics.
# desc_stats is iterated as a mapping and indexed by 'mean' and 'std' below.
all_data = np.concatenate([group1, group2, group3])
desc_stats = stx.stats.describe(all_data)

print("Descriptive Statistics:")
for stat_name, stat_value in desc_stats.items():
    # Floats are shown with four decimals; anything else is printed as-is
    shown = f"{stat_value:.4f}" if isinstance(stat_value, float) else stat_value
    print(f"{stat_name}: {shown}")

# Visualize distribution: histogram on the left, Q-Q plot on the right
fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Histogram with a normal density overlay built from the reported mean and std
hist_ax.hist(all_data, bins=30, density=True, alpha=0.7, color='skyblue', edgecolor='black')
value_grid = np.linspace(all_data.min(), all_data.max(), 100)
normal_density = stats.norm.pdf(value_grid, desc_stats['mean'], desc_stats['std'])
hist_ax.plot(value_grid, normal_density, 'r-', linewidth=2, label='Normal distribution')
hist_ax.set_xlabel('Value')
hist_ax.set_ylabel('Density')
hist_ax.set_title('Data Distribution')
hist_ax.legend()

# Q-Q plot against the normal distribution as a visual normality check
stats.probplot(all_data, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot (Normality Check)')

plt.tight_layout()
stx.io.save(fig, './figures/descriptive_stats.png')
plt.show()

# Save descriptive statistics
stx.io.save(desc_stats, './results/descriptive_stats.json')

## 5. Non-parametric Tests

In [None]:
# Mann-Whitney U test (non-parametric alternative to the t-test), groups 1 vs 2
mannwhitney_result = stx.stats.mannwhitneyu(group1, group2)
print("Mann-Whitney U test results:")
for caption, result_key in (
    ("U-statistic", 'statistic'),
    ("p-value", 'pvalue'),
    ("Effect size (rank biserial)", 'effect_size'),
):
    print(f"{caption}: {mannwhitney_result[result_key]:.4f}")

# Kruskal-Wallis test (non-parametric alternative to ANOVA), all three groups
kruskal_result = stx.stats.kruskal(group1, group2, group3)
print("\nKruskal-Wallis test results:")
for caption, result_key in (
    ("H-statistic", 'statistic'),
    ("p-value", 'pvalue'),
    ("Effect size (eta-squared)", 'eta_squared'),
):
    print(f"{caption}: {kruskal_result[result_key]:.4f}")

# Bundle both results under one mapping and save them together
nonparametric_results = dict(
    mannwhitney=mannwhitney_result,
    kruskal=kruskal_result,
)
stx.io.save(nonparametric_results, './results/nonparametric_tests.json')

## 6. Power Analysis and Sample Size Calculation

In [None]:
# Power analysis for the t-test over a grid of effect sizes and sample sizes
effect_sizes = np.arange(0.1, 1.5, 0.1)
sample_sizes = [10, 20, 30, 50, 100]

power_results = stx.stats.power_analysis_ttest(effect_sizes, sample_sizes, alpha=0.05)

# Visualize power curves: one line per sample size
fig, ax = plt.subplots(figsize=(10, 6))

for group_n in sample_sizes:
    # Results are looked up as power_results['n_<N>']['effect_<d>'], with the
    # effect size formatted to one decimal place
    powers_for_n = power_results[f'n_{group_n}']
    power_curve = [powers_for_n[f'effect_{effect:.1f}'] for effect in effect_sizes]
    ax.plot(effect_sizes, power_curve, marker='o', label=f'n = {group_n}')

# Dashed reference line at power = 0.8
ax.axhline(y=0.8, color='red', linestyle='--', alpha=0.7, label='Power = 0.8')
ax.set_xlabel("Effect Size (Cohen's d)")
ax.set_ylabel('Statistical Power')
ax.set_title('Power Analysis for Independent t-test')
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
stx.io.save(fig, './figures/power_analysis.png')
plt.show()

# Save power analysis results
stx.io.save(power_results, './results/power_analysis.json')

## 7. Comprehensive Statistical Report

In [None]:
# Assemble the comprehensive report section by section, then combine.
# It reuses the results computed in the earlier cells.
study_info = {
    'title': 'Statistical Analysis Demo',
    'date': '2025-07-02',
    'groups': ['Control', 'Treatment1', 'Treatment2'],
    'sample_sizes': [len(group1), len(group2), len(group3)],
}

# Per-group descriptive statistics
descriptive_by_group = {
    'group1': stx.stats.describe(group1),
    'group2': stx.stats.describe(group2),
    'group3': stx.stats.describe(group3),
}

# Parametric and non-parametric test results gathered under one heading
inferential_tests = {
    'ttest_1v2': t_result,
    'anova_all': anova_result,
    'mannwhitney_1v2': mannwhitney_result,
    'kruskal_all': kruskal_result,
}

report_data = {
    'study_info': study_info,
    'descriptive_stats': descriptive_by_group,
    'inferential_tests': inferential_tests,
    'correlation_analysis': corr_result,
    'power_analysis': power_results,
}

# Save comprehensive report
stx.io.save(report_data, './reports/statistical_analysis_report.json')

# Tell the reader where each kind of output was written
for message in (
    "✅ Comprehensive Statistical Analysis Complete!",
    "\n📊 Results saved:",
    "  • Individual tests: ./results/",
    "  • Visualizations: ./figures/",
    "  • Comprehensive report: ./reports/statistical_analysis_report.json",
):
    print(message)

# Summary statistics table: one row per group, one column per statistic
summary_groups = [group1, group2, group3]
summary_df = pd.DataFrame({
    'Group': ['Control', 'Treatment1', 'Treatment2'],
    'N': [len(values) for values in summary_groups],
    'Mean': [np.mean(values) for values in summary_groups],
    'SD': [np.std(values) for values in summary_groups],
    'Min': [np.min(values) for values in summary_groups],
    'Max': [np.max(values) for values in summary_groups],
})

# Rounded to three decimals for display only; the saved table is unrounded
print("\n📋 Summary Statistics:")
print(summary_df.round(3))

# Save summary table
stx.io.save(summary_df, './results/summary_statistics.csv')

## Summary

The SciTeX Stats module provides:

✅ **Comprehensive Tests**: t-tests, ANOVA, non-parametric alternatives  
✅ **Effect Sizes**: Automatic calculation of Cohen's d, eta-squared, etc.  
✅ **Power Analysis**: Sample size and power calculations  
✅ **Correlation Analysis**: Pearson, Spearman correlations with p-values  
✅ **Automatic Reporting**: Structured results saving  
✅ **Visualization**: Integrated plotting for statistical results  

This makes statistical analysis reproducible, comprehensive, and well-documented.