<a href="https://colab.research.google.com/github/taylor33189-beep/Taylor_Hoskins_Repository/blob/main/Comparison_of_more_than_2_groups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
# This script analyzes red cell folate levels and demonstrates the issue
# with doing many comparisons without adjusting for them (alpha inflation).

# --- Setup ---
import pandas as pd      # Used for working with data in tables
from scipy import stats  # Used for statistical tests like t-tests
import numpy as np       # Used for numerical operations and simulation
import itertools         # Used for finding combinations of groups

# --- 1. Load and Prepare Data ---
print("--- 1. Data Loading and Preparation ---")

# Read the measurements; the file is expected to sit next to this script
# and to provide (at least) the columns 'folate' and 'treatmnt'.
amess_df = pd.read_csv('amess.csv')

# Quick look at the raw table before anything is transformed
print("Data Head:")
print(amess_df.head())

# The treatment column identifies groups, so store it as a categorical
amess_df['treatmnt'] = amess_df['treatmnt'].astype('category')

# Human-readable labels for the numeric treatment codes
treatment_labels = {
    1: "Cont_N2O", # Continuous N2O
    2: "Oper_N2O", # N2O during operation only
    3: "No_N2O"    # No N2O
}

# Only relabel when every category is still a plain number; if the file
# already carries text labels they are left untouched.
categories_are_numeric = all(
    isinstance(code, (int, float))
    for code in amess_df['treatmnt'].cat.categories
)
if categories_are_numeric:
    amess_df['treatmnt'] = amess_df['treatmnt'].cat.rename_categories(
        treatment_labels)

# Per-group descriptive statistics: sample size, mean and standard deviation
print("\nData Summary by Group:")
folate_by_treatment = amess_df.groupby('treatmnt', observed=False)['folate']
summary = folate_by_treatment.agg(['count', 'mean', 'std'])
print(summary)


# --- 2. Pairwise Comparisons ---
print("\n\n--- 2. Pairwise Comparisons ---")

# Question: do the mean folate levels differ between treatment groups?
#   Global H0: every group has the same mean.
#   Global Ha: at least one group mean differs from the others.

# Enumerate every unordered pair of treatment groups exactly once
group_names = amess_df['treatmnt'].cat.categories
group_pairs = list(itertools.combinations(group_names, 2))
g = len(group_pairs)
print(f"Number of pairwise tests: {g}\n")

# Pull out each group's folate values once, up front, so the loop below
# only has to look them up by name.
folate_of = {
    name: amess_df.loc[amess_df['treatmnt'] == name, 'folate']
    for name in group_names
}

# Unadjusted two-sample t-tests, one per pair
print("Results of pairwise t-tests (alpha=0.05):")

p_values = [] # Collected p-values, in the same order as group_pairs

for first_name, second_name in group_pairs:
    # Independent-samples t-test with pooled (equal) variances
    test_result = stats.ttest_ind(folate_of[first_name],
                                  folate_of[second_name],
                                  equal_var=True)
    pair_p = test_result.pvalue

    print(f"  {first_name} vs {second_name}: p-value = {pair_p:.4f}")
    p_values.append(pair_p)

# Naive decision rule: reject the global H0 as soon as any single
# unadjusted p-value falls below 0.05.
reject_global_H0 = any(p < 0.05 for p in p_values)

conclusion = (
    "\nConclusion (based on unadjusted p-values): Reject global H0."
    if reject_global_H0
    else "\nConclusion (based on unadjusted p-values): Do not reject global H0."
)
print(conclusion)


# --- 3. Alpha Inflation Simulation ---
print("\n\n--- 3. Alpha Inflation Simulation ---")

# This part shows why doing multiple tests without adjustment is risky.
# Even if there's no real difference (H0 is true), you might get a
# 'significant' result by chance in at least one of the tests.

R = 1000      # Number of times to repeat the simulation
num_tests = 3 # Number of tests in each simulation (our 3 pairwise tests)
alpha = 0.05  # Significance level for each test
seed = 12345  # Fixed seed so the simulated estimate is identical on every run

# A dedicated, seeded generator keeps the result reproducible and avoids
# depending on (or disturbing) NumPy's global random state.
rng = np.random.default_rng(seed)

# Under a true H0 a p-value is uniformly distributed on [0, 1], so each row
# holds the 'num_tests' p-values of one simulated experiment.
# NOTE: the columns are drawn independently of each other; real pairwise
# tests share groups, so treat the figures below as an approximation there.
simulated_p_values = rng.uniform(0, 1, size=(R, num_tests))

# A simulated experiment wrongly rejects the global H0 as soon as ANY of
# its p-values falls below alpha.
incorrect_rejections = (simulated_p_values < alpha).any(axis=1)

# Proportion of simulated experiments with at least one false positive
simulated_global_alpha = incorrect_rejections.mean()

# For independent tests the chance of at least one false positive is
# 1 - (1 - alpha)^num_tests
theoretical_global_alpha = 1 - (1 - alpha)**num_tests

print(f"Simulated overall chance of a false positive (Global Alpha):"
      f" {simulated_global_alpha:.4f}")
print(f"Theoretical overall chance of a false positive:"
      f" {theoretical_global_alpha:.4f}")

print("\nComment:")
print("The simulation shows that the overall chance of finding a 'significant' result"
      " (false positive) across multiple tests is higher than the alpha level used"
      " for each individual test (0.05). This is alpha inflation.")

--- 1. Data Loading and Preparation ---
Data Head:
   folate  treatmnt
0     243         1
1     251         1
2     275         1
3     291         1
4     347         1

Data Summary by Group:
          count        mean        std
treatmnt                              
Cont_N2O      8  316.625000  58.717088
Oper_N2O      9  256.444444  37.121797
No_N2O        5  278.000000  33.756481


--- 2. Pairwise Comparisons ---
Number of pairwise tests: 3

Results of pairwise t-tests (alpha=0.05):
  Cont_N2O vs Oper_N2O: p-value = 0.0218
  Cont_N2O vs No_N2O: p-value = 0.2115
  Oper_N2O vs No_N2O: p-value = 0.3046

Conclusion (based on unadjusted p-values): Reject global H0.


--- 3. Alpha Inflation Simulation ---
Simulated overall chance of a false positive (Global Alpha): 0.1510
Theoretical overall chance of a false positive: 0.1426

Comment:
The simulation shows that the overall chance of finding a 'significant' result (false positive) across multiple tests is higher than the alpha level us