In [32]:
import scipy.stats as stats
import numpy as np
import statsmodels.api as sm
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import f_oneway
from statsmodels.sandbox.stats.multicomp import MultiComparison


A1. Assumptions for ANOVA:
 * Independence of observations,
 * Normality within groups,
 * Homogeneity of variances. Violations: outliers, non-normality, unequal variances.

A2. Types of ANOVA:
* One-way ANOVA - compare means of three or more groups.
* Two-way ANOVA - assess main and interaction effects.
* Repeated measures ANOVA - analyze within-subject changes over time.

A3. Partitioning of Variance: ANOVA decomposes total variance into between-group and within-group components. Important for understanding the sources of variability and assessing group differences.


In [8]:
#Answer 2:

# Assume you have a list of data and corresponding group labels
group1_data = [1, 2, 3, 4]
group2_data = [5, 6, 7, 8]
group3_data = [9, 10, 11, 12]
data = [group1_data, group2_data, group3_data]
group_labels = ['Group1', 'Group2', 'Group3']

# Calculate overall mean
overall_mean = np.mean(np.concatenate(data))

# Calculate SST (Total Sum of Squares)
ss_total = np.sum((np.concatenate(data) - overall_mean)**2)

# Calculate SSB (Between-group Sum of Squares)
ss_between = sum(len(group) * (np.mean(group) - overall_mean)**2 for group in data)

# Calculate SSW (Within-group Sum of Squares)
ss_within = sum(np.sum((x - np.mean(group))**2 for x in group) for group in data)

# Verify that SST is the sum of SSB and SSW
assert np.isclose(ss_total, ss_between + ss_within)

# Degrees of freedom
df_between = len(data) - 1
df_within = sum(len(group) - 1 for group in data)
df_total = len(np.concatenate(data)) - 1

# Mean Squares
ms_between = ss_between / df_between
ms_within = ss_within / df_within

# F-statistic
f_statistic = ms_between / ms_within

# P-value
p_value = 1 - stats.f.cdf(f_statistic, df_between, df_within)

# Output results
print("SST:", ss_total)
print("SSB:", ss_between)
print("SSW:", ss_within)
print("Degrees of Freedom - Between:", df_between)
print("Degrees of Freedom - Within:", df_within)
print("Degrees of Freedom - Total:", df_total)
print("Mean Square - Between:", ms_between)
print("Mean Square - Within:", ms_within)
print("F-statistic:", f_statistic)
print("P-value:", p_value)


SST: 143.0
SSB: 128.0
SSW: 15.0
Degrees of Freedom - Between: 2
Degrees of Freedom - Within: 9
Degrees of Freedom - Total: 11
Mean Square - Between: 64.0
Mean Square - Within: 1.6666666666666667
F-statistic: 38.4
P-value: 3.921014940799772e-05


  ss_within = sum(np.sum((x - np.mean(group))**2 for x in group) for group in data)


A6. Interpretation: The groups are significantly different (p=0.02), suggesting at least one group mean differs. Further analyses or comparisons are needed to identify specific differences between groups.

A7. Handling Missing Data: Impute or exclude missing values. Consequences: biased results, reduced power. Methods: mean imputation, multiple imputation, or statistical techniques.

A8. Post-hoc Tests: Tukey's HSD for equal sample sizes, Bonferroni for unequal sizes. Use when ANOVA indicates group differences. Example: Comparing means after finding a significant overall difference.

In [12]:
#Answer 9:

# Assuming you have the weight loss data for each diet
diet_A = [2, 3, 4, 3, 5, 6, 4, 3, 2, 1, 2, 3, 4, 5, 3, 2, 4, 5, 6, 7, 3, 4, 5, 3, 2, 4, 5, 6, 4, 3, 2, 1, 2, 3, 4, 5, 3, 2, 4, 5, 6, 7, 3, 4, 5, 3, 2, 4, 5]
diet_B = [3, 4, 5, 4, 6, 7, 5, 4, 3, 2, 3, 4, 5, 6, 4, 3, 5, 6, 7, 8, 4, 5, 6, 4, 3, 5, 6, 7, 5, 4, 3, 2, 3, 4, 5, 6, 4, 3, 5, 6, 7, 8, 4, 5, 6, 4, 3, 5, 6]
diet_C = [4, 5, 6, 5, 7, 8, 6, 5, 4, 3, 4, 5, 6, 7, 5, 4, 6, 7, 8, 9, 5, 6, 7, 5, 4, 6, 7, 8, 6, 5, 4, 3, 4, 5, 6, 7, 5, 4, 6, 7, 8, 9, 5, 6, 7, 5, 4, 6, 7]

# Combine data into a list of arrays
data = [diet_A, diet_B, diet_C]

# Perform one-way ANOVA
f_statistic, p_value = stats.f_oneway(*data)

# Output results
print("F-statistic:", f_statistic)
print("P-value:", p_value)


F-statistic: 22.283062645011604
P-value: 3.7043727062106384e-09


In [16]:
#Answer 10:
# Assuming you have a DataFrame with columns: 'Time', 'Program', 'Experience'
# Create a sample DataFrame

data = {
    'Time': [10, 12, 15, 11, 13, 16, 14, 18, 20, 19, 22, 25, 18, 21, 23, 16, 19, 22, 26, 24, 28, 30, 14, 16, 18, 12, 14, 17, 19, 21],
    'Program': ['A'] * 10 + ['B'] * 10 + ['C'] * 10,
    'Experience': ['Novice'] * 15 + ['Experienced'] * 15
}

df = pd.DataFrame(data)

# Fit the two-way ANOVA model
model = ols('Time ~ C(Program) + C(Experience) + C(Program):C(Experience)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)

# Output results
print(anova_table)




                              sum_sq    df          F    PR(>F)
C(Program)                417.237361   2.0  10.542441  0.003206
C(Experience)                    NaN   1.0        NaN       NaN
C(Program):C(Experience)    2.083333   2.0   0.052640  0.820327
Residual                  514.500000  26.0        NaN       NaN


  F /= J


In [19]:
#Answer 11:

# Assuming you have test score data for control and experimental groups
control_group = [75, 80, 85, 78, 82, 79, 81, 77, 83, 80]
experimental_group = [85, 88, 92, 80, 87, 89, 91, 84, 90, 86]

# Perform two-sample t-test
t_statistic, p_value = stats.ttest_ind(control_group, experimental_group)

# Output results
print("Two-Sample T-Test:")
print("T-statistic:", t_statistic)
print("P-value:", p_value)

# If results are significant, perform post-hoc test (Tukey's HSD)
if p_value < 0.05:
    data = control_group + experimental_group
    groups = ['Control'] * len(control_group) + ['Experimental'] * len(experimental_group)
    tukey_results = pairwise_tukeyhsd(data, groups)
    print("\nPost-Hoc (Tukey's HSD):")
    print(tukey_results)


Two-Sample T-Test:
T-statistic: -4.883928821778255
P-value: 0.00011943155310842275

Post-Hoc (Tukey's HSD):
   Multiple Comparison of Means - Tukey HSD, FWER=0.05    
 group1    group2    meandiff p-adj  lower   upper  reject
----------------------------------------------------------
Control Experimental      7.2 0.0001 4.1028 10.2972   True
----------------------------------------------------------


In [34]:
#Answer 12:
# Create a sample DataFrame
data = {
    'Store': ['A'] * 30 + ['B'] * 30 + ['C'] * 30,
    'Sales': np.random.randint(100, 1000, 90)
}

# Ensure that each store has the same number of observations
# For the sake of example, let's assume each store has 30 days of sales data
data['Sales'] = np.concatenate([np.random.randint(100, 1000, 30) for _ in range(3)])

df = pd.DataFrame(data)

# Perform one-way ANOVA
anova_result = f_oneway(df['Sales'][df['Store'] == 'A'],
                         df['Sales'][df['Store'] == 'B'],
                         df['Sales'][df['Store'] == 'C'])

# Output results
print("One-way ANOVA:")
print("F-statistic:", anova_result.statistic)
print("P-value:", anova_result.pvalue)

# If results are significant, perform post-hoc test (Tukey's HSD)
if anova_result.pvalue < 0.05:
    tukey_results = pairwise_tukeyhsd(df['Sales'], df['Store'])
    print("\nPost-Hoc (Tukey's HSD):")
    print(tukey_results.summary())


One-way ANOVA:
F-statistic: 1.8307802917104692
P-value: 0.16640972600773127
