### Exam 2
Name: Zach Hatzenbeller

Course: EN.625.661.83.FA25

In [1]:
import pandas as pd
from pathlib import Path
from itertools import product
import numpy as np
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(Path.cwd() / "Exam 2 Problem 2 Data.txt", delim_whitespace=True)
levels_A = df['A'].unique()
levels_B = df['B'].unique()
combos = list(product(levels_A, levels_B))

# Used this code to sample one observation from each combination
# Forgot to set random_state for reproducibility so added it now
# But will use the sampled values from the original run for consistency with the exam
sampled_values = {}
for idx, combo in enumerate(combos):
    mask = (df['A'] == combo[0]) & (df['B'] == combo[1])
    subset = df[mask]
    sample_value = subset.sample(random_state=42).values[0]
    sampled_values[idx] = sample_value

sampled_df = pd.DataFrame.from_dict(sampled_values, orient='index', columns=df.columns)
sampled_df["Sample_Mean"] = [10.7,11.1,14.5,8.8,5.0,8.8]
sampled_df["Sample_SD"] = [7.5,6.5,8.3,6.6,7.1,7.6]

sampled_df

Unnamed: 0,Obs,A,B,N,Sample_Mean,Sample_SD
0,9,a1,b1,20,10.7,7.5
1,19,a1,b2,20,11.1,6.5
2,29,a1,b3,20,14.5,8.3
3,39,a2,b1,20,8.8,6.6
4,49,a2,b2,20,5.0,7.1
5,59,a2,b3,20,8.8,7.6


### Averages across all cells and B

In [3]:
y1_bar = sampled_df.loc[sampled_df["B"]=="b1","Sample_Mean"].mean()
y2_bar = sampled_df.loc[sampled_df["B"]=="b2","Sample_Mean"].mean()
y3_bar = sampled_df.loc[sampled_df["B"]=="b3","Sample_Mean"].mean()
grand_avg = sampled_df["Sample_Mean"].mean()

print("Mean Results for Dataset:")
print(f"B1 Cell Mean: {y1_bar}")
print(f"B2 Cell Mean: {y2_bar}")
print(f"B3 Cell Mean: {y3_bar}")
print(f"Grand Average: {grand_avg}")

Mean Results for Dataset:
B1 Cell Mean: 9.75
B2 Cell Mean: 8.05
B3 Cell Mean: 11.65
Grand Average: 9.816666666666665


### Sum of Squares

Full ANOVA Table

In [4]:
# Sum of Squares
sse_full = np.sum((sampled_df["N"] - 1)*(sampled_df["Sample_SD"])**2)
sst_full = sse_full + np.sum((sampled_df["Sample_Mean"] - grand_avg)**2 * sampled_df["N"])
ss_model = sst_full - sse_full

# Decompose SS_model into SSA, SSB and SSAB
ssa = np.sum(sampled_df.groupby("A").apply(lambda x: x["N"].sum() * (x["Sample_Mean"].mean() - grand_avg)**2))
ssb = np.sum(sampled_df.groupby("B").apply(lambda x: x["N"].sum() * (x["Sample_Mean"].mean() - grand_avg)**2))
ssab = ss_model - ssa - ssb

# Degrees of Freedom
df_a = len(levels_A) - 1
df_b = len(levels_B) - 1
df_ab = df_a * df_b

anova_table_full = pd.DataFrame({
    "Source": ["A", "B", "AB", "Error", "Total"],
    "SS": [ssa, ssb, ssab, sse_full, sst_full],
    "DF": [df_a, df_b, df_ab, sampled_df["N"].sum() - len(combos), len(sampled_df) - 1]
})
anova_table_full["MS"] = anova_table_full["SS"] / anova_table_full["DF"]
anova_table_full["F"] = anova_table_full["MS"] / anova_table_full.loc[anova_table_full["Source"]=="Error", "MS"].values[0]
anova_table_full["p-value"] = anova_table_full["F"].apply(lambda f: 1 - stats.f.cdf(f, anova_table_full.loc[anova_table_full["Source"]!="Error", "DF"].values[0], anova_table_full.loc[anova_table_full["Source"]=="Error", "DF"].values[0]) if not np.isnan(f) else np.nan)
anova_table_full

Unnamed: 0,Source,SS,DF,MS,F,p-value
0,A,625.633333,1,625.633333,11.762973,0.000842
1,B,259.466667,2,129.733333,2.439208,0.121108
2,AB,107.466667,2,53.733333,1.010278,0.316965
3,Error,6063.28,114,53.186667,1.0,0.319428
4,Total,7055.846667,5,1411.169333,26.532389,1e-06


Additive ANOVA Table

In [5]:
anova_table_additive = pd.DataFrame({
    "Source": ["A", "B", "Error", "Total"],
    "SS": [ssa, ssb, sse_full + ssab, sst_full],
    "DF": [df_a, df_b, sampled_df["N"].sum() - len(combos) + df_ab, len(sampled_df) - 1]
})
anova_table_additive["MS"] = anova_table_additive["SS"] / anova_table_additive["DF"]
anova_table_additive["F"] = anova_table_additive["MS"] / anova_table_additive.loc[anova_table_additive["Source"]=="Error", "MS"].values[0]
anova_table_additive["p-value"] = anova_table_additive["F"].apply(lambda f: 1 - stats.f.cdf(f, anova_table_additive.loc[anova_table_additive["Source"]!="Error", "DF"].values[0], anova_table_additive.loc[anova_table_additive["Source"]=="Error", "DF"].values[0]) if not np.isnan(f) else np.nan)
anova_table_additive

Unnamed: 0,Source,SS,DF,MS,F,p-value
0,A,625.633333,1,625.633333,11.760889,0.000838
1,B,259.466667,2,129.733333,2.438776,0.121093
2,Error,6170.746667,116,53.196092,1.0,0.319392
3,Total,7055.846667,5,1411.169333,26.527688,1e-06


### Part C

In [6]:
theta_hat = -2*(y1_bar) + y2_bar + y3_bar
sigma2 = anova_table_additive.loc[anova_table_additive["Source"]=="Error", "MS"].values[0]
var_theta_hat = ((4/40) + (1/40) + (1/40))*sigma2
se_theta_hat = np.sqrt(var_theta_hat)
t_statistic = theta_hat / se_theta_hat
t_critical = stats.t.ppf(0.975, df=anova_table_additive.loc[anova_table_additive["Source"]=="Error", "DF"].values[0])
ci_95 = (theta_hat - stats.t.ppf(0.975, df=anova_table_additive.loc[anova_table_additive["Source"]=="Error", "DF"].values[0]) * se_theta_hat,
          theta_hat + stats.t.ppf(0.975, df=anova_table_additive.loc[anova_table_additive["Source"]=="Error", "DF"].values[0]) * se_theta_hat)
print(f"theta_hat: {theta_hat}")
print(f"se_theta_hat: {se_theta_hat}")
print(f"t_stat: {t_statistic}")
print(f"t_critical: {t_critical}")
print(f"95% CI: {ci_95}")

theta_hat: 0.20000000000000107
se_theta_hat: 2.824785618963579
t_stat: 0.07080183312225356
t_critical: 1.9806260024239375
95% CI: (-5.39484384819246, 5.794843848192462)


### Part D1

In [7]:
means = sampled_df["Sample_Mean"].values

# Theta Model 1
theta_hat_1 = -2*(means[0]) + means[1] + means[2]
sigma2_1 = anova_table_full.loc[anova_table_full["Source"]=="Error", "MS"].values[0]
var_theta_hat_1 = ((4/20) + (1/20) + (1/20))*sigma2_1
se_theta_hat_1 = np.sqrt(var_theta_hat_1)
t_statistic_1 = theta_hat_1 / se_theta_hat_1
t_critical_1 = stats.t.ppf(0.975, df=anova_table_full.loc[anova_table_full["Source"]=="Error", "DF"].values[0])
ci_95_1 = (theta_hat_1 - t_critical_1 * se_theta_hat_1, theta_hat_1 + t_critical_1 * se_theta_hat_1)

# Theta Model 2
theta_hat_2 = -2*(means[3]) + means[4] + means[5]
sigma2_2 = anova_table_full.loc[anova_table_full["Source"]=="Error", "MS"].values[0]
var_theta_hat_2 = ((4/20) + (1/20) + (1/20))*sigma2_2
se_theta_hat_2 = np.sqrt(var_theta_hat_2)
t_statistic_2 = theta_hat_1 / se_theta_hat_2
t_critical_2 = stats.t.ppf(0.975, df=anova_table_full.loc[anova_table_full["Source"]=="Error", "DF"].values[0])
ci_95_2 = (theta_hat_2 - t_critical_2 * se_theta_hat_2, theta_hat_2 + t_critical_2 * se_theta_hat_2)


print("Theta Model 1 Results:")
print(f"theta_hat: {theta_hat_1}")
print(f"se_theta_hat: {se_theta_hat_1}")
print(f"t_stat: {t_statistic_1}")
print(f"t_critical: {t_critical_1}")
print(f"95% CI: {ci_95_1}")

print("\n")

print("Theta Model 2 Results:")
print(f"theta_hat: {theta_hat_2}")
print(f"se_theta_hat: {se_theta_hat_2}")
print(f"t_stat: {t_statistic_2}")
print(f"t_critical: {t_critical_2}")
print(f"95% CI: {ci_95_2}")


Theta Model 1 Results:
theta_hat: 4.200000000000001
se_theta_hat: 3.994496213541828
t_stat: 1.0514467345748109
t_critical: 1.9809922979375063
95% CI: (-3.7130662331668924, 12.113066233166894)


Theta Model 2 Results:
theta_hat: -3.8000000000000007
se_theta_hat: 3.994496213541828
t_stat: 1.0514467345748109
t_critical: 1.9809922979375063
95% CI: (-11.713066233166895, 4.113066233166893)


### Part D2

In [8]:
delta = theta_hat_1 - theta_hat_2
var_delta = var_theta_hat_1 + var_theta_hat_2
se_delta = np.sqrt(var_delta)
t_statistic_delta = delta / se_delta
t_critical_delta = stats.t.ppf(0.975, df=anova_table_full.loc[anova_table_full["Source"]=="Error", "DF"].values[0])
ci_95_delta = (delta - t_critical_delta * se_delta, 
               delta + t_critical_delta * se_delta)
p_value_delta = 2 * (1 - stats.t.cdf(abs(t_statistic_delta), df=anova_table_full.loc[anova_table_full["Source"]=="Error", "DF"].values[0]))

print("Difference between Theta Models Results:")
print(f"delta: {delta}")
print(f"se_delta: {se_delta}")
print(f"t_stat_delta: {t_statistic_delta}")
print(f"t_critical_delta: {t_critical_delta}")
print(f"95% CI delta: {ci_95_delta}")
print(f"p-value delta: {p_value_delta}")

Difference between Theta Models Results:
delta: 8.000000000000002
se_delta: 5.649070720038828
t_stat_delta: 1.4161621258558108
t_critical_delta: 1.9809922979375063
95% CI delta: (-3.1907655869011986, 19.1907655869012)
p-value delta: 0.15945459398124529
