# 📊 Attendance & Weather Analysis
This notebook contains all the analysis steps for the DSA210 final project.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ttest_ind

# Read the two input tables; the "date" column of each is parsed as datetimes.
attendance = pd.read_csv("attendance.csv", parse_dates=["date"])
weather = pd.read_csv("weather.csv", parse_dates=["date"])

# Inner join on "date": only dates that appear in both tables are kept.
df = attendance.merge(weather, how="inner", on="date")

# Boolean weather flags used by the later cells.
# "cold": max temperature below 8 (column name suggests degrees Celsius).
df["cold"] = df["tempmax_c"].lt(8)
# "wet": precipitation above 0.5 (units not stated here; presumably mm).
df["wet"] = df["precip"].gt(0.5)
# A day counts as "cold_and_wet" only when both flags are set.
df["cold_and_wet"] = df["cold"] & df["wet"]


In [None]:
# Simplify condition labels: a row whose "conditions" text starts with "Snow"
# or "Rain" keeps that word; every other row (including missing values)
# becomes "Clear/Cloudy". The trailing "?" makes the capture group optional,
# so non-matching rows yield a missing value that fillna() then replaces.
# expand=False returns a Series instead of a one-column DataFrame, which is
# the right shape for a single-column assignment.
df["weather_type"] = (
    df["conditions"]
    .str.extract(r'^(Snow|Rain)?', expand=False)
    .fillna("Clear/Cloudy")
)

# Bar plot of mean attendance per simplified weather type.
# set_theme() is the preferred name for the legacy sns.set() alias.
sns.set_theme(style="whitegrid")
plt.figure(figsize=(6, 4))
# errorbar=None disables error bars; it replaces the `ci=None` spelling that
# was deprecated in seaborn 0.12 (so this cell requires seaborn >= 0.12).
sns.barplot(data=df, x="weather_type", y="attendance", errorbar=None)
plt.title("Average Attendance by Weather Type")
plt.xlabel("Weather Type")
plt.ylabel("Attendance Rate")
plt.tight_layout()
plt.show()

In [None]:
# Scatter plots of attendance against each weather variable.
# Each entry is (x-axis column, figure title, x-axis label); the figures are
# drawn in this order: max temperature first, then precipitation.
scatter_specs = [
    ("tempmax_c", "Attendance vs. Max Temperature", "Max Temperature (°C)"),
    ("precip", "Attendance vs. Precipitation", "Precipitation (mm)"),
]

for x_column, figure_title, x_axis_label in scatter_specs:
    # One separate figure per variable, each shown on its own.
    plt.figure(figsize=(6, 4))
    sns.scatterplot(data=df, x=x_column, y="attendance", marker="o", s=60)
    plt.title(figure_title)
    plt.xlabel(x_axis_label)
    plt.ylabel("Attendance")
    plt.tight_layout()
    plt.show()

In [None]:
# T-test: cold & wet days (group A) vs. all other days (group B).
# Missing attendance values are dropped first: ttest_ind propagates NaN by
# default, so a single missing value would turn both results into NaN, and
# the reported group sizes would count rows that never entered the test.
is_cold_and_wet = df["cold_and_wet"]
group_a = df.loc[is_cold_and_wet, "attendance"].dropna()
group_b = df.loc[~is_cold_and_wet, "attendance"].dropna()

if min(len(group_a), len(group_b)) < 2:
    # Welch's test needs a variance estimate from each group, which requires
    # at least two observations; report that explicitly instead of a bare NaN.
    t_stat = p_val = float("nan")
    print("Note: each group needs at least 2 observations for the t-test.")
else:
    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_stat, p_val = ttest_ind(group_a, group_b, equal_var=False)

print("T-statistic:", t_stat)
print("P-value:", p_val)
print("Group A size:", len(group_a))
print("Group B size:", len(group_b))
