In [None]:
# TO FILL WITH MANUALLY FILLED DATA

import pandas as pd
# Apply the manually completed sheet (manual_filled.csv) back onto the people
# and contacts tables, then overwrite both CSVs under final/.
filled = pd.read_csv("manual_filled.csv")
people = pd.read_csv("final/people.csv")
contacts = pd.read_csv("final/contacts.csv")

# Header of manual_filled.csv as recorded by the original author:
#   id,Assigned,first_name,last_name,gender,class_year,is_jewish,school,contacts
# These are the person attributes copied verbatim from the sheet onto people.
person_fields = ['first_name', 'last_name', 'gender', 'class_year', 'is_jewish', 'school']

for index, row in filled.iterrows():
    person_id = int(row['id'])

    # Locate the people row for this id; fail with a readable message instead
    # of an opaque IndexError when the sheet references an unknown person.
    matching_rows = people.index[people['id'] == person_id]
    if len(matching_rows) == 0:
        raise ValueError(f"manual_filled.csv references person id {person_id}, which is not in final/people.csv")
    person_index = matching_rows[0]

    for field in person_fields:
        people.at[person_index, field] = row[field]

    contact = row['contacts']
    if pd.isna(contact):
        continue
    # str() guards against a cell that pandas parsed as a number.
    contact = str(contact).strip().lower()

    # Compare against the column VALUES. The previous `contact not in Series`
    # test checked the Series index labels, so it was always true: every run
    # appended the contact again and the mismatch warning below never fired.
    existing_contact = contacts[contacts['contact_value'] == contact]
    if existing_contact.empty:
        contact_id = contacts['id'].max() + 1
        is_school_address = contact.endswith(("@college.harvard.edu", "@mit.edu"))
        contact_type = "school email" if is_school_address else "personal email"
        # Positional row append: relies on contacts having exactly five columns
        # in the order (id, person_id, contact_type, contact_value, <flag>).
        contacts.loc[len(contacts)] = [contact_id, person_id, contact_type, contact, False]
    else:
        existing_person_id = existing_contact['person_id'].values[0]
        if existing_person_id != person_id:
            print(f"Contact {contact} is linked to person {existing_person_id} but should be linked to {person_id}")

people.to_csv("final/people.csv", index=False)
contacts.to_csv("final/contacts.csv", index=False)

In [None]:
import pandas as pd

# Build `to_fill`: the people rows that still have missing data, with one
# school e-mail per person attached in a "contacts" column.
people = pd.read_csv("final/people.csv")
contacts = pd.read_csv("final/contacts.csv")

# Many contacts map to one person; keep only the first non-null school e-mail
# per person_id (not a list of all of them).
is_school_email = contacts['contact_type'] == "school email"
contacts_grouped = (
    contacts[is_school_email]
    .groupby("person_id")["contact_value"]
    .first()
    .reset_index()
)
people = (
    people
    .merge(contacts_grouped, left_on="id", right_on="person_id", how="left")
    .rename(columns={"contact_value": "contacts"})
    .drop(columns=["person_id", "preferred_name"])
)

# Rows whose school is "Other" only need the core identity fields; every other
# row must have no null in any column.
school_is_other = people["school"] == "Other"
required_for_other = ["first_name", "last_name", "gender", "is_jewish", "school"]

regular_rows = people[~school_is_other]
to_fill = regular_rows[regular_rows.isna().any(axis=1)]

other_rows = people[school_is_other]
to_fill_other = other_rows[other_rows[required_for_other].isna().any(axis=1)]

to_fill = pd.concat([to_fill, to_fill_other]).drop_duplicates()


def _class_year_as_int(value):
    """Return the class year as an int, or pd.NA when it is missing."""
    if pd.isna(value):
        return pd.NA
    return int(value)


# Show class years as whole numbers, then blank out every remaining null so the
# exported sheet has empty cells rather than "NaN".
to_fill["class_year"] = to_fill["class_year"].apply(_class_year_as_int)
to_fill.fillna("", inplace=True)

In [None]:
# Export the incomplete rows (without the DataFrame index) for manual editing.
to_fill.to_csv(path_or_buf="to_fill.csv", index=False)

# Analysis

In [4]:
import pandas as pd

# add is first event
import pandas as pd

# Build a per-event attendance summary (counts plus demographic percentages)
# and export it to final_summary.xlsx.

# Load attendance rows; the first CSV column is dropped positionally.
attendance = pd.read_csv('final/attendance.csv').iloc[:, 1:]
# NOTE(review): astype(bool) maps missing values (NaN) to True, so blank
# "approved" cells are counted as approved in the sums below -- confirm.
attendance["approved"] = attendance["approved"].astype(bool)

events = pd.read_csv('final/events.csv')
people = pd.read_csv("final/people.csv")

# --- Flag each person's first attended event --------------------------------
# Bring in the event start time, dropping the events 'id' column that the
# merge adds alongside 'event_id'.
attendance = (
    attendance
    .merge(events[['id', 'start_datetime']], left_on='event_id', right_on='id', how='left')
    .drop(columns=['id'])
)
attendance['start_datetime'] = pd.to_datetime(attendance['start_datetime'])

attendance_sorted = attendance.sort_values(by=['person_id', 'start_datetime'])

# Earliest start time, per person, among rows that were actually checked in.
first_event_times = (
    attendance_sorted[attendance_sorted['checked_in'] == True]
    .groupby('person_id', as_index=False)['start_datetime']
    .min()
)


def _person_time_key(frame):
    """Return a '<person_id>|<start_datetime>' string key for each row of frame."""
    return frame['person_id'].astype(str) + "|" + frame['start_datetime'].astype(str)


# A row is a "first event" when its (person, start time) pair equals that
# person's earliest checked-in start time.
attendance_sorted['is_first_event'] = _person_time_key(attendance_sorted).isin(
    _person_time_key(first_event_times)
)
attendance_sorted = (
    attendance_sorted
    .sort_values(by=['start_datetime'])
    .drop(columns=['start_datetime'])
)
attendance = attendance_sorted

# --- Attach person attributes and standardize them --------------------------
attendance = attendance.merge(people, left_on="person_id", right_on="id", how="left")

for column in ("is_jewish", "gender"):
    attendance[column] = attendance[column].apply(lambda x: x if pd.isna(x) else x.upper())

# --- Per-event counts -------------------------------------------------------
agg_attendance = attendance.groupby("event_id").agg({
    "rsvp": "sum",
    "approved": "sum",
    "checked_in": "sum",
    "is_first_event": "sum"
}).reset_index()

# Missing demographics get placeholder categories so they still count toward
# the denominators of the percentages below.
attendance['is_jewish'] = attendance['is_jewish'].fillna("N/A")
attendance['gender'] = attendance['gender'].fillna("N/A")
attendance['class_year'] = attendance['class_year'].fillna(1000)


def _share_by_event(column):
    """Return, per event_id, the fraction of attendance rows in each category of column."""
    counts = attendance.pivot_table(index="event_id", columns=column, aggfunc="size")
    return counts.div(counts.sum(axis=1), axis=0).reset_index()


jewish_percentages = _share_by_event("is_jewish")
gender_percentages = _share_by_event("gender")
class_year_percentages = _share_by_event("class_year")

# --- Combine ----------------------------------------------------------------
# Category labels that clash across tables (e.g. "N/A") receive a suffix on
# the right-hand side of each merge.
summary = agg_attendance \
    .merge(jewish_percentages, on="event_id", how="left", suffixes=(None, '_jewish')) \
    .merge(gender_percentages, on="event_id", how="left", suffixes=(None, '_gender')) \
    .merge(class_year_percentages, on="event_id", how="left", suffixes=(None, '_class_year'))

final_summary = (
    summary
    .merge(events[["id", "category", "description"]], left_on="event_id", right_on="id", how="left")
    .drop(columns=['id'])
)

# Columns written to the report, in order. reindex (rather than [] selection
# and explicit drops) keeps the cell from raising KeyError when a category --
# including the "N/A"/1000 placeholders -- does not occur in the data; such a
# column is simply left blank.
count_columns = ["category", "description", "rsvp", "approved", "checked_in", "is_first_event"]
percent_columns = ["J", "N", "F", "M", 2025.0, 2026.0, 2027.0, 2028.0]
final_summary = final_summary.reindex(columns=count_columns + percent_columns)

for column in percent_columns:
    final_summary[column] = final_summary[column].apply(
        lambda x: f"{x:.2%}" if pd.notna(x) else pd.NA
    )

final_summary.to_excel("final_summary.xlsx", index=False)

# Make Email List

In [7]:
# When True, write MIT and non-MIT recipients to two files; otherwise write a
# single combined mailing list.
separate = False

import pandas as pd

people   = pd.read_csv("final/people.csv")
contacts = pd.read_csv("final/contacts.csv")
attendance  = pd.read_csv("final/attendance.csv")

# 1) pick out e-mail subsets; rows without an address cannot be mailed
usable_contacts = contacts.dropna(subset=["contact_value"])
school_emails = usable_contacts[usable_contacts["contact_type"] == "school email"]
all_emails    = usable_contacts[usable_contacts["contact_type"].str.contains("email", na=False)]


def _with_emails(person_rows, email_rows, priority):
    """Pair each person with their e-mail rows and tag the result with a priority.

    Uses an inner join so that people without any matching e-mail are left
    out. The previous left joins kept them with a null person_id, and the
    de-duplication step then collapsed all of them into one row with an empty
    address.
    """
    return (
        person_rows
        .merge(email_rows, left_on="id", right_on="person_id", how="inner")
        .assign(priority=priority)
    )


# 2) assemble with priority tags (lower number wins during de-duplication)
school_lower = people["school"].str.lower()
non_berklee_emails = all_emails[
    ~all_emails["contact_value"].astype(str).str.endswith("berklee.edu")
]

group1 = _with_emails(people, school_emails, 1)                        # anyone with a school e-mail
group2 = _with_emails(people[school_lower == "harvard"], all_emails, 2)
group3 = _with_emails(people[school_lower == "mit"], all_emails, 3)
group4 = _with_emails(                                                 # everyone else, minus berklee.edu
    people[~school_lower.isin(["mit", "harvard"])],
    non_berklee_emails,
    4,
)

# 3) concat + dedupe on person_id
# A stable sort keeps the original row order within each priority, so the
# address retained for a person is deterministic.
all_people = (
    pd.concat([group1, group2, group3, group4], ignore_index=True)
    .sort_values("priority", kind="stable")
    .drop_duplicates(subset=["person_id"], keep="first")
)

# Number of checked-in attendance rows per person.
attendance = attendance[attendance["checked_in"] == True]
event_counts = (
    attendance.groupby("person_id")["id"]
    .count()
    .reset_index()
    .rename(columns={"id": "event_count"})
)

all_people = all_people.merge(event_counts, on="person_id", how="left")
all_people['event_count'] = all_people['event_count'].fillna(0)

# 4) keep only the columns you need
all_people = all_people[["first_name", "last_name", "school", "contact_value", "event_count"]]

if separate:
    # 5) split MIT vs others with a consistent boolean mask
    mask_mit = (
        all_people["school"].str.lower().eq("mit")
        | all_people["contact_value"].astype(str).str.endswith("mit.edu")
    )
    mit   = all_people[mask_mit].copy()
    other = all_people[~mask_mit].copy()

    # 6) write out
    mit.to_csv("mit_mailing.csv",        index=False)
    other.to_csv("zakarias_mailing.csv", index=False)
else:
    # 5) write out
    all_people.to_csv("all_mailing.csv", index=False)