In [472]:
import pandas as pd
import numpy as np
import re

def clean_na_values(df):
    """
    Replace a broad range of 'NA-like' string patterns, empty strings, and placeholders with pd.NA.
    
    Args:
        df (pd.DataFrame): DataFrame to clean.
    Returns:
        pd.DataFrame: Cleaned DataFrame with standardized NA values.
    """
    # Expanded patterns to recognize as NA
    na_patterns = [
        r"^\s*$",               # empty or whitespace only
        r"^(n/?a)$",            # NA, N/A, n/a, na
        r"^(none)$",            # none
        r"^(null)$",            # null
        r"^<na>$",              # <NA>
        r"^#n/a$",              # #N/A
        r"^\[?na\]?$",          # [NA], NA
        r"^nan$",               # nan, NaN
        r"^nil$",               # nil
        r"^missing$",           # missing
        r"^unknown$",           # unknown
        r"^unavailable$",       # unavailable
        r"^undisclosed$",       # undisclosed
        r"^tbd$",               # TBD
        r"^#value!$",           # #VALUE!
        r"^\?$",                # ?
        r"^-+$",                # -, --, ---
        r"^\.\.\.$",            # ...
    ]
    
    # Combine patterns into a regex
    na_regex = re.compile("|".join(na_patterns), re.IGNORECASE)
    
    # Apply regex replacement to all string-like elements
    return df.applymap(lambda x: pd.NA if isinstance(x, str) and na_regex.match(x.strip()) else x)

In [473]:
import pandas as pd

# Load and clean main dataset
base = pd.read_csv("Raw/all_people.csv").replace("not harvard", pd.NA)

# Standardizing grades and names
base["Grade"] = (
    base["Grade"]
    .replace({"S": "Senior", "Other": pd.NA, "Graduate": pd.NA, "GRAD": pd.NA})
    .replace({"Senior": 2025, "Junior": 2026, "Sophomore": 2027, "Freshman": 2028})
)
base["J/N"] = base["J/N"].replace("F", pd.NA)
base["First Name"] = base["first_name"].str.strip().str.lower()
base["Last Name"] = base["last_name"].str.strip().str.lower()
base = base.dropna(subset=["last_name"]).reset_index(drop=True)

# Assuming you have a `clean_na_values` function
base = clean_na_values(base)
base["id"] = base.index

# ------------------- NEW EMAIL EXTRACTION LOGIC -------------------

# Function to extract emails based on domain pattern
def extract_emails(df, email_columns, edu=True):
    email_list = []
    for col in email_columns:
        temp = df[["id", col]].dropna(subset=[col]).copy()
        if edu:
            temp = temp[temp[col].str.endswith(".edu", na=False)]
        else:
            temp = temp[~temp[col].str.endswith(".edu", na=False)]
        temp["contact_type"] = "school email" if edu else "personal email"
        temp.columns = ["person_id", "contact_value", "contact_type"]
        email_list.append(temp)
    # Concatenate and drop duplicates
    result = pd.concat(email_list).drop_duplicates(subset=["person_id", "contact_value"]).reset_index(drop=True)
    return result

# Extract school emails (.edu) and personal emails (non-.edu) from both columns
school_emails = extract_emails(base, ["School Email", "Emails"], edu=True)
personal_emails = extract_emails(base, ["School Email", "Emails"], edu=False)

# Combine all emails, deduplicate on same contact_value per person
all_emails = pd.concat([school_emails, personal_emails])
all_emails = all_emails.drop_duplicates(subset=["person_id", "contact_value"]).reset_index(drop=True)
all_emails["id"] = all_emails.index
all_emails["is_verified"] = False
all_emails = all_emails[["id", "person_id", "contact_type", "contact_value", "is_verified"]]

# ------------------- CLEAN FINAL PEOPLE BASE -------------------

base = base[["id", "First Name", "Last Name", "Gender", "Grade", "J/N", "School"]]
base.columns = ["id", "first_name", "last_name", "gender", "class_year", "is_jewish", "school"]
base["preferred_name"] = pd.NA
base = base.reset_index(drop=True)

# ------------------- DEBUGGING OUTPUT -------------------

for col in base.columns:
    print(f"Column: {col}")
    display(base[col].value_counts(dropna=False))

print("Missing values summary:")
display(base.isna().sum())

# Preview of final emails
print("Final extracted emails preview:")
display(all_emails.head())

Column: id


id
0       1
803     1
801     1
800     1
799     1
       ..
398     1
397     1
396     1
395     1
1195    1
Name: count, Length: 1196, dtype: int64

Column: first_name


first_name
alex       12
michael    10
emma        8
matthew     8
jack        7
           ..
riley       1
kritika     1
jinet       1
amiya       1
sebs        1
Name: count, Length: 765, dtype: int64

Column: last_name


last_name
wang              10
lee                8
kim                7
li                 5
cohen              5
                  ..
murigande          1
jm                 1
kalish-schur       1
mikrogiannakis     1
colegio            1
Name: count, Length: 1025, dtype: int64

Column: gender


gender
NaN    658
F      279
M      259
Name: count, dtype: int64

Column: class_year


class_year
NaN     461
2028    253
2025    192
2026    162
2027    119
<NA>      9
Name: count, dtype: int64

Column: is_jewish


is_jewish
NaN     605
N       403
J       185
<NA>      3
Name: count, dtype: int64

Column: school


school
Harvard    525
NaN        273
MIT        254
<NA>        83
Other       61
Name: count, dtype: int64

Column: preferred_name


preferred_name
<NA>    1196
Name: count, dtype: int64

Missing values summary:


id                   0
first_name           0
last_name            0
gender             658
class_year         470
is_jewish          608
school             356
preferred_name    1196
dtype: int64

Final extracted emails preview:


Unnamed: 0,id,person_id,contact_type,contact_value,is_verified
0,0,0,school email,zakarias_erdos@college.harvard.edu,False
1,1,1,school email,Yehudahtor@college.harvard.edu,False
2,2,2,school email,zachabrams@college.harvard.edu,False
3,3,3,school email,smikhail@college.harvard.edu,False
4,4,4,school email,farukoztok@college.harvard.edu,False


In [474]:
teds_id = base[(base["first_name"] == "theodore") & (base["last_name"] == "sunshine")]["id"].values[0]
all_emails.loc[len(all_emails)] = [len(all_emails), teds_id, "personal email", "tedsunshine@gmail.com", False]

In [475]:
all_emails[all_emails["person_id"] == 85]


Unnamed: 0,id,person_id,contact_type,contact_value,is_verified
80,80,85,school email,theodore_sunshine@college.harvard.edu,False
722,722,85,personal email,teddysunshine@icloud.com,False
1205,1205,85,personal email,tedsunshine@gmail.com,False


In [476]:
base.to_csv("final/people.csv", index=False)
all_emails.to_csv("final/contacts.csv", index=False)

In [477]:
events_data = [
    ["prelaunch", "speaker", "sunshine residence", "September 26, 2024, 5:30pm EST", "Jonathan Kraft spoke to students at the Sunshine residence"],
    ["launch", "party", "prudential center", "September 26, 2024, 7:30pm EST", "Party at the top of the prudential center"],
    ["russell house", "speaker dinner", "russell house", "October 14, 2024, 7:30pm EST" , "Join us for our first official event this coming Monday, October 14th at Russell House Tavern from 7:30 - 9:30 with dinner and of course an open bar. You’ll meet and chat with several Israeli tech founders. Our featured guests include the co-founder and CTO of a cloud security company acquired by Cisco, as well as a tech innovator who has developed an app that uses augmented reality to make art and creativity more accessible."],
    ["viale", "speaker dinner", "viale", "October 30, 2024, 7:45pm EST", "We’re excited to invite you to the next event hosted by The Camel at Viale, Central Square. As always, drinks and food are on us! Join us for a conversation with Boaz Fachler, Principal at Link Ventures, Dror Zaide, Co-Founder at Eleos Health, and a surprise guest (Adam Valkin), followed by a cocktail reception and dinner."],
    ["sababa nights", "party", "Dx", "November 7, 2024, 10:00pm EST", "Disco Rave this Thursday! Harvard x MIT party at the DX hosted by the Camel. Open bar and live DJ from Berklee. Bring your friends!" ],
    ["amplify", "speaker dinner", "bar enza", "November 13, 2024, 7:00pm EST", "Amplify VC hosted/funded dinner at bar enza"],
    ["community dinners I", "community dinner", "various", "November 20, 2024, 7:00pm EST", "Community dinners hosted by Camel, monday, wednesday, and thursday options"],
    ["secret sip", "community dinner", "sunshine residence", "December 6, 2024, 6:00pm EST", "Join us for a Shabbat dinner and afterparty with an intimate group of Harvard and MIT Camel community, just before we wrap up the semester."],
    ["pentera", "speaker dinner", "daedalus", "February 5, 2025, 7:00pm EST", "Hi Everyone! This coming Wednesday, Amitai Ratzon will be joining us at Bar Enza at 7 pm. Amitai is the CEO of Pentera, a cybersecurity unicorn. As always, there will be open bar and dinner."],
    ["bsmnt", "party", "bsmnt", "February 6, 2025, 9:00pm EST", "There’s a Camel in the BSMNT. We don’t know how it got there. But it’s throwing a rave. We want to welcome you all back to school with a night at BSMNT—Boston’s newest nightclub. As always, there will be open bar all night, and DJ Costa will be on deck."],
    ["zelnick speaker", "speaker", "sheraton commander", "February 13, 2025, 6:00pm EST", "This Thursday (February 13th) at 6 pm, we are bringing Strauss Zelnick to Sheraton Commander for an exclusive evening of drinks and conversation. Strauss is the CEO of Take-Two Interactive — the creators of NBA 2K and GTA — and the former chairman of the media giant CBS Corporation. He will be joining us to share insights on his career in Private Equity and Entertainment, and maybe even why GTA VI still hasn’t been released… See you soon!"],
    ["zelnick dinner", "speaker dinner", "gufo", "February 13, 2025, 7:30pm EST", "Hi, We’re hosting a private, invite-only dinner with Strauss Zelnick, CEO of Take-Two Interactive (NBA 2K and GTA) and former chairman of CBS Corporation, and we’d love to have you there!" ],
    ["community dinners II", "community dinner", "various", "February 26, 2025, 7:30pm EST", "Community dinners hosted by Camel"],
]

events = pd.DataFrame(events_data, columns = ["event_name", "category", "location", "start_datetime", "description"])

In [478]:
events["start_datetime"] = pd.to_datetime(events["start_datetime"], utc=False, format="%B %d, %Y, %I:%M%p %Z", errors="coerce")
events.sort_values(by="start_datetime", inplace=True)
events.reset_index(drop=True, inplace=True)
events["id"] = events.index
events=events[["id", "event_name", "category", "location", "start_datetime", "description"]]
events.to_csv("final/events.csv", index=False)
events

Unnamed: 0,id,event_name,category,location,start_datetime,description
0,0,prelaunch,speaker,sunshine residence,2024-09-26 17:30:00-05:00,Jonathan Kraft spoke to students at the Sunshi...
1,1,launch,party,prudential center,2024-09-26 19:30:00-05:00,Party at the top of the prudential center
2,2,russell house,speaker dinner,russell house,2024-10-14 19:30:00-05:00,Join us for our first official event this comi...
3,3,viale,speaker dinner,viale,2024-10-30 19:45:00-05:00,We’re excited to invite you to the next event ...
4,4,sababa nights,party,Dx,2024-11-07 22:00:00-05:00,Disco Rave this Thursday! Harvard x MIT party ...
5,5,amplify,speaker dinner,bar enza,2024-11-13 19:00:00-05:00,Amplify VC hosted/funded dinner at bar enza
6,6,community dinners I,community dinner,various,2024-11-20 19:00:00-05:00,"Community dinners hosted by Camel, monday, wed..."
7,7,secret sip,community dinner,sunshine residence,2024-12-06 18:00:00-05:00,Join us for a Shabbat dinner and afterparty wi...
8,8,pentera,speaker dinner,daedalus,2025-02-05 19:00:00-05:00,"Hi Everyone! This coming Wednesday, Amitai Rat..."
9,9,bsmnt,party,bsmnt,2025-02-06 21:00:00-05:00,There’s a Camel in the BSMNT. We don’t know ho...


# Merging

In [479]:
import pandas as pd
from difflib import SequenceMatcher

def fuzzy_ratio(str_a, str_b):
    return SequenceMatcher(None, str_a, str_b).ratio()

def is_initial(name):
    cleaned = name.strip().lower()
    if len(cleaned) == 1 and cleaned.isalpha():
        return True
    if len(cleaned) == 2 and cleaned[0].isalpha() and cleaned[1] == ".":
        return True
    return False

def compare_names(fn_ta, ln_ta, fn_sheet, ln_sheet, fuzzy_threshold=0.80):
    fn_ta = fn_ta.strip().lower()
    ln_ta = ln_ta.strip().lower()
    fn_sheet = fn_sheet.strip().lower()
    ln_sheet = ln_sheet.strip().lower()

    # ===== STEP 1: Check if both first and last names are exact match =====
    if fn_ta == fn_sheet and ln_ta == ln_sheet:
        return "auto_accept"
    # if one name is exact match and the other is a substring
    elif (fn_ta in fn_sheet and ln_sheet == ln_ta) or (fn_sheet in fn_ta and ln_ta == ln_sheet):
        return "auto_accept"
    elif (fn_ta == fn_sheet and ln_ta in ln_sheet) or (fn_sheet == fn_ta and ln_sheet in ln_ta):
        return "auto_accept"

    # ===== STEP 2: Handle initials only when necessary =====
    fn_ta_is_initial = is_initial(fn_ta)
    ln_ta_is_initial = is_initial(ln_ta)

    if fn_ta_is_initial or ln_ta_is_initial:
        # Initial matching logic:
        if fn_ta_is_initial:
            letter = fn_ta[0].lower()
            if not fn_sheet.startswith(letter):
                return "reject_now"
        if ln_ta_is_initial:
            letter = ln_ta[0].lower()
            if not ln_sheet.startswith(letter):
                return "reject_now"

        # If one of them is initial, require a good fuzzy match on the other
        if fn_ta_is_initial and not ln_ta_is_initial:
            ratio_last = fuzzy_ratio(ln_ta, ln_sheet)
            if ratio_last >= fuzzy_threshold:
                return "manual_review"
            else:
                return "reject_now"
        elif ln_ta_is_initial and not fn_ta_is_initial:
            ratio_first = fuzzy_ratio(fn_ta, fn_sheet)
            if ratio_first >= fuzzy_threshold:
                return "manual_review"
            else:
                return "reject_now"
        else:
            # Both initials that passed .startswith checks but aren't exact match
            return "manual_review"

    # ===== STEP 3: Fuzzy logic =====
    exact_first = (fn_ta == fn_sheet)
    exact_last = (ln_ta == ln_sheet)
    ratio_first = fuzzy_ratio(fn_ta, fn_sheet)
    ratio_last = fuzzy_ratio(ln_ta, ln_sheet)

    # If one name is exact, and the other is a good fuzzy match
    if exact_first and ratio_last >= fuzzy_threshold:
        print(f"Matching {fn_ta} {ln_ta} to {fn_sheet} {ln_sheet}")
        return "auto_accept"
    if exact_last and ratio_first >= fuzzy_threshold:
        print(f"Matching {fn_ta} {ln_ta} to {fn_sheet} {ln_sheet}")
        return "auto_accept"

    # If both are good fuzzy matches (but not exact)
    if ratio_first >= fuzzy_threshold and ratio_last >= fuzzy_threshold:
        return "manual_review"
    # Otherwise, reject
    return "reject_now"

In [480]:
def update_names_if_substring(df, idx, sheet_first, sheet_last, input_first, input_last):
    """
    Update the first_name and last_name in df at idx to the longer version
    between input and sheet names if one is a substring of the other.
    """
    # First name check
    if sheet_first.lower() in input_first.lower() or input_first.lower() in sheet_first.lower():
        longer_first = max(sheet_first, input_first, key=len)
        df.at[idx, 'first_name'] = longer_first
    
    # Last name check (if both are provided)
    if pd.notna(sheet_last) and pd.notna(input_last):
        if sheet_last.lower() in input_last.lower() or input_last.lower() in sheet_last.lower():
            longer_last = max(sheet_last, input_last, key=len)
            df.at[idx, 'last_name'] = longer_last

In [481]:
import pandas as pd

def find_person_id(
    row,
    people_df,
    contacts_df,
    email_col=None,
    handle_indices_list=None,
    fuzzy_threshold=0.80,  # Passed to compare_names
):
    if pd.isna(row["first_name"]):
        raise ValueError("First name is missing in the row.")

    first_name = row["first_name"].strip().lower()
    last_name = row.get("last_name")
    last_name = last_name.strip().lower() if pd.notna(last_name) else None

    # 1) Email matching
    if email_col and email_col in row and pd.notna(row[email_col]):
        email = str(row[email_col]).strip().lower()
        email_matches = contacts_df[contacts_df["contact_value"].str.lower() == email]
        if not email_matches.empty:
            person_id = email_matches.iloc[0]["person_id"]
            idx = people_df[people_df["id"] == person_id].index[0]  # map to index
            update_names_if_substring(people_df, idx, people_df.loc[idx, "first_name"], people_df.loc[idx, "last_name"], first_name, last_name)
            return person_id, people_df
        print(f"Could not find person with email: {email}")
        handle_indices_list.append((first_name, last_name))
    else:
        email = None

    # 2) Exact name matching
    potentials = people_df[
        (people_df["first_name"].str.lower() == first_name) &
        ((people_df["last_name"].str.lower() == last_name) if last_name else True)
    ]

    if len(potentials) == 1:
        idx = potentials.index[0]
        update_names_if_substring(people_df, idx, potentials.loc[idx, "first_name"], potentials.loc[idx, "last_name"], first_name, last_name)
        return potentials.loc[idx, "id"], people_df

    elif len(potentials) > 1:
        options = [
            f"{i} => {p['first_name']} {p['last_name']} (gender={p['gender']}, jewish={p['is_jewish']})"
            for i, (_, p) in enumerate(potentials.iterrows())
        ]
        options_str = "\n".join(options)
        choice = input(f"Multiple exact matches for '{first_name} {last_name or ''} and email {email}'. Choose one:\n\n{options_str}\n\nSelect index or 'n' to skip: ")
        if choice.lower() == "n":
            handle_indices_list.append((first_name, last_name))
            return None, people_df
        try:
            selected_idx = potentials.index[int(choice)]
            person_id = people_df.loc[selected_idx, "id"]
            update_names_if_substring(people_df, selected_idx, people_df.loc[selected_idx, "first_name"], people_df.loc[selected_idx, "last_name"], first_name, last_name)
            return person_id, people_df
        except:
            print("Invalid choice. Skipping.")
            handle_indices_list.append((first_name, last_name))
            return None, people_df

    # 3) Fuzzy matching
    auto_accepts, manual_reviews = [], []
    for idx, candidate in people_df.iterrows():
        verdict = compare_names(
            first_name,
            (last_name or ""),
            candidate["first_name"],
            candidate["last_name"] if pd.notna(candidate["last_name"]) else "",
            fuzzy_threshold,
        )
        if verdict == "auto_accept":
            auto_accepts.append(idx)
        elif verdict == "manual_review":
            manual_reviews.append(idx)

    # Handle auto-accept
    if len(auto_accepts) == 1:
        idx = auto_accepts[0]
        update_names_if_substring(people_df, idx, people_df.loc[idx, "first_name"], people_df.loc[idx, "last_name"], first_name, last_name)
        return people_df.loc[idx, "id"], people_df

    if len(auto_accepts) > 1:
        options = [
            f"{i} => {people_df.loc[idx, 'first_name']} {people_df.loc[idx, 'last_name']} (gender={people_df.loc[idx, 'gender']}, jewish={people_df.loc[idx, 'is_jewish']})"
            for i, idx in enumerate(auto_accepts)
        ]
        options_str = "\n".join(options)
        choice = input(f"Multiple 'auto_accept' matches for '{first_name} {last_name or ''} and email {email}'. Choose one:\n\n{options_str}\n\nSelect index or 'n' to skip: ")
        if choice.lower() == "n":
            handle_indices_list.append((first_name, last_name))
            return None, people_df
        try:
            idx = auto_accepts[int(choice)]
            update_names_if_substring(people_df, idx, people_df.loc[idx, "first_name"], people_df.loc[idx, "last_name"], first_name, last_name)
            return people_df.loc[idx, "id"], people_df
        except:
            print("Invalid choice. Skipping.")
            handle_indices_list.append((first_name, last_name))
            return None, people_df

    # Manual review
    if manual_reviews:
        options = [
            f"{i} => {people_df.loc[idx, 'first_name']} {people_df.loc[idx, 'last_name']} (gender={people_df.loc[idx, 'gender']}, jewish={people_df.loc[idx, 'is_jewish']})"
            for i, idx in enumerate(manual_reviews)
        ]
        options_str = "\n".join(options)
        choice = input(f"No auto-accept found for '{first_name} {last_name or ''}', but possible matches:\n\n{options_str}\n\nSelect index or 'n' to skip: ")
        if choice.lower() == "n":
            handle_indices_list.append((first_name, last_name))
            return None, people_df
        try:
            idx = manual_reviews[int(choice)]
            update_names_if_substring(people_df, idx, people_df.loc[idx, "first_name"], people_df.loc[idx, "last_name"], first_name, last_name)
            return people_df.loc[idx, "id"], people_df
        except:
            print("Invalid choice. Skipping.")
            handle_indices_list.append((first_name, last_name))
            return None, people_df

    # No matches
    print(f"No match found for '{first_name} {last_name or ''}'.")
    handle_indices_list.append((first_name, last_name))
    return None, people_df

In [484]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import os


people = pd.read_csv("final/people.csv")
people['class_year'] = people['class_year']
contacts = pd.read_csv("final/contacts.csv")
events = pd.read_csv("final/events.csv")

to_handle = {}

attended_sheets = [f for f in os.listdir("Raw") if "Sheets" in f]
no_rsvp_sheets = ["Amplify_Sheets.csv", "Community Dinners II_Sheets.csv", "CommunityDinners_Sheets.csv"]
attended_sheets = [f for f in attended_sheets if f not in no_rsvp_sheets]
rsvp_sheets = [f for f in os.listdir("Raw") if "Posh" in f or "Partiful" in f]

print(f"Found {len(attended_sheets)} attended sheets and {len(rsvp_sheets)} rsvp sheets, total {len([f for f in os.listdir("Raw") if f.endswith(".csv")])}")
# Create final attendance DF
attendance = pd.DataFrame(
    columns=[
        "person_id",
        "event_id",
        "rsvp",
        "approved",
        "checked_in",
        "rsvp_datetime",
        "is_first_event",
        "invite_token_id",
    ]
)



###########################################################
# rsvp params
###########################################################
eid_key = {
    "Launch_Partiful.csv": 1,
    "Pentera_Posh.csv": 8,
    "PreLaunch_Partiful.csv": 0,
    "Russell_Partiful.csv": 2,
    "SababaNights_Partiful.csv": 4,
    "SecretSip_Partiful.csv": 7,
    "Viale_Partiful.csv": 3,
    "ZelnickDinner_Posh.csv": 11,
    "ZelnickSpeaker_Posh.csv": 10,
    "BSMNT_Posh.csv": 9,
}
indices_to_handle_rsvp = []
for rsvp_csv_file_name in rsvp_sheets:
    eid = eid_key[rsvp_csv_file_name]
    rsvp_date_col = "rsvp_date" if "partiful" in rsvp_csv_file_name.lower() else "order_date"
    approved_col = "approved"

    # Read CSVs
    rsvp_csv = pd.read_csv(f"Raw/{rsvp_csv_file_name}")
    # Split the 'name' column into 'first_name' and 'last_name'
    rsvp_csv[["first_name", "last_name"]] = (
        rsvp_csv["name"].fillna("").str.split(" ", n=1, expand=True)
    )

    rsvp_email_col = "email" if "email" in rsvp_csv.columns else None

    # Convert RSVP date columns to datetime with a timezone
    rsvp_csv[rsvp_date_col] = pd.to_datetime(rsvp_csv[rsvp_date_col], errors="coerce")
    rsvp_csv[rsvp_date_col] = rsvp_csv[rsvp_date_col].dt.tz_localize(
        "America/New_York", ambiguous="NaT", nonexistent="NaT"
    )
    # -----------------------------------------------------------------------------
    # Process RSVPs
    # -----------------------------------------------------------------------------
    print(f"Handling RSVPs for {rsvp_csv_file_name}...\n")
    for i, rsvp_row in rsvp_csv.iterrows():
        pid, people = find_person_id(
            rsvp_row,
            people,
            contacts,
            email_col=rsvp_email_col,
            handle_indices_list=indices_to_handle_rsvp,
        )
        if pid is not None:
            if rsvp_csv_file_name == "BSMNT_Posh.csv":
                checked_in = rsvp_row["scanned"] == 1
            else:
                checked_in = False
            attendance.loc[len(attendance)] = [
                pid,
                eid,
                True,                             # rsvp = True
                rsvp_row[approved_col],          # approved?
                checked_in,                           # checked_in defaults to False
                pd.to_datetime(rsvp_row[rsvp_date_col]),
                False,                           # is_first_event
                pd.NA,                           # invite_token_id
            ]

# -----------------------------------------------------------------------------
# Process Attendance
# -----------------------------------------------------------------------------
##########################################################
# attendance params
##########################################################
eid_key = {
    "Launch_Sheets.csv": 1,
    "Pentera_Sheets.csv": 8,
    "PreLaunch_Sheets.csv": 0,
    "Russell_Sheets.csv": 2,
    "SababaNights_Sheets.csv": 4,
    "SecretSip_Sheets.csv": 7,
    "Viale_Sheets.csv": 3,
    "ZelnickDinner_Sheets.csv": 11,
    "ZelnickSpeaker_Sheets.csv": 10,
}

num_attend_no_rsvp = 0
indices_to_handle_attendance = []
for attended_csv_file_name in attended_sheets:
    eid = eid_key[attended_csv_file_name]
    attended_email_col = "email"
    attended_csv = pd.read_csv(f"Raw/{attended_csv_file_name}")

    # Split the 'name' column into 'first_name' and 'last_name'
    attended_csv[["first_name", "last_name"]] = (
        attended_csv["name"].fillna("").str.split(" ", n=1, expand=True)
    )



    print(f"Handling attendance for {attended_csv_file_name}...\n")
    for i, att_row in attended_csv.iterrows():
        pid, people = find_person_id(
            att_row,
            people,
            contacts,
            email_col=attended_email_col,
            handle_indices_list=indices_to_handle_attendance,
        )
        if pid is not None:
            # Instead of modifying "attendance[attendance['person_id'] == pid]['checked_in'] = True",
            # which can generate SettingWithCopyWarning, use 'loc' to update properly:
            if attendance.loc[((attendance["person_id"] == pid) & (attendance['event_id'] == eid)), "checked_in"].empty:
                num_attend_no_rsvp += 1
                attendance.loc[len(attendance)] = [
                    pid,
                    eid,
                    True,    # rsvp
                    True,    # approved
                    True,    # checked_in
                    pd.NaT,  # rsvp_datetime
                    False,   # is_first_event
                    pd.NA,   # invite_token_id
                ]
            else:
                attendance.loc[((attendance["person_id"] == pid) & (attendance['event_id'] == eid)), "checked_in"] = True



# for these ones, rsvp is done with attendance because there was no rsvp, so we fill the columns as approved
eid_key = { "Amplify_Sheets.csv": 5,
    "Community Dinners II_Sheets.csv": 12,
    "CommunityDinners_Sheets.csv": 6,}
indices_to_handle_no_rsvp = []
for norsvp_csv_file_name in no_rsvp_sheets:
    eid = eid_key[norsvp_csv_file_name]
    norsvp_email_col = "email"

    # Read CSVs
    norsvp_csv = pd.read_csv(f"Raw/{norsvp_csv_file_name}")
    norsvp_csv[["first_name", "last_name"]] = (
        norsvp_csv["name"].fillna("").str.split(" ", n=1, expand=True)
    )

    print(f"Handling no RSVPs for {norsvp_csv_file_name}...\n")
    # -----------------------------------------------------------------------------
    # Process RSVPs
    # -----------------------------------------------------------------------------
    for i, norsvp_row in norsvp_csv.iterrows():
        pid, people = find_person_id(
            norsvp_row,
            people,
            contacts,
            email_col=norsvp_email_col,
            handle_indices_list=indices_to_handle_no_rsvp,
        )
        if pid is not None:  # <--- Add this guard
            attendance.loc[len(attendance)] = [
                pid,
                eid,
                True,    # rsvp
                True,    # approved
                True,    # checked_in
                pd.NaT,  # rsvp_datetime
                False,   # is_first_event
                pd.NA,   # invite_token_id
            ]

# summmary data
print(f"Found {num_attend_no_rsvp} attendees with no RSVP.")
print(f"Made {len(attendance)} attendance entries.")
print(f"Found {len(indices_to_handle_rsvp + indices_to_handle_attendance + indices_to_handle_no_rsvp)} rsvp indices to handle.")

Found 9 attended sheets and 10 rsvp sheets, total 23
Handling RSVPs for Pentera_Posh.csv...

Handling RSVPs for SababaNights_Partiful.csv...

No match found for ' '.


  rsvp_csv[rsvp_date_col] = pd.to_datetime(rsvp_csv[rsvp_date_col], errors="coerce")


No match found for 'courtney '.
No match found for 'dreese '.
No match found for 'e '.
No match found for 'eliska '.
No match found for 'fernanda '.
No match found for 'francisco '.
No match found for 'gabe evers'.
No match found for 'johnson '.
No match found for 'kami '.
No match found for 'kiran '.
Matching luke huang to luke chung
No match found for 'nitish '.
No match found for 'nolan '.
No match found for 'prash '.
No match found for 'samay '.
No match found for 'saphina '.
Matching simona letizia to simone letizia
No match found for 'sreela '.
Handling RSVPs for ZelnickSpeaker_Posh.csv...



  rsvp_csv[rsvp_date_col] = pd.to_datetime(rsvp_csv[rsvp_date_col], errors="coerce")
  rsvp_csv[rsvp_date_col] = pd.to_datetime(rsvp_csv[rsvp_date_col], errors="coerce")


Handling RSVPs for ZelnickDinner_Posh.csv...

Handling RSVPs for Launch_Partiful.csv...

Matching dorde ivanovic to djordje ivanovic
Matching frederico araujo to frederico araújo
No match found for 'j. desire abayo'.


  rsvp_csv[rsvp_date_col] = pd.to_datetime(rsvp_csv[rsvp_date_col], errors="coerce")


Handling RSVPs for BSMNT_Posh.csv...

Could not find person with email: 69@hotmail.com
Could not find person with email: okeynwaishienyi@gmail.com
No match found for ' '.
Could not find person with email: woodbridge.harvard@gmail.com
Handling RSVPs for PreLaunch_Partiful.csv...

Handling RSVPs for Viale_Partiful.csv...

No match found for 'dreese '.
Handling RSVPs for Russell_Partiful.csv...

Handling RSVPs for SecretSip_Partiful.csv...

No match found for 'giovanni maria d'antonio'.
Handling attendance for Viale_Sheets.csv...

Handling attendance for PreLaunch_Sheets.csv...

Handling attendance for ZelnickSpeaker_Sheets.csv...

Matching ashey chamoy to asher chamoy
No match found for 'anil candilcor'.
Matching matthew herschfeld to matthew hirschfeld
Matching alexander otto to alexandra otto
Matching adriano ariota to adriano arioto
No match found for 'cybele '.
Matching yagnur kavukcuoglu to yagmur kavukcuoglu
Handling attendance for SababaNights_Sheets.csv...

No match found for 'cl

In [485]:
invite_tokens = pd.DataFrame(columns=["id", "event_id", "category", "description"])
for i in range(0, 13):
    invite_tokens.loc[len(invite_tokens)] = [i, i, "personal outreach", "before we began tracking this"]

attendance["invite_token_id"] = attendance["event_id"]

In [486]:
people.to_csv("final/people.csv", index=False)
attendance.to_csv("final/attendance.csv")
invite_tokens.to_csv("final/invite_tokens.csv", index=False)

In [487]:
sorted(attendance['event_id'].unique())

[np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6),
 np.int64(7),
 np.int64(8),
 np.int64(9),
 np.int64(10),
 np.int64(11),
 np.int64(12)]

In [488]:
total = len(indices_to_handle_attendance) + len(indices_to_handle_rsvp)
# todo, drop duplicates, then add to df
new = indices_to_handle_rsvp + indices_to_handle_attendance + indices_to_handle_no_rsvp
total2 = len(new)

print(total)
print(total2)

df_additional_names = pd.DataFrame(columns = ["first_name", "last_name"], data=new)
df_additional_names["first_name"] = df_additional_names["first_name"].str.strip().str.lower()
df_additional_names["last_name"] = df_additional_names["last_name"].str.strip().str.lower()
df_additional_names = df_additional_names.drop_duplicates()
df_additional_names.to_csv("to_add_people.csv", index=False)
df_additional_names

41
41


Unnamed: 0,first_name,last_name
0,,
1,courtney,
2,dreese,
3,e,
4,eliska,
5,fernanda,
6,francisco,
7,gabe,evers
8,johnson,
9,kami,


# Add Details To Missing People

In [None]:
import pandas as pd
import os

# ===== Load the unmatched names file =====
no_match = pd.read_csv("camel_unmatched_names.csv")

# Split names into first and last
no_match["email"] = [[] for _ in range(len(no_match))]
no_match["school"] = [[] for _ in range(len(no_match))]
no_match["grade"] = [[] for _ in range(len(no_match))]
no_match["j/n"] = [[] for _ in range(len(no_match))]

# ===== Step 1: Concat all Raw CSV files into one DataFrame =====
all_raw = []
for file in os.listdir("Raw"):
    if file.endswith(".csv"):
        raw_df = pd.read_csv(f"Raw/{file}")

        # Split 'name' column into first and last names
        if "name" in raw_df.columns:
            raw_df[['first_name', 'last_name']] = pd.NA
            raw_df[['first_name', 'last_name']] = raw_df['name'].str.split(' ', n=1, expand=True)
        
        # Append to the list
        all_raw.append(raw_df)

# Combine all raw data into one DataFrame
df_all_raw = pd.concat(all_raw, ignore_index=True)
print(f"\n📊 Combined {len(all_raw)} raw files into one DataFrame with {len(df_all_raw)} total rows.")

# Check available columns
search_email = "email" in df_all_raw.columns
search_school = "school" in df_all_raw.columns
search_grade = "grade" in df_all_raw.columns
search_jn = "j/n" in df_all_raw.columns

# ===== Step 2: Perform matching and information gathering =====

# For each unmatched name:
for idx_nm, row_nm in no_match.iterrows():
    fn_nm = str(row_nm["first_name"]).strip()
    ln_nm = str(row_nm["last_name"]).strip()

    # Search through all concatenated raw data
    for idx_r, row_r in df_all_raw.iterrows():
        fn_r = str(row_r["first_name"]).strip() if pd.notna(row_r["first_name"]) else ""
        ln_r = str(row_r["last_name"]).strip() if pd.notna(row_r["last_name"]) else ""

        code = compare_names(fn_nm, ln_nm, fn_r, ln_r, fuzzy_threshold=0.80)

        if code in ["auto_accept", "manual_review"]:  # Only consider valid matches
            if code == "manual_review":
                val = input(f"""
                Manual review required for '{fn_nm} {ln_nm}'.\n 
                Possible match: {fn_r} {ln_r}\n
                y/n""").strip().lower()
                if val not in ["yes", "y"]:
                    continue
            
            # Update the unmatched name with the best match
            if len(row_nm["first_name"].strip()) < len(row_r["first_name"].strip()):
                no_match.at[idx_nm, "first_name"] = row_r["first_name"]
            if len(row_nm["last_name"].strip()) < len(row_r["last_name"].strip()):
                no_match.at[idx_nm, "last_name"] = row_r["last_name"]

            # For each piece of info, append if valid and not already in list
            if search_email:
                email = row_r["email"] if pd.notna(row_r["email"]) and str(row_r["email"]).strip().lower() else None
                if email and email not in no_match.at[idx_nm, "email"]:
                    no_match.at[idx_nm, "email"].append(email)
            
            if search_school:
                school = row_r["school"] if pd.notna(row_r["school"]) and str(row_r["school"]).strip().lower() else None
                if school and school not in no_match.at[idx_nm, "school"]:
                    no_match.at[idx_nm, "school"].append(school)

            if search_grade:
                grade = row_r["grade"] if pd.notna(row_r["grade"]) and str(row_r["grade"]).strip().lower() else None
                if grade and grade not in no_match.at[idx_nm, "grade"]:
                    no_match.at[idx_nm, "grade"].append(grade)

            if search_jn:
                jn = row_r["j/n"] if pd.notna(row_r["j/n"]) and str(row_r["j/n"]).strip().lower() else None
                if jn and jn not in no_match.at[idx_nm, "j/n"]:
                    no_match.at[idx_nm, "j/n"].append(jn)

# ===== Step 3: Final formatting to make lists readable (join them) =====

for col in ["email", "school", "grade", "j/n"]:
    no_match[col] = no_match[col].apply(lambda x: ', '.join(map(str, x)) if x else pd.NA)

# ===== Step 4: Save the enhanced unmatched names file =====
no_match.to_csv("camel_unmatched_names_with_info.csv", index=False)

print("\n🎉 Final enriched unmatched names saved to 'camel_unmatched_names_with_info.csv'.")


📊 Combined 22 raw files into one DataFrame with 3031 total rows.
Matching lindsey brail to lindsay brail
Matching ana tejeda to ana tejada
Matching giovanni d’antonio to giovanni m d’antonio
Matching giovanni d’antonio to giovanni m d’antonio
Matching giovanni d’antonio to giovanni m d’antonio
Matching giovanni d’antonio to giovanni m d’antonio
Matching giovanni d’antonio to giovanni m d’antonio
Matching giovanni d’antonio to giovanni m d’antonio
Matching giovanni d’antonio to giovanni m d’antonio
Matching hyunwoo kim to hyunwo kim
Matching talia gershon to tali gershon
Matching yagmur kavukcuoglu to yagnur kavukcuoglu
Matching isabella raskin to isabela raskin
Matching asher chamoy to ashey chamoy
Matching harley pasternak to harley pasternack
Matching harley pasternak to harley pasternack
Matching matthew hirschfeld to matthew herschfeld
Matching adriano arioto to adriano ariota
Matching alexandra otto to alexander otto
Matching joshua yang to josh yang
Matching giovanni d'antonio t

# Remove Duplicates

In [None]:
import pandas as pd

# Load enhanced unmatched names file
df = pd.read_csv("camel_unmatched_names_with_info.csv")

# Ensure lists for merging
for col in ["email", "school", "grade", "j/n"]:
    df[col] = df[col].fillna('').apply(lambda x: [item.strip() for item in x.split(",") if item.strip()] if x else [])

# Track indices to remove (duplicates)
rows_to_remove = set()

# Iterate over each pair of rows (avoiding redundant pairs and self-pair)
for idx_a in range(len(df)):
    if idx_a in rows_to_remove:
        continue  # skip rows already marked for deletion

    fn_a = str(df.at[idx_a, "first_name"]).strip()
    ln_a = str(df.at[idx_a, "last_name"]).strip()

    for idx_b in range(idx_a + 1, len(df)):
        if idx_b in rows_to_remove:
            continue  # skip if already marked

        fn_b = str(df.at[idx_b, "first_name"]).strip()
        ln_b = str(df.at[idx_b, "last_name"]).strip()

        # Use compare_names function with a good threshold
        code = compare_names(fn_a, ln_a, fn_b, ln_b, fuzzy_threshold=0.80)

        if code in ["auto_accept", "manual_review"]:
            # take the longer first name
            if len(fn_a) < len(fn_b):
                fn_a = fn_b
            if len(ln_a) < len(ln_b):
                ln_a = ln_b

            # set the first name and last name to the longer one
            df.at[idx_a, "first_name"] = fn_a
            df.at[idx_a, "last_name"] = ln_a

            print(f"🔗 Found duplicate candidates:\n  1) {fn_a} {ln_a}\n  2) {fn_b} {ln_b}\n -> Merging and keeping first.")

            # Merge each column's lists
            for col in ["email", "school", "grade", "j/n"]:
                # Union of both lists, avoiding empty strings
                combined = set(df.at[idx_a, col]) | set(df.at[idx_b, col])
                combined = {x for x in combined if x.strip()}
                df.at[idx_a, col] = list(combined)

            

            # Mark idx_b for removal (since merged into idx_a)
            rows_to_remove.add(idx_b)

# After merging, convert lists back to comma-separated strings
for col in ["email", "school", "grade", "j/n"]:
    df[col] = df[col].apply(lambda x: ', '.join(sorted(set(x))) if isinstance(x, list) else pd.NA)

# Drop duplicate rows
df_cleaned = df.drop(index=rows_to_remove).reset_index(drop=True)

# Save cleaned version
df_cleaned.to_csv("camel_unmatched_names_deduplicated.csv", index=False)
print(f"\n✅ Deduplicated file saved as 'camel_unmatched_names_deduplicated.csv'. {len(rows_to_remove)} duplicates removed.")

🔗 Found duplicate candidates:
  1) nan nan
  2) nan nan
 -> Merging and keeping first.
🔗 Found duplicate candidates:
  1) nan nan
  2) nan nan
 -> Merging and keeping first.
🔗 Found duplicate candidates:
  1) nan nan
  2) nan nan
 -> Merging and keeping first.
🔗 Found duplicate candidates:
  1) nan nan
  2) nan nan
 -> Merging and keeping first.
🔗 Found duplicate candidates:
  1) nan nan
  2) nan nan
 -> Merging and keeping first.
🔗 Found duplicate candidates:
  1) nan nan
  2) nan nan
 -> Merging and keeping first.
🔗 Found duplicate candidates:
  1) nan nan
  2) nan nan
 -> Merging and keeping first.
Matching ana tejeda to ana tejada
🔗 Found duplicate candidates:
  1) ana tejeda
  2) ana tejada
 -> Merging and keeping first.
🔗 Found duplicate candidates:
  1) cami kotouc
  2) cami Kotouc
 -> Merging and keeping first.
🔗 Found duplicate candidates:
  1) christina Lee
  2) christina lee
 -> Merging and keeping first.
🔗 Found duplicate candidates:
  1) connor Kim
  2) connor kim
 -> Merg

In [None]:
import pandas as pd

# Load cleaned and deduplicated DataFrame
df_cleaned = pd.read_csv("camel_unmatched_names_deduplicated.csv")

# Initialize new columns for split emails
df_cleaned['personal_email'] = pd.NA
df_cleaned['school_email'] = pd.NA

# Function to process and interactively resolve emails with all context in input()
def process_emails_interactive(email_str, index, row):
    if pd.isna(email_str) or not str(email_str).strip():
        return pd.NA, pd.NA

    emails = [e.strip().lower() for e in str(email_str).split(",") if e.strip()]
    emails = list(set(emails))  # Deduplicate
    personal_emails = [e for e in emails if not e.endswith('.edu')]
    school_emails = [e for e in emails if e.endswith('.edu')]

    # Handle personal emails
    if len(personal_emails) > 1:
        selected_personal = input(
            f"\n⚠️ Multiple personal emails for '{row['first_name']} {row['last_name']}' (Row {index}): {personal_emails}\n"
            f"All emails: {emails}\n"
            f"School: {row['school']} | Grade: {row['grade']} | J/N: {row['j/n']}\n"
            "Select one personal email (or type full preferred email): "
        ).strip()
        personal_emails = [selected_personal] if selected_personal else [personal_emails[0]]
    elif len(personal_emails) == 1:
        selected_personal = personal_emails[0]
    else:
        selected_personal = pd.NA

    # Handle school emails
    if len(school_emails) > 1:
        selected_school = input(
            f"\n⚠️ Multiple school emails for '{row['first_name']} {row['last_name']}' (Row {index}): {school_emails}\n"
            f"All emails: {emails}\n"
            f"School: {row['school']} | Grade: {row['grade']} | J/N: {row['j/n']}\n"
            "Select one school email (or type full preferred email): "
        ).strip()
        school_emails = [selected_school] if selected_school else [school_emails[0]]
    elif len(school_emails) == 1:
        selected_school = school_emails[0]
    else:
        selected_school = pd.NA

    return selected_personal, selected_school

# Function to resolve other fields interactively with all context in input()
def resolve_field_interactive(value_str, column_name, index, row):
    if pd.isna(value_str) or not str(value_str).strip():
        return pd.NA
    values = [v.strip() for v in str(value_str).split(",") if v.strip()]
    if column_name == "school":
        if len(values) > 1 and "Other" in values or "other" in values:
            values = [v for v in values if v.lower() != "other"]
    if len(values) > 1:
        selected_value = input(
            f"\n⚠️ Multiple values for '{column_name}' in '{row['first_name']} {row['last_name']}' (Row {index}): {values}\n"
            f"Emails: {row['email']} | School: {row['school']} | Grade: {row['grade']} | J/N: {row['j/n']}\n"
            f"Select one for '{column_name}' (or type correct value): "
        ).strip()
        return selected_value if selected_value else values[0]
    else:
        return values[0] if values else pd.NA

# ===== Process each row with full-context interactive resolution =====
for idx, row in df_cleaned.iterrows():
    # Resolve email fields with context in input()
    personal_email, school_email = process_emails_interactive(row['email'], idx, row)
    df_cleaned.at[idx, 'personal_email'] = personal_email
    df_cleaned.at[idx, 'school_email'] = school_email

    # Resolve other fields interactively with context in input()
    df_cleaned.at[idx, 'school'] = resolve_field_interactive(row['school'], 'school', idx, row)
    df_cleaned.at[idx, 'grade'] = resolve_field_interactive(row['grade'], 'grade', idx, row)
    df_cleaned.at[idx, 'j/n'] = resolve_field_interactive(row['j/n'], 'j/n', idx, row)

# ===== Optional: Drop old email column if no longer needed =====
df_cleaned.drop(columns=['email'], inplace=True)

# ===== Save fully resolved dataset =====
df_cleaned.to_csv("camel_unmatched_names_resolved_interactive.csv", index=False)
print("\n✅ Fully resolved dataset saved to 'camel_unmatched_names_resolved_interactive.csv'.")


✅ Fully resolved dataset saved to 'camel_unmatched_names_resolved_interactive.csv'.


In [None]:
df_cleaned.columns = ["first_name", "last_name", "School", "Grade", "J/N", "Emails", "School Email"]
df_cleaned

df_all_people = pd.read_csv("all_people.csv")
t1 = len(df_all_people)

t2 = len(pd.concat([df_all_people, df_cleaned]))

df_all_people = pd.concat([df_all_people, df_cleaned])
df_all_people.drop_duplicates(subset=["first_name", "last_name"], inplace=True)
t3 = len(df_all_people)
df_all_people.to_csv("all_people2.csv", index=False)

print(t1)
print(t2)
print(t3)


705
1283
1251


In [None]:
import pandas as pd

# Load enhanced unmatched names file
df = pd.read_csv("all_people2.csv")

# Ensure lists for merging
for col in ["School", "Grade", "J/N", "Emails", "School Email"]:
    df[col] = df[col].fillna('').apply(lambda x: [item.strip() for item in x.split(",") if item.strip()] if x else [])

# Track indices to remove (duplicates)
rows_to_remove = set()

# Iterate over each pair of rows (avoiding redundant pairs and self-pair)
for idx_a in range(len(df)):
    if idx_a in rows_to_remove:
        continue  # skip rows already marked for deletion

    fn_a = str(df.at[idx_a, "first_name"]).strip()
    ln_a = str(df.at[idx_a, "last_name"]).strip()

    for idx_b in range(idx_a + 1, len(df)):
        if idx_b in rows_to_remove:
            continue  # skip if already marked

        fn_b = str(df.at[idx_b, "first_name"]).strip()
        ln_b = str(df.at[idx_b, "last_name"]).strip()

        # Use compare_names function with a good threshold
        code = compare_names(fn_a, ln_a, fn_b, ln_b, fuzzy_threshold=0.80)

        if code in ["auto_accept", "manual_review"]:
            # take the longer first name
            if len(fn_a) < len(fn_b):
                fn_a = fn_b
            if len(ln_a) < len(ln_b):
                ln_a = ln_b

            # set the first name and last name to the longer one
            df.at[idx_a, "first_name"] = fn_a
            df.at[idx_a, "last_name"] = ln_a

            print(f"🔗 Found duplicate candidates:\n  1) {fn_a} {ln_a}\n  2) {fn_b} {ln_b}\n -> Merging and keeping first.")

            # Merge each column's lists
            for col in ["School", "Grade", "J/N", "Emails", "School Email"]:
                # Union of both lists, avoiding empty strings
                combined = set(df.at[idx_a, col]) | set(df.at[idx_b, col])
                combined = {x for x in combined if x.strip()}
                df.at[idx_a, col] = list(combined)

            

            # Mark idx_b for removal (since merged into idx_a)
            rows_to_remove.add(idx_b)

# After merging, convert lists back to comma-separated strings
for col in ["School", "Grade", "J/N", "Emails", "School Email"]:
    df[col] = df[col].apply(lambda x: ', '.join(sorted(set(x))) if isinstance(x, list) else pd.NA)

# Drop duplicate rows
df_cleaned = df.drop(index=rows_to_remove).reset_index(drop=True)

🔗 Found duplicate candidates:
  1) Rowan gupta
  2) rowan gupta
 -> Merging and keeping first.


In [None]:
# TODO
import pandas as pd
filled = pd.read_csv("manual_filled.csv")
for index, row in filled.iterrows():
    # set person demographics = to filled in demographics unless None, NA, or ""
    # if there is a contact value, check if it exists in our db and that it is linked to the current person
    # if it exists but is linked to a different person, print the person id, the contact id, and the associated person id
    # if it doesn't exist, create a new contact and link it to the current person

In [16]:
import pandas as pd

people = pd.read_csv("final/people.csv")
contacts = pd.read_csv("final/contacts.csv")

# contacts onto to people m:1
# create one row per person with contacts as a list of the contact_value column values
contacts_grouped = contacts[contacts['contact_type'] == "school email"].groupby("person_id")["contact_value"].first().reset_index()
people = people.merge(contacts_grouped, left_on="id", right_on="person_id", how="left")
people.rename(columns={"contact_value": "contacts"}, inplace=True)
people.drop(columns=["person_id", "preferred_name"], inplace=True)

# find any null values unless school = Other, then class year can be null and school email can be null
to_fill = people[people["school"] != "Other"]
to_fill = to_fill[to_fill.isna().any(axis=1)]
to_fill_other = people[people["school"] == "Other"]
to_fill_other = to_fill_other[to_fill_other[["first_name", "last_name", "gender", "is_jewish", "school"]].isna().any(axis=1)]

to_fill = pd.concat([to_fill, to_fill_other])
to_fill.drop_duplicates(inplace=True)

to_fill["class_year"] = to_fill["class_year"].apply(lambda x: pd.NA if pd.isna(x) else int(x))
to_fill.fillna("", inplace=True)

In [7]:
to_fill.to_csv("to_fill.csv", index=False)

In [17]:
to_fill

Unnamed: 0,id,first_name,last_name,gender,class_year,is_jewish,school,contacts
19,19,josie,whelan,F,2028,,Harvard,Josiewhelan@college.harvard.edu
22,22,christian,armaly,M,2028,,Harvard,christianarmaly@college.harvard.edu
27,27,alison,chan,F,2025,N,Harvard,
42,42,samuel,coopersmith,M,2028,N,Harvard,
46,46,eli,solomon,M,2028,J,Harvard,
...,...,...,...,...,...,...,...,...
1161,1163,danielle,frankel,,2026,,Other,
1163,1165,devorah,feder,,2026,,Other,
1183,1188,naveh,talmon chvaicer,,,J,Other,
1184,1189,liz,vermeulen,,2026,N,Other,


# Analysis

In [22]:
import pandas as pd

# add is first event
import pandas as pd

# Load RSVPs CSV
attendance = pd.read_csv('final/attendance.csv').iloc[:, 1:]
attendance["approved"] = attendance["approved"].astype(bool)
# Load Events CSV
events = pd.read_csv('final/events.csv')

# Add is_first_event column to attendance
attendance = attendance.merge(events[['id', 'start_datetime']], left_on='event_id', right_on='id', how='left')

# Drop the duplicate 'id' column from events table
attendance.drop(columns=['id'], inplace=True)

# Convert start_datetime to datetime type
attendance['start_datetime'] = pd.to_datetime(attendance['start_datetime'])

# Sort RSVPs by person_id and event start time
attendance_sorted = attendance.sort_values(by=['person_id', 'start_datetime'])
first_event_times = attendance_sorted[attendance_sorted['checked_in'] == True]
# For each person_id, get the earliest start_datetime (unique pair)
first_event_times = first_event_times.groupby('person_id', as_index=False)['start_datetime'].min()


attendance_sorted['key'] = attendance_sorted['person_id'].astype(str) + attendance_sorted['start_datetime'].astype(str)
first_event_times['key'] = first_event_times['person_id'].astype(str) + first_event_times['start_datetime'].astype(str)
# Set is_first_event = True if this is the earliest event, else False
attendance_sorted['is_first_event'] = attendance_sorted['key'].isin(first_event_times['key'])
attendance_sorted.drop(columns=['key'], inplace=True)
attendance_sorted.sort_values(by=['start_datetime'], inplace=True)
attendance_sorted.drop(columns=['start_datetime'], inplace=True)

attendance = attendance_sorted

events = pd.read_csv("final/events.csv")
people = pd.read_csv("final/people.csv")

attendance = attendance.merge(people, left_on="person_id", right_on="id", how="left")

# Standardize data
attendance["is_jewish"] = attendance["is_jewish"].apply(lambda x: x if pd.isna(x) else x.upper())
attendance["gender"] = attendance["gender"].apply(lambda x: x if pd.isna(x) else x.upper())

# Aggregate RSVP, Approved, Checked-in counts by event
agg_attendance = attendance.groupby("event_id").agg({
    "rsvp": "sum",
    "approved": "sum",
    "checked_in": "sum",
    "is_first_event": "sum"
}).reset_index()

attendance['is_jewish'] = attendance['is_jewish'].fillna("N/A")
attendance['gender'] = attendance['gender'].fillna("N/A")
attendance['class_year'] = attendance['class_year'].fillna(1000)
# Calculate Jewish status percentages by event
jewish_counts = attendance.pivot_table(index="event_id", columns="is_jewish", aggfunc="size")
jewish_percentages = jewish_counts.div(jewish_counts.sum(axis=1), axis=0).reset_index()

# Calculate Gender percentages by event
gender_counts = attendance.pivot_table(index="event_id", columns="gender", aggfunc="size")
gender_percentages = gender_counts.div(gender_counts.sum(axis=1), axis=0).reset_index()

# Calculate Class year percentages by event
class_year_counts = attendance.pivot_table(index="event_id", columns="class_year", aggfunc="size")
class_year_percentages = class_year_counts.div(class_year_counts.sum(axis=1), axis=0).reset_index()

# Combine all summaries
summary = agg_attendance \
    .merge(jewish_percentages, on="event_id", how="left", suffixes=(None, '_jewish')) \
    .merge(gender_percentages, on="event_id", how="left", suffixes=(None, '_gender')) \
    .merge(class_year_percentages, on="event_id", how="left", suffixes=(None, '_class_year'))

# Merge with event details (assuming you need event code)
final_summary = summary.merge(events[["id", "category", "description"]], left_on="event_id", right_on="id", how="left")

# Drop redundant 'id' column after merge
final_summary = final_summary.drop(columns=['id'])


final_summary["J"] = final_summary["J"].apply(lambda x: f"{x:.2%}" if pd.notna(x) else pd.NA)
final_summary["N"] = final_summary["N"].apply(lambda x: f"{x:.2%}" if pd.notna(x) else pd.NA)
final_summary["F"] = final_summary["F"].apply(lambda x: f"{x:.2%}" if pd.notna(x) else pd.NA)
final_summary["M"] = final_summary["M"].apply(lambda x: f"{x:.2%}" if pd.notna(x) else pd.NA)
final_summary[2025.0] = final_summary[2025.0].apply(lambda x: f"{x:.2%}" if pd.notna(x) else pd.NA)
final_summary[2026.0] = final_summary[2026.0].apply(lambda x: f"{x:.2%}" if pd.notna(x) else pd.NA)
final_summary[2027.0] = final_summary[2027.0].apply(lambda x: f"{x:.2%}" if pd.notna(x) else pd.NA)
final_summary[2028.0] = final_summary[2028.0].apply(lambda x: f"{x:.2%}" if pd.notna(x) else pd.NA)
final_summary.drop(columns=["event_id"], inplace=True)
final_summary.drop(columns=["N/A", "N/A_gender", 1000.0], inplace=True)
final_summary = final_summary[["category", "description", "rsvp", "approved", "checked_in", "is_first_event", "J", "N", "F", "M", 2025.0, 2026.0, 2027.0, 2028.0]]
final_summary.to_excel("final_summary.xlsx", index=False)