## Insert New Event

In [1]:
import pandas as pd

In [2]:
# Load the current events table; the new event is appended to this same file.
events = pd.read_csv('final/events.csv')
max_id = events['id'].max()
# Series.max() is NaN when the table has no ids yet, and NaN + 1 would produce a
# NaN event id (NaN is also truthy, so a later "is there an id?" check would not
# catch it). Start numbering at 1 in that case.
new_event_id = 1 if pd.isna(max_id) else int(max_id) + 1
category_valid = False

# Keep prompting until one of the supported categories is entered.
while not category_valid:
    category = input('Enter category (speaker, party, speaker dinner, community dinner): ')
    # Tolerate stray whitespace and capitalisation in the typed category.
    category = category.strip().lower()
    if category in ['speaker', 'party', 'speaker dinner', 'community dinner']:
        category_valid = True
    else:
        print('Invalid category. Please enter a valid category.')

name = input('Enter name: ')

# The prompt collects minutes precision; seconds and a fixed -05:00 offset are
# appended before parsing so the stored value is timezone-aware.
start_datetime = input('Enter start date and time (YYYY-MM-DD HH:MM): ')
start_datetime = start_datetime + ":00-05:00"
start_datetime = pd.to_datetime(start_datetime)

location = input('Enter location: ').lower()

description = input('Enter description: ')

new_event = pd.DataFrame({'id': [new_event_id], 'event_name': [name], 'category': [category], 'location': [location], 'start_datetime': [start_datetime], 'description': [description]})

# Event names act as the uniqueness key: an existing name aborts the insert and
# clears new_event_id so later cells can detect that nothing was added.
if name in events["event_name"].values:
    print('Event already exists.')
    new_event_id = None
else:
    events = pd.concat([events, new_event], ignore_index=True)
    events.to_csv('final/events.csv', index=False)
    print('Event added successfully.')

Event added successfully.


In [20]:
import pandas as pd
from difflib import SequenceMatcher

def fuzzy_ratio(str_a, str_b):
    """Return difflib's similarity ratio (0.0 to 1.0) between two strings."""
    matcher = SequenceMatcher(a=str_a, b=str_b)
    return matcher.ratio()

def is_initial(name):
    """
    Tell whether `name` is just an initial: a single letter, optionally
    followed by a period (e.g. "J" or "j."). Surrounding whitespace is ignored.
    """
    token = name.strip().lower()
    if len(token) == 1:
        return token.isalpha()
    return len(token) == 2 and token[0].isalpha() and token.endswith(".")

def compare_names(fn_ta, ln_ta, fn_sheet, ln_sheet, fuzzy_threshold):
    """
    Classify how well an incoming name matches a name already on the sheet.

    Parameters:
        fn_ta, ln_ta: first / last name of the incoming record (NaN allowed).
        fn_sheet, ln_sheet: first / last name of the stored candidate (NaN allowed).
        fuzzy_threshold: minimum fuzzy_ratio() score counted as a "good" match.

    Returns one of:
        "auto_accept"   - safe to match without asking.
        "manual_review" - plausible match that a human should confirm.
        "reject_now"    - not a match.

    All comparisons are made on stripped, lowercased text so that "ryan" and
    "Ryan " are treated as the same name. Missing values count as empty strings.
    """
    def _as_text(value):
        return str(value) if pd.notna(value) else ""

    # Keep the text as given for the log messages below.
    shown_fn_ta, shown_ln_ta = _as_text(fn_ta), _as_text(ln_ta)
    shown_fn_sheet, shown_ln_sheet = _as_text(fn_sheet), _as_text(ln_sheet)

    # Normalise case and whitespace once, so every step below compares like with like.
    fn_ta = shown_fn_ta.strip().lower()
    ln_ta = shown_ln_ta.strip().lower()
    fn_sheet = shown_fn_sheet.strip().lower()
    ln_sheet = shown_ln_sheet.strip().lower()

    exact_first = (fn_ta == fn_sheet)
    exact_last = (ln_ta == ln_sheet)

    # ===== STEP 1: exact match, or one name exact and the other a substring =====
    # Equality implies containment, so the fully-exact case is covered here too.
    # An empty name is a substring of anything, so a missing name on one side
    # still passes this step when the other name is exact.
    first_contained = fn_ta in fn_sheet or fn_sheet in fn_ta
    last_contained = ln_ta in ln_sheet or ln_sheet in ln_ta
    if (exact_last and first_contained) or (exact_first and last_contained):
        return "auto_accept"

    # ===== STEP 2: Handle initials only when necessary =====
    fn_ta_is_initial = is_initial(fn_ta)
    ln_ta_is_initial = is_initial(ln_ta)

    if fn_ta_is_initial or ln_ta_is_initial:
        # An initial must at least agree with the first letter of the sheet name.
        if fn_ta_is_initial and not fn_sheet.startswith(fn_ta[0]):
            return "reject_now"
        if ln_ta_is_initial and not ln_sheet.startswith(ln_ta[0]):
            return "reject_now"

        # If only one of them is an initial, require a good fuzzy match on the other.
        if fn_ta_is_initial and not ln_ta_is_initial:
            if fuzzy_ratio(ln_ta, ln_sheet) >= fuzzy_threshold:
                return "manual_review"
            return "reject_now"
        if ln_ta_is_initial and not fn_ta_is_initial:
            if fuzzy_ratio(fn_ta, fn_sheet) >= fuzzy_threshold:
                return "manual_review"
            return "reject_now"

        # Both are initials that passed the first-letter checks but are not exact.
        return "manual_review"

    # ===== STEP 3: Fuzzy logic =====
    ratio_first = fuzzy_ratio(fn_ta, fn_sheet)
    ratio_last = fuzzy_ratio(ln_ta, ln_sheet)

    # One name exact and the other a good fuzzy match: accept, but log the pairing.
    if (exact_first and ratio_last >= fuzzy_threshold) or (exact_last and ratio_first >= fuzzy_threshold):
        print(f"Matching {shown_fn_ta} {shown_ln_ta} to {shown_fn_sheet} {shown_ln_sheet}")
        return "auto_accept"

    # Both are good fuzzy matches (but neither is exact): let a human decide.
    if ratio_first >= fuzzy_threshold and ratio_last >= fuzzy_threshold:
        return "manual_review"
    # Otherwise, reject
    return "reject_now"

In [21]:
def update_names_if_substring(df, idx, sheet_first, sheet_last, input_first, input_last):
    """
    When the stored name and the incoming name contain one another
    (case-insensitively), write the longer spelling into df at row idx.

    First names: a missing/empty value on either side is treated as "".
    Last names: only considered when both values are present.
    The frame is modified in place; nothing is returned.
    """
    def _one_contains_other(left, right):
        left_lc, right_lc = left.lower(), right.lower()
        return left_lc in right_lc or right_lc in left_lc

    # First name: blank out anything missing, then compare.
    first_on_sheet = "" if (pd.isna(sheet_first) or not sheet_first) else sheet_first
    first_from_input = "" if (pd.isna(input_first) or not input_first) else input_first
    if _one_contains_other(first_on_sheet, first_from_input):
        # On equal length max() keeps the first argument, i.e. the sheet spelling.
        df.at[idx, 'first_name'] = max(first_on_sheet, first_from_input, key=len)

    # Last name: skipped entirely unless both sides are present.
    if pd.isna(sheet_last) or pd.isna(input_last):
        return
    if _one_contains_other(sheet_last, input_last):
        df.at[idx, 'last_name'] = max(sheet_last, input_last, key=len)

In [22]:
def _prompt_for_candidate(header, people_df, candidate_indices):
    """
    Show a numbered list of candidate people and ask the user to pick one.

    Returns the chosen index label from candidate_indices, or None when the
    user enters 'n' or an invalid / out-of-range choice.
    """
    def _describe(position, label):
        person = people_df.loc[label]
        return (
            f"{position} => {person['first_name']} {person['last_name']} "
            f"(gender={person['gender']}, jewish={person['is_jewish']})"
        )

    options_str = "\n".join(
        _describe(position, label) for position, label in enumerate(candidate_indices)
    )
    choice = input(f"{header}\n\n{options_str}\n\nSelect index or 'n' to skip: ")
    if choice.strip().lower() == "n":
        return None
    try:
        position = int(choice)
        # Negative numbers would silently index from the end of the list.
        if position < 0:
            raise IndexError(position)
        return candidate_indices[position]
    except (ValueError, IndexError):
        print("Invalid choice. Skipping.")
        return None


def find_person_id(
    row,
    people_df,
    contacts_df,
    email_col=None,
    phone_col=None,
    handle_indices_list=None,
    fuzzy_threshold=0.80,  # Passed to compare_names
):
    """
    Resolve an incoming row to an existing person id.

    Matching is attempted in this order, stopping at the first success:
      1. email  (row[email_col] against contacts_df["contact_value"])
      2. phone  (row[phone_col] against contacts_df["contact_value"])
      3. exact first/last name (case-insensitive; last name optional)
      4. compare_names() verdicts: a single auto-accept wins, several
         auto-accepts or any manual-review candidates are shown to the user.

    Parameters:
        row: mapping (dict or Series) with "first_name" and optionally
            "last_name", plus the email/phone columns named below.
        people_df: people table with id, first_name, last_name, gender, is_jewish.
        contacts_df: contacts table with person_id and contact_value.
        email_col, phone_col: keys in `row` holding email / phone; None skips that step.
        handle_indices_list: list that collects (first_name, last_name) tuples
            for rows left unmatched by name; a throwaway list is used when None.
        fuzzy_threshold: forwarded to compare_names.

    Returns:
        (person_id, people_df) on a match, otherwise (None, people_df).
        On a match the stored names may be lengthened in place via
        update_names_if_substring.
    """
    if handle_indices_list is None:
        handle_indices_list = []

    def _clean(value):
        # Missing or blank values become None; everything else is stripped and lowercased.
        if value is None or pd.isna(value):
            return None
        text = str(value).strip().lower()
        return text or None

    first_name = _clean(row["first_name"])
    last_name = _clean(row.get("last_name"))

    def _accept(idx):
        update_names_if_substring(
            people_df, idx,
            people_df.loc[idx, "first_name"], people_df.loc[idx, "last_name"],
            first_name, last_name,
        )
        return people_df.loc[idx, "id"], people_df

    def _skip():
        handle_indices_list.append((first_name, last_name))
        return None, people_df

    def _match_contact(value, label):
        # Contacts are compared stripped and lowercased, same as the cleaned input.
        known_values = contacts_df["contact_value"].str.strip().str.lower()
        hits = contacts_df[known_values == value]
        if hits.empty:
            print(f"Could not find person with {label}: {value}")
            return None
        person_id = hits.iloc[0]["person_id"]
        idx = people_df[people_df["id"] == person_id].index[0]  # map to index
        update_names_if_substring(
            people_df, idx,
            people_df.loc[idx, "first_name"], people_df.loc[idx, "last_name"],
            first_name, last_name,
        )
        return person_id

    # 1) Email matching (skipped when no usable email is present)
    email = _clean(row[email_col]) if email_col and email_col in row else None
    if email is not None:
        person_id = _match_contact(email, "email")
        if person_id is not None:
            return person_id, people_df

    # 1.2) Phone matching (skipped when no usable phone is present)
    phone = _clean(row[phone_col]) if phone_col and phone_col in row else None
    if phone is not None:
        person_id = _match_contact(phone, "phone")
        if person_id is not None:
            return person_id, people_df

    # Name-based matching is impossible without a first name.
    if not first_name:
        return None, people_df

    # 2) Exact name matching (last name only constrains when one was given)
    name_mask = people_df["first_name"].str.lower() == first_name
    if last_name:
        name_mask = name_mask & (people_df["last_name"].str.lower() == last_name)
    potentials = people_df[name_mask]

    if len(potentials) == 1:
        return _accept(potentials.index[0])

    if len(potentials) > 1:
        chosen = _prompt_for_candidate(
            f"Multiple exact matches for '{first_name} {last_name or ''} and email {email}'. Choose one:",
            people_df,
            list(potentials.index),
        )
        return _skip() if chosen is None else _accept(chosen)

    # 3) Fuzzy matching against every stored person
    auto_accepts, manual_reviews = [], []
    for idx, candidate in people_df.iterrows():
        verdict = compare_names(
            first_name,
            (last_name or ""),
            # Stored names are normalised the same way as the incoming ones,
            # otherwise "ryan" vs "Ryan" would be scored as a mismatch.
            _clean(candidate["first_name"]) or "",
            _clean(candidate["last_name"]) or "",
            fuzzy_threshold,
        )
        if verdict == "auto_accept":
            auto_accepts.append(idx)
        elif verdict == "manual_review":
            manual_reviews.append(idx)

    # Handle auto-accept
    if len(auto_accepts) == 1:
        return _accept(auto_accepts[0])

    if len(auto_accepts) > 1:
        chosen = _prompt_for_candidate(
            f"Multiple 'auto_accept' matches for '{first_name} {last_name or ''} and email {email}'. Choose one:",
            people_df,
            auto_accepts,
        )
        return _skip() if chosen is None else _accept(chosen)

    # Manual review
    if manual_reviews:
        chosen = _prompt_for_candidate(
            f"No auto-accept found for '{first_name} {last_name or ''}', but possible matches:",
            people_df,
            manual_reviews,
        )
        return _skip() if chosen is None else _accept(chosen)

    # No matches
    print(f"No match found for '{first_name} {last_name or ''}'.")
    return _skip()

In [23]:
# -------------------------------------------------------
# Normalize Columns
# -------------------------------------------------------
import re
# === Normalization helpers ===
def normalize_gender(val):
    """Map a free-text gender value to "F" or "M"; anything else (or missing) becomes pd.NA."""
    if pd.isna(val):
        return pd.NA
    token = str(val).strip().lower()
    aliases_by_code = (
        ("F", ("f", "female", "woman", "girl")),
        ("M", ("m", "male", "man", "boy")),
    )
    for code, aliases in aliases_by_code:
        if token in aliases:
            return code
    return pd.NA

def normalize_school(explicit_school, email):
    """
    Reduce a school answer to "Harvard", "MIT" or pd.NA.

    The explicit answer is preferred; if it names neither school, the email
    domain is tried (@college.harvard.edu / @harvard.edu / @mit.edu).
    Missing values (None, NaN, pd.NA) are accepted for both arguments.
    """
    def pick_from_text(txt):
        if txt is None or pd.isna(txt):
            return None
        t = str(txt).strip().lower()
        if not t:
            return None
        # common variants people typed
        if "harvard" in t and "business" not in t:
            return "Harvard"
        # "mit" must stand alone as a word: a plain substring test also fires on
        # "smith", "summit", "admitted", ... Digits may touch it ("mit2027").
        if "massachusetts institute of technology" in t or re.search(r"(?<![a-z])mit(?![a-z])", t):
            return "MIT"
        return None

    # 1) from explicit column
    choice = pick_from_text(explicit_school)
    if choice:
        return choice

    # 2) email fallback
    e = "" if (email is None or pd.isna(email)) else str(email).strip().lower()
    if e.endswith("@college.harvard.edu") or e.endswith("@harvard.edu"):
        return "Harvard"
    if e.endswith("@mit.edu"):
        return "MIT"

    return pd.NA

# Grade label -> graduation year, fixed for the 2025–26 academic year (freshman = 2029).
GRADE_TO_YEAR = {
    **dict.fromkeys(("freshman", "first", "first year", "1", "1st"), 2029),
    **dict.fromkeys(("sophomore", "second", "2", "2nd"), 2028),
    **dict.fromkeys(("junior", "third", "3", "3rd"), 2027),
    **dict.fromkeys(("senior", "fourth", "4", "4th"), 2026),
}

def parse_class_year(val):
    """
    Turn a free-text class-year answer into an int graduation year, or pd.NA.

    Tried in order:
      1. a four-digit 20xx year anywhere in the text ("Class of 2029");
      2. an apostrophe short year ("’27" / "'27"), read as 20xx;
      3. a grade word or ordinal ("Freshman", "3rd", "First year") looked up
         in GRADE_TO_YEAR, first as the whole text and then word by word.
    """
    if pd.isna(val):
        return pd.NA
    text = str(val).strip()

    # 1) explicit four-digit year
    four_digit = re.search(r"(20\d{2})", text)
    if four_digit is not None:
        return int(four_digit.group(1))

    # 2) apostrophe + two digits, always interpreted as 20xx
    short_year = re.search(r"[’'](\d{2})", text)
    if short_year is not None:
        return 2000 + int(short_year.group(1))

    # 3) grade words: drop the word "year" and any leftover ordinal+year fragments
    label = text.lower().replace("year", "").strip()
    for fragment in ("st year", "nd year", "rd year", "th year"):
        label = label.replace(fragment, "")
    label = label.strip()

    if label in GRADE_TO_YEAR:
        return GRADE_TO_YEAR[label]

    # Otherwise look at each word / ordinal on its own and take the first that maps.
    tokens = re.findall(r"[a-z]+|\d+(?:st|nd|rd|th)?", label)
    return next(
        (GRADE_TO_YEAR[token] for token in tokens if GRADE_TO_YEAR.get(token)),
        pd.NA,
    )

In [None]:
# -------------------------------------------------------
# Main logic
# -------------------------------------------------------
# new_event_id is produced by the "Insert New Event" cell, which sets it to None
# when the entered event name already exists. Stop here in that case instead of
# importing attendance for an event that was never created.
if not new_event_id:
    raise ValueError("No new event ID was generated. Please add an event first.")

# Example inputs
# Column names below must match the header of the raw export read further down.

# Order-status column, and the status values that count as an approved RSVP.
approved_column = "Order Status"
rsvp_approved_values = ["Completed"]

# Per-row identity, contact and attendance columns.
rsvp_datetime_column = "Order Date/Time"
first_name_column = "First Name"
last_name_column = "Last Name"
email_column = "Email"
school_email_column = "What is your school email?"  # separate school-address question
phone_column = "Phone Number"
attendance_column = "Tickets Scanned"
invite_token_column = "Tracking Link"

# Raw (un-normalised) demographic columns; normalised copies are derived later.
gender_column_raw = "Detected Gender"
school_column_raw = "What school do you go to?"
year_column_raw = "What is your Class Year"

# Raw export for the event being imported.
df_current = pd.read_csv("Raw/NetflixCFO.csv")  # Replace with your real file

# Updated school normalization function that prioritizes school email
# Domains that identify Harvard College (matched exactly, so graduate
# sub-domains such as hms.harvard.edu are not swallowed).
_HARVARD_COLLEGE_DOMAINS = ("harvard.edu", "college.harvard.edu")
# Harvard graduate / professional domains, reported as "Other".
_HARVARD_OTHER_DOMAINS = ("hbs.edu", "hms.harvard.edu", "hsph.harvard.edu", "fas.harvard.edu")
# Additionally treated as "Other" when it appears in the school-email column.
_SCHOOL_EMAIL_ONLY_OTHER_DOMAINS = ("hillel.harvard.edu",)
# Other Boston-area schools, recognised only from the school-email column.
_AREA_SCHOOL_DOMAINS = {
    "bu.edu": "Boston University",
    "northeastern.edu": "Northeastern",
    "tufts.edu": "Tufts",
    "wellesley.edu": "Wellesley",
    "brandeis.edu": "Brandeis",
    "emerson.edu": "Emerson",
    "suffolk.edu": "Suffolk",
    "berklee.edu": "Berklee",
    "simmons.edu": "Simmons",
}


def _email_domain(value):
    """Return the lowercased domain of an email address, or "" if missing/malformed."""
    if value is None or pd.isna(value):
        return ""
    text = str(value).strip().lower()
    if "@" not in text:
        return ""
    return text.rsplit("@", 1)[-1]


def _domain_in(domain, candidates):
    """True when domain equals one of candidates or is a sub-domain of one."""
    return any(domain == known or domain.endswith("." + known) for known in candidates)


def _core_school_from_domain(domain, other_domains):
    """Return "Harvard", "Other" or "MIT" for a recognised domain, else None."""
    if domain in _HARVARD_COLLEGE_DOMAINS:
        return "Harvard"
    if _domain_in(domain, other_domains):
        return "Other"
    if _domain_in(domain, ("mit.edu",)):
        return "MIT"
    return None


def normalize_school_with_email(school_response, general_email, school_email):
    """
    Determine school with priority: school_email > general_email > school_response.

    Only undergraduate institutions count as the actual school; Harvard
    graduate/professional domains return "Other".

    Parameters:
        school_response: free-text answer to the school question (may be missing).
        general_email: the registrant's main email (may be missing).
        school_email: the dedicated school-email answer (may be missing).

    Returns a school label, or whatever normalize_school() returns when
    neither email identifies a school.
    """
    # First try school email (most reliable)
    school_domain = _email_domain(school_email)
    if school_domain:
        school = _core_school_from_domain(
            school_domain, _HARVARD_OTHER_DOMAINS + _SCHOOL_EMAIL_ONLY_OTHER_DOMAINS
        )
        if school:
            return school

        # Other Boston area UNDERGRADUATE schools
        for known_domain, label in _AREA_SCHOOL_DOMAINS.items():
            if _domain_in(school_domain, (known_domain,)):
                return label

        # For other .edu domains, be more conservative: the domain itself must
        # end in .edu, and the label directly before it is used as the name.
        if school_domain.endswith(".edu"):
            school_part = school_domain.split(".")[-2]
            # Only return if it looks like a reasonable school name
            if len(school_part) > 2 and school_part.isalpha():
                return school_part.title()

    # Fallback to general email if school email not available / not decisive.
    # Only Harvard / Harvard-graduate / MIT are inferred from a general address.
    general_domain = _email_domain(general_email)
    if general_domain:
        school = _core_school_from_domain(general_domain, _HARVARD_OTHER_DOMAINS)
        if school:
            return school

    # Finally fallback to the school response field
    return normalize_school(school_response, general_email)

# apply column normalization functions
# The normalised values are stored in helper columns prefixed with "_norm_".
df_current["_norm_gender"] = df_current[gender_column_raw].apply(normalize_gender)
df_current["_norm_school"] = df_current.apply(
    lambda r: normalize_school_with_email(
        r.get(school_column_raw, pd.NA),
        r.get(email_column, ""),
        r.get(school_email_column, pd.NA)  # school email takes priority when present
    ),
    axis=1
)
df_current["_norm_class_year"] = df_current[year_column_raw].apply(parse_class_year)

# Load the persisted tables that this import extends.
attendance_df = pd.read_csv("final/attendance.csv")
contacts_df = pd.read_csv("final/contacts.csv")
invite_tokens_df = pd.read_csv("final/invite_tokens.csv")
events_df = pd.read_csv("final/events.csv")
people_df = pd.read_csv("final/people.csv")

# A tracking link recorded literally as "email" is treated as "no token".
# Guarded on the column existing: exports without the tracking-link column
# must reach the fallback in the invite-token step instead of failing here.
if invite_token_column in df_current.columns:
    df_current[invite_token_column] = df_current[invite_token_column].apply(
        lambda x: pd.NA if isinstance(x, str) and x == "email" else x
    )

# -------------------------------------------------------
# 1) Process invite tokens
#    Builds invite_token_map (token value -> invite token id) for this event
#    and appends rows to invite_tokens_df for token values not seen before.
# -------------------------------------------------------
if invite_token_column in df_current.columns:
    # Replace null tokens with "default"
    df_current[invite_token_column] = df_current[invite_token_column].fillna("default")

    unique_tokens = df_current[invite_token_column].unique()

    current_max_token_id = invite_tokens_df["id"].max() if not invite_tokens_df.empty else 0

    # Start from the tokens already stored for this event. The map is keyed by
    # the token VALUE, because that is what the lookups below and the per-row
    # lookup use; keying by category would never match and every re-run would
    # insert the same tokens again.
    existing_tokens_this_event = invite_tokens_df[invite_tokens_df["event_id"] == new_event_id]
    invite_token_map = {
        str(token_value): token_id
        for token_value, token_id in zip(
            existing_tokens_this_event["value"], existing_tokens_this_event["id"]
        )
    }

    # Now add new tokens that do not exist yet
    new_tokens_list = []
    for token in unique_tokens:
        token_str = str(token)
        if token_str not in invite_token_map:
            current_max_token_id += 1
            new_tokens_list.append({
                "id": current_max_token_id,
                "event_id": new_event_id,
                # "default" stands for rows that arrived without a tracking link.
                "category": "personal outreach" if token_str != "default" else "mailing list",
                "value": token_str,
                "description": ""
            })
            invite_token_map[token_str] = current_max_token_id

    if new_tokens_list:
        invite_tokens_df = pd.concat(
            [invite_tokens_df, pd.DataFrame(new_tokens_list)],
            ignore_index=True
        )

    # After this step, invite_tokens_df has all tokens, and invite_token_map can be used for lookups.
else:
    # If there's no invite token column, define a default
    df_current[invite_token_column] = "default"
    invite_token_map = {"default": 1}  # or some fallback

# -------------------------------------------------------
# 2) Build an index for people_df and contacts_df so we can search quickly.
#    (Optional – may not be critical if your data is small.)
# -------------------------------------------------------
# For demonstration, we assume direct .loc / .query calls are enough.

# -------------------------------------------------------
# 3) Prepare to generate new IDs for people, contacts, attendance
# -------------------------------------------------------
def _highest_id(frame):
    """Largest value in frame's "id" column, or 0 when the frame has no rows."""
    return frame["id"].max() if not frame.empty else 0

# Running counters; each new row increments its counter before use.
max_person_id = _highest_id(people_df)
max_contact_id = _highest_id(contacts_df)
max_attendance_id = _highest_id(attendance_df)

# New rows are collected here and appended to the tables in one go afterwards.
new_people_rows, new_contacts_rows, new_attendance_rows = [], [], []

# (first_name, last_name) pairs that name matching could not resolve.
handle_indices_list = []

# -------------------------------------------------------
# 4) Iterate over df_current rows
# -------------------------------------------------------
# For every row of the raw export: resolve (or create) the person, queue any
# contact details not yet stored, and queue one attendance record for the event.
for _, row in df_current.iterrows():
    # Extract relevant info
    raw_first = row.get(first_name_column, "")
    raw_last  = row.get(last_name_column, "")
    raw_email = row.get(email_column, "")
    raw_school_email = row.get(school_email_column, "")  # dedicated school-email answer
    raw_phone = row.get(phone_column, "")
    raw_invite_token = row.get(invite_token_column, "default")
    raw_rsvp_status = row.get(approved_column, pd.NA)
    raw_rsvp_datetime = row.get(rsvp_datetime_column, None)
    raw_attended = row.get(attendance_column, None)

    # Prepare a temp dict so we can pass to find_person_id
    # Clean up the email, phone, and school email (missing values become "")
    email_clean = str(raw_email).strip().lower() if pd.notna(raw_email) else ""
    school_email_clean = str(raw_school_email).strip().lower() if pd.notna(raw_school_email) else ""
    # NOTE(review): if the phone column was parsed as float, str() keeps a
    # trailing ".0" (e.g. "19789738038.0") - confirm this is the intended format.
    phone_clean = str(raw_phone).strip() if pd.notna(raw_phone) else ""
    
    # For matching, prioritize school email over general email
    primary_email_for_matching = school_email_clean if school_email_clean else email_clean
    
    row_dict_for_matching = {
        "first_name": raw_first,
        "last_name":  raw_last,
        "email": primary_email_for_matching,
        "phone": phone_clean
    }

    # 4) Try to resolve the row to an existing person (may prompt interactively).
    #    find_person_id also returns people_df, which it may have updated.
    found_id, people_df = find_person_id(
        row_dict_for_matching,
        people_df,
        contacts_df,
        email_col="email",
        phone_col="phone",
        handle_indices_list=handle_indices_list,
        fuzzy_threshold=0.80
    )
    matched_person_id = found_id

    # 4c) If we *still* don't have a match, create a new person row
    #     and handle their contact info.
    #     The "!= 0" clause keeps a legitimate id of 0 from being treated as "no match".
    if not matched_person_id and matched_person_id != 0:
        max_person_id += 1
        matched_person_id = max_person_id
        norm_gender = row.get("_norm_gender", pd.NA)
        norm_school = row.get("_norm_school", pd.NA)
        norm_class_year = row.get("_norm_class_year", pd.NA)

        new_people_rows.append({
            "id": matched_person_id,
            "first_name": raw_first,
            "last_name": raw_last,
            "gender": norm_gender,            # "M"/"F"/NA
            "class_year": norm_class_year,    # int like 2029 or NA
            "is_jewish": pd.NA,
            "school": norm_school,            # normalised school label or NA
            "preferred_name": pd.NA
        })

        # Add their contact info - prioritize school email
        if school_email_clean:
            max_contact_id += 1
            new_contacts_rows.append({
                "id": max_contact_id,
                "person_id": matched_person_id,
                "contact_type": "school email",
                "contact_value": school_email_clean,
                "is_verified": False
            })

        # Also add general email if different from school email
        if email_clean and email_clean != school_email_clean:
            max_contact_id += 1
            # Any address containing ".edu" is classed as a school email.
            contact_type = "school email" if ".edu" in email_clean else "personal email"
            new_contacts_rows.append({
                "id": max_contact_id,
                "person_id": matched_person_id,
                "contact_type": contact_type,
                "contact_value": email_clean,
                "is_verified": False
            })

        if phone_clean:
            max_contact_id += 1
            new_contacts_rows.append({
                "id": max_contact_id,
                "person_id": matched_person_id,
                "contact_type": "phone",
                "contact_value": phone_clean,
                "is_verified": False
            })

    # 4d) If we found an existing person, check if we need to add
    #     any missing contact info to the contacts table
    #     NOTE(review): the checks below look for the value anywhere in
    #     contacts_df (not only under this person) and do not see rows already
    #     queued in new_contacts_rows during this run - confirm that is intended.
    else:
        # Check and add school email if it doesn't exist
        if school_email_clean:
            existing_school_email = contacts_df[
                (contacts_df["contact_value"].str.lower() == school_email_clean)
            ]
            if existing_school_email.empty:
                max_contact_id += 1
                new_contacts_rows.append({
                    "id": max_contact_id,
                    "person_id": matched_person_id,
                    "contact_type": "school email",
                    "contact_value": school_email_clean,
                    "is_verified": False
                })

        # Check and add general email if it doesn't exist and is different from school email
        if email_clean and email_clean != school_email_clean:
            existing_email = contacts_df[
                (contacts_df["contact_value"].str.lower() == email_clean)
            ]
            if existing_email.empty:
                max_contact_id += 1
                contact_type = "school email" if ".edu" in email_clean else "personal email"
                new_contacts_rows.append({
                    "id": max_contact_id,
                    "person_id": matched_person_id,
                    "contact_type": contact_type,
                    "contact_value": email_clean,
                    "is_verified": False
                })

        # Check whether this phone value is already stored (exact string comparison)
        if phone_clean:
            existing_phone = contacts_df[
                (contacts_df["contact_value"] == phone_clean)
            ]
            if existing_phone.empty:
                max_contact_id += 1
                new_contacts_rows.append({
                    "id": max_contact_id,
                    "person_id": matched_person_id,
                    "contact_type": "phone",
                    "contact_value": phone_clean,
                    "is_verified": False
                })

    # -------------------------------------------------------
    # 5) Create attendance record
    # -------------------------------------------------------
    max_attendance_id += 1
    attendance_id = max_attendance_id

    # Approved only when the order status is one of the configured values.
    approved_val = 1 if raw_rsvp_status in rsvp_approved_values else 0
    
    # Interpret "checked_in": only these spellings of a positive scan count as attended
    checked_in_val = False
    if str(raw_attended).strip().lower() in ["1", "1.0", "true", "yes"]:
        checked_in_val = True

    # Look up invite_token_id from map
    # (We forced token to string, so let's do that here as well)
    token_str = str(raw_invite_token)
    invite_token_id = invite_token_map.get(token_str, None)
    if not invite_token_id:
        # Fallback if something was missing:
        invite_token_id = invite_token_map.get("default", None)

    new_attendance_rows.append({
        "id": attendance_id,
        "person_id": matched_person_id,
        "event_id": new_event_id,
        # NOTE(review): for a NaN status this is False and for any other value
        # (including "") it is True; the == "" clause does not change that - confirm intent.
        "rsvp": True if (not pd.isna(raw_rsvp_status)) or  (raw_rsvp_status == "") else False,
        "approved": approved_val,
        "checked_in": checked_in_val,
        "rsvp_datetime": raw_rsvp_datetime,
        "is_first_event": False,
        "invite_token_id": invite_token_id
    })

# -------------------------------------------------------
# 6) Append new rows to each DataFrame
# -------------------------------------------------------
# Fold the queued rows into their tables; empty queues leave the table untouched.
if len(new_people_rows) > 0:
    people_additions = pd.DataFrame(new_people_rows)
    people_df = pd.concat([people_df, people_additions], ignore_index=True)

if len(new_contacts_rows) > 0:
    contact_additions = pd.DataFrame(new_contacts_rows)
    contacts_df = pd.concat([contacts_df, contact_additions], ignore_index=True)

if len(new_attendance_rows) > 0:
    attendance_additions = pd.DataFrame(new_attendance_rows)
    attendance_df = pd.concat([attendance_df, attendance_additions], ignore_index=True)
    # One attendance record per person per event; the earliest row wins.
    attendance_df = attendance_df.drop_duplicates(subset=["person_id", "event_id"], keep="first")

# Store names trimmed and in Title Case.
for name_column in ('first_name', 'last_name'):
    people_df[name_column] = people_df[name_column].str.strip().str.title()

Could not find person with email: jav2193@columbia.edu
Could not find person with email: cgoodman@mba2026.hbs.edu
Could not find person with phone: 19789738038.0
Could not find person with email: leonardbouffier@college.harvard.edu
Could not find person with email: rmorrison6@berklee.edu
Could not find person with phone: 16318273069.0
Could not find person with email: rquinn1@mit.edu
Could not find person with phone: 13154096192.0
No match found for 'ryan quinn'.
Could not find person with email: jbergstrom@college.harvard.edu
Could not find person with email: rickyang@college.harvard.edu
Could not find person with phone: 19144335054.0
No match found for 'rick yang'.
Could not find person with email: jpaul@college.harvard.edu
Could not find person with email: sambernbaum@college.harvard.edu
Could not find person with email: jesusbarrios@college.harvard.edu
Could not find person with email: lara_rieger@college.harvard.edu
Could not find person with email: gpellino@college.harvard.edu
Co

In [25]:
# -------------------------------------------------------
# 7) (Optional) Write updated DataFrames back to CSV
# -------------------------------------------------------
# Persist every updated table back to its CSV (without the pandas index column).
for csv_path, table in (
    ("final/people.csv", people_df),
    ("final/contacts.csv", contacts_df),
    ("final/invite_tokens.csv", invite_tokens_df),
    ("final/attendance.csv", attendance_df),
):
    table.to_csv(csv_path, index=False)

print("Done! Updated data has been saved.")

Done! Updated data has been saved.
