# Events And Registrants Data

In [None]:
# Import the shared config
import config
import hashlib
import pandas as pd  # type: ignore

# Load the raw CSV whose location is defined in the shared config module
events_and_registrants_raw_df = pd.read_csv(
    filepath_or_buffer=config.EVENTS_AND_REGISTRANTS_RAW,
)
# Show the first five rows as the cell output
events_and_registrants_raw_df.head(n=5)

In [None]:
def clean_data(df):
    """Return a cleaned, re-keyed copy of the raw events-and-registrants data.

    The input frame is left untouched. On the copy:

    * column names are upper-cased and spaces are replaced with ``_``;
    * columns that hold no value at all are dropped, except the input
      columns this function reads or orders, which are kept even when
      they are empty;
    * ``EVENT_TIMESTAMP`` is built as ``"<EVENT_DATE> <EVENT_START_TIME>"``;
    * ``EVENT_ID`` is a short hash of the row's index label, event date,
      start time and finish time, inserted as the first column;
    * ``EVENT_INFO_ID`` is a short hash of the timestamp, category and
      title, after which ``EVENT_CATEGORY`` and ``EVENT_TITLE`` are removed;
    * the known columns are moved to the front in a fixed order and every
      other remaining column follows in its existing order.

    Args:
        df: Raw data. After name normalization it must contain
            ``EVENT_DATE``, ``EVENT_START_TIME``, ``EVENT_FINISH_TIME``,
            ``EVENT_CATEGORY``, ``EVENT_TITLE``, ``MEMBER_NUMBER``,
            ``REGISTRANT_LAST_NAME``, ``REGISTRANT_FIRST_NAME``,
            ``REGISTRANT_TYPE``, ``REGISTRATION_STATUS`` and ``ATTENDED``.

    Returns:
        A new DataFrame; a zero-row input yields a zero-row result.

    Raises:
        KeyError: If one of the required columns is absent from ``df``.
    """
    # Input columns that are read or ordered below. They must survive the
    # "drop empty columns" step, otherwise a column that simply has no values
    # yet would make the later lookups fail.
    required_inputs = [
        "EVENT_DATE",
        "EVENT_START_TIME",
        "EVENT_FINISH_TIME",
        "EVENT_CATEGORY",
        "EVENT_TITLE",
        "MEMBER_NUMBER",
        "REGISTRANT_LAST_NAME",
        "REGISTRANT_FIRST_NAME",
        "REGISTRANT_TYPE",
        "REGISTRATION_STATUS",
        "ATTENDED",
    ]

    # Final order of the leading columns
    leading_columns = [
        "EVENT_ID",
        "EVENT_TIMESTAMP",
        "EVENT_DATE",
        "EVENT_START_TIME",
        "EVENT_FINISH_TIME",
        "EVENT_INFO_ID",
        "MEMBER_NUMBER",
        "REGISTRANT_LAST_NAME",
        "REGISTRANT_FIRST_NAME",
        "REGISTRANT_TYPE",
        "REGISTRATION_STATUS",
        "ATTENDED",
    ]

    def short_hash(value):
        # First 16 hex digits of the MD5 digest, uppercased
        return hashlib.md5(value.encode()).hexdigest()[:16].upper()

    # Don't work on the original
    cleaned = df.copy()

    # Normalize column names: uppercase, white space replaced with _
    cleaned.columns = cleaned.columns.str.upper().str.replace(" ", "_")

    # Drop columns with no values, positionally, but never a required input
    is_empty = cleaned.isna().all(axis=0).to_numpy(dtype=bool)
    is_required = cleaned.columns.isin(required_inputs)
    cleaned = cleaned.loc[:, ~(is_empty & ~is_required)].copy()

    event_dates = cleaned["EVENT_DATE"]
    start_times = cleaned["EVENT_START_TIME"]
    finish_times = cleaned["EVENT_FINISH_TIME"]

    # Derived values are built by iterating only the columns involved, so
    # formatting does not depend on the dtypes of unrelated columns and a
    # zero-row frame is handled like any other.
    cleaned["EVENT_TIMESTAMP"] = [
        f"{event_date} {start_time}"
        for event_date, start_time in zip(event_dates, start_times)
    ]

    # Hash the combined values of the row index label, EVENT_DATE,
    # EVENT_START_TIME and EVENT_FINISH_TIME; insert it as the first column
    cleaned.insert(
        0,
        "EVENT_ID",
        [
            short_hash(f"{label}-{event_date}-{start_time}-{finish_time}")
            for label, event_date, start_time, finish_time in zip(
                cleaned.index, event_dates, start_times, finish_times
            )
        ],
    )

    # Hash the combined values of EVENT_TIMESTAMP, EVENT_CATEGORY, EVENT_TITLE
    cleaned["EVENT_INFO_ID"] = [
        short_hash(f"{timestamp} {category} {title}")
        for timestamp, category, title in zip(
            cleaned["EVENT_TIMESTAMP"],
            cleaned["EVENT_CATEGORY"],
            cleaned["EVENT_TITLE"],
        )
    ]

    # EVENT_CATEGORY and EVENT_TITLE are represented by EVENT_INFO_ID now
    cleaned = cleaned.drop(columns=["EVENT_CATEGORY", "EVENT_TITLE"])

    # EVENT_INDEX is a reserved scratch name: an incoming column with that
    # name is never part of the output
    cleaned = cleaned.drop(columns=["EVENT_INDEX"], errors="ignore")

    # Known columns first, everything else afterwards in existing order
    trailing_columns = [col for col in cleaned.columns if col not in leading_columns]
    return cleaned[leading_columns + trailing_columns]


# Run the cleaning step on the raw frame; the raw frame itself is not modified
events_and_registrants_clean_df = clean_data(df=events_and_registrants_raw_df)
# Show the first five cleaned rows as the cell output
events_and_registrants_clean_df.head(n=5)

In [None]:
# Write the cleaned frame to the interim CSV path from the shared config,
# leaving the DataFrame index out of the file
events_and_registrants_clean_df.to_csv(
    path_or_buf=config.EVENTS_AND_REGISTRANTS_INTERIM,
    index=False,
)