# Events And Registrants Data

In [None]:
# Import the shared config
import config
import hashlib
import pandas as pd  # type: ignore

# Load the raw events/registrants CSV from the path defined in the shared config
events_and_registrants_raw_df = pd.read_csv(filepath_or_buffer=config.EVENTS_AND_REGISTRANTS_RAW)
# Quick preview (disabled): events_and_registrants_raw_df.head()

In [None]:
def _short_hash(value):
    """Return a 16-character uppercase hex key derived from *value*.

    The key is the first 16 hex digits of the MD5 digest of the encoded
    string. MD5 is used here only to derive identifiers, not for security.
    """
    return hashlib.md5(value.encode()).hexdigest()[:16].upper()


def clean_data(df):
    """Normalise the events/registrants frame and add hashed key columns.

    Processing steps:

    1. Column names are upper-cased and spaces are replaced with ``_``.
    2. ``EVENT_START_TIME`` and ``EVENT_FINISH_TIME`` are each prefixed with
       ``EVENT_DATE``, parsed with ``%Y-%m-%d %H:%M:%S`` and reduced to
       ``datetime.time`` values.
    3. ``EVENT_DATE`` is parsed with ``%Y-%m-%d`` into a datetime64 column.
    4. ``EVENT_INFO_ID`` is the short hash of
       ``"<start timestamp> <EVENT_CATEGORY> <EVENT_TITLE>"``.
    5. ``EVENT_ID`` is the short hash of ``"<row index label> <EVENT_INFO_ID>"``.

    Values that do not match the expected formats are coerced to ``NaT``
    instead of raising.

    Args:
        df: DataFrame whose columns, after name normalisation, include
            ``EVENT_DATE``, ``EVENT_START_TIME``, ``EVENT_FINISH_TIME``,
            ``EVENT_CATEGORY`` and ``EVENT_TITLE``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index whose first two columns are
        ``EVENT_ID`` and ``EVENT_INFO_ID``, followed by the normalised input
        columns. The input frame is left unmodified.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Formats
    datetime_format = "%Y-%m-%d %H:%M:%S"  # 2024-12-31 23:59:59
    date_format = "%Y-%m-%d"  # 2024-12-31
    time_columns = ("EVENT_START_TIME", "EVENT_FINISH_TIME")

    # Work on a copy: the caller's frame stays untouched, so the function can
    # be called again on the same raw data (e.g. when a notebook cell is re-run).
    df = df.copy()

    # Uppercase column names and replace white space with _
    df.columns = df.columns.str.upper().str.replace(" ", "_", regex=False)

    #
    # Date and time columns
    #

    # Parse "<date> <time>" once per time column, before any column is
    # overwritten, so every parse sees the raw text values.
    parsed = {}
    for col in time_columns:
        combined = pd.Series(
            [f"{day} {clock}" for day, clock in zip(df["EVENT_DATE"], df[col])],
            index=df.index,
            dtype=object,
        )
        parsed[col] = pd.to_datetime(combined, format=datetime_format, errors="coerce")

    # The event start timestamp is only needed as hash input; it is never
    # stored as a column in the result.
    start_timestamps = parsed["EVENT_START_TIME"]

    # Keep only the time-of-day part in the time columns
    for col in time_columns:
        df[col] = parsed[col].dt.time

    # Plain column assignment (rather than ``df.loc[:, col] = ...``) replaces
    # the column, so EVENT_DATE really ends up with a datetime64 dtype instead
    # of the values being written into the existing object column.
    # NOTE(review): on pandas >= 2.0 this may change how EVENT_DATE is
    # serialised when the frame is written to CSV - confirm with downstream
    # readers of the interim file.
    df["EVENT_DATE"] = pd.to_datetime(df["EVENT_DATE"], format=date_format, errors="coerce")

    #
    # EVENT_INFO_ID key generation
    #

    # Hash the combined start timestamp, EVENT_CATEGORY and EVENT_TITLE values
    info_ids = [
        _short_hash(f"{start} {category} {title}")
        for start, category, title in zip(
            start_timestamps, df["EVENT_CATEGORY"], df["EVENT_TITLE"]
        )
    ]

    #
    # Primary key generation
    #

    # Hash the combined row index label and EVENT_INFO_ID values
    event_ids = [
        _short_hash(f"{label} {info_id}") for label, info_id in zip(df.index, info_ids)
    ]

    # Key columns go first: EVENT_ID, then EVENT_INFO_ID
    df.insert(0, "EVENT_INFO_ID", info_ids)
    df.insert(0, "EVENT_ID", event_ids)

    # Return the frame with a fresh positional index
    return df.reset_index(drop=True)


# Run the cleaning step on the raw frame and preview the first rows
events_and_registrants_clean_df = events_and_registrants_raw_df.pipe(clean_data)
events_and_registrants_clean_df.head(n=5)

In [None]:
# Write the cleaned frame to the interim location, leaving out the index column
events_and_registrants_clean_df.to_csv(
    path_or_buf=config.EVENTS_AND_REGISTRANTS_INTERIM,
    index=False,
)