# Events And Registrants Data

In [None]:
# Import the shared config
import config
import hashlib
import pandas as pd  # type: ignore

# Load the raw event-information CSV named in the shared config and preview it
event_information_data_raw_df = pd.read_csv(
    filepath_or_buffer=config.EVENT_INFORMATION_DATA_RAW
)
event_information_data_raw_df.head(n=5)

In [None]:
def _short_hash(value):
    """Return the first 16 hex digits of the MD5 digest of ``value``, uppercased."""
    return hashlib.md5(value.encode()).hexdigest()[:16].upper()


def clean_data(df):
    """Return a cleaned copy of ``df`` with an ``EVENT_INFO_ID`` key column.

    Steps performed on the copy:

    1. Column names are uppercased, spaces and ``/`` become ``_``, and
       ``?`` and ``.`` are removed.
    2. ``START_DATE`` is parsed with the format ``%Y-%m-%d %I:%M:%S %p``;
       values that do not match become ``NaT``.
    3. ``EVENT_INFO_ID`` is inserted as the first column. It is a 16-character
       uppercase MD5 prefix of ``"<START_DATE> <CATEGORY_NAME> <TITLE>"``.

    Args:
        df: DataFrame whose cleaned column names include ``START_DATE``,
            ``CATEGORY_NAME`` and ``TITLE``.

    Returns:
        A new DataFrame; the frame passed in is left unmodified, so the
        function can be called repeatedly on the same input.

    Raises:
        KeyError: If one of the required columns is missing after the
            column names have been normalised.
    """
    # Work on a copy: renaming columns and inserting EVENT_INFO_ID on the
    # caller's frame would make a second call fail on the duplicate insert.
    df = df.copy()

    # Normalise the column names. Every pattern is a literal character, so
    # regex=False is passed throughout ("?" and "." are regex metacharacters).
    cleaned_columns = df.columns.str.upper()
    for old, new in ((" ", "_"), ("/", "_"), ("?", ""), (".", "")):
        cleaned_columns = cleaned_columns.str.replace(old, new, regex=False)
    df.columns = cleaned_columns

    # Formats
    datetime_format = "%Y-%m-%d %I:%M:%S %p"  # 2024-12-31 11:59:59 PM

    #
    # Date and time column manipulations
    #

    # Plain column assignment replaces the column, so it takes the datetime
    # dtype. Assigning through .loc[:, col] writes into the existing column
    # and, depending on the pandas version, can leave it as object dtype.
    df["START_DATE"] = pd.to_datetime(
        df["START_DATE"], format=datetime_format, errors="coerce"
    )

    #
    # EVENT_INFO_ID key generation
    #

    # Hash the combined START_DATE, CATEGORY_NAME and TITLE values of each row.
    # Building a list with zip (rather than a row-wise DataFrame.apply) also
    # yields a well-formed, empty result for an empty frame.
    event_info_ids = [
        _short_hash(f"{start_date} {category_name} {title}")
        for start_date, category_name, title in zip(
            df["START_DATE"], df["CATEGORY_NAME"], df["TITLE"]
        )
    ]

    # Insert the hash as the first column with EVENT_INFO_ID as the name
    df.insert(0, "EVENT_INFO_ID", event_info_ids)

    return df


# Run the cleaning step on the raw events frame and preview the result
event_information_data_clean_df = clean_data(df=event_information_data_raw_df)
event_information_data_clean_df.head(n=5)

In [None]:
# Write the cleaned events to the interim path from the shared config,
# leaving the DataFrame index out of the file
event_information_data_clean_df.to_csv(
    path_or_buf=config.EVENT_INFORMATION_DATA_INTERIM,
    index=False,
)