# Events Information Data

In [None]:
# Import the shared config
import config
import hashlib
import pandas as pd  # type: ignore

# Read raw data
# The CSV location comes from the shared config module
# (config.EVENT_INFORMATION_DATA_RAW). The trailing .head() is a bare
# expression so the notebook cell previews the first rows of the frame.
event_information_data_raw_df = pd.read_csv(config.EVENT_INFORMATION_DATA_RAW)
event_information_data_raw_df.head()

In [None]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, normalized copy of the raw event-information data.

    The input frame is never modified. The steps are:

    1. Uppercase the column names and drop columns that contain no values.
    2. Normalize the column names: spaces and "/" become "_", "?" and "."
       are removed.
    3. Parse START_DATE into datetimes; values that do not match the
       expected format become NaT.
    4. Rename START_DATE, CATEGORY_NAME, TITLE and SHORT_DESCRIPTION to their
       EVENT_* names.
    5. Add EVENT_INFO_ID, a 16-character uppercase hex key derived from the
       MD5 of "<EVENT_TIMESTAMP> <EVENT_CATEGORY> <EVENT_TITLE>".
    6. Move the well-known columns to the front, in a fixed order, followed
       by any remaining columns in their existing order.

    Args:
        df: Raw event-information data. After name normalization it must
            contain START_DATE, CATEGORY_NAME and TITLE columns.

    Returns:
        A new DataFrame with the cleaned data.

    Raises:
        KeyError: If START_DATE, CATEGORY_NAME or TITLE is missing after
            normalization (including when the column was entirely empty and
            therefore dropped).
    """
    # Don't work on the original data
    df_copy = df.copy()

    # Uppercase column names
    df_copy.columns = df_copy.columns.str.upper()

    # Drop columns with no values
    df_copy = df_copy.dropna(axis=1, how="all")

    # Fix column names. Every replacement is a literal (non-regex) one, so
    # "?" and "." are never interpreted as regex metacharacters.
    column_names = df_copy.columns
    for old, new in ((" ", "_"), ("/", "_"), ("?", ""), (".", "")):
        column_names = column_names.str.replace(old, new, regex=False)
    df_copy.columns = column_names

    # Convert to datetime. Plain column assignment replaces the column, so
    # the result has a datetime dtype; ``.loc[:, col] = ...`` writes into the
    # existing object column on recent pandas and leaves the dtype unchanged.
    datetime_format = "%Y-%m-%d %I:%M:%S %p"  # 2024-12-31 11:59:59 PM
    df_copy["START_DATE"] = pd.to_datetime(
        df_copy["START_DATE"], format=datetime_format, errors="coerce"
    )

    # Normalize column names (labels that are absent are ignored by rename)
    df_copy = df_copy.rename(
        columns={
            "START_DATE": "EVENT_TIMESTAMP",
            "CATEGORY_NAME": "EVENT_CATEGORY",
            "TITLE": "EVENT_TITLE",
            "SHORT_DESCRIPTION": "EVENT_SHORT_DESCRIPTION",
        }
    )

    #
    # EVENT_INFO_ID key
    #

    # Function to create a short, uppercase hash
    def _create_hash(value: str) -> str:
        return hashlib.md5(value.encode()).hexdigest()[:16].upper()

    # Hash the combined values from EVENT_TIMESTAMP, EVENT_CATEGORY, EVENT_TITLE
    # columns. Zipping the three columns avoids building a Series per row.
    key_values = zip(
        df_copy["EVENT_TIMESTAMP"],
        df_copy["EVENT_CATEGORY"],
        df_copy["EVENT_TITLE"],
    )
    df_copy["EVENT_INFO_ID"] = [
        _create_hash(f"{timestamp} {category} {title}")
        for timestamp, category, title in key_values
    ]

    # Order columns
    preferred_order = [
        "EVENT_INFO_ID",
        "EVENT_TIMESTAMP",
        "EVENT_CATEGORY",
        "EVENT_TITLE",
        "EVENT_SHORT_DESCRIPTION",
        "LOCATION",
        "LOCATION_ADDRESS",
        "LOCATION_CITY",
        "LOCATION_STATE_PROVINCE",
        "LOCATION_ZIP_POSTAL_CODE",
        "LOCATION_COUNTRY",
        "LOCATION_PHONE",
        "LOCATION_WEBSITE",
    ]
    # Only reorder columns that actually exist: optional columns may be absent
    # from the export or may have been dropped above for being entirely empty.
    leading = [col for col in preferred_order if col in df_copy.columns]
    trailing = [col for col in df_copy.columns if col not in preferred_order]
    df_copy = df_copy[leading + trailing]

    return df_copy


# Clean the raw data; clean_data works on a copy, so the raw frame is left
# untouched. The trailing .head() previews the cleaned result in the notebook.
event_information_data_clean_df = clean_data(event_information_data_raw_df)
event_information_data_clean_df.head()

In [None]:
# Persist the interim data
# Written as CSV to the location given by config.EVENT_INFORMATION_DATA_INTERIM;
# index=False keeps the DataFrame index out of the file.
event_information_data_clean_df.to_csv(config.EVENT_INFORMATION_DATA_INTERIM, index=False)