# **Harmonizing Data**

Since the app and the website each provide different data, I'm going to try to harmonize them a bit.


# Setup

The cells below will help to set up the rest of the notebook.

I'll start by configuring the kernel that's running this notebook:


In [1]:
# Move the working directory up one level (presumably to the project root, so
# the relative "data/..." paths used by later cells resolve).
# NOTE(review): the move is relative to the current cwd, so re-running this
# cell goes up one more level each time.
%cd ..

# Enable the autoreload extension; mode 2 reloads imported modules before
# each cell runs
%load_ext autoreload
%autoreload 2

# Load environment variables from a .env file; override=True lets values in
# the file replace variables that are already set in the environment
from dotenv import load_dotenv
load_dotenv(override=True)

/Users/thubbard/Documents/personal/programming/pax-pal-2025


True

Next, I'm going to import the necessary modules:


In [2]:
# Third-party imports
import pandas as pd

# Loading Data

Below, I'll load in the data I scraped from both the website and the app.


In [3]:
# Read the scraped exhibitor details and the three game listings
exhibitor_details_df = pd.read_json("data/exhibitor_details.json")
games_from_website_df = pd.read_json("data/expo_hall_demos.json")
games_from_app_df = pd.read_json("data/games_from_app.json")
games_from_website_more_details_df = pd.read_json("data/expo_hall_demos_detailed.json")

# Attach a title_merge_key to every game listing: the lowercased title with
# every character outside a-z / 0-9 removed
games_from_website_df = games_from_website_df.assign(
    title_merge_key=lambda games: (
        games["name"].str.lower().str.replace(r"[^a-z0-9]", "", regex=True)
    )
)
games_from_app_df = games_from_app_df.assign(
    title_merge_key=lambda games: (
        games["title"].str.lower().str.replace(r"[^a-z0-9]", "", regex=True)
    )
)
games_from_website_more_details_df = games_from_website_more_details_df.assign(
    title_merge_key=lambda games: (
        games["name"].str.lower().str.replace(r"[^a-z0-9]", "", regex=True)
    )
)

# Pull the modal description from the detailed website scrape onto the basic
# website listing; rows without a detailed counterpart keep a missing description
detail_join_columns = ["id", "name", "company", "exhibitor_id", "title_merge_key"]
website_descriptions_df = games_from_website_more_details_df[
    detail_join_columns + ["modal_description"]
].rename(columns={"modal_description": "description"})

games_from_website_df = pd.merge(
    games_from_website_df,
    website_descriptions_df,
    how="left",
    on=detail_join_columns,
)

# Determining Unique Games

Next up: I'm going to determine all of the unique games:


In [4]:
# Website columns to carry over, prefixed with "fromweb_" so they stay
# distinguishable from the app's columns after the join
website_columns_df = games_from_website_df[
    ["name", "company", "image_url", "description", "title_merge_key"]
].rename(
    columns={
        "name": "fromweb_title",
        "company": "fromweb_company",
        "image_url": "fromweb_image_url",
        "description": "fromweb_description",
    }
)

# Games present in both sources: inner join on the normalized title key
games_from_both_app_and_website_df = pd.merge(
    games_from_app_df,
    website_columns_df,
    how="inner",
    on=["title_merge_key"],
)

# Everything whose key did not make it into the join belongs to one source only
shared_title_keys = games_from_both_app_and_website_df["title_merge_key"]

app_title_is_shared = games_from_app_df["title_merge_key"].isin(shared_title_keys)
games_from_only_app_df = games_from_app_df.loc[~app_title_is_shared]

website_title_is_shared = games_from_website_df["title_merge_key"].isin(
    shared_title_keys
)
games_from_only_website_df = games_from_website_df.loc[~website_title_is_shared]

How many games are in each?

In [5]:
# Report the size of each of the three groups, one per line
print(
    f"Games in both app and website: {len(games_from_both_app_and_website_df)}",
    f"Games only in app: {len(games_from_only_app_df)}",
    f"Games only in website: {len(games_from_only_website_df)}",
    sep="\n",
)

Games in both app and website: 106
Games only in app: 46
Games only in website: 39


I can grab all of the information that I have about each game, and save it below:


In [6]:
def _tag_descriptions(raw_descriptions, source):
    """Wrap raw description text(s) in ``{"source": ..., "text": ...}`` records.

    Args:
        raw_descriptions: A list of descriptions, a single description, or a
            missing value (None / NaN).
        source: Label stored under the "source" key of every record.

    Returns:
        A list with one dict per description; empty when the input is missing.
    """
    if isinstance(raw_descriptions, list):
        texts = raw_descriptions
    elif pd.notna(raw_descriptions):
        texts = [raw_descriptions]
    else:
        texts = []
    return [{"source": source, "text": text} for text in texts]


# Start with the games found in both the app and the website as a basis
unified_games_df = games_from_both_app_and_website_df[
    [
        "title",
        "booth_number",
        "description",
        "genres",
        "developer",
        "release_year",
        "fromweb_description",
        "fromweb_company",
        "fromweb_image_url",
    ]
].rename(
    columns={
        "description": "description_texts",
        "fromweb_image_url": "header_image_url",
    }
)

# Combine the app's description(s) and the website's description into one
# list, each entry tagged with the source it came from
unified_games_df["description_texts"] = unified_games_df.apply(
    lambda row: (
        _tag_descriptions(row["description_texts"], "pax_app")
        + _tag_descriptions(row["fromweb_description"], "pax_website")
    ),
    axis=1,
)

# Prefer the website's company name as the developer; the app's developer is
# only kept when the website has no company for the game
unified_games_df["developer"] = unified_games_df["fromweb_company"].where(
    unified_games_df["fromweb_company"].notna(),
    unified_games_df["developer"],
)
# Drop the fromweb_company column as it's no longer needed
unified_games_df = unified_games_df.drop(columns=["fromweb_company"])

# Games that only appear in the app: same columns, app-tagged descriptions,
# and no header image (the image URL only comes from the website)
app_only_games = games_from_only_app_df[
    [
        "title",
        "booth_number",
        "description",
        "genres",
        "developer",
        "release_year",
    ]
].rename(
    columns={
        "description": "description_texts",
    }
)
app_only_games["description_texts"] = app_only_games["description_texts"].apply(
    lambda raw: _tag_descriptions(raw, "pax_app")
)
app_only_games["header_image_url"] = None

# Add all of the games that are only in the app
unified_games_df = pd.concat(
    [unified_games_df, app_only_games],
    ignore_index=True,
)

# Games that only appear in the website: map the website's column names onto
# the unified ones and fill the app-only fields with empty values
website_only_games = games_from_only_website_df[
    ["name", "company", "image_url", "description"]
].rename(
    columns={
        "name": "title",
        "company": "developer",
        "image_url": "header_image_url",
        "description": "description_texts",
    }
)
website_only_games["description_texts"] = website_only_games[
    "description_texts"
].apply(lambda raw: _tag_descriptions(raw, "pax_website"))
# One fresh list per row so the rows never share a single list object
website_only_games["genres"] = [[] for _ in range(len(website_only_games))]
website_only_games["release_year"] = None
website_only_games["booth_number"] = None

# Add all of the games that are only in the website
unified_games_df = pd.concat(
    [unified_games_df, website_only_games],
    ignore_index=True,
)

# Normalize booth numbers to whole numbers, leaving missing booths missing.
# NOTE(review): with missing values present pandas may still store this
# column as floats rather than ints.
unified_games_df["booth_number"] = unified_games_df["booth_number"].apply(
    lambda x: int(x) if pd.notna(x) else None
)

  unified_games_df = pd.concat(


How many games total are there?


In [7]:
# Row count of the unified table (one row per game)
unified_games_df.shape[0]

191

# Linking Games to Exhibitor Booths
**WARNING: SUPER MESSY.** Did this at like 1am, pretty tired, rushing through it 

I'll grab the exhibitor booths first:

In [8]:
def _playable_game_names(playable_games):
    """Return the "name" of each playable-game record.

    Args:
        playable_games: Expected to be a list of dicts; any other value
            (e.g. a missing value) is treated as "no games".

    Returns:
        A list of names (None for records without a "name" key). Entries
        that are not dicts are skipped.
    """
    if not isinstance(playable_games, list):
        return []
    return [game.get("name") for game in playable_games if isinstance(game, dict)]


truncated_exhibitor_details_df = (
    exhibitor_details_df[["name", "booth", "description", "playable_games"]]
    .rename(
        columns={
            "name": "exhibitor_name",
            "booth": "booth_number",
            "description": "exhibitor_description",
        }
    )
    .copy()
)

# If the booth number is a comma-separated string, keep only the first entry
truncated_exhibitor_details_df["booth_number"] = truncated_exhibitor_details_df[
    "booth_number"
].apply(lambda x: x.split(",")[0] if isinstance(x, str) and "," in x else x)

# Keep only exhibitors whose booth is a purely numeric string, then convert
# those booths to int
booth_number_is_numeric = truncated_exhibitor_details_df["booth_number"].apply(
    lambda x: x.isnumeric() if pd.notna(x) else False
)
truncated_exhibitor_details_df = truncated_exhibitor_details_df[
    booth_number_is_numeric
].copy()
truncated_exhibitor_details_df["booth_number"] = truncated_exhibitor_details_df[
    "booth_number"
].map(int)

# Load in the additional exhibitor info, one row per (exhibitor, booth).
# NOTE(review): these booth values are used as loaded; confirm they are ints
# like the booth numbers converted above.
additional_exhibitor_info_df = (
    pd.read_json("data/exhibitor_list.json")
    .explode("booths")
    .rename(
        columns={
            "name": "exhibitor_name",
            "booths": "booth_number",
            "description_excerpt": "exhibitor_description",
        }
    )
)
# This source has no playable games. Build one fresh list per row: `[[]] * n`
# would put the *same* list object in every row.
additional_exhibitor_info_df["playable_games"] = [
    [] for _ in range(len(additional_exhibitor_info_df))
]

# Concatenate the additional exhibitor info with the existing exhibitor
# details; when an exhibitor appears more than once, the first row wins
truncated_exhibitor_details_df = pd.concat(
    [
        truncated_exhibitor_details_df,
        additional_exhibitor_info_df[
            ["exhibitor_name", "booth_number", "exhibitor_description", "playable_games"]
        ],
    ],
    ignore_index=True,
).drop_duplicates(
    subset=["exhibitor_name"],
    keep="first",
)

# Lookup of exhibitor details keyed by exhibitor name
exhibitor_details_dict_by_name = {
    row["exhibitor_name"]: {
        "booth_number": row["booth_number"],
        "exhibitor_description": row["exhibitor_description"],
        "playable_games": _playable_game_names(row["playable_games"]),
    }
    for _, row in truncated_exhibitor_details_df.iterrows()
}

# Keep a single exhibitor per booth before building the booth-keyed lookup
truncated_exhibitor_details_df = truncated_exhibitor_details_df.drop_duplicates(
    subset=["booth_number"]
)

# Lookup of exhibitor details keyed by booth number
exhibitor_details_dict = {
    row["booth_number"]: {
        "exhibitor_name": row["exhibitor_name"],
        "exhibitor_description": row["exhibitor_description"],
        "playable_games": _playable_game_names(row["playable_games"]),
    }
    for _, row in truncated_exhibitor_details_df.iterrows()
}


# Drop the playable_games column
truncated_exhibitor_details_df = truncated_exhibitor_details_df.drop(
    columns=["playable_games"]
)

Now, I'll merge:

In [9]:
# Attach the exhibitor registered at each game's booth (if any)
unified_games_with_exhibitors_df = pd.merge(
    unified_games_df.copy(),
    truncated_exhibitor_details_df,
    on="booth_number",
    how="left",
)

# Flag rows where the game's developer and the booth's exhibitor are not the
# same name, compared case-insensitively (a missing name counts as different)
developer_lower = unified_games_with_exhibitors_df["developer"].apply(
    lambda name: name.lower() if pd.notna(name) else None
)
exhibitor_lower = unified_games_with_exhibitors_df["exhibitor_name"].apply(
    lambda name: name.lower() if pd.notna(name) else None
)
unified_games_with_exhibitors_df["different"] = developer_lower.ne(exhibitor_lower)

# For flagged rows, blank out both exhibitor fields
for exhibitor_column in ("exhibitor_name", "exhibitor_description"):
    unified_games_with_exhibitors_df[exhibitor_column] = (
        unified_games_with_exhibitors_df.apply(
            lambda row: None if row["different"] else row[exhibitor_column],
            axis=1,
        )
    )

# The flag was only a working column
unified_games_with_exhibitors_df = unified_games_with_exhibitors_df.drop(
    columns=["different"]
)

I'll bring in the "special ones" - these are instances where the booth numbers *are* correct from the original scraping, but the name of the exhibitor *isn't* the developer. 

In [10]:
# Booths whose scraped booth number is correct even though the exhibitor's
# name is not the game's developer
SPECIAL_BOOTH_NUMBERS = [
    13097,
    15031,
    19097,
    18114,
    17096,
    15035,
    16097,
    16109,
    17092,
    21097,
    18104,
    14085,
    18098,
    21087,
]

# .copy() so the edits below change an independent frame instead of writing
# into a slice of truncated_exhibitor_details_df
special_exhibitors_df = truncated_exhibitor_details_df[
    truncated_exhibitor_details_df["booth_number"].isin(SPECIAL_BOOTH_NUMBERS)
].copy()

# Rename booth 13097 to "PAX Rising Showcase" and give it its own description
is_pax_rising_booth = special_exhibitors_df["booth_number"] == 13097
special_exhibitors_df.loc[is_pax_rising_booth, "exhibitor_name"] = (
    "PAX Rising Showcase"
)
special_exhibitors_df.loc[is_pax_rising_booth, "exhibitor_description"] = (
    "A collection of indie games PAX Rising is showcasing."
)

# Two lookups keyed by booth number: one for names, one for descriptions.
# A single booth -> name mapping must not be reused for the description
# column, otherwise descriptions end up holding exhibitor names.
special_exhibitors_dict = dict(
    zip(special_exhibitors_df["booth_number"], special_exhibitors_df["exhibitor_name"])
)
special_exhibitor_descriptions_dict = dict(
    zip(
        special_exhibitors_df["booth_number"],
        special_exhibitors_df["exhibitor_description"],
    )
)

# For games at a special booth, take the exhibitor name/description from the
# matching lookup; every other game keeps its current values
for exhibitor_column, special_values in (
    ("exhibitor_name", special_exhibitors_dict),
    ("exhibitor_description", special_exhibitor_descriptions_dict),
):
    unified_games_with_exhibitors_df[exhibitor_column] = (
        unified_games_with_exhibitors_df.apply(
            lambda row: special_values.get(row["booth_number"], row[exhibitor_column]),
            axis=1,
        )
    )

Next up, we'll try and match on the names of the exhibitors:

In [11]:
unified_games_with_exhibitors_df_rows = []
for game in unified_games_with_exhibitors_df.itertuples():
    game_record = game._asdict()
    # itertuples() adds the frame index as "Index"; it is not a data column
    game_record.pop("Index", None)

    # Keep the row untouched when it already has an exhibitor, or when its
    # booth is not in the booth lookup
    if (
        pd.notna(game.exhibitor_name)
        or game.booth_number not in exhibitor_details_dict
    ):
        unified_games_with_exhibitors_df_rows.append(game_record)
        continue

    booth_exhibitor = exhibitor_details_dict[game.booth_number]

    # The booth's exhibitor is the developer itself (case-insensitive):
    # keep the row untouched
    if pd.notna(game.developer) and (
        booth_exhibitor["exhibitor_name"].lower() == game.developer.lower()
    ):
        unified_games_with_exhibitors_df_rows.append(game_record)
        continue

    # The booth's exhibitor lists this game as playable: adopt that exhibitor
    if game.title in booth_exhibitor["playable_games"]:
        unified_games_with_exhibitors_df_rows.append(
            {
                **game_record,
                "exhibitor_name": booth_exhibitor["exhibitor_name"],
                "exhibitor_description": booth_exhibitor["exhibitor_description"],
            }
        )
        continue

    # Nothing ties the game to this booth: keep the game but clear its booth
    game_record["booth_number"] = None
    unified_games_with_exhibitors_df_rows.append(game_record)

# Rebuild the dataframe from the processed rows
unified_games_with_exhibitors_df = pd.DataFrame(unified_games_with_exhibitors_df_rows)

Next, we'll try to match using some of the "playable games" we've extracted:

In [12]:
# One row per (exhibitor, playable game); exhibitors without any playable
# game drop out
playable_games_df = (
    exhibitor_details_df.explode("playable_games")
    .dropna(subset=["playable_games"])
    .copy()
)
# Each playable game is expected to be a dict; anything else gets no name
playable_games_df["game_name"] = playable_games_df["playable_games"].map(
    lambda game: game.get("name") if isinstance(game, dict) else None
)

# Helper that turns a title into a case- and punctuation-insensitive merge key
def create_merge_key(title):
    """Build a normalized key for matching game titles.

    A missing title (None / NaN) yields None. Any other value is converted to
    a string, lowercased, and stripped of every character that is not
    alphanumeric (spaces and punctuation included).
    """
    if pd.isna(title):
        return None
    lowered_title = str(title).lower()
    return "".join(filter(str.isalnum, lowered_title))

def _first_booth_number(raw_booth):
    """Normalize a raw exhibitor booth value into a usable booth number.

    String booths may list several comma-separated booths or hold
    non-numeric text. The first entry is returned as an int when it consists
    only of digits, otherwise None. Non-string values are returned unchanged.
    """
    if not isinstance(raw_booth, str):
        return raw_booth
    first_booth = raw_booth.split(",")[0].strip()
    return int(first_booth) if first_booth.isdecimal() else None


# Add merge keys to playable games
playable_games_df["game_name_key"] = playable_games_df["game_name"].apply(
    create_merge_key
)

# Map each normalized game name to the exhibitor showing it. When several
# exhibitors list the same game, the last one processed wins.
playable_games_to_exhibitor_info_dict = {}
for row in playable_games_df.itertuples():
    if pd.notna(row.game_name_key):
        playable_games_to_exhibitor_info_dict[row.game_name_key] = {
            "exhibitor_name": row.name,
            "exhibitor_description": row.description,
            # The raw booth can be a comma-separated or non-numeric string,
            # which would break the integer cast applied to booth numbers
            # before saving, so normalize it here
            "booth_number": _first_booth_number(row.booth),
        }

unified_games_with_exhibitors_df_rows = []
for row in unified_games_with_exhibitors_df.itertuples():

    row_dict = row._asdict()
    # itertuples() adds the frame index as "Index"; it is not a data column
    row_dict.pop("Index", None)

    # If the game already has a booth number, keep it as is
    if pd.notna(row.booth_number):
        unified_games_with_exhibitors_df_rows.append(row_dict)
        continue

    # Otherwise look the game up by its normalized title
    title_key = create_merge_key(row.title)
    exhibitor_info = (
        playable_games_to_exhibitor_info_dict.get(title_key) if title_key else None
    )

    if exhibitor_info is not None:
        # An exhibitor lists this game as playable: adopt its name,
        # description, and booth
        unified_games_with_exhibitors_df_rows.append(
            {
                **row_dict,
                "exhibitor_name": exhibitor_info["exhibitor_name"],
                "exhibitor_description": exhibitor_info["exhibitor_description"],
                "booth_number": exhibitor_info["booth_number"],
            }
        )
    else:
        # No match: keep the game unchanged
        unified_games_with_exhibitors_df_rows.append(row_dict)

# Create a new dataframe from the rows
unified_games_with_exhibitors_df = pd.DataFrame(unified_games_with_exhibitors_df_rows)

Finally, we'll try and merge any whose developer name matches the exhibitor name:

In [13]:
# Case-insensitive index of exhibitor names. The earlier matching steps
# compare developer and exhibitor names lowercased, so this step falls back
# to the same rule when there is no exact match. The first exhibitor seen for
# a given lowercase name is kept.
exhibitor_names_by_lowercase = {}
for known_exhibitor_name in exhibitor_details_dict_by_name:
    if isinstance(known_exhibitor_name, str):
        exhibitor_names_by_lowercase.setdefault(
            known_exhibitor_name.lower(), known_exhibitor_name
        )

unified_games_with_exhibitors_df_rows = []
for row in unified_games_with_exhibitors_df.itertuples():
    row_dict = row._asdict()
    # itertuples() adds the frame index as "Index"; it is not a data column
    row_dict.pop("Index", None)

    # If the game already has a booth number, keep it as is
    if pd.notna(row.booth_number):
        unified_games_with_exhibitors_df_rows.append(row_dict)
        continue

    # Find the exhibitor named like the developer: exact match first, then
    # case-insensitive
    if row.developer in exhibitor_details_dict_by_name:
        matched_exhibitor_name = row.developer
    elif isinstance(row.developer, str):
        matched_exhibitor_name = exhibitor_names_by_lowercase.get(
            row.developer.lower()
        )
    else:
        matched_exhibitor_name = None

    if matched_exhibitor_name is None:
        # No exhibitor with that name: keep the game unchanged
        unified_games_with_exhibitors_df_rows.append(row_dict)
        continue

    # Adopt the matched exhibitor's name, description, and booth
    matched_exhibitor = exhibitor_details_dict_by_name[matched_exhibitor_name]
    unified_games_with_exhibitors_df_rows.append(
        {
            **row_dict,
            "exhibitor_name": matched_exhibitor_name,
            "exhibitor_description": matched_exhibitor["exhibitor_description"],
            "booth_number": matched_exhibitor["booth_number"],
        }
    )

# Create a new dataframe from the rows
unified_games_with_exhibitors_df = pd.DataFrame(unified_games_with_exhibitors_df_rows)

# Saving Unified Data
Finally, below, I'm going to save the unified data:

In [14]:
final_unified_data_df = unified_games_with_exhibitors_df.copy()

# Cast booth_number to integers while keeping missing booths missing.
# A plain `apply(int ... else None)` is converted back to float64 by pandas
# whenever a value is missing, so booths would be saved as e.g. 13097.0.
# The nullable "Int64" dtype stores real integers next to <NA>.
final_unified_data_df["booth_number"] = pd.array(
    [
        int(booth) if pd.notna(booth) else pd.NA
        for booth in final_unified_data_df["booth_number"]
    ],
    dtype="Int64",
)

# Make the description_texts a list of dicts, empty if nothing
final_unified_data_df["description_texts"] = final_unified_data_df[
    "description_texts"
].apply(lambda x: (x if isinstance(x, list) else []))

# Make genres either a list of strings or an empty list
final_unified_data_df["genres"] = final_unified_data_df["genres"].apply(
    lambda x: x if isinstance(x, list) else ([x] if pd.notna(x) else [])
)

# Drop the release_year and fromweb_description columns; errors="ignore"
# keeps this from failing if a column is already absent
final_unified_data_df = final_unified_data_df.drop(
    columns=["release_year", "fromweb_description"],
    errors="ignore",
)

# Save the final unified data to a JSON file (one object per game)
final_unified_data_df.to_json(
    "data/unified_games_data.json", orient="records", indent=4
)

# Understanding the Data
How many games don't have booth numbers?

In [15]:
# Count the games whose booth number is still missing
int(final_unified_data_df["booth_number"].isna().sum())

19