# **Adding Missing Games**

# Setup
The cells below will help to set up the rest of the notebook. 

I'll start by configuring the kernel that's running this notebook:

In [1]:
# Change the cwd to the parent directory.
# NOTE(review): `%cd ..` is relative, so re-running this cell without restarting
# the kernel moves up one more directory each time — presumably the later
# "data/..." reads and `utils.*` imports depend on this cwd; confirm.
%cd ..

# Enable the autoreload module (mode 2) so edits to imported modules are picked up
%load_ext autoreload
%autoreload 2

# Load the environment variables; `override=True` lets loaded values replace
# variables that are already set in the process environment
from dotenv import load_dotenv
load_dotenv(override=True)

/Users/thubbard/Documents/personal/programming/pax-pal-2025/experiments


True

Next, I'm going to import the necessary modules:

In [2]:
# General imports
import os
from urllib.parse import quote_plus

# Third-party imports
import pandas as pd
from openai import OpenAI

# Project-specific imports
from utils.openai import generate_completions_in_parallel
from utils.miscellaneous import get_consistent_hash

# Set up the OpenAI client
openai_client = OpenAI()

# Loading Data

In [3]:
# Games that are already known; the "name" column is used further down to skip
# exhibitor-mentioned games that are already present.
playable_games_df = pd.read_json("data/final_enriched_games_data.json")
# Exhibitor records; later cells read the "name", "description" and "booth" columns.
exhibitor_details_df = pd.read_json("data/exhibitor_details.json")

  playable_games_df = pd.read_json("data/final_enriched_games_data.json")
  playable_games_df = pd.read_json("data/final_enriched_games_data.json")
  playable_games_df = pd.read_json("data/final_enriched_games_data.json")


# Scraping Exhibitor Details for Mentions of Games

Set up the developer prompt & output format:

In [4]:
# Developer (system-level) prompt for the extraction step: the user message is
# one exhibitor description and the model is asked to pull out explicitly
# mentioned games. The text is sent verbatim, so edits here change model behavior.
exhibitor_details_scraping_developer_prompt = """# Role
You're a digital assistant that loves responding in JSON. You specialize in information extraction tasks. 

# Task
The user will provide you with the description of an exhibitor at a video game convention.

You're aiming to extract any explicit mentions of specific games within this description. 

Games must have a title, and (hopefully) will have some descriptive text surrounding them. 
"""

from typing import List, Optional
from pydantic import BaseModel, Field


# Structured-output schema for a single game found in an exhibitor description.
class IdentifiedGame(BaseModel):
    # The game's title (required).
    title: str
    # Optional descriptive text for the game; defaults to None when absent.
    description: Optional[str] = None
    # Required flag (no default); the field description below is part of the schema.
    mentions_playable_at_convention: bool = Field(
        ...,
        description="Whether the description mentions that the game is playable at the convention",
    )


# Top-level structured-output schema for one exhibitor: the list of games
# identified in its description (the parsing cell treats an empty list as "no games").
class GameIdentificationResults(BaseModel):
    games: List[IdentifiedGame]

Next, I'll prepare the data by building one markdown prompt per exhibitor:

In [5]:
# Map each exhibitor name to a small markdown document: the name as a heading,
# followed by the exhibitor's description. If a name occurs more than once,
# the last occurrence wins.
exhibitor_name_to_markdown_prompt = {
    exhibitor.name: f"# {exhibitor.name}\n\n{exhibitor.description}"
    for exhibitor in exhibitor_details_df.itertuples()
}

Then, I'll run the prompts:

In [6]:
# Send every exhibitor prompt (in dict insertion order) together with the shared
# developer prompt, asking for output parsed into `GameIdentificationResults`.
# The helper returns the completions plus the cost because
# `return_completion_costs=True` is passed.
exhibitor_details_scraping_completions, exhibitor_details_scraping_cost = (
    generate_completions_in_parallel(
        message_format_pairs=[
            (
                [
                    dict(
                        role="developer",
                        content=exhibitor_details_scraping_developer_prompt,
                    ),
                    dict(role="user", content=exhibitor_markdown_prompt),
                ],
                GameIdentificationResults,
            )
            for exhibitor_markdown_prompt in exhibitor_name_to_markdown_prompt.values()
        ],
        gpt_model="gpt-4.1-mini",
        show_progress=True,
        max_parallel_requests=32,
        return_completion_costs=True,
    )
)

Generating Completions: 100%|██████████| 324/324 [00:38<00:00,  8.36it/s]


Finally, I'll parse the completions below:

In [7]:
# Exhibitor names in the same (insertion) order that the prompts were submitted in.
# NOTE(review): this assumes `generate_completions_in_parallel` returns the
# completions in submission order — confirm against the helper.
exhibitor_names_list = list(exhibitor_name_to_markdown_prompt)

# Flatten the completions into one record per (exhibitor, identified game).
newly_scraped_exhibitor_games_df_records = []
for cur_exhibitor_name, completion in zip(
    exhibitor_names_list, exhibitor_details_scraping_completions
):
    # A completion may carry no parsed payload (e.g. the model refused or the
    # request failed); skip it instead of crashing on `None.games`.
    parsed_results = (
        completion.choices[0].message.parsed if completion is not None else None
    )
    if parsed_results is None:
        print(f"No parsed output for exhibitor {cur_exhibitor_name!r}; skipping")
        continue

    for identified_game in parsed_results.games:
        newly_scraped_exhibitor_games_df_records.append(
            {
                "exhibitor_name": cur_exhibitor_name,
                "game_name": identified_game.title,
                "game_description": identified_game.description,
                "mentions_playable_at_convention": identified_game.mentions_playable_at_convention,
            }
        )

# Pin the columns explicitly so the frame keeps its schema even when no games
# were found (an empty record list would otherwise produce a column-less frame
# and break the `["game_name"]` lookup in the next cell).
newly_scraped_exhibitor_games_df = pd.DataFrame(
    newly_scraped_exhibitor_games_df_records,
    columns=[
        "exhibitor_name",
        "game_name",
        "game_description",
        "mentions_playable_at_convention",
    ],
)

Next up: I'm going to remove any games that have already been identified in previous scraping attempts:

In [8]:
import re


def alphanum_only(s):
    """Return ``s`` with everything except ASCII letters and digits removed.

    Whitespace, punctuation, underscores and all non-ASCII characters
    (including accented letters) are dropped; case is left untouched.
    """
    return "".join(ch for ch in s if ch.isascii() and ch.isalnum())


# Normalized (lower-cased, ASCII-alphanumeric-only) names of the games that are
# already known. Despite the name, this is the array returned by `unique()`.
previously_identified_game_names_set = pd.unique(
    playable_games_df["name"].apply(
        lambda known_name: alphanum_only(str(known_name).lower())
    )
)

# Give every scraped game the same kind of normalized key...
newly_identified_exhibitor_games_df = newly_scraped_exhibitor_games_df.assign(
    merge_key=lambda scraped_df: scraped_df["game_name"].apply(
        lambda scraped_name: alphanum_only(str(scraped_name).lower())
    )
)

# ...and keep only the rows whose key is not among the already-known games.
newly_identified_exhibitor_games_df = newly_identified_exhibitor_games_df.loc[
    ~newly_identified_exhibitor_games_df["merge_key"].isin(
        previously_identified_game_names_set
    )
].copy()

# Scraping Internet for Data

Declare the prompt:

In [9]:
# Developer (system-level) prompt for the web-search step; it is paired with the
# `GameSearchResults` schema when `process_game` calls the API. The text is sent
# verbatim, so edits here change model behavior.
developer_prompt = """# Role
You're a digital assistant helping to identify information about games. 

# Task
Search the Internet for information about games provided by users. Synthesize the game's name, description, genres, release date, platforms, and Steam page link.

Return information about the search process, including visited URLs and a summary of your findings.

# Guidelines
- You MUST use the web search tool to find information about the game. 
- ONLY identify information about the specific game provided. 
- You ought to aim to find at least three sources for each game. You should prefer authoritative sources (e.g., Wikipedia, Steam, the game's official website) to gather information, but you can also use reviews / blogs / features to understand more. 
- Return None for the `game_info` field if you can't identify the specific game (or if you're unsure about the identification).
- Some of these games may be in-development, so information could be limited; try your best!
- For header image URLs - if you've found the Steam link, then they can typically be found at https://cdn.akamai.steamstatic.com/steam/apps/[STEAM_GAME_ID]/header.jpg
"""


# Structured-output schema describing one game found via web search. The
# `description=` strings are part of the schema handed to the model.
class GameInfo(BaseModel):
    # Required.
    game_name: str = Field(..., description="The name of the game.")
    # Required.
    released: bool = Field(
        ..., description="Whether the game has been released (True) or not (False)."
    )
    # Required but nullable: the field must be present, and may be None.
    release_year: Optional[int] = Field(
        ...,
        description="The year the game was / will be released, or `None` if this info isn't available",
    )
    # Required.
    description: str = Field(
        ...,
        description="A paragraph-long description summarizing gameplay, story, aesthetics, and unique features, written in Wikipedia style.",
    )
    # Required but nullable.
    genres: Optional[List[str]] = Field(..., description="A list of genres")
    # The remaining fields are optional and default to None.
    snappy_summary: Optional[str] = Field(
        None,
        description="A short, tagline-like summary (max 10 words) highlighting genre and unique appeal.",
    )
    # The filtering cell later drops rows whose platforms contain "Tabletop"
    # or are not a list at all.
    platforms: Optional[List[str]] = Field(
        None,
        description="Platforms available: PlayStation, Xbox, Nintendo Switch, PC, Mobile, or Tabletop",
    )
    steam_link: Optional[str] = Field(
        None, description="Direct URL to the game's Steam page."
    )
    header_image_url: Optional[str] = Field(
        None, description="URL to a header image for the game."
    )


# Top-level structured-output schema for one web-search request: bookkeeping
# about the search itself plus the (optional) game details.
class GameSearchResults(BaseModel):
    # Required.
    web_search_summary: str = Field(
        ..., description="Summary of whether platform and Steam info were found."
    )
    # Required. Typed as a list of string lists; the record-building cell reads
    # element 0 as the page title and element 1 as the URL.
    web_search_results: List[List[str]] = Field(
        ...,
        description="A list of all of the websites visited, where each tuple contains a webpage title and URL.",
    )
    # Required.
    correctly_identified_game: bool = Field(
        ..., description="Whether the game was correctly identified."
    )
    # Optional; defaults to None.
    game_info: Optional[GameInfo] = Field(
        None,
        description="Found game info, if `correctly_identified_game` is True. Otherwise, None.",
    )

Preparing the prompts:

In [10]:
# Build one user prompt per merge key. When several rows share a merge key
# (e.g. the same game mentioned by multiple exhibitors), the last row wins.
mergekey_to_markdown_prompt = {}
for row in newly_identified_exhibitor_games_df.itertuples():
    # The extraction schema allows a missing description; spell that out
    # instead of sending the literal text "None" to the model.
    game_description = (
        row.game_description
        if pd.notna(row.game_description)
        else "(no description provided)"
    )
    # Every field uses the same `**Label:** value` markdown shape (the game
    # name previously had misplaced asterisks that broke the bold markup).
    mergekey_to_markdown_prompt[row.merge_key] = (
        f"**Exhibitor:** {row.exhibitor_name}\n\n"
        f"**Game Name:** {row.game_name}\n\n"
        f"**Description:** {game_description}"
    )

Generating completions:

In [11]:
from concurrent.futures import ThreadPoolExecutor
import tqdm
import time

game_search_results_list = []


def process_game(merge_key, markdown_prompt):
    """Run one web-search-backed lookup for a game and flatten the result.

    Args:
        merge_key: Normalized game key; copied into the returned dict.
        markdown_prompt: User message describing the game to look up.

    Returns:
        A dict with ``merge_key``, ``web_search_summary``,
        ``web_search_results`` and ``correctly_identified_game``. When the
        parsed response carries ``game_info``, its dumped fields are merged
        in as well. On any exception the error is printed and a fallback dict
        is returned with the two search fields set to ``None`` and
        ``correctly_identified_game`` set to ``False``.
    """
    try:
        response = openai_client.responses.parse(
            model="gpt-4.1-mini",
            tools=[{"type": "web_search_preview", "search_context_size": "low"}],
            input=[
                {"role": "developer", "content": developer_prompt},
                {"role": "user", "content": markdown_prompt},
            ],
            text_format=GameSearchResults,
            tool_choice={"type": "web_search_preview"},
        )

        # Pause for a second after each request (to avoid rate-limiting)
        time.sleep(1)

        parsed = response.output_parsed
        search_fields = {
            "merge_key": merge_key,
            "web_search_summary": parsed.web_search_summary,
            "web_search_results": parsed.web_search_results,
            "correctly_identified_game": parsed.correctly_identified_game,
        }
        # Game details are only present when the model filled in `game_info`
        game_fields = parsed.game_info.model_dump() if parsed.game_info else {}
        return {**search_fields, **game_fields}

    except Exception as e:
        print(f"Error processing {merge_key}: {e}")

        return {
            "merge_key": merge_key,
            "web_search_summary": None,
            "web_search_results": None,
            "correctly_identified_game": False,
        }


# Run processing in parallel with 8 workers
with ThreadPoolExecutor(max_workers=8) as executor:
    # Create a list of futures
    futures = [
        executor.submit(process_game, merge_key, markdown_prompt)
        for merge_key, markdown_prompt in list(mergekey_to_markdown_prompt.items())
    ]

    # Manually create progress bar 🔄Ï
    progress_bar = tqdm.tqdm(total=len(futures), desc="Processing games")

    # Process results as they complete
    for future in futures:
        result = future.result()
        game_search_results_list.append(result)
        # Update progress bar after each result
        progress_bar.update(1)

    # Close the progress bar when done
    progress_bar.close()

Processing games:  83%|████████▎ | 267/323 [03:44<02:41,  2.88s/it]

Error processing persona3portable: 1 validation error for GameSearchResults
  Invalid JSON: EOF while parsing a string at line 1 column 2723 [type=json_invalid, input_value='{"web_search_summary":"P...d1d1d1d1d1d1d1d1d1d1d1 ', input_type=str]
    For further information visit https://errors.pydantic.dev/2.11/v/json_invalid


Processing games: 100%|██████████| 323/323 [04:27<00:00,  1.21it/s]


Next, I need to parse these results:

In [12]:
# One row per searched merge key. Rows from failed lookups only carry the four
# bookkeeping keys, so their game-info columns come out as missing values.
game_search_results_df = pd.DataFrame.from_records(game_search_results_list)

# Filtering Newly Found Data

In [15]:
# Attach the exhibitor that mentioned each game, and that exhibitor's booth.
merged_game_search_results_df = game_search_results_df.merge(
    newly_identified_exhibitor_games_df[["merge_key", "exhibitor_name"]],
    on="merge_key",
).merge(
    exhibitor_details_df[["name", "booth"]].rename(columns={"name": "exhibitor_name"}),
    on=["exhibitor_name"],
    how="inner",
)

# Remove tabletop (rows without a platform list are dropped as well)
is_non_tabletop = merged_game_search_results_df["platforms"].apply(
    lambda platforms_list: isinstance(platforms_list, list)
    and "Tabletop" not in platforms_list
)

# Remove any where the game wasn't correctly identified
is_correctly_identified = merged_game_search_results_df[
    "correctly_identified_game"
].astype(bool)

# Remove any released in 2023 or earlier, but keep games whose release year is
# unknown. pandas stores a missing year as NaN rather than None, so an
# `is None` test never matches and `NaN >= 2024` is False — which used to drop
# every game with an unknown year. Test for missing values explicitly instead.
release_years = pd.to_numeric(
    merged_game_search_results_df["release_year"], errors="coerce"
)
is_recent_or_unknown_year = release_years.isna() | (release_years >= 2024)

# Remove any already identified. The normalized known names are computed once
# (as a set) instead of being rebuilt for every row.
known_game_keys = set(
    playable_games_df["name"].apply(lambda name: alphanum_only(str(name).lower()))
)
is_already_known = (
    merged_game_search_results_df["game_name"]
    .apply(lambda name: alphanum_only(str(name).lower()) in known_game_keys)
    .astype(bool)
)

filtered_game_search_results_df = merged_game_search_results_df[
    is_non_tabletop
    & is_correctly_identified
    & is_recent_or_unknown_year
    & ~is_already_known
].copy()

Next, I'm going to filter out any games whose Steam details weren't correct:

In [33]:
import requests
import time
from tqdm import tqdm
from utils.miscellaneous import extract_steam_game_info

# One flag per row: True when there is no Steam link to verify, or when the
# linked Steam page yields a game name matching the row's merge key.
steam_extraction_success = []
for row in tqdm(list(filtered_game_search_results_df.itertuples())):
    # Nothing to verify when the game wasn't found on Steam to begin with
    if pd.isna(row.steam_link):
        steam_extraction_success.append(True)
        continue

    # Assume failure until the page has been fetched and the name confirmed
    link_checks_out = False
    try:
        response = requests.get(row.steam_link, timeout=10)
        if response.status_code == 200:
            steam_name = extract_steam_game_info(response.text).get("name", None)
            # A missing name means the game info couldn't be extracted; a
            # differing (normalized) name means the link points elsewhere
            link_checks_out = (
                steam_name is not None
                and alphanum_only(steam_name.lower()) == row.merge_key
            )
            # Sleep to avoid rate limiting
            time.sleep(1)
    except requests.exceptions.RequestException:
        # Request failed
        link_checks_out = False

    steam_extraction_success.append(link_checks_out)

# Record the flags, then keep only the rows that passed
filtered_game_search_results_df = filtered_game_search_results_df.assign(
    steam_extraction_success=steam_extraction_success
)
filtered_game_search_results_df = filtered_game_search_results_df.loc[
    filtered_game_search_results_df["steam_extraction_success"]
].copy()

  0%|          | 0/45 [00:00<?, ?it/s]

100%|██████████| 45/45 [01:05<00:00,  1.45s/it]


# Adding Newly Found Data

In [36]:
# Convert every surviving search result into a record with the keys listed
# below, ready to be appended to the existing games.
new_playable_games_df_records = []
for row in filtered_game_search_results_df.itertuples():
    # URL-encode the query so spaces, "&", "#", etc. in the game or exhibitor
    # name can't truncate or corrupt the search link.
    google_search_query = quote_plus(f"{row.game_name} {row.exhibitor_name}")

    # Each visited page is expected to be a [title, url] pair; skip anything
    # that isn't, rather than failing on a short or missing entry.
    visited_pages = (
        row.web_search_results if isinstance(row.web_search_results, list) else []
    )
    visited_page_links = [
        {"title": page[0], "url": page[1]}
        for page in visited_pages
        if isinstance(page, (list, tuple)) and len(page) >= 2
    ]

    new_playable_games_df_records.append(
        {
            "id": get_consistent_hash(row.game_name),
            "genres_and_tags": row.genres,
            "platforms": row.platforms,
            "name": row.game_name,
            "snappy_summary": row.snappy_summary,
            "description_texts": [
                {"source": "ai_search_summary", "text": row.description}
            ],
            # The exhibitor is used for both fields; the search results carry
            # no separate developer information.
            "developer": row.exhibitor_name,
            "exhibitor": row.exhibitor_name,
            "booth_number": row.booth,
            "header_image_url": row.header_image_url,
            "steam_link": row.steam_link,
            "media": [],
            "released": row.released,
            "release_time": row.release_year,
            "links": [
                {
                    "title": "Google Search for Game",
                    "url": f"https://www.google.com/search?q={google_search_query}",
                }
            ]
            + visited_page_links,
        }
    )

new_playable_games_df = pd.DataFrame(new_playable_games_df_records)

# Saving Data
Finally, I'm going to save the data:

In [39]:
# Stack the existing games on top of the newly found ones; when two rows share
# an exact "name", the earlier (i.e. pre-existing) row is the one that is kept.
combined_playable_games_df = pd.concat([playable_games_df, new_playable_games_df])
final_playable_games_df = combined_playable_games_df.drop_duplicates(
    subset=["name"],
    keep="first",
)

# Write the result to a separate file as an indented JSON array of records.
final_playable_games_df.to_json(
    "data/extra_final_enriched_games_data.json",
    orient="records",
    indent=2,
)