# Notebook Objective and Setup

BGG01 involves the acquisition of game data from BoardGameGeek. Largely this is accomplished by XML API call, with some dynamic content scraped. Files are dumped to a "dirty" directory.

## Package Imports

In [1]:
import pandas as pd

pd.set_option("display.max_columns", None)
import numpy as np
from bs4 import BeautifulSoup
import requests
import regex as re
import time
import json
import os
import gc
import scrapy
from io import StringIO, BytesIO
from lxml import etree
from datetime import datetime

# ignore warnings (gets rid of Pandas copy warnings)
import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 30)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os

## Functions

In [None]:
def create_thing_of_type(game_page, game_id, find_type_str):
    """Create DataFrame for things for a specific game id

    Inputs:
    game_page: page loaded and read with BeautifulSoup
    game_id: id for this game

    Outputs:
    dataframe"""

    # find all of the things on page
    all_this_type = game_page.find_all("link", type=find_type_str)

    # make dictionary for this item
    this_dict = {"BGGId": [int(game_id)]}

    # add this item's things to dictionary
    for item in all_this_type:
        this_dict[item["value"]] = [1]

    # create the dataframe
    df = pd.DataFrame(this_dict)

    # return dataframe
    return df

In [None]:
def create_mechanics(game_page, game_id):
    """Create DataFrame for Mechanics for a specific game id

    Inputs:
    game_page: page loaded and read with BeautifulSoup
    game_id: id for this game

    Outputs:
    dataframe"""

    # find all mechanics on page
    all_mechanics = game_page.find_all("link", type="boardgamemechanic")

    # make dictionary for this item
    mechanic = {"BGGId": [int(game_id)]}

    # add this item's mechanics to dictionary
    for item in all_mechanics:
        mechanic[item["value"]] = [1]

    # Try Tableau
    try:
        game_page.find(
            "link", type="boardgamefamily", value=("Mechanism: Tableau Building")
        )["value"]
        mechanic["TableauBuilding"] = [1]
    except:
        pass

    # Try is Legacy
    try:
        game_page.find("link", type="boardgamefamily", value=("Mechanism: Legacy"))[
            "value"
        ]
        mechanic["Legacy"] = [1]
    except:
        pass

    # append to dataframe
    mechanics = pd.DataFrame(mechanic)
    # return dataframe
    return mechanics

In [None]:
def create_awards(awards_level, game_id):
    """Create DataFrame for Awards for a specific game id

    Inputs:
    game_page: page loaded and read with BeautifulSoup
    game_id: id for this game

    Outputs:
    dataframe"""

    # find all awards on page
    all_awards = awards_level.find_all("a", class_="ng-binding")

    # make dictionary for this item
    award = {"BGGId": [int(game_id)]}

    # add this item's awards to dictionary
    for item in all_awards:
        item = re.sub("[0-9]", "", item.text).strip(" ")
        award[item] = [1]

    # append to dataframe
    awards = pd.DataFrame(award)

    # return dataframe
    return awards

# PULL - Game Data

Last game id: 349161

## Pull Games with Scrapy

In [None]:
df = pd.read_csv("boardgames_ranks.csv", low_memory=False)
game_ids = df["id"].astype(int).to_list()
len(game_ids)

In [None]:
game_block = 500


def generate_raw_urls(game_ids):

    start_position = 0
    end_position = game_block
    file_suffix = 0
    urls_list = []

    while start_position < (len(game_ids) + 1):

        ##### File Setup Section #####

        # increment file suffix
        file_suffix += 1
        # get file suffix as string
        suffix_str = str(file_suffix)

        # print start and end positions
        print(f"Getting items {str(start_position+1)} through {str(end_position)}")

        # get list of game ids to grab
        # grab_list = game_ids[0][start_position:end_position]
        grab_list = game_ids[start_position:end_position]

        # piece together target string of game ids for BGG
        targets = ""
        for item in grab_list:
            targets += f"{str(item)},"

        # establish path with targets and current page
        path = f"https://www.boardgamegeek.com/xmlapi2/thing?id={targets}&stats=1&type=boardgame"
        urls_list.append(path)

        start_position += game_block
        end_position += game_block

    return urls_list

In [None]:
scraper_urls_raw = generate_raw_urls(game_ids)

with open("data_store/data_dirty/scraper_urls_raw.json", "w") as convert_file:
    convert_file.write(json.dumps(scraper_urls_raw))

In [None]:
len(scraper_urls_raw)

In [None]:
scraper_urls_raw[-1:]

In [None]:
# !scrapy crawl bgg_raw

## Process files with BS

In [None]:
files = []

for item in os.listdir("data_store/data_dirty/scraped_games/"):
    files.append(item)

In [None]:
file_suffix = 0

start = time.time()

for file in files:

    games_dfs = []
    designers_dfs = []
    categories_dfs = []
    mechanics_dfs = []
    artists_dfs = []
    publishers_dfs = []
    subcategories_dfs = []
    comments_dfs = []

    ##### File Setup Section #####

    # increment file suffix
    file_suffix += 1

    path = f"data_store/data_dirty/scraped_games/{file}"
    print(path)

    game_page = BeautifulSoup(
        open(path, encoding="utf8"), "lxml"
    )  # parse page with beautifulsoup

    # make entry for each game item on page
    game_entries = game_page.find_all("item")
    print(f"Number of game entries in this file: {len(game_entries)}")

    print("Items loaded. Processing.")
    ##### Process Each Game #####

    for entry in game_entries:

        # check that this game has sufficient user ratings to incluide
        try:
            user_ratings = int(
                entry.find("usersrated")["value"]
            )  # get the number of user ratings

            if user_ratings < 10:  # check if user ratings are under 10
                continue
        except:
            continue

        # get game name and BGG ID
        game_name = entry.find("name", type="primary")["value"]
        game_id = entry["id"]
        print(f"Name: {game_name} BGG ID: {str(game_id)}")

        ##### Get Basic Stats #####

        # print("Getting basic stats")
        description = entry.find("description").text  # description text of the game

        try:
            year_pub = int(entry.find("yearpublished")["value"])  # year published
            if year_pub > datetime.now().year:
                continue
        except:
            pass

        try:
            minplayers = int(entry.find("minplayers")["value"])  # minimum players
        except:
            minplayers = None

        try:
            maxplayers = int(entry.find("maxplayers")["value"])  # maximum players
        except:
            maxplayers = None

        avg_rating = float(entry.find("average")["value"])  # average rating
        bayes_avg = float(entry.find("bayesaverage")["value"])  # bayes average rating
        std_dev = float(entry.find("stddev")["value"])  # standard deviation of rating
        num_owned = int(entry.find("owned")["value"])  # num of people own this game
        num_want = int(entry.find("wanting")["value"])  # num of people want this game
        num_wish = int(
            entry.find("wishing")["value"]
        )  # num of people with game on wishlist
        num_weight_votes = int(
            entry.find("numweights")["value"]
        )  # num of votes for game weight
        game_weight = float(entry.find("averageweight")["value"])  # voted game weight

        try:
            image_path = entry.find("image").text  # path to image
        except:
            image_path = None

        try:
            mfg_play_time = int(
                entry.find("playingtime")["value"]
            )  # mfg stated playtime
        except:
            mfg_play_time = None
        try:
            comm_min_play = int(
                entry.find("minplaytime")["value"]
            )  # community min playtime
        except:
            comm_min_play = None

        try:
            comm_max_play = int(
                entry.find("maxplaytime")["value"]
            )  # community max playtime
        except:
            comm_max_play = None

        try:
            mfg_age = int(entry.find("minage")["value"])  # mfg min age
        except:
            mfg_age = None

        # num_comments = int(entry.find('comments')['totalitems']) # num of ratings comments
        num_alts = len(
            entry.find_all("name", type="alternate")
        )  # number alternate versions
        num_expansions = len(
            entry.find_all("link", type="boardgameexpansion")
        )  # number of expansions
        num_implementations = len(
            entry.find_all("link", type="boardgameimplementation")
        )  # number of implementations

        ##### Get reimplementation flag #####
        reimplementation = entry.find(
            "link", type="boardgameimplementation", inbound="true"
        )  # check if game is a reimplementation
        if reimplementation:
            reimplements = 1  # if it's a reimplementation, flag it 1
        else:
            reimplements = 0

        ##### Basic stats requiring some compaction/refinement #####

        def evaluate_poll(poll_title):
            poll_result = None
            try:
                poll = entry.find("poll", title=poll_title).find_all("result")

                total = 0
                items = 0

                for item in poll:
                    vote = int(item["numvotes"]) * int(item["value"][:2])
                    total += vote
                    items += int(item["numvotes"])

                if items > 0:
                    poll_result = (
                        total / items
                    )  # make sure not dividing by 0, get community recommended age
                else:
                    poll_result = None  # if no votes, record none
            except:
                poll_result = None
            return poll_result

        comm_age = evaluate_poll("User Suggested Player Age")  # community age min poll
        lang_ease = evaluate_poll("Language Dependence")  # Language Ease poll

        try:
            # Best and Good Players
            players = entry.find(
                "poll", title="User Suggested Number of Players"
            ).find_all(
                "results"
            )  # get user players poll
            player_num_votes = int(
                entry.find("poll", title="User Suggested Number of Players")[
                    "totalvotes"
                ]
            )  # get total votes

            best_players, best_score, good_players = (
                0,
                0,
                [],
            )  # set up for best players loop

            if player_num_votes > 30:  # evaluate if more than 30 votes for num players
                for player in players:
                    best = int(player.find("result", value="Best")["numvotes"])
                    rec = int(player.find("result", value="Recommended")["numvotes"])
                    score = best * 2 + rec * 1
                    positives = best + rec
                    ratio = positives / player_num_votes
                    if score > best_score:
                        best_players, best_score = (
                            player["numplayers"],
                            score,
                        )  # put in # players for best score
                    if ratio > 0.5:
                        good_players.append(
                            player["numplayers"]
                        )  # put in good players if over 50% ratio
            else:
                best_players = None
        except:
            best_players = None

        # make dataframe for this game
        this_game = pd.DataFrame()
        this_game["BGGId"] = (int(game_id),)
        this_game["Name"] = (game_name,)
        this_game["Description"] = (description,)
        this_game["YearPublished"] = (int(year_pub),)
        this_game["GameWeight"] = (float(game_weight),)
        this_game["AvgRating"] = (float(avg_rating),)
        this_game["BayesAvgRating"] = (float(bayes_avg),)
        this_game["StdDev"] = (float(std_dev),)
        this_game["MinPlayers"] = (minplayers,)
        this_game["MaxPlayers"] = (maxplayers,)
        this_game["ComAgeRec"] = (comm_age,)
        this_game["LanguageEase"] = (lang_ease,)
        this_game["BestPlayers"] = (best_players,)
        this_game["GoodPlayers"] = (good_players,)
        this_game["NumOwned"] = (int(num_owned),)
        this_game["NumWant"] = (int(num_want),)
        this_game["NumWish"] = (int(num_wish),)
        this_game["NumWeightVotes"] = (int(num_weight_votes),)
        this_game["MfgPlaytime"] = (mfg_play_time,)
        this_game["ComMinPlaytime"] = (comm_min_play,)
        this_game["ComMaxPlaytime"] = (comm_max_play,)
        this_game["MfgAgeRec"] = (mfg_age,)
        this_game["NumUserRatings"] = (int(user_ratings),)
        # this_game['NumComments']=int(num_comments),
        this_game["NumAlternates"] = (int(num_alts),)
        this_game["NumExpansions"] = (int(num_expansions),)
        this_game["NumImplementations"] = (int(num_implementations),)
        this_game["IsReimplementation"] = (int(reimplements),)
        this_game["ImagePath"] = image_path

        # add unique information to end of df

        # Add game ranks
        ranks = entry.find_all("rank")
        try:
            for item in ranks:
                this_game["Rank:" + item["name"]] = float(item["value"])
        except:
            pass

        # Try to add components
        try:
            families = entry.find_all(
                "link", type="boardgamefamily", value=re.compile("Component")
            )
            for item in families:
                this_game["Components:" + item["name"]] = item["value"]
        except:
            pass

        # Try to add game series/family
        try:
            family = (
                entry.find("link", type="boardgamefamily", value=re.compile("Game:"))[
                    "value"
                ]
                .strip("Game:")
                .strip(" ")
            )
            this_game["Family"] = family
        except:
            pass

        try:
            family = (
                entry.find("link", type="boardgamefamily", value=re.compile("Series:"))[
                    "value"
                ]
                .strip("Series:")
                .strip(" ")
            )
            this_game["Family"] = family
        except:
            pass

        try:
            setting = (
                entry.find(
                    "link", type="boardgamefamily", value=re.compile("Setting:")
                )["value"]
                .strip("Setting:")
                .strip(" ")
            )
            this_game["Setting"] = setting
        except:
            pass

        # Try to add theme
        try:
            theme = (
                entry.find("link", type="boardgamefamily", value=re.compile("Theme:"))[
                    "value"
                ]
                .strip("Theme:")
                .strip(" ")
            )
            this_game["Theme"] = theme
        except:
            pass

        try:
            mechanism = (
                entry.find(
                    "link", type="boardgamefamily", value=re.compile("Mechanism:")
                )["value"]
                .strip("Mechanism:")
                .strip(" ")
            )
            this_game["Mechanism"] = mechanism
        except:
            pass

        # Try to add game category
        try:
            category = (
                entry.find(
                    "link", type="boardgamefamily", value=re.compile("Category:")
                )["value"]
                .strip("Category:")
                .strip(" ")
            )
            this_game["Category"] = category
        except:
            pass

        # Try is Kickstarted
        try:
            entry.find(
                "link", type="boardgamefamily", value=re.compile("Crowdfunding")
            )["value"]
            this_game["Kickstarted"] = int(1)
        except:
            pass

        ##### Get subcategories #####

        all_subcategories = entry.find_all("link", type="boardgamecategory")

        # Create an empty DataFrame with columns
        categories_hold = pd.DataFrame(
            columns=["BGGId"] + [item["value"] for item in all_subcategories]
        )

        # Create a dictionary for the new row
        subcategory = {"BGGId": [int(game_id)]}
        for item in all_subcategories:
            subcategory[item["value"]] = [1]

        # Append the dictionary as a new row to the DataFrame
        categories_hold = pd.DataFrame(subcategory)

        # create specialty dataframes
        designer = create_thing_of_type(
            entry, game_id, find_type_str="boardgamedesigner"
        )
        category = create_thing_of_type(
            entry, game_id, find_type_str="boardgamecategory"
        )
        mechanic = create_mechanics(entry, game_id)
        artist = create_thing_of_type(entry, game_id, find_type_str="boardgameartist")
        publisher = create_thing_of_type(
            entry, game_id, find_type_str="boardgamepublisher"
        )

        games_dfs.append(this_game)
        designers_dfs.append(designer)
        categories_dfs.append(category)
        mechanics_dfs.append(mechanic)
        artists_dfs.append(artist)
        publishers_dfs.append(publisher)
        subcategories_dfs.append(categories_hold)

    if games_dfs == []:
        continue
    games = pd.concat(games_dfs)
    designers = pd.concat(designers_dfs)
    categories = pd.concat(categories_dfs)
    mechanics = pd.concat(mechanics_dfs)
    artists = pd.concat(artists_dfs)
    publishers = pd.concat(publishers_dfs)
    subcategories = pd.concat(subcategories_dfs)

    games.to_pickle(
        f"data_store/data_dirty/scraped_games_processed/games{str(file_suffix)}.pkl"
    )
    designers.to_pickle(
        f"data_store/data_dirty/scraped_games_processed/designers{str(file_suffix)}.pkl"
    )
    categories.to_pickle(
        f"data_store/data_dirty/scraped_games_processed/categories{str(file_suffix)}.pkl"
    )
    mechanics.to_pickle(
        f"data_store/data_dirty/scraped_games_processed/mechanics{str(file_suffix)}.pkl"
    )
    artists.to_pickle(
        f"data_store/data_dirty/scraped_games_processed/artists{str(file_suffix)}.pkl"
    )
    publishers.to_pickle(
        f"data_store/data_dirty/scraped_games_processed/publishers{str(file_suffix)}.pkl"
    )
    subcategories.to_pickle(
        f"data_store/data_dirty/scraped_games_processed/subcategories{str(file_suffix)}.pkl"
    )

    print("Finished items in this group")

print(f"Time: {time.time() - start}\n\n")

### Data Validation

In [None]:
subcategories1 = pd.read_pickle(
    "data_store/data_dirty/scraped_games_processed/subcategories31.pkl"
)
games1 = pd.read_pickle("data_store/data_dirty/scraped_games_processed/games31.pkl")
designers1 = pd.read_pickle(
    "data_store/data_dirty/scraped_games_processed/designers31.pkl"
)
categories1 = pd.read_pickle(
    "data_store/data_dirty/scraped_games_processed/categories31.pkl"
)
mechanics1 = pd.read_pickle(
    "data_store/data_dirty/scraped_games_processed/mechanics31.pkl"
)
artists1 = pd.read_pickle("data_store/data_dirty/scraped_games_processed/artists31.pkl")
publishers1 = pd.read_pickle(
    "data_store/data_dirty/scraped_games_processed/publishers31.pkl"
)

In [None]:
subcategories1.head()

In [None]:
games1.head()

In [None]:
designers1.head()

In [None]:
categories1.head()

In [None]:
mechanics1.head()

In [None]:
artists1.head()

In [None]:
publishers1.head()

## Combine Files

In [None]:
games_dfs = []
designers_dfs = []
categories_dfs = []
mechanics_dfs = []
artists_dfs = []
publishers_dfs = []
subcategories_dfs = []


for number in range(1, 500):
    print(number)

    try:
        this_games = pd.read_pickle(
            "data_store/data_dirty/scraped_games_processed/games" + str(number) + ".pkl"
        )
        this_designers = pd.read_pickle(
            "data_store/data_dirty/scraped_games_processed/designers"
            + str(number)
            + ".pkl"
        )
        this_categories = pd.read_pickle(
            "data_store/data_dirty/scraped_games_processed/categories"
            + str(number)
            + ".pkl"
        )
        this_mechanics = pd.read_pickle(
            "data_store/data_dirty/scraped_games_processed/mechanics"
            + str(number)
            + ".pkl"
        )
        this_artists = pd.read_pickle(
            "data_store/data_dirty/scraped_games_processed/artists"
            + str(number)
            + ".pkl"
        )
        this_publishers = pd.read_pickle(
            "data_store/data_dirty/scraped_games_processed/publishers"
            + str(number)
            + ".pkl"
        )
        this_subcategories = pd.read_pickle(
            "data_store/data_dirty/scraped_games_processed/subcategories"
            + str(number)
            + ".pkl"
        )

        games_dfs.append(this_games)
        designers_dfs.append(this_designers)
        categories_dfs.append(this_categories)
        mechanics_dfs.append(this_mechanics)
        artists_dfs.append(this_artists)
        publishers_dfs.append(this_publishers)
        subcategories_dfs.append(this_subcategories)
    except:
        print(f"No entry for position {number}")
        continue

In [None]:
games = pd.concat(games_dfs)
designers = pd.concat(designers_dfs)
categories = pd.concat(categories_dfs)
mechanics = pd.concat(mechanics_dfs)
artists = pd.concat(artists_dfs)
publishers = pd.concat(publishers_dfs)
subcategories = pd.concat(subcategories_dfs)

In [None]:
games = games.reset_index(drop=True)
designers = designers.reset_index(drop=True)
categories = categories.reset_index(drop=True)
mechanics = mechanics.reset_index(drop=True)
artists = artists.reset_index(drop=True)
publishers = publishers.reset_index(drop=True)
subcategories = subcategories.reset_index(drop=True)

In [None]:
games.shape

In [None]:
games.head()

In [None]:
games.to_pickle("data_store/data_dirty/games.pkl")
designers.to_pickle("data_store/data_dirty/designers.pkl")
categories.to_pickle("data_store/data_dirty/categories.pkl")
mechanics.to_pickle("data_store/data_dirty/mechanics.pkl")
artists.to_pickle("data_store/data_dirty/artists.pkl")
publishers.to_pickle("data_store/data_dirty/publishers.pkl")
subcategories.to_pickle("data_store/data_dirty/subcategories.pkl")

### Data Validation

In [None]:
games = pd.read_pickle("data_store/data_dirty/games.pkl")
designers = pd.read_pickle("data_store/data_dirty/designers.pkl")
categories = pd.read_pickle("data_store/data_dirty/categories.pkl")
mechanics = pd.read_pickle("data_store/data_dirty/mechanics.pkl")
artists = pd.read_pickle("data_store/data_dirty/artists.pkl")
publishers = pd.read_pickle("data_store/data_dirty/publishers.pkl")
subcategories = pd.read_pickle("data_store/data_dirty/subcategories.pkl")

In [None]:
games.head()

In [None]:
designers.tail()

In [None]:
categories.tail()

In [None]:
mechanics.tail()

In [None]:
artists.tail()

In [None]:
publishers.tail()

In [None]:
subcategories.tail()

In [None]:
break

# PULL - User Ratings

## Create Scraper URLs

In [2]:
df = pd.read_csv("boardgames_ranks.csv", low_memory=False)
game_ids = df["id"].astype(int).to_list()
game_ids[:10]

games = pd.read_pickle("data_store/data_cleaned/games.pkl")

In [3]:
ratings_totals = pd.DataFrame(games["BGGId"])
ratings_totals["RatingsPages"] = np.ceil(games["NumUserRatings"] / 100).astype("int")
ratings_totals = ratings_totals.sort_values(
    "RatingsPages", ascending=False
).reset_index(drop=True)

In [4]:
ratings_totals.head()

Unnamed: 0,BGGId,RatingsPages
0,13,1260
1,822,1255
2,30549,1243
3,68448,1035
4,167791,970


In [5]:
max_ratings_pages = ratings_totals["RatingsPages"].max()
max_ratings_pages

1260

In [6]:
df_groups = {}
group_counter = 1
position = 0

while len(ratings_totals) > 0:

    indices = []
    group_positions = []
    chunk_size = 0
    group_dfs = []

    while max_ratings_pages > chunk_size:
        try:
            chunk_size += ratings_totals.iloc[position]["RatingsPages"]
            group_dfs.append(pd.DataFrame(ratings_totals.iloc[position]).T)
            ratings_totals = ratings_totals.drop(position)
            position += 1

        except:
            break

    print(chunk_size)
    if len(group_dfs) == 0:
        break
    group_positions = pd.concat(group_dfs)
    df_groups[f"group{group_counter}"] = group_positions
    print(f"group{group_counter} Complete")
    group_counter += 1

1260
group1 Complete
2213
group2 Complete
1808
group3 Complete
1674
group4 Complete
1429
group5 Complete
1319
group6 Complete
1709
group7 Complete
1534
group8 Complete
1453
group9 Complete
1391
group10 Complete
1346
group11 Complete
1272
group12 Complete
1614
group13 Complete
1479
group14 Complete
1395
group15 Complete
1327
group16 Complete
1269
group17 Complete
1512
group18 Complete
1419
group19 Complete
1357
group20 Complete
1265
group21 Complete
1461
group22 Complete
1391
group23 Complete
1321
group24 Complete
1272
group25 Complete
1426
group26 Complete
1374
group27 Complete
1313
group28 Complete
1430
group29 Complete
1359
group30 Complete
1283
group31 Complete
1389
group32 Complete
1303
group33 Complete
1378
group34 Complete
1320
group35 Complete
1372
group36 Complete
1282
group37 Complete
1347
group38 Complete
1290
group39 Complete
1326
group40 Complete
1336
group41 Complete
1266
group42 Complete
1289
group43 Complete
1312
group44 Complete
1307
group45 Complete
1306
group46 Comple

In [7]:
groups = [y for y in df_groups.values()]

In [8]:
def generate_ratings_urls(group):
    urls_list = []

    df_positions = list(range(0, group.shape[0]))
    assert group.shape[0] == len(df_positions)

    for position in df_positions:
        current_bgg_id = group.iloc[position]["BGGId"]
        max_page_number = group.iloc[position]["RatingsPages"]
        page_numbers = range(1, max_page_number + 1)
        # print(f"Df last page: {max_page_number}\n", f"First page: {page_numbers[0]}\n", f"Last page: {page_numbers[-1]}")

        for page_number in page_numbers:
            path = f"https://www.boardgamegeek.com/xmlapi2/thing?id={current_bgg_id}&ratingcomments=1&page={str(page_number)}&pagesize=100"
            urls_list.append(path)
    print("\n")
    return urls_list

In [9]:
group_urls = {}
group_num = 0

for group in groups:
    group_num += 1
    group_urls["group" + str(group_num)] = generate_ratings_urls(group)

with open("data_store/data_dirty/scraper_urls_ratings.json", "w") as convert_file:
    convert_file.write(json.dumps(group_urls))

















































































































































































































## Scrape URLs

In [10]:
for group in group_urls:

    print(group)

group1
group2
group3
group4
group5
group6
group7
group8
group9
group10
group11
group12
group13
group14
group15
group16
group17
group18
group19
group20
group21
group22
group23
group24
group25
group26
group27
group28
group29
group30
group31
group32
group33
group34
group35
group36
group37
group38
group39
group40
group41
group42
group43
group44
group45
group46
group47
group48
group49
group50
group51
group52
group53
group54
group55
group56
group57
group58
group59
group60
group61
group62
group63
group64
group65
group66
group67
group68
group69
group70
group71
group72
group73
group74
group75
group76
group77
group78
group79
group80
group81
group82
group83
group84
group85
group86
group87
group88
group89
group90
group91
group92
group93
group94
group95
group96
group97
group98
group99
group100
group101
group102
group103


In [None]:
for group in group_urls:

    print(group)

    !scrapy crawl bgg_ratings -a group=$group -a log=scrapy.log

## Process files with lxml

### One File Test

In [None]:
path = "data_store/data_dirty/pulled_ratings/ratings_group1_20240318193946.xml"

tree = etree.parse(path)
root = tree.getroot()

# set up empty list to store the ratings found on this page
bggid, names, ratings, user_comments, usernames = [], [], [], [], []

for child in root:

    # gets BGGId
    game_id = child.get("id")
    # print(game_id)

    # gets game name
    name_line = child.find("name")
    game_name = name_line.attrib.get("value")
    # print(game_name)

    # get ratings sections
    comments = child.findall(".//comment")

    for comment in comments:

        # gets username for comment/rating
        username = comment.get("username")
        # print(username)

        # gets user's rating
        rating = comment.get("rating")

        # gets user comment text
        comment_text = comment.get("value")

        bggid.append(game_id)
        names.append(game_name)
        ratings.append(rating)
        user_comments.append(comment_text)
        usernames.append(username)

    # dictionary of lists
    dict = {
        "BGGId": bggid,
        "Name": names,
        "Username": usernames,
        "Rating": ratings,
        "Comments": user_comments,
    }

    df = pd.DataFrame(dict)

In [None]:
df

### All Files

In [None]:
files = []

for item in os.listdir("data_store/data_dirty/pulled_ratings/"):
    files.append(item)

In [None]:
len(files)

In [None]:
raw_ratings_dfs = []

In [None]:
for file in files:

    path = "data_store/data_dirty/pulled_ratings/" + file
    print(path)

    tree = etree.parse(path)
    root = tree.getroot()

    # set up empty list to store the ratings found on this page
    bggid, names, ratings, user_comments, usernames = [], [], [], [], []

    for child in root:

        # gets BGGId
        game_id = child.get("id")
        # print(game_id)

        # gets game name
        name_line = child.find("name")
        game_name = name_line.attrib.get("value")
        # print(game_name)

        # get ratings sections
        comments = child.findall(".//comment")

        for comment in comments:

            # gets username for comment/rating
            username = comment.get("username")
            # print(username)

            # gets user's rating
            rating = comment.get("rating")

            # gets user comment text
            comment_text = comment.get("value")

            bggid.append(game_id)
            names.append(game_name)
            ratings.append(rating)
            user_comments.append(comment_text)
            usernames.append(username)

    # dictionary of lists
    file_dict = {
        "BGGId": bggid,
        "Name": names,
        "Username": usernames,
        "Rating": ratings,
        "Comments": user_comments,
    }

    df = pd.DataFrame(file_dict)

    raw_ratings_dfs.append(df)

raw_ratings = pd.concat(raw_ratings_dfs)

In [None]:
raw_ratings.head()

In [None]:
raw_ratings["BGGId"] = raw_ratings["BGGId"].astype(int)
raw_ratings["Rating"] = raw_ratings["Rating"].astype(float)

In [None]:
raw_ratings = raw_ratings.drop_duplicates(keep="first")

In [None]:
raw_ratings.to_pickle("data_store/data_dirty/raw_game_ratings.pkl")

# Appendix

## Get Game ids

In [None]:
df = pd.read_csv("boardgames_ranks.csv", low_memory=False)

In [None]:
df.head()

In [None]:
game_ids = df["id"].astype(int).to_list()

In [None]:
len(game_ids)

In [None]:
game_ids = pd.DataFrame(game_ids)
game_ids.to_pickle("data_store/data_dirty/big_game_ids.pkl")

## DEPRECATED

### Pull Games with Selenium/BS

In [None]:
# set up our columns list
columns = [
    "BGGId",
    "Name",
    "Description",
    "YearPublished",
    "GameWeight",
    "AvgRating",
    "BayesAvgRating",
    "StdDev",
    "MinPlayers",
    "MaxPlayers",
    "ComAgeRec",
    "LanguageEase",
    "BestPlayers",
    "GoodPlayers",
    "NumOwned",
    "NumWant",
    "NumWish",
    "NumWeightVotes",
    "MfgPlaytime",
    "ComMinPlaytime",
    "ComMaxPlaytime",
    "MfgAgeRec",
    "NumUserRatings",
    "NumComments",
    "NumAlternates",
    "NumExpansions",
    "NumImplementations",
    "IsReimplementation",
    "Family",
    "Theme",
    "Category",
    "Kickstarted",
    "ImagePath",
]

In [None]:
df = pd.read_csv("boardgames_ranks.csv", low_memory=False)
game_ids = df["id"].astype(int).to_list()
game_ids[:10]

In [None]:
start_position = 0
end_position = 1000
file_suffix = 0

overall_start = time.time()
while end_position < (len(game_ids) + 1):

    games = pd.DataFrame(columns=columns)
    designers = pd.DataFrame(columns=["BGGId"])
    categories = pd.DataFrame(columns=["BGGId"])
    mechanics = pd.DataFrame(columns=["BGGId"])
    artists = pd.DataFrame(columns=["BGGId"])
    publishers = pd.DataFrame(columns=["BGGId"])
    subcategories = pd.DataFrame(columns=["BGGId"])
    comments = pd.DataFrame(columns=["BGGId"])

    ##### File Setup Section #####

    # increment file suffix
    file_suffix += 1
    # get file suffix as string
    suffix_str = str(file_suffix)

    # print start and end positions
    print("Getting items " + str(start_position + 1) + " through " + str(end_position))

    # get list of game ids to grab
    # grab_list = game_ids[0][start_position:end_position]
    grab_list = game_ids[start_position:end_position]

    # piece together target string of game ids for BGG
    targets = ""
    for item in grab_list:
        targets += str(item) + ","

    # log start time for information retrieval
    start = time.time()  # log the start time for this entry

    ##### API Call Section #####

    # Set up Selenium drivers
    options = webdriver.ChromeOptions()  # set up chrome options
    options.add_argument("--headless")  # set up chrome options
    time.sleep(1)  # wait 1 second
    # establish path with targets
    path = (
        "https://www.boardgamegeek.com/xmlapi2/thing?id="
        + targets
        + "&stats=1&type=boardgame"
    )
    driver = webdriver.Chrome(options=options)  # initiate chrome driver with options
    print("New page retrieval. May be waiting for load.")
    driver.get(path)  # get path
    # wait until the driver finds the element that we need
    element = WebDriverWait(driver, 180).until(
        EC.presence_of_all_elements_located((By.ID, "folder0"))
    )

    game_page = BeautifulSoup(driver.page_source)  # parse page with beautifulsoup

    # make entry for each game item on page
    game_entries = game_page.find_all("item")

    print("Items loaded. Processing.")
    ##### Process Each Game #####

    for entry in game_entries:
        ##### Get Game Name, BGGId, and check that game should be included in list #####

        ##### Check is expansion #####
        # gametype = entry['type'] # check game type
        # if gametype != 'boardgame':
        #    continue
        # else: pass

        # check that this game has sufficient user ratings to incluide
        try:
            user_ratings = int(
                entry.find("usersrated")["value"]
            )  # get the number of user ratings

            if user_ratings < 30:  # check if user ratings are under 30
                continue
        except:
            continue

        # get game name and BGG ID
        game_name = entry.find("name", type="primary")["value"]
        game_id = entry["id"]
        # print("Name: "+game_name+", BGG ID: "+str(game_id))

        ##### Get Basic Stats #####

        # print("Getting basic stats")
        description = entry.find("description").text  # description text of the game

        try:
            year_pub = int(entry.find("yearpublished")["value"])  # year published
            if year_pub > 2021:
                continue
        except:
            pass

        try:
            minplayers = int(entry.find("minplayers")["value"])  # minimum players
        except:
            minplayers = None

        try:
            maxplayers = int(entry.find("maxplayers")["value"])  # maximum players
        except:
            maxplayers = None

        avg_rating = float(entry.find("average")["value"])  # average rating
        bayes_avg = float(entry.find("bayesaverage")["value"])  # bayes average rating
        std_dev = float(entry.find("stddev")["value"])  # standard deviation of rating
        num_owned = int(entry.find("owned")["value"])  # num of people own this game
        num_want = int(entry.find("wanting")["value"])  # num of people want this game
        num_wish = int(
            entry.find("wishing")["value"]
        )  # num of people with game on wishlist
        num_weight_votes = int(
            entry.find("numweights")["value"]
        )  # num of votes for game weight
        game_weight = float(entry.find("averageweight")["value"])  # voted game weight

        try:
            image_path = entry.find("image").text  # path to image
        except:
            image_path = None

        try:
            mfg_play_time = int(
                entry.find("playingtime")["value"]
            )  # mfg stated playtime
        except:
            mfg_play_time = None
        try:
            comm_min_play = int(
                entry.find("minplaytime")["value"]
            )  # community min playtime
        except:
            comm_min_play = None

        try:
            comm_max_play = int(
                entry.find("maxplaytime")["value"]
            )  # community max playtime
        except:
            comm_max_play = None

        try:
            mfg_age = int(entry.find("minage")["value"])  # mfg min age
        except:
            mfg_age = None

        # num_comments = int(entry.find('comments')['totalitems']) # num of ratings comments
        num_alts = len(
            entry.find_all("name", type="alternate")
        )  # number alternate versions
        num_expansions = len(
            entry.find_all("link", type="boardgameexpansion")
        )  # number of expansions
        num_implementations = len(
            entry.find_all("link", type="boardgameimplementation")
        )  # number of implementations

        ##### Get reimplementation flag #####
        reimplementation = entry.find(
            "link", type="boardgameimplementation", inbound="true"
        )  # check if game is a reimplementation
        if reimplementation:
            reimplements = 1  # if it's a reimplementation, flag it 1
        else:
            reimplements = 0

        ##### Basic stats requiring some compaction/refinement #####

        # community age min
        try:
            age_poll = entry.find("poll", title="User Suggested Player Age").find_all(
                "result"
            )

            total = 0
            items = 0

            for item in age_poll:
                vote = int(item["numvotes"]) * int(item["value"][:2])
                total += vote
                items += int(item["numvotes"])

            if items > 0:
                comm_age = (
                    total / items
                )  # make sure not dividing by 0, get community recommended age
            else:
                comm_age = None  # if no votes, record none
        except:
            comm_age = None

        # Language Ease
        try:

            lang_poll = entry.find("poll", title="Language Dependence").find_all(
                "result"
            )
            total, items = 0, 0

            for item in lang_poll:
                vote = int(item["numvotes"]) * int(item["level"])
                total += vote
                items += int(item["numvotes"])

            if items > 0:
                lang_ease = (
                    total / items
                )  # make sure not dividing by 0, get community language ease
            else:
                lang_ease = None  # if no votes, record none
        except:
            lang_ease = None  # if no votes, record none

        try:
            # Best and Good Players
            players = entry.find(
                "poll", title="User Suggested Number of Players"
            ).find_all(
                "results"
            )  # get user players poll
            player_num_votes = int(
                entry.find("poll", title="User Suggested Number of Players")[
                    "totalvotes"
                ]
            )  # get total votes

            best_players, best_score, good_players = (
                0,
                0,
                [],
            )  # set up for best players loop

            if player_num_votes > 30:  # evaluate if more than 30 votes for num players
                for player in players:
                    best = int(player.find("result", value="Best")["numvotes"])
                    rec = int(player.find("result", value="Recommended")["numvotes"])
                    score = best * 2 + rec * 1
                    positives = best + rec
                    ratio = positives / player_num_votes
                    if score > best_score:
                        best_players, best_score = (
                            player["numplayers"],
                            score,
                        )  # put in # players for best score
                    if ratio > 0.5:
                        good_players.append(
                            player["numplayers"]
                        )  # put in good players if over 50% ratio
            else:
                best_players = None
        except:
            best_players = None

        ##### Skip dynamic content which cannot be batched #####

        # this_game['NumFans']=int(num_fans),
        # this_game['NumPageViews']=int(num_views),
        # this_game['RulesPosts']=int(rules_threads),
        # this_game['TotalPosts']=int(total_threads),
        # this_game['NumAwards'] = int(num_awards)

        # make dataframe for this game
        this_game = pd.DataFrame()
        this_game["BGGId"] = (int(game_id),)
        this_game["Name"] = (game_name,)
        this_game["Description"] = (description,)
        this_game["YearPublished"] = (int(year_pub),)
        this_game["GameWeight"] = (float(game_weight),)
        this_game["AvgRating"] = (float(avg_rating),)
        this_game["BayesAvgRating"] = (float(bayes_avg),)
        this_game["StdDev"] = (float(std_dev),)
        this_game["MinPlayers"] = (minplayers,)
        this_game["MaxPlayers"] = (maxplayers,)
        try:
            this_game["ComAgeRec"] = (float(comm_age),)
        except:
            this_game["ComAgeRec"] = (None,)
        try:
            this_game["LanguageEase"] = (float(lang_ease),)
        except:
            this_game["LanguageEase"] = (None,)
        this_game["BestPlayers"] = (best_players,)
        this_game["GoodPlayers"] = (good_players,)
        this_game["NumOwned"] = (int(num_owned),)
        this_game["NumWant"] = (int(num_want),)
        this_game["NumWish"] = (int(num_wish),)
        this_game["NumWeightVotes"] = (int(num_weight_votes),)
        this_game["MfgPlaytime"] = (mfg_play_time,)
        this_game["ComMinPlaytime"] = (comm_min_play,)
        this_game["ComMaxPlaytime"] = (comm_max_play,)
        this_game["MfgAgeRec"] = (mfg_age,)
        this_game["NumUserRatings"] = (int(user_ratings),)
        # this_game['NumComments']=int(num_comments),
        this_game["NumAlternates"] = (int(num_alts),)
        this_game["NumExpansions"] = (int(num_expansions),)
        this_game["NumImplementations"] = (int(num_implementations),)
        this_game["IsReimplementation"] = (int(reimplements),)
        this_game["ImagePath"] = image_path

        # add unique information to end of df

        # Add game ranks
        ranks = entry.find_all("rank")
        try:
            for item in ranks:
                this_game["Rank:" + item["name"]] = float(item["value"])
        except:
            pass

        # Try to add components
        try:
            families = entry.find_all(
                "link", type="boardgamefamily", value=re.compile("Component")
            )
            for item in families:
                this_game["Components:" + item["name"]] = item["value"]
        except:
            pass

        # Try to add game series/family
        try:
            family = (
                entry.find("link", type="boardgamefamily", value=re.compile("Game:"))[
                    "value"
                ]
                .strip("Game:")
                .strip(" ")
            )
            this_game["Family"] = family
        except:
            pass

        try:
            family = (
                entry.find("link", type="boardgamefamily", value=re.compile("Series:"))[
                    "value"
                ]
                .strip("Series:")
                .strip(" ")
            )
            this_game["Family"] = family
        except:
            pass

        try:
            setting = (
                entry.find(
                    "link", type="boardgamefamily", value=re.compile("Setting:")
                )["value"]
                .strip("Setting:")
                .strip(" ")
            )
            this_game["Setting"] = setting
        except:
            pass

        # Try to add theme
        try:
            theme = (
                entry.find("link", type="boardgamefamily", value=re.compile("Theme:"))[
                    "value"
                ]
                .strip("Theme:")
                .strip(" ")
            )
            this_game["Theme"] = theme
        except:
            pass

        try:
            mechanism = (
                entry.find(
                    "link", type="boardgamefamily", value=re.compile("Mechanism:")
                )["value"]
                .strip("Mechanism:")
                .strip(" ")
            )
            this_game["Mechanism"] = mechanism
        except:
            pass

        # Try to add game category
        try:
            category = (
                entry.find(
                    "link", type="boardgamefamily", value=re.compile("Category:")
                )["value"]
                .strip("Category:")
                .strip(" ")
            )
            this_game["Category"] = category
        except:
            pass

        # Try is Kickstarted
        try:
            entry.find(
                "link", type="boardgamefamily", value=re.compile("Crowdfunding")
            )["value"]
            this_game["Kickstarted"] = int(1)
        except:
            pass

        ##### Get subcategories #####

        all_subcategories = entry.find_all("link", type="boardgamecategory")

        categories_hold = pd.DataFrame(columns=["BGGId"])
        subcategory = {"BGGId": int(game_id)}

        for item in all_subcategories:
            subcategory[item["value"]] = int(1)

        categories_hold = categories_hold.append(subcategory, ignore_index=True)

        # create specialty dataframes
        designer = create_thing_of_type(
            entry, game_id, find_type_str="boardgamedesigner"
        )
        category = create_thing_of_type(
            entry, game_id, find_type_str="boardgamecategory"
        )
        mechanic = create_mechanics(entry, game_id)
        artist = create_thing_of_type(entry, game_id, find_type_str="boardgameartist")
        publisher = create_thing_of_type(
            entry, game_id, find_type_str="boardgamepublisher"
        )

        games = games.append(this_game, ignore_index=True)
        designers = designers.append(designer, ignore_index=True)
        categories = categories.append(category, ignore_index=True)
        mechanics = mechanics.append(mechanic, ignore_index=True)
        artists = artists.append(artist, ignore_index=True)
        publishers = publishers.append(publisher, ignore_index=True)
        subcategories = subcategories.append(categories_hold, ignore_index=True)

    games.to_pickle("data_store/data_dirty/scraped_games/games" + suffix_str + ".pkl")
    designers.to_pickle(
        "data_store/data_dirty/scraped_games/designers" + suffix_str + ".pkl"
    )
    categories.to_pickle(
        "data_store/data_dirty/scraped_games/categories" + suffix_str + ".pkl"
    )
    mechanics.to_pickle(
        "data_store/data_dirty/scraped_games/mechanics" + suffix_str + ".pkl"
    )
    artists.to_pickle(
        "data_store/data_dirty/scraped_games/artists" + suffix_str + ".pkl"
    )
    publishers.to_pickle(
        "data_store/data_dirty/scraped_games/publishers" + suffix_str + ".pkl"
    )
    subcategories.to_pickle(
        "data_store/data_dirty/scraped_games/subcategories" + suffix_str + ".pkl"
    )

    print("Finished items in this group")

    print(f"Time: {time.time() - start}\n\n")

    start_position += 1000
    end_position += 1000

print(f"Time: {time.time() - overall_start}\n\n")