# Gathering List of Games and User Reviews

In [1]:
import numpy as np
import pandas as pd
import requests
import time
from bs4 import BeautifulSoup
from itertools import count
import os

In [2]:
# Base URL of the ranked browse pages; the page number is appended per request.
url = 'https://boardgamegeek.com/browse/boardgame/page/'
# XML API endpoint; queried below with an `id` parameter for per-game data.
api = 'https://api.geekdo.com/xmlapi2/thing'

## Creating the list of the top 1,000 Board Games
This section builds the list of the top 1,000 board games according to BoardGameGeek.com as of March 16th, 2021.

In [3]:
# Scrape the ranked browse pages (pages 1-10) into a list of
# {'rank', 'title', 'id'} records, one record per table row.
boardgame_top1000 = []
for n in range(1, 11):
    # A timeout keeps an unresponsive server from hanging the notebook forever.
    res = requests.get(url+str(n), timeout=60)
    if res.status_code != 200:
        # An error page has no results table; stop with a clear message rather
        # than failing later with an opaque AttributeError on `None`.
        print(f'Error code: {res.status_code}')
        raise RuntimeError(f'Could not fetch browse page {n} (HTTP {res.status_code})')

    soup_bgs = BeautifulSoup(res.text, 'lxml')
    table = soup_bgs.find('table', {'class': 'collection_table', 'id': 'collectionitems'})
    if table is None:
        raise RuntimeError(f'No results table found on browse page {n}')

    # Collect the rank cells once per page instead of re-scanning the whole
    # table for every row, and iterate over the rows actually present so a
    # short page does not raise IndexError.
    rank_cells = table.find_all('td', {'class': 'collection_rank'})
    for i, rank_cell in enumerate(rank_cells, start=1):
        name_div = table.find('div', {'id': 'results_objectname'+str(i)})
        if name_div is None:
            raise RuntimeError(f'Row {i} on browse page {n} has no title cell')
        link = name_div.find('a')
        boardgame = {
            'rank': rank_cell.text.strip(),
            'title': link.text,
            # The third '/'-separated piece of the link's href is stored as the game id.
            'id': link['href'].split('/')[2]
        }
        boardgame_top1000.append(boardgame)

    print(f'Done with {len(boardgame_top1000)}')

    # Pause between pages to stay polite to the site.
    time.sleep(15)

Done with 100
Done with 200
Done with 300
Done with 400
Done with 500
Done with 600
Done with 700
Done with 800
Done with 900
Done with 1000


In [4]:
# Tabulate the scraped records and key the table by BGG rank in one chained step.
bg_top = pd.DataFrame(boardgame_top1000).set_index('rank')

In [5]:
# Preview the scraped table, indexed by rank.
bg_top

Unnamed: 0_level_0,title,id
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Gloomhaven,174430
2,Pandemic Legacy: Season 1,161936
3,Brass: Birmingham,224517
4,Terraforming Mars,167791
5,Gloomhaven: Jaws of the Lion,291457
...,...,...
996,1812: The Invasion of Canada,94246
997,Chimera Station,163642
998,Sons of Anarchy: Men of Mayhem,156091
999,Shadow Hunters,24068


In [6]:
# Persist the ranked list so later cells can reload it without re-scraping;
# `rank` is the index, so it is written as the first CSV column.
bg_top.to_csv('../data/bg_top1000.csv')

#### Create list of Board Game IDs
The code below generates the list of board game ID codes for use with the API calls below. If the code needs to be rerun for any reason, I will import the CSV created above to regenerate the list of IDs. This CSV will be included with the API script in Google Cloud so that it has the board game list to pull from.

In [8]:
# Reload the saved ranking from disk and pull out the game ids as a plain list.
bg_top = pd.read_csv('../data/bg_top1000.csv')
bg_list = bg_top['id'].tolist()
bg_list

[174430,
 161936,
 224517,
 167791,
 291457,
 233078,
 220308,
 187645,
 162886,
 182028,
 115746,
 193738,
 12333,
 169786,
 316554,
 84876,
 167355,
 173346,
 124361,
 28720,
 120677,
 177736,
 266192,
 205637,
 183394,
 237182,
 164928,
 199792,
 266507,
 96848,
 312484,
 246900,
 175914,
 3076,
 102794,
 170216,
 285774,
 192135,
 31260,
 251247,
 276025,
 221107,
 247763,
 205059,
 256960,
 284083,
 185343,
 126163,
 2651,
 55690,
 216132,
 164153,
 184267,
 209010,
 35677,
 180263,
 244521,
 125153,
 521,
 161533,
 230802,
 72125,
 25613,
 266810,
 191189,
 124742,
 342942,
 28143,
 314040,
 201808,
 159675,
 121921,
 229853,
 171623,
 157354,
 68448,
 200680,
 110327,
 62219,
 182874,
 236457,
 264220,
 122515,
 93,
 18602,
 37111,
 324856,
 12493,
 73439,
 40834,
 269385,
 146021,
 170042,
 172386,
 203993,
 205896,
 281259,
 163412,
 144733,
 42,
 225694,
 102680,
 295947,
 155821,
 284378,
 178900,
 132531,
 36218,
 233371,
 172287,
 263918,
 30549,
 218417,
 196340,
 198928,

# Gather Details for Each Board Game
The script below is being copied over to Microsoft Visual Studio to create a script that can be run in Google Cloud. This is a proof of concept.

In [3]:
# Proof-of-concept subset: overrides the full id list with three game ids
# so the cells below run quickly.
bg_list = [174430, 161936, 224517]

In [4]:
def _join_link_values(soup, link_type):
    """Return the '|'-joined `value` attributes of every <link> tag of `link_type`."""
    return '|'.join(row['value'] for row in soup.find_all('link', {'type': link_type}))


# Fetch the detail/statistics record for each game id and collect one flat
# dict per game in `game_dicts`.
game_dicts = []

for game in bg_list:
    params = {
        'id': game,
        'stats': 1,
        'page': 1
    }

    # A timeout keeps an unresponsive server from hanging the run forever.
    res = requests.get(api, params=params, timeout=60)

    if res.status_code != 200:
        # An error body has none of the expected tags; stop with a clear message
        # rather than failing below with "'NoneType' object is not subscriptable".
        print(f'Error {res.status_code} with Board Game {game}')
        raise RuntimeError(f'Could not fetch details for game {game} (HTTP {res.status_code})')

    soup = BeautifulSoup(res.text, 'xml')
    # Looked up once; both the rank value and its bayes average come from this tag.
    overall_rank = soup.find('rank', {'name': 'boardgame'})
    game_details = {
        'game_id': game,
        'game_name': soup.find('name', {'type': 'primary'})['value'],
        'game_des': soup.find('description').text,
        'game_yr_pub': int(soup.find('yearpublished')['value']),
        'min_players': int(soup.find('minplayers')['value']),
        'max_players': int(soup.find('maxplayers')['value']),
        'min_play_time': int(soup.find('minplaytime')['value']),
        'max_play_time': int(soup.find('maxplaytime')['value']),
        'min_age': int(soup.find('minage')['value']),
        'num_ratings': int(soup.find('usersrated')['value']),
        'avg_rating': float(soup.find('average')['value']),
        'bayes_avg_rating': float(soup.find('bayesaverage')['value']),
        # Encoded as "<rank>|<bayes average>".
        'overall_rank_bayesavg': '|'.join([overall_rank['value'], overall_rank['bayesaverage']]),
        # Encoded as "<family>/<rank>/<bayes average>" entries joined with '|'.
        'family_rank_bayes': '|'.join('/'.join([row['name'], row['value'], row['bayesaverage']])
                                      for row in soup.find_all('rank', {'type': 'family'})),
        'complexity': float(soup.find('averageweight')['value']),
        'categories': _join_link_values(soup, 'boardgamecategory'),
        'mechanics': _join_link_values(soup, 'boardgamemechanic'),
        'families': _join_link_values(soup, 'boardgamefamily'),
        'implementations': _join_link_values(soup, 'boardgameimplementation'),
        'designers': _join_link_values(soup, 'boardgamedesigner'),
        'publishers': _join_link_values(soup, 'boardgamepublisher')
    }

    game_dicts.append(game_details)

    # Pause between API calls to avoid hammering the service.
    time.sleep(10)

    print(f'Done with {game}')

Done with 174430
Done with 161936
Done with 224517


In [5]:
# Show the collected per-game records as a table (one row per game).
pd.DataFrame(game_dicts)

Unnamed: 0,game_id,game_name,game_des,game_yr_pub,min_players,max_players,min_play_time,max_play_time,min_age,num_ratings,...,bayes_avg_rating,overall_rank_bayesavg,family_rank_bayes,complexity,categories,mechanics,families,implementations,designers,publishers
0,174430,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,2017,1,4,60,120,14,49737,...,8.49563,1|8.49563,thematic/1/8.47366|strategygames/1/8.46295,3.8728,Adventure|Exploration|Fantasy|Fighting|Miniatures,Action Queue|Action Retrieval|Campaign / Battl...,Category: Dungeon Crawler|Components: Miniatur...,,Isaac Childres,Cephalofair Games|Albi|Albi Polska|Arclight|As...
1,161936,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,2015,2,4,60,60,13,46207,...,8.43544,2|8.43544,thematic/2/8.42624|strategygames/3/8.42193,2.8302,Environmental|Medical,Action Points|Cooperative Game|Hand Management...,Components: Map (Global Scale)|Components: Mul...,Pandemic Legacy: Season 2|Pandemic,Rob Daviau|Matt Leacock,Z-Man Games|Asterion Press|Devir|Filosofia Édi...
2,224517,Brass: Birmingham,Brass: Birmingham is an economic strategy game...,2018,2,4,60,120,14,27629,...,8.42263,3|8.42263,strategygames/2/8.45557,3.9043,Economic|Industry / Manufacturing|Post-Napoleo...,Hand Management|Income|Loans|Market|Network an...,Cities: Birmingham (England)|Country: England|...,Brass: Lancashire,Gavan Brown|Matt Tolman|Martin Wallace,Roxley|Arclight|BoardM Factory|Conclave Editor...


## Creating the API script for user ratings
This script is going to be exported to Visual Studio Code and turned into a script that will run in Google Cloud to gather all of the user ratings for each of the top 1,000 board games. This is a proof of concept.

In [20]:
# Download every user rating for each game and write one CSV per game.
# A game whose CSV already exists is skipped, so an interrupted run can be
# resumed by re-running this cell.
for game in bg_list:
    if os.path.exists(f'{game}_ratings.csv'):
        continue

    user_ratings = []
    # Pages are 1-based; keep requesting until a page comes back with no comments.
    for page in count(1):
        params = {
            'id': game,
            'ratingcomments': '1',
            'page': page,
            'pagesize': 100
        }

        # A timeout keeps an unresponsive server from hanging the run forever.
        res = requests.get(api, params=params, timeout=60)

        if res.status_code != 200:
            # Stop before anything is written: a partial CSV would make the
            # exists-check above skip this game on every later run.
            print(f'Error code {res.status_code} for game id: {game}')
            raise RuntimeError(
                f'Could not fetch ratings page {page} for game {game} (HTTP {res.status_code})'
            )

        soup = BeautifulSoup(res.text, 'xml')

        comments_tag = soup.find('comments')
        if comments_tag is None:
            # The loop's end condition expects an empty <comments> element, so a
            # missing one is treated as a malformed response, not as end of data.
            raise RuntimeError(f'No <comments> element on page {page} for game {game}')

        # Parse the page once and reuse the result for both the end check and the rows.
        comments = comments_tag.find_all('comment')

        user_ratings.extend(
            {
                'user_id': comment['username'],
                'rating': comment['rating'],
                'game_id': game
            }
            for comment in comments
        )

        # Pause after every request, including the final empty page.
        time.sleep(10)

        if not comments:
            break

    pd.DataFrame.from_dict(user_ratings).to_csv(f'{game}_ratings.csv')
    print(f'Done with {game}')
print('Done with all games')

Done with 174430
Done with 161936
Done with 224517
Done with all users


The script finished on March 31st, 2022. Now that all of the user reviews for each game have been gathered, I will generate the sparse matrix and the initial version of the user-based recommender in the 2.2_User_Based_Recommender notebook.