In [1]:
import requests
from bs4 import BeautifulSoup
import scipy.io
import matplotlib.pyplot as plt
import matplotlib 
import pandas as pd
import numpy as np
import pickle
from time import sleep
import timeit

In [2]:
def request(msg, slp=1, timeout=30, max_retries=None):
    """Fetch a URL with ``requests.get``, retrying until the server answers 200.

    Parameters
    ----------
    msg : str
        URL to request.
    slp : float, optional
        Seconds to sleep before every attempt, so the server is not pinged
        too often.
    timeout : float or None, optional
        Forwarded to ``requests.get`` so a stalled connection is abandoned
        (and retried) instead of blocking the notebook forever.
    max_retries : int or None, optional
        Upper bound on the number of attempts. ``None`` (the default) keeps
        retrying indefinitely.

    Returns
    -------
    requests.Response
        The first response whose status code is 200.

    Raises
    ------
    RuntimeError
        If ``max_retries`` attempts were made without obtaining a 200.
    """
    attempts = 0
    while max_retries is None or attempts < max_retries:
        attempts += 1
        sleep(slp)  # Don't ping the server too often
        try:
            r = requests.get(msg, timeout=timeout)
        except requests.exceptions.RequestException:
            # Only network-level failures are retried. A bare ``except`` would
            # also swallow KeyboardInterrupt and make the loop unstoppable.
            print("An exception has occurred, probably a momentory loss of connection. Waiting one seconds...")
            sleep(1)
            continue
        if r.status_code == 200:
            return r
        print("Server Error! Response Code %i. Retrying..." % (r.status_code))
    raise RuntimeError("No 200 response from %s after %i attempts" % (msg, attempts))

In [3]:
# Scrape the BoardGameGeek browse listing (sorted by number of ratings, descending)
# page by page until a page contains a game with <= MIN_RATINGS ratings.
COLUMNS = ["id", "name", "nrate", "pic_url"]
MIN_RATINGS = 500
BROWSE_URL = "https://boardgamegeek.com/browse/boardgame/page/%i?sort=numvoters&sortdir=desc"

page_frames = []  # One DataFrame per scraped page; concatenated once after the loop
min_nrate = 1e5   # Sentinel above the threshold so the loop runs at least once
npage = 1

while min_nrate > MIN_RATINGS:
    # Get full HTML for a specific page in the full listing of boardgames sorted by nrates
    r = request(BROWSE_URL % (npage,))
    soup = BeautifulSoup(r.text, "html.parser")

    # Get rows for the table listing all the games on this page
    table = soup.find_all("tr", attrs={"id": "row_"})
    if not table:
        # Without rows there is no minimum to report; stop instead of
        # crashing on the NaN that an empty column's min() would produce.
        print("Page %i contained no game rows, stopping." % (npage,))
        break

    # Loop through each row and pull out the info for that game
    records = []
    for row in table:
        # Row may or may not start with a "boardgame rank" link, if YES then strip it
        links = row.find_all("a")
        if "name" in links[0].attrs:
            del links[0]
        gamelink = links[1]  # Link whose href is the relative URL for the specific game
        gameid = int(gamelink["href"].split("/")[2])  # Game ID is the third "/"-separated piece of the href
        # str() turns the bs4 text node into a plain string, so the stored
        # name is an ordinary str rather than a bs4 object.
        gamename = str(gamelink.contents[0])
        imlink = links[0]  # Link wrapping the game thumbnail
        thumbnail = imlink.contents[0]["src"]

        ratings_str = row.find_all("td", attrs={"class": "collection_bggrating"})[2].contents[0]
        nratings = int("".join(ratings_str.split()))  # Drop all whitespace before parsing

        records.append([gameid, gamename, nratings, thumbnail])

    # Build this page's frame in one go (index 0..len-1, numeric columns typed as ints)
    df = pd.DataFrame(records, columns=COLUMNS)
    min_nrate = df["nrate"].min()  # The smallest number of ratings of any game on the page
    print("Page %i scraped, minimum number of ratings was %i" % (npage, min_nrate))
    page_frames.append(df)
    npage += 1
    sleep(2) # Keep the BGG server happy.

# Concatenate once at the end instead of growing the master frame inside the loop.
# Each page keeps its own 0..len-1 index, so the combined index repeats.
df_all = pd.concat(page_frames, axis=0) if page_frames else pd.DataFrame(columns=COLUMNS)

Page 1 scraped, minimum number of ratings was 21869
Page 2 scraped, minimum number of ratings was 14115
Page 3 scraped, minimum number of ratings was 10474
Page 4 scraped, minimum number of ratings was 7914
Page 5 scraped, minimum number of ratings was 6720
Page 6 scraped, minimum number of ratings was 5523
Page 7 scraped, minimum number of ratings was 4758
Page 8 scraped, minimum number of ratings was 4175
Page 9 scraped, minimum number of ratings was 3680
Page 10 scraped, minimum number of ratings was 3293
Page 11 scraped, minimum number of ratings was 3000
Page 12 scraped, minimum number of ratings was 2744
Page 13 scraped, minimum number of ratings was 2524
Page 14 scraped, minimum number of ratings was 2356
Page 15 scraped, minimum number of ratings was 2164
Page 16 scraped, minimum number of ratings was 2007
Page 17 scraped, minimum number of ratings was 1863
Page 18 scraped, minimum number of ratings was 1769
Page 19 scraped, minimum number of ratings was 1660
Page 20 scraped, m

In [None]:
# df_all was assembled from per-page frames that each carried their own index,
# so take a new frame with a fresh 0..N-1 index and leave df_all untouched.
df = df_all.reset_index(drop=True)
# Persist the full game list so later work can reload it from disk
df.to_csv("games_list.csv", index=False, encoding="utf-8")
df.head()