# Notebook Objective and Setup

This notebook (BGG01) acquires game data from BoardGameGeek. Most of it is retrieved through the XML API; some dynamic content is scraped. The raw output files are written to the "dirty" data directory (`data_dirty/`).

## Package Imports

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from bs4 import BeautifulSoup
import requests
import regex as re
import time
import json
import os
import gc

# Silence every warning category (this is what hides the pandas copy warnings)
import warnings
warnings.filterwarnings('ignore')

# Notebook display: show all columns, cap printed frames at 30 rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 30

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os


## Functions

In [2]:
def create_designers(game_page, game_id):
    '''Create a one-row DataFrame flagging the designers of a specific game id
    
    Inputs:
    game_page: parsed element for the game; it must offer a
        BeautifulSoup-style find_all() whose results can be indexed
        with ['value']
    game_id: id for this game (anything int() accepts)
    
    Outputs:
    dataframe with a 'BGGId' column followed by one column per designer
    found, each holding 1'''
    
    # find all designers on page
    all_designers = game_page.find_all('link', type='boardgamedesigner')
    
    # one record for this item: the id plus a 1 under every designer name
    design = {'BGGId': int(game_id)}
    for item in all_designers:
        design[item['value']] = 1
    
    # DataFrame.append no longer exists in pandas 2.x, so the single row
    # is built straight from the record instead of appended to an empty frame
    return pd.DataFrame([design])

In [3]:
def create_categories(game_page, game_id):
    '''Create a one-row DataFrame flagging the categories of a specific game id
    
    Inputs:
    game_page: parsed element for the game; it must offer a
        BeautifulSoup-style find_all() whose results can be indexed
        with ['value']
    game_id: id for this game (anything int() accepts)
    
    Outputs:
    dataframe with a 'BGGId' column followed by one column per category
    found, each holding 1'''
    
    # find all categories on page
    all_categories = game_page.find_all('link', type='boardgamecategory')
    
    # one record for this item: the id plus a 1 under every category name
    category = {'BGGId': int(game_id)}
    for item in all_categories:
        category[item['value']] = 1
    
    # DataFrame.append no longer exists in pandas 2.x, so the single row
    # is built straight from the record instead of appended to an empty frame
    return pd.DataFrame([category])

In [4]:
def create_mechanics(game_page, game_id):
    '''Create a one-row DataFrame flagging the mechanics of a specific game id
    
    Besides the 'boardgamemechanic' links, two 'boardgamefamily' links are
    treated as mechanics: "Mechanism: Tableau Building" becomes the column
    'TableauBuilding' and "Mechanism: Legacy" becomes the column 'Legacy'.
    
    Inputs:
    game_page: parsed element for the game; it must offer BeautifulSoup-style
        find() and find_all() whose results can be indexed with ['value']
    game_id: id for this game (anything int() accepts)
    
    Outputs:
    dataframe with a 'BGGId' column followed by one column per mechanic
    found, each holding 1'''
    
    # find all mechanics on page
    all_mechanics = game_page.find_all('link', type='boardgamemechanic')
    
    # one record for this item: the id plus a 1 under every mechanic name
    mechanic = {'BGGId': int(game_id)}
    for item in all_mechanics:
        mechanic[item['value']] = 1
    
    # family links that are recorded as extra mechanic columns
    family_mechanics = (
        ("Mechanism: Tableau Building", 'TableauBuilding'),
        ("Mechanism: Legacy", 'Legacy'),
    )
    for family_value, column in family_mechanics:
        # find() gives None when the link is absent; test for that explicitly
        # rather than relying on a swallowed exception
        if game_page.find('link', type='boardgamefamily', value=family_value) is not None:
            mechanic[column] = 1
    
    # DataFrame.append no longer exists in pandas 2.x, so the single row
    # is built straight from the record instead of appended to an empty frame
    return pd.DataFrame([mechanic])

In [5]:
def create_artists(game_page, game_id):
    '''Create a one-row DataFrame flagging the artists of a specific game id
    
    Inputs:
    game_page: parsed element for the game; it must offer a
        BeautifulSoup-style find_all() whose results can be indexed
        with ['value']
    game_id: id for this game (anything int() accepts)
    
    Outputs:
    dataframe with a 'BGGId' column followed by one column per artist
    found, each holding 1'''
    
    # find all artists on page
    all_artists = game_page.find_all('link', type='boardgameartist')
    
    # one record for this item: the id plus a 1 under every artist name
    artist = {'BGGId': int(game_id)}
    for item in all_artists:
        artist[item['value']] = 1
    
    # DataFrame.append no longer exists in pandas 2.x, so the single row
    # is built straight from the record instead of appended to an empty frame
    return pd.DataFrame([artist])

In [6]:
def create_publishers(game_page, game_id):
    '''Create a one-row DataFrame flagging the publishers of a specific game id
    
    Inputs:
    game_page: parsed element for the game; it must offer a
        BeautifulSoup-style find_all() whose results can be indexed
        with ['value']
    game_id: id for this game (anything int() accepts)
    
    Outputs:
    dataframe with a 'BGGId' column followed by one column per publisher
    found, each holding 1'''
    
    # find all publishers on page
    all_publishers = game_page.find_all('link', type='boardgamepublisher')
    
    # one record for this item: the id plus a 1 under every publisher name
    publisher = {'BGGId': int(game_id)}
    for item in all_publishers:
        publisher[item['value']] = 1
    
    # DataFrame.append no longer exists in pandas 2.x, so the single row
    # is built straight from the record instead of appended to an empty frame
    return pd.DataFrame([publisher])

In [7]:
def create_awards(awards_level, game_id):
    '''Create a one-row DataFrame flagging the awards of a specific game id
    
    Inputs:
    awards_level: parsed element holding the awards; it must offer a
        BeautifulSoup-style find_all() whose results expose .text
    game_id: id for this game (anything int() accepts)
    
    Outputs:
    dataframe with a 'BGGId' column followed by one column per award
    found, each holding 1. Column names are the link texts with every
    digit removed and surrounding spaces stripped.'''
    
    # find all awards on page
    all_awards = awards_level.find_all('a', class_='ng-binding')
    
    # one record for this item: the id plus a 1 under every award name
    award = {'BGGId': int(game_id)}
    for item in all_awards:
        # NOTE(review): this removes ALL digits, not only a year, so a name
        # such as "2-Player ..." loses its digit too -- confirm that is wanted
        award_name = re.sub("[0-9]", "", item.text).strip(' ')
        award[award_name] = 1
    
    # DataFrame.append no longer exists in pandas 2.x, so the single row
    # is built straight from the record instead of appended to an empty frame
    return pd.DataFrame([award])

In [8]:
def create_ratings_dist(stats_page, game_id):
    '''Create a one-row DataFrame with the ratings distribution of a specific game id
    
    Inputs:
    stats_page: parsed stats page; it must offer a BeautifulSoup-style find()
        that returns the 'ratings-stats-graph' element, whose find_all('text')
        results expose .text
    game_id: id for this game (anything int() accepts)
    
    Outputs:
    dataframe with a 'BGGId' column followed by columns '1' through '10'.
    Each rating column holds the raw text of one 'text' element of the graph
    (elements 10 through 19, in order); the text is not converted to a number.
    
    Raises:
    AttributeError if the 'ratings-stats-graph' element is not on the page
    IndexError if the graph holds fewer than 20 'text' elements
    '''
    
    # find the ratings graph, then every text entry inside it
    all_ratings = stats_page.find('ratings-stats-graph')
    next_ratings = all_ratings.find_all('text')
    
    # make dictionary for this item
    rating = {'BGGId': int(game_id)}
    
    # the texts used for ratings 1..10 sit at positions 10..19 of the graph
    first_rating_position = 10
    for score in range(1, 11):
        rating[str(score)] = next_ratings[first_rating_position + score - 1].text
    
    # DataFrame.append no longer exists in pandas 2.x, so the single row
    # is built straight from the record instead of appended to an empty frame
    return pd.DataFrame([rating])

# Game Scraping

In [9]:
# Column order for the per-batch games table; columns that a game does not
# fill in are still present in the saved frame
columns = [
    # identity
    'BGGId', 'Name', 'Description', 'YearPublished',
    # ratings and weight
    'GameWeight', 'AvgRating', 'BayesAvgRating', 'StdDev',
    # player counts and community polls
    'MinPlayers', 'MaxPlayers', 'ComAgeRec', 'LanguageEase',
    'BestPlayers', 'GoodPlayers',
    # ownership and votes
    'NumOwned', 'NumWant', 'NumWish', 'NumWeightVotes',
    # play time and age
    'MfgPlaytime', 'ComMinPlaytime', 'ComMaxPlaytime', 'MfgAgeRec',
    # counts
    'NumUserRatings', 'NumComments', 'NumAlternates', 'NumExpansions',
    'NumImplementations', 'IsReimplementation',
    # family-derived fields
    'Family', 'Theme', 'Category', 'Kickstarted',
    # artwork
    'ImagePath',
]

Last game id: 349161

In [None]:
# Ids to download, as pickled earlier in the "dirty" data directory
game_ids_current = pd.read_pickle('data_dirty/game_ids_current')
# NOTE(review): iterating a DataFrame yields its column labels rather than
# its rows -- confirm the pickle holds a flat collection of ids
game_ids = [game_id for game_id in game_ids_current]

In [None]:
##### Settings for the batched download #####
BATCH_SIZE = 1000            # game ids per API request and per output file
MIN_USER_RATINGS = 30        # games with fewer user ratings are skipped
MAX_YEAR_PUBLISHED = 2021    # games published after this year are skipped
MIN_PLAYER_POLL_VOTES = 30   # the player-count poll is only used above this many votes
PAGE_LOAD_TIMEOUT = 180      # seconds to wait for the API page to render
# raised when an expected tag/attribute is missing or holds a non-numeric value
MISSING_DATA_ERRORS = (AttributeError, TypeError, KeyError, ValueError)


def _int_value(entry, tag):
    '''Return int(entry.find(tag)['value']), or None when the tag is absent or not an integer.'''
    try:
        return int(entry.find(tag)['value'])
    except MISSING_DATA_ERRORS:
        return None


def _poll_average(entry, title, weight_of):
    '''Return the vote-weighted mean of a poll, or None when the poll is missing or has no votes.

    weight_of maps one 'result' element to the integer it contributes per vote.'''
    try:
        total, votes = 0, 0
        for result in entry.find('poll', title=title).find_all('result'):
            num_votes = int(result['numvotes'])
            total += num_votes * weight_of(result)
            votes += num_votes
    except MISSING_DATA_ERRORS:
        return None
    return total / votes if votes > 0 else None


def _family_value(entry, prefix):
    '''Return the first 'boardgamefamily' value starting with prefix, with the prefix removed, else None.'''
    link = entry.find('link', type='boardgamefamily', value=re.compile('^' + re.escape(prefix)))
    if link is None:
        return None
    # slice the prefix off: str.strip(prefix) would treat it as a SET of
    # characters and also eat matching letters from the end of the name
    return link['value'][len(prefix):].strip(' ')


def _stack(frames):
    '''Concatenate one-row frames; with nothing to stack, return an empty frame with a BGGId column.'''
    if not frames:
        return pd.DataFrame(columns=['BGGId'])
    return pd.concat(frames, ignore_index=True)


file_suffix = 0
overall_start = time.time()

# range() also yields the final, shorter batch when the number of ids is not
# a multiple of BATCH_SIZE
for start_position in range(0, len(game_ids), BATCH_SIZE):
    end_position = min(start_position + BATCH_SIZE, len(game_ids))

    ##### File Setup Section #####

    # increment file suffix and get it as a string
    file_suffix += 1
    suffix_str = str(file_suffix)

    # print start and end positions
    print("Getting items "+str(start_position+1)+' through '+str(end_position))

    # get list of game ids to grab and join them into the target string for BGG
    grab_list = game_ids[start_position:end_position]
    targets = ','.join(str(item) for item in grab_list)

    # log start time for information retrieval
    start = time.time()


    ##### API Call Section #####

    # Set up Selenium drivers
    options = webdriver.ChromeOptions() # set up chrome options
    options.add_argument("--headless") # run without a visible window
    time.sleep(1) # wait 1 second between requests
    # establish path with targets
    path = 'https://www.boardgamegeek.com/xmlapi2/thing?id='+targets+'&stats=1&type=boardgame'
    driver = webdriver.Chrome(options=options) # initiate chrome driver with options
    try:
        print("New page retrieval. May be waiting for load.")
        driver.get(path)
        # wait until the driver finds the element that we need
        WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(EC.presence_of_all_elements_located((By.ID, 'folder0')))
        page_source = driver.page_source
    finally:
        # always release the browser, otherwise every batch leaves a Chrome
        # process behind
        driver.quit()

    # no parser is named here on purpose: the call is kept as it was so that
    # bs4 picks the same default parser as before
    game_page = BeautifulSoup(page_source)

    # make entry for each game item on page
    game_entries = game_page.find_all('item')

    print("Items loaded. Processing.")

    # per-batch accumulators; frames are concatenated once after the loop
    game_records = []
    designer_frames, category_frames, mechanic_frames = [], [], []
    artist_frames, publisher_frames, subcategory_frames = [], [], []

    ##### Process Each Game #####

    for entry in game_entries:

        # check that this game has sufficient user ratings to include
        user_ratings = _int_value(entry, 'usersrated')
        if user_ratings is None or user_ratings < MIN_USER_RATINGS:
            continue

        # get game name and BGG ID
        game_name = entry.find('name', type='primary')['value']
        game_id = entry['id']


        ##### Get Basic Stats #####

        description = entry.find('description').text # description text of the game

        # year published; None (not the previous game's year) when absent
        year_pub = _int_value(entry, 'yearpublished')
        if year_pub is not None and year_pub > MAX_YEAR_PUBLISHED:
            continue

        minplayers = _int_value(entry, 'minplayers') # minimum players
        maxplayers = _int_value(entry, 'maxplayers') # maximum players

        avg_rating = float(entry.find('average')['value']) # average rating
        bayes_avg = float(entry.find('bayesaverage')['value']) # bayes average rating
        std_dev = float(entry.find('stddev')['value']) # standard deviation of rating
        num_owned = int(entry.find('owned')['value']) # num of people own this game
        num_want = int(entry.find('wanting')['value']) # num of people want this game
        num_wish = int(entry.find('wishing')['value']) # num of people with game on wishlist
        num_weight_votes = int(entry.find('numweights')['value']) # num of votes for game weight
        game_weight = float(entry.find('averageweight')['value']) # voted game weight

        image_tag = entry.find('image')
        image_path = image_tag.text if image_tag is not None else None # path to image

        mfg_play_time = _int_value(entry, 'playingtime') # mfg stated playtime
        comm_min_play = _int_value(entry, 'minplaytime') # community min playtime
        comm_max_play = _int_value(entry, 'maxplaytime') # community max playtime
        mfg_age = _int_value(entry, 'minage') # mfg min age

        num_alts = len(entry.find_all('name', type='alternate')) # number alternate versions
        num_expansions = len(entry.find_all('link', type='boardgameexpansion')) # number of expansions
        num_implementations = len(entry.find_all('link', type='boardgameimplementation')) # number of implementations

        ##### Get reimplementation flag #####
        reimplementation = entry.find('link', type="boardgameimplementation", inbound="true")
        reimplements = 1 if reimplementation else 0


        ##### Basic stats requiring some compaction/refinement #####

        # community age min: each option counts as the integer in its first two characters
        comm_age = _poll_average(entry, "User Suggested Player Age",
                                 lambda result: int(result['value'][:2]))

        # Language Ease: each option counts as its 'level'
        lang_ease = _poll_average(entry, "Language Dependence",
                                  lambda result: int(result['level']))

        # Best and Good Players; both are reset for every game so that a
        # failed lookup cannot reuse the previous game's values
        best_players, good_players = None, []
        try:
            player_poll = entry.find('poll', title="User Suggested Number of Players")
            player_num_votes = int(player_poll['totalvotes']) # total votes

            if player_num_votes > MIN_PLAYER_POLL_VOTES:
                top_count, best_score = 0, 0
                for player in player_poll.find_all('results'):
                    best = int(player.find('result', value='Best')['numvotes'])
                    rec = int(player.find('result', value='Recommended')['numvotes'])
                    score = best*2 + rec*1
                    ratio = (best + rec)/player_num_votes
                    # keep the player count with the highest score
                    if score > best_score: top_count, best_score = player['numplayers'], score
                    # good player counts: over 50% of voters say Best or Recommended
                    if ratio > .5: good_players.append(player['numplayers'])
                best_players = top_count
        except MISSING_DATA_ERRORS:
            best_players = None

        ##### Dynamic content (fans, page views, forum posts, awards) and
        ##### NumComments cannot be batched and are not collected here #####

        # record for this game
        this_game = {
            'BGGId': int(game_id),
            'Name': game_name,
            'Description': description,
            'YearPublished': year_pub,
            'GameWeight': game_weight,
            'AvgRating': avg_rating,
            'BayesAvgRating': bayes_avg,
            'StdDev': std_dev,
            'MinPlayers': minplayers,
            'MaxPlayers': maxplayers,
            'ComAgeRec': comm_age,
            'LanguageEase': lang_ease,
            'BestPlayers': best_players,
            'GoodPlayers': good_players,
            'NumOwned': num_owned,
            'NumWant': num_want,
            'NumWish': num_wish,
            'NumWeightVotes': num_weight_votes,
            'MfgPlaytime': mfg_play_time,
            'ComMinPlaytime': comm_min_play,
            'ComMaxPlaytime': comm_max_play,
            'MfgAgeRec': mfg_age,
            'NumUserRatings': user_ratings,
            'NumAlternates': num_alts,
            'NumExpansions': num_expansions,
            'NumImplementations': num_implementations,
            'IsReimplementation': reimplements,
            'ImagePath': image_path,
        }

        # add unique information to end of record

        # Add game ranks; a rank whose value is not a number is skipped on
        # its own instead of discarding every rank after it
        for item in entry.find_all('rank'):
            try:
                this_game['Rank:'+item['name']] = float(item['value'])
            except (KeyError, ValueError):
                continue

        # Try to add components
        # NOTE(review): the column name is taken from a 'name' attribute of the
        # link; links without one are skipped -- confirm the attribute exists
        for item in entry.find_all('link', type='boardgamefamily', value=re.compile("Component")):
            component_name = item.get('name')
            if component_name is not None:
                this_game['Components:'+component_name] = item['value']

        # Try to add game series/family ("Series:" wins over "Game:" when both exist)
        for prefix in ('Game:', 'Series:'):
            family = _family_value(entry, prefix)
            if family is not None:
                this_game['Family'] = family

        # Try to add setting, theme, mechanism and category
        for prefix, column in (('Setting:', 'Setting'), ('Theme:', 'Theme'),
                               ('Mechanism:', 'Mechanism'), ('Category:', 'Category')):
            family_text = _family_value(entry, prefix)
            if family_text is not None:
                this_game[column] = family_text

        # Try is Kickstarted
        if entry.find('link', type='boardgamefamily', value=re.compile("Crowdfunding")) is not None:
            this_game['Kickstarted'] = 1


        ##### Get subcategories #####

        subcategory = {'BGGId': int(game_id)}
        for item in entry.find_all('link', type='boardgamecategory'):
            subcategory[item['value']] = 1


        # collect this game's record and specialty dataframes
        game_records.append(this_game)
        designer_frames.append(create_designers(entry, game_id))
        category_frames.append(create_categories(entry, game_id))
        mechanic_frames.append(create_mechanics(entry, game_id))
        artist_frames.append(create_artists(entry, game_id))
        publisher_frames.append(create_publishers(entry, game_id))
        subcategory_frames.append(pd.DataFrame([subcategory]))

    # build each table once per batch; the games table keeps the standard
    # columns first, followed by any extra columns in order of appearance
    games = pd.DataFrame(game_records)
    extra_columns = [name for name in games.columns if name not in columns]
    games = games.reindex(columns=columns + extra_columns)
    designers = _stack(designer_frames)
    categories = _stack(category_frames)
    mechanics = _stack(mechanic_frames)
    artists = _stack(artist_frames)
    publishers = _stack(publisher_frames)
    subcategories = _stack(subcategory_frames)

    games.to_pickle('data_dirty/games'+suffix_str+'.pkl')
    designers.to_pickle('data_dirty/designers'+suffix_str+'.pkl')
    categories.to_pickle('data_dirty/categories'+suffix_str+'.pkl')
    mechanics.to_pickle('data_dirty/mechanics'+suffix_str+'.pkl')
    artists.to_pickle('data_dirty/artists'+suffix_str+'.pkl')
    publishers.to_pickle('data_dirty/publishers'+suffix_str+'.pkl')
    subcategories.to_pickle('data_dirty/subcategories'+suffix_str+'.pkl')

    print("Finished items in this group")

    print(f'Time: {time.time() - start}\n\n')

print(f'Time: {time.time() - overall_start}\n\n') 

### Data Validation

In [None]:
# Read back every table written for batch 52 so the cells below can show them
(subcategories1, games1, designers1, categories1,
 mechanics1, artists1, publishers1) = (
    pd.read_pickle('data_dirty/' + table + '52.pkl')
    for table in ('subcategories', 'games', 'designers', 'categories',
                  'mechanics', 'artists', 'publishers')
)

In [None]:
# Show the subcategories table read from the batch-52 pickle
subcategories1

In [None]:
# Show the games table read from the batch-52 pickle
games1

In [None]:
# Show the designers table read from the batch-52 pickle
designers1

In [None]:
# Show the categories table read from the batch-52 pickle
categories1

In [None]:
# Show the mechanics table read from the batch-52 pickle
mechanics1

In [None]:
# Show the artists table read from the batch-52 pickle
artists1

In [None]:
# Show the publishers table read from the batch-52 pickle
publishers1

## Combine Files

In [None]:
#games = pd.DataFrame(columns=columns)
#designers = pd.DataFrame(columns=['BGGId'])
#categories = pd.DataFrame(columns=['BGGId'])
#mechanics = pd.DataFrame(columns=['BGGId'])
#artists = pd.DataFrame(columns=['BGGId'])
#publishers = pd.DataFrame(columns=['BGGId'])
#subcategories = pd.DataFrame(columns=['BGGId'])

In [None]:
# Combine step, currently disabled: every statement that would read the
# per-batch pickles and append them is commented out, so as written this loop
# only prints the batch numbers 1 through 353.
for number in range(1, 354):
    print(number)
    
    #this_games = pd.read_pickle('data_dirty/games'+str(number)+'.pkl')
    #this_designers = pd.read_pickle('data_dirty/designers'+str(number)+'.pkl')
    #this_categories = pd.read_pickle('data_dirty/categories'+str(number)+'.pkl')
    #this_mechanics = pd.read_pickle('data_dirty/mechanics'+str(number)+'.pkl')
    #this_artists = pd.read_pickle('data_dirty/artists'+str(number)+'.pkl')
    #this_publishers = pd.read_pickle('data_dirty/publishers'+str(number)+'.pkl')
    #this_subcategories = pd.read_pickle('data_dirty/subcategories'+str(number)+'.pkl')
    
    #games = games.append(this_games)
    #designers = designers.append(this_designers)
    #categories = categories.append(this_categories)
    #mechanics = mechanics.append(this_mechanics)
    #artists = artists.append(this_artists)
    #publishers = publishers.append(this_publishers)
    #subcategories = subcategories.append(this_subcategories)
    
    

In [None]:
#games = games.reset_index(drop=True)
#designers = designers.reset_index(drop=True)
#categories = categories.reset_index(drop=True)
#mechanics = mechanics.reset_index(drop=True)
#artists = artists.reset_index(drop=True)
#publishers = publishers.reset_index(drop=True)
#subcategories = subcategories.reset_index(drop=True)

In [None]:
#games.to_pickle('data_dirty/games.pkl')
#designers.to_pickle('data_dirty/designers.pkl')
#categories.to_pickle('data_dirty/categories.pkl')
#mechanics.to_pickle('data_dirty/mechanics.pkl')
#artists.to_pickle('data_dirty/artists.pkl')
#publishers.to_pickle('data_dirty/publishers.pkl')
#subcategories.to_pickle('data_dirty/subcategories.pkl')

In [None]:
# NOTE(review): a bare `break` outside any loop is a SyntaxError, so this cell
# always fails when executed. Presumably that is a deliberate stop so that
# "Run All" does not continue into the sections below -- confirm.
break

# Ratings and Comments

In [9]:
# Cleaned games table; its BGGId and NumUserRatings columns drive the page counts below
games = pd.read_pickle('data_cleaned/games.pkl')

# Current game ids from the cleaned data directory
game_ids_current = pd.read_pickle('data_cleaned/game_ids_current.pkl')
game_ids = [game_id for game_id in game_ids_current]

In [10]:
# One row per game: its id and how many pages of 100 ratings it has,
# ordered from the most pages to the fewest
ratings_totals = (
    games[['BGGId']]
    .assign(RatingsPages=np.ceil(games['NumUserRatings']/100).astype('int'))
    .sort_values('RatingsPages', ascending=False)
    .reset_index(drop=True)
)

In [11]:
# Show the 500 games with the most ratings pages (the table is sorted in descending order)
ratings_totals[:500]

Unnamed: 0,BGGId,RatingsPages
0,30549,1082
1,822,1080
2,13,1072
3,68448,893
4,36218,812
...,...,...
495,2952,76
496,128671,76
497,200147,76
498,826,76


In [12]:
#pages_for_item = ratings_totals.set_index('BGGId').to_dict(orient='dict')['RatingsPages']

In [13]:
#highest = ratings_totals['RatingsPages'].max()
#highest

1082

In [14]:
#pages_for_item[3]

152

## Create Scraper URLs

In [12]:
# Cut the sorted table into 22 consecutive groups: the first 21 are slices of
# 1000 rows each, the last one takes everything from row 21000 onward
(group1, group2, group3, group4, group5, group6, group7,
 group8, group9, group10, group11, group12, group13, group14,
 group15, group16, group17, group18, group19, group20, group21) = (
    ratings_totals[first_row:first_row + 1000]
    for first_row in range(0, 21000, 1000)
)
group22 = ratings_totals[21000:]

In [13]:
# All 22 groups in order; position n-1 holds group n
groups = [
    group1, group2, group3, group4, group5, group6,
    group7, group8, group9, group10, group11, group12,
    group13, group14, group15, group16, group17, group18,
    group19, group20, group21, group22,
]

In [14]:
def generate_urls(group):
    '''Build the ratings-comment API URLs for one group of games
    
    One URL is produced for every page number from the largest RatingsPages
    value in the group down to 1. The URL for a page lists every game of the
    group that has at least that many pages, in the group's row order, so
    games drop out of the request once their ratings are exhausted.
    
    Inputs:
    group: dataframe with columns 'BGGId' and 'RatingsPages'
    
    Outputs:
    list of URL strings, highest page first; empty list for an empty group'''
    
    urls_list = []
    
    # nothing to request; also avoids taking max() of an empty column
    if len(group) == 0:
        return urls_list
    
    max_pages = int(group['RatingsPages'].max())
    
    for page in range(max_pages, 0, -1):
        
        # games that still have a page with this number; selecting by value
        # (instead of walking an index) cannot run past the last row and
        # cannot list a game twice
        has_page = group['RatingsPages'] >= page
        
        # every id is followed by a comma, which keeps the URL format
        # identical to the URLs generated before
        targets = ''.join(str(game_id)+',' for game_id in group.loc[has_page, 'BGGId'])
        
        # establish path with targets and current page
        path = 'https://www.boardgamegeek.com/xmlapi2/thing?id='+targets+'&ratingcomments=1&page='+str(page)+'&pagesize=100'
        urls_list.append(path)
    
    return urls_list

In [15]:
# URL lists keyed "group1", "group2", ... in the order of `groups`
group_urls = {}

for group_num, group in enumerate(groups, start=1):
    print(group_num)
    group_urls["group"+str(group_num)] = generate_urls(group)

# save the mapping as JSON in the cleaned data directory
with open('data_cleaned/scraper_urls_ratings.json', 'w') as convert_file:
    json.dump(group_urls, convert_file)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22


## Scrape URLs

In [17]:
import scrapy

# Launch one Scrapy crawl per group; iterating the group_urls dict yields its
# keys ("group1", "group2", ...)
for item in group_urls:
    
    # show which group key is about to be crawled
    print(item)
    
    # IPython shell escape: $item is expanded to the current group key and
    # passed to the `bgg` spider as its `group` argument (-a NAME=VALUE)
    # NOTE(review): presumably the spider loads that group's URLs from
    # data_cleaned/scraper_urls_ratings.json -- confirm in the spider source
    !scrapy crawl bgg -a group=$item


group1
group1
group2


2022-01-09 17:12:43 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-09 17:12:43 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-09 17:12:43 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-09 17:12:43 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-09 17:12:43 [scrapy.extensions.telnet] INFO: Telnet Password: b1bd99af720e0e28
2022-01-09 17:12:43 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

2022-01-09 17:14:28 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.boardgamegeek.com/xmlapi2/thing?id=30549,822,13,&ratingcomments=1&page=1048&pagesize=100> (referer: None)
2022-01-09 17:14:28 [bgg] DEBUG: saved test file
2022-01-09 17:14:31 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.boardgamegeek.com/xmlapi2/thing?id=30549,822,13,&ratingcomments=1&page=1047&pagesize=100> (referer: None)
2022-01-09 17:14:31 [bgg] DEBUG: saved test file
2022-01-09 17:14:34 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.boardgamegeek.com/xmlapi2/thing?id=30549,822,13,&ratingcomments=1&page=1046&pagesize=100> (referer: None)
2022-01-09 17:14:34 [bgg] DEBUG: saved test file
2022-01-09 17:14:38 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.boardgamegeek.com/xmlapi2/thing?id=30549,822,13,&ratingcomments=1&page=1045&pagesize=100> (referer: None)
2022-01-09 17:14:38 [bgg] DEBUG: saved test file
2022-01-09 17:14:42 [scrapy.core.engine] DEBUG: Crawled (200) <G

group2group3


2022-01-10 01:31:59 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 01:31:59 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 01:31:59 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 01:31:59 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 01:31:59 [scrapy.extensions.telnet] INFO: Telnet Password: 97baacbaa8e05eb9
2022-01-10 01:31:59 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

 'scheduler/dequeued': 56,
 'scheduler/dequeued/memory': 56,
 'scheduler/enqueued': 56,
 'scheduler/enqueued/memory': 56,
 'start_time': datetime.datetime(2022, 1, 10, 9, 32, 0, 30868)}
2022-01-10 02:01:08 [scrapy.core.engine] INFO: Spider closed (finished)



group3
group4


2022-01-10 02:01:09 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:01:09 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:01:09 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:01:09 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:01:09 [scrapy.extensions.telnet] INFO: Telnet Password: c0ba28f3b09f6c7f
2022-01-10 02:01:09 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

2022-01-10 02:03:23 [scrapy.downloadermiddlewares.retry] DEBUG: Retrying <GET https://www.boardgamegeek.com/xmlapi2/thing?id=30367,223779,40832,3421,58,80006,24037,43152,33468,94480,165838,55829,175307,1501,230408,589,158889,251293,1536,104955,98527,36811,188547,2065,177352,95105,182340,158243,244114,193560,293889,293296,3565,21641,46396,159504,29308,130,420,136056,41066,282954,301255,7682,15045,47055,2476,28181,41569,139991,272739,40508,171905,44,276830,304783,146221,2379,833,280794,3141,35488,171662,11825,238638,279613,264982,262477,31483,259081,165948,40444,312267,40425,204135,185196,232666,13301,172546,302388,111,169427,265683,233262,38548,143147,22192,156138,19301,176361,154906,24304,296100,157820,215312,124380,7480,161226,531,6263,91984,20806,5781,92044,122294,22278,214000,254713,290359,22245,119632,184491,119391,94104,113873,117663,118247,118418,75547,5339,179460,127518,153097,15878,300442,15954,58110,132544,244,207016,158340,20134,228328,11229,63706,9829,299960,66424,269603,208

group4
group5


2022-01-10 02:11:08 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:11:08 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:11:08 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:11:08 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:11:08 [scrapy.extensions.telnet] INFO: Telnet Password: 1f41ae5154c49d42
2022-01-10 02:11:08 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group5group6


2022-01-10 02:12:16 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:12:16 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:12:16 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:12:16 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:12:16 [scrapy.extensions.telnet] INFO: Telnet Password: dc9626b1223d0e54
2022-01-10 02:12:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc


group6
group7


2022-01-10 02:13:58 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:13:58 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:13:58 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:13:58 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:13:58 [scrapy.extensions.telnet] INFO: Telnet Password: a27a8526033c644d
2022-01-10 02:13:58 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group7
group8


2022-01-10 02:15:19 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:15:19 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:15:19 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:15:19 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:15:19 [scrapy.extensions.telnet] INFO: Telnet Password: b03ccb7277300ea6
2022-01-10 02:15:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group8group9


2022-01-10 02:17:42 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:17:42 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:17:42 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:17:42 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:17:42 [scrapy.extensions.telnet] INFO: Telnet Password: 8558186c672c69c7
2022-01-10 02:17:42 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc


group9
group10


2022-01-10 02:19:14 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:19:14 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:19:14 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:19:14 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:19:14 [scrapy.extensions.telnet] INFO: Telnet Password: 8ae20c11451e6c86
2022-01-10 02:19:14 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group10group11



2022-01-10 02:20:18 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:20:18 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:20:18 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:20:18 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:20:18 [scrapy.extensions.telnet] INFO: Telnet Password: 7c3de6afb249d335
2022-01-10 02:20:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group11
group12


2022-01-10 02:21:16 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:21:16 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:21:16 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:21:16 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:21:16 [scrapy.extensions.telnet] INFO: Telnet Password: 7c1d435b71289c34
2022-01-10 02:21:16 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group12
group13


2022-01-10 02:23:06 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:23:06 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:23:06 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:23:06 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:23:06 [scrapy.extensions.telnet] INFO: Telnet Password: d0625e2cfb70c6f2
2022-01-10 02:23:06 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group13
group14


2022-01-10 02:23:58 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:23:58 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:23:58 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:23:58 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:23:58 [scrapy.extensions.telnet] INFO: Telnet Password: fdbb460bbc05dfb5
2022-01-10 02:23:58 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group14
group15


2022-01-10 02:24:58 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:24:58 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:24:58 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:24:58 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:24:58 [scrapy.extensions.telnet] INFO: Telnet Password: 61e09d9ea19b7b37
2022-01-10 02:24:58 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group15
group16


2022-01-10 02:25:58 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:25:58 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:25:58 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:25:58 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:25:58 [scrapy.extensions.telnet] INFO: Telnet Password: cf8a8dc23e1e0a68
2022-01-10 02:25:58 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group16
group17


2022-01-10 02:26:04 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:26:04 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:26:04 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:26:04 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:26:04 [scrapy.extensions.telnet] INFO: Telnet Password: 117a10da0fcca366
2022-01-10 02:26:04 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group17
group18


2022-01-10 02:26:15 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:26:15 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:26:15 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:26:15 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:26:15 [scrapy.extensions.telnet] INFO: Telnet Password: cf2027590772e777
2022-01-10 02:26:15 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group18
group19


2022-01-10 02:26:30 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:26:30 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:26:30 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:26:30 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:26:30 [scrapy.extensions.telnet] INFO: Telnet Password: 718b593ac551aad3
2022-01-10 02:26:30 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group19
group20


2022-01-10 02:26:41 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:26:41 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:26:41 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:26:41 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:26:41 [scrapy.extensions.telnet] INFO: Telnet Password: 586b7c0b2cbe26ff
2022-01-10 02:26:41 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group20
group21


2022-01-10 02:26:49 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:26:49 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:26:49 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:26:49 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:26:49 [scrapy.extensions.telnet] INFO: Telnet Password: 48ddd710c8b8667a
2022-01-10 02:26:49 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

group21group22


2022-01-10 02:26:57 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:26:57 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:26:57 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:26:57 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:26:57 [scrapy.extensions.telnet] INFO: Telnet Password: d47bda8de1e2f258
2022-01-10 02:26:57 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc


group22


2022-01-10 02:27:02 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: bggscraper)
2022-01-10 02:27:02 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.12, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.7.0, Python 3.7.10 | packaged by conda-forge | (default, Oct 13 2021, 20:22:05) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 21.0.0 (OpenSSL 1.1.1l  24 Aug 2021), cryptography 35.0.0, Platform Windows-10-10.0.19041-SP0
2022-01-10 02:27:02 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2022-01-10 02:27:02 [scrapy.crawler] INFO: Overridden settings:
{'AUTOTHROTTLE_ENABLED': True,
 'BOT_NAME': 'bggscraper',
 'COOKIES_ENABLED': False,
 'DOWNLOAD_DELAY': 2,
 'NEWSPIDER_MODULE': 'bggscraper.spiders',
 'SPIDER_MODULES': ['bggscraper.spiders']}
2022-01-10 02:27:02 [scrapy.extensions.telnet] INFO: Telnet Password: 44626fc642b9a59a
2022-01-10 02:27:02 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'sc

In [114]:
# Scrape user ratings/comments for every group of URLs in `group_urls`.
#
# For each group: every URL is loaded in a fresh headless Chrome session,
# each <item> element on the page is parsed with BeautifulSoup, and one row per
# <comment> (BGGId, Name, Rating, Value, Username) is collected. The rows for
# the whole group are written to
# 'data_dirty/pulled_ratings/ratings_pull<group>.pkl'.

# Set up Selenium drivers
options = webdriver.ChromeOptions() # set up chrome options
options.add_argument("--headless") # run Chrome without a visible window

# column order of the ratings table written for each group
rating_columns = ['BGGId', 'Name', 'Rating', 'Value', 'Username']

for group in group_urls:
    
    appendation = str(group)
    
    # per-game frames for this group; concatenated ONCE after the URL loop
    # (growing a DataFrame with .append() per game is quadratic, and
    # DataFrame.append no longer exists in pandas 2.x)
    group_frames = []
    
    # for each page 
    for url in group_urls[group]:
        
        start = time.time()
        
        path = url
        print(path)
        
        ##### API Call Section #####
        driver = webdriver.Chrome(options=options)# initiate chrome driver with options
        
        # try/finally guarantees the browser AND its chromedriver process are
        # shut down even if loading or parsing raises (or the cell is interrupted)
        try:
            print("Waiting for page to load")
            
            # keep retrying until the page renders; each attempt waits up to 60s
            while True:
                try:
                    driver.get(path)# get path
                    
                    # wait until the driver finds the element that we need
                    WebDriverWait(driver, 60).until(
                        EC.presence_of_element_located((By.CLASS_NAME, 'opened')))
                    print("Page loaded. Processing.")
                    break
                
                # `Exception` (not a bare except) so that KeyboardInterrupt can
                # still break out of this otherwise unbounded retry loop
                except Exception:
                    print("Item did not appear, retrying")
            
            
            ##### Process Each Game #####
            
            # By-based locator: works on Selenium 3 and 4
            # (find_elements_by_tag_name was removed in Selenium 4.3)
            game_entries = driver.find_elements(By.TAG_NAME, 'item')
            
            # for each game found on the page:
            for entry in game_entries:
                
                elementHTML = entry.get_attribute('outerHTML') #gives exact HTML content of the element
                elementSoup = BeautifulSoup(elementHTML,'html.parser')
                
                # get the item id and item name
                item_id = int(elementSoup.find('item')['id'])
                item_name = elementSoup.find('name')['value']
                
                # get the list of ratings
                list_of_ratings = elementSoup.find_all('comment')
                
                # nothing to record for this game on this page; skipping it also
                # keeps zero-row frames from influencing dtypes in the concat
                if not list_of_ratings:
                    continue
                
                # one row per <comment>; the per-game default index (0..k-1)
                # is kept, as it was when frames were appended one by one
                this_game_ratings = pd.DataFrame(
                    {
                        'BGGId': [item_id] * len(list_of_ratings),
                        'Name': [item_name] * len(list_of_ratings),
                        'Rating': [rating['rating'] for rating in list_of_ratings],
                        'Value': [rating['value'] for rating in list_of_ratings],
                        # leading/trailing underscores are stripped from usernames
                        'Username': [rating['username'].strip('_') for rating in list_of_ratings],
                    },
                    columns=rating_columns,
                )
                
                group_frames.append(this_game_ratings)
        
        finally:
            # quit() ends the whole WebDriver session; close() only closes the window
            driver.quit()
        
        end = time.time()
        print(str(end-start)+" seconds elapsed")    
        
        # pause between requests
        time.sleep(2)
    
    # build the group's table in a single concat; fall back to an empty table
    # with the expected columns when no ratings were found
    if group_frames:
        user_ratings = pd.concat(group_frames)
    else:
        user_ratings = pd.DataFrame(columns=rating_columns)
    
    user_ratings.to_pickle('data_dirty/pulled_ratings/ratings_pull'+appendation+'.pkl')


    
    

https://www.boardgamegeek.com/xmlapi2/thing?id=5457,198609,283619,192120,118402,178134,121193,5641,8229,1511,404,176334,424,251890,217083,136587,28218,253719,25420,231197,3128,1855,318182,864,279869,205127,122588,14017,21704,207753,37141,161546,141791,13172,1549,288513,4583,38872,4424,384,2472,94389,5770,161547,277,59149,194100,96792,241491,210232,11017,9342,329841,273703,2259,160958,13511,122891,129976,1705,285533,186265,148205,55601,55952,233868,1245,3312,24770,172540,3318,3230,306881,300936,33150,21804,114903,229741,3208,59429,22237,122240,204837,260757,169649,4378,123576,164865,1017,32674,36888,26884,2652,129556,223931,142131,192547,147623,213492,173018,269160,149119,307002,230200,168433,182116,2228,161617,209660,39217,3321,346703,262114,99392,115233,282414,&ratingcomments=1&page=10&pagesize=100
Waiting for page to load
Page loaded. Processing.
7.248584985733032 seconds elapsed
https://www.boardgamegeek.com/xmlapi2/thing?id=5457,198609,283619,192120,118402,178134,121193,5641,8229,1

Waiting for page to load
Page loaded. Processing.
112.32217669487 seconds elapsed
https://www.boardgamegeek.com/xmlapi2/thing?id=5457,198609,283619,192120,118402,178134,121193,5641,8229,1511,404,176334,424,251890,217083,136587,28218,253719,25420,231197,3128,1855,318182,864,279869,205127,122588,14017,21704,207753,37141,161546,141791,13172,1549,288513,4583,38872,4424,384,2472,94389,5770,161547,277,59149,194100,96792,241491,210232,11017,9342,329841,273703,2259,160958,13511,122891,129976,1705,285533,186265,148205,55601,55952,233868,1245,3312,24770,172540,3318,3230,306881,300936,33150,21804,114903,229741,3208,59429,22237,122240,204837,260757,169649,4378,123576,164865,1017,32674,36888,26884,2652,129556,223931,142131,192547,147623,213492,173018,269160,149119,307002,230200,168433,182116,2228,161617,209660,39217,3321,346703,262114,99392,115233,282414,271615,5419,252877,317457,102435,109932,33643,133956,252399,22347,1489,103132,285826,152765,109764,326494,236248,35285,5942,38391,134711,279,371,2

Waiting for page to load
Item did not appear, retrying
Page loaded. Processing.
197.84216117858887 seconds elapsed
https://www.boardgamegeek.com/xmlapi2/thing?id=5457,198609,283619,192120,118402,178134,121193,5641,8229,1511,404,176334,424,251890,217083,136587,28218,253719,25420,231197,3128,1855,318182,864,279869,205127,122588,14017,21704,207753,37141,161546,141791,13172,1549,288513,4583,38872,4424,384,2472,94389,5770,161547,277,59149,194100,96792,241491,210232,11017,9342,329841,273703,2259,160958,13511,122891,129976,1705,285533,186265,148205,55601,55952,233868,1245,3312,24770,172540,3318,3230,306881,300936,33150,21804,114903,229741,3208,59429,22237,122240,204837,260757,169649,4378,123576,164865,1017,32674,36888,26884,2652,129556,223931,142131,192547,147623,213492,173018,269160,149119,307002,230200,168433,182116,2228,161617,209660,39217,3321,346703,262114,99392,115233,282414,271615,5419,252877,317457,102435,109932,33643,133956,252399,22347,1489,103132,285826,152765,109764,326494,236248,

Waiting for page to load
Item did not appear, retrying
Page loaded. Processing.
209.50228333473206 seconds elapsed
https://www.boardgamegeek.com/xmlapi2/thing?id=5457,198609,283619,192120,118402,178134,121193,5641,8229,1511,404,176334,424,251890,217083,136587,28218,253719,25420,231197,3128,1855,318182,864,279869,205127,122588,14017,21704,207753,37141,161546,141791,13172,1549,288513,4583,38872,4424,384,2472,94389,5770,161547,277,59149,194100,96792,241491,210232,11017,9342,329841,273703,2259,160958,13511,122891,129976,1705,285533,186265,148205,55601,55952,233868,1245,3312,24770,172540,3318,3230,306881,300936,33150,21804,114903,229741,3208,59429,22237,122240,204837,260757,169649,4378,123576,164865,1017,32674,36888,26884,2652,129556,223931,142131,192547,147623,213492,173018,269160,149119,307002,230200,168433,182116,2228,161617,209660,39217,3321,346703,262114,99392,115233,282414,271615,5419,252877,317457,102435,109932,33643,133956,252399,22347,1489,103132,285826,152765,109764,326494,236248,

Waiting for page to load
Item did not appear, retrying
Page loaded. Processing.
232.59868621826172 seconds elapsed


# Appendix

## Data Validation

In [None]:
# Reload every table previously dumped to data_dirty/ so it can be inspected.
# The files are read in the order listed and bound to the matching names.
(
    games,
    designers,
    categories,
    mechanics,
    artists,
    publishers,
    subcategories,
) = map(
    pd.read_pickle,
    (
        'data_dirty/games.pkl',
        'data_dirty/designers.pkl',
        'data_dirty/categories.pkl',
        'data_dirty/mechanics.pkl',
        'data_dirty/artists.pkl',
        'data_dirty/publishers.pkl',
        'data_dirty/subcategories.pkl',
    ),
)

In [None]:
# Display the games table (row output is capped by the pandas display.max_rows option set in the import cell)
games

In [None]:
# Spot-check the last rows of the designers table
designers.tail()

In [None]:
# Spot-check the last rows of the categories table
categories.tail()

In [None]:
# Spot-check the last rows of the mechanics table
mechanics.tail()

In [None]:
# Spot-check the last rows of the artists table
artists.tail()

In [None]:
# Spot-check the last rows of the publishers table
publishers.tail()

In [None]:
# Spot-check the last rows of the subcategories table
subcategories.tail()

## Get game ids

In [None]:
# Pull the 'BGGId' column out of the games table loaded from data_dirty/games.pkl
games_ids_current = games['BGGId']
# Bare expression so the notebook displays the resulting column
games_ids_current

In [None]:
# Persist the id column to the dirty directory.
# NOTE(review): unlike the other pickles read from data_dirty/ in this section,
# this path has no '.pkl' extension — confirm what downstream readers expect
# before renaming it.
games_ids_current.to_pickle('data_dirty/game_ids_current')