# Notebook Objective and Setup

BGG01 acquires game data from BoardGameGeek. Most of it is retrieved through XML API calls, with some dynamic content scraped from the site. Files are dumped to a "dirty" directory.

## Package Imports

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from bs4 import BeautifulSoup
import requests
import regex as re
import time
import json
import os

# ignore warnings (gets rid of Pandas copy warnings)
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 30)

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os


## Functions

In [10]:
def create_designers(game_page, game_id):
    '''Create a one-row DataFrame flagging the designers of one game

    Inputs:
    game_page: parsed element for the game; it is searched for
        <link type="boardgamedesigner"> tags
    game_id: id for this game (anything int() accepts)

    Outputs:
    dataframe with a single row: 'BGGId' plus one column per designer
    (named after the link's 'value' attribute) set to 1'''

    # start the row with the game id, then flag every designer found
    design = {'BGGId': int(game_id)}
    for item in game_page.find_all('link', type='boardgamedesigner'):
        design[item['value']] = 1

    # DataFrame.append was removed in pandas 2.0, so build the row directly
    return pd.DataFrame([design])

In [11]:
def create_categories(game_page, game_id):
    '''Create a one-row DataFrame flagging the categories of one game

    Inputs:
    game_page: parsed element for the game; it is searched for
        <link type="boardgamecategory"> tags
    game_id: id for this game (anything int() accepts)

    Outputs:
    dataframe with a single row: 'BGGId' plus one column per category
    (named after the link's 'value' attribute) set to 1'''

    # start the row with the game id, then flag every category found
    category = {'BGGId': int(game_id)}
    for item in game_page.find_all('link', type='boardgamecategory'):
        category[item['value']] = 1

    # DataFrame.append was removed in pandas 2.0, so build the row directly
    return pd.DataFrame([category])

In [12]:
def create_mechanics(game_page, game_id):
    '''Create a one-row DataFrame flagging the mechanics of one game

    Inputs:
    game_page: parsed element for the game; it is searched for
        <link type="boardgamemechanic"> tags and for two specific
        <link type="boardgamefamily"> values
    game_id: id for this game (anything int() accepts)

    Outputs:
    dataframe with a single row: 'BGGId' plus one column per mechanic
    (named after the link's 'value' attribute) set to 1, plus
    'TableauBuilding' / 'Legacy' when the matching family link exists'''

    # start the row with the game id, then flag every mechanic found
    mechanic = {'BGGId': int(game_id)}
    for item in game_page.find_all('link', type='boardgamemechanic'):
        mechanic[item['value']] = 1

    # two mechanisms are listed as families rather than mechanics;
    # find() returns None when the link is absent, so test for that
    # explicitly instead of swallowing every exception
    family_flags = (
        ("Mechanism: Tableau Building", 'TableauBuilding'),
        ("Mechanism: Legacy", 'Legacy'),
    )
    for family_value, column in family_flags:
        if game_page.find('link', type='boardgamefamily', value=family_value) is not None:
            mechanic[column] = 1

    # DataFrame.append was removed in pandas 2.0, so build the row directly
    return pd.DataFrame([mechanic])

In [13]:
def create_artists(game_page, game_id):
    '''Create a one-row DataFrame flagging the artists of one game

    Inputs:
    game_page: parsed element for the game; it is searched for
        <link type="boardgameartist"> tags
    game_id: id for this game (anything int() accepts)

    Outputs:
    dataframe with a single row: 'BGGId' plus one column per artist
    (named after the link's 'value' attribute) set to 1'''

    # start the row with the game id, then flag every artist found
    artist = {'BGGId': int(game_id)}
    for item in game_page.find_all('link', type='boardgameartist'):
        artist[item['value']] = 1

    # DataFrame.append was removed in pandas 2.0, so build the row directly
    return pd.DataFrame([artist])

In [14]:
def create_publishers(game_page, game_id):
    '''Create a one-row DataFrame flagging the publishers of one game

    Inputs:
    game_page: parsed element for the game; it is searched for
        <link type="boardgamepublisher"> tags
    game_id: id for this game (anything int() accepts)

    Outputs:
    dataframe with a single row: 'BGGId' plus one column per publisher
    (named after the link's 'value' attribute) set to 1'''

    # start the row with the game id, then flag every publisher found
    publisher = {'BGGId': int(game_id)}
    for item in game_page.find_all('link', type='boardgamepublisher'):
        publisher[item['value']] = 1

    # DataFrame.append was removed in pandas 2.0, so build the row directly
    return pd.DataFrame([publisher])

In [15]:
def create_awards(awards_level, game_id):
    '''Create a one-row DataFrame flagging the awards of one game

    Inputs:
    awards_level: parsed element holding the awards; it is searched for
        <a class="ng-binding"> tags
    game_id: id for this game (anything int() accepts)

    Outputs:
    dataframe with a single row: 'BGGId' plus one column per award set
    to 1. The column name is the link text with every digit removed and
    surrounding spaces stripped.'''

    # start the row with the game id
    award = {'BGGId': int(game_id)}

    # flag each award under its digit-free, space-trimmed name
    for link in awards_level.find_all('a', class_='ng-binding'):
        award_name = re.sub("[0-9]", "", link.text).strip(' ')
        award[award_name] = 1

    # DataFrame.append was removed in pandas 2.0, so build the row directly
    return pd.DataFrame([award])

In [16]:
def create_ratings_dist(stats_page, game_id):
    '''Create a one-row DataFrame with the ratings distribution of one game

    Inputs:
    stats_page: parsed stats page; it must contain a
        <ratings-stats-graph> element
    game_id: id for this game (anything int() accepts)

    Outputs:
    dataframe with a single row: 'BGGId' plus columns '1'..'10' holding
    the raw text of the graph's <text> elements at positions 10..19

    Raises AttributeError when the graph element is missing and
    IndexError when it holds fewer than 20 <text> elements.
    '''

    # locate the ratings graph and all of its text labels
    all_ratings = stats_page.find('ratings-stats-graph')
    next_ratings = all_ratings.find_all('text')

    # start the row with the game id
    rating = {'BGGId': int(game_id)}

    # rating N (1..10) is read from text element N + 9
    # NOTE(review): presumably the first ten <text> elements are axis
    # labels and the next ten are the per-rating counts; confirm.
    for score in range(1, 11):
        rating[str(score)] = next_ratings[score + 9].text

    # DataFrame.append was removed in pandas 2.0, so build the row directly
    return pd.DataFrame([rating])

# Game Scraping

In [9]:
# set up our columns list
# Base column order for the per-batch games table, grouped by topic.
columns = [
    # identity
    'BGGId', 'Name', 'Description', 'YearPublished',
    # weight and rating statistics
    'GameWeight', 'AvgRating', 'BayesAvgRating', 'StdDev',
    # player counts and community polls
    'MinPlayers', 'MaxPlayers', 'ComAgeRec', 'LanguageEase',
    'BestPlayers', 'GoodPlayers',
    # ownership and vote counts
    'NumOwned', 'NumWant', 'NumWish', 'NumWeightVotes',
    # playtime and age
    'MfgPlaytime', 'ComMinPlaytime', 'ComMaxPlaytime', 'MfgAgeRec',
    # activity counts
    'NumUserRatings', 'NumComments', 'NumAlternates', 'NumExpansions',
    'NumImplementations', 'IsReimplementation',
    # family-derived descriptors
    'Family', 'Theme', 'Category', 'Kickstarted',
    # media
    'ImagePath',
]

Last game id: 349161

In [None]:
# Load the pickled collection of game ids and materialise it as a list so it can be sliced into batches.
# NOTE(review): assumes iterating the pickled object yields game ids (e.g. a Series or list) — confirm.
game_ids_current = pd.read_pickle('data_dirty_new_scraper/game_ids_current')
game_ids = list(game_ids_current)

In [None]:
# ---- Batch configuration ----
# start_position / end_position / file_suffix can be edited to resume a
# partially completed run; each batch writes one set of pickle files.
BATCH_SIZE = 1000               # game ids requested per API call
MIN_USER_RATINGS = 30           # games with fewer user ratings are skipped
MAX_YEAR_PUBLISHED = 2021       # games published after this year are skipped
MIN_PLAYER_POLL_VOTES = 30      # player-count poll is only used above this many votes

start_position = 0
end_position = start_position + BATCH_SIZE
file_suffix = 0

# exceptions that mean "tag/attribute missing or not parseable"
_PARSE_ERRORS = (AttributeError, KeyError, TypeError, ValueError)


def _int_value(entry, tag_name):
    '''Return int(<tag_name value="...">) for the first match, or None if missing/unparseable.'''
    try:
        return int(entry.find(tag_name)['value'])
    except _PARSE_ERRORS:
        return None


def _weighted_poll_average(entry, poll_title, weight_of):
    '''Return the vote-weighted mean of a poll, or None if it is missing, malformed or has no votes.'''
    try:
        total, votes = 0, 0
        for result in entry.find('poll', title=poll_title).find_all('result'):
            num_votes = int(result['numvotes'])
            total += num_votes * weight_of(result)
            votes += num_votes
    except _PARSE_ERRORS:
        return None
    return total / votes if votes > 0 else None


def _player_count_summary(entry):
    '''Return (best_players, good_players) from the "User Suggested Number of Players" poll.'''
    try:
        poll = entry.find('poll', title="User Suggested Number of Players")
        total_votes = int(poll['totalvotes'])
        if total_votes <= MIN_PLAYER_POLL_VOTES:
            return None, []

        best_players, best_score, good_players = 0, 0, []
        for option in poll.find_all('results'):
            best = int(option.find('result', value='Best')['numvotes'])
            rec = int(option.find('result', value='Recommended')['numvotes'])
            score = best * 2 + rec
            if score > best_score:  # player count with the highest weighted score
                best_players, best_score = option['numplayers'], score
            if (best + rec) / total_votes > .5:  # over half of voters approve
                good_players.append(option['numplayers'])
        return best_players, good_players
    except _PARSE_ERRORS:
        # always return fresh values so nothing leaks over from the previous game
        return None, []


def _family_value(entry, prefix):
    '''Return the first boardgamefamily value matching prefix, with the prefix removed, or None.'''
    link = entry.find('link', type='boardgamefamily', value=re.compile(prefix))
    if link is None:
        return None
    value = link['value']
    # str.strip(prefix) would remove a *set of characters* from both ends
    # (e.g. "Theme: Anime" -> "Ani"), so cut the literal prefix instead
    if value.startswith(prefix):
        value = value[len(prefix):]
    return value.strip(' ')


def _stack_frames(frames, base_columns):
    '''Concatenate one-row frames once, keeping base_columns first and extra columns after them.'''
    if not frames:
        return pd.DataFrame(columns=base_columns)
    stacked = pd.concat(frames, ignore_index=True, sort=False)
    extras = [col for col in stacked.columns if col not in base_columns]
    return stacked.reindex(columns=list(base_columns) + extras)


overall_start = time.time()
# loop on start_position so the final, shorter batch is scraped too
while start_position < len(game_ids):

    # per-batch accumulators; concatenated once at the end of the batch
    game_frames, designer_frames, category_frames = [], [], []
    mechanic_frames, artist_frames, publisher_frames = [], [], []
    subcategory_frames = []
    comments = pd.DataFrame(columns=['BGGId'])  # not filled in this cell; kept from the original

    ##### File Setup Section #####

    # increment file suffix and get it as a string
    file_suffix += 1
    suffix_str = str(file_suffix)

    # print start and end positions (end clamped for the last batch)
    print("Getting items "+str(start_position+1)+' through '+str(min(end_position, len(game_ids))))

    # get list of game ids to grab and join them into the id= parameter
    grab_list = game_ids[start_position:end_position]
    targets = ','.join(str(item) for item in grab_list)

    # log start time for information retrieval
    start = time.time()

    ##### API Call Section #####

    # Set up Selenium driver
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    time.sleep(1)  # wait 1 second between calls
    # establish path with targets
    path = 'https://www.boardgamegeek.com/xmlapi2/thing?id='+targets+'&stats=1&type=boardgame'#&comments=1'&ratingcomments=1&page=1&pagesize=100
    driver = webdriver.Chrome(options=options)
    try:
        print("New page retrieval. May be waiting for load.")
        driver.get(path)
        # wait until the driver finds the element that we need
        WebDriverWait(driver, 180).until(EC.presence_of_all_elements_located((By.ID, 'folder0')))
        game_page = BeautifulSoup(driver.page_source)  # parse page with beautifulsoup
    finally:
        # always release the browser; one was previously leaked per batch
        driver.quit()

    # make entry for each game item on page
    game_entries = game_page.find_all('item')

    print("Items loaded. Processing.")

    ##### Process Each Game #####

    for entry in game_entries:

        # check that this game has sufficient user ratings to include
        user_ratings = _int_value(entry, 'usersrated')
        if user_ratings is None or user_ratings < MIN_USER_RATINGS:
            continue

        # get game name and BGG ID
        game_name = entry.find('name', type='primary')['value']
        game_id = entry['id']

        ##### Get Basic Stats #####

        description = entry.find('description').text  # description text of the game

        # year published; None when missing so the previous game's year is never reused
        year_pub = _int_value(entry, 'yearpublished')
        if year_pub is not None and year_pub > MAX_YEAR_PUBLISHED:
            continue

        image_tag = entry.find('image')
        image_path = image_tag.text if image_tag is not None else None

        # reimplementation flag: 1 when an inbound implementation link exists
        reimplementation = entry.find('link', type="boardgameimplementation", inbound="true")
        reimplements = 1 if reimplementation else 0

        ##### Basic stats requiring some compaction/refinement #####

        # community recommended age: vote-weighted mean of the first two characters of each option
        comm_age = _weighted_poll_average(
            entry, "User Suggested Player Age", lambda result: int(result['value'][:2]))
        # language ease: vote-weighted mean of the option level
        lang_ease = _weighted_poll_average(
            entry, "Language Dependence", lambda result: int(result['level']))
        # best and good player counts
        best_players, good_players = _player_count_summary(entry)

        # one record per game; the stats below fail loudly if their tag is missing
        record = {
            'BGGId': int(game_id),
            'Name': game_name,
            'Description': description,
            'YearPublished': year_pub,
            'GameWeight': float(entry.find('averageweight')['value']),    # voted game weight
            'AvgRating': float(entry.find('average')['value']),           # average rating
            'BayesAvgRating': float(entry.find('bayesaverage')['value']), # bayes average rating
            'StdDev': float(entry.find('stddev')['value']),               # standard deviation of rating
            'MinPlayers': _int_value(entry, 'minplayers'),
            'MaxPlayers': _int_value(entry, 'maxplayers'),
            'ComAgeRec': comm_age,
            'LanguageEase': lang_ease,
            'BestPlayers': best_players,
            'GoodPlayers': good_players,
            'NumOwned': int(entry.find('owned')['value']),                # num of people own this game
            'NumWant': int(entry.find('wanting')['value']),               # num of people want this game
            'NumWish': int(entry.find('wishing')['value']),               # num of people with game on wishlist
            'NumWeightVotes': int(entry.find('numweights')['value']),     # num of votes for game weight
            'MfgPlaytime': _int_value(entry, 'playingtime'),              # mfg stated playtime
            'ComMinPlaytime': _int_value(entry, 'minplaytime'),           # community min playtime
            'ComMaxPlaytime': _int_value(entry, 'maxplaytime'),           # community max playtime
            'MfgAgeRec': _int_value(entry, 'minage'),                     # mfg min age
            'NumUserRatings': user_ratings,
            'NumAlternates': len(entry.find_all('name', type='alternate')),
            'NumExpansions': len(entry.find_all('link', type='boardgameexpansion')),
            'NumImplementations': len(entry.find_all('link', type='boardgameimplementation')),
            'IsReimplementation': reimplements,
            'ImagePath': image_path,
        }

        # Add game ranks; a non-numeric rank is skipped without dropping the ranks after it
        for item in entry.find_all('rank'):
            try:
                record['Rank:'+item['name']] = float(item['value'])
            except _PARSE_ERRORS:
                continue

        # Try to add components
        # NOTE(review): this reads the link's 'name' attribute, while every other
        # link in this cell is read through 'value'; if 'name' is absent nothing
        # is added here. Confirm the intended key.
        for item in entry.find_all('link', type='boardgamefamily', value=re.compile("Component")):
            try:
                record['Components:'+item['name']] = item['value']
            except KeyError:
                continue

        # Family-derived descriptors; a later match on the same column overrides
        # an earlier one ("Series:" wins over "Game:" for Family)
        family_columns = (
            ("Game:", 'Family'),
            ("Series:", 'Family'),
            ("Setting:", 'Setting'),
            ("Theme:", 'Theme'),
            ("Mechanism:", 'Mechanism'),
            ("Category:", 'Category'),
        )
        for prefix, column in family_columns:
            family_value = _family_value(entry, prefix)
            if family_value is not None:
                record[column] = family_value

        # Kickstarted flag: any family value mentioning crowdfunding
        if entry.find('link', type='boardgamefamily', value=re.compile("Crowdfunding")) is not None:
            record['Kickstarted'] = 1

        ##### Get subcategories #####

        subcategory = {'BGGId': int(game_id)}
        for item in entry.find_all('link', type='boardgamecategory'):
            subcategory[item['value']] = 1

        # collect this game's rows; specialty tables come from the helper functions
        game_frames.append(pd.DataFrame([record]))
        designer_frames.append(create_designers(entry, game_id))
        category_frames.append(create_categories(entry, game_id))
        mechanic_frames.append(create_mechanics(entry, game_id))
        artist_frames.append(create_artists(entry, game_id))
        publisher_frames.append(create_publishers(entry, game_id))
        subcategory_frames.append(pd.DataFrame([subcategory]))

    # build each batch table with a single concat instead of growing it row by row
    games = _stack_frames(game_frames, columns)
    designers = _stack_frames(designer_frames, ['BGGId'])
    categories = _stack_frames(category_frames, ['BGGId'])
    mechanics = _stack_frames(mechanic_frames, ['BGGId'])
    artists = _stack_frames(artist_frames, ['BGGId'])
    publishers = _stack_frames(publisher_frames, ['BGGId'])
    subcategories = _stack_frames(subcategory_frames, ['BGGId'])

    games.to_pickle('data_dirty_new_scraper/games'+suffix_str+'.pkl')
    designers.to_pickle('data_dirty_new_scraper/designers'+suffix_str+'.pkl')
    categories.to_pickle('data_dirty_new_scraper/categories'+suffix_str+'.pkl')
    mechanics.to_pickle('data_dirty_new_scraper/mechanics'+suffix_str+'.pkl')
    artists.to_pickle('data_dirty_new_scraper/artists'+suffix_str+'.pkl')
    publishers.to_pickle('data_dirty_new_scraper/publishers'+suffix_str+'.pkl')
    subcategories.to_pickle('data_dirty_new_scraper/subcategories'+suffix_str+'.pkl')

    print("Finished items in this group")

    print(f'Time: {time.time() - start}\n\n')

    start_position += BATCH_SIZE
    end_position += BATCH_SIZE

print(f'Time: {time.time() - overall_start}\n\n')

### Data Validation

In [None]:
# Reload every table written for batch 52 so each one can be inspected below.
(
    subcategories1,
    games1,
    designers1,
    categories1,
    mechanics1,
    artists1,
    publishers1,
) = (
    pd.read_pickle('data_dirty_new_scraper/' + table_name + '52.pkl')
    for table_name in (
        'subcategories',
        'games',
        'designers',
        'categories',
        'mechanics',
        'artists',
        'publishers',
    )
)

In [None]:
# Display the subcategories table loaded from batch file 52.
subcategories1

In [None]:
# Display the games table loaded from batch file 52.
games1

In [None]:
# Display the designers table loaded from batch file 52.
designers1

In [None]:
# Display the categories table loaded from batch file 52.
categories1

In [None]:
# Display the mechanics table loaded from batch file 52.
mechanics1

In [None]:
# Display the artists table loaded from batch file 52.
artists1

In [None]:
# Display the publishers table loaded from batch file 52.
publishers1

## Combine Files

In [None]:
#games = pd.DataFrame(columns=columns)
#designers = pd.DataFrame(columns=['BGGId'])
#categories = pd.DataFrame(columns=['BGGId'])
#mechanics = pd.DataFrame(columns=['BGGId'])
#artists = pd.DataFrame(columns=['BGGId'])
#publishers = pd.DataFrame(columns=['BGGId'])
#subcategories = pd.DataFrame(columns=['BGGId'])

In [None]:
# Walk batch numbers 1..353. The per-batch reads and appends below are
# commented out, so as written this loop only prints each batch number.
for number in range(1, 354):
    print(number)
    
    #this_games = pd.read_pickle('data_dirty_new_scraper/games'+str(number)+'.pkl')
    #this_designers = pd.read_pickle('data_dirty_new_scraper/designers'+str(number)+'.pkl')
    #this_categories = pd.read_pickle('data_dirty_new_scraper/categories'+str(number)+'.pkl')
    #this_mechanics = pd.read_pickle('data_dirty_new_scraper/mechanics'+str(number)+'.pkl')
    #this_artists = pd.read_pickle('data_dirty_new_scraper/artists'+str(number)+'.pkl')
    #this_publishers = pd.read_pickle('data_dirty_new_scraper/publishers'+str(number)+'.pkl')
    #this_subcategories = pd.read_pickle('data_dirty_new_scraper/subcategories'+str(number)+'.pkl')
    
    #games = games.append(this_games)
    #designers = designers.append(this_designers)
    #categories = categories.append(this_categories)
    #mechanics = mechanics.append(this_mechanics)
    #artists = artists.append(this_artists)
    #publishers = publishers.append(this_publishers)
    #subcategories = subcategories.append(this_subcategories)
    
    

In [None]:
#games = games.reset_index(drop=True)
#designers = designers.reset_index(drop=True)
#categories = categories.reset_index(drop=True)
#mechanics = mechanics.reset_index(drop=True)
#artists = artists.reset_index(drop=True)
#publishers = publishers.reset_index(drop=True)
#subcategories = subcategories.reset_index(drop=True)

In [None]:
#games.to_pickle('data_dirty_new_scraper/games.pkl')
#designers.to_pickle('data_dirty_new_scraper/designers.pkl')
#categories.to_pickle('data_dirty_new_scraper/categories.pkl')
#mechanics.to_pickle('data_dirty_new_scraper/mechanics.pkl')
#artists.to_pickle('data_dirty_new_scraper/artists.pkl')
#publishers.to_pickle('data_dirty_new_scraper/publishers.pkl')
#subcategories.to_pickle('data_dirty_new_scraper/subcategories.pkl')

In [None]:
# NOTE(review): a bare `break` outside a loop is a SyntaxError, so this cell always fails;
# presumably it is a deliberate stop so "Run All" halts before the cells below — confirm.
break

# Ratings and Comments

In [17]:
# Load the games table from games.pkl (the target of the commented-out save in the Combine Files section).
games=pd.read_pickle('data_dirty_new_scraper/games.pkl')

In [18]:
# Reload the pickled game ids (same file and steps as the cell before the scraping loop).
# NOTE(review): assumes iterating the pickled object yields game ids — confirm.
game_ids_current = pd.read_pickle('data_dirty_new_scraper/game_ids_current')
game_ids = list(game_ids_current)

In [19]:
# For every game, work out how many 100-rating pages it has
# (NumUserRatings / 100, rounded up) and order games from most pages to fewest.
ratings_totals = (
    games[['BGGId']]
    .assign(RatingsPages=np.ceil(games['NumUserRatings'] / 100))
    .sort_values('RatingsPages', ascending=False)
    .reset_index(drop=True)
)

In [20]:
# Show the 30 games with the most ratings pages (the frame is sorted descending).
ratings_totals.head(30)

Unnamed: 0,BGGId,RatingsPages
0,30549,1082
1,822,1080
2,13,1072
3,68448,893
4,36218,812
5,9209,756
6,178900,736
7,167791,731
8,173346,683
9,31260,659


In [21]:
# Largest page count of any game; referenced as a dictionary key in the commented-out list_of_lists setup.
highest = ratings_totals['RatingsPages'].max()

In [70]:
def _ids_with_pages(low=None, high=None):
    '''Return the BGGIds whose RatingsPages is in [low, high), in ratings_totals order.

    A bound left as None is not applied.'''
    pages = ratings_totals['RatingsPages']
    mask = pd.Series(True, index=ratings_totals.index)
    if low is not None:
        mask &= pages >= low
    if high is not None:
        mask &= pages < high
    return list(ratings_totals.loc[mask, 'BGGId'])


def _split_ids(ids, count, size=500):
    '''Split ids into `count` consecutive slices of `size`; the last slice takes the remainder.'''
    parts = [ids[i * size:(i + 1) * size] for i in range(count - 1)]
    parts.append(ids[(count - 1) * size:])
    return parts


# Bands with few games: one list per band, named after the band's top page count.
pages_1100 = _ids_with_pages(low=1000)
pages_999 = _ids_with_pages(600, 1000)
pages_599 = _ids_with_pages(400, 600)
pages_399 = _ids_with_pages(300, 400)
pages_299 = _ids_with_pages(200, 300)
pages_199 = _ids_with_pages(100, 200)
pages_99 = _ids_with_pages(50, 100)
pages_49 = _ids_with_pages(40, 50)
pages_39 = _ids_with_pages(30, 40)
pages_29 = _ids_with_pages(20, 30)

# Larger bands: each filter is computed once and then cut into chunks of 500 ids.
pages_19_1, pages_19_2 = _split_ids(_ids_with_pages(12, 20), 2)

# pages_11_1 used to keep only the first 500 ids and drop the rest;
# pages_11_2 now holds any remainder (empty when the band has <= 500 games).
pages_11_1, pages_11_2 = _split_ids(_ids_with_pages(10, 12), 2)

pages_9_1, pages_9_2 = _split_ids(_ids_with_pages(7, 10), 2)

# This band used to start at 5 pages, which duplicated every 5-page game
# already listed in pages_5_*; it now covers exactly 6 pages.
pages_6_1, pages_6_2 = _split_ids(_ids_with_pages(6, 7), 2)

pages_5_1, pages_5_2 = _split_ids(_ids_with_pages(5, 6), 2)

pages_4_1, pages_4_2 = _split_ids(_ids_with_pages(4, 5), 2)

pages_3_1, pages_3_2, pages_3_3, pages_3_4 = _split_ids(_ids_with_pages(3, 4), 4)

(pages_2_1, pages_2_2, pages_2_3, pages_2_4,
 pages_2_5, pages_2_6, pages_2_7, pages_2_8) = _split_ids(_ids_with_pages(2, 3), 8)

# everything below 2 pages (no lower bound, as before)
(pages_1_1, pages_1_2, pages_1_3, pages_1_4, pages_1_5,
 pages_1_6, pages_1_7, pages_1_8, pages_1_9, pages_1_10,
 pages_1_11, pages_1_12, pages_1_13, pages_1_14, pages_1_15,
 pages_1_16, pages_1_17, pages_1_18, pages_1_19, pages_1_20) = _split_ids(_ids_with_pages(high=2), 20)

In [71]:
#user_ratings = pd.read_pickle('data_dirty_new_scraper/user_ratings_comments'+str(item)+'.pkl')
        
### THIS NEEDS TO BE REBUILT FROM SCRATCH FROM THE RATINGS MATRICES ###

#with open('data_dirty_new_scraper/raw_ratings.json') as json_file:
#    raw_ratings = json.load(json_file)

In [74]:
# Size of the block stored under key 91. The block was originally addressed as
# 9_1, which Python reads as the integer literal 91 (underscores are digit
# separators), so this is the same lookup.
len(list_of_lists[91])

500

In [107]:
# Maps a block key to the list of BGGIds requested together; the download loop
# iterates over this dict. Uncomment the assignments below to queue blocks.
# NOTE: keys written like 9_1 are integer literals with a digit separator
# (9_1 == 91), so 11_1 and 1_11 are the same key (111), and the download loop
# reads only the first digit of an integer key as the page count.
list_of_lists = dict()
#list_of_lists[highest] = pages_1100
#list_of_lists[999] = pages_999
#list_of_lists[599] = pages_599
#list_of_lists[399] = pages_399
#list_of_lists[299] = pages_299
#list_of_lists[199] = pages_199
#list_of_lists[99] = pages_99
#list_of_lists[49] = pages_49
#list_of_lists[39] = pages_39
#list_of_lists[29] = pages_29

#list_of_lists[19_1] = pages_19_1
#list_of_lists[19_2] = pages_19_2

#list_of_lists[11_1] = pages_11_1

#list_of_lists[9_1] = pages_9_1

#list_of_lists[9_2] = pages_9_2

#list_of_lists[6_1] = pages_6_1


#list_of_lists[6_2] = pages_6_2

#list_of_lists[5_1] = pages_5_1
#list_of_lists[5_2] = pages_5_2

#list_of_lists[4_1] = pages_4_1
#list_of_lists[4_2] = pages_4_2

#list_of_lists[3_1] = pages_3_1
#list_of_lists[3_2] = pages_3_2
#list_of_lists[3_3] = pages_3_3
#list_of_lists[3_4] = pages_3_4


#list_of_lists[2_1] = pages_2_1
#list_of_lists[2_2] = pages_2_2
#list_of_lists[2_3] = pages_2_3
#list_of_lists[2_4] = pages_2_4
#list_of_lists[2_5] = pages_2_5
#list_of_lists[2_6] = pages_2_6
#list_of_lists[2_7] = pages_2_7
#list_of_lists[2_8] = pages_2_8




#list_of_lists[1_1] = pages_1_1
#list_of_lists[1_2] = pages_1_2
#list_of_lists[1_3] = pages_1_3
#list_of_lists[1_4] = pages_1_4
#list_of_lists[1_5] = pages_1_5
#list_of_lists[1_6] = pages_1_6
#list_of_lists[1_7] = pages_1_7
#list_of_lists[1_8] = pages_1_8
#list_of_lists[1_9] = pages_1_9
#list_of_lists[1_10] = pages_1_10
#list_of_lists[1_11] = pages_1_11
#list_of_lists[1_12] = pages_1_12
#list_of_lists[1_13] = pages_1_13
#list_of_lists[1_14] = pages_1_14
#list_of_lists[1_15] = pages_1_15
#list_of_lists[1_16] = pages_1_16
#list_of_lists[1_17] = pages_1_17
#list_of_lists[1_18] = pages_1_18
#list_of_lists[1_19] = pages_1_19
#list_of_lists[1_20] = pages_1_20



In [108]:
def _ratings_page_count(block_key):
    '''Return how many ratings pages to request for a block of games.

    Inputs:
    block_key: key of the block in list_of_lists

    Outputs:
    int number of pages

    String keys of the form '<pages>_<chunk>' (e.g. '11_1') use everything
    before the first underscore, so multi-digit page counts work. Any other
    key keeps the legacy rule of reading only the first character, because an
    integer key such as 11_1 is just 111 and cannot be told apart from 1_11.'''
    key_text = str(block_key)
    if '_' in key_text:
        return int(key_text.split('_', 1)[0])
    return int(key_text[:1])


def _load_ratings_page(path):
    '''Load one API url in headless Chrome and return it parsed by BeautifulSoup.

    Inputs:
    path: full url to request

    Outputs:
    BeautifulSoup object built from the rendered page source

    The browser is always shut down, even if loading or waiting fails, so
    repeated calls do not leave Chrome processes behind.'''
    options = webdriver.ChromeOptions() # set up chrome options
    options.add_argument("--headless") # run without a visible window
    time.sleep(1) # wait 1 second before starting the browser

    driver = webdriver.Chrome(options=options) # initiate chrome driver with options
    try:
        print(path)
        print("Waiting for page to load")

        driver.get(path)
        # block (up to 1000 seconds) until an element with id 'folder1' exists
        # NOTE(review): presumably part of Chrome's rendering of the XML response - confirm
        WebDriverWait(driver, 1000).until(EC.presence_of_all_elements_located((By.ID, 'folder1')))

        return BeautifulSoup(driver.page_source) # parse page with beautifulsoup
    finally:
        driver.quit()


def _parse_game_ratings(entry):
    '''Create DataFrame of the ratings found under one <item> element.

    Inputs:
    entry: BeautifulSoup tag for a single game item

    Outputs:
    dataframe with columns BGGId, Name, Rating, Value, Username, one row per
    <comment> element under the item'''
    item_id = int(entry['id'])
    item_name = entry.find('name')['value']
    list_of_ratings = entry.find_all('comment')

    return pd.DataFrame({
        'BGGId': [item_id] * len(list_of_ratings),
        'Name': [item_name] * len(list_of_ratings),
        'Rating': [rating['rating'] for rating in list_of_ratings],
        'Value': [rating['value'] for rating in list_of_ratings],
        # leading/trailing underscores are stripped from usernames
        'Username': [rating['username'].strip('_') for rating in list_of_ratings],
    }, columns=['BGGId', 'Name', 'Rating', 'Value', 'Username'])


for item in list_of_lists:

    block_ids = list_of_lists[item]

    # start a fresh ratings frame for this block of targets
    user_ratings = pd.DataFrame(columns=['BGGId'])
    #user_ratings = pd.read_pickle('data_dirty_new_scraper/user_ratings_comments'+str(item)+'.pkl')

    # number of ratings pages to request for this block of targets
    n_pages = _ratings_page_count(item)

    # comma-separated game ids for the url (trailing comma kept as before)
    targets = ''.join(str(target)+',' for target in block_ids)

    # NOTE(review): raw_ratings is not created in this cell and is never filled
    # below; it must already exist in the kernel or this raises NameError.
    for game_id in block_ids:
        raw_ratings[game_id] = []

    for page in range(1, n_pages+1):

        # report what page we are on
        print("Getting page "+str(page)+" of "+str(n_pages))

        # establish path with targets and current page
        path = 'https://www.boardgamegeek.com/xmlapi2/thing?id='+targets+'&ratingcomments=1&page='+str(page)+'&pagesize=100'

        ##### API Call Section #####

        game_page = _load_ratings_page(path)

        # one entry for each game item on the page
        game_entries = game_page.find_all('item')

        print("Items loaded. Processing.")

        ##### Process Each Game #####

        page_frames = []
        for entry in game_entries:
            # kept as a top-level name so it can still be inspected after the loop
            this_game_ratings = _parse_game_ratings(entry)
            page_frames.append(this_game_ratings)

        # one concat per page instead of one DataFrame.append per game;
        # per-game indices are kept, exactly as append without ignore_index did
        user_ratings = pd.concat([user_ratings] + page_frames)

        time.sleep(5)

        # checkpoint after every page so a failure loses at most one page
        user_ratings.to_pickle('data_dirty_new_scraper/user_ratings_comments'+str(item)+'.pkl')

Getting page 1 of 1
https://www.boardgamegeek.com/xmlapi2/thing?id=160950,161126,161743,161148,161206,161429,161528,161543,161553,161600,161682,161720,161726,162888,162915,162944,164600,164279,164284,164300,164339,164369,164428,164446,164449,164542,164566,164645,164201,164670,164686,164778,164808,164829,164838,165022,165044,165095,165189,164202,164190,162974,163565,163031,163048,163062,163099,163144,163186,163255,163263,163319,163478,163601,164187,163688,163724,163798,163908,163957,163963,164010,164022,164059,164091,107811,107703,107680,34296,35163,35167,35219,35220,35262,35312,35354,35371,35414,35425,35453,35457,35466,35468,35481,35482,35492,35499,35508,35548,35585,35618,35635,35137,35054,35040,34655,34310,34311,34374,34377,34381,34383,34395,34403,34575,34590,34692,35023,34713,34785,34825,34866,34890,34902,34933,34938,34974,35003,35654,35658,35661,36615,36415,36431,36478,36510,36538,36560,36598,36603,36604,36612,36616,36412,36623,36639,36650,36674,36687,36698,36737,36751,36777,36788,3

Items loaded. Processing.
Getting page 1 of 1
https://www.boardgamegeek.com/xmlapi2/thing?id=47475,47484,48154,48863,48867,49010,49038,49050,49096,43890,43845,42783,43169,42789,42882,42897,42901,43042,43043,43080,43096,43110,43167,43196,43802,43233,43295,43332,43340,43486,43489,43693,43775,43799,43801,62291,62344,62374,62478,69904,69973,70096,70222,70262,70329,70376,70488,70502,70912,70922,71035,71304,71569,71889,71899,71956,72050,72200,72298,72340,72373,72376,69843,69819,69815,69292,68251,68387,68504,68743,68816,68947,69205,69232,69233,69275,69347,69796,69543,69544,69632,69638,69676,69687,69703,69721,69761,69785,72460,72560,72566,75418,74203,74309,74310,74311,74312,74678,74936,74987,75292,75382,75445,73960,75529,75587,75644,75668,75674,75762,75783,75809,75890,75957,74098,73759,72751,73244,72957,73091,73230,73232,73233,73238,73240,73241,73242,73243,73245,73589,73251,73252,73253,73288,73360,73361,73363,73367,73536,73574,68201,68187,67977,64657,64219,64331,64426,64430,64431,64438,64520,6

In [33]:
# Inspect the ratings frame of the last game processed by the download loop
# (this_game_ratings is rebuilt for every game, so only the final one remains).
this_game_ratings

Unnamed: 0,BGGId,Name,Rating,Value,Username
0,182626,Mistborn: House War,10,,cheshirekat13
1,182626,Mistborn: House War,10,,LaborLawLarry
2,182626,Mistborn: House War,10,,ShakaUVM
3,182626,Mistborn: House War,10,I love this game. Beautiful artwork. Fun to play.,TheBlackThorn
4,182626,Mistborn: House War,10,,pkapera
...,...,...,...,...,...
95,182626,Mistborn: House War,8,,Gryph51
96,182626,Mistborn: House War,8,,Popesixtus
97,182626,Mistborn: House War,8,Control noble houses and win the favor of the ...,TheLordWinter
98,182626,Mistborn: House War,8,,Zaineph


In [34]:
# Inspect the ratings accumulated for the last block processed; the download
# loop resets user_ratings at the start of each block. Index values repeat
# because per-game frames are combined without resetting the index.
user_ratings

Unnamed: 0,BGGId,Name,Rating,Value,Username
0,244946,Brook City,10,,Thomas83
1,244946,Brook City,10,"WITH STRETCH GOALS, [45euro], 1 play, premium ...",kukugames
2,244946,Brook City,10,,Vendus
3,244946,Brook City,10,,PanMan
4,244946,Brook City,10,,Buschsoevn
...,...,...,...,...,...
95,182626,Mistborn: House War,8,,Gryph51
96,182626,Mistborn: House War,8,,Popesixtus
97,182626,Mistborn: House War,8,Control noble houses and win the favor of the ...,TheLordWinter
98,182626,Mistborn: House War,8,,Zaineph


# Appendix

## Data Validation

In [None]:
# Reload each scraped table from the dirty-data directory for validation.
dirty_dir = 'data_dirty_new_scraper/'
games = pd.read_pickle(dirty_dir + 'games.pkl')
designers = pd.read_pickle(dirty_dir + 'designers.pkl')
categories = pd.read_pickle(dirty_dir + 'categories.pkl')
mechanics = pd.read_pickle(dirty_dir + 'mechanics.pkl')
artists = pd.read_pickle(dirty_dir + 'artists.pkl')
publishers = pd.read_pickle(dirty_dir + 'publishers.pkl')
subcategories = pd.read_pickle(dirty_dir + 'subcategories.pkl')

In [None]:
# Display the games frame; the setup cell caps the display at 30 rows
# (pd.set_option('display.max_rows', 30)), so the output is truncated.
games

In [None]:
# Preview the last rows of the designers frame loaded from designers.pkl.
designers.tail()

In [None]:
# Preview the last rows of the categories frame loaded from categories.pkl.
categories.tail()

In [None]:
# Preview the last rows of the mechanics frame loaded from mechanics.pkl.
mechanics.tail()

In [None]:
# Preview the last rows of the artists frame loaded from artists.pkl.
artists.tail()

In [None]:
# Preview the last rows of the publishers frame loaded from publishers.pkl.
publishers.tail()

In [None]:
# Preview the last rows of the subcategories frame loaded from subcategories.pkl.
subcategories.tail()

## Get game ids

In [None]:
# Pull the BGGId column out of the games frame and show it.
games_ids_current = games.loc[:, 'BGGId']
games_ids_current

In [None]:
# Persist the current BGGId column next to the other scraped files.
# NOTE(review): unlike the other pickles read from this directory, this path
# has no '.pkl' extension - confirm that downstream readers expect this name.
games_ids_current.to_pickle('data_dirty_new_scraper/game_ids_current')