# BGG Scraper

This script is optimized to collect information from BGG web pages using their XMLAPI2 API.

All games are displayed on a search page provided by the site. We use that as a starting point for the scraper. With games ID (witch are unique) we are able to collect even more information about them, specialy the Statistics data.

The API alow us to query for more than 1 game, making the extraction incredibly more efficient, restricted only by the size of the URL itself. There for, we have to break the URLs into smaller groups. In this case, 500 games per request is enought to get the date without overloading the downloaded string.

In [114]:
"""Boardgame class"""

class BoardGame(object):
    """Object containing information about a boardgame"""
    
    def __init__(self, data):
        self._data = data
    
    def __len__(self):
        return self._data_len
    
    def __repr__(self):
        return("Boardgame(" + self.name + ")")
    
    def data(self):
        """Internal data dictionary"""
        return self._data
    
    @property
    def item(self):
        """Internal dictionary of 'item'"""
        return self._data #['items']['item']
        
    @property
    def statistics(self):
        """Internal dictionary of 'statistics'"""
        return self.item['statistics']
    
    @property
    def name(self):
        """object name"""
        try:
            return str(self.item['name']['@value'])
        except:
            return str(self.item['name'][0]['@value'])

            
    @property
    def id(self):
        """BGG ID"""
        return self.item['@id']
        
    @property
    def description(self):
        """Game description"""
        return self.item['description']
        
    @property
    def year_published(self):
        """Year of publication"""
        try:
            temp = int(self.item['yearpublished']['@value'])
        except:
            temp = 0
        
        return temp
        
    @property
    def min_players(self):
        """Minimum number of players, according to publisher"""
        try:
            temp = int(self.item['minplayers']['@value'])
        except:
            temp = 0
        
        return temp
        
    @property
    def max_players(self):
        """Maximum number of players, according to publisher"""
        try:
            temp = int(self.item['maxplayers']['@value'])
        except:
            temp = 0
        
        return temp

    @property
    def suggested_players(self):
        """BGG users suggested best player count"""
        poll = self.item['poll']
        poll_names = [ii['@name'] for ii in poll]
        poll = poll[poll_names.index('suggested_numplayers')]['results']
        
        best_player_cnt = {}
        def _get_best_num_players(index):
            numplay = poll[index]['@numplayers']
            best_votes = poll[index]['result'][0]['@numvotes']
            best_player_cnt[numplay] = int(best_votes)
        
        for x in range(len(poll)):
            _get_best_num_players(x) 
        max_votes = max(list(best_player_cnt.values()))
        
        suggested_players = [count for count, votes in best_player_cnt.items() if votes == max_votes][0]
                
        return suggested_players
    
    @property
    def min_age(self):
        """Publisher defined minimum age"""
        try:
            temp = int(self.item['minage']['@value'])
        except:
            temp = 0
        
        return temp
    
    @property
    def suggested_player_age(self):
        """BGG users suggested minimum age"""
        poll = self.item['poll']
        poll_names = [ii['@name'] for ii in poll]
        poll = poll[poll_names.index('suggested_playerage')]['results']['result']
        
        best_player_age = {}
        def _get_best_age_players(index):
            age = poll[index]['@value']
            num_votes = poll[index]['@numvotes']
            best_player_age[age] = int(num_votes)
        
        for x in range(len(poll)):
            _get_best_age_players(x)
        
        max_votes = max(list(best_player_age.values()))
        
        suggested_age = [count for count, votes in best_player_age.items() if votes == max_votes][0]
        
        return int(suggested_age)
    
    @property
    def language_dependence(self):
        """BGG users suggested language dependence"""
        poll = self.item['poll']
        poll_names = [ii['@name'] for ii in poll]
        poll = poll[poll_names.index('language_dependence')]['results']['result']
        
        voted_language_dependence = {}
        def _get_language_requirements(index):
            language_level = poll[index]['@value']
            num_votes = poll[index]['@numvotes']
            voted_language_dependence[language_level] = int(num_votes)
        
        for x in range(len(poll)):
            _get_language_requirements(x)
            
        max_votes = max(list(voted_language_dependence.values()))
        language_requirement = [count for count, votes in voted_language_dependence.items() if votes == max_votes][0]
        
        return language_requirement
        
    @property
    def playing_time(self):
        """BGG suggested playing time"""
        return int(self.item['playingtime']['@value'])
        
    @property
    def min_play_time(self):
        """Publisher defined minimum play time"""
        return int(self.item['minplaytime']['@value'])
        
    @property
    def max_play_time(self):
        """Publisher defined maximum play time"""
        return int(self.item['maxplaytime']['@value'])
        
    @property
    def categories(self):
        """List of game categories (i.e. theme)"""
        link = self.item['link']
        link_names = [ii['@type'] for ii in link]
        names_index = [i for i, x in enumerate(link_names) if x == 'boardgamecategory']
        categories = [link[ii]['@value'] for ii in names_index]
        
        return categories
        
    @property
    def mechanics(self):
        """List of game categories (i.e. tile-laying, set collection)"""
        link = self.item['link']
        try:
            link_names = [ii['@type'] for ii in link]
            names_index = [i for i, x in enumerate(link_names) if x == 'boardgamemechanic']
            mechanics = [link[ii]['@value'] for ii in names_index]
        except:
            link_names = 'NEDD A FIX'
        return mechanics
        
    @property
    def game_family(self):
        """List of game families (i.e. Kickstarter, Made in Canda)"""
        link = self.item['link']
        link_names = [ii['@type'] for ii in link]
        names_index = [i for i, x in enumerate(link_names) if x == 'boardgamefamily']
        family = [link[ii]['@value'] for ii in names_index]
        
        return family
        
    @property
    def implementations(self):
        """Does game implement another? (e.g. Pandemic: Legacy re-implements Pandemic)"""
        link = self.item['link']
        link_names = [ii['@type'] for ii in link]
        names_index = [i for i, x in enumerate(link_names) if x == 'boardgameimplementation']
        implements = [link[ii]['@value'] for ii in names_index]
        
        return implements
        
    @property
    def designers(self):
        """List of game's designers"""
        link = self.item['link']
        link_names = [ii['@type'] for ii in link]
        names_index = [i for i, x in enumerate(link_names) if x == 'boardgamedesigner']
        designers = [link[ii]['@value'] for ii in names_index]
        
        return designers
        
    @property
    def artists(self):
        """List of game's artists"""
        link = self.item['link']
        link_names = [ii['@type'] for ii in link]
        names_index = [i for i, x in enumerate(link_names) if x == 'boardgameartist']
        artists = [link[ii]['@value'] for ii in names_index]
        
        return artists
        
    @property
    def publishers(self):
        """List of game's Publisher"""
        link = self.item['link']
        try:
            link_names = [ii['@type'] for ii in link]
            names_index = [i for i, x in enumerate(link_names) if x == 'boardgamepublisher']
            publishers = [link[ii]['@value'] for ii in names_index]
        except:
            link_names = link[0]['@type']
            names_index = [i for i, x in enumerate(link_names) if x == 'boardgamepublisher']
            publishers = link[0]['@value']
        
        return publishers
    
    
    
    
        
    @property
    def rank(self):
        """Dictionary of games various ranks (e.g. Overall, Strategy Games, Family, etc.)"""

        rankings = self.statistics['ratings']['ranks']['rank']
        
        num_rank_names = sum([i == '@name'for i in rankings])
        
        if num_rank_names == 1:
            if type(rankings) is int:
                ranks = {'boardgame':float(rankings['@value'])}
            else:
                ranks = {'boardgame': 0}
        else:
            rank_names = [ii['@name'] for ii in rankings]
            
            ranks = {}
            def _get_ranking(name):
                name_rank = rank_names.index(name)
                if type(rankings[name_rank]['@value']) is int:
                    ranks[name] = int(rankings[name_rank]['@value'])
                else:
                    ranks[name] = 0
            for ii in rank_names:
                _get_ranking(ii)
        
        return ranks
    
    @property
    def ranks_bayes(self):
        """Dictionary of games various bayesian ranks (e.g. Overall, Strategy Games, Family, etc.)"""
        rankings = self.statistics['ratings']['ranks']['rank']
        num_rank_names = sum([i == '@name' for i in rankings])
        
        if num_rank_names == 1:
            ranks = {'boardgame':float(rankings['@bayesaverage'])}
        
        else:
            rank_names = [ii['@name'] for ii in rankings]
            
            ranks = {}
            def _get_bayes_ranking(name):
                name_rank = rank_names.index(name)
                ranks[name] = float(rankings[name_rank]['@bayesaverage'])
            
            for ii in rank_names:
                _get_bayes_ranking(ii)
        
        return ranks
    
    @property
    def users_rated(self):
        """Number of BGG users who've rated game"""
        return int(self.statistics['ratings']['usersrated']['@value'])
        
    @property
    def avg_rating(self):
        """Average BGG user rating (10-point scale)"""
        return float(self.statistics['ratings']['average']['@value'])
        
    @property
    def bayes_avg_rating(self):
        """Average BGG user's bayesian rating (10-point scale)"""
        return float(self.statistics['ratings']['bayesaverage']['@value'])
    
    @property
    def stdev_rating(self):
        """Standard deviation of BGG user rating (10-point scale)"""
        return float(self.statistics['ratings']['stddev']['@value'])
    
    @property
    def median_rating(self):
        """Median BGG user rating (10-point scale)"""
        return float(self.statistics['ratings']['median']['@value'])
        
    @property
    def num_owned(self):
        """Number of BGG users who own this game"""
        return  int(self.statistics['ratings']['owned']['@value'])
        
    @property
    def num_trading(self):
        """Number of copies being traded on BGG marketplace"""
        return int(self.statistics['ratings']['trading']['@value'])
        
    @property
    def num_wanting(self):
        """Number of BGG users who want to buy this game"""
        return int(self.statistics['ratings']['wanting']['@value'])
        
    @property
    def num_wishing(self):
        """Number of BGG users who wish they had this game"""
        return int(self.statistics['ratings']['wishing']['@value'])
        
    @property
    def num_comments(self):
        """Number of comments about the game"""
        return int(self.statistics['ratings']['numcomments']['@value'])
        
    @property
    def num_weights(self):
        """Number of BGG users who have assigned the game a weight (5-point scale)"""
        return int(self.statistics['ratings']['numweights']['@value'])
        
    @property
    def avg_weight(self):
        """Average weight assigned by BGG users (5-point scale)"""
        return float(self.statistics['ratings']['averageweight']['@value'])

## Import necessary packages

First, import all packages necessary for this extraction.

In [115]:
"""Main boardgame script file"""

import pickle
import pandas as pd
import time 
# from urllib2 import urlopen
from urllib.request import urlopen
import requests
import re
from bs4 import BeautifulSoup as BS
import xmltodict
import time
import progressbar
import sys
import numpy as np
import pyodbc
import json
import math
from tqdm import tqdm

In [116]:
total_timer_start = time.time()

## Download pages from the internet (not yet games details, just ID and Name)

This function is used to download the Game ID from BGG search page. This will be helpful to collect information about the game itself later on, based on the IDs.

The Game name is also downloaded, but in the end, it is not used.

In [117]:
def pull_game_names(page):
    """Get all game names and IDs from a page of BGG website.
    
    Description:
        Scrapes Board Game Geek website games in order of rank.
        Returns a dictionary of games titles:BGG ID, given a page number
        Games are listed in increments of 50
        
    inputs:
        page (int): Page number (starts at 1)
        
    returns:
        game_list (dict): {Name:ID}
    """
    url = 'https://boardgamegeek.com/browse/boardgame/page/{}'.format(page)
    bgg_page = urlopen(url)

    my_bytes = bgg_page.read()
    url_text = my_bytes.decode("utf8")
    bgg_page.close()
    url_text = BS(url_text, 'html.parser')
    
    games = url_text.find_all("td", class_="collection_objectname")
    
    def get_game_name(item):
        game_name = item.findNext('a').text
        return(game_name)
        
    def get_game_ID(item):
        game_link_id = str(item.findNext('a'))
        game_link_id = re.search('[0-9]{1,7}', game_link_id).group(0)
        return(int(game_link_id))
    
    game_list = {get_game_name(ii):get_game_ID(ii) for ii in games}
    

    return(game_list)

In [118]:
def list_to_boardgame_class(num_pages):

    """Return list of Boardgame Objects
    
    Description:
        Calls pull_game_names() and boardgame_info(), passing information to 
        Boardgame() class, and returns a list of all class objects.
     
    Input:
        num_pages (int): Number of pages of games (100 per page)
    
    Returns:
        all_games (list): List of each game as a Boardgame object
    
    """
    ## Get boardgame names and IDs for the top 1000 games (10 pages)
    game_list = [pull_game_names(ii) for ii in progressbar.log_progress(range(1, num_pages + 1),'Game List')]
    game_IDs = [x for y in list([ii.values() for ii in game_list]) for x in y]
    ## Connect IDs with BGG API, get game information

    return(game_IDs) #(game_list, game_IDs)

## Procedure Start

We start here by going througth the pages and collecting the BoardGame ID. Then, we append that to a list with all games IDs

### Feedback
Sometimes scripts run for a long time without stopping, specialy with URL request, since those rely on the connection itself. Or even for long loops.
In this case, both things happen, so I added a ProgressBar as a way to know if the script is really running correctly. 

The user feedback function is called **progressbar.log_progress(list, string)**

In [119]:
start = time.time()

num_pages = 1050

if __name__ == "__main__":
    ## Get boardgame names and IDs for the top xxx games (100 * num_pages)
    all_games_IDs = list_to_boardgame_class(num_pages)
    print("Scraping complete")

end = time.time()
print('Elapsed time: ', time.strftime("%H:%M:%S", time.gmtime(end-start)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=1050)))

Scraping complete
Elapsed time:  01:13:33


In [120]:
len(all_games_IDs)

104957

## Building URL with 500 games to increase download performance

In [121]:
all_games_IDs_backup = all_games_IDs.copy()

In [122]:
ids_already_processed = []
list_of_urls = []
number_of_games_on_search = 400
search_len = math.ceil(len(all_games_IDs)/number_of_games_on_search) #math.ceil will round up to the next integer
if search_len < 1:
    search_len = 1
print('Number of searches:', search_len)

Number of searches: 263


In [123]:
for i in range(int(search_len)):
    url = 'https://boardgamegeek.com/xmlapi2/thing?id='
    for ID in all_games_IDs[:number_of_games_on_search]:
            url = url + str(ID) + ','
            ids_already_processed.append(ID)
            all_games_IDs.remove(ID)
    url= url[:-1] + '&stats=1'
    list_of_urls.append(url)

In [124]:
print('Number of XMLs is', len(list_of_urls), ',with a total of ',len(ids_already_processed),'games')

Number of XMLs is 263 ,with a total of  104957 games


## Download game details

If game IDs collected and URLs group and ready, we can finally download the XMLs from BGG Website

### First try
Sometimes the page returns an error for a few requests. Since this takes a long time to finish, I divided this in two parts

1) First, download pages from the web and store in a list
    a) While doing that, check for errors in the download and store on another list for a second retry

2) With everything local, parse the XMLL. This will avoid having to download everything again in search for errors

### Checkpoint

In [125]:
save_list_of_urls = list_of_urls.copy()

In [126]:
def download_bg_details(list_of_urls_to_download):
    attempts = 0
    for url in progressbar.log_progress(list_of_urls_to_download, 'Downloading Pages (Multiple Games Details)'):
            page = requests.get(url)
            if page.status_code == requests.codes.ok:
                successful_requests.append(page)
            else:
                bad_url_request.append(url)
    if len(bad_url_request) != 0 and attempts <=10:
        download_bg_details(bad_url_request)
        attempts += 1
    return successful_requests

In [127]:
first_half = int(round(len(list_of_urls)/2,0))
second_half = int((len(list_of_urls) - first_half)*-1)

In [129]:
start = time.time()
bad_url_request = []
successful_requests = []

downloaded_xmls = download_bg_details(list_of_urls[:first_half])
downloaded_xmls = download_bg_details(list_of_urls[:second_half])

end = time.time()
print('Elapsed time: ', time.strftime("%H:%M:%S", time.gmtime(end-start)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=132)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=132)))

Elapsed time:  00:32:11


In [130]:
dup = []
unique = []
for item in downloaded_xmls:
    if item not in unique:
        unique.append(item)
    else:
        dup.append(item)
len(dup)

0

In [131]:
backup_of_downloads = downloaded_xmls.copy

## Breaking down the XMLs received into a list with BoardGame objects

In [132]:
start = time.time()
flatted_list = []

for item in progressbar.log_progress(downloaded_xmls, 'Pages on XML'):
    page_xml = xmltodict.parse(item.content)
    for each_game in page_xml['items']['item']:
        temp = BoardGame(each_game)
        flatted_list.append(temp)
end = time.time()
print('Elapsed time: ', time.strftime("%H:%M:%S", time.gmtime(end-start)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=264)))

Elapsed time:  00:05:29


## Build DataFrame

Finaly we build the DataFrame of all games. This process is taking very long to complete. Improvements may be necessary.

In [136]:
lst = flatted_list
all_games = lst[:]
bg_df = pd.DataFrame()

bg_df = pd.DataFrame()
bg_df['Name'] = [i.name for i in progressbar.log_progress(flatted_list, 'Scanning Name')]
bg_df['ID'] = [i.id for i in progressbar.log_progress(flatted_list, 'Scanning ID')] 
bg_df['Publisher'] = [i.publishers[0] for i in progressbar.log_progress(flatted_list, 'Scanning Publishers')]
bg_df['Minimum Players'] = [i.min_players for i in progressbar.log_progress(flatted_list, 'Scanning Min Players')]
bg_df['Max Players'] = [i.max_players for i in progressbar.log_progress(flatted_list, 'Scanning Max Players')]
bg_df['Num_Ratings'] = [i.users_rated for i in progressbar.log_progress(flatted_list, 'Scanning Num Ratings')]
bg_df['Avg_Rating'] = [i.avg_rating for i in progressbar.log_progress(flatted_list, 'Scanning Ratings')]
bg_df['Bayes_Avg_Rating'] = [i.bayes_avg_rating for i in progressbar.log_progress(flatted_list, 'Scanning Bayes Rating')]
bg_df['Avg_Weight'] = [i.avg_weight for i in progressbar.log_progress(flatted_list, 'Scanning AVG Weight')]
bg_df['StDev_Rating'] = [i.stdev_rating for i in progressbar.log_progress(flatted_list, 'Scanning StDev Rating')]
bg_df['Num_Owned'] = [i.num_owned for i in progressbar.log_progress(flatted_list, 'Scanning Num Owned')]
bg_df['Num_Comments'] = [i.num_comments for i in progressbar.log_progress(flatted_list, 'Scanning Num Comments')]
bg_df['Num_Trading'] = [i.num_trading for i in progressbar.log_progress(flatted_list, 'Scanning Num Tradings')]
bg_df['Num_Wanting'] = [i.num_wanting for i in progressbar.log_progress(flatted_list, 'Scanning Num Waiting')]
bg_df['Num_Wishing'] = [i.num_wishing for i in progressbar.log_progress(flatted_list, 'Scanning Num Wishing')]
bg_df['Num_Weights'] = [i.num_weights for i in progressbar.log_progress(flatted_list, 'Scanning Num weights')]
bg_df['Year'] = [i.year_published for i in progressbar.log_progress(flatted_list, 'Scanning Year Published')]

## Rating
bg_df['Num_Ratings'] = [i.users_rated for i in progressbar.log_progress(all_games, 'Scanning Num Ratings')]
bg_df['Avg_Rating'] = [i.avg_rating for i in progressbar.log_progress(all_games, 'Scanning Ratings')]
bg_df['Bayes_Avg_Rating'] = [i.bayes_avg_rating for i in progressbar.log_progress(all_games, 'Scanning Bayes Rating')]
bg_df['Avg_Weight'] = [i.avg_weight for i in progressbar.log_progress(all_games, 'Scanning AVG Weight')]
bg_df['StDev_Rating'] = [i.stdev_rating for i in progressbar.log_progress(all_games, 'Scanning StDev Rating')]

## Players
try:
    bg_df['Pub_Min_Age'] = [i.min_age for i in progressbar.log_progress(all_games, 'Publisher Min Age')]
except:
    bg_df['Pub_Min_Age'] = 0


BGG_Min_Age = []
for i in all_games:
    try:
        BGG_Min_Age.append(i.suggested_player_age)
    except:
        BGG_Min_Age.append(np.nan)

bg_df['BGG_Min_Age'] = BGG_Min_Age

try:
    bg_df['Pub_Min_Players'] = [i.min_players for i in progressbar.log_progress(all_games, 'Publisher Min Players')]
except:
    bg_df['Pub_Min_Players'] = 0

try:
    bg_df['Pub_Max_Players'] = [i.max_players for i in progressbar.log_progress(all_games, 'Publisher Max Players')]
except:
    bg_df['Pub_Max_Players'] = 0

BGG_Num_Players = []
for i in all_games:
    try:
        BGG_Num_Players.append(i.suggested_players)
    except:
        BGG_Num_Players.append(np.nan)

bg_df['BGG_Num_Players'] = BGG_Num_Players
try:
    bg_df['Play_Time'] = [i.playing_time for i in progressbar.log_progress(all_games, 'Scanning Play Time')]
except:
    bg_df['Play_Time'] = 0

## Ownership & Community Engagement
bg_df['Num_Owned'] = [i.num_owned for i in progressbar.log_progress(all_games, 'Scanning Num Owned')]
bg_df['Num_Comments'] = [i.num_comments for i in progressbar.log_progress(all_games, 'Scanning Num Comments')]
bg_df['Num_Trading'] = [i.num_trading for i in progressbar.log_progress(all_games, 'Scanning Num Tradings')]
bg_df['Num_Wanting'] = [i.num_wanting for i in progressbar.log_progress(all_games, 'Scanning Num Waiting')]
bg_df['Num_Wishing'] = [i.num_wishing for i in progressbar.log_progress(all_games, 'Scanning Num Wishing')]
bg_df['Num_Weights'] = [i.num_weights for i in progressbar.log_progress(all_games, 'Scanning Num weights')]

### Ranks
rank_categories = []
for i in progressbar.log_progress(all_games, 'Rank categories'):
    rank_categories.append(list(i.rank.keys()))

unique_rank_categories = list(set([i for j in progressbar.log_progress(rank_categories, 'Rank categories') for i in j]))
rank_columns = ['Name', 'ID'] + unique_rank_categories
major_rank_df = pd.DataFrame(columns = rank_columns)

for game in progressbar.log_progress(all_games, 'Rank'):
    rank_data = game.rank
    rank_data['Name'] = game.name
    rank_data['ID'] = game.id
    minor_rank_df = pd.DataFrame(rank_data, index=[0], columns = rank_columns)
    major_rank_df = major_rank_df.append(minor_rank_df)

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

In [None]:
bg_df_backup = bg_df.copy()

In [138]:
bg_df = bg_df.merge(major_rank_df, on = ['Name', 'ID'])

In [139]:
bg_df

Unnamed: 0,Name,ID,Publisher,Minimum Players,Max Players,Num_Ratings,Avg_Rating,Bayes_Avg_Rating,Avg_Weight,StDev_Rating,...,strategygames,familygames,childrensgames,boardgameaccessory,wargames,thematic,arcade,cgs,videogame,commodore64
0,Gloomhaven,174430,Cephalofair Games,1,4,23504,8.91228,8.61315,3.7753,1.60192,...,0,,,,,0,,,,
1,Gloomhaven,174430,Cephalofair Games,1,4,23504,8.91228,8.61315,3.7753,1.60192,...,0,,,,,0,,,,
2,Gloomhaven,174430,Cephalofair Games,1,4,23504,8.91228,8.61315,3.7753,1.60192,...,0,,,,,0,,,,
3,Gloomhaven,174430,Cephalofair Games,1,4,23504,8.91228,8.61315,3.7753,1.60192,...,0,,,,,0,,,,
4,Pandemic Legacy: Season 1,161936,"Z-Man Games, Inc.",2,4,30370,8.64675,8.49226,2.8318,1.61203,...,0,,,,,0,,,,
5,Pandemic Legacy: Season 1,161936,"Z-Man Games, Inc.",2,4,30370,8.64675,8.49226,2.8318,1.61203,...,0,,,,,0,,,,
6,Pandemic Legacy: Season 1,161936,"Z-Man Games, Inc.",2,4,30370,8.64675,8.49226,2.8318,1.61203,...,0,,,,,0,,,,
7,Pandemic Legacy: Season 1,161936,"Z-Man Games, Inc.",2,4,30370,8.64675,8.49226,2.8318,1.61203,...,0,,,,,0,,,,
8,Through the Ages: A New Story of Civilization,182028,Czech Games Edition,2,4,15179,8.54243,8.26341,4.3657,1.49137,...,0,,,,,,,,,
9,Through the Ages: A New Story of Civilization,182028,Czech Games Edition,2,4,15179,8.54243,8.26341,4.3657,1.49137,...,0,,,,,,,,,


In [142]:
bg_df = bg_df.drop_duplicates(subset=None, keep='first', inplace=False)

In [143]:
bg_df

Unnamed: 0,Name,ID,Publisher,Minimum Players,Max Players,Num_Ratings,Avg_Rating,Bayes_Avg_Rating,Avg_Weight,StDev_Rating,...,strategygames,familygames,childrensgames,boardgameaccessory,wargames,thematic,arcade,cgs,videogame,commodore64
0,Gloomhaven,174430,Cephalofair Games,1,4,23504,8.91228,8.61315,3.7753,1.601920,...,0,,,,,0,,,,
4,Pandemic Legacy: Season 1,161936,"Z-Man Games, Inc.",2,4,30370,8.64675,8.49226,2.8318,1.612030,...,0,,,,,0,,,,
8,Through the Ages: A New Story of Civilization,182028,Czech Games Edition,2,4,15179,8.54243,8.26341,4.3657,1.491370,...,0,,,,,,,,,
12,Terraforming Mars,167791,FryxGames,1,5,36736,8.40232,8.23708,3.2313,1.358540,...,0,,,,,,,,,
16,Twilight Struggle,12333,GMT Games,2,2,33857,8.32508,8.17631,3.5645,1.587340,...,0,,,,0,,,,,
20,Star Wars: Rebellion,187645,Fantasy Flight Games,2,4,16091,8.44952,8.16307,3.6729,1.377980,...,0,,,,,0,,,,
24,Gaia Project,220308,Feuerland Spiele,1,4,8593,8.56949,8.12296,4.3012,1.421250,...,0,,,,,,,,,
28,Scythe,169786,Stonemaier Games,1,5,37941,8.29311,8.12183,3.3705,1.420140,...,0,,,,,,,,,
32,Great Western Trail,193738,eggertspiele,2,4,18302,8.28808,8.07636,3.6909,1.251350,...,0,,,,,,,,,
36,Twilight Imperium (Fourth Edition),233078,Fantasy Flight Games,3,6,6021,8.75469,8.06175,4.1752,1.535010,...,0,,,,,,,,,


In [148]:
### Category Function
def cast_merge_by_class(grouping, fill_term, lst, bg_df):
    all_terms_grouping = [getattr(i, grouping) for i in lst]
    unique_terms_grouping = list(set([i for j in all_terms_grouping for i in j]))
    grouping_term_columns = ['Name', 'ID'] + [fill_term + s for s in unique_terms_grouping]
    grouping_term_columns = [re.sub(' ', '_', x) for x in grouping_term_columns]

    major_grouping_df = pd.DataFrame(columns = grouping_term_columns)

    for game in progressbar.log_progress(lst, 'Scanning list'):
        minor_grouping_df = pd.DataFrame(columns = grouping_term_columns, index=[0])
        minor_grouping_df = minor_grouping_df.fillna(False)
        minor_grouping_df['Name'] = game.name
        minor_grouping_df['ID'] = game.id
        for entry in getattr(game, grouping):
            term = fill_term + re.sub(' ', '_', entry)
            minor_grouping_df[term] = True

        major_grouping_df = major_grouping_df.append(minor_grouping_df)

    # global bg_df
    df_out = pd.merge(df, major_grouping_df, on = ['Name', 'ID'])
    return(df_out)

In [149]:
merge_pairs = {'mechanics':'Mechanic_', 'categories':'Category_'}

for key in progressbar.log_progress(merge_pairs.keys(), 'Casting Classes'):
    bg_dfbg_df = cast_merge_by_class(key, merge_pairs[key], flatted_list, bg_df)

VBox(children=(HTML(value=''), IntProgress(value=0, max=2)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

VBox(children=(HTML(value=''), IntProgress(value=0, max=105600)))

## Remove duplicates

In [153]:
bg_dfbg_df = bg_df.drop_duplicates(keep='first')
bg_df

Unnamed: 0,Name,ID,Publisher,Minimum Players,Max Players,Num_Ratings,Avg_Rating,Bayes_Avg_Rating,Avg_Weight,StDev_Rating,...,Category_Educational,Category_Travel,Category_Industry_/_Manufacturing,Category_Farming,Category_Adventure,Category_Religious,Category_Transportation,Category_Murder/Mystery,Category_Age_of_Reason,Category_Deduction
0,Gloomhaven,174430,Cephalofair Games,1,4,23504,8.91228,8.61315,3.7753,1.601920,...,False,False,False,False,True,False,False,False,False,False
4,Pandemic Legacy: Season 1,161936,"Z-Man Games, Inc.",2,4,30370,8.64675,8.49226,2.8318,1.612030,...,False,False,False,False,False,False,False,False,False,False
8,Through the Ages: A New Story of Civilization,182028,Czech Games Edition,2,4,15179,8.54243,8.26341,4.3657,1.491370,...,False,False,False,False,False,False,False,False,False,False
12,Terraforming Mars,167791,FryxGames,1,5,36736,8.40232,8.23708,3.2313,1.358540,...,False,False,True,False,False,False,False,False,False,False
16,Twilight Struggle,12333,GMT Games,2,2,33857,8.32508,8.17631,3.5645,1.587340,...,False,False,False,False,False,False,False,False,False,False
20,Star Wars: Rebellion,187645,Fantasy Flight Games,2,4,16091,8.44952,8.16307,3.6729,1.377980,...,False,False,False,False,False,False,False,False,False,False
24,Gaia Project,220308,Feuerland Spiele,1,4,8593,8.56949,8.12296,4.3012,1.421250,...,False,False,False,False,False,False,False,False,False,False
28,Scythe,169786,Stonemaier Games,1,5,37941,8.29311,8.12183,3.3705,1.420140,...,False,False,False,False,False,False,False,False,False,False
32,Great Western Trail,193738,eggertspiele,2,4,18302,8.28808,8.07636,3.6909,1.251350,...,False,False,False,False,False,False,False,False,False,False
36,Twilight Imperium (Fourth Edition),233078,Fantasy Flight Games,3,6,6021,8.75469,8.06175,4.1752,1.535010,...,False,False,False,False,False,False,False,False,False,False


## Add Today to Dataframe

In [154]:
bg_df['Downloaded Date'] = time.strftime("%Y-%m-%d", time.gmtime(time.time()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


## Saving DataFrame to File and Database

In [159]:
import MySQLdb
from sqlalchemy import create_engine

In [None]:
engine = create_engine('mysql+mysqldb://coolmini_gonk:G0nkG0nk@venus.coolminiornot.com:3306/coolmini_business_intelligence', echo = False)
bg_df.to_sql(name = 'BGG_Scraper', con = engine, if_exists = 'append', index = False)

In [None]:
total_timer_end = time.time()
print('Total elapsed time: ', time.strftime("%H:%M:%S", time.gmtime(total_timer_end-total_timer_start)))