# Notebook Setup

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
from bs4 import BeautifulSoup
import requests
import regex as re
import time


from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import os


## Functions

In [2]:
def create_designers(game_page, game_id):
    """Build a one-row indicator frame of the designers credited on a game.

    Parameters
    ----------
    game_page : parsed game page exposing ``find_all`` (a BeautifulSoup object in this notebook)
    game_id : anything ``int()`` accepts; stored in the ``BGGId`` column

    Returns
    -------
    pandas.DataFrame
        One row: ``BGGId`` followed by one column per designer, each set to 1.
    """
    designer_links = game_page.find_all('link', type='boardgamedesigner')

    row = {'BGGId': int(game_id)}
    for link in designer_links:
        row[link['value']] = 1

    # Build the row in one step: DataFrame.append was deprecated in pandas 1.4
    # and removed in 2.0, so the old "empty frame + append" pattern no longer runs.
    return pd.DataFrame([row])

In [3]:
def create_categories(game_page, game_id):
    """Build a one-row indicator frame of the categories listed for a game.

    Parameters
    ----------
    game_page : parsed game page exposing ``find_all`` (a BeautifulSoup object in this notebook)
    game_id : anything ``int()`` accepts; stored in the ``BGGId`` column

    Returns
    -------
    pandas.DataFrame
        One row: ``BGGId`` followed by one column per category, each set to 1.
    """
    category_links = game_page.find_all('link', type='boardgamecategory')

    row = {'BGGId': int(game_id)}
    for link in category_links:
        row[link['value']] = 1

    # DataFrame.append is gone as of pandas 2.0; construct the single row directly.
    return pd.DataFrame([row])

In [4]:
def create_mechanics(game_page, game_id):
    """Build a one-row indicator frame of the mechanics listed for a game.

    Besides the ``boardgamemechanic`` links, two mechanisms are looked up among
    the ``boardgamefamily`` links and recorded under fixed column names
    (``TableauBuilding`` and ``Legacy``).

    Parameters
    ----------
    game_page : parsed game page exposing ``find`` and ``find_all``
    game_id : anything ``int()`` accepts; stored in the ``BGGId`` column

    Returns
    -------
    pandas.DataFrame
        One row: ``BGGId`` followed by one column per mechanic, each set to 1.
    """
    mechanic_links = game_page.find_all('link', type='boardgamemechanic')

    row = {'BGGId': int(game_id)}
    for link in mechanic_links:
        row[link['value']] = 1

    # (family link value, column to flag) pairs checked with an exact match.
    family_flags = (
        ("Mechanism: Tableau Building", 'TableauBuilding'),
        ("Mechanism: Legacy", 'Legacy'),
    )
    for family_value, column in family_flags:
        # An explicit None check replaces the old bare ``except: pass``, which
        # also hid unrelated errors.
        if game_page.find('link', type='boardgamefamily', value=family_value) is not None:
            row[column] = 1

    # DataFrame.append is gone as of pandas 2.0; construct the single row directly.
    return pd.DataFrame([row])

In [5]:
def create_artists(game_page, game_id):
    """Build a one-row indicator frame of the artists credited on a game.

    Parameters
    ----------
    game_page : parsed game page exposing ``find_all`` (a BeautifulSoup object in this notebook)
    game_id : anything ``int()`` accepts; stored in the ``BGGId`` column

    Returns
    -------
    pandas.DataFrame
        One row: ``BGGId`` followed by one column per artist, each set to 1.
    """
    artist_links = game_page.find_all('link', type='boardgameartist')

    row = {'BGGId': int(game_id)}
    for link in artist_links:
        row[link['value']] = 1

    # DataFrame.append is gone as of pandas 2.0; construct the single row directly.
    return pd.DataFrame([row])

In [6]:
def create_publishers(game_page, game_id):
    """Build a one-row indicator frame of the publishers listed for a game.

    Parameters
    ----------
    game_page : parsed game page exposing ``find_all`` (a BeautifulSoup object in this notebook)
    game_id : anything ``int()`` accepts; stored in the ``BGGId`` column

    Returns
    -------
    pandas.DataFrame
        One row: ``BGGId`` followed by one column per publisher, each set to 1.
    """
    publisher_links = game_page.find_all('link', type='boardgamepublisher')

    row = {'BGGId': int(game_id)}
    for link in publisher_links:
        row[link['value']] = 1

    # DataFrame.append is gone as of pandas 2.0; construct the single row directly.
    return pd.DataFrame([row])

In [7]:
def create_awards(awards_level, game_id):
    """Build a one-row indicator frame of the awards shown for a game.

    Parameters
    ----------
    awards_level : the parsed awards section of the game page (exposes
        ``find_all``), or ``None`` when the page has no such section
    game_id : anything ``int()`` accepts; stored in the ``BGGId`` column

    Returns
    -------
    pandas.DataFrame
        One row: ``BGGId`` followed by one column per award, each set to 1.
        Digits are removed from the award text, so the same award given in
        different years shares one column.
    """
    row = {'BGGId': int(game_id)}

    # A lookup that found no awards section hands us None; treat that as
    # "no awards" instead of failing with AttributeError.
    award_links = [] if awards_level is None else awards_level.find_all('a', class_='ng-binding')

    for link in award_links:
        award_name = re.sub("[0-9]", "", link.text).strip(' ')
        row[award_name] = 1

    # DataFrame.append is gone as of pandas 2.0; construct the single row directly.
    return pd.DataFrame([row])

In [8]:
def create_ratings_dist(stats_page, game_id):
    """Build a one-row frame with the rating histogram shown on a game's stats page.

    Parameters
    ----------
    stats_page : parsed stats page exposing ``find`` (a BeautifulSoup object in this notebook)
    game_id : anything ``int()`` accepts; stored in the ``BGGId`` column

    Returns
    -------
    pandas.DataFrame
        One row: ``BGGId`` plus columns ``'1'`` .. ``'10'`` holding the chart
        text for each rating, kept as strings exactly as scraped.

    Raises
    ------
    AttributeError
        If the page has no ``ratings-stats-graph`` element.
    IndexError
        If the chart has fewer than 20 ``text`` nodes.
    """
    graph = stats_page.find('ratings-stats-graph')
    text_nodes = graph.find_all('text')

    row = {'BGGId': int(game_id)}
    # Ratings 1..10 are read from text nodes 10..19; the first ten nodes are
    # skipped (presumably axis labels -- confirm against the live chart).
    for score in range(1, 11):
        row[str(score)] = text_nodes[score + 9].text

    # DataFrame.append is gone as of pandas 2.0; construct the single row directly.
    return pd.DataFrame([row])

In [9]:
def _remove_label(value, label):
    """Drop a leading ``label`` (e.g. ``'Theme:'``) from ``value`` and trim spaces."""
    # str.strip(label) removes any of the label's *characters* from both ends
    # ("Theme: Anime" -> "Anim"), so the prefix is cut explicitly instead.
    if value.startswith(label):
        value = value[len(label):]
    return value.strip(' ')


def _weighted_poll_average(results, weight_of):
    """Vote-weighted mean over poll ``result`` tags; None when nobody voted."""
    weighted_sum, total_votes = 0, 0
    for result in results:
        votes = int(result['numvotes'])
        weighted_sum += votes * weight_of(result)
        total_votes += votes
    return weighted_sum / total_votes if total_votes > 0 else None


def create_game_entry(game_id):
    '''Collect everything this notebook records about one game.

    Calls the BGG XML API for the game, its forum list and its rating comments,
    and loads two rendered pages through headless Chrome for the awards and the
    ratings chart.

    Parameters
    ----------
    game_id : the BGG id of the game (anything ``str()``/``int()`` accept)

    Returns
    -------
    tuple of 9 pandas.DataFrame
        ``(this_game, designers, categories, mechanics, artists, publishers,
        comments, awards, ratings_dist)``. All nine are empty when the game has
        fewer than 30 user ratings.

    Raises
    ------
    Whatever requests / selenium / BeautifulSoup raise on network problems or
    missing page elements; those are not caught here.
    '''
    start = time.time()  # log the start time for this entry

    # get the game path using the game id, call the api and parse the page
    path = 'https://www.boardgamegeek.com/xmlapi2/thing?id='+str(game_id)+'&stats=1&comments=1&ratingcomments=1&page=1&pagesize=100'
    page = requests.get(path)
    game_page = BeautifulSoup(page.content, 'xml')

    # Print the game we're pulling data on
    game_name = game_page.find('name', type='primary')['value']
    print("Starting",game_name,game_id)

    def stat(tag_name):
        # the statistic tags read below carry their number in the 'value' attribute
        return game_page.find(tag_name)['value']

    # check that this game has sufficient user ratings to include
    user_ratings = int(stat('usersrated'))
    if user_ratings < 30:
        print("Not enough data to include this listing")
        return tuple(pd.DataFrame() for _ in range(9))

    print("Getting basic stats")
    description = game_page.find('description').text  # description text of the game
    year_pub = int(stat('yearpublished'))
    minplayers = int(stat('minplayers'))
    maxplayers = int(stat('maxplayers'))
    avg_rating = float(stat('average'))
    bayes_avg = float(stat('bayesaverage'))
    std_dev = float(stat('stddev'))
    num_owned = int(stat('owned'))
    num_want = int(stat('wanting'))
    num_wish = int(stat('wishing'))
    num_weight_votes = int(stat('numweights'))
    game_weight = float(stat('averageweight'))
    image_tag = game_page.find('image')
    image_path = image_tag.text if image_tag is not None else None  # path to image, if any
    mfg_play_time = int(stat('playingtime'))
    comm_min_play = int(stat('minplaytime'))
    comm_max_play = int(stat('maxplaytime'))
    mfg_age = int(stat('minage'))
    num_comments = int(game_page.find('comments')['totalitems'])  # num of ratings comments
    num_alts = len(game_page.find_all('name', type='alternate'))
    num_expansions = len(game_page.find_all('link', type='boardgameexpansion'))
    num_implementations = len(game_page.find_all('link', type='boardgameimplementation'))

    # Expansion flag: 1 when the item itself is typed as an expansion
    expansion_flag = 1 if game_page.find('item')['type'] == 'boardgameexpansion' else 0

    # Reimplementation flag: 1 when an inbound implementation link exists
    reimplementation = game_page.find('link', type="boardgameimplementation", inbound="true")
    reimplements = 1 if reimplementation else 0

    # Community recommended age: vote-weighted mean of the first two characters
    # of each poll option's value
    age_results = game_page.find('poll', title="User Suggested Player Age").find_all('result')
    comm_age = _weighted_poll_average(age_results, lambda result: int(result['value'][:2]))

    # Language ease: vote-weighted mean of the poll option levels
    lang_results = game_page.find('poll', title="Language Dependence").find_all('result')
    lang_ease = _weighted_poll_average(lang_results, lambda result: int(result['level']))

    # Best and Good Players
    players_poll = game_page.find('poll', title="User Suggested Number of Players")
    player_num_votes = int(players_poll['totalvotes'])

    best_players, best_score, good_players = 0, 0, []
    if player_num_votes > 30:  # only trust the poll with more than 30 votes
        for option in players_poll.find_all('results'):
            best = int(option.find('result', value='Best')['numvotes'])
            rec = int(option.find('result', value='Recommended')['numvotes'])
            score = best*2 + rec
            ratio = (best + rec)/player_num_votes
            if score > best_score:
                best_players, best_score = option['numplayers'], score
            if ratio > .5:
                good_players.append(option['numplayers'])
    else:
        best_players = None

    # Use Selenium to scrape the dynamically rendered pages
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")

    time.sleep(1)

    print("New page retrieval for awards.")
    # One browser serves both pages and is always shut down; previously two
    # Chrome instances were started per game and neither was ever closed.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://boardgamegeek.com/boardgame/"+str(game_id)+'/')
        # 'lxml' is what BeautifulSoup picks by default when lxml is installed
        # (it must be, for the 'xml' parsing above); naming it avoids the warning.
        game_page_dynamic = BeautifulSoup(driver.page_source, 'lxml')

        # number of awards; a page without an awards module counts as zero
        # (this used to leave num_awards undefined and fail further down)
        awards_level = game_page_dynamic.find('awards-module')
        if awards_level is not None:
            num_awards = len(awards_level.find_all('a', class_='ng-binding'))
        else:
            num_awards = 0

        print("New page retrieval for ratings distribution. May be waiting for chart to load.")
        time.sleep(1)
        stats_path = game_page_dynamic.find('link')['href']+"/stats"
        driver.get(stats_path)
        # wait until the ratings chart has been rendered
        WebDriverWait(driver, 30).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '#mainbody > div > div.global-body-content.pending.ready > div.content.ng-isolate-scope > div:nth-child(2) > ng-include > div > div > ui-view > ui-view > div > div > div.panel-body > div > div.col-sm-6.col-sm-push-6 > div > div.stats-graph > ratings-stats-graph > div > div > div:nth-child(1) > div > svg > g:nth-child(2) > g:nth-child(5)')))
        stats_page = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        driver.quit()

    # number of fans and page views (6th and 7th outline items on the stats page)
    outline_items = stats_page.find_all('div', class_="outline-item-description")
    num_fans = int(outline_items[5].text.replace(',','').strip(' '))
    num_views = int(outline_items[6].text.replace(',','').strip(' '))

    # Get forum thread top-level info
    print('New page retrieval for forum ids.')
    forum_page = requests.get("https://www.boardgamegeek.com/xmlapi2/forumlist?id="+str(game_id)+"&type=thing")
    forums = BeautifulSoup(forum_page.content, 'xml')

    all_forums = forums.find_all('forum')
    total_threads = sum(int(forum['numthreads']) for forum in all_forums)
    # NOTE(review): the Rules forum is assumed to be the 4th entry of the list;
    # a shorter list now yields 0 instead of an IndexError.
    rules_threads = int(all_forums[3]['numthreads']) if len(all_forums) > 3 else 0

    print(game_id)

    # one record for this game; insertion order becomes the column order
    row = {
        'BGGId': int(game_id),
        'Name': game_name,
        'Description': description,
        'YearPublished': year_pub,
        'GameWeight': game_weight,
        'AvgRating': avg_rating,
        'BayesAvgRating': bayes_avg,
        'StdDev': std_dev,
        'MinPlayers': minplayers,
        'MaxPlayers': maxplayers,
        'ComAgeRec': comm_age,        # float, or None when the poll has no votes
        'LanguageEase': lang_ease,    # float, or None when the poll has no votes
        'BestPlayers': best_players,
        'GoodPlayers': good_players,  # kept as a list inside the single cell
        'NumOwned': num_owned,
        'NumWant': num_want,
        'NumWish': num_wish,
        'NumWeightVotes': num_weight_votes,
        'MfgPlaytime': mfg_play_time,
        'ComMinPlaytime': comm_min_play,
        'ComMaxPlaytime': comm_max_play,
        'MfgAgeRec': mfg_age,
        'NumUserRatings': user_ratings,
        'NumComments': num_comments,
        'NumAlternates': num_alts,
        'NumExpansions': num_expansions,
        'NumAwards': num_awards,
        'NumImplementations': num_implementations,
        'NumFans': num_fans,
        'NumPageViews': num_views,
        'RulesPosts': rules_threads,
        'TotalPosts': total_threads,
        'IsExpansion': expansion_flag,
        'IsReimplementation': reimplements,
        'ImagePath': image_path,
    }

    # Add game ranks; rank text that is not a number is stored as NaN
    for rank in game_page.find_all('rank'):
        try:
            rank_value = float(rank['value'])
        except ValueError:
            rank_value = np.nan
        row['Rank:'+rank['name']] = rank_value

    # Components.
    # NOTE(review): this reads a 'name' attribute from the family links. No
    # 'Components:' column appears in the saved games frame, which suggests the
    # attribute is absent and the old bare ``except`` hid the KeyError. The
    # lookup is kept but skipped explicitly; decide which attribute was meant.
    for component in game_page.find_all('link', type='boardgamefamily', value=re.compile("Component")):
        if component.get('name') is not None:
            row['Components:'+component['name']] = component['value']

    # Game series/family: a "Series:" link overrides a "Game:" link when both exist
    for label in ("Game:", "Series:"):
        family_link = game_page.find('link', type='boardgamefamily', value=re.compile(label))
        if family_link is not None:
            row['Family'] = _remove_label(family_link['value'], label)

    # Theme and game category
    for label, column in (("Theme:", 'Theme'), ("Category:", 'Category')):
        labelled_link = game_page.find('link', type='boardgamefamily', value=re.compile(label))
        if labelled_link is not None:
            row[column] = _remove_label(labelled_link['value'], label)

    # Kickstarted flag
    if game_page.find('link', type='boardgamefamily', value=re.compile("Crowdfunding")) is not None:
        row['Kickstarted'] = 1

    this_game = pd.DataFrame([row])

    # create specialty dataframes
    print("Making specialty data frames")
    designers = create_designers(game_page, game_id)
    categories = create_categories(game_page, game_id)
    mechanics = create_mechanics(game_page, game_id)
    artists = create_artists(game_page, game_id)
    publishers = create_publishers(game_page, game_id)
    if awards_level is not None:
        awards = create_awards(awards_level, game_id)
    else:
        awards = pd.DataFrame([{'BGGId': int(game_id)}])
    ratings_dist = create_ratings_dist(stats_page, game_id)

    # Get comments - new api calls
    comment_pages = int(np.ceil(num_comments/100))
    print("Getting comments. There are "+str(comment_pages)+" pages to parse.")

    comment_rows = []
    # Pages are requested as 1..N, matching the progress message. The loop used
    # to request 0..N-1, so the last page was never fetched.
    for page_number in range(1, comment_pages + 1):
        print("page "+str(page_number)+" of "+str(comment_pages))
        path2 = 'https://www.boardgamegeek.com/xmlapi2/thing?id='+str(game_id)+'&comments=1&ratingcomments=1&page='+str(page_number)+'&pagesize=100'
        comments_page = BeautifulSoup(requests.get(path2).content, 'xml')
        for item in comments_page.find_all('comment'):
            comment_rows.append({
                'BGGId': int(game_id),
                'Name': game_name,
                'Rating': item['rating'],
                'Value': item['value'],
                'Username': item['username'].strip('_'),
            })
        time.sleep(2)  # pause between API calls

    comments = pd.DataFrame(comment_rows, columns=['BGGId', 'Name', 'Rating', 'Value', 'Username'])

    print(f'Time: {time.time() - start}')

    # Pause script
    time.sleep(1)

    return this_game, designers, categories, mechanics, artists, publishers, comments, awards, ratings_dist#, reviews

# Game Scraping

In [11]:
# Restore every working frame from the 11-8 backup, in the same order as before.
(games, designers, categories, mechanics, artists,
 publishers, comments, awards, ratings_dist) = (
    pd.read_pickle(backup_path)
    for backup_path in (
        'backups_11-8/games.pkl',
        'backups_11-8/designers.pkl',
        'backups_11-8/categories.pkl',
        'backups_11-8/mechanics.pkl',
        'backups_11-8/artists.pkl',
        'backups_11-8/publishers.pkl',
        'backups_11-8/comments.pkl',
        'backups_11-8/awards.pkl',
        'backups_11-8/ratings_dist.pkl',
    )
)
#reviews = pd.read_pickle('reviews.pkl')

In [12]:
# Show the games frame restored from the backup (one row per game).
games

Unnamed: 0,BGGId,Name,Description,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,BestPlayers,GoodPlayers,NumOwned,NumWant,NumWish,NumWeightVotes,MfgPlaytime,ComMinPlaytime,ComMaxPlaytime,MfgAgeRec,NumUserRatings,NumComments,NumAlternates,NumExpansions,NumAwards,NumImplementations,NumFans,NumPageViews,RulesPosts,TotalPosts,IsExpansion,IsReimplementation,Family,Theme,Category,Kickstarted,ImagePath,Rank:boardgame,Rank:thematic,Rank:strategygames,Rank:wargames,Rank:familygames,Rank:cgs,Rank:abstracts,Rank:partygames,Rank:childrensgames,Rank:rpgitem,Rank:boardgameaccessory,Rank:videogame,Rank:amiga,Rank:commodore64,Rank:arcade,Rank:atarist
0,174430,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,2017,3.8726,8.76029,8.52385,1.63358,1,4,12.784946,4.152542,3,"[1, 2, 3, 4]",74975,1375,17276,2009,120,60,120,14,46228,8308,6,12,30,0,7718,10816029,6326,14752,0,0,Gloomhaven,,Dungeon Crawl,1,https://cf.geekdo-images.com/sZYp_3BTDGjh2unaZ...,1.0,1.0,1.0,,,,,,,,,,,,,
1,161936,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,2015,2.8331,8.60116,8.45000,1.56368,2,4,11.354037,4.077778,4,"[2, 3, 4]",68974,840,11544,1210,60,60,60,13,44043,6682,10,0,30,2,2952,3450336,1229,3086,0,1,Pandemic,,,,https://cf.geekdo-images.com/-Qer2BBPG7qGGDu6K...,2.0,2.0,3.0,,,,,,,,,,,,,
2,224517,Brass: Birmingham,Brass: Birmingham is an economic strategy game...,2018,3.9038,8.66907,8.41066,1.24331,2,4,13.226190,1.035714,3,"[2, 3, 4]",35448,1528,11277,1143,120,60,120,14,23776,3610,6,0,18,1,1902,1974872,387,1028,0,1,Brass,Canals,,1,https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg...,3.0,,2.0,,,,,,,,,,,,,
3,167791,Terraforming Mars,"In the 2400s, mankind begins to terraform the ...",2016,3.2429,8.42155,8.27751,1.38628,1,5,11.891156,3.380952,3,"[1, 2, 3, 4]",97330,2061,18842,2944,120,120,120,12,71474,10452,14,22,39,2,6372,6285054,1155,4313,0,0,Terraforming Mars,,,,https://cf.geekdo-images.com/wg9oOLcsKvDesSUdZ...,4.0,,6.0,,,,,,,,,,,,,
4,291457,Gloomhaven: Jaws of the Lion,Gloomhaven: Jaws of the Lion is a standalone g...,2020,3.5649,8.72198,8.25902,1.42169,1,4,12.051948,4.000000,2,"[1, 2, 3, 4]",33444,477,6250,485,120,30,120,14,13939,2149,6,0,11,0,1649,1471587,1073,2062,0,0,Gloomhaven,,Dungeon Crawl,,https://cf.geekdo-images.com/_HhIdavYW-hid20Iq...,5.0,3.0,5.0,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21403,7316,Bingo,A classic party game in which players cover pl...,1530,1.0439,2.87934,3.96295,1.76399,2,99,4.720000,1.062500,3,[3],1653,2,27,205,60,60,60,5,2257,707,72,0,0,0,10,57207,0,12,0,0,Classic (Schmidt Spiele),,,,https://cf.geekdo-images.com/7xqN6StcQz1FoGplo...,21478.0,,,,,,,631.0,,,,,,,,
21404,5048,Candy Land,Created by Eleanor Abbott in the early 1940's ...,1949,1.1012,3.18388,3.79668,1.72104,2,4,3.325000,1.052632,4,"[2, 3, 4]",6109,4,66,346,30,30,30,3,4178,1543,10,0,0,3,36,321486,3,108,0,0,,Food / Cooking,,,https://cf.geekdo-images.com/97n-BYkjnFiHAhqUz...,21479.0,,,,,,,,873.0,,,,,,,
21405,5432,Chutes and Ladders,Traditional game from ancient India was brough...,-200,1.0195,2.86610,3.61367,1.64312,2,6,3.357143,1.000000,4,"[2, 3, 4, 5]",4705,4,58,308,30,30,30,3,3967,1318,162,0,0,0,22,252400,0,55,0,0,GoPlay,Circus,,,https://cf.geekdo-images.com/P1qJDS_DFTtP_FrpW...,21480.0,,,,,,,,874.0,,,,,,,
21406,11901,Tic-Tac-Toe,A very old game where each player attempts to ...,-1300,1.1697,2.69687,3.57174,1.98415,2,2,4.181818,1.035714,2,[2],1436,9,27,383,1,1,1,4,3399,1007,38,1,0,0,28,147304,2,66,0,0,Game in a Tin (HABA),Video Game Theme: Super Mario Bros.,n in a row,,https://cf.geekdo-images.com/UImMYmMZKE4AGTMPH...,21481.0,,,,,,1106.0,,875.0,,,,,,,


In [13]:
# Show the comments frame restored from the backup (one row per rating comment).
comments

Unnamed: 0,BGGId,Name,Rating,Value,Username
0,174430,Gloomhaven,,Cons: -10/10 fiddliness -The time investment i...,-Johnny-
1,174430,Gloomhaven,9,Kickstarter (Estimated delivery: May 2016),-mIDE-
2,174430,Gloomhaven,,If it's half as good as Forge War...,0 1 1 2 3 5 8
3,174430,Gloomhaven,10,Best and most addictive coop game I’ve ever pl...,0stuart0
4,174430,Gloomhaven,5,[IMG]https://cf.geekdo-static.com/mbs/mb_17587...,1 Family Meeple
...,...,...,...,...,...
4881451,339592,Sheep in Disguise,10,Este es el primer proyecto que patrocino en Ki...,Pitersenpai
4881452,339592,Sheep in Disguise,10,"This game looks to be a lot of fun, I'm lookin...",Radalict
4881453,339592,Sheep in Disguise,10,Can't wait to play it go support it on kicksta...,Rigoberto123
4881454,339592,Sheep in Disguise,10,"I cannot WAIT to get this game, there will be ...",TAB4two


In [None]:
# Load the saved id list and keep only the ids from position 21409 onward
# (presumably where the previous scraping session stopped -- confirm).
game_ids = pd.read_pickle('game_ids.pkl')
scrape = [pending_id for pending_id in game_ids[0][21409:]]

In [None]:
# Scrape every id in `scrape` and fold the results into the running frames.
# The nine buckets follow the order of create_game_entry's return value.
new_frames = [[] for _ in range(9)]

try:
    for i in scrape:
        for bucket, frame in zip(new_frames, create_game_entry(i)):
            # skipped games come back as empty frames; nothing to add for those
            if not frame.empty:
                bucket.append(frame)
finally:
    # Runs even when the scrape is interrupted or a game fails, so every game
    # finished so far is kept. Concatenating once per frame replaces the removed
    # DataFrame.append (pandas 2.0) and avoids re-copying the accumulated
    # frames (comments has millions of rows) for every single game.
    (games, designers, categories, mechanics, artists,
     publishers, comments, awards, ratings_dist) = (
        pd.concat([existing, *bucket], ignore_index=True)
        for existing, bucket in zip(
            (games, designers, categories, mechanics, artists,
             publishers, comments, awards, ratings_dist),
            new_frames,
        )
    )


In [None]:
# Inspect the games frame after the scrape loop.
games

In [None]:
# Inspect the comments frame after the scrape loop.
comments

In [15]:
# Write every working frame to its pickle in the current directory.
for _frame, _pickle_path in (
    (games, 'games.pkl'),
    (designers, 'designers.pkl'),
    (categories, 'categories.pkl'),
    (mechanics, 'mechanics.pkl'),
    (artists, 'artists.pkl'),
    (publishers, 'publishers.pkl'),
    (comments, 'comments.pkl'),
    (awards, 'awards.pkl'),
    (ratings_dist, 'ratings_dist.pkl'),
):
    _frame.to_pickle(_pickle_path)
#reviews.to_pickle('reviews.pkl')

In [None]:
# `break` outside a loop is a SyntaxError, so this cell always fails.
# Presumably left here on purpose so that "Run All" stops before the
# workspace cells below -- TODO confirm.
break

## Workspace

Forum-review scraping code that was pulled out of the main function (`create_game_entry`). There are too many threads and the requests keep timing out, so the reviews will be collected later.

In [None]:
        # Get forum review text - new api calls
        # This is an excerpt from inside a function: it keeps the function-body
        # indentation and reads `game_id` and `game_name`, which are not defined
        # in this cell, so it cannot be run on its own.
        
        reviews = pd.DataFrame(columns=['Name', 'Author', 'Subject', 'PostDate', 'ReviewText'])
        names, authors, subjects, postdates, reviewtexts = [], [], [], [], []
        
        # list the forums attached to this game
        forum_page = requests.get("https://www.boardgamegeek.com/xmlapi2/forumlist?id="+str(game_id)+"&type=thing")

        forums = BeautifulSoup(forum_page.content, 'xml')
        total_threads = 0
        all_forums = forums.find_all('forum')

        # the first forum in the list is used (presumably the Reviews forum -- confirm)
        forum_id = all_forums[0]['id']
        
        # fetch that forum's thread list
        reviews_page = requests.get('https://www.boardgamegeek.com/xmlapi2/forum?id='+str(forum_id))
        get_reviews = BeautifulSoup(reviews_page.content, 'xml')
        all_reviews = get_reviews.find_all('thread')
        
        print("Getting forum reviews. "+str(len(all_reviews))+" to parse.")
        for item in all_reviews:
            author = item['author']
            subject = item['subject']
            postdate =  item['postdate']
            review_id = item['id']
    
            # one extra request per thread; only the first <body> found is kept,
            # with <br/> and <b> markup (raw or HTML-escaped) removed
            specific_thread = requests.get('https://www.boardgamegeek.com/xmlapi2/thread?id='+str(review_id))
            thread = BeautifulSoup(specific_thread.content, 'xml')
            review_text = thread.find('body').text.replace('<br/>','').replace('</b>','').replace('<b>','').replace('&lt;br/&gt;','').replace('&lt;br/&gt;&lt;b&gt;','').replace('&lt;b&gt;','').strip(' ')

            
            names.append(game_name)
            authors.append(author)
            subjects.append(subject)
            postdates.append(postdate)
            reviewtexts.append(review_text)
            
            # pause between thread requests
            time.sleep(2)
        
        # assemble the collected lists into the reviews frame
        reviews['Name'] = names
        reviews['Author'] = authors
        reviews['Subject'] = subjects
        reviews['PostDate'] = postdates
        reviews['ReviewText'] = reviewtexts

# FIX COMMENTS

In [None]:
# Show the games frame; it supplies the Name -> BGGId lookup used below.
games

# Get game ids

In [None]:
# Rebuild comments['BGGId'] from the game name.
# Only the BGGId column is turned into a dict; converting the whole frame with
# to_dict() built a nested dict of every column just to read one of them.
# NOTE: when several games share a name the last one wins, so their comments
# all receive that game's id. Names missing from `games` map to NaN.
d1 = games.set_index('Name')['BGGId'].to_dict()
comments['BGGId'] = comments['Name'].map(d1)

In [None]:
# `break` outside a loop is a SyntaxError, so this cell always fails.
# Presumably a deliberate stop so "Run All" does not continue into the
# cells below -- TODO confirm.
break

In [None]:
# Cast the ids to int. This raises if any BGGId is NaN, e.g. a comment whose
# game name was not found by the name lookup.
comments['BGGId'] = comments['BGGId'].astype(int)

In [None]:
# Check the last 300 comment rows.
comments.tail(300)

In [None]:
# Collect game ids (as strings) from the first 250 pages of the browse listing.
game_ids = []

for page_number in range(1, 251):
    path = "https://boardgamegeek.com/browse/boardgame/page/"+str(page_number)
    print(path)
    page = requests.get(path)
    time.sleep(1)
    rank_titles = BeautifulSoup(page.content, 'html.parser')

    for title_link in rank_titles.find_all('a', class_='primary'):
        # Capture the digits after "/boardgame/". The previous
        # str.strip('https://boardgamegeek.com/boardgame/') removed a *set of
        # characters* from both ends and only worked because ids are digits.
        id_match = re.search(r"/boardgame/([0-9]+)", title_link['href'])
        if id_match:
            game_ids.append(id_match.group(1))

    # random 1-2 second pause between listing pages
    time.sleep(np.random.randint(1,3))

In [None]:
# Wrap the collected ids in a one-column frame (column label 0) and save it.
# Note that this rebinds `game_ids` from a list to a DataFrame.
game_ids = pd.DataFrame(game_ids)
game_ids.to_pickle('game_ids.pkl')

## Create Storage Frames

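DO NOT RUN AGAIN: these cells create the storage frames empty and write them to the same `.pkl` files the scrape saves to, which would overwrite the collected data.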

In [None]:
# Disabled on purpose: the whole cell is one string literal, so running it
# creates nothing. The text inside is the code that set up the empty frames.
'''columns = ['BGGId',
                'Name',
               'Description',
                'YearPublished',
                'GameWeight',
                'AvgRating',  
                'BayesAvgRating',
                'StdDev',
                'MinPlayers',
                'MaxPlayers',
                'ComAgeRec',
                'LanguageEase',
                'BestPlayers',
                'GoodPlayers',
                'NumOwned',
                'NumWant',
                'NumWish',
                'NumWeightVotes',
                'MfgPlaytime',
                'ComMinPlaytime',
                'ComMaxPlaytime',
                'MfgAgeRec',
                'NumUserRatings',
                'NumComments',
                'NumAlternates',
                'NumExpansions',
               'NumAwards',
                'NumImplementations',
               'NumFans',
               'NumPageViews',
               'RulesPosts',
               'TotalPosts',
               'IsExpansion',
           'IsReimplementation',
                'Family',
                'Theme',
               'Category',
               'Kickstarted',
               'ImagePath',
          ]

games = pd.DataFrame(columns=columns)
designers = pd.DataFrame(columns=['BGGId'])
categories = pd.DataFrame(columns=['BGGId'])
mechanics = pd.DataFrame(columns=['BGGId'])
artists = pd.DataFrame(columns=['BGGId'])
publishers = pd.DataFrame(columns=['BGGId'])
comments = pd.DataFrame(columns=['BGGId'])
awards = pd.DataFrame(columns=['BGGId'])
ratings_dist = pd.DataFrame(columns=['BGGId'])
reviews = pd.DataFrame(columns=['BGGId'])'''

In [None]:
# Disabled on purpose: the whole cell is one string literal, so nothing is
# written. The text inside is the code that saved the empty frames.
'''games.to_pickle('games.pkl')
designers.to_pickle('designers.pkl')
categories.to_pickle('categories.pkl')
mechanics.to_pickle('mechanics.pkl')
artists.to_pickle('artists.pkl')
publishers.to_pickle('publishers.pkl')
comments.to_pickle('comments.pkl')
awards.to_pickle('awards.pkl')
ratings_dist.to_pickle('ratings_dist.pkl')
reviews.to_pickle('reviews.pkl')'''