# parsing

In [1]:
import requests
import xml.etree.ElementTree as ET

import pandas as pd

In [2]:
BASE_URI = 'https://www.boardgamegeek.com/xmlapi2/'

def get_users_collection(username):
    collections_endpoint = BASE_URI + 'collection?'
    parameters = f'username={username}'
    return requests.get(collections_endpoint + parameters)

def get_xml_string_from_response(response): 
    return ET.fromstring(response.text)

def get_game_ids_from_collection(collection_element_tree):
    return [child.attrib['objectid'] for child in collection_element_tree]

def get_games_from_game_ids(game_ids):
    comma_seperated_game_ids = ','.join(game_ids)
    thing_endpoint = BASE_URI + 'thing?'
    parameters = f'id={comma_seperated_game_ids}&stats=1'
    print(thing_endpoint + parameters)
    return requests.get(thing_endpoint + parameters)

In [20]:
class BoardgameXMLParser:
    def __init__(self, board_game_element):
        self.board_game_element = board_game_element
        self.type = self.board_game_element.get('type')
        self.id = self.board_game_element.get('id')
        self.title = self._get_attribute_from_element('name[@type="primary"]', 'value')
        self.description = self.board_game_element.find('description').text
        self.image = self.board_game_element.find('image').text
        self.thumbnail = self.board_game_element.find('thumbnail').text
        self.year_published = self._get_attribute_from_element('yearpublished', 'value')
        self.min_players_from_creators = self._get_attribute_from_element('minplayers', 'value')
        self.max_players_from_creators = self._get_attribute_from_element('maxplayers', 'value')
        self.playing_time = self._get_attribute_from_element('playingtime', 'value')
        self.min_playing_time = self._get_attribute_from_element('minplaytime', 'value')
        self.max_playing_time = self._get_attribute_from_element('maxplaytime', 'value')
        self.min_age = self._get_attribute_from_element('minage', 'value')
        self.average_rating = self._get_attribute_from_element('.//average', 'value')
        self.bayes_average_rating = self._get_attribute_from_element('.//bayesaverage', 'value')
        self.board_game_rank = self._get_attribute_from_element('.//rank[@name="boardgame"]', 'value')  
        
        self.designers = self._get_attributes_from_element('link[@type="boardgamedesigner"]', 'value')
        self.mechanics = self._get_attributes_from_element('link[@type="boardgamemechanic"]', 'value')
        self.categories = self._get_attributes_from_element('link[@type="boardgamecategory"]', 'value')
        
        suggested_player_poll_element, suggested_players_total_votes = self._get_poll_results('poll[@name="suggested_numplayers"]')

        if suggested_players_total_votes == 0:
            self.user_suggested_best_number_of_players = ''
            self.user_suggested_recommended_number_of_players = ''
        else:   
            suggested_player_counts_df = self._get_suggested_player_counts_dataframe(suggested_player_poll_element)
            suggested_player_counts_df_with_poll_result = self._get_suggested_player_counts_data_with_realtive_amounts(suggested_player_counts_df)
            self.user_suggested_best_number_of_players = self._get_best_player_counts(suggested_player_counts_df_with_poll_result)
            self.user_suggested_recommended_number_of_players = self._get_recommended_player_counts(suggested_player_counts_df_with_poll_result)
        
    def _get_attribute_from_element(self, find_string, attribute_name):
        element = self.board_game_element.find(f'{find_string}')
        return element.get(attribute_name)
    
    def _get_attributes_from_element(self, find_string, attribute_name):
        elements = self.board_game_element.findall(f'{find_string}')
        attributes = [element.get(attribute_name) for element in elements]
        return '|'.join(attributes)
    
    def _get_poll_results(self, poll_pattern):
        poll = self.board_game_element.find(poll_pattern)
        return (poll.findall('results'), int(poll.get('totalvotes')))
    
    def _get_suggested_player_counts_dataframe(self, suggested_player_poll_element):
        options = []
        for option in suggested_player_poll_element:
            option_row = {}
            option_row['num_players'] = option.get('numplayers')
            option_row['best'] = int(option.find('result[@value="Best"]').get('numvotes'))
            option_row['recommended'] = int(option.find('result[@value="Recommended"]').get('numvotes'))
            option_row['not_recommended'] = int(option.find('result[@value="Not Recommended"]').get('numvotes'))
            options.append(option_row)
        return pd.DataFrame(options)
    
    def _get_suggested_player_counts_data_with_realtive_amounts(self, suggested_player_counts_df):
        df = suggested_player_counts_df
        df.loc[df['not_recommended'] > df['recommended'] + df['best'], 'poll_result'] = 'not_recommended'
        df.loc[(df['poll_result'] != 'not_recommended') & (df['recommended'] < df['best']), 'poll_result'] = 'best'
        df.loc[(df['poll_result'] != 'not_recommended') & (df['poll_result'] != 'best'), 'poll_result'] = 'recommended'
        return df
        
    def _get_best_player_counts(self, poll_result):
        best_player_counts = poll_result.loc[poll_result['poll_result'] == 'best', 'num_players'].tolist()
        return '|'.join(best_player_counts)
    
    def _get_recommended_player_counts(self, poll_result):
        recommended_player_counts = poll_result.loc[poll_result['poll_result'] != 'not_recommended', 'num_players'].tolist()
        return '|'.join(recommended_player_counts)
    

class CollectionXMLParser():
    def __init__(self, bgg_username):
        pass
    def get_users_collection(username):
        BASE_URI = 'https://www.boardgamegeek.com/xmlapi2/'
        collections_endpoint = BASE_URI + 'collection?'
        parameters = f'username={username}'
        return request,s.get(collections_endpoint + parameters)

In [4]:
collection = get_users_collection('bobbaganush')

In [5]:
# Uses the api, own cell not to re-send the request

collection_et = get_xml_string_from_response(collection)
game_ids = get_game_ids_from_collection(collection_et)

In [6]:

games = get_games_from_game_ids(game_ids)
games_et = get_xml_string_from_response(games)

https://www.boardgamegeek.com/xmlapi2/thing?id=68448,173346,31260,13464,205637,230802,170216,174506,174801,224517,171131,822,21385,13,926,325,553,478,178900,198773,39463,225694,104162,36218,5177,283355,157958,72125,246900,175621,177736,199478,169124,175155,37904,31481,23730,291457,193738,227460,198994,154597,859,154203,206051,84159,257501,257,70323,281960,823,143884,463,205059,1927,3943,1621,164928,30549,161936,221107,218603,2651,183006,28143,41114,181,121921,18,237182,438,169786,199727,242277,298638,8222,148228,187645,226840,1897,146508,189035,229853,120677,167791,247030,244522,182028,14996,276894,148951,233078,126163,122328,115746,228051,261594,262906,233867,266192,163602&stats=1


# sqlalchemy

In [7]:
from sqlalchemy import create_engine

In [8]:
engine = create_engine('sqlite:///db.sqlite', echo=True)

In [9]:
from sqlalchemy.ext.declarative import declarative_base

In [10]:
Base = declarative_base()

In [11]:
from sqlalchemy import Column, Integer, String

In [12]:
class User(Base):
    __tablename__ = 'users'
    id = Column(Integer, primary_key=True)
    name = Column(String, unique=True)

In [13]:
class UserGame(Base):
    __tablename__ = 'user_games'
    id = Column(Integer, primary_key=True)
    user_id = Column(Integer, nullable=False)
    bgg_game_id = Column(Integer, nullable=False)
    user_rating = Column(Integer)
    

In [14]:
class Game(Base):
    __tablename__ = 'games'
    id = Column(Integer, primary_key=True)
    bgg_game_id = Column(Integer, nullable=False, unique=True)
    title = Column(String, nullable=False)
    type = Column(String, nullable=False)
    year_published = Column(Integer, nullable=False)
    description = Column(String, nullable=False)
    image_url = Column(String, nullable=False)
    thumbnail_url = Column(String, nullable=False)
    min_players = Column(Integer, nullable=False)
    max_players = Column(Integer, nullable=False)
    playing_time = Column(Integer, nullable=False)
    min_playing_time = Column(Integer, nullable=False)
    max_playing_time = Column(Integer, nullable=False)
    min_age = Column(Integer, nullable=False)
    average_rating = Column(Integer, nullable=False)
    bayes_average_rating = Column(Integer, nullable=False)
    board_game_rank = Column(Integer, nullable=False)
    designers = Column(String, nullable=False)
    mechanics = Column(String, nullable=False)
    categories = Column(String, nullable=False)
    user_suggested_best_number_of_players = Column(String, nullable=False)
    user_suggested_recommended_number_of_players = Column(String, nullable=False)
    

In [15]:
Base.metadata.create_all(engine)

2021-05-16 21:02:33,587 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2021-05-16 21:02:33,588 INFO sqlalchemy.engine.base.Engine ()
2021-05-16 21:02:33,592 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2021-05-16 21:02:33,593 INFO sqlalchemy.engine.base.Engine ()
2021-05-16 21:02:33,596 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("users")
2021-05-16 21:02:33,598 INFO sqlalchemy.engine.base.Engine ()
2021-05-16 21:02:33,601 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("users")
2021-05-16 21:02:33,603 INFO sqlalchemy.engine.base.Engine ()
2021-05-16 21:02:33,604 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("user_games")
2021-05-16 21:02:33,605 INFO sqlalchemy.engine.base.Engine ()
2021-05-16 21:02:33,607 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("user_games")
2021-05-16 21:02:33,608 INFO sqlalchemy.engine.base.Engine ()
2021-05-16 21:02

In [16]:
from sqlalchemy.orm import sessionmaker
Session = sessionmaker(bind=engine)

In [38]:
session = Session()

In [39]:
l = []
for game in games_et:
    bg = BoardgameXMLParser(game)
    bg_id_already_in_database = session.query(Game.bgg_game_id).filter(Game.bgg_game_id==bg.id).first() is not None
    if not bg_id_already_in_database:
        bg_sql = Game(
                     bgg_game_id=bg.id,
                     title=bg.title,
                     type=bg.type,
                     description=bg.description,
                     year_published=bg.year_published,
                     image_url=bg.image,
                     thumbnail_url=bg.thumbnail,
                     min_players=bg.min_players_from_creators,
                     max_players=bg.max_players_from_creators,
                     playing_time=bg.playing_time,
                     min_playing_time=bg.min_playing_time,
                     max_playing_time=bg.max_playing_time,
                     min_age=bg.min_age,
                     average_rating=bg.average_rating,
                     bayes_average_rating=bg.bayes_average_rating,
                     board_game_rank=bg.board_game_rank,
                     designers=bg.designers,
                     mechanics=bg.mechanics,
                     categories=bg.categories,
                     user_suggested_best_number_of_players=bg.user_suggested_best_number_of_players,
                     user_suggested_recommended_number_of_players=bg.user_suggested_recommended_number_of_players
                     )
        l.append(bg_sql)
    else:
        print('game already in games.')
session.add_all(l)
    
session.commit()

2021-05-16 22:02:54,973 INFO sqlalchemy.engine.base.Engine BEGIN (implicit)
2021-05-16 22:02:54,976 INFO sqlalchemy.engine.base.Engine SELECT games.bgg_game_id AS games_bgg_game_id 
FROM games 
WHERE games.bgg_game_id = ?
 LIMIT ? OFFSET ?
2021-05-16 22:02:54,977 INFO sqlalchemy.engine.base.Engine ('68448', 1, 0)
game already in games.
2021-05-16 22:02:54,988 INFO sqlalchemy.engine.base.Engine SELECT games.bgg_game_id AS games_bgg_game_id 
FROM games 
WHERE games.bgg_game_id = ?
 LIMIT ? OFFSET ?
2021-05-16 22:02:54,990 INFO sqlalchemy.engine.base.Engine ('173346', 1, 0)
game already in games.
2021-05-16 22:02:55,006 INFO sqlalchemy.engine.base.Engine SELECT games.bgg_game_id AS games_bgg_game_id 
FROM games 
WHERE games.bgg_game_id = ?
 LIMIT ? OFFSET ?
2021-05-16 22:02:55,007 INFO sqlalchemy.engine.base.Engine ('31260', 1, 0)
game already in games.
2021-05-16 22:02:55,021 INFO sqlalchemy.engine.base.Engine SELECT games.bgg_game_id AS games_bgg_game_id 
FROM games 
WHERE games.bgg_gam