Reference: BGG XML API2 documentation — https://boardgamegeek.com/wiki/page/BGG_XML_API2

In [16]:
import requests
import xmltodict

In [17]:
# BoardGameGeek forum id of the Falling Sky rules forum
fs_rules_forum_id = 1542958

In [18]:
# for testing purposes use the BGG XML API2 and see what we get:
# request page 1 of the Falling Sky rules forum and parse the XML into nested dicts
req = requests.get(
    'https://www.boardgamegeek.com/xmlapi2/forum',
    params={'id': fs_rules_forum_id, 'page': 1},
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# fail loudly on an HTTP error instead of trying to parse an error page as forum XML
req.raise_for_status()
rules_forum = xmltodict.parse(req.text)

In [19]:
# pull the thread (topic) entries out of the parsed forum response
topics = rules_forum['forum']['threads']['thread']

In [20]:
# show the first topic on page 1 (its XML attributes appear as '@'-prefixed keys)
topics[0]

OrderedDict([('@id', '1779498'),
             ('@subject', 'No faction eligible for a card'),
             ('@author', 'Rheinlaender'),
             ('@numarticles', '2'),
             ('@postdate', 'Sat, 13 May 2017 03:55:18 +0000'),
             ('@lastpostdate', 'Sat, 13 May 2017 04:27:25 +0000')])

In [21]:
# display the id of the first topic on page 1 (the id is held as a string)
topics[0]['@id']

'1779498'

In [36]:
# print a one-line summary (id - post date - author - subject) for the first five topics on page 1
for topic in topics[:5]:
    # characters 5..15 of '@postdate' are the 'DD Mon YYYY' part, e.g. '13 May 2017'
    print(
        topic['@id'],
        topic['@postdate'][5:16],
        topic['@author'],
        topic['@subject'],
        sep=' - ',
    )

1779498 - 13 May 2017 - Rheinlaender - No faction eligible for a card
1776616 - 08 May 2017 - Rheinlaender - Flight of Ambiorix - Averni Bot plays it?
1774999 - 04 May 2017 - marsss - caesar defending and counterattacking
1774924 - 04 May 2017 - Kaede11 - Aedui boii event (NONPLAYER)
1774094 - 03 May 2017 - Kaede11 - How do bots retreat?


In [23]:
# retrieve the data associated with thread 1692043 and parse the XML into nested dicts
req = requests.get(
    'https://www.boardgamegeek.com/xmlapi2/thread',
    params={'id': 1692043},
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# fail loudly on an HTTP error instead of trying to parse an error page as thread XML
req.raise_for_status()
mythread = xmltodict.parse(req.text)

In [24]:
# the thread is broken down into articles.
# display all the articles from thread 1692043
mythread['thread']['articles']['article']

[OrderedDict([('@id', '24530423'),
              ('@username', 'familygaming'),
              ('@link', 'http://boardgamegeek.com/article/24530423#24530423'),
              ('@postdate', '2016-12-18T23:34:17-06:00'),
              ('@editdate', '2016-12-18T23:35:21-06:00'),
              ('@numedits', '1'),
              ('subject', 'PLAYBOOK - Confusion about Roman recruit (p. 13)'),
              ('body',
               'On page 13, left column, Fourth paragraph (above the diagram), it says that Romans have "Recruiting possibilities remain in the Nervii and Unii regions". Why can\'t they recruit in Treveri also, where they have a fort?<br/><br/>The rules clearly state (3.2.1) that Romans need control to get an Ally down, but they only need Leader/Fort/Ally to add Auxilia. Since Treveri is "No Control", why can\'t the Romans place Auxilia there? If it was under Gallic control could they?<br/><br/><br/>I love this community because I know I will have an answer in less than 15 minutes. 

In [25]:
from bs4 import BeautifulSoup

In [26]:
# Turn the first article's HTML body into plain text: map <br/> to a newline and </p> to a
# blank line first, then let BeautifulSoup drop the remaining HTML tags.
print(
    BeautifulSoup(
        mythread['thread']['articles']['article'][0]['body']
        .replace('<br/>', '\n')
        .replace('</p>', '\n\n'),
        'lxml',
    ).get_text()
)

On page 13, left column, Fourth paragraph (above the diagram), it says that Romans have "Recruiting possibilities remain in the Nervii and Unii regions". Why can't they recruit in Treveri also, where they have a fort?

The rules clearly state (3.2.1) that Romans need control to get an Ally down, but they only need Leader/Fort/Ally to add Auxilia. Since Treveri is "No Control", why can't the Romans place Auxilia there? If it was under Gallic control could they?


I love this community because I know I will have an answer in less than 15 minutes. Thanks in advance.

EDIT: Added "place Auxilia" to be more specific.


In [27]:
# we want to see how things behave when we pull data for a non-existent page.
# NOTE: this rebinds rules_forum, so re-running the earlier `topics = ...` cell after this one
# will look up threads in this page's response rather than in page 1.
req = requests.get(
    'https://www.boardgamegeek.com/xmlapi2/forum',
    params={'id': fs_rules_forum_id, 'page': 10},
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# fail loudly on an HTTP error instead of trying to parse an error page as forum XML
req.raise_for_status()
rules_forum = xmltodict.parse(req.text)

In [33]:
page_num = 0
threads = []
# Gather all threads from the Falling Sky rules forum by looping through all topic pages.
# We don't know how many pages there are, so keep requesting the next page until one
# comes back without any thread data.
while True:
    page_num += 1
    uri = 'https://www.boardgamegeek.com/xmlapi2/forum?id={}&page={}'.format(fs_rules_forum_id, page_num)
    req = requests.get(uri, timeout=30)  # requests waits indefinitely unless a timeout is given
    # fail loudly on an HTTP error instead of trying to parse an error page as forum XML
    req.raise_for_status()
    rules_forum_topics_page = xmltodict.parse(req.text)

    # a page past the last one has an empty <threads> element, which xmltodict parses to None
    threads_node = rules_forum_topics_page['forum']['threads']
    if not threads_node:
        break

    # get the threads from the page
    thread_list = threads_node['thread']
    # xmltodict yields a single dict (not a list) when the page holds exactly one thread;
    # wrap it so the loop below always iterates over thread records, not over dict keys
    if not isinstance(thread_list, list):
        thread_list = [thread_list]

    # create the link to get to the thread
    for thread in thread_list:
        thread['link'] = 'https://boardgamegeek.com/thread/{}'.format(thread['@id'])

    threads.extend(thread_list)

In [35]:
threads[:3]

[OrderedDict([('@id', '1779498'),
              ('@subject', 'No faction eligible for a card'),
              ('@author', 'Rheinlaender'),
              ('@numarticles', '2'),
              ('@postdate', 'Sat, 13 May 2017 03:55:18 +0000'),
              ('@lastpostdate', 'Sat, 13 May 2017 04:27:25 +0000'),
              ('link', 'https://boardgamegeek.com/thread/1779498')]),
 OrderedDict([('@id', '1776616'),
              ('@subject', 'Flight of Ambiorix - Averni Bot plays it?'),
              ('@author', 'Rheinlaender'),
              ('@numarticles', '4'),
              ('@postdate', 'Mon, 08 May 2017 01:01:47 +0000'),
              ('@lastpostdate', 'Mon, 08 May 2017 10:49:11 +0000'),
              ('link', 'https://boardgamegeek.com/thread/1776616')]),
 OrderedDict([('@id', '1774999'),
              ('@subject', 'caesar defending and counterattacking'),
              ('@author', 'marsss'),
              ('@numarticles', '2'),
              ('@postdate', 'Thu, 04 May 2017 21:06:13 +