# Data load II.

### What the code does:
* This code loads the data from the Board Game Atlas API and creates a csv file

**Import libraries**

In [1]:
import pandas as pd
import requests
import json
import time
import math

### API call options:
* In general search we can specify over 40 parameters such as: designer, publisher, maximum price etc.
* One call can return maximum 100 responses.
* If we search for name, it will return board games whose name contain the string specified (even if exact parameter is set to True).

### Technical notes:
* The full list of codes in this workbook could run for about 30-50 minutes (depending on connection to API server), most of which is from the two API calls.


In [2]:
# Get category list - Sample
total_url = 'https://api.boardgameatlas.com/api/search?pretty=true&client_id=6MXavgYBke'
total_response = requests.get(total_url)
total_dict = json.loads(total_response.text)


In [3]:
# Total number of board games listed
total_dict['count']

142173

### Main issues:
* With the highest limit 100, it would take 1421 calls to return all games listed. 
* However, the majority of games are depreciated without too much valuable information.
* If we were to search by name, we could only get 1 game at a time.
* Therefore, we decided to look for parameters that partition the results and likely return valuable information.
* For starters, we looked into games with categories listed

**Get games by categories**

In [4]:
# Get category list - Sample
categ_url = 'https://api.boardgameatlas.com/api/game/categories?client_id=6MXavgYBke'
categ_response = requests.get(categ_url)
categ_dict = json.loads(categ_response.text)
list(categ_dict.values())[0][0]

{'id': '2bdFPJUvFo',
 'name': '18XX',
 'url': 'https://www.boardgameatlas.com/category/2bdFPJUvFo/18xx'}

In [5]:
# Get category ids
categ_ids = []
for cat_num in list(range(0,len(categ_dict['categories']))):
    categ_ids.append(categ_dict['categories'][cat_num]['id'])
categ_ids[1:15]

['85OKv8p5Ow',
 'hBqZ3Ar4RJ',
 'GtuMb7ei27',
 'OE07lsfVqf',
 'KUBCKBkGxV',
 'DjAhqEHOD0',
 '20iDvpbh7A',
 'tJxatX2ZbW',
 'nWDac9tQzt',
 'dghLhwyxVb',
 '4mOtRRwSoj',
 'a8NM5cugJX',
 'MWoxgHrOJD',
 'eFaACC6y2c']

In [6]:
# Number of categories
len(categ_ids)

156

#### 2. Request board game data using the category ids 


In [7]:
# Sample request
categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories=2bdFPJUvFo'
response = requests.get(categ_url)
games_dict = json.loads(response.text)
list(games_dict.values())[0][0]

{'id': 'o6knI5ct0u',
 'handle': 'mayfair-games-1830-railways-and-robber-barons--north-east-us',
 'url': 'https://www.boardgameatlas.com/game/o6knI5ct0u/mayfair-games-1830-railways-and-robber-barons--north-east-us',
 'edit_url': 'https://www.boardgameatlas.com/game/o6knI5ct0u/edit',
 'name': '1830: Railways & Robber Barons',
 'price': '52.99',
 'price_ca': '74.99',
 'price_uk': '50.15',
 'price_au': '89.95',
 'msrp': 69.99,
 'msrps': [{'country': 'US', 'price': 69.99}],
 'discount': '0.24',
 'year_published': 1986,
 'min_players': 2,
 'max_players': 7,
 'min_playtime': 180,
 'max_playtime': 360,
 'min_age': 14,
 'description': "<p>1830. It is the dawn of the &quot;Age of Railroading&quot; in America. You're a wealthy investor and speculator betting that the new technology will revolutionize transport. Commerce will no longer depend on rutted roads and slow canals. Instead, it will ride the rails on swift, powerful &quot;Iron Horses.&quot;</p>\r\n<p><strong>1830</strong> is an acclaimed 

### Further issues:
* A single category may contain thousands of results.
* In theory we can skip any number results, but in practice the maximum skip value allowed is 1000.
* Therefore, with any call returning maximum 100 results, in total we can obtain up-to 1100 results from any category.


In [8]:
# Get board games
categ_df = pd.DataFrame()
for category_id in categ_ids:
    next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}'.format(category_id)
    next_response = requests.get(next_categ_url)
    next_games_dict = json.loads(next_response.text)
    categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
    time.sleep(1)
    # This is how we specify getting 1100 results.
    if next_games_dict['count'] > 1000:
        for hundred in list(range(1,11)):
            limit = 100 * hundred
            next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}&skip={}'.format(
                category_id , limit)
            next_response = requests.get(next_categ_url)
            next_games_dict = json.loads(next_response.text)
            categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
            time.sleep(1)
    elif next_games_dict['count'] > 100:
        for hundred in list(range(1,(math.ceil(next_games_dict['count'] / 100)))):
            limit = 100 * hundred
            next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}&skip={}'.format(
                category_id , limit)
            next_response = requests.get(next_categ_url)
            next_games_dict = json.loads(next_response.text)
            categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
            time.sleep(1)

In [9]:
pd.options.display.max_columns = None
display(categ_df.head())

Unnamed: 0,id,handle,url,edit_url,name,price,price_ca,price_uk,price_au,msrp,msrps,discount,year_published,min_players,max_players,min_playtime,max_playtime,min_age,description,commentary,faq,thumb_url,image_url,matches_specs,specs,mechanics,categories,publishers,designers,primary_publisher,primary_designer,developers,related_to,artists,names,rules_url,amazon_rank,official_url,sell_sheet_url,store_images_url,comment_count,num_user_ratings,average_user_rating,historical_low_prices,active,num_user_complexity_votes,average_learning_complexity,average_strategy_complexity,visits,lists,mentions,links,plays,rank,type,num_distributors,trending_rank,listing_clicks,is_historical_low,players,playtime,msrp_text,price_text,tags,images,description_preview,sku,upc,skus,sku_objects,size_height,size_depth,cs_rating,weight_amount,weight_units,size_width,isbn,availability_status,size_units,video_links
0,o6knI5ct0u,mayfair-games-1830-railways-and-robber-barons-...,https://www.boardgameatlas.com/game/o6knI5ct0u...,https://www.boardgameatlas.com/game/o6knI5ct0u...,1830: Railways & Robber Barons,52.99,74.99,50.15,89.95,69.99,"[{'country': 'US', 'price': 69.99}]",0.24,1986.0,2.0,7.0,180.0,360.0,14.0,<p>1830. It is the dawn of the &quot;Age of Ra...,,,https://s3-us-west-1.amazonaws.com/5cc.images/...,https://s3-us-west-1.amazonaws.com/5cc.images/...,,[],"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...","[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...","[{'id': 'w1jOf2uAOD', 'num_games': None, 'scor...","[{'id': 'RpZJ6vqsPR', 'num_games': None, 'scor...","{'id': 'w1jOf2uAOD', 'name': 'The Avalon Hill ...","{'id': 'RpZJ6vqsPR', 'name': 'Francis Tresham'...",[],[],"[Mike Atkinson, Jared Blando, Charles Kibler, ...",[],https://lookout-spiele.de/wp-content/uploads/1...,559911.0,http://www.mayfairgames.com/products/1830-nort...,,,0,36,4.006944,"[{'country': 'UK', 'date': '2021-12-18T12:07:5...",True,2,3.0,4.5,1920,273,4,0,244,874,game,1,0,0,False,2-7,180-360,$69.99,$52.99,"[1830: Ferrovie e Capitani d'Industria, 1830: ...",{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,"1830. It is the dawn of the ""Age of Railroadi...",,,,,,,,,,,,,,
1,eSCVHuUFPs,1846-the-race-for-the-midwest,https://www.boardgameatlas.com/game/eSCVHuUFPs...,https://www.boardgameatlas.com/game/eSCVHuUFPs...,1846: The Race for the Midwest,46.99,67.95,0.0,0.0,69.0,"[{'country': 'US', 'price': 69}, {'country': '...",0.32,2005.0,3.0,5.0,240.0,240.0,14.0,<p>1846 - the midwest railroad boom has gone b...,,,https://s3-us-west-1.amazonaws.com/5cc.images/...,https://s3-us-west-1.amazonaws.com/5cc.images/...,,[],"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...","[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...","[{'id': 'VKIPDDgZ2X', 'num_games': None, 'scor...","[{'id': 'CeUZNYVdQj', 'num_games': None, 'scor...","{'id': 'VKIPDDgZ2X', 'name': 'Deep Thought Gam...","{'id': 'CeUZNYVdQj', 'name': 'Thomas Lehmann',...",[],[],"[Chris Lawson, Rodger B. MacGowan, Kurt Miller...",[],https://s3-us-west-2.amazonaws.com/gmtwebsitea...,125448.0,https://www.gmtgames.com/p-847-1846-the-race-t...,,,0,37,3.885135,"[{'country': 'UK', 'date': '2021-12-14T18:14:1...",True,1,4.0,4.0,1095,228,2,0,316,927,game,3,0,11,False,3-5,240,$69.00,$46.99,"[1846 The Race to the Midwest Printing, 1846: ...",{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,1846 - the midwest railroad boom has gone bus...,GMT1605,817054011155.0,"[2149184041, 2149209107]","[{'name': 'Noble Knight Games', 'sku': '214918...",,,,,,,,,,
2,uEPtE5OOOU,1862-railway-mania-in-the-eastern-counties,https://www.boardgameatlas.com/game/uEPtE5OOOU...,https://www.boardgameatlas.com/game/uEPtE5OOOU...,1862: Railway Mania in the Eastern Counties,57.99,79.95,0.0,127.99,79.0,"[{'country': 'US', 'price': 79}]",0.27,2013.0,1.0,8.0,300.0,300.0,12.0,<p>1862: Railway Mania in the Eastern Counties...,,,https://s3-us-west-1.amazonaws.com/5cc.images/...,https://s3-us-west-1.amazonaws.com/5cc.images/...,,[],"[{'id': 'qCXa8MX0wk', 'url': 'https://www.boar...","[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...","[{'id': 'd5oY0duBgG', 'num_games': None, 'scor...","[{'id': 'xMrDEmBdTo', 'num_games': None, 'scor...","{'name': 'GMT Games', 'id': 'd5oY0duBgG', 'url...","{'name': 'Mike Hutton', 'id': 'xMrDEmBdTo', 'u...",[],[],[Mike Hutton],[],https://gmtwebsiteassets.s3-us-west-2.amazonaw...,,https://www.gmtgames.com/p-692-1862-railway-ma...,,,0,9,4.111111,"[{'country': 'UK', 'date': '2021-12-11T11:48:1...",True,0,0.0,0.0,1077,123,13,1,71,9999999,game,2,0,0,False,1-8,300,$79.00,$57.99,"[1862: Railway Mania in the Eastern Counties, ...",{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,1862: Railway Mania in the Eastern Counties i...,GMT1904,817054011704.0,[2149191519],"[{'name': 'Noble Knight Games', 'sku': '214919...",9.0,3.0,,,,,,,,
3,Krn8i8C0fI,18chesapeake,https://www.boardgameatlas.com/game/Krn8i8C0fI...,https://www.boardgameatlas.com/game/Krn8i8C0fI...,18Chesapeake,0.0,0.0,0.0,0.0,89.0,"[{'country': 'US', 'price': 89}, {'country': '...",0.0,2020.0,2.0,6.0,180.0,180.0,13.0,"<p>18Chesapeake is a member of the <a href=""ht...",,,https://s3-us-west-1.amazonaws.com/5cc.images/...,https://s3-us-west-1.amazonaws.com/5cc.images/...,,[],"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...","[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...","[{'id': 'ZOP4wDStJq', 'num_games': None, 'scor...","[{'id': 'fN0XCgUAPZ', 'num_games': None, 'scor...","{'id': 'ZOP4wDStJq', 'name': 'All-Aboard Games...","{'id': 'fN0XCgUAPZ', 'name': 'Scott Petersen',...",[],[],[Brigette Indelicato],[],,,https://all-aboardgames.com/products/us-only-1...,,,0,11,4.454545,"[{'country': 'US', 'date': '2022-02-08T01:34:2...",True,1,3.0,4.0,874,172,1,0,161,9999999,game,0,0,0,False,2-6,180,$89.00,Price: N/A,[18Chesapeake],{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,18Chesapeake is a member of the 18xx series...,,,,,,,,,,,,,,
4,eJuGpFzljd,184454-switzerland-and-austria-board-game,https://www.boardgameatlas.com/game/eJuGpFzljd...,https://www.boardgameatlas.com/game/eJuGpFzljd...,1844 / 1854,75.99,112.95,0.0,0.0,94.99,"[{'country': 'UK', 'price': 87.99}, {'country'...",0.2,2016.0,3.0,7.0,300.0,300.0,12.0,<p>The railway history of both Switzerland and...,,,https://s3-us-west-1.amazonaws.com/5cc.images/...,https://s3-us-west-1.amazonaws.com/5cc.images/...,,[],"[{'id': 'ohABM4GjbC', 'url': 'https://www.boar...","[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...","[{'id': '7GTti1NuCH', 'num_games': None, 'scor...","[{'id': 'gFwNqit3MM', 'num_games': None, 'scor...","{'name': 'Mayfair Games', 'id': '7GTti1NuCH', ...","{'name': 'Helmut Ohley', 'id': 'gFwNqit3MM', '...",[],[],[Klemens Franz],[],,473334.0,,,,0,6,4.0,"[{'country': 'UK', 'date': '2021-12-03T08:56:5...",True,0,0.0,0.0,507,51,0,0,5,9999999,game,0,0,0,False,3-7,300,$94.99,$75.99,"[1844/1854 Switzerland/Austria, 1844/1854, 184...",{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,The railway history of both Switzerland and A...,,,,,,,,,,,,,,


In [10]:
# Number of games 
len(categ_df)

16815

In [11]:
# Feature list
categ_df.dtypes

id                      object
handle                  object
url                     object
edit_url                object
name                    object
                        ...   
size_width             float64
isbn                    object
availability_status     object
size_units              object
video_links             object
Length: 80, dtype: object

#### 3. Keep only the needed fields

In [12]:
# Keep only useful fields
categ_df_short = categ_df.loc[:, categ_df.columns.isin([
    'artists',
    'average_learning_complexity',
    'average_strategy_complexity',
    'average_user_rating',
    'categories',
    'comment_count',
    'description',
    'description_preview',
    'designers',
    'discount',
    'id',
    'is_historical_low',
    'listing_clicks',
    'max_players',
    'max_playtime',
    'min_age',
    'min_players',
    'min_playtime',
    'name',
    'num_user_ratings',
    'players',
    'plays',
    'playtime',
    'price',
    'price_au',
    'price_ca',
    'price_uk',
    'primary_publisher',
    'publishers',
    'visits',
    'year_published'  
])]

In [13]:
categ_df_short.isnull().sum()

id                                0
name                              0
price                             0
price_ca                          0
price_uk                          0
price_au                          0
discount                          0
year_published                 2086
min_players                    1350
max_players                    1350
min_playtime                   1931
max_playtime                   1934
min_age                        2128
description                       0
categories                        0
publishers                        0
designers                         0
primary_publisher               747
artists                           0
comment_count                     0
num_user_ratings                  0
average_user_rating               0
average_learning_complexity       0
average_strategy_complexity       0
visits                            0
plays                             0
listing_clicks                    0
is_historical_low           

#### 4. Data cleaning

In [14]:
# Lowercase name
categ_df_short['name_lower']=categ_df_short['name'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categ_df_short['name_lower']=categ_df_short['name'].str.lower()


In [15]:
# Remove original fields
categ_df_short = categ_df_short.loc[:, ~categ_df_short.columns.isin(['artists', 'primary_publisher', 'categories', 'designers', 'publishers', 'description', 'description_preview'])]

In [16]:
# Feature list
categ_df_short.dtypes

id                              object
name                            object
price                           object
price_ca                        object
price_uk                        object
price_au                        object
discount                        object
year_published                 float64
min_players                    float64
max_players                    float64
min_playtime                   float64
max_playtime                   float64
min_age                        float64
comment_count                    int64
num_user_ratings                 int64
average_user_rating            float64
average_learning_complexity    float64
average_strategy_complexity    float64
visits                           int64
plays                            int64
listing_clicks                   int64
is_historical_low                 bool
players                         object
playtime                        object
name_lower                      object
dtype: object

In [17]:
pd.options.display.max_columns = None
categ_df_short.head()

Unnamed: 0,id,name,price,price_ca,price_uk,price_au,discount,year_published,min_players,max_players,min_playtime,max_playtime,min_age,comment_count,num_user_ratings,average_user_rating,average_learning_complexity,average_strategy_complexity,visits,plays,listing_clicks,is_historical_low,players,playtime,name_lower
0,o6knI5ct0u,1830: Railways & Robber Barons,52.99,74.99,50.15,89.95,0.24,1986.0,2.0,7.0,180.0,360.0,14.0,0,36,4.006944,3.0,4.5,1920,244,0,False,2-7,180-360,1830: railways & robber barons
1,eSCVHuUFPs,1846: The Race for the Midwest,46.99,67.95,0.0,0.0,0.32,2005.0,3.0,5.0,240.0,240.0,14.0,0,37,3.885135,4.0,4.0,1095,316,11,False,3-5,240,1846: the race for the midwest
2,uEPtE5OOOU,1862: Railway Mania in the Eastern Counties,57.99,79.95,0.0,127.99,0.27,2013.0,1.0,8.0,300.0,300.0,12.0,0,9,4.111111,0.0,0.0,1077,71,0,False,1-8,300,1862: railway mania in the eastern counties
3,Krn8i8C0fI,18Chesapeake,0.0,0.0,0.0,0.0,0.0,2020.0,2.0,6.0,180.0,180.0,13.0,0,11,4.454545,3.0,4.0,874,161,0,False,2-6,180,18chesapeake
4,eJuGpFzljd,1844 / 1854,75.99,112.95,0.0,0.0,0.2,2016.0,3.0,7.0,300.0,300.0,12.0,0,6,4.0,0.0,0.0,507,5,0,False,3-7,300,1844 / 1854


#### 5. Remove duplicates

In [18]:
# Remove duplicates
no_duplicate_categ_df = categ_df_short[~categ_df_short.duplicated(subset=['name_lower', 'year_published'])]

In [19]:
len(no_duplicate_categ_df)

8135

In [20]:
pd.options.display.max_columns = None
no_duplicate_categ_df.head()

Unnamed: 0,id,name,price,price_ca,price_uk,price_au,discount,year_published,min_players,max_players,min_playtime,max_playtime,min_age,comment_count,num_user_ratings,average_user_rating,average_learning_complexity,average_strategy_complexity,visits,plays,listing_clicks,is_historical_low,players,playtime,name_lower
0,o6knI5ct0u,1830: Railways & Robber Barons,52.99,74.99,50.15,89.95,0.24,1986.0,2.0,7.0,180.0,360.0,14.0,0,36,4.006944,3.0,4.5,1920,244,0,False,2-7,180-360,1830: railways & robber barons
1,eSCVHuUFPs,1846: The Race for the Midwest,46.99,67.95,0.0,0.0,0.32,2005.0,3.0,5.0,240.0,240.0,14.0,0,37,3.885135,4.0,4.0,1095,316,11,False,3-5,240,1846: the race for the midwest
2,uEPtE5OOOU,1862: Railway Mania in the Eastern Counties,57.99,79.95,0.0,127.99,0.27,2013.0,1.0,8.0,300.0,300.0,12.0,0,9,4.111111,0.0,0.0,1077,71,0,False,1-8,300,1862: railway mania in the eastern counties
3,Krn8i8C0fI,18Chesapeake,0.0,0.0,0.0,0.0,0.0,2020.0,2.0,6.0,180.0,180.0,13.0,0,11,4.454545,3.0,4.0,874,161,0,False,2-6,180,18chesapeake
4,eJuGpFzljd,1844 / 1854,75.99,112.95,0.0,0.0,0.2,2016.0,3.0,7.0,300.0,300.0,12.0,0,6,4.0,0.0,0.0,507,5,0,False,3-7,300,1844 / 1854


#### 5. Load More data

### Considerations:
* So far we have less names than in the original csv and we haven't matched names yet.
* To increase data we tried the second categorical variables in the API call, which is the list of mechanics in a game.

In [21]:
# The call is very similar, but this time we only select the fields we know we will use, you can specify this at the API call.

field_list = ""
for list_item in list(categ_df_short.columns):
    field_list = field_list + "," + list_item
field_list = field_list[1:]
field_list = field_list[:-11]
field_list

'id,name,price,price_ca,price_uk,price_au,discount,year_published,min_players,max_players,min_playtime,max_playtime,min_age,comment_count,num_user_ratings,average_user_rating,average_learning_complexity,average_strategy_complexity,visits,plays,listing_clicks,is_historical_low,players,playtime'

In [22]:
# Get mechanics list 
mechanics_url = 'https://api.boardgameatlas.com/api/game/mechanics?pretty=true&client_id=6MXavgYBke'
mechanics_response = requests.get(mechanics_url)
mechanics_dict = json.loads(mechanics_response.text)

In [23]:
# Get mechanics ids
mechanics_ids = []
for mech_num in list(range(0,len(mechanics_dict['mechanics']))):
    mechanics_ids.append(mechanics_dict['mechanics'][mech_num]['id'])

In [24]:
# Get board games
mech_df = pd.DataFrame()
for mech_id in mechanics_ids:
    next_mech_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&mechanics={}&fields={}'.format(mech_id,field_list)
    next_response = requests.get(next_mech_url)
    next_games_dict = json.loads(next_response.text)
    mech_df = mech_df.append(pd.DataFrame(next_games_dict['games']))
    time.sleep(1)
    if next_games_dict['count'] > 1000:
        for hundred in list(range(1,11)):
            limit = 100 * hundred
            next_mech_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&mechanics={}&skip={}&fields={}'.format(mech_id , limit, field_list)
            next_response = requests.get(next_mech_url)
            next_games_dict = json.loads(next_response.text)
            mech_df = mech_df.append(pd.DataFrame(next_games_dict['games']))
            time.sleep(1)
    elif next_games_dict['count'] > 100:
        for hundred in list(range(1,(math.ceil(next_games_dict['count'] / 100)))):
            limit = 100 * hundred
            next_mech_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&mechanics={}&skip={}&fields={}'.format(mech_id , limit, field_list)
            next_response = requests.get(next_mech_url)
            next_games_dict = json.loads(next_response.text)
            mech_df = mech_df.append(pd.DataFrame(next_games_dict['games']))
            time.sleep(1)

In [25]:
# Add lower name
mech_df_names = mech_df.copy()
mech_df_names['name_lower'] = mech_df_names['name'].str.lower()


In [26]:
# Remove duplicates

print(len(mech_df_names))
no_duplicate_mech_df = mech_df_names[~mech_df_names.duplicated(subset=['name_lower', 'year_published'])]
no_duplicate_mech_df = no_duplicate_mech_df.reset_index()
len(no_duplicate_mech_df)

38405


16638

#### 6. Finalizing results

### Other options:
* We managed to get a lot more results with the second query
* Our final option would be to request make individual requests based on game names, but it would take too long.
* Therefore, the last step is to join the category and mechnism tables and save the results.


In [27]:
board_games_api = pd.concat( [no_duplicate_categ_df, no_duplicate_mech_df])
pd.options.display.max_columns = None
board_games_api.head()


Unnamed: 0,id,name,price,price_ca,price_uk,price_au,discount,year_published,min_players,max_players,min_playtime,max_playtime,min_age,comment_count,num_user_ratings,average_user_rating,average_learning_complexity,average_strategy_complexity,visits,plays,listing_clicks,is_historical_low,players,playtime,name_lower,index
0,o6knI5ct0u,1830: Railways & Robber Barons,52.99,74.99,50.15,89.95,0.24,1986.0,2.0,7.0,180.0,360.0,14.0,0,36,4.006944,3.0,4.5,1920,244,0,False,2-7,180-360,1830: railways & robber barons,
1,eSCVHuUFPs,1846: The Race for the Midwest,46.99,67.95,0.0,0.0,0.32,2005.0,3.0,5.0,240.0,240.0,14.0,0,37,3.885135,4.0,4.0,1095,316,11,False,3-5,240,1846: the race for the midwest,
2,uEPtE5OOOU,1862: Railway Mania in the Eastern Counties,57.99,79.95,0.0,127.99,0.27,2013.0,1.0,8.0,300.0,300.0,12.0,0,9,4.111111,0.0,0.0,1077,71,0,False,1-8,300,1862: railway mania in the eastern counties,
3,Krn8i8C0fI,18Chesapeake,0.0,0.0,0.0,0.0,0.0,2020.0,2.0,6.0,180.0,180.0,13.0,0,11,4.454545,3.0,4.0,874,161,0,False,2-6,180,18chesapeake,
4,eJuGpFzljd,1844 / 1854,75.99,112.95,0.0,0.0,0.2,2016.0,3.0,7.0,300.0,300.0,12.0,0,6,4.0,0.0,0.0,507,5,0,False,3-7,300,1844 / 1854,


In [28]:
# Remove duplicates

print(len(board_games_api))
board_games_api = board_games_api[~board_games_api.duplicated(subset=['id'])]
len(board_games_api)

24773


20156

In [29]:
# Save dataset 
board_games_api.to_csv("2loading_output.csv", sep=',' , quotechar='"')

### Final note:
* Our approach to API calls netted over 20k results under 30 minutes which is much faster than individual requests.
* However, our selection was not random, we could only query the first 1100 results of any category/mechanism.
* Upon further investigation, we found that results are ordered by relevance and the ending pages we could not access contain
mainly missing information.
* Consider the last page of [card games category](https://www.boardgameatlas.com/category/eX8uuNlQkQ/card-game?skip=1900&categories=eX8uuNlQkQ): most prices and other relevant information are missing.
* As you will see in further notebooks, we would likely discard most of these results for our analysis, therefore the issue of representation would not be resolved with more APi calls.
