**Import libraries**

In [53]:
import pandas as pd
import requests
import json
import time
import math

#### **I. Load raw data from tidyTuesday**

In [4]:
# Import data
board_games_raw = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-12/board_games.csv")

In [5]:
# Check data
board_games_raw.head()

Unnamed: 0,game_id,description,image,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,...,artist,category,compilation,designer,expansion,family,mechanic,publisher,average_rating,users_rated
0,1,Die Macher is a game about seven sequential po...,//cf.geekdo-images.com/images/pic159509.jpg,5,240,14,3,240,Die Macher,240,...,Marcus Gschwendtner,"Economic,Negotiation,Political",,Karl-Heinz Schmiel,,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498
1,2,Dragonmaster is a trick-taking card game based...,//cf.geekdo-images.com/images/pic184174.jpg,4,30,12,3,30,Dragonmaster,30,...,Bob Pepper,"Card Game,Fantasy",,"G. W. ""Jerry"" D'Arcey",,Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478
2,3,"Part of the Knizia tile-laying trilogy, Samura...",//cf.geekdo-images.com/images/pic3211873.jpg,4,60,10,2,30,Samurai,60,...,Franz Vohwinkel,"Abstract Strategy,Medieval",,Reiner Knizia,,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019
3,4,When you see the triangular box and the luxuri...,//cf.geekdo-images.com/images/pic285299.jpg,4,60,12,2,60,Tal der Könige,60,...,,Ancient,,Christian Beierer,,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314
4,5,"In Acquire, each player strategically invests ...",//cf.geekdo-images.com/images/pic342163.jpg,6,90,12,3,90,Acquire,90,...,"Scott Okumura,Peter Whitley",Economic,,Sid Sackson,,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195


In [93]:
len(board_games_raw)

10532

In [95]:
# Lowercase name
board_games_raw['name_lower']=board_games_raw['name'].str.lower()

In [99]:
board_games_raw.head()

Unnamed: 0,game_id,description,image,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,...,category,compilation,designer,expansion,family,mechanic,publisher,average_rating,users_rated,name_lower
0,1,Die Macher is a game about seven sequential po...,//cf.geekdo-images.com/images/pic159509.jpg,5,240,14,3,240,Die Macher,240,...,"Economic,Negotiation,Political",,Karl-Heinz Schmiel,,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498,die macher
1,2,Dragonmaster is a trick-taking card game based...,//cf.geekdo-images.com/images/pic184174.jpg,4,30,12,3,30,Dragonmaster,30,...,"Card Game,Fantasy",,"G. W. ""Jerry"" D'Arcey",,Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478,dragonmaster
2,3,"Part of the Knizia tile-laying trilogy, Samura...",//cf.geekdo-images.com/images/pic3211873.jpg,4,60,10,2,30,Samurai,60,...,"Abstract Strategy,Medieval",,Reiner Knizia,,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019,samurai
3,4,When you see the triangular box and the luxuri...,//cf.geekdo-images.com/images/pic285299.jpg,4,60,12,2,60,Tal der Könige,60,...,Ancient,,Christian Beierer,,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314,tal der könige
4,5,"In Acquire, each player strategically invests ...",//cf.geekdo-images.com/images/pic342163.jpg,6,90,12,3,90,Acquire,90,...,Economic,,Sid Sackson,,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195,acquire


In [100]:
# Check number of nulls
board_games_raw.isnull().sum()

game_id               0
description           0
image                 1
max_players           0
max_playtime          0
min_age               0
min_players           0
min_playtime          0
name                  0
playing_time          0
thumbnail             1
year_published        0
artist             2773
category             94
compilation       10122
designer            126
expansion          7780
family             2808
mechanic            950
publisher             3
average_rating        0
users_rated           0
name_lower            0
dtype: int64

In [116]:
# Keep only useful fields
board_games_raw = board_games_raw.loc[:, ~board_games_raw.columns.isin(['image', 'thumbnail', 'compilation', ])]

In [None]:
# Todo: 
#  - unlist family, mechanic, publisher

In [117]:
# Save dataset 
board_games_raw.to_csv("board_games_raw.csv", sep=';' , quotechar='"')

**II. Call API**

In [42]:
# Get category list
categ_url = 'https://api.boardgameatlas.com/api/game/categories?client_id=6MXavgYBke'
categ_response = requests.get(categ_url)
categ_dict = json.loads(categ_response.text)
categ_dict

{'categories': [{'id': '2bdFPJUvFo',
   'name': '18XX',
   'url': 'https://www.boardgameatlas.com/category/2bdFPJUvFo/18xx'},
  {'id': '85OKv8p5Ow',
   'name': '4x',
   'url': 'https://www.boardgameatlas.com/category/85OKv8p5Ow/4x'},
  {'id': 'hBqZ3Ar4RJ',
   'name': 'Abstract',
   'url': 'https://www.boardgameatlas.com/category/hBqZ3Ar4RJ/abstract'},
  {'id': 'KUBCKBkGxV',
   'name': 'Adventure',
   'url': 'https://www.boardgameatlas.com/category/KUBCKBkGxV/adventure'},
  {'id': 'DjAhqEHOD0',
   'name': 'Aerial Warfare',
   'url': 'https://www.boardgameatlas.com/category/DjAhqEHOD0/aerial-warfare'},
  {'id': '20iDvpbh7A',
   'name': 'Age of Reason',
   'url': 'https://www.boardgameatlas.com/category/20iDvpbh7A/age-of-reason'},
  {'id': 'tJxatX2ZbW',
   'name': 'Aliens',
   'url': 'https://www.boardgameatlas.com/category/tJxatX2ZbW/aliens'},
  {'id': 'nWDac9tQzt',
   'name': 'Alternate History',
   'url': 'https://www.boardgameatlas.com/category/nWDac9tQzt/alternate-history'},
  {'id':

In [46]:
# Get category ids
categ_ids = []
for cat_num in list(range(0,len(categ_dict['categories']))):
    categ_ids.append(categ_dict['categories'][cat_num]['id'])
categ_ids

['2bdFPJUvFo',
 '85OKv8p5Ow',
 'hBqZ3Ar4RJ',
 'KUBCKBkGxV',
 'DjAhqEHOD0',
 '20iDvpbh7A',
 'tJxatX2ZbW',
 'nWDac9tQzt',
 'dghLhwyxVb',
 '4mOtRRwSoj',
 'a8NM5cugJX',
 'MWoxgHrOJD',
 'eFaACC6y2c',
 'CBboNLI1Uj',
 'k0dglq5j6N',
 'Bq6M0TJyg7',
 'QB4sEpx1Uu',
 'wpItJuRDiz',
 'PinhJrhnxU',
 'fW5vusE96B',
 'eX8uuNlQkQ',
 'HKaYVNIxAJ',
 '36WFElclV3',
 'ODWOjWAJj3',
 'w8XD66FUZ2',
 '329DxyFL9D',
 'bVMxJo31bS',
 'vXxLT0FDTZ',
 'bSzUpE5oOZ',
 'gscaL52VDG',
 'G5kfqnPBP6',
 'pacCjl7His',
 'ge8pIhEUGE',
 '7DfHn28Pcf',
 'nfQONtMbDU',
 'Ef4oYLHNhI',
 'bCBXJy9qDw',
 'bKrxqD9mYc',
 'mavSOM8vjH',
 '42pmul4oHH',
 'We3MM46qBr',
 'g2Hwv8t0Y5',
 'N0TkEGfEsF',
 'B3NRLMK4xD',
 'crxgUzJSEz',
 'gsekjrPJz0',
 'u5ZiYctU6T',
 'v4SfYtS2Lr',
 'yq6hVlbM2R',
 '7rV11PKqME',
 'ctumBZyj5l',
 'ZTneo8TaIO',
 'Wr8uXcoR9p',
 'upXZ8vNfNO',
 'ELg06sncRX',
 'YrDuNj8lvr',
 'JvIs75sWte',
 'Eyuf8PzjDo',
 'KzEQIwIub7',
 'cAIkk5aLdQ',
 'TYnxiuiI3X',
 'zqFmdU4Fp2',
 'rrvd68LjOR',
 '3souLOXUqI',
 'AGKGd3txp9',
 'XeYUw9159M',
 'HZ7kUiqE

In [47]:
# Number of categories
len(categ_ids)

141

In [51]:
# Sample
categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories=2bdFPJUvFo'
response = requests.get(categ_url)
games_dict = json.loads(response.text)
games_dict

{'games': [{'id': 'o6knI5ct0u',
   'handle': 'mayfair-games-1830-railways-and-robber-barons--north-east-us',
   'url': 'https://www.boardgameatlas.com/game/o6knI5ct0u/mayfair-games-1830-railways-and-robber-barons--north-east-us',
   'edit_url': 'https://www.boardgameatlas.com/game/o6knI5ct0u/edit',
   'name': '1830: Railways & Robber Barons',
   'price': '55.99',
   'distributors': [{'id': 'G7bYgDUpyD',
     'store_name': 'Asmodee',
     'name': '1830 (Revised Edition)',
     'url': 'https://www.asmodeena.com/active-catalog-csv',
     'msrp': 76.99,
     'updated_at_ago': '3 hours ago',
     'stock': 1,
     'sku': 'LK0043',
     'pfGameDistributorInfo': {'objectId': 'NeHDiBOndN'},
     'msrp_text': '$76.99'}],
   'price_ca': '74.99',
   'price_uk': '0.00',
   'price_au': '89.95',
   'msrp': 69.99,
   'msrps': [{'country': 'CA', 'price': 0},
    {'country': 'UK', 'price': 0},
    {'country': 'US', 'price': 69.99}],
   'discount': '0.20',
   'year_published': 1986,
   'min_players': 2,


In [118]:
# Get board games
categ_df = pd.DataFrame()
for category_id in categ_ids:
    next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}'.format(category_id)
    next_response = requests.get(next_categ_url)
    next_games_dict = json.loads(next_response.text)
    categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
    time.sleep(1)
    if next_games_dict['count'] > 1000:
        for hundred in list(range(1,11)):
            limit = 100 * hundred
            next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}&skip={}'.format(category_id , limit)
            next_response = requests.get(next_categ_url)
            next_games_dict = json.loads(next_response.text)
            categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
            time.sleep(1)
    elif next_games_dict['count'] > 100:
        for hundred in list(range(1,(math.ceil(next_games_dict['count'] / 100)))):
            limit = 100 * hundred
            next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}&skip={}'.format(category_id , limit)
            next_response = requests.get(next_categ_url)
            next_games_dict = json.loads(next_response.text)
            categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
            time.sleep(1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [86]:
# Save dataset part
categ_df_name = categ_df[['name', 'id']]
categ_df_name.to_csv("categ_df_name.csv", sep=';' , quotechar='"')

In [120]:
# Export data
categ_df.to_csv("categ_df.csv", sep=';' , quotechar='"')


In [122]:
# Feature list
categ_df.dtypes

active                            bool
amazon_rank                    float64
artists                         object
availability_status             object
average_learning_complexity    float64
average_strategy_complexity    float64
average_user_rating            float64
categories                      object
comment_count                    int64
commentary                      object
cs_rating                      float64
description                     object
description_preview             object
designers                       object
developers                      object
discount                        object
distributors                    object
edit_url                        object
faq                             object
handle                          object
historical_low_prices           object
id                              object
image_url                       object
images                          object
is_historical_low                 bool
isbn                     

In [111]:
# Check number of nulls
categ_df.isnull().sum()

active                             0
amazon_rank                    11353
artists                            0
availability_status            14311
average_learning_complexity        0
average_strategy_complexity        0
average_user_rating                0
categories                         0
comment_count                      0
commentary                         0
cs_rating                      13452
description                        0
description_preview                0
designers                          0
developers                         0
discount                           0
distributors                    9913
edit_url                           0
faq                                0
handle                             0
historical_low_prices           4625
id                                 0
image_url                          0
images                             0
is_historical_low                  0
isbn                           14311
links                              0
l

In [109]:
# Check dataset
categ_df.head()

Unnamed: 0,active,amazon_rank,artists,availability_status,average_learning_complexity,average_strategy_complexity,average_user_rating,categories,comment_count,commentary,...,thumb_url,trending_rank,type,upc,url,video_links,visits,weight_amount,weight_units,year_published
0,True,559911.0,"[Mike Atkinson, Jared Blando, Charles Kibler, ...",,3.0,4.5,4.006944,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/o6knI5ct0u...,,1869,,,1986.0
1,True,,[Mike Hutton],,0.0,0.0,4.111111,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,817054011704.0,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1048,,,2013.0
2,True,,[Brigette Indelicato],,3.0,4.0,4.454545,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,856,,,2020.0
3,True,473334.0,[Klemens Franz],,0.0,0.0,4.0,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/eJuGpFzljd...,,498,,,2016.0
4,True,,[],,0.0,0.0,4.2,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/IKKs90JOkr...,,367,,,2016.0


In [125]:
# Keep only useful fields
categ_df_short = categ_df.loc[:, categ_df.columns.isin([
'average_learning_complexity',
'average_strategy_complexity',
'average_user_rating',
'categories',
'comment_count',
'price',
'price_text',
'rank',
'year_published'    
])]

In [126]:
categ_df_short.isnull().sum()

average_learning_complexity       0
average_strategy_complexity       0
average_user_rating               0
categories                        0
comment_count                     0
price                             0
price_text                        0
rank                              0
year_published                 2119
dtype: int64

In [108]:
len(categ_df)

15270

Unnamed: 0,active,amazon_rank,artists,availability_status,average_learning_complexity,average_strategy_complexity,average_user_rating,categories,comment_count,commentary,...,thumb_url,trending_rank,type,upc,url,video_links,visits,weight_amount,weight_units,year_published
0,True,559911.0,"[Mike Atkinson, Jared Blando, Charles Kibler, ...",,3.0,4.5,4.006944,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/o6knI5ct0u...,,1869,,,1986.0
1,True,,[Mike Hutton],,0.0,0.0,4.111111,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,817054011704.0,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1048,,,2013.0
2,True,,[Brigette Indelicato],,3.0,4.0,4.454545,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,856,,,2020.0
3,True,473334.0,[Klemens Franz],,0.0,0.0,4.0,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/eJuGpFzljd...,,498,,,2016.0
4,True,,[],,0.0,0.0,4.2,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/IKKs90JOkr...,,367,,,2016.0


In [67]:
# Number of unique board games
len(categ_df['name'].unique())

7592

In [71]:
# Number of unique board games
sum(categ_df.duplicated(subset=['name', 'year_published']))

7647

In [74]:
sum(categ_df.duplicated(subset=['name']))

7678

In [75]:
# Remove duplicates
no_duplicate_categ_df = categ_df[~categ_df.duplicated(subset=['name'])]

In [76]:
len(no_duplicate_categ_df)

7592

In [90]:
# Lowercase name
no_duplicate_categ_df['name_lower']=no_duplicate_categ_df['name'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [91]:
no_duplicate_categ_df.head()

Unnamed: 0,active,amazon_rank,artists,availability_status,average_learning_complexity,average_strategy_complexity,average_user_rating,categories,comment_count,commentary,...,trending_rank,type,upc,url,video_links,visits,weight_amount,weight_units,year_published,name_lower
0,True,559911.0,"[Mike Atkinson, Jared Blando, Charles Kibler, ...",,3.0,4.5,4.006944,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,0,game,,https://www.boardgameatlas.com/game/o6knI5ct0u...,,1869,,,1986.0,1830: railways & robber barons
1,True,,[Mike Hutton],,0.0,0.0,4.111111,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,0,game,817054011704.0,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1048,,,2013.0,1862: railway mania in the eastern counties
2,True,,[Brigette Indelicato],,3.0,4.0,4.454545,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,0,game,,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,856,,,2020.0,18chesapeake
3,True,473334.0,[Klemens Franz],,0.0,0.0,4.0,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,0,game,,https://www.boardgameatlas.com/game/eJuGpFzljd...,,498,,,2016.0,1844 / 1854
4,True,,[],,0.0,0.0,4.2,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,...,0,game,,https://www.boardgameatlas.com/game/IKKs90JOkr...,,367,,,2016.0,1822: the railways of great britain


In [77]:
no_duplicate_categ_df['name'].head()

0                 1830: Railways & Robber Barons
1    1862: Railway Mania in the Eastern Counties
2                                   18Chesapeake
3                                    1844 / 1854
4            1822: The Railways of Great Britain
Name: name, dtype: object

**III. Join**

In [97]:
united_df = pd.merge(board_games_raw, no_duplicate_categ_df, how='inner', left_on='name_lower', right_on='name_lower')

In [98]:
len(united_df)

1333

In [92]:
board_games_raw['name'].head()

0        Die Macher
1      Dragonmaster
2           Samurai
3    Tal der Könige
4           Acquire
Name: name, dtype: object

In [223]:

united_df = pd.merge(board_games_raw, no_duplicate_categ_df, how='left', left_on='name', right_on='name')

In [225]:
len(united_df)

10532

In [229]:
len(united_df) - united_df['id'].isnull().sum()

1266

In [248]:
united_df['id'].isnull().sum()

9266

In [252]:
pop_df = pd.DataFrame()
for offset in list(range(0,11)):
    pop_url = 'https://api.boardgameatlas.com/api/search?limit=100&client_id=6MXavgYBke&skip={}'.format(num*100)
    pop_response = requests.get(pop_url)
    pop_dict = json.loads(pop_response.text)
    pop_df = pop_df.append(pd.DataFrame(pop_dict['games']))
    time.sleep(1)

In [253]:
len(pop_df)

1100

In [254]:
len(no_duplicate_categ_df)

7421

In [255]:
combined_df = no_duplicate_categ_df.append(pop_df)
len(combined_df)

8521

In [256]:
combined_df = combined_df[~combined_df.duplicated(subset=['name'])]
len(combined_df)

7475

In [257]:
united_df = pd.merge(board_games_raw, combined_df, how='left', left_on='name', right_on='name')
print(len(united_df))
len(united_df) - united_df['id'].isnull().sum()

10532


1301