**Import libraries**

In [1]:
import pandas as pd
import requests
import json
import time
import math

## **I. Data from tidyTuesday**

####  1. Import and check data

In [159]:
# Import and check data
board_games_raw = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-12/board_games.csv")

In [77]:
# Check data
board_games_raw.head()

Unnamed: 0,game_id,description,image,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,thumbnail,year_published,artist,category,compilation,designer,expansion,family,mechanic,publisher,average_rating,users_rated
0,1,Die Macher is a game about seven sequential po...,//cf.geekdo-images.com/images/pic159509.jpg,5,240,14,3,240,Die Macher,240,//cf.geekdo-images.com/images/pic159509_t.jpg,1986,Marcus Gschwendtner,"Economic,Negotiation,Political",,Karl-Heinz Schmiel,,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498
1,2,Dragonmaster is a trick-taking card game based...,//cf.geekdo-images.com/images/pic184174.jpg,4,30,12,3,30,Dragonmaster,30,//cf.geekdo-images.com/images/pic184174_t.jpg,1981,Bob Pepper,"Card Game,Fantasy",,"G. W. ""Jerry"" D'Arcey",,Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478
2,3,"Part of the Knizia tile-laying trilogy, Samura...",//cf.geekdo-images.com/images/pic3211873.jpg,4,60,10,2,30,Samurai,60,//cf.geekdo-images.com/images/pic3211873_t.jpg,1998,Franz Vohwinkel,"Abstract Strategy,Medieval",,Reiner Knizia,,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019
3,4,When you see the triangular box and the luxuri...,//cf.geekdo-images.com/images/pic285299.jpg,4,60,12,2,60,Tal der Könige,60,//cf.geekdo-images.com/images/pic285299_t.jpg,1992,,Ancient,,Christian Beierer,,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314
4,5,"In Acquire, each player strategically invests ...",//cf.geekdo-images.com/images/pic342163.jpg,6,90,12,3,90,Acquire,90,//cf.geekdo-images.com/images/pic342163_t.jpg,1964,"Scott Okumura,Peter Whitley",Economic,,Sid Sackson,,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195


In [160]:
len(board_games_raw)

10532

#### 2. Remove duplicates

In [161]:
# Number of duplicate board games
sum(board_games_raw.duplicated(subset=['name']))

175

In [162]:
# Number of duplicate board games
sum(board_games_raw.duplicated(subset=['name', 'year_published']))

2

In [163]:
# Remove duplicates for name + year_published
board_games_raw = board_games_raw[~board_games_raw.duplicated(subset=['name', 'year_published'])]

In [164]:
len(board_games_raw)

10530

#### 3. Keep only the needed fields

In [165]:
# Check number of nulls
board_games_raw.isnull().sum()

game_id               0
description           0
image                 1
max_players           0
max_playtime          0
min_age               0
min_players           0
min_playtime          0
name                  0
playing_time          0
thumbnail             1
year_published        0
artist             2773
category             94
compilation       10120
designer            126
expansion          7778
family             2808
mechanic            949
publisher             3
average_rating        0
users_rated           0
dtype: int64

In [166]:
# Feature list
board_games_raw.dtypes

game_id             int64
description        object
image              object
max_players         int64
max_playtime        int64
min_age             int64
min_players         int64
min_playtime        int64
name               object
playing_time        int64
thumbnail          object
year_published      int64
artist             object
category           object
compilation        object
designer           object
expansion          object
family             object
mechanic           object
publisher          object
average_rating    float64
users_rated         int64
dtype: object

In [167]:
# Remove fields with high null rate
board_games_raw = board_games_raw.loc[:, ~board_games_raw.columns.isin(['image', 'thumbnail', 'compilation', 'expansion'])]

#### 4. Data cleaning

In [168]:
# Lowercase name
board_games_raw['name_lower']=board_games_raw['name'].str.lower()

In [169]:
# Todo: make list of family, mechanic, publisher like below but here we need to keep it as a sting to save it as csv
# board_games_raw['family_split']=board_games_raw['family'].str.split(',')
# board_games_raw['family_split'][0][0]

#### 5. Save dataset

In [170]:
board_games_raw.head()

Unnamed: 0,game_id,description,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,year_published,artist,category,designer,family,mechanic,publisher,average_rating,users_rated,name_lower
0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,1986,Marcus Gschwendtner,"Economic,Negotiation,Political",Karl-Heinz Schmiel,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498,die macher
1,2,Dragonmaster is a trick-taking card game based...,4,30,12,3,30,Dragonmaster,30,1981,Bob Pepper,"Card Game,Fantasy","G. W. ""Jerry"" D'Arcey",Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478,dragonmaster
2,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,1998,Franz Vohwinkel,"Abstract Strategy,Medieval",Reiner Knizia,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019,samurai
3,4,When you see the triangular box and the luxuri...,4,60,12,2,60,Tal der Könige,60,1992,,Ancient,Christian Beierer,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314,tal der könige
4,5,"In Acquire, each player strategically invests ...",6,90,12,3,90,Acquire,90,1964,"Scott Okumura,Peter Whitley",Economic,Sid Sackson,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195,acquire


In [171]:
# Save dataset 
board_games_raw.to_csv("board_games_raw.csv", sep=';' , quotechar='"')

## **II. Call API**

#### 1. Get list of board game categories

In [95]:
# Get category list - Sample
categ_url = 'https://api.boardgameatlas.com/api/game/categories?client_id=6MXavgYBke'
categ_response = requests.get(categ_url)
categ_dict = json.loads(categ_response.text)
list(categ_dict.values())[0][0]

{'id': '2bdFPJUvFo',
 'name': '18XX',
 'url': 'https://www.boardgameatlas.com/category/2bdFPJUvFo/18xx'}

In [97]:
# Get category ids
categ_ids = []
for cat_num in list(range(0,len(categ_dict['categories']))):
    categ_ids.append(categ_dict['categories'][cat_num]['id'])
categ_ids[1:15]

['85OKv8p5Ow',
 'hBqZ3Ar4RJ',
 'KUBCKBkGxV',
 'DjAhqEHOD0',
 '20iDvpbh7A',
 'tJxatX2ZbW',
 'nWDac9tQzt',
 'dghLhwyxVb',
 '4mOtRRwSoj',
 'a8NM5cugJX',
 'MWoxgHrOJD',
 'eFaACC6y2c',
 'CBboNLI1Uj',
 'k0dglq5j6N']

In [99]:
# Number of categories
len(categ_ids)

143

#### 2. Request board game data using the category ids 


In [100]:
# Sample request
categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories=2bdFPJUvFo'
response = requests.get(categ_url)
games_dict = json.loads(response.text)
list(games_dict.values())[0][0]

{'id': 'o6knI5ct0u',
 'handle': 'mayfair-games-1830-railways-and-robber-barons--north-east-us',
 'url': 'https://www.boardgameatlas.com/game/o6knI5ct0u/mayfair-games-1830-railways-and-robber-barons--north-east-us',
 'edit_url': 'https://www.boardgameatlas.com/game/o6knI5ct0u/edit',
 'name': '1830: Railways & Robber Barons',
 'price': '55.99',
 'price_ca': '74.99',
 'price_uk': '0.00',
 'price_au': '89.95',
 'msrp': 69.99,
 'msrps': [{'country': 'US', 'price': 69.99}],
 'discount': '0.20',
 'year_published': 1986,
 'min_players': 2,
 'max_players': 7,
 'min_playtime': 180,
 'max_playtime': 360,
 'min_age': 14,
 'description': "<p>1830. It is the dawn of the &quot;Age of Railroading&quot; in America. You're a wealthy investor and speculator betting that the new technology will revolutionize transport. Commerce will no longer depend on rutted roads and slow canals. Instead, it will ride the rails on swift, powerful &quot;Iron Horses.&quot;</p>\r\n<p><strong>1830</strong> is an acclaimed r

In [46]:
# Get board games
categ_df = pd.DataFrame()
for category_id in categ_ids:
    next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}'.format(category_id)
    next_response = requests.get(next_categ_url)
    next_games_dict = json.loads(next_response.text)
    categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
    time.sleep(1)
    if next_games_dict['count'] > 1000:
        for hundred in list(range(1,11)):
            limit = 100 * hundred
            next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}&skip={}'.format(category_id , limit)
            next_response = requests.get(next_categ_url)
            next_games_dict = json.loads(next_response.text)
            categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
            time.sleep(1)
    elif next_games_dict['count'] > 100:
        for hundred in list(range(1,(math.ceil(next_games_dict['count'] / 100)))):
            limit = 100 * hundred
            next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}&skip={}'.format(category_id , limit)
            next_response = requests.get(next_categ_url)
            next_games_dict = json.loads(next_response.text)
            categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
            time.sleep(1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [102]:
pd.options.display.max_columns = None
display(categ_df.head())

Unnamed: 0,active,amazon_rank,artists,availability_status,average_learning_complexity,average_strategy_complexity,average_user_rating,categories,comment_count,commentary,cs_rating,description,description_preview,designers,developers,discount,edit_url,faq,handle,historical_low_prices,id,image_url,images,is_historical_low,isbn,links,listing_clicks,lists,matches_specs,max_players,max_playtime,mechanics,mentions,min_age,min_players,min_playtime,msrp,msrp_text,msrps,name,names,num_distributors,num_user_complexity_votes,num_user_ratings,official_url,players,plays,playtime,price,price_au,price_ca,price_text,price_uk,primary_designer,primary_publisher,publishers,rank,related_to,rules_url,sell_sheet_url,size_depth,size_height,size_units,size_width,sku,sku_objects,skus,specs,store_images_url,tags,thumb_url,trending_rank,type,upc,url,video_links,visits,weight_amount,weight_units,year_published
0,True,559911.0,"[Mike Atkinson, Jared Blando, Charles Kibler, ...",,3.0,4.5,4.006944,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1830. It is the dawn of the &quot;Age of Ra...,"1830. It is the dawn of the ""Age of Railroadi...","[{'id': 'RpZJ6vqsPR', 'num_games': None, 'scor...",[],0.2,https://www.boardgameatlas.com/game/o6knI5ct0u...,,mayfair-games-1830-railways-and-robber-barons-...,"[{'country': 'UK', 'date': '2021-12-18T12:07:5...",o6knI5ct0u,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,273,,7.0,360.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",3,14.0,2.0,180.0,69.99,$69.99,"[{'country': 'US', 'price': 69.99}]",1830: Railways & Robber Barons,[],1,2,36,http://www.mayfairgames.com/products/1830-nort...,2-7,240,180-360,55.99,89.95,74.99,$55.99,0.0,"{'id': 'RpZJ6vqsPR', 'name': 'Francis Tresham'...","{'id': 'w1jOf2uAOD', 'name': 'The Avalon Hill ...","[{'id': 'w1jOf2uAOD', 'num_games': None, 'scor...",874,[],https://lookout-spiele.de/wp-content/uploads/1...,,,,,,,,,[],,"[1830: Ferrovie e Capitani d'Industria, 1830: ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,98,game,,https://www.boardgameatlas.com/game/o6knI5ct0u...,,1894,,,1986.0
1,True,125448.0,"[Chris Lawson, Rodger B. MacGowan, Kurt Miller...",,4.0,4.0,3.885135,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1846 - the midwest railroad boom has gone b...,1846 - the midwest railroad boom has gone bus...,"[{'id': 'CeUZNYVdQj', 'num_games': None, 'scor...",[],0.32,https://www.boardgameatlas.com/game/eSCVHuUFPs...,,1846-the-race-for-the-midwest,"[{'country': 'UK', 'date': '2021-12-14T18:14:1...",eSCVHuUFPs,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,226,,5.0,240.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",1,14.0,3.0,240.0,69.0,$69.00,"[{'country': 'US', 'price': 69}, {'country': '...",1846: The Race for the Midwest,[],3,1,37,https://www.gmtgames.com/p-847-1846-the-race-t...,3-5,313,240,46.99,0.0,67.95,$46.99,0.0,"{'id': 'CeUZNYVdQj', 'name': 'Thomas Lehmann',...","{'id': 'VKIPDDgZ2X', 'name': 'Deep Thought Gam...","[{'id': 'VKIPDDgZ2X', 'num_games': None, 'scor...",927,[],https://s3-us-west-2.amazonaws.com/gmtwebsitea...,,,,,,GMT1605,"[{'name': 'Noble Knight Games', 'sku': '214918...","[2149184041, 2149209107]",[],,"[1846 The Race to the Midwest Printing, 1846: ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,817054011155.0,https://www.boardgameatlas.com/game/eSCVHuUFPs...,,1072,,,2005.0
2,True,,[Mike Hutton],,0.0,0.0,4.111111,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1862: Railway Mania in the Eastern Counties...,1862: Railway Mania in the Eastern Counties i...,"[{'id': 'xMrDEmBdTo', 'num_games': None, 'scor...",[],0.27,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1862-railway-mania-in-the-eastern-counties,"[{'country': 'UK', 'date': '2021-12-11T11:48:1...",uEPtE5OOOU,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,1,0,120,,8.0,300.0,"[{'id': 'qCXa8MX0wk', 'url': 'https://www.boar...",13,12.0,1.0,300.0,79.0,$79.00,"[{'country': 'US', 'price': 79}]",1862: Railway Mania in the Eastern Counties,[],2,0,9,https://www.gmtgames.com/p-692-1862-railway-ma...,1-8,70,300,57.99,127.99,79.95,$57.99,0.0,"{'name': 'Mike Hutton', 'id': 'xMrDEmBdTo', 'u...","{'name': 'GMT Games', 'id': 'd5oY0duBgG', 'url...","[{'id': 'd5oY0duBgG', 'num_games': None, 'scor...",9999999,[],https://gmtwebsiteassets.s3-us-west-2.amazonaw...,,3.0,9.0,,,GMT1904,"[{'name': 'Noble Knight Games', 'sku': '214919...",[2149191519],[],,"[1862: Railway Mania in the Eastern Counties, ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,817054011704.0,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1053,,,2013.0
3,True,,[Brigette Indelicato],,3.0,4.0,4.454545,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,"<p>18Chesapeake is a member of the <a href=""ht...",18Chesapeake is a member of the 18xx series...,"[{'id': 'fN0XCgUAPZ', 'num_games': None, 'scor...",[],0.0,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,18chesapeake,"[{'country': 'US', 'date': '2022-02-08T01:34:2...",Krn8i8C0fI,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,171,,6.0,180.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",1,13.0,2.0,180.0,89.0,$89.00,"[{'country': 'US', 'price': 89}, {'country': '...",18Chesapeake,[],0,1,11,https://all-aboardgames.com/products/us-only-1...,2-6,158,180,0.0,0.0,0.0,Price: N/A,0.0,"{'id': 'fN0XCgUAPZ', 'name': 'Scott Petersen',...","{'id': 'ZOP4wDStJq', 'name': 'All-Aboard Games...","[{'id': 'ZOP4wDStJq', 'num_games': None, 'scor...",9999999,[],,,,,,,,,,[],,[18Chesapeake],https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,862,,,2020.0
4,True,473334.0,[Klemens Franz],,0.0,0.0,4.0,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>The railway history of both Switzerland and...,The railway history of both Switzerland and A...,"[{'id': 'gFwNqit3MM', 'num_games': None, 'scor...",[],0.26,https://www.boardgameatlas.com/game/eJuGpFzljd...,,184454-switzerland-and-austria-board-game,"[{'country': 'UK', 'date': '2021-12-03T08:56:5...",eJuGpFzljd,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,51,,7.0,300.0,"[{'id': 'ohABM4GjbC', 'url': 'https://www.boar...",0,12.0,3.0,300.0,94.99,$94.99,"[{'country': 'UK', 'price': 87.99}, {'country'...",1844 / 1854,[],0,0,6,,3-7,5,300,69.99,0.0,112.95,$69.99,0.0,"{'name': 'Helmut Ohley', 'id': 'gFwNqit3MM', '...","{'name': 'Mayfair Games', 'id': '7GTti1NuCH', ...","[{'id': '7GTti1NuCH', 'num_games': None, 'scor...",9999999,[],,,,,,,,,,[],,"[1844/1854 Switzerland/Austria, 1844/1854, 184...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/eJuGpFzljd...,,500,,,2016.0


In [106]:
# Number of games 
len(categ_df)

15699

In [105]:
# Feature list
categ_df.dtypes

active                            bool
amazon_rank                    float64
artists                         object
availability_status             object
average_learning_complexity    float64
average_strategy_complexity    float64
average_user_rating            float64
categories                      object
comment_count                    int64
commentary                      object
cs_rating                      float64
description                     object
description_preview             object
designers                       object
developers                      object
discount                        object
edit_url                        object
faq                             object
handle                          object
historical_low_prices           object
id                              object
image_url                       object
images                          object
is_historical_low                 bool
isbn                            object
links                    

#### 3. Keep only the needed fields

In [158]:
# Keep only useful fields
categ_df_short = categ_df.loc[:, categ_df.columns.isin([
    'artists',
    'average_learning_complexity',
    'average_strategy_complexity',
    'average_user_rating',
    'categories',
    'comment_count',
    'description',
    'description_preview',
    'designers',
    'discount',
    'id',
    'is_historical_low',
    'listing_clicks',
    'max_players',
    'max_playtime',
    'min_age',
    'min_players',
    'min_playtime',
    'name',
    'num_user_ratings',
    'players',
    'plays',
    'playtime',
    'price',
    'price_au',
    'price_ca',
    'price_uk',
    'primary_publisher',
    'publishers',
    'visits',
    'year_published'  
])]

In [104]:
categ_df_short.isnull().sum()

artists                           0
average_learning_complexity       0
average_strategy_complexity       0
average_user_rating               0
categories                        0
comment_count                     0
description                       0
description_preview               0
designers                         0
discount                          0
id                                0
is_historical_low                 0
listing_clicks                    0
max_players                    1361
max_playtime                   1915
min_age                        2082
min_players                    1361
min_playtime                   1912
msrp                              0
name                              0
num_user_ratings                  0
players                        1396
plays                             0
playtime                       2073
price                             0
price_au                          0
price_ca                          0
price_uk                    

#### 4. Data cleaning

In [131]:
# Join lists / dictionaries to string
categ_df_short['artists_mod'] = ''.join(str(e) for e in categ_df_short['artists'])
categ_df_short['primary_publisher_mod'] = ''.join(str(e) for e in categ_df_short['primary_publisher'])
categ_df_short['categories_mod'] = ''.join(str(e) for e in categ_df_short['categories'])
categ_df_short['designers_mod'] = ''.join(str(e) for e in categ_df_short['designers'])
categ_df_short['publishers_mod'] = ''.join(str(e) for e in categ_df_short['publishers'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [132]:
# Lowercase name
categ_df_short['name_lower']=categ_df_short['name'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [133]:
# Remove original fields
categ_df_short = categ_df_short.loc[:, ~categ_df_short.columns.isin(['artists', 'primary_publisher', 'categories', 'designers', 'publishers'])]

In [134]:
# Feature list
categ_df_short.dtypes

average_learning_complexity    float64
average_strategy_complexity    float64
average_user_rating            float64
comment_count                    int64
description                     object
description_preview             object
discount                        object
id                              object
is_historical_low                 bool
listing_clicks                   int64
max_players                    float64
max_playtime                   float64
min_age                        float64
min_players                    float64
min_playtime                   float64
msrp                            object
name                            object
num_user_ratings                 int64
players                         object
plays                            int64
playtime                        object
price                           object
price_au                        object
price_ca                        object
price_uk                        object
visits                   

In [124]:
categ_df_short.head()

Unnamed: 0,average_learning_complexity,average_strategy_complexity,average_user_rating,comment_count,description,description_preview,discount,id,is_historical_low,listing_clicks,max_players,max_playtime,min_age,min_players,min_playtime,msrp,name,num_user_ratings,players,plays,playtime,price,price_au,price_ca,price_uk,visits,year_published,artists_mod,primary_publisher_mod,categories_mod,designers_mod,publishers_mod,name_lower
0,3.0,4.5,4.006944,0,<p>1830. It is the dawn of the &quot;Age of Ra...,"1830. It is the dawn of the ""Age of Railroadi...",0.2,o6knI5ct0u,False,0,7.0,360.0,14.0,2.0,180.0,69.99,1830: Railways & Robber Barons,36,2-7,240,180-360,55.99,89.95,74.99,0.0,1894,1986.0,"['Mike Atkinson', 'Jared Blando', 'Charles Kib...","{'id': 'w1jOf2uAOD', 'name': 'The Avalon Hill ...","[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...","[{'id': 'RpZJ6vqsPR', 'num_games': None, 'scor...","[{'id': 'w1jOf2uAOD', 'num_games': None, 'scor...",1830: railways & robber barons
1,4.0,4.0,3.885135,0,<p>1846 - the midwest railroad boom has gone b...,1846 - the midwest railroad boom has gone bus...,0.32,eSCVHuUFPs,False,0,5.0,240.0,14.0,3.0,240.0,69.0,1846: The Race for the Midwest,37,3-5,313,240,46.99,0.0,67.95,0.0,1072,2005.0,"['Mike Atkinson', 'Jared Blando', 'Charles Kib...","{'id': 'w1jOf2uAOD', 'name': 'The Avalon Hill ...","[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...","[{'id': 'RpZJ6vqsPR', 'num_games': None, 'scor...","[{'id': 'w1jOf2uAOD', 'num_games': None, 'scor...",1846: the race for the midwest
2,0.0,0.0,4.111111,0,<p>1862: Railway Mania in the Eastern Counties...,1862: Railway Mania in the Eastern Counties i...,0.27,uEPtE5OOOU,False,0,8.0,300.0,12.0,1.0,300.0,79.0,1862: Railway Mania in the Eastern Counties,9,1-8,70,300,57.99,127.99,79.95,0.0,1053,2013.0,"['Mike Atkinson', 'Jared Blando', 'Charles Kib...","{'id': 'w1jOf2uAOD', 'name': 'The Avalon Hill ...","[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...","[{'id': 'RpZJ6vqsPR', 'num_games': None, 'scor...","[{'id': 'w1jOf2uAOD', 'num_games': None, 'scor...",1862: railway mania in the eastern counties
3,3.0,4.0,4.454545,0,"<p>18Chesapeake is a member of the <a href=""ht...",18Chesapeake is a member of the 18xx series...,0.0,Krn8i8C0fI,False,0,6.0,180.0,13.0,2.0,180.0,89.0,18Chesapeake,11,2-6,158,180,0.0,0.0,0.0,0.0,862,2020.0,"['Mike Atkinson', 'Jared Blando', 'Charles Kib...","{'id': 'w1jOf2uAOD', 'name': 'The Avalon Hill ...","[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...","[{'id': 'RpZJ6vqsPR', 'num_games': None, 'scor...","[{'id': 'w1jOf2uAOD', 'num_games': None, 'scor...",18chesapeake
4,0.0,0.0,4.0,0,<p>The railway history of both Switzerland and...,The railway history of both Switzerland and A...,0.26,eJuGpFzljd,False,0,7.0,300.0,12.0,3.0,300.0,94.99,1844 / 1854,6,3-7,5,300,69.99,0.0,112.95,0.0,500,2016.0,"['Mike Atkinson', 'Jared Blando', 'Charles Kib...","{'id': 'w1jOf2uAOD', 'name': 'The Avalon Hill ...","[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...","[{'id': 'RpZJ6vqsPR', 'num_games': None, 'scor...","[{'id': 'w1jOf2uAOD', 'num_games': None, 'scor...",1844 / 1854


#### 5. Remove duplicates

In [125]:
# Number of duplicate board games
sum(categ_df_short.duplicated(subset=['name', 'year_published']))

7946

In [126]:
# Number of duplicate board games
sum(categ_df_short.duplicated(subset=['name']))

7977

In [127]:
# Remove duplicates
no_duplicate_categ_df = categ_df[~categ_df.duplicated(subset=['name', 'year_published'])]

In [128]:
len(no_duplicate_categ_df)

7753

In [129]:
no_duplicate_categ_df.head()

Unnamed: 0,active,amazon_rank,artists,availability_status,average_learning_complexity,average_strategy_complexity,average_user_rating,categories,comment_count,commentary,cs_rating,description,description_preview,designers,developers,discount,edit_url,faq,handle,historical_low_prices,id,image_url,images,is_historical_low,isbn,links,listing_clicks,lists,matches_specs,max_players,max_playtime,mechanics,mentions,min_age,min_players,min_playtime,msrp,msrp_text,msrps,name,names,num_distributors,num_user_complexity_votes,num_user_ratings,official_url,players,plays,playtime,price,price_au,price_ca,price_text,price_uk,primary_designer,primary_publisher,publishers,rank,related_to,rules_url,sell_sheet_url,size_depth,size_height,size_units,size_width,sku,sku_objects,skus,specs,store_images_url,tags,thumb_url,trending_rank,type,upc,url,video_links,visits,weight_amount,weight_units,year_published
0,True,559911.0,"[Mike Atkinson, Jared Blando, Charles Kibler, ...",,3.0,4.5,4.006944,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1830. It is the dawn of the &quot;Age of Ra...,"1830. It is the dawn of the ""Age of Railroadi...","[{'id': 'RpZJ6vqsPR', 'num_games': None, 'scor...",[],0.2,https://www.boardgameatlas.com/game/o6knI5ct0u...,,mayfair-games-1830-railways-and-robber-barons-...,"[{'country': 'UK', 'date': '2021-12-18T12:07:5...",o6knI5ct0u,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,273,,7.0,360.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",3,14.0,2.0,180.0,69.99,$69.99,"[{'country': 'US', 'price': 69.99}]",1830: Railways & Robber Barons,[],1,2,36,http://www.mayfairgames.com/products/1830-nort...,2-7,240,180-360,55.99,89.95,74.99,$55.99,0.0,"{'id': 'RpZJ6vqsPR', 'name': 'Francis Tresham'...","{'id': 'w1jOf2uAOD', 'name': 'The Avalon Hill ...","[{'id': 'w1jOf2uAOD', 'num_games': None, 'scor...",874,[],https://lookout-spiele.de/wp-content/uploads/1...,,,,,,,,,[],,"[1830: Ferrovie e Capitani d'Industria, 1830: ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,98,game,,https://www.boardgameatlas.com/game/o6knI5ct0u...,,1894,,,1986.0
1,True,125448.0,"[Chris Lawson, Rodger B. MacGowan, Kurt Miller...",,4.0,4.0,3.885135,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1846 - the midwest railroad boom has gone b...,1846 - the midwest railroad boom has gone bus...,"[{'id': 'CeUZNYVdQj', 'num_games': None, 'scor...",[],0.32,https://www.boardgameatlas.com/game/eSCVHuUFPs...,,1846-the-race-for-the-midwest,"[{'country': 'UK', 'date': '2021-12-14T18:14:1...",eSCVHuUFPs,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,226,,5.0,240.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",1,14.0,3.0,240.0,69.0,$69.00,"[{'country': 'US', 'price': 69}, {'country': '...",1846: The Race for the Midwest,[],3,1,37,https://www.gmtgames.com/p-847-1846-the-race-t...,3-5,313,240,46.99,0.0,67.95,$46.99,0.0,"{'id': 'CeUZNYVdQj', 'name': 'Thomas Lehmann',...","{'id': 'VKIPDDgZ2X', 'name': 'Deep Thought Gam...","[{'id': 'VKIPDDgZ2X', 'num_games': None, 'scor...",927,[],https://s3-us-west-2.amazonaws.com/gmtwebsitea...,,,,,,GMT1605,"[{'name': 'Noble Knight Games', 'sku': '214918...","[2149184041, 2149209107]",[],,"[1846 The Race to the Midwest Printing, 1846: ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,817054011155.0,https://www.boardgameatlas.com/game/eSCVHuUFPs...,,1072,,,2005.0
2,True,,[Mike Hutton],,0.0,0.0,4.111111,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1862: Railway Mania in the Eastern Counties...,1862: Railway Mania in the Eastern Counties i...,"[{'id': 'xMrDEmBdTo', 'num_games': None, 'scor...",[],0.27,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1862-railway-mania-in-the-eastern-counties,"[{'country': 'UK', 'date': '2021-12-11T11:48:1...",uEPtE5OOOU,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,1,0,120,,8.0,300.0,"[{'id': 'qCXa8MX0wk', 'url': 'https://www.boar...",13,12.0,1.0,300.0,79.0,$79.00,"[{'country': 'US', 'price': 79}]",1862: Railway Mania in the Eastern Counties,[],2,0,9,https://www.gmtgames.com/p-692-1862-railway-ma...,1-8,70,300,57.99,127.99,79.95,$57.99,0.0,"{'name': 'Mike Hutton', 'id': 'xMrDEmBdTo', 'u...","{'name': 'GMT Games', 'id': 'd5oY0duBgG', 'url...","[{'id': 'd5oY0duBgG', 'num_games': None, 'scor...",9999999,[],https://gmtwebsiteassets.s3-us-west-2.amazonaw...,,3.0,9.0,,,GMT1904,"[{'name': 'Noble Knight Games', 'sku': '214919...",[2149191519],[],,"[1862: Railway Mania in the Eastern Counties, ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,817054011704.0,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1053,,,2013.0
3,True,,[Brigette Indelicato],,3.0,4.0,4.454545,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,"<p>18Chesapeake is a member of the <a href=""ht...",18Chesapeake is a member of the 18xx series...,"[{'id': 'fN0XCgUAPZ', 'num_games': None, 'scor...",[],0.0,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,18chesapeake,"[{'country': 'US', 'date': '2022-02-08T01:34:2...",Krn8i8C0fI,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,171,,6.0,180.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",1,13.0,2.0,180.0,89.0,$89.00,"[{'country': 'US', 'price': 89}, {'country': '...",18Chesapeake,[],0,1,11,https://all-aboardgames.com/products/us-only-1...,2-6,158,180,0.0,0.0,0.0,Price: N/A,0.0,"{'id': 'fN0XCgUAPZ', 'name': 'Scott Petersen',...","{'id': 'ZOP4wDStJq', 'name': 'All-Aboard Games...","[{'id': 'ZOP4wDStJq', 'num_games': None, 'scor...",9999999,[],,,,,,,,,,[],,[18Chesapeake],https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,862,,,2020.0
4,True,473334.0,[Klemens Franz],,0.0,0.0,4.0,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>The railway history of both Switzerland and...,The railway history of both Switzerland and A...,"[{'id': 'gFwNqit3MM', 'num_games': None, 'scor...",[],0.26,https://www.boardgameatlas.com/game/eJuGpFzljd...,,184454-switzerland-and-austria-board-game,"[{'country': 'UK', 'date': '2021-12-03T08:56:5...",eJuGpFzljd,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,51,,7.0,300.0,"[{'id': 'ohABM4GjbC', 'url': 'https://www.boar...",0,12.0,3.0,300.0,94.99,$94.99,"[{'country': 'UK', 'price': 87.99}, {'country'...",1844 / 1854,[],0,0,6,,3-7,5,300,69.99,0.0,112.95,$69.99,0.0,"{'name': 'Helmut Ohley', 'id': 'gFwNqit3MM', '...","{'name': 'Mayfair Games', 'id': '7GTti1NuCH', ...","[{'id': '7GTti1NuCH', 'num_games': None, 'scor...",9999999,[],,,,,,,,,,[],,"[1844/1854 Switzerland/Austria, 1844/1854, 184...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/eJuGpFzljd...,,500,,,2016.0


##### 6. Save dataset

In [136]:
# Save dataset 
no_duplicate_categ_df.to_csv("board_games_api.csv", sep=';' , quotechar='"')

## **III. Join datasets**

In [172]:
# Import datasets
board_games = pd.read_csv("board_games_raw.csv", sep=';')
# board_games.columns = [str(col) + '_base' for col in board_games.columns]
board_games.head()

Unnamed: 0.1,Unnamed: 0,game_id,description,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,year_published,artist,category,designer,family,mechanic,publisher,average_rating,users_rated,name_lower
0,0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,1986,Marcus Gschwendtner,"Economic,Negotiation,Political",Karl-Heinz Schmiel,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498,die macher
1,1,2,Dragonmaster is a trick-taking card game based...,4,30,12,3,30,Dragonmaster,30,1981,Bob Pepper,"Card Game,Fantasy","G. W. ""Jerry"" D'Arcey",Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478,dragonmaster
2,2,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,1998,Franz Vohwinkel,"Abstract Strategy,Medieval",Reiner Knizia,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019,samurai
3,3,4,When you see the triangular box and the luxuri...,4,60,12,2,60,Tal der Könige,60,1992,,Ancient,Christian Beierer,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314,tal der könige
4,4,5,"In Acquire, each player strategically invests ...",6,90,12,3,90,Acquire,90,1964,"Scott Okumura,Peter Whitley",Economic,Sid Sackson,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195,acquire


In [173]:
# Import datasets
board_games_api = pd.read_csv("board_games_api.csv", sep=';')
# board_games_api.columns = [str(col) + '_api' for col in board_games_api.columns]
board_games_api.head()

Unnamed: 0.1,Unnamed: 0,active,amazon_rank,artists,availability_status,average_learning_complexity,average_strategy_complexity,average_user_rating,categories,comment_count,commentary,cs_rating,description,description_preview,designers,developers,discount,edit_url,faq,handle,historical_low_prices,id,image_url,images,is_historical_low,isbn,links,listing_clicks,lists,matches_specs,max_players,max_playtime,mechanics,mentions,min_age,min_players,min_playtime,msrp,msrp_text,msrps,name,names,num_distributors,num_user_complexity_votes,num_user_ratings,official_url,players,plays,playtime,price,price_au,price_ca,price_text,price_uk,primary_designer,primary_publisher,publishers,rank,related_to,rules_url,sell_sheet_url,size_depth,size_height,size_units,size_width,sku,sku_objects,skus,specs,store_images_url,tags,thumb_url,trending_rank,type,upc,url,video_links,visits,weight_amount,weight_units,year_published
0,0,True,559911.0,"['Mike Atkinson', 'Jared Blando', 'Charles Kib...",,3.0,4.5,4.006944,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1830. It is the dawn of the &quot;Age of Ra...,"1830. It is the dawn of the ""Age of Railroadi...","[{'id': 'RpZJ6vqsPR', 'num_games': None, 'scor...",[],0.2,https://www.boardgameatlas.com/game/o6knI5ct0u...,,mayfair-games-1830-railways-and-robber-barons-...,"[{'country': 'UK', 'date': '2021-12-18T12:07:5...",o6knI5ct0u,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,273,,7.0,360.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",3,14.0,2.0,180.0,69.99,$69.99,"[{'country': 'US', 'price': 69.99}]",1830: Railways & Robber Barons,[],1,2,36,http://www.mayfairgames.com/products/1830-nort...,2-7,240,180-360,55.99,89.95,74.99,$55.99,0.0,"{'id': 'RpZJ6vqsPR', 'name': 'Francis Tresham'...","{'id': 'w1jOf2uAOD', 'name': 'The Avalon Hill ...","[{'id': 'w1jOf2uAOD', 'num_games': None, 'scor...",874,[],https://lookout-spiele.de/wp-content/uploads/1...,,,,,,,,,[],,"[""1830: Ferrovie e Capitani d'Industria"", '183...",https://s3-us-west-1.amazonaws.com/5cc.images/...,98,game,,https://www.boardgameatlas.com/game/o6knI5ct0u...,,1894,,,1986.0
1,1,True,125448.0,"['Chris Lawson', 'Rodger B. MacGowan', 'Kurt M...",,4.0,4.0,3.885135,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1846 - the midwest railroad boom has gone b...,1846 - the midwest railroad boom has gone bus...,"[{'id': 'CeUZNYVdQj', 'num_games': None, 'scor...",[],0.32,https://www.boardgameatlas.com/game/eSCVHuUFPs...,,1846-the-race-for-the-midwest,"[{'country': 'UK', 'date': '2021-12-14T18:14:1...",eSCVHuUFPs,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,226,,5.0,240.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",1,14.0,3.0,240.0,69.0,$69.00,"[{'country': 'US', 'price': 69}, {'country': '...",1846: The Race for the Midwest,[],3,1,37,https://www.gmtgames.com/p-847-1846-the-race-t...,3-5,313,240,46.99,0.0,67.95,$46.99,0.0,"{'id': 'CeUZNYVdQj', 'name': 'Thomas Lehmann',...","{'id': 'VKIPDDgZ2X', 'name': 'Deep Thought Gam...","[{'id': 'VKIPDDgZ2X', 'num_games': None, 'scor...",927,[],https://s3-us-west-2.amazonaws.com/gmtwebsitea...,,,,,,GMT1605,"[{'name': 'Noble Knight Games', 'sku': '214918...","['2149184041', '2149209107']",[],,"['1846 The Race to the Midwest Printing', '184...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,817054011155.0,https://www.boardgameatlas.com/game/eSCVHuUFPs...,,1072,,,2005.0
2,2,True,,['Mike Hutton'],,0.0,0.0,4.111111,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1862: Railway Mania in the Eastern Counties...,1862: Railway Mania in the Eastern Counties i...,"[{'id': 'xMrDEmBdTo', 'num_games': None, 'scor...",[],0.27,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1862-railway-mania-in-the-eastern-counties,"[{'country': 'UK', 'date': '2021-12-11T11:48:1...",uEPtE5OOOU,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,1,0,120,,8.0,300.0,"[{'id': 'qCXa8MX0wk', 'url': 'https://www.boar...",13,12.0,1.0,300.0,79.0,$79.00,"[{'country': 'US', 'price': 79}]",1862: Railway Mania in the Eastern Counties,[],2,0,9,https://www.gmtgames.com/p-692-1862-railway-ma...,1-8,70,300,57.99,127.99,79.95,$57.99,0.0,"{'name': 'Mike Hutton', 'id': 'xMrDEmBdTo', 'u...","{'name': 'GMT Games', 'id': 'd5oY0duBgG', 'url...","[{'id': 'd5oY0duBgG', 'num_games': None, 'scor...",9999999,[],https://gmtwebsiteassets.s3-us-west-2.amazonaw...,,3.0,9.0,,,GMT1904,"[{'name': 'Noble Knight Games', 'sku': '214919...",['2149191519'],[],,['1862: Railway Mania in the Eastern Counties'...,https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,817054011704.0,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1053,,,2013.0
3,3,True,,['Brigette Indelicato'],,3.0,4.0,4.454545,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,"<p>18Chesapeake is a member of the <a href=""ht...",18Chesapeake is a member of the 18xx series...,"[{'id': 'fN0XCgUAPZ', 'num_games': None, 'scor...",[],0.0,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,18chesapeake,"[{'country': 'US', 'date': '2022-02-08T01:34:2...",Krn8i8C0fI,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,171,,6.0,180.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",1,13.0,2.0,180.0,89.0,$89.00,"[{'country': 'US', 'price': 89}, {'country': '...",18Chesapeake,[],0,1,11,https://all-aboardgames.com/products/us-only-1...,2-6,158,180,0.0,0.0,0.0,Price: N/A,0.0,"{'id': 'fN0XCgUAPZ', 'name': 'Scott Petersen',...","{'id': 'ZOP4wDStJq', 'name': 'All-Aboard Games...","[{'id': 'ZOP4wDStJq', 'num_games': None, 'scor...",9999999,[],,,,,,,,,,[],,['18Chesapeake'],https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,862,,,2020.0
4,4,True,473334.0,['Klemens Franz'],,0.0,0.0,4.0,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>The railway history of both Switzerland and...,The railway history of both Switzerland and A...,"[{'id': 'gFwNqit3MM', 'num_games': None, 'scor...",[],0.26,https://www.boardgameatlas.com/game/eJuGpFzljd...,,184454-switzerland-and-austria-board-game,"[{'country': 'UK', 'date': '2021-12-03T08:56:5...",eJuGpFzljd,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,51,,7.0,300.0,"[{'id': 'ohABM4GjbC', 'url': 'https://www.boar...",0,12.0,3.0,300.0,94.99,$94.99,"[{'country': 'UK', 'price': 87.99}, {'country'...",1844 / 1854,[],0,0,6,,3-7,5,300,69.99,0.0,112.95,$69.99,0.0,"{'name': 'Helmut Ohley', 'id': 'gFwNqit3MM', '...","{'name': 'Mayfair Games', 'id': '7GTti1NuCH', ...","[{'id': '7GTti1NuCH', 'num_games': None, 'scor...",9999999,[],,,,,,,,,,[],,"['1844/1854 Switzerland/Austria', '1844/1854',...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/eJuGpFzljd...,,500,,,2016.0


In [177]:
# Merge datasets on name and year_published
united_df = pd.merge(board_games, 
                     board_games_api, 
                     how='inner', 
                     left_on=['name', 'year_published'], 
                     right_on=['name', 'year_published'])

In [178]:
len(united_df)

1176

In [179]:
# Merge datasets on name and year_published
united_df2 = pd.merge(board_games_raw, 
                     board_games_api, 
                     how='inner', 
                     left_on=['name'], 
                     right_on=['name'])

In [156]:
len(united_df2)

1336

In [157]:
united_df.head()

Unnamed: 0,game_id,description_x,max_players_x,max_playtime_x,min_age_x,min_players_x,min_playtime_x,name,playing_time,year_published,artist,category,designer,family,mechanic,publisher,average_rating,users_rated,name_lower,active,amazon_rank,artists,availability_status,average_learning_complexity,average_strategy_complexity,average_user_rating,categories,comment_count,commentary,cs_rating,description_y,description_preview,designers,developers,discount,edit_url,faq,handle,historical_low_prices,id,image_url,images,is_historical_low,isbn,links,listing_clicks,lists,matches_specs,max_players_y,max_playtime_y,mechanics,mentions,min_age_y,min_players_y,min_playtime_y,msrp,msrp_text,msrps,names,num_distributors,num_user_complexity_votes,num_user_ratings,official_url,players,plays,playtime,price,price_au,price_ca,price_text,price_uk,primary_designer,primary_publisher,publishers,rank,related_to,rules_url,sell_sheet_url,size_depth,size_height,size_units,size_width,sku,sku_objects,skus,specs,store_images_url,tags,thumb_url,trending_rank,type,upc,url,video_links,visits,weight_amount,weight_units
0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,1986,Marcus Gschwendtner,"Economic,Negotiation,Political",Karl-Heinz Schmiel,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498,die macher,True,1320612.0,"[Marcus Gschwendtner, Harald Lieske]",,0.0,0.0,3.684783,"[{'id': 'N0TkEGfEsF', 'url': 'https://www.boar...",0,,,<p>The classic game of German politics for 3 –...,The classic game of German politics for 3 – 5...,"[{'id': 'JzHNgQv2Kn', 'num_games': None, 'scor...",[],0.0,https://www.boardgameatlas.com/game/3hGLSZ61Yk...,,die-macher-international-edition,"[{'country': 'US', 'date': '2022-03-05T03:33:2...",3hGLSZ61Yk,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,265,,5.0,240.0,"[{'id': '9NBcz45nN7', 'url': 'https://www.boar...",2,14.0,3.0,240.0,80.0,$80.00,"[{'country': 'US', 'price': 80}, {'country': '...",[],0,0,46,https://www.kickstarter.com/projects/indiegame...,3-5,161,240,0.0,0.0,0.0,Price: N/A,0.0,"{'id': 'JzHNgQv2Kn', 'name': 'Karl-Heinz Schmi...","{'id': 'M0j3iwMtqL', 'name': 'Hans im Glück', ...","[{'id': 'M0j3iwMtqL', 'num_games': None, 'scor...",872,[],,,,,,,,"[{'name': 'Noble Knight Games', 'sku': '214926...","[2149268339, 2149203479]",[],,"[德国大选, Die Macher]",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/3hGLSZ61Yk...,,737,,
1,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,1998,Franz Vohwinkel,"Abstract Strategy,Medieval",Reiner Knizia,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019,samurai,True,223428.0,[Franz Vohwinkel],,0.0,0.0,3.701527,"[{'id': 'hBqZ3Ar4RJ', 'url': 'https://www.boar...",0,,4.6,<p><em>The year is 1336. Japan's emperor has l...,The year is 1336. Japan's emperor has lost a...,"[{'id': '6wbg6EN7xD', 'num_games': None, 'scor...",[],0.0,https://www.boardgameatlas.com/game/uDl7CD01TP...,,samurai,"[{'country': 'US', 'date': '2021-09-02T17:40:3...",uDl7CD01TP,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,1,0,416,,4.0,60.0,"[{'id': '9NBcz45nN7', 'url': 'https://www.boar...",0,10.0,2.0,30.0,49.95,$49.95,"[{'country': 'US', 'price': 49.95}, {'country'...",[],0,0,130,https://www.riograndegames.com/games.html?utm_...,2-4,651,30-60,0.0,0.0,0.0,Price: N/A,0.0,"{'id': '6wbg6EN7xD', 'name': 'Reiner Knizia', ...","{'id': 'M0j3iwMtqL', 'name': 'Hans im Glück', ...","[{'id': 'M0j3iwMtqL', 'num_games': None, 'scor...",342,[],https://www.riograndegames.com/getFile.php?id=134,,,,,,,"[{'name': 'Noble Knight Games', 'sku': '214897...","[2148978132, 2148698946]",[],,[Samurai],https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/uDl7CD01TP...,,1457,,
2,11,Bohnanza is the first in the Bohnanza family o...,7,45,13,2,45,Bohnanza,45,1997,"Fréderic Bertrand,Marek Bláha,Andrea Boekhoff,...","Card Game,Farming,Negotiation",Uwe Rosenberg,"Bohnanza,Lookout Bean Games","Hand Management,Set Collection,Trading","AMIGO Spiel + Freizeit GmbH,999 Games,Brain Ga...",7.06751,28354,bohnanza,True,2960.0,"[Fréderic Bertrand, Marek Bláha, Andrea Boekho...",,2.0,2.5,3.416414,"[{'id': 'eX8uuNlQkQ', 'url': 'https://www.boar...",0,,4.0,"<p>This great card game is about planting, tra...","This great card game is about planting, tradi...","[{'id': 'T7EeH3OcxR', 'num_games': None, 'scor...",[],-0.25,https://www.boardgameatlas.com/game/F1aw7kyGTA...,,bohnanza,"[{'country': 'UK', 'date': '2021-06-30T03:52:3...",F1aw7kyGTA,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,6,0,888,,7.0,60.0,"[{'id': 'WPytek5P8l', 'url': 'https://www.boar...",20,13.0,2.0,45.0,19.95,$19.95,"[{'country': 'US', 'price': 19.95}, {'country'...",[],4,2,348,http://riograndegames.com/Game/36-Bohnanza?utm...,2-7,2033,45-60,24.9,15.96,18.89,$24.90,0.0,"{'id': 'T7EeH3OcxR', 'name': 'Uwe Rosenberg', ...","{'id': 'oFIGIC4bxh', 'name': 'AMIGO', 'url': '...","[{'id': 'oFIGIC4bxh', 'num_games': None, 'scor...",103,[],http://riograndegames.com/getFile.php?id=535,,,6.0,,,RIO155,"[{'name': 'Boarding School Games', 'sku': 'RGG...","[RGG 155, 2149270748]",[],,"[Babszüret, Bohnanza Pocket-Version, Bohnanza ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,655132001557.0,https://www.boardgameatlas.com/game/F1aw7kyGTA...,,2564,,
3,13,"In Catan (formerly The Settlers of Catan), pla...",4,120,10,3,60,Catan,120,1995,"Volkan Baga,Tanja Donner,Pete Fenlon,Jason Haw...",Negotiation,Klaus Teuber,"Catan,Promotional Board Games","Dice Rolling,Hand Management,Modular Board,Rou...","KOSMOS,999 Games,Albi,Astrel Games,Bergsala En...",7.26569,67655,catan,True,133.0,"[Volkan Baga, Tanja Donner, Pete Fenlon, Jason...",,2.090909,2.545455,3.253205,"[{'id': 'mavSOM8vjH', 'url': 'https://www.boar...",0,<p>Fun fact: The largest game of Catan was hel...,,<p>The women and men of your expedition build ...,The women and men of your expedition build th...,"[{'id': 'LCjyh7WnHd', 'num_games': None, 'scor...",[],0.29,https://www.boardgameatlas.com/game/OIXt3DmJU0...,,catan,"[{'country': 'AU', 'date': '2022-01-21T16:44:3...",OIXt3DmJU0,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,24,12,1665,,4.0,90.0,"[{'id': 'R0bGq4cAl4', 'url': 'https://www.boar...",120,10.0,3.0,45.0,55.0,$55.00,"[{'country': 'CA', 'price': 0}, {'country': 'U...",[],2,11,854,https://www.catan.com/?utm_source=boardgameatl...,3-4,5099,45-90,39.19,52.0,36.97,$39.19,35.78,"{'id': 'LCjyh7WnHd', 'name': 'Klaus Teuber', '...","{'id': 'RY4XltbNAz', 'name': 'KOSMOS', 'url': ...","[{'id': 'RY4XltbNAz', 'num_games': None, 'scor...",22,[],https://www.catan.com/game/catan#prof-easy,,3.0,11.6,inches,,CNS3071,"[{'name': 'Boarding School Games', 'sku': 'CN3...","[CN3071, 2149146897, 2149270668, 2149187433, 2...",[],,"[Die Siedler von Catan , Les Colons de Catane,...",https://cdn.shopify.com/s/files/1/0513/4077/15...,0,game,29877030712.0,https://www.boardgameatlas.com/game/OIXt3DmJU0...,,7118,2.0,lbs
4,19,"A long time ago, in a village far, far away, s...",4,45,9,2,30,Wacky Wacky West,45,1991,"Chris McGloughlin,Franz Vohwinkel","American West,Bluffing,City Building",Klaus Teuber,,"Tile Placement,Voting","Hans im Glück Verlags-GmbH,Mayfair Games",6.31166,1459,wacky wacky west,True,788427.0,"[Chris McGloughlin, Franz Vohwinkel]",,0.0,0.0,2.857143,"[{'id': '4mOtRRwSoj', 'url': 'https://www.boar...",0,,,"<p>Just south of nowhere, there’s the junction...","Just south of nowhere, there’s the junction o...","[{'id': 'LCjyh7WnHd', 'num_games': None, 'scor...",[],0.0,https://www.boardgameatlas.com/game/MJ7RmhalO3...,,wacky-wacky-west,"[{'country': 'CA', 'date': '2021-02-01T23:59:1...",MJ7RmhalO3,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,58,,4.0,45.0,"[{'id': '8PN2HE86wg', 'url': 'https://www.boar...",0,9.0,2.0,30.0,0.0,,"[{'country': 'US', 'price': 0}, {'country': 'C...",[],0,0,7,http://www.hans-im-glueck.de/drunterdrueber/?u...,2-4,30,30-45,0.0,0.0,0.0,Price: N/A,0.0,"{'id': 'LCjyh7WnHd', 'name': 'Klaus Teuber', '...","{'id': 'M0j3iwMtqL', 'name': 'Hans im Glück', ...","[{'id': 'M0j3iwMtqL', 'num_games': None, 'scor...",9999999,[],,,,,,,,,,[],,"[Drunter & Drüber, Sans dessus dessous, Wacky ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/MJ7RmhalO3...,,132,,


In [98]:
len(united_df)

1333

In [183]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def fuzzy_merge(df_1, df_2, key1, key2, threshold=90, limit=2):
    s = df_2[key2].tolist()
    
    m = df_1[key1].apply(lambda x: process.extract(x, s, limit=limit))    
    df_1['matches'] = m
    
    m2 = df_1['matches'].apply(lambda x: ', '.join([i[0] for i in x if i[1] >= threshold]))
    df_1['matches'] = m2
    
    return df_1

fuzzy_merge(board_games, board_games_api, 'name', 'name', threshold=99)

KeyboardInterrupt: 