# Data load 

### What the code does:
* This code loads the data from the two main sources and creates the work file

**Import libraries**

In [6]:
import pandas as pd
import requests
import json
import time
import math

## **I. Data from tidyTuesday**

####  1. Import and check data

In [7]:
# Import and check data
board_games_raw = pd.read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-12/board_games.csv")

In [8]:
# Check data
pd.set_option("display.max_columns", None)
board_games_raw.head()

Unnamed: 0,game_id,description,image,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,thumbnail,year_published,artist,category,compilation,designer,expansion,family,mechanic,publisher,average_rating,users_rated
0,1,Die Macher is a game about seven sequential po...,//cf.geekdo-images.com/images/pic159509.jpg,5,240,14,3,240,Die Macher,240,//cf.geekdo-images.com/images/pic159509_t.jpg,1986,Marcus Gschwendtner,"Economic,Negotiation,Political",,Karl-Heinz Schmiel,,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498
1,2,Dragonmaster is a trick-taking card game based...,//cf.geekdo-images.com/images/pic184174.jpg,4,30,12,3,30,Dragonmaster,30,//cf.geekdo-images.com/images/pic184174_t.jpg,1981,Bob Pepper,"Card Game,Fantasy",,"G. W. ""Jerry"" D'Arcey",,Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478
2,3,"Part of the Knizia tile-laying trilogy, Samura...",//cf.geekdo-images.com/images/pic3211873.jpg,4,60,10,2,30,Samurai,60,//cf.geekdo-images.com/images/pic3211873_t.jpg,1998,Franz Vohwinkel,"Abstract Strategy,Medieval",,Reiner Knizia,,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019
3,4,When you see the triangular box and the luxuri...,//cf.geekdo-images.com/images/pic285299.jpg,4,60,12,2,60,Tal der Könige,60,//cf.geekdo-images.com/images/pic285299_t.jpg,1992,,Ancient,,Christian Beierer,,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314
4,5,"In Acquire, each player strategically invests ...",//cf.geekdo-images.com/images/pic342163.jpg,6,90,12,3,90,Acquire,90,//cf.geekdo-images.com/images/pic342163_t.jpg,1964,"Scott Okumura,Peter Whitley",Economic,,Sid Sackson,,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195


In [9]:
len(board_games_raw)

10532

#### 2. Remove duplicates

In [10]:
# Number of duplicate board games
sum(board_games_raw.duplicated(subset=['name']))

175

In [11]:
# Number of duplicate board games
sum(board_games_raw.duplicated(subset=['name', 'year_published']))

2

In [12]:
# Remove duplicates for name + year_published
board_games_raw = board_games_raw[~board_games_raw.duplicated(subset=['name', 'year_published'])]

In [13]:
len(board_games_raw)

10530

#### 3. Keep only the needed fields

In [14]:
# Check number of nulls
board_games_raw.isnull().sum()

game_id               0
description           0
image                 1
max_players           0
max_playtime          0
min_age               0
min_players           0
min_playtime          0
name                  0
playing_time          0
thumbnail             1
year_published        0
artist             2773
category             94
compilation       10120
designer            126
expansion          7778
family             2808
mechanic            949
publisher             3
average_rating        0
users_rated           0
dtype: int64

In [15]:
# Feature list
board_games_raw.dtypes

game_id             int64
description        object
image              object
max_players         int64
max_playtime        int64
min_age             int64
min_players         int64
min_playtime        int64
name               object
playing_time        int64
thumbnail          object
year_published      int64
artist             object
category           object
compilation        object
designer           object
expansion          object
family             object
mechanic           object
publisher          object
average_rating    float64
users_rated         int64
dtype: object

In [16]:
# Remove fields with high null rate
board_games_raw = board_games_raw.loc[:, ~board_games_raw.columns.isin(['image', 'thumbnail', 'compilation', 'expansion'])]

#### 4. Data cleaning

In [17]:
# Lowercase name
board_games_raw['name_lower']=board_games_raw['name'].str.lower()

In [14]:
# Todo: make list of family, mechanic, publisher like below but here we need to keep it as a sting to save it as csv
# board_games_raw['family_split']=board_games_raw['family'].str.split(',')
# board_games_raw['family_split'][0][0]

#### 5. Save dataset

In [18]:
pd.set_option("display.max_columns", None)
board_games_raw.head()

Unnamed: 0,game_id,description,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,year_published,artist,category,designer,family,mechanic,publisher,average_rating,users_rated,name_lower
0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,1986,Marcus Gschwendtner,"Economic,Negotiation,Political",Karl-Heinz Schmiel,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498,die macher
1,2,Dragonmaster is a trick-taking card game based...,4,30,12,3,30,Dragonmaster,30,1981,Bob Pepper,"Card Game,Fantasy","G. W. ""Jerry"" D'Arcey",Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478,dragonmaster
2,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,1998,Franz Vohwinkel,"Abstract Strategy,Medieval",Reiner Knizia,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019,samurai
3,4,When you see the triangular box and the luxuri...,4,60,12,2,60,Tal der Könige,60,1992,,Ancient,Christian Beierer,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314,tal der könige
4,5,"In Acquire, each player strategically invests ...",6,90,12,3,90,Acquire,90,1964,"Scott Okumura,Peter Whitley",Economic,Sid Sackson,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195,acquire


In [19]:
# Save dataset 
board_games_raw.to_csv("board_games_raw.csv", sep=';' , quotechar='"')

## **II. Call API**

#### 1. Get list of board game categories

In [20]:
# Get category list - Sample
categ_url = 'https://api.boardgameatlas.com/api/game/categories?client_id=6MXavgYBke'
categ_response = requests.get(categ_url)
categ_dict = json.loads(categ_response.text)
list(categ_dict.values())[0][0]

{'id': '2bdFPJUvFo',
 'name': '18XX',
 'url': 'https://www.boardgameatlas.com/category/2bdFPJUvFo/18xx'}

In [21]:
# Get category ids
categ_ids = []
for cat_num in list(range(0,len(categ_dict['categories']))):
    categ_ids.append(categ_dict['categories'][cat_num]['id'])
categ_ids[1:15]

['85OKv8p5Ow',
 'hBqZ3Ar4RJ',
 'GtuMb7ei27',
 'KUBCKBkGxV',
 'DjAhqEHOD0',
 '20iDvpbh7A',
 'tJxatX2ZbW',
 'nWDac9tQzt',
 'dghLhwyxVb',
 '4mOtRRwSoj',
 'a8NM5cugJX',
 'MWoxgHrOJD',
 'eFaACC6y2c',
 'CBboNLI1Uj']

In [22]:
# Number of categories
len(categ_ids)

151

#### 2. Request board game data using the category ids 


In [23]:
# Sample request
categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories=2bdFPJUvFo'
response = requests.get(categ_url)
games_dict = json.loads(response.text)
list(games_dict.values())[0][0]

{'id': 'o6knI5ct0u',
 'handle': 'mayfair-games-1830-railways-and-robber-barons--north-east-us',
 'url': 'https://www.boardgameatlas.com/game/o6knI5ct0u/mayfair-games-1830-railways-and-robber-barons--north-east-us',
 'edit_url': 'https://www.boardgameatlas.com/game/o6knI5ct0u/edit',
 'name': '1830: Railways & Robber Barons',
 'price': '52.99',
 'price_ca': '74.99',
 'price_uk': '50.09',
 'price_au': '89.95',
 'msrp': 69.99,
 'msrps': [{'country': 'US', 'price': 69.99}],
 'discount': '0.24',
 'year_published': 1986,
 'min_players': 2,
 'max_players': 7,
 'min_playtime': 180,
 'max_playtime': 360,
 'min_age': 14,
 'description': "<p>1830. It is the dawn of the &quot;Age of Railroading&quot; in America. You're a wealthy investor and speculator betting that the new technology will revolutionize transport. Commerce will no longer depend on rutted roads and slow canals. Instead, it will ride the rails on swift, powerful &quot;Iron Horses.&quot;</p>\r\n<p><strong>1830</strong> is an acclaimed 

In [24]:
# Get board games
categ_df = pd.DataFrame()
for category_id in categ_ids:
    next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}'.format(category_id)
    next_response = requests.get(next_categ_url)
    next_games_dict = json.loads(next_response.text)
    categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
    time.sleep(1)
    if next_games_dict['count'] > 1000:
        for hundred in list(range(1,11)):
            limit = 100 * hundred
            next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}&skip={}'.format(category_id , limit)
            next_response = requests.get(next_categ_url)
            next_games_dict = json.loads(next_response.text)
            categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
            time.sleep(1)
    elif next_games_dict['count'] > 100:
        for hundred in list(range(1,(math.ceil(next_games_dict['count'] / 100)))):
            limit = 100 * hundred
            next_categ_url = 'https://api.boardgameatlas.com/api/search?client_id=6MXavgYBke&limit=100&categories={}&skip={}'.format(category_id , limit)
            next_response = requests.get(next_categ_url)
            next_games_dict = json.loads(next_response.text)
            categ_df = categ_df.append(pd.DataFrame(next_games_dict['games']))
            time.sleep(1)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [25]:
pd.options.display.max_columns = None
display(categ_df.head())

Unnamed: 0,active,amazon_rank,artists,availability_status,average_learning_complexity,average_strategy_complexity,average_user_rating,categories,comment_count,commentary,cs_rating,description,description_preview,designers,developers,discount,edit_url,faq,handle,historical_low_prices,id,image_url,images,is_historical_low,isbn,links,listing_clicks,lists,matches_specs,max_players,max_playtime,mechanics,mentions,min_age,min_players,min_playtime,msrp,msrp_text,msrps,name,names,num_distributors,num_user_complexity_votes,num_user_ratings,official_url,players,plays,playtime,price,price_au,price_ca,price_text,price_uk,primary_designer,primary_publisher,publishers,rank,related_to,rules_url,sell_sheet_url,size_depth,size_height,size_units,size_width,sku,sku_objects,skus,specs,store_images_url,tags,thumb_url,trending_rank,type,upc,url,video_links,visits,weight_amount,weight_units,year_published
0,True,559911.0,"[Mike Atkinson, Jared Blando, Charles Kibler, ...",,3.0,4.5,4.006944,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1830. It is the dawn of the &quot;Age of Ra...,"1830. It is the dawn of the ""Age of Railroadi...","[{'id': 'RpZJ6vqsPR', 'num_games': None, 'scor...",[],0.24,https://www.boardgameatlas.com/game/o6knI5ct0u...,,mayfair-games-1830-railways-and-robber-barons-...,"[{'country': 'UK', 'date': '2021-12-18T12:07:5...",o6knI5ct0u,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,273,,7.0,360.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",3,14.0,2.0,180.0,69.99,$69.99,"[{'country': 'US', 'price': 69.99}]",1830: Railways & Robber Barons,[],1,2,36,http://www.mayfairgames.com/products/1830-nort...,2-7,244,180-360,52.99,89.95,74.99,$52.99,50.09,"{'id': 'RpZJ6vqsPR', 'name': 'Francis Tresham'...","{'id': 'w1jOf2uAOD', 'name': 'The Avalon Hill ...","[{'id': 'w1jOf2uAOD', 'num_games': None, 'scor...",874,[],https://lookout-spiele.de/wp-content/uploads/1...,,,,,,,,,[],,"[1830: Ferrovie e Capitani d'Industria, 1830: ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/o6knI5ct0u...,,1903,,,1986.0
1,True,125448.0,"[Chris Lawson, Rodger B. MacGowan, Kurt Miller...",,4.0,4.0,3.885135,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1846 - the midwest railroad boom has gone b...,1846 - the midwest railroad boom has gone bus...,"[{'id': 'CeUZNYVdQj', 'num_games': None, 'scor...",[],0.39,https://www.boardgameatlas.com/game/eSCVHuUFPs...,,1846-the-race-for-the-midwest,"[{'country': 'UK', 'date': '2021-12-14T18:14:1...",eSCVHuUFPs,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,228,,5.0,240.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",1,14.0,3.0,240.0,69.0,$69.00,"[{'country': 'US', 'price': 69}, {'country': '...",1846: The Race for the Midwest,[],3,1,37,https://www.gmtgames.com/p-847-1846-the-race-t...,3-5,315,240,42.25,0.0,67.95,$42.25,50.88,"{'id': 'CeUZNYVdQj', 'name': 'Thomas Lehmann',...","{'id': 'VKIPDDgZ2X', 'name': 'Deep Thought Gam...","[{'id': 'VKIPDDgZ2X', 'num_games': None, 'scor...",927,[],https://s3-us-west-2.amazonaws.com/gmtwebsitea...,,,,,,GMT1605,"[{'name': 'Noble Knight Games', 'sku': '214918...","[2149184041, 2149209107]",[],,"[1846 The Race to the Midwest Printing, 1846: ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,817054011155.0,https://www.boardgameatlas.com/game/eSCVHuUFPs...,,1085,,,2005.0
2,True,,[Mike Hutton],,0.0,0.0,4.111111,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>1862: Railway Mania in the Eastern Counties...,1862: Railway Mania in the Eastern Counties i...,"[{'id': 'xMrDEmBdTo', 'num_games': None, 'scor...",[],0.42,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1862-railway-mania-in-the-eastern-counties,"[{'country': 'UK', 'date': '2021-12-11T11:48:1...",uEPtE5OOOU,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,1,0,122,,8.0,300.0,"[{'id': 'qCXa8MX0wk', 'url': 'https://www.boar...",13,12.0,1.0,300.0,79.0,$79.00,"[{'country': 'US', 'price': 79}]",1862: Railway Mania in the Eastern Counties,[],2,0,9,https://www.gmtgames.com/p-692-1862-railway-ma...,1-8,71,300,45.51,127.99,79.95,$45.51,0.0,"{'name': 'Mike Hutton', 'id': 'xMrDEmBdTo', 'u...","{'name': 'GMT Games', 'id': 'd5oY0duBgG', 'url...","[{'id': 'd5oY0duBgG', 'num_games': None, 'scor...",9999999,[],https://gmtwebsiteassets.s3-us-west-2.amazonaw...,,3.0,9.0,,,GMT1904,"[{'name': 'Noble Knight Games', 'sku': '214919...",[2149191519],[],,"[1862: Railway Mania in the Eastern Counties, ...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,817054011704.0,https://www.boardgameatlas.com/game/uEPtE5OOOU...,,1068,,,2013.0
3,True,,[Brigette Indelicato],,3.0,4.0,4.454545,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,"<p>18Chesapeake is a member of the <a href=""ht...",18Chesapeake is a member of the 18xx series...,"[{'id': 'fN0XCgUAPZ', 'num_games': None, 'scor...",[],0.0,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,18chesapeake,"[{'country': 'US', 'date': '2022-02-08T01:34:2...",Krn8i8C0fI,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,172,,6.0,180.0,"[{'id': 'AZxlPpi5oq', 'url': 'https://www.boar...",1,13.0,2.0,180.0,89.0,$89.00,"[{'country': 'US', 'price': 89}, {'country': '...",18Chesapeake,[],0,1,11,https://all-aboardgames.com/products/us-only-1...,2-6,160,180,0.0,0.0,0.0,Price: N/A,0.0,"{'id': 'fN0XCgUAPZ', 'name': 'Scott Petersen',...","{'id': 'ZOP4wDStJq', 'name': 'All-Aboard Games...","[{'id': 'ZOP4wDStJq', 'num_games': None, 'scor...",9999999,[],,,,,,,,,,[],,[18Chesapeake],https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/Krn8i8C0fI...,,870,,,2020.0
4,True,473334.0,[Klemens Franz],,0.0,0.0,4.0,"[{'id': '2bdFPJUvFo', 'url': 'https://www.boar...",0,,,<p>The railway history of both Switzerland and...,The railway history of both Switzerland and A...,"[{'id': 'gFwNqit3MM', 'num_games': None, 'scor...",[],0.2,https://www.boardgameatlas.com/game/eJuGpFzljd...,,184454-switzerland-and-austria-board-game,"[{'country': 'UK', 'date': '2021-12-03T08:56:5...",eJuGpFzljd,https://s3-us-west-1.amazonaws.com/5cc.images/...,{'thumb': 'https://d2k4q26owzy373.cloudfront.n...,False,,0,0,51,,7.0,300.0,"[{'id': 'ohABM4GjbC', 'url': 'https://www.boar...",0,12.0,3.0,300.0,94.99,$94.99,"[{'country': 'UK', 'price': 87.99}, {'country'...",1844 / 1854,[],0,0,6,,3-7,5,300,75.99,0.0,112.95,$75.99,0.0,"{'name': 'Helmut Ohley', 'id': 'gFwNqit3MM', '...","{'name': 'Mayfair Games', 'id': '7GTti1NuCH', ...","[{'id': '7GTti1NuCH', 'num_games': None, 'scor...",9999999,[],,,,,,,,,,[],,"[1844/1854 Switzerland/Austria, 1844/1854, 184...",https://s3-us-west-1.amazonaws.com/5cc.images/...,0,game,,https://www.boardgameatlas.com/game/eJuGpFzljd...,,505,,,2016.0


In [26]:
# Number of games 
len(categ_df)

16306

In [27]:
# Feature list
categ_df.dtypes

active                            bool
amazon_rank                    float64
artists                         object
availability_status             object
average_learning_complexity    float64
average_strategy_complexity    float64
average_user_rating            float64
categories                      object
comment_count                    int64
commentary                      object
cs_rating                      float64
description                     object
description_preview             object
designers                       object
developers                      object
discount                        object
edit_url                        object
faq                             object
handle                          object
historical_low_prices           object
id                              object
image_url                       object
images                          object
is_historical_low                 bool
isbn                            object
links                    

#### 3. Keep only the needed fields

In [28]:
# Keep only useful fields
categ_df_short = categ_df.loc[:, categ_df.columns.isin([
    'artists',
    'average_learning_complexity',
    'average_strategy_complexity',
    'average_user_rating',
    'categories',
    'comment_count',
    'description',
    'description_preview',
    'designers',
    'discount',
    'id',
    'is_historical_low',
    'listing_clicks',
    'max_players',
    'max_playtime',
    'min_age',
    'min_players',
    'min_playtime',
    'name',
    'num_user_ratings',
    'players',
    'plays',
    'playtime',
    'price',
    'price_au',
    'price_ca',
    'price_uk',
    'primary_publisher',
    'publishers',
    'visits',
    'year_published'  
])]

In [29]:
categ_df_short.isnull().sum()

artists                           0
average_learning_complexity       0
average_strategy_complexity       0
average_user_rating               0
categories                        0
comment_count                     0
description                       0
description_preview               0
designers                         0
discount                          0
id                                0
is_historical_low                 0
listing_clicks                    0
max_players                    1351
max_playtime                   1925
min_age                        2102
min_players                    1351
min_playtime                   1922
name                              0
num_user_ratings                  0
players                        1392
plays                             0
playtime                       2097
price                             0
price_au                          0
price_ca                          0
price_uk                          0
primary_publisher           

#### 4. Data cleaning

In [30]:
# Join lists / dictionaries to string
#categ_df_short['artists_mod'] = ''.join(str(e) for e in categ_df_short['artists'])
#categ_df_short['primary_publisher_mod'] = ''.join(str(e) for e in categ_df_short['primary_publisher'])
#categ_df_short['categories_mod'] = ''.join(str(e) for e in categ_df_short['categories'])
#categ_df_short['designers_mod'] = ''.join(str(e) for e in categ_df_short['designers'])
#categ_df_short['publishers_mod'] = ''.join(str(e) for e in categ_df_short['publishers'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the cavea

In [30]:
# Lowercase name
categ_df_short['name_lower']=categ_df_short['name'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [31]:
# Remove original fields
categ_df_short = categ_df_short.loc[:, ~categ_df_short.columns.isin(['artists', 'primary_publisher', 'categories', 'designers', 'publishers', 'description', 'description_preview'])]

In [32]:
# Feature list
categ_df_short.dtypes

average_learning_complexity    float64
average_strategy_complexity    float64
average_user_rating            float64
comment_count                    int64
discount                        object
id                              object
is_historical_low                 bool
listing_clicks                   int64
max_players                    float64
max_playtime                   float64
min_age                        float64
min_players                    float64
min_playtime                   float64
name                            object
num_user_ratings                 int64
players                         object
plays                            int64
playtime                        object
price                           object
price_au                        object
price_ca                        object
price_uk                        object
visits                           int64
year_published                 float64
name_lower                      object
dtype: object

In [33]:
pd.options.display.max_columns = None
categ_df_short.head()

Unnamed: 0,average_learning_complexity,average_strategy_complexity,average_user_rating,comment_count,discount,id,is_historical_low,listing_clicks,max_players,max_playtime,min_age,min_players,min_playtime,name,num_user_ratings,players,plays,playtime,price,price_au,price_ca,price_uk,visits,year_published,name_lower
0,3.0,4.5,4.006944,0,0.24,o6knI5ct0u,False,0,7.0,360.0,14.0,2.0,180.0,1830: Railways & Robber Barons,36,2-7,244,180-360,52.99,89.95,74.99,50.09,1903,1986.0,1830: railways & robber barons
1,4.0,4.0,3.885135,0,0.39,eSCVHuUFPs,False,0,5.0,240.0,14.0,3.0,240.0,1846: The Race for the Midwest,37,3-5,315,240,42.25,0.0,67.95,50.88,1085,2005.0,1846: the race for the midwest
2,0.0,0.0,4.111111,0,0.42,uEPtE5OOOU,False,0,8.0,300.0,12.0,1.0,300.0,1862: Railway Mania in the Eastern Counties,9,1-8,71,300,45.51,127.99,79.95,0.0,1068,2013.0,1862: railway mania in the eastern counties
3,3.0,4.0,4.454545,0,0.0,Krn8i8C0fI,False,0,6.0,180.0,13.0,2.0,180.0,18Chesapeake,11,2-6,160,180,0.0,0.0,0.0,0.0,870,2020.0,18chesapeake
4,0.0,0.0,4.0,0,0.2,eJuGpFzljd,False,0,7.0,300.0,12.0,3.0,300.0,1844 / 1854,6,3-7,5,300,75.99,0.0,112.95,0.0,505,2016.0,1844 / 1854


#### 5. Remove duplicates

In [34]:
# Number of duplicate board games
sum(categ_df_short.duplicated(subset=['name_lower', 'year_published']))

8335

In [35]:
# Number of duplicate board games
sum(categ_df_short.duplicated(subset=['name_lower']))

8370

In [36]:
# Remove duplicates
no_duplicate_categ_df = categ_df_short[~categ_df_short.duplicated(subset=['name_lower', 'year_published'])]

In [37]:
len(no_duplicate_categ_df)

7971

In [38]:
pd.options.display.max_columns = None
no_duplicate_categ_df.head()

Unnamed: 0,average_learning_complexity,average_strategy_complexity,average_user_rating,comment_count,discount,id,is_historical_low,listing_clicks,max_players,max_playtime,min_age,min_players,min_playtime,name,num_user_ratings,players,plays,playtime,price,price_au,price_ca,price_uk,visits,year_published,name_lower
0,3.0,4.5,4.006944,0,0.24,o6knI5ct0u,False,0,7.0,360.0,14.0,2.0,180.0,1830: Railways & Robber Barons,36,2-7,244,180-360,52.99,89.95,74.99,50.09,1903,1986.0,1830: railways & robber barons
1,4.0,4.0,3.885135,0,0.39,eSCVHuUFPs,False,0,5.0,240.0,14.0,3.0,240.0,1846: The Race for the Midwest,37,3-5,315,240,42.25,0.0,67.95,50.88,1085,2005.0,1846: the race for the midwest
2,0.0,0.0,4.111111,0,0.42,uEPtE5OOOU,False,0,8.0,300.0,12.0,1.0,300.0,1862: Railway Mania in the Eastern Counties,9,1-8,71,300,45.51,127.99,79.95,0.0,1068,2013.0,1862: railway mania in the eastern counties
3,3.0,4.0,4.454545,0,0.0,Krn8i8C0fI,False,0,6.0,180.0,13.0,2.0,180.0,18Chesapeake,11,2-6,160,180,0.0,0.0,0.0,0.0,870,2020.0,18chesapeake
4,0.0,0.0,4.0,0,0.2,eJuGpFzljd,False,0,7.0,300.0,12.0,3.0,300.0,1844 / 1854,6,3-7,5,300,75.99,0.0,112.95,0.0,505,2016.0,1844 / 1854


##### 6. Save dataset

In [39]:
# Save dataset 
no_duplicate_categ_df.to_csv("board_games_api.csv", sep=';' , quotechar='"')

## **III. Join datasets**

In [40]:
# Import dataset - Tidytuesday
board_games = pd.read_csv("board_games_raw.csv", sep=';')
board_games.columns = [str(col) + '_base' for col in board_games.columns]
board_games.head()

Unnamed: 0,Unnamed: 0_base,game_id_base,description_base,max_players_base,max_playtime_base,min_age_base,min_players_base,min_playtime_base,name_base,playing_time_base,year_published_base,artist_base,category_base,designer_base,family_base,mechanic_base,publisher_base,average_rating_base,users_rated_base,name_lower_base
0,0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,1986,Marcus Gschwendtner,"Economic,Negotiation,Political",Karl-Heinz Schmiel,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498,die macher
1,1,2,Dragonmaster is a trick-taking card game based...,4,30,12,3,30,Dragonmaster,30,1981,Bob Pepper,"Card Game,Fantasy","G. W. ""Jerry"" D'Arcey",Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478,dragonmaster
2,2,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,1998,Franz Vohwinkel,"Abstract Strategy,Medieval",Reiner Knizia,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019,samurai
3,3,4,When you see the triangular box and the luxuri...,4,60,12,2,60,Tal der Könige,60,1992,,Ancient,Christian Beierer,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314,tal der könige
4,4,5,"In Acquire, each player strategically invests ...",6,90,12,3,90,Acquire,90,1964,"Scott Okumura,Peter Whitley",Economic,Sid Sackson,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195,acquire


In [41]:
# Import dataset - API
board_games_api = pd.read_csv("board_games_api.csv", sep=';')
board_games_api.columns = [str(col) + '_api' for col in board_games_api.columns]
board_games_api.head()

Unnamed: 0,Unnamed: 0_api,average_learning_complexity_api,average_strategy_complexity_api,average_user_rating_api,comment_count_api,discount_api,id_api,is_historical_low_api,listing_clicks_api,max_players_api,max_playtime_api,min_age_api,min_players_api,min_playtime_api,name_api,num_user_ratings_api,players_api,plays_api,playtime_api,price_api,price_au_api,price_ca_api,price_uk_api,visits_api,year_published_api,name_lower_api
0,0,3.0,4.5,4.006944,0,0.24,o6knI5ct0u,False,0,7.0,360.0,14.0,2.0,180.0,1830: Railways & Robber Barons,36,2-7,244,180-360,52.99,89.95,74.99,50.09,1903,1986.0,1830: railways & robber barons
1,1,4.0,4.0,3.885135,0,0.39,eSCVHuUFPs,False,0,5.0,240.0,14.0,3.0,240.0,1846: The Race for the Midwest,37,3-5,315,240,42.25,0.0,67.95,50.88,1085,2005.0,1846: the race for the midwest
2,2,0.0,0.0,4.111111,0,0.42,uEPtE5OOOU,False,0,8.0,300.0,12.0,1.0,300.0,1862: Railway Mania in the Eastern Counties,9,1-8,71,300,45.51,127.99,79.95,0.0,1068,2013.0,1862: railway mania in the eastern counties
3,3,3.0,4.0,4.454545,0,0.0,Krn8i8C0fI,False,0,6.0,180.0,13.0,2.0,180.0,18Chesapeake,11,2-6,160,180,0.0,0.0,0.0,0.0,870,2020.0,18chesapeake
4,4,0.0,0.0,4.0,0,0.2,eJuGpFzljd,False,0,7.0,300.0,12.0,3.0,300.0,1844 / 1854,6,3-7,5,300,75.99,0.0,112.95,0.0,505,2016.0,1844 / 1854


In [42]:
# Merge datasets on name and year_published
united_df = pd.merge(board_games, 
                     board_games_api, 
                     how='inner', 
                     left_on=['name_lower_base', 'year_published_base'], 
                     right_on=['name_lower_api', 'year_published_api'])

In [43]:
len(united_df)

1241

In [44]:
# Merge datasets on name and year_published
united_df2 = pd.merge(board_games, 
                     board_games_api, 
                     how='inner', 
                     left_on=['name_lower_base'], 
                     right_on=['name_lower_api'])

In [45]:
len(united_df2)

1409

In [46]:
united_df.head()

Unnamed: 0,Unnamed: 0_base,game_id_base,description_base,max_players_base,max_playtime_base,min_age_base,min_players_base,min_playtime_base,name_base,playing_time_base,year_published_base,artist_base,category_base,designer_base,family_base,mechanic_base,publisher_base,average_rating_base,users_rated_base,name_lower_base,Unnamed: 0_api,average_learning_complexity_api,average_strategy_complexity_api,average_user_rating_api,comment_count_api,discount_api,id_api,is_historical_low_api,listing_clicks_api,max_players_api,max_playtime_api,min_age_api,min_players_api,min_playtime_api,name_api,num_user_ratings_api,players_api,plays_api,playtime_api,price_api,price_au_api,price_ca_api,price_uk_api,visits_api,year_published_api,name_lower_api
0,0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,1986,Marcus Gschwendtner,"Economic,Negotiation,Political",Karl-Heinz Schmiel,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498,die macher,20,0.0,0.0,3.684783,0,0.0,3hGLSZ61Yk,False,0,5.0,240.0,14.0,3.0,240.0,Die Macher,46,3-5,162,240,0.0,0.0,0.0,0.0,741,1986.0,die macher
1,2,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,1998,Franz Vohwinkel,"Abstract Strategy,Medieval",Reiner Knizia,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019,samurai,14,0.0,0.0,3.701527,0,0.0,uDl7CD01TP,False,0,4.0,60.0,10.0,2.0,30.0,Samurai,130,2-4,653,30-60,0.0,0.0,0.0,0.0,1470,1998.0,samurai
2,10,11,Bohnanza is the first in the Bohnanza family o...,7,45,13,2,45,Bohnanza,45,1997,"Fréderic Bertrand,Marek Bláha,Andrea Boekhoff,...","Card Game,Farming,Negotiation",Uwe Rosenberg,"Bohnanza,Lookout Bean Games","Hand Management,Set Collection,Trading","AMIGO Spiel + Freizeit GmbH,999 Games,Brain Ga...",7.06751,28354,bohnanza,28,2.0,2.5,3.41522,0,0.31,F1aw7kyGTA,True,15,7.0,60.0,13.0,2.0,45.0,Bohnanza,349,2-7,2042,45-60,13.79,13.99,18.89,0.0,2599,1997.0,bohnanza
3,12,13,"In Catan (formerly The Settlers of Catan), pla...",4,120,10,3,60,Catan,120,1995,"Volkan Baga,Tanja Donner,Pete Fenlon,Jason Haw...",Negotiation,Klaus Teuber,"Catan,Promotional Board Games","Dice Rolling,Hand Management,Modular Board,Rou...","KOSMOS,999 Games,Albi,Astrel Games,Bergsala En...",7.26569,67655,catan,1,2.090909,2.545455,3.254073,0,0.29,OIXt3DmJU0,False,10,4.0,90.0,10.0,3.0,45.0,Catan,856,3-4,5113,45-90,39.19,51.0,36.97,35.78,7203,1995.0,catan
4,17,18,The robots of the Robo Rally automobile factor...,8,120,12,2,45,RoboRally,120,1994,"Peter Bergting,Bob Carasca,Phil Foglio,Daniel ...","Miniatures,Racing,Science Fiction",Richard Garfield,Robots,"Action / Movement Programming,Grid Movement,Mo...","999 Games,AMIGO Spiel + Freizeit GmbH,Avalon H...",7.15355,19371,roborally,1,0.0,0.0,3.255405,0,0.0,x0sk1fJyaz,False,0,8.0,120.0,12.0,2.0,45.0,Roborally,185,2-8,548,45-120,0.0,49.95,199.99,0.0,521,1994.0,roborally


In [47]:
# Save dataset 
united_df.to_csv("board_games_workfile.csv", sep=';' , quotechar='"')