# Data load 

### What the code does:
* This notebook loads the data from the two main sources and creates the work file

**Import libraries**

In [2]:
import pandas as pd
import requests
import json
import time
import math

## **I. Data from tidyTuesday**

####  1. Import and check data

In [3]:
# Load the raw tidyTuesday board-games table (2019-03-12 release) directly from GitHub
tidytuesday_csv_url = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-12/board_games.csv"
board_games_raw = pd.read_csv(tidytuesday_csv_url)

In [4]:
# Lift the column-display limit so wide frames are shown in full, then preview the first rows
pd.options.display.max_columns = None
board_games_raw.head()

Unnamed: 0,game_id,description,image,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,thumbnail,year_published,artist,category,compilation,designer,expansion,family,mechanic,publisher,average_rating,users_rated
0,1,Die Macher is a game about seven sequential po...,//cf.geekdo-images.com/images/pic159509.jpg,5,240,14,3,240,Die Macher,240,//cf.geekdo-images.com/images/pic159509_t.jpg,1986,Marcus Gschwendtner,"Economic,Negotiation,Political",,Karl-Heinz Schmiel,,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498
1,2,Dragonmaster is a trick-taking card game based...,//cf.geekdo-images.com/images/pic184174.jpg,4,30,12,3,30,Dragonmaster,30,//cf.geekdo-images.com/images/pic184174_t.jpg,1981,Bob Pepper,"Card Game,Fantasy",,"G. W. ""Jerry"" D'Arcey",,Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478
2,3,"Part of the Knizia tile-laying trilogy, Samura...",//cf.geekdo-images.com/images/pic3211873.jpg,4,60,10,2,30,Samurai,60,//cf.geekdo-images.com/images/pic3211873_t.jpg,1998,Franz Vohwinkel,"Abstract Strategy,Medieval",,Reiner Knizia,,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019
3,4,When you see the triangular box and the luxuri...,//cf.geekdo-images.com/images/pic285299.jpg,4,60,12,2,60,Tal der Könige,60,//cf.geekdo-images.com/images/pic285299_t.jpg,1992,,Ancient,,Christian Beierer,,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314
4,5,"In Acquire, each player strategically invests ...",//cf.geekdo-images.com/images/pic342163.jpg,6,90,12,3,90,Acquire,90,//cf.geekdo-images.com/images/pic342163_t.jpg,1964,"Scott Okumura,Peter Whitley",Economic,,Sid Sackson,,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195


In [5]:
# Number of rows in the raw table
board_games_raw.shape[0]

10532

#### 2. Remove duplicates

In [7]:
# Count rows that repeat an earlier (name, year_published) pair
duplicate_mask = board_games_raw.duplicated(subset=['name', 'year_published'])
int(duplicate_mask.sum())

2

In [8]:
# Keep the first row of every (name, year_published) pair and discard the later repeats
board_games_raw = board_games_raw.drop_duplicates(subset=['name', 'year_published'])

In [9]:
# Number of rows left after removing duplicates
board_games_raw.shape[0]

10530

#### 3. Keep only the needed fields

In [10]:
# Count the missing values in each column
board_games_raw.isna().sum()

game_id               0
description           0
image                 1
max_players           0
max_playtime          0
min_age               0
min_players           0
min_playtime          0
name                  0
playing_time          0
thumbnail             1
year_published        0
artist             2773
category             94
compilation       10120
designer            126
expansion          7778
family             2808
mechanic            949
publisher             3
average_rating        0
users_rated           0
dtype: int64

In [11]:
# Feature list: the dtype pandas assigned to each column
board_games_raw.dtypes

game_id             int64
description        object
image              object
max_players         int64
max_playtime        int64
min_age             int64
min_players         int64
min_playtime        int64
name               object
playing_time        int64
thumbnail          object
year_published      int64
artist             object
category           object
compilation        object
designer           object
expansion          object
family             object
mechanic           object
publisher          object
average_rating    float64
users_rated         int64
dtype: object

In [12]:
# Drop the columns that are not carried into the work file.
# Only `compilation` and `expansion` are mostly null (10120 and 7778 of 10530 rows in the
# null check above); `image` and `thumbnail` have a single null each, so they are dropped
# for another reason -- presumably because the URL fields are not needed (TODO confirm).
# `drop` returns a new DataFrame rather than a boolean-mask selection of the de-duplicated
# frame, so adding columns afterwards does not raise pandas' SettingWithCopyWarning.
# `errors="ignore"` keeps the cell tolerant of columns that are already absent (e.g. on re-run).
unneeded_columns = ['image', 'thumbnail', 'compilation', 'expansion']
board_games_raw = board_games_raw.drop(columns=unneeded_columns, errors="ignore")

#### 4. Data cleaning

In [13]:
# Add `name_lower`, a lowercase copy of `name`.
# `assign` builds a new DataFrame instead of writing a column into the filtered frame
# produced by the earlier steps, which avoids pandas' SettingWithCopyWarning and keeps
# the cell safe to re-run (an existing `name_lower` column is simply overwritten).
board_games_raw = board_games_raw.assign(name_lower=board_games_raw['name'].str.lower())

#### 5. Save dataset

In [15]:
# Preview the cleaned table with every column visible
pd.options.display.max_columns = None
board_games_raw.head()

Unnamed: 0,game_id,description,max_players,max_playtime,min_age,min_players,min_playtime,name,playing_time,year_published,artist,category,designer,family,mechanic,publisher,average_rating,users_rated,name_lower
0,1,Die Macher is a game about seven sequential po...,5,240,14,3,240,Die Macher,240,1986,Marcus Gschwendtner,"Economic,Negotiation,Political",Karl-Heinz Schmiel,"Country: Germany,Valley Games Classic Line","Area Control / Area Influence,Auction/Bidding,...","Hans im Glück Verlags-GmbH,Moskito Spiele,Vall...",7.66508,4498,die macher
1,2,Dragonmaster is a trick-taking card game based...,4,30,12,3,30,Dragonmaster,30,1981,Bob Pepper,"Card Game,Fantasy","G. W. ""Jerry"" D'Arcey",Animals: Dragons,Trick-taking,"E.S. Lowe,Milton Bradley",6.60815,478,dragonmaster
2,3,"Part of the Knizia tile-laying trilogy, Samura...",4,60,10,2,30,Samurai,60,1998,Franz Vohwinkel,"Abstract Strategy,Medieval",Reiner Knizia,"Asian Theme,Country: Japan,Knizia tile-laying ...","Area Control / Area Influence,Hand Management,...","999 Games,ABACUSSPIELE,Astrel Games,Ceilikan J...",7.44119,12019,samurai
3,4,When you see the triangular box and the luxuri...,4,60,12,2,60,Tal der Könige,60,1992,,Ancient,Christian Beierer,"Country: Egypt,Promotional Board Games","Action Point Allowance System,Area Control / A...",KOSMOS,6.60675,314,tal der könige
4,5,"In Acquire, each player strategically invests ...",6,90,12,3,90,Acquire,90,1964,"Scott Okumura,Peter Whitley",Economic,Sid Sackson,3M Bookshelf Series,"Hand Management,Stock Holding,Tile Placement","3M,Avalon Hill,Avalon Hill (Hasbro),Dujardin,G...",7.3583,15195,acquire


In [16]:
# Write the cleaned table to the work file (comma-separated, double-quote quoting).
# The DataFrame index is written as the first, unnamed column.
output_path = "1loading_output.csv"
board_games_raw.to_csv(output_path, quotechar='"', sep=',')