# BGG - Scrubbing/Cleaning

In [8]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc

# ignore warnings (gets rid of Pandas copy warnings)
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 30)

# NLP tools
import spacy
nlp = spacy.load("en_core_web_sm")
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize

In [2]:
def integer_reduce(data, columns, fill_value=0):
    for column in columns:
        print(column)
        data[column].fillna(fill_value, inplace=True)
        if (data[column].max() <= 127) & (data[column].min() >= -128):
            data[column] = data[column].astype('int8')
        elif (data[column].max() <= 32767) & (data[column].min() >= -32768):
            data[column] = data[column].astype('int16')
        elif (data[column].max() <= 2147483647) & (data[column].min() >= -2147483648):
            data[column] = data[column].astype('int32')
        
    return data

In [3]:
def text_block_processor(text):
    '''Takes a block of text. Divides block into sentences with words lemmatized.
    Sends each sentence to word processor. Concatenates all words into one string
    If the string contains "zestimate", returns a DEFAULT listing note
    Otherwise returns string of cleaned and processed words from text block
    ARGUMENTS:
    block of text
    '''
    
    text = str(text)
    line = re.sub(r'[^a-zA-Z\s]', '', text).lower() # removes all special characters and numbers, and makes lower case
    tokens = nlp(line)
    words = []
    for token in tokens:
        if token.is_stop == False:
            token_preprocessed = token.lemma_
            if token_preprocessed != '': # only continues if returned word is not empty
                words.append(token_preprocessed) # appends word to list of words
    line = ' '.join(words)
    
    return line


In [4]:
def fix_numbers(x):
    
    if type(x) is int:
        return int(x)
    
    if str.endswith(x, 'k'):
        x = str(x).strip('k')
        new_num = int(float(x)*1000)
        return int(new_num)
    
    else: 
        return int(x)

In [5]:
def clean_ratings(id_num, game_ids_list):
    
    print('\nCleaning Frame #'+str(id_num))
    
    #load in raw users file
    users = pd.read_pickle('userid/user_ratings'+str(id_num)+'.pkl')
    
    # convert all datatypes to float
    float_converted = users.astype('float')
    
    # delete and clean up raw users file
    del users
    gc.collect()
    
    # create intersection between user file and game list ids
    cleaned = float_converted[float_converted.columns.intersection(game_ids_list)]
    
    del float_converted
    gc.collect()
    
    # make a list of users with fewer than 10 user ratings
    sums = cleaned.count(axis=1)<10
    # get indices for the rows with fewer than 10 ratings
    drop_these = sums.loc[sums==True].index
    # drop the rows with fewer than 10 ratings
    cleaned.drop(drop_these, axis=0, inplace=True)
    
    print(cleaned.info())
    
    # return cleaned file
    return cleaned

In [6]:
def create_ratings_file(num_files):
    
    game_ids = pd.read_pickle('data_cleaned/game_ids.pkl')
    game_ids_list = list(game_ids[0])
    
    del game_ids
    gc.collect()
    
    master_file = pd.DataFrame()
    
    num_files=num_files
    
    for id_num in np.arange(1,num_files+1,1):    
        cleaned_item = clean_ratings(id_num, game_ids_list)
        master_file = pd.concat([master_file, cleaned_item], axis=0)
    
    del cleaned_item
    gc.collect()
    
    return master_file

## Games

In [5]:
games = pd.read_pickle('data_dirty/games.pkl')

In [6]:
games

Unnamed: 0,BGGId,Name,Description,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,BestPlayers,GoodPlayers,NumOwned,NumWant,NumWish,NumWeightVotes,MfgPlaytime,ComMinPlaytime,ComMaxPlaytime,MfgAgeRec,NumUserRatings,NumComments,NumAlternates,NumExpansions,NumAwards,NumImplementations,NumFans,NumPageViews,RulesPosts,TotalPosts,IsExpansion,IsReimplementation,Family,Theme,Category,Kickstarted,ImagePath,Rank:boardgame,Rank:thematic,Rank:strategygames,Rank:wargames,Rank:familygames,Rank:cgs,Rank:abstracts,Rank:partygames,Rank:childrensgames,Rank:rpgitem,Rank:boardgameaccessory,Rank:videogame,Rank:amiga,Rank:commodore64,Rank:arcade,Rank:atarist
0,174430,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,2017,3.8726,8.76029,8.52385,1.63358,1,4,12.784946,4.152542,3,"[1, 2, 3, 4]",74975,1375,17276,2009,120,60,120,14,46228,8308,6,12,30,0,7718,10816029,6326,14752,0,0,Gloomhaven,,Dungeon Crawl,1,https://cf.geekdo-images.com/sZYp_3BTDGjh2unaZ...,1.0,1.0,1.0,,,,,,,,,,,,,
1,161936,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,2015,2.8331,8.60116,8.45000,1.56368,2,4,11.354037,4.077778,4,"[2, 3, 4]",68974,840,11544,1210,60,60,60,13,44043,6682,10,0,30,2,2952,3450336,1229,3086,0,1,Pandemic,,,,https://cf.geekdo-images.com/-Qer2BBPG7qGGDu6K...,2.0,2.0,3.0,,,,,,,,,,,,,
2,224517,Brass: Birmingham,Brass: Birmingham is an economic strategy game...,2018,3.9038,8.66907,8.41066,1.24331,2,4,13.226190,1.035714,3,"[2, 3, 4]",35448,1528,11277,1143,120,60,120,14,23776,3610,6,0,18,1,1902,1974872,387,1028,0,1,Brass,Canals,,1,https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg...,3.0,,2.0,,,,,,,,,,,,,
3,167791,Terraforming Mars,"In the 2400s, mankind begins to terraform the ...",2016,3.2429,8.42155,8.27751,1.38628,1,5,11.891156,3.380952,3,"[1, 2, 3, 4]",97330,2061,18842,2944,120,120,120,12,71474,10452,14,22,39,2,6372,6285054,1155,4313,0,0,Terraforming Mars,,,,https://cf.geekdo-images.com/wg9oOLcsKvDesSUdZ...,4.0,,6.0,,,,,,,,,,,,,
4,291457,Gloomhaven: Jaws of the Lion,Gloomhaven: Jaws of the Lion is a standalone g...,2020,3.5649,8.72198,8.25902,1.42169,1,4,12.051948,4.000000,2,"[1, 2, 3, 4]",33444,477,6250,485,120,30,120,14,13939,2149,6,0,11,0,1649,1471587,1073,2062,0,0,Gloomhaven,,Dungeon Crawl,,https://cf.geekdo-images.com/_HhIdavYW-hid20Iq...,5.0,3.0,5.0,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21403,7316,Bingo,A classic party game in which players cover pl...,1530,1.0439,2.87934,3.96295,1.76399,2,99,4.720000,1.062500,3,[3],1653,2,27,205,60,60,60,5,2257,707,72,0,0,0,10,57207,0,12,0,0,Classic (Schmidt Spiele),,,,https://cf.geekdo-images.com/7xqN6StcQz1FoGplo...,21478.0,,,,,,,631.0,,,,,,,,
21404,5048,Candy Land,Created by Eleanor Abbott in the early 1940's ...,1949,1.1012,3.18388,3.79668,1.72104,2,4,3.325000,1.052632,4,"[2, 3, 4]",6109,4,66,346,30,30,30,3,4178,1543,10,0,0,3,36,321486,3,108,0,0,,Food / Cooking,,,https://cf.geekdo-images.com/97n-BYkjnFiHAhqUz...,21479.0,,,,,,,,873.0,,,,,,,
21405,5432,Chutes and Ladders,Traditional game from ancient India was brough...,-200,1.0195,2.86610,3.61367,1.64312,2,6,3.357143,1.000000,4,"[2, 3, 4, 5]",4705,4,58,308,30,30,30,3,3967,1318,162,0,0,0,22,252400,0,55,0,0,GoPlay,Circus,,,https://cf.geekdo-images.com/P1qJDS_DFTtP_FrpW...,21480.0,,,,,,,,874.0,,,,,,,
21406,11901,Tic-Tac-Toe,A very old game where each player attempts to ...,-1300,1.1697,2.69687,3.57174,1.98415,2,2,4.181818,1.035714,2,[2],1436,9,27,383,1,1,1,4,3399,1007,38,1,0,0,28,147304,2,66,0,0,Game in a Tin (HABA),Video Game Theme: Super Mario Bros.,n in a row,,https://cf.geekdo-images.com/UImMYmMZKE4AGTMPH...,21481.0,,,,,,1106.0,,875.0,,,,,,,


In [7]:
games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21408 entries, 0 to 21407
Data columns (total 55 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   BGGId                    21408 non-null  int64  
 1   Name                     21408 non-null  object 
 2   Description              21408 non-null  object 
 3   YearPublished            21408 non-null  int64  
 4   GameWeight               21408 non-null  float64
 5   AvgRating                21408 non-null  float64
 6   BayesAvgRating           21408 non-null  float64
 7   StdDev                   21408 non-null  float64
 8   MinPlayers               21408 non-null  int64  
 9   MaxPlayers               21408 non-null  int64  
 10  ComAgeRec                16010 non-null  float64
 11  LanguageEase             15642 non-null  float64
 12  BestPlayers              1929 non-null   object 
 13  GoodPlayers              21408 non-null  object 
 14  NumOwned              

In [8]:
drops = games.loc[games['BGGId']==0].index
games.drop(games.loc[drops].index, axis=0, inplace=True)
games.drop_duplicates(subset='BGGId', keep='first', inplace=True)
games.drop(['Rank:rpgitem', 'Rank:boardgameaccessory', 'Rank:videogame', 'Rank:amiga', 'Rank:commodore64', 'Rank:arcade', 'Rank:atarist'], axis=1, inplace=True)
games.shape

(21369, 48)

In [9]:
games.columns

Index(['BGGId', 'Name', 'Description', 'YearPublished', 'GameWeight',
       'AvgRating', 'BayesAvgRating', 'StdDev', 'MinPlayers', 'MaxPlayers',
       'ComAgeRec', 'LanguageEase', 'BestPlayers', 'GoodPlayers', 'NumOwned',
       'NumWant', 'NumWish', 'NumWeightVotes', 'MfgPlaytime', 'ComMinPlaytime',
       'ComMaxPlaytime', 'MfgAgeRec', 'NumUserRatings', 'NumComments',
       'NumAlternates', 'NumExpansions', 'NumAwards', 'NumImplementations',
       'NumFans', 'NumPageViews', 'RulesPosts', 'TotalPosts', 'IsExpansion',
       'IsReimplementation', 'Family', 'Theme', 'Category', 'Kickstarted',
       'ImagePath', 'Rank:boardgame', 'Rank:thematic', 'Rank:strategygames',
       'Rank:wargames', 'Rank:familygames', 'Rank:cgs', 'Rank:abstracts',
       'Rank:partygames', 'Rank:childrensgames'],
      dtype='object')

In [10]:
games['BestPlayers'].fillna(0, inplace=True)
games['BestPlayers'].unique()

array(['3', '4', '2', '6', '1', '5', '7', '8', 0, '3+', '14', '15', '12',
       '0+', '9', '13', '11'], dtype=object)

In [11]:
games.loc[(games['BestPlayers']=='3+') | (games['BestPlayers']=='0+')] = 0
games['BestPlayers'] = games['BestPlayers'].astype('int8')

In [12]:
games.loc[games['Rank:thematic'].notna(), 'Cat:Thematic'] = int(1)
games.loc[games['Rank:strategygames'].notna(), 'Cat:Strategy'] = int(1)
games.loc[games['Rank:wargames'].notna(), 'Cat:War'] = int(1)
games.loc[games['Rank:familygames'].notna(), 'Cat:Family'] = int(1)
games.loc[games['Rank:cgs'].notna(), 'Cat:CGS'] = int(1)
games.loc[games['Rank:abstracts'].notna(), 'Cat:Abstract'] = int(1)
games.loc[games['Rank:partygames'].notna(), 'Cat:Party'] = int(1)
games.loc[games['Rank:childrensgames'].notna(), 'Cat:Childrens'] = int(1)

In [13]:
int_columns=['BGGId', 'YearPublished', 'MinPlayers', 'MaxPlayers', 'NumOwned',
       'NumWant', 'NumWish', 'NumWeightVotes', 'MfgPlaytime', 'ComMinPlaytime',
       'ComMaxPlaytime','MfgAgeRec', 'NumUserRatings', 'NumComments',
       'NumAlternates', 'NumExpansions', 'NumAwards', 'NumImplementations',
       'NumFans', 'NumPageViews', 'RulesPosts', 'TotalPosts', 'IsExpansion',
       'IsReimplementation', 'Kickstarted']

ranks = ['Rank:boardgame', 'Rank:thematic', 'Rank:strategygames',
       'Rank:wargames', 'Rank:familygames', 'Rank:cgs', 'Rank:abstracts',
       'Rank:partygames', 'Rank:childrensgames']

categories = ['Cat:Thematic', 'Cat:Strategy', 'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract', 'Cat:Party', 'Cat:Childrens']

games = integer_reduce(games, int_columns, fill_value=0)

games = integer_reduce(games, ranks, fill_value=21369)

games = integer_reduce(games, categories, fill_value=0)

games.info()

BGGId
YearPublished
MinPlayers
MaxPlayers
NumOwned
NumWant
NumWish
NumWeightVotes
MfgPlaytime
ComMinPlaytime
ComMaxPlaytime
MfgAgeRec
NumUserRatings
NumComments
NumAlternates
NumExpansions
NumAwards
NumImplementations
NumFans
NumPageViews
RulesPosts
TotalPosts
IsExpansion
IsReimplementation
Kickstarted
Rank:boardgame
Rank:thematic
Rank:strategygames
Rank:wargames
Rank:familygames
Rank:cgs
Rank:abstracts
Rank:partygames
Rank:childrensgames
Cat:Thematic
Cat:Strategy
Cat:War
Cat:Family
Cat:CGS
Cat:Abstract
Cat:Party
Cat:Childrens
<class 'pandas.core.frame.DataFrame'>
Int64Index: 21369 entries, 0 to 21407
Data columns (total 56 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   BGGId                21369 non-null  int32  
 1   Name                 21369 non-null  object 
 2   Description          21369 non-null  object 
 3   YearPublished        21369 non-null  int16  
 4   GameWeight           21369 non-null  float64
 5   Avg

In [14]:
empty_games = list(games.loc[games['Name']==0].index)
games.drop(games.index[empty_games], inplace=True)
games.reset_index(inplace=True, drop=True)

In [15]:
not_released = list(games.loc[games['YearPublished']>2021].index)
games.drop(games.index[not_released], inplace=True)
games.reset_index(inplace=True, drop=True)

In [16]:
# copy the description column to a new data frame for text processing
games['Description'] = games['Description'].apply(lambda x: text_block_processor(x))

In [17]:
# All games with over 12 players are set at 13 players.
games.loc[games['MaxPlayers']>12, 'MaxPlayers'] = 13

In [18]:
# Games with min players of 0, we will set their min players = 2

games.loc[games['MinPlayers']<1, 'MaxPlayers'] = 2

In [19]:
games

Unnamed: 0,BGGId,Name,Description,YearPublished,GameWeight,AvgRating,BayesAvgRating,StdDev,MinPlayers,MaxPlayers,ComAgeRec,LanguageEase,BestPlayers,GoodPlayers,NumOwned,NumWant,NumWish,NumWeightVotes,MfgPlaytime,ComMinPlaytime,ComMaxPlaytime,MfgAgeRec,NumUserRatings,NumComments,NumAlternates,NumExpansions,NumAwards,NumImplementations,NumFans,NumPageViews,RulesPosts,TotalPosts,IsExpansion,IsReimplementation,Family,Theme,Category,Kickstarted,ImagePath,Rank:boardgame,Rank:thematic,Rank:strategygames,Rank:wargames,Rank:familygames,Rank:cgs,Rank:abstracts,Rank:partygames,Rank:childrensgames,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
0,174430,Gloomhaven,gloomhaven game euroinspired tactical combat...,2017,3.8726,8.76029,8.52385,1.63358,1,4,12.784946,4.152542,3,"[1, 2, 3, 4]",74975,1375,17276,2009,120,60,120,14,46228,8308,6,12,30,0,7718,10816029,6326,14752,0,0,Gloomhaven,,Dungeon Crawl,1,https://cf.geekdo-images.com/sZYp_3BTDGjh2unaZ...,1,1,1,21369,21369,21369,21369,21369,21369,1,1,0,0,0,0,0,0
1,161936,Pandemic Legacy: Season 1,pandemic legacy cooperative campaign game over...,2015,2.8331,8.60116,8.45000,1.56368,2,4,11.354037,4.077778,4,"[2, 3, 4]",68974,840,11544,1210,60,60,60,13,44043,6682,10,0,30,2,2952,3450336,1229,3086,0,1,Pandemic,,,0,https://cf.geekdo-images.com/-Qer2BBPG7qGGDu6K...,2,2,3,21369,21369,21369,21369,21369,21369,1,1,0,0,0,0,0,0
2,224517,Brass: Birmingham,brass birmingham economic strategy game sequel...,2018,3.9038,8.66907,8.41066,1.24331,2,4,13.226190,1.035714,3,"[2, 3, 4]",35448,1528,11277,1143,120,60,120,14,23776,3610,6,0,18,1,1902,1974872,387,1028,0,1,Brass,Canals,,1,https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg...,3,21369,2,21369,21369,21369,21369,21369,21369,0,1,0,0,0,0,0,0
3,167791,Terraforming Mars,s mankind begin terraform planet mar giant cor...,2016,3.2429,8.42155,8.27751,1.38628,1,5,11.891156,3.380952,3,"[1, 2, 3, 4]",97330,2061,18842,2944,120,120,120,12,71474,10452,14,22,39,2,6372,6285054,1155,4313,0,0,Terraforming Mars,,,0,https://cf.geekdo-images.com/wg9oOLcsKvDesSUdZ...,4,21369,6,21369,21369,21369,21369,21369,21369,0,1,0,0,0,0,0,0
4,291457,Gloomhaven: Jaws of the Lion,gloomhaven jaw lion standalone game take place...,2020,3.5649,8.72198,8.25902,1.42169,1,4,12.051948,4.000000,2,"[1, 2, 3, 4]",33444,477,6250,485,120,30,120,14,13939,2149,6,0,11,0,1649,1471587,1073,2062,0,0,Gloomhaven,,Dungeon Crawl,0,https://cf.geekdo-images.com/_HhIdavYW-hid20Iq...,5,3,5,21369,21369,21369,21369,21369,21369,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21285,16398,War,war play standard card deck special themed d...,0,1.0000,2.29912,4.01489,1.64441,2,2,4.357143,1.000000,0,[],446,2,7,128,30,30,30,4,1392,486,31,0,0,1,8,64834,2,23,0,0,,,,0,https://cf.geekdo-images.com/CeBpBleLUkV6uM-98...,21477,21369,21369,21369,21369,21369,21369,21369,876,0,0,0,0,0,0,0,1
21286,7316,Bingo,classic party game player cover place card bas...,1530,1.0439,2.87934,3.96295,1.76399,2,13,4.720000,1.062500,3,[3],1653,2,27,205,60,60,60,5,2257,707,72,0,0,0,10,57207,0,12,0,0,Classic (Schmidt Spiele),,,0,https://cf.geekdo-images.com/7xqN6StcQz1FoGplo...,21478,21369,21369,21369,21369,21369,21369,631,21369,0,0,0,0,0,0,1,0
21287,5048,Candy Land,create eleanor abbott early s entertain child ...,1949,1.1012,3.18388,3.79668,1.72104,2,4,3.325000,1.052632,4,"[2, 3, 4]",6109,4,66,346,30,30,30,3,4178,1543,10,0,0,3,36,321486,3,108,0,0,,Food / Cooking,,0,https://cf.geekdo-images.com/97n-BYkjnFiHAhqUz...,21479,21369,21369,21369,21369,21369,21369,21369,873,0,0,0,0,0,0,0,1
21288,5432,Chutes and Ladders,traditional game ancient india bring uk comm...,-200,1.0195,2.86610,3.61367,1.64312,2,6,3.357143,1.000000,4,"[2, 3, 4, 5]",4705,4,58,308,30,30,30,3,3967,1318,162,0,0,0,22,252400,0,55,0,0,GoPlay,Circus,,0,https://cf.geekdo-images.com/P1qJDS_DFTtP_FrpW...,21480,21369,21369,21369,21369,21369,21369,21369,874,0,0,0,0,0,0,0,1


In [20]:
games.reset_index(inplace=True, drop=True)

In [21]:
games.to_pickle('data_cleaned/games.pkl')

## Designers

In [22]:
designers = pd.read_pickle('data_dirty/designers.pkl')
designers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21407 entries, 0 to 21406
Columns: 9691 entries, BGGId to Ledger Gibbs
dtypes: float64(9690), object(1)
memory usage: 1.5+ GB


In [23]:
designers.drop_duplicates(subset='BGGId', keep='first', inplace=True)
designers.fillna(0, inplace=True)
temp_id = designers['BGGId']
designers.drop('BGGId', axis=1, inplace=True)
designers = designers.astype('int8')
designers['BGGId'] = temp_id
designers.drop(designers.index[empty_games], inplace=True)
designers.reset_index(inplace=True, drop=True)
designers.drop(designers.index[not_released], inplace=True)
designers.reset_index(inplace=True, drop=True)
designers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21290 entries, 0 to 21289
Columns: 9691 entries, Isaac Childres to BGGId
dtypes: int64(1), int8(9690)
memory usage: 196.9 MB


In [24]:
designers.to_pickle('data_cleaned/designers_all.pkl')

In [25]:
# locate all row that sum to 3 or less (find low experience designers)

# change the 3 to whatever desired for more or less experience
lowexp_rows = designers.loc[:, designers.sum(axis=0) <= 3]

# Locate the columns that contain the low experience designers
lowexp_columns = lowexp_rows[lowexp_rows.sum(axis=1) > 0]

# get indices of those low-exp columns
indices = lowexp_columns.index

# make new column for low exp designer
designers['Low-Exp Designer'] = 0

# for each index in the low exp list, set low exp designer to 1
for index in indices:
    designers.loc[index, 'Low-Exp Designer'] = 1
    
# drop all columns for one-off designers
designers.drop(designers.loc[:, designers.sum(axis=0) <= 3], axis=1, inplace=True)

In [26]:
designers.to_pickle('data_cleaned/designers_reduced.pkl')

## Categories

In [27]:
categories = pd.read_pickle('data_dirty/categories.pkl')

In [28]:
categories.drop_duplicates(subset='BGGId', keep='first', inplace=True)
categories.fillna(0, inplace=True)
temp_id = categories['BGGId']
categories.drop('BGGId', axis=1, inplace=True)
categories = categories.astype('int8')
categories['BGGId'] = temp_id
categories.drop(categories.index[empty_games], inplace=True)
categories.reset_index(inplace=True, drop=True)
categories.drop(categories.index[not_released], inplace=True)
categories.reset_index(inplace=True, drop=True)
categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21290 entries, 0 to 21289
Data columns (total 84 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   Adventure                   21290 non-null  int8 
 1   Exploration                 21290 non-null  int8 
 2   Fantasy                     21290 non-null  int8 
 3   Fighting                    21290 non-null  int8 
 4   Miniatures                  21290 non-null  int8 
 5   Environmental               21290 non-null  int8 
 6   Medical                     21290 non-null  int8 
 7   Economic                    21290 non-null  int8 
 8   Industry / Manufacturing    21290 non-null  int8 
 9   Transportation              21290 non-null  int8 
 10  Science Fiction             21290 non-null  int8 
 11  Space Exploration           21290 non-null  int8 
 12  Territory Building          21290 non-null  int8 
 13  Civilization                21290 non-null  int8 
 14  Negoti

In [29]:
categories

Unnamed: 0,Adventure,Exploration,Fantasy,Fighting,Miniatures,Environmental,Medical,Economic,Industry / Manufacturing,Transportation,Science Fiction,Space Exploration,Territory Building,Civilization,Negotiation,Political,Wargame,Civil War,Movies / TV / Radio theme,Card Game,Novel-based,Age of Reason,Mythology,Renaissance,American West,Animals,Modern Warfare,Dice,Medieval,Ancient,City Building,Nautical,Post-Napoleonic,Horror,Educational,Puzzle,Collectible Components,Farming,Religious,Travel,Murder/Mystery,Pirates,Comic Book / Strip,Mature / Adult,Video Game Theme,Spies/Secret Agents,Abstract Strategy,Bluffing,Action / Dexterity,Arabian,Prehistoric,Deduction,Trains,Party Game,Word Game,Aviation / Flight,Zombies,World War II,Pike and Shot,World War I,Real-time,Humor,Print & Play,Racing,Sports,Electronic,Maze,Mafia,Expansion for Base-game,American Indian Wars,Napoleonic,American Revolutionary War,Children's Game,Memory,Vietnam War,American Civil War,Math,Number,Trivia,Music,Korean War,Game System,Book,BGGId
0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,174430
1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,161936
2,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,224517
3,0,0,0,0,0,1,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,167791
4,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,291457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21285,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,16398
21286,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,7316
21287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5048
21288,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5432


In [30]:
categories.to_pickle('data_cleaned/categories.pkl')

## Mechanics

In [31]:
mechanics = pd.read_pickle('data_dirty/mechanics.pkl')
mechanics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21407 entries, 0 to 21406
Columns: 185 entries, BGGId to Impulse Movement
dtypes: float64(184), int32(1)
memory usage: 30.1 MB


In [32]:
mechanics.drop_duplicates(subset='BGGId', keep='first', inplace=True)
mechanics.fillna(0, inplace=True)
temp_id = mechanics['BGGId']
mechanics.drop('BGGId', axis=1, inplace=True)
mechanics = mechanics.astype('int8')
mechanics['BGGId'] = temp_id
mechanics.drop(mechanics.index[empty_games], inplace=True)
mechanics.reset_index(inplace=True, drop=True)
mechanics.drop(mechanics.index[not_released], inplace=True)
mechanics.reset_index(inplace=True, drop=True)
mechanics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21290 entries, 0 to 21289
Columns: 185 entries, Action Queue to BGGId
dtypes: int32(1), int8(184)
memory usage: 3.8 MB


In [33]:
mechanics

Unnamed: 0,Action Queue,Action Retrieval,Campaign / Battle Card Driven,Card Play Conflict Resolution,Communication Limits,Cooperative Game,Critical Hits and Failures,Deck Construction,"Deck, Bag, and Pool Building",Grid Movement,Hand Management,Hexagon Grid,Legacy,Legacy Game,Modular Board,Once-Per-Game Abilities,Scenario / Mission / Campaign Game,Simultaneous Action Selection,Solo / Solitaire Game,Storytelling,Variable Player Powers,Action Points,Point to Point Movement,Set Collection,Trading,Income,Loans,Market,Network and Route Building,Score-and-Reset Game,Tech Trees / Tech Tracks,Turn Order: Stat-Based,Variable Set-up,Drafting,End Game Bonuses,TableauBuilding,Take That,Tile Placement,Turn Order: Progressive,Line of Sight,Action Drafting,Area Majority / Influence,Area-Impulse,Dice Rolling,Follow,King of the Hill,Variable Phase Order,Voting,Turn Order: Pass Order,Victory Points as a Resource,Area Movement,Delayed Purchase,Team-Based Game,Auction/Bidding,Auction: Dutch,Card Drafting,Events,Hidden Movement,Movement Points,Simulation,Ownership,Rondel,Track Movement,Action/Event,Advantage Token,Sudden Death Ending,Tug of War,Force Commitment,Narrative Choice / Paragraph,Grid Coverage,Worker Placement with Dice Workers,Layering,Increase Value of Unchosen Resources,Hidden Roles,Player Elimination,Semi-Cooperative Game,Traitor Game,Automatic Resource Growth,Push Your Luck,Worker Placement,Role Playing,Stat Check Resolution,Contracts,Turn Order: Auction,"Worker Placement, Different Worker Types",Race,Hidden Victory Points,Turn Order: Claim Action,Memory,Enclosure,Stock Holding,Pick-up and Deliver,Map Addition,Die Icon Resolution,Resource to Move,Turn Order: Role Order,Trick-taking,Move Through Deck,Bias,Auction: Turn Order Until Pass,Catch the Leader,Programmed Movement,Slide/Push,Commodity Speculation,Square Grid,Moving Multiple Units,Alliances,Kill Steal,Passed Action Token,Pattern Building,Investment,Secret Unit Deployment,Flicking,Highest-Lowest Scoring,Constrained Bidding,Mancala,Auction: Fixed Placement,Multiple-Lot Auction,Lose a Turn,Auction: Sealed Bid,Betting and Bluffing,Negotiation,Time Track,Connections,Targeted Clues,Order Counters,Bingo,Line Drawing,Paper-and-Pencil,Deduction,Movement Template,Selection Order Bid,Multiple Maps,Re-rolling and Locking,Roll / Spin and Move,Finale Ending,Roles with Asymmetric Information,Command Cards,"I Cut, You Choose",Prisoner's Dilemma,Interrupts,Real-Time,Ladder Climbing,Predictive Bid,Auction: Once Around,Closed Economy Auction,Three Dimensional Movement,Zone of Control,Random Production,Relative Movement,Cube Tower,Auction: English,Bribery,Map Deformation,Elapsed Real Time Ending,Melding and Splaying,Pieces as Map,Pattern Movement,Static Capture,Pattern Recognition,Minimap Resolution,Auction: Dutch Priority,Rock-Paper-Scissors,Map Reduction,Turn Order: Random,Acting,Singing,Ratio / Combat Results Table,Single Loser Game,Stacking and Balancing,Chaining,Different Dice Movement,Measurement Movement,Action Timer,Physical Removal,Induction,Hot Potato,Speed Matching,Player Judge,Crayon Rail System,Chit-Pull System,Matching,Auction: Dexterity,Impulse Movement,BGGId
0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,174430
1,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,161936
2,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,224517
3,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,0,0,0,0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,167791
4,1,1,1,0,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,291457
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21285,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16398
21286,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7316
21287,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5048
21288,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5432


In [34]:
mechanics.to_pickle('data_cleaned/mechanics.pkl')

## Artists

In [35]:
artists = pd.read_pickle('data_dirty/artists.pkl')
artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21407 entries, 0 to 21406
Columns: 10844 entries, BGGId to Sam Pratt
dtypes: float64(10843), object(1)
memory usage: 1.7+ GB


In [36]:
artists.drop_duplicates(subset='BGGId', keep='first', inplace=True)
artists.fillna(0, inplace=True)
temp_id = artists['BGGId']
artists.drop('BGGId', axis=1, inplace=True)
artists = artists.astype('int8')
artists['BGGId'] = temp_id
artists.drop(artists.index[empty_games], inplace=True)
artists.reset_index(inplace=True, drop=True)
artists.drop(artists.index[not_released], inplace=True)
artists.reset_index(inplace=True, drop=True)
artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21290 entries, 0 to 21289
Columns: 10844 entries, Alexandr Elichev to BGGId
dtypes: int64(1), int8(10843)
memory usage: 220.3 MB


In [37]:
artists.to_pickle('data_cleaned/artists_all.pkl')

In [38]:
# locate all row that sum to 3 or less (find low experience artists)

# change the 3 to whatever desired for more or less experience
lowexp_rows = artists.loc[:, artists.sum(axis=0) <= 3]

# Locate the columns that contain the low experience artists
lowexp_columns = lowexp_rows[lowexp_rows.sum(axis=1) > 0]

# get indices of those low-exp columns
indices = lowexp_columns.index

# make new column for low exp Artist
artists['Low-Exp Artist'] = 0

# for each index in the low exp list, set low exp Artist to 1
for index in indices:
    artists.loc[index, 'Low-Exp Artist'] = 1
    
# drop all columns for one-off artists
artists.drop(artists.loc[:, artists.sum(axis=0) <= 3], axis=1, inplace=True)

In [39]:
artists.to_pickle('data_cleaned/artists_reduced.pkl')

## Publishers

In [40]:
publishers = pd.read_pickle('data_dirty/publishers.pkl')
publishers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21407 entries, 0 to 21406
Columns: 6913 entries, BGGId to Wales Game Systems
dtypes: float64(6912), object(1)
memory usage: 1.1+ GB


In [41]:
publishers.drop_duplicates(subset='BGGId', keep='first', inplace=True)
publishers.fillna(0, inplace=True)
temp_id = publishers['BGGId']
publishers.drop('BGGId', axis=1, inplace=True)
publishers = publishers.astype('int8')
publishers['BGGId'] = temp_id
publishers.drop(publishers.index[empty_games], inplace=True)
publishers.reset_index(inplace=True, drop=True)
publishers.drop(publishers.index[not_released], inplace=True)
publishers.reset_index(inplace=True, drop=True)
publishers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21290 entries, 0 to 21289
Columns: 6913 entries, Albi to BGGId
dtypes: int64(1), int8(6912)
memory usage: 140.5 MB


In [42]:
publishers.to_pickle('data_cleaned/publishers_all.pkl')

In [43]:
# locate all row that sum to 3 or less (find low experience publishers)

# change the 3 to whatever desired for more or less experience
lowexp_rows = publishers.loc[:, publishers.sum(axis=0) <= 3]

# Locate the columns that contain the low experience publishers
lowexp_columns = lowexp_rows[lowexp_rows.sum(axis=1) > 0]

# get indices of those low-exp columns
indices = lowexp_columns.index

# make new column for low exp Publisher
publishers['Low-Exp Publisher'] = 0

# for each index in the low exp list, set low exp Publisher to 1
for index in indices:
    publishers.loc[index, 'Low-Exp Publisher'] = 1
    
# drop all columns for one-off publishers
publishers.drop(publishers.loc[:, publishers.sum(axis=0) <= 3], axis=1, inplace=True)

In [44]:
publishers.to_pickle('data_cleaned/publishers_reduced.pkl')

## Awards

In [45]:
awards = pd.read_pickle('data_dirty/awards.pkl')
awards.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21407 entries, 0 to 21406
Columns: 897 entries, BGGId to ENnies - Best Family Game Winner
dtypes: float64(896), object(1)
memory usage: 146.5+ MB


In [46]:
awards.drop_duplicates(subset='BGGId', keep='first', inplace=True)
awards.fillna(0, inplace=True)
temp_id = awards['BGGId']
awards.drop('BGGId', axis=1, inplace=True)
awards = awards.astype('int8')
awards['BGGId'] = temp_id
awards.drop(awards.index[empty_games], inplace=True)
awards.reset_index(inplace=True, drop=True)
awards.drop(awards.index[not_released], inplace=True)
awards.reset_index(inplace=True, drop=True)
awards.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21290 entries, 0 to 21289
Columns: 897 entries, As d'Or - Jeu de l'Année Expert Nominee to BGGId
dtypes: int64(1), int8(896)
memory usage: 18.4 MB


In [47]:
awards.to_pickle('data_cleaned/awards_all.pkl')

In [48]:
# locate all row that sum to 3 or less (find low experience awards)

# change the 3 to whatever desired for more or less experience
lowexp_rows = awards.loc[:, awards.sum(axis=0) <= 20]

# Locate the columns that contain the low experience awards
lowexp_columns = lowexp_rows[lowexp_rows.sum(axis=1) > 0]

# get indices of those low-exp columns
indices = lowexp_columns.index

# make new column for low exp designer
awards['Other Award(s)'] = 0

# for each index in the low exp list, set low exp designer to 1
for index in indices:
    awards.loc[index, 'Other Award(s)'] = 1
    
# drop all columns for one-off awards
awards.drop(awards.loc[:, awards.sum(axis=0) <= 20], axis=1, inplace=True)

In [49]:
awards.to_pickle('data_cleaned/awards_reduced.pkl')

## Ratings Dist

In [50]:
ratings_dist = pd.read_pickle('data_dirty/ratings_dist.pkl')
ratings_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21407 entries, 0 to 21406
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   BGGId   21407 non-null  object
 1   1       21407 non-null  object
 2   10      21407 non-null  object
 3   2       21407 non-null  object
 4   3       21407 non-null  object
 5   4       21407 non-null  object
 6   5       21407 non-null  object
 7   6       21407 non-null  object
 8   7       21407 non-null  object
 9   8       21407 non-null  object
 10  9       21407 non-null  object
dtypes: object(11)
memory usage: 1.8+ MB


In [51]:
ratings_dist.drop_duplicates(subset='BGGId', keep='first', inplace=True)
ratings_dist.fillna(0, inplace=True)
ratings_dist.drop(ratings_dist.index[empty_games], inplace=True)
ratings_dist.reset_index(inplace=True, drop=True)
ratings_dist.drop(ratings_dist.index[not_released], inplace=True)
ratings_dist.reset_index(inplace=True, drop=True)

for column in ratings_dist.columns[1:]:
    ratings_dist[column] = ratings_dist[column].apply(lambda x: fix_numbers(x))

ratings_dist['num_votes'] = games['NumUserRatings']

temp_id = ratings_dist['BGGId']
ratings_dist.drop('BGGId', axis=1, inplace=True)
ratings_dist = ratings_dist.astype('int32')
ratings_dist['BGGId'] = temp_id

ratings_dist.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21290 entries, 0 to 21289
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   1          21290 non-null  int32
 1   10         21290 non-null  int32
 2   2          21290 non-null  int32
 3   3          21290 non-null  int32
 4   4          21290 non-null  int32
 5   5          21290 non-null  int32
 6   6          21290 non-null  int32
 7   7          21290 non-null  int32
 8   8          21290 non-null  int32
 9   9          21290 non-null  int32
 10  num_votes  21290 non-null  int32
 11  BGGId      21290 non-null  int64
dtypes: int32(11), int64(1)
memory usage: 1.1 MB


In [52]:
ratings_dist

Unnamed: 0,1,10,2,3,4,5,6,7,8,9,num_votes,BGGId
0,445,20000,227,337,394,705,1800,2900,7700,11000,46228,174430
1,445,15000,200,252,319,572,1700,3300,9900,13000,44043,161936
2,66,6800,34,75,117,213,681,1700,6500,7700,23776,224517
3,187,17000,145,359,626,1200,3600,7300,21000,21000,71474,167791
4,106,4800,41,50,77,144,409,916,3100,4300,13939,291457
...,...,...,...,...,...,...,...,...,...,...,...,...
21285,575,16,355,204,143,54,24,15,6,1,1392,16398
21286,619,16,481,433,342,201,109,40,14,6,2257,7316
21287,717,38,891,957,858,411,174,84,43,13,4178,5048
21288,1,9,926,998,780,679,335,147,56,33,3967,5432


In [53]:
ratings_dist.to_pickle('data_cleaned/ratings_dist.pkl')

## Comments

In [None]:
comments = pd.read_pickle('data_dirty/comments.pkl')
comments.info()

In [None]:
comments.drop_duplicates(subset=['BGGId', 'Username'], keep='first', inplace=True)
comments.reset_index(inplace=True, drop=True)
comments.fillna(0, inplace=True)
comments.drop(comments.loc[comments['Name']=='Sheep in Disguise'].index, axis=0, inplace=True)

# copy the description column to a new data frame for text processing
comments['cleaned'] = comments['Value'].apply(lambda x: text_block_processor(x))
    
# drop the description field and save our listing_text to file so we don't have to run it again
comments.drop('Value', axis=1, inplace=True)

In [None]:
comments.to_pickle('data_cleaned/comments.pkl')

## Ratings Matrix

In [None]:
games = pd.read_pickle('data_dirty/games.pkl')


In [9]:
ratings_matrix = create_ratings_file(31)


Cleaning Frame #1
<class 'pandas.core.frame.DataFrame'>
Index: 1499 entries, -Johnny- to AlexSimmons
Columns: 14782 entries, 223033 to 264858
dtypes: float64(14782)
memory usage: 169.1+ MB
None

Cleaning Frame #2
<class 'pandas.core.frame.DataFrame'>
Index: 1184 entries, AlexTC to Arctic tern
Columns: 12717 entries, 3805 to 251632
dtypes: float64(12717)
memory usage: 114.9+ MB
None

Cleaning Frame #3
<class 'pandas.core.frame.DataFrame'>
Index: 1828 entries, ArcticLancer to Belphegor82
Columns: 14962 entries, 209685 to 231477
dtypes: float64(14962)
memory usage: 208.7+ MB
None

Cleaning Frame #4
<class 'pandas.core.frame.DataFrame'>
Index: 1017 entries, Belruel to BoardGamePatrol
Columns: 12831 entries, 192458 to 209672
dtypes: float64(12831)
memory usage: 99.6+ MB
None

Cleaning Frame #5
<class 'pandas.core.frame.DataFrame'>
Index: 1008 entries, BoardGameRental to Buffybot
Columns: 12913 entries, 233144 to 220741
dtypes: float64(12913)
memory usage: 99.3+ MB
None

Cleaning Frame #6
<

In [10]:
ratings_matrix = ratings_matrix[~ratings_matrix.index.duplicated(keep='first')]

In [11]:
ratings_matrix.to_pickle('data_cleaned/ratings_matrix.pkl')

In [12]:
del ratings_matrix
gc.collect()

42