# Notebook Objective and Setup

BGG03 scrubs and cleans the data obtained in notebooks BGG01 and BGG02. The following datasets are cleaned, constructed, or otherwise prepared for EDA and modeling.

    * Games
    * Mechanics
    * Subcategories
    * Designers
    * Artists
    * Publishers
    * Awards
    * Ratings Distribution
    * Comments
    * Ratings Matrix

## Package Imports

In [None]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import json

# ignore warnings (gets rid of Pandas copy warnings)
import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = None

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 30)

# NLP tools
import spacy
nlp = spacy.load("en_core_web_sm")
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize

## Notebook Functions

In [None]:
def integer_reduce(data, columns, fill_value=0):
    '''
    Fills missing values, then casts each column to the smallest signed
    integer type that can hold all of its values
    
    Inputs:
    data: dataframe to reduce (its columns are overwritten)
    columns: columns to reduce
    fill_value: value substituted for NaN before the cast
    
    Returns: 
    data: the same dataframe with memory reduced data types
    '''
    # candidate types, narrowest first
    int_ranges = (
        ('int8', -128, 127),
        ('int16', -32768, 32767),
        ('int32', -2147483648, 2147483647),
        ('int64', -9223372036854775808, 9223372036854775807),
    )

    for column in columns:
        print(column)
        # assign the result back: a chained fillna(inplace=True) on a column
        # selection does not reach the parent frame under pandas Copy-on-Write
        filled = data[column].fillna(fill_value)
        lowest, highest = filled.min(), filled.max()

        for type_name, type_min, type_max in int_ranges:
            if lowest >= type_min and highest <= type_max:
                filled = filled.astype(type_name)
                break
        # if nothing fits (e.g. an empty column, where min/max are NaN)
        # the filled column is stored with its dtype unchanged
        data[column] = filled

    return data

In [None]:
def text_block_processor(text):
    '''Cleans a block of text and returns it as one string of lemmatized words.
    
    The text is lower-cased, stripped of everything except letters and
    whitespace, passed through the spaCy pipeline `nlp`, and rebuilt from the
    lemmas of every token that is not a stop word.
    
    ARGUMENTS:
    text: block of text (converted with str() first, so any object is accepted)
    
    RETURNS:
    string of lemmas separated by single spaces
    '''
    
    text = str(text)
    # removes all special characters and numbers, and makes lower case
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    # collapse each run of whitespace (double spaces, tabs, newlines) into ONE
    # space; replacing it with '' would glue the neighbouring words together
    line = re.sub(r'\s+', ' ', letters_only).strip()
    tokens = nlp(line)
    
    # keep the lemma of every token that is neither a stop word nor whitespace
    words = [
        token.lemma_
        for token in tokens
        if not token.is_stop and not token.is_space and token.lemma_ != ''
    ]
    
    return ' '.join(words)


In [None]:
def fix_numbers(x):
    '''
    Converts a count that may be abbreviated with a trailing "k" into an int
    
    If a string ending in "k", strips off the "k" and multiplies by 1000
    ("1.5k" -> 1500)
    Anything else (ints, floats, numpy numbers, digit strings) goes through int()
    Sends back cleaned int
    '''
    
    if isinstance(x, str):
        x = x.strip()
        if x.endswith('k'):
            # rstrip so only the trailing suffix is removed; round() guards
            # against float artefacts such as 4349.999... truncating to 4349
            return int(round(float(x.rstrip('k')) * 1000))
    
    return int(x)

In [None]:
def clean_ratings(id_num, game_ids, min_ratings=5):
    '''
    Loads and cleans a raw user ratings file
    Drops game ids not present in games file
    Drops users with fewer than min_ratings ratings among the remaining games
    
    Inputs:
    id_num: the appendation of the file to find the path
    game_ids: list of game ids in the games file
    min_ratings: fewest ratings a user needs in order to be kept (default 5)
    
    Outputs:
    Cleaned user ratings file
    '''
    
    print('\nCleaning Frame #'+str(id_num))
    
    # load in raw users file according to id_num inputted and convert all
    # datatypes to float; chaining avoids keeping a second reference to the raw frame
    # NOTE: read_pickle executes whatever the file contains - only load files
    # produced by this project
    path = 'userid/user_ratings'+str(id_num)+'.pkl'
    float_converted = pd.read_pickle(path).astype('float')
    
    # create intersection between user file and game list ids
    float_converted.columns = float_converted.columns.astype('int32')
    cleaned = float_converted[float_converted.columns.intersection(game_ids)]
    
    # delete and clean up
    del float_converted
    gc.collect()
    
    # count() skips NaN, so this is the number of kept games each row has a value for
    ratings_per_user = cleaned.count(axis=1)
    # boolean selection returns a new frame instead of dropping in place on a slice
    cleaned = cleaned.loc[ratings_per_user >= min_ratings]
    
    # print memory usage (info() prints by itself and returns None)
    cleaned.info()
    
    # return cleaned file
    return cleaned

In [None]:
def create_ratings_file(start_file, end_file, game_ids):
    '''
    Puts together dataframes from a range of files
    Each file calls the clean_ratings function
    Then all files in range are concatenated and exact repeats removed
    
    Inputs:
    start_file: start of file name appendation
    end_file: end file name appendation (inclusive)
    game_ids: list of game ids in the games file
    
    Outputs:
    Cleaned and concatenated master file
    (an empty dataframe when the range contains no files)
    
    '''
    
    # clean every file first and concatenate once at the end; growing the
    # master frame inside the loop re-copies all earlier rows on every pass
    cleaned_frames = []
    for id_num in np.arange(start_file,end_file+1,1):
        print(id_num)
        cleaned_frames.append(clean_ratings(id_num, game_ids))
    
    # nothing to concatenate when start_file > end_file
    if not cleaned_frames:
        return pd.DataFrame()
    
    master_file = pd.concat(cleaned_frames, axis=0)
    
    # clean up
    del cleaned_frames
    gc.collect()
    
    # DataFrame.drop_duplicates() compares values only and ignores the index,
    # so two different users with identical ratings would collapse into one.
    # Including the index in the comparison removes only true repeats
    # (same row label AND same values).
    repeated = master_file.reset_index().duplicated(keep='first').to_numpy()
    master_file = master_file.loc[~repeated]
    
    return master_file

# Games Files

In [None]:
games = pd.read_pickle('data_dirty_new_scraper/games.pkl')
games

In [None]:
games['Theme'].unique()

In [None]:
# Get info, make note of datatypes and memory usage
games.info()

In [None]:
#drops = games.loc[games['BGGId']==0].index # get indices of any games with no BGGId
#drops
#games.drop(games.loc[drops].index, axis=0, inplace=True) # drop games with no BGGId

In [None]:
# columns that carry no board-game information for this project
non_boardgame_columns = ['NumAwards', 'NumFans', 'NumPageViews', 'RulesPosts', 'TotalPosts', 'Category', 'IsExpansion', 'Rank:rpgitem', 'Rank:boardgameaccessory', 'Rank:videogame', 'Rank:amiga', 'Rank:commodore64', 'Rank:arcade', 'Rank:atarist']

# keep the first listing of every BGGId, then discard the unwanted columns
games = (
    games
    .drop_duplicates(subset='BGGId', keep='first')
    .drop(non_boardgame_columns, axis=1)
)

games.shape # check shape of file

In [None]:
# what are the columns?
games.columns

In [None]:
# Fill NaN on BestPlayers. The result is assigned back to the column: an
# inplace fillna on a column selection does not update the parent frame
# under pandas Copy-on-Write, and warnings are silenced in this notebook.
games['BestPlayers'] = games['BestPlayers'].fillna(0)
games['BestPlayers'].unique() # What are the unique BestPlayers entries?

In [None]:
# Replace the open-ended "N+" labels with plain numbers so every entry is an integer
# NOTE(review): '0+' is mapped to 4, not 0 - confirm this is the intended value
open_ended_labels = {'3+': 3, '0+': 4}
for label, player_count in open_ended_labels.items():
    games.loc[(games['BestPlayers']==label), 'BestPlayers'] = player_count

# with only numbers left, the column fits into int8
games['BestPlayers'] = games['BestPlayers'].astype('int8')

In [None]:
# A game that has a rank in a category list belongs to that category:
# set the matching Cat:* flag to 1 wherever the rank is present
rank_to_category = {
    'Rank:thematic': 'Cat:Thematic',
    'Rank:strategygames': 'Cat:Strategy',
    'Rank:wargames': 'Cat:War',
    'Rank:familygames': 'Cat:Family',
    'Rank:cgs': 'Cat:CGS',
    'Rank:abstracts': 'Cat:Abstract',
    'Rank:partygames': 'Cat:Party',
    'Rank:childrensgames': 'Cat:Childrens',
}

for rank_column, category_flag in rank_to_category.items():
    games.loc[games[rank_column].notna(), category_flag] = int(1)

In [None]:
# Shrink the numeric columns of games to the smallest integer types.
# This must run AFTER the Cat:* flags are derived: those flags test the
# Rank:* columns with notna(), and the rank reduction below fills every NaN.

# integer reduction with fill_values of 0
# (the Cat:* flags were only ever set to 1 above, so the fill supplies the 0s)
int_columns=['BGGId', 'YearPublished', 'MinPlayers', 'MaxPlayers', 'NumOwned',
       'NumWant', 'NumWish', 'NumWeightVotes', 'MfgPlaytime', 'ComMinPlaytime',
       'ComMaxPlaytime','MfgAgeRec', 'NumUserRatings', 'NumComments',
       'NumAlternates', 'NumExpansions', 'NumImplementations',
       'IsReimplementation', 'Kickstarted', 'Cat:Thematic', 'Cat:Strategy', 
        'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract', 'Cat:Party', 'Cat:Childrens']

# integer reduction with fill_values of 21926 (lower is better on these, so an
# unranked game gets a large number instead of 0)
# NOTE(review): 21926 is hard-coded - presumably one past the worst rank in
# this scrape; confirm and update when the data is re-scraped
ranks = ['Rank:boardgame', 'Rank:thematic', 'Rank:strategygames',
       'Rank:wargames', 'Rank:familygames', 'Rank:cgs', 'Rank:abstracts',
       'Rank:partygames', 'Rank:childrensgames']

# call integer_reduce on the sets
games = integer_reduce(games, int_columns, fill_value=0)

games = integer_reduce(games, ranks, fill_value=21926)

games.info() # recheck data types and memory usage

In [None]:
# any games with no name listed?
games.loc[games['Name']==0]

In [None]:
# if so, drop them and reset index
#empty_games = list(games.loc[games['Name']==0].index)
#games.drop(games.index[empty_games], inplace=True)
#games.reset_index(inplace=True, drop=True)

In [None]:
# Drop all games that are not yet released, then reset index
#not_released = list(games.loc[games['YearPublished']>2021].index)
#games.drop(games.index[not_released], inplace=True)
#games.reset_index(inplace=True, drop=True)

In [None]:
# All games with over 12 players are set at 13 players.
games.loc[games['MaxPlayers']>12, 'MaxPlayers'] = 13

In [None]:
# Games listing fewer than 1 minimum player get their MinPlayers set to 2
# (the target column must be MinPlayers; writing to MaxPlayers would shrink
# the maximum of these games and leave the bad minimum in place)
games.loc[games['MinPlayers']<1, 'MinPlayers'] = 2

In [None]:
# move the Theme column out of games into its own one-column frame
themes = games.pop('Theme').to_frame()

In [None]:
# replace each Description with its cleaned, lemmatized form
games['Description'] = games['Description'].apply(text_block_processor)

In [None]:
games.reset_index(inplace=True, drop=True)

In [None]:
# Cleaned and prepared games frame
games

In [None]:
# save to file
games.to_pickle('data_cleaned_new_scraper/games.pkl')

## Mechanics and Subcategories

### Load and Clean Mechanics

In [None]:
# Load mechanics and check memory usage
mechanics = pd.read_pickle('data_dirty_new_scraper/mechanics.pkl')
mechanics

In [None]:
# one row per BGGId, no NaN, and a fresh 0..n-1 index
mechanics = (
    mechanics
    .drop_duplicates(subset='BGGId', keep='first')
    .fillna(0)
    .reset_index(drop=True)
)

mechanics.info() # check memory usage

In [None]:
# Clean up mechanics
# Domain knowledge is used here to fold several closely related mechanics
# into a single flag each

auction_list = ['Auction: Dexterity','Auction: Dutch','Auction: Dutch Priority',
                'Auction: Fixed Placement','Auction: English','Auction: Once Around','Auction: Sealed Bid',
                'Auction: Turn Order Until Pass','Multiple-Lot Auction','Closed Economy Auction','Selection Order Bid',
                'Constrained Bidding']

turn_order_list = ['Turn Order: Auction','Turn Order: Claim Action','Turn Order: Pass Order',
                   'Turn Order: Progressive','Turn Order: Random','Turn Order: Role Order','Turn Order: Stat-Based']

dumb_physical_list = ['Acting','Hot Potato','Singing','Rock-Paper-Scissors']

drafting = ['Card Drafting']

legacy = ['Legacy']

worker_placement = ['Worker Placement with Dice Workers','Worker Placement, Different Worker Types']

dexterity = ['Stacking and Balancing', 'Flicking']

# (umbrella flag, columns folded into it, whether the flag is first reset to 0)
merge_plan = [
    ('Worker Placement', worker_placement, False),
    ('Auction/Bidding', auction_list, False),
    ('Dexterity', dexterity, True),
    ('Physical', dumb_physical_list, True),
]

for umbrella, members, reset_first in merge_plan:
    if reset_first:
        mechanics[umbrella] = int(0)
    for member in members:
        # carry the flag over to the umbrella column, then retire the source column
        mechanics.loc[mechanics[member]==1, umbrella] = int(1)
        mechanics.drop([member], axis=1, inplace=True)

# these two are renamed rather than merged: the flag moves to the new column
mechanics.loc[mechanics['Card Drafting']==1, 'Drafting'] = int(1)
mechanics.loc[mechanics['Legacy']==1, 'Legacy Game'] = int(1)

# the turn-order variants are discarded outright, then the two renamed originals
mechanics.drop(turn_order_list, axis=1, inplace=True)
mechanics.drop(['Card Drafting','Legacy'], axis=1, inplace=True)

In [None]:
columns = mechanics.columns

# call integer_reduce on the sets
mechanics = integer_reduce(mechanics, columns, fill_value=0)

In [None]:
mechanics

### Load and Clean Subcategories

In [None]:
# one indicator column per theme value
themes_expanded = pd.get_dummies(themes)

# how many games carry each theme, most common first
theme_counts = themes_expanded.sum().sort_values(ascending=False)

# keep only the themes that occur more than once
themes_over_1 = list(theme_counts[theme_counts > 1].index)
themes_attach = themes_expanded[themes_over_1]
themes_attach

In [None]:
columns = themes_attach.columns

# call integer_reduce on the sets
themes_attach = integer_reduce(themes_attach, columns, fill_value=0)

In [None]:
# load subcategories file and check memory usage

indices = list(games['BGGId'])
subcategories = pd.read_pickle('data_dirty_new_scraper/subcategories.pkl')
subcategories.info()

In [None]:
subcategories.reset_index(inplace=True, drop=True) # reset index

In [None]:
subcategories.drop_duplicates(subset='BGGId', keep='first', inplace=True) # drop duplicates
subcategories.fillna(0, inplace=True) # fill nan
#subcategories.set_index('BGGId', drop=True, inplace=True) # drop bad games
#subcategories = subcategories.loc[indices] # use only indices in games file

columns = subcategories.columns

# call integer_reduce on the sets
subcategories = integer_reduce(subcategories, columns, fill_value=0)


subcategories

### Clean/Combine Mechanics and Subcategories

Manually cleaning up Subcategories. This section on BGG is a catch-all that mixes themes, mechanics (which belong in the mechanics section), and large categories that should stand on their own. We use our domain knowledge to sort these out.

In [None]:
# print our subcategories so we can prepare to sort them
subcategories.columns

In [None]:
# picking the items that will go under "themes"
subcats_themes = ['BGGId', 'Adventure', 'Fantasy', 'Fighting', 'Environmental', 'Medical', 'Economic',
       'Industry / Manufacturing', 'Transportation', 'Science Fiction',
       'Space Exploration', 'Civilization', 'Civil War',
       'Movies / TV / Radio theme', 'Novel-based',
       'Age of Reason', 'Mythology', 'Renaissance', 'American West', 'Animals',
       'Modern Warfare', 'Medieval', 'Ancient','Nautical', 'Post-Napoleonic', 'Horror',
         'Farming', 'Religious', 'Travel',
       'Murder/Mystery', 'Pirates', 'Comic Book / Strip', 'Mature / Adult',
       'Video Game Theme', 'Spies/Secret Agents','Arabian', 'Prehistoric',
                 'Trains','Aviation / Flight', 'Zombies',
       'World War II', 'Racing', 'Pike and Shot', 'World War I','Humor','Sports',
                 'Mafia','American Indian Wars', 'Napoleonic',
       'American Revolutionary War','Vietnam War', 'American Civil War','Number', 'Trivia',
       'Music', 'Korean War', 'City Building', 'Political', 'Math', 'Maze',]

In [None]:
# picking the items that will stay as subcategories
subcats_subcategories = ['BGGId', 'Exploration', 'Miniatures', 'Territory Building', 'Card Game',
             'Educational', 'Puzzle','Collectible Components',
        'Word Game','Print & Play','Electronic']

In [None]:
# looking at our mechanics in alphabetical order so we can see where to roll in the subcategory mechanics
sorted(mechanics.columns)

In [None]:
# drop subcategories that are none of the things
subcategories.drop(['Expansion for Base-game',  'Game System', 'Book'], axis=1, inplace=True)

In [None]:
# for mechanics that are mistakenly in the subcategories frame, make sure those mechanics are in the mechanics frame instead
# remove them from the subcategories frame after moving them over

# subcategory column -> mechanics column that should receive its flag
subcat_to_mechanic = {
    'Negotiation': 'Negotiation',
    'Action / Dexterity': 'Dexterity',
    'Dice': 'Dice Rolling',
    'Bluffing': 'Betting and Bluffing',
    'Real-time': 'Real-Time',
    'Memory': 'Memory',
    'Deduction': 'Deduction',
}

for subcat, mechanic in subcat_to_mechanic.items():
    # Match games on BGGId, not on row labels: the two frames are indexed
    # independently (subcategories is reset before its duplicates are dropped,
    # mechanics after), so equal row labels need not mean the same game.
    flagged_ids = subcategories.loc[subcategories[subcat]==1, 'BGGId']
    
    # a mechanic not yet present starts as all zeros rather than NaN
    if mechanic not in mechanics.columns:
        mechanics[mechanic] = 0
    
    mechanics.loc[mechanics['BGGId'].isin(flagged_ids), mechanic] = 1
    subcategories.drop(subcat, axis=1, inplace=True)


In [None]:
# or for larger categories in the subcategories frame, move those to the games frame
# remove them from the subcategories frame after moving them over

# subcategory column -> games category flag that should receive it
subcat_to_category = {
    'Wargame': 'Cat:War',
    "Children's Game": 'Cat:Childrens',
    'Party Game': 'Cat:Party',
    'Abstract Strategy': 'Cat:Abstract',
}

for subcat, category_flag in subcat_to_category.items():
    # Match games on BGGId, not on row labels: games and subcategories are
    # indexed independently, so equal row labels need not mean the same game.
    flagged_ids = subcategories.loc[subcategories[subcat]==1, 'BGGId']
    games.loc[games['BGGId'].isin(flagged_ids), category_flag] = 1
    subcategories.drop(subcat, axis=1, inplace=True)

# NOTE(review): games.pkl was written before this cell runs and games is not
# saved again in this section, so these flag updates exist only in memory -
# confirm whether games should be re-saved here

In [None]:
# make new data frames for THEMES and SUBCATEGORIES
# themes takes the theme-like columns; subcategories is narrowed last because
# the themes selection needs the full frame
themes = subcategories[subcats_themes]
# NOTE(review): concat(axis=1) aligns on the row index. themes_attach was built
# from games['Theme'] before games had its index reset, while subcategories
# carries its own index - confirm both indexes refer to the same games
themes = pd.concat([themes, themes_attach], axis=1)
subcategories = subcategories[subcats_subcategories]

Save all the files we just cleaned or created!

In [None]:
mechanics.to_pickle('data_cleaned_new_scraper/mechanics.pkl')

In [None]:
subcategories.to_pickle('data_cleaned_new_scraper/subcategories.pkl')

In [None]:
themes.to_pickle('data_cleaned_new_scraper/themes.pkl')

## Designers

In [None]:
# Load up our designers file!
designers = pd.read_pickle('data_dirty_new_scraper/designers.pkl')
designers.info()

In [None]:
# one row per BGGId, with missing flags read as 0
designers = designers.drop_duplicates(subset='BGGId', keep='first').fillna(0)

# BGGId is an identifier, so it is set aside while every flag is cast to int8
# and then re-attached (which places it last)
bgg_ids = designers.pop('BGGId')
designers = designers.astype('int8')
designers['BGGId'] = bgg_ids

designers = designers.reset_index(drop=True) # reset index
designers.info() # check memory usage

In [None]:
# save file of all possible designers
designers.to_pickle('data_cleaned_new_scraper/designers_all.pkl')

In [None]:
# Collapse rarely seen designers into one "Low-Exp Designer" flag.

# a designer credited on this many games or fewer counts as low experience;
# change the 3 to whatever desired for more or less experience
max_lowexp_games = 3

# games per designer (BGGId is an identifier, not a flag, so it is left out).
# Computed once, before the flag column exists, so the flag column can never
# be mistaken for a low-experience designer and dropped.
games_per_designer = designers.drop(columns='BGGId').sum(axis=0)
lowexp_designers = games_per_designer[games_per_designer <= max_lowexp_games].index

# a game is flagged when at least one of its designers is low experience
# (vectorised; replaces a row-by-row .loc loop)
has_lowexp = designers[lowexp_designers].sum(axis=1) > 0
designers['Low-Exp Designer'] = has_lowexp.astype('int64')

# drop the individual columns of the low-experience designers
designers.drop(columns=lowexp_designers, inplace=True)

In [None]:
# save the reduced designers file: only designers credited on more than 3
# games keep their own column (those with 3 or fewer were dropped above)
designers.to_pickle('data_cleaned_new_scraper/designers_reduced.pkl')

## Artists

In [None]:
# load artists file
artists = pd.read_pickle('data_dirty_new_scraper/artists.pkl')
artists.info()

In [None]:
# one row per BGGId, with missing flags read as 0
artists = artists.drop_duplicates(subset='BGGId', keep='first').fillna(0)

# BGGId is an identifier, so it is set aside while every flag is cast to int8
# and then re-attached (which places it last)
bgg_ids = artists.pop('BGGId')
artists = artists.astype('int8')
artists['BGGId'] = bgg_ids

artists = artists.reset_index(drop=True) # reset index
artists.info() # get memory usage

In [None]:
# save all artists to file
artists.to_pickle('data_cleaned_new_scraper/artists_all.pkl')

In [None]:
# Collapse rarely seen artists into one "Low-Exp Artist" flag.

# an artist credited on this many games or fewer counts as low experience;
# change the 3 to whatever desired for more or less experience
max_lowexp_games = 3

# games per artist (BGGId is an identifier, not a flag, so it is left out).
# Computed once, before the flag column exists, so the flag column can never
# be mistaken for a low-experience artist and dropped.
games_per_artist = artists.drop(columns='BGGId').sum(axis=0)
lowexp_artists = games_per_artist[games_per_artist <= max_lowexp_games].index

# a game is flagged when at least one of its artists is low experience
# (vectorised; replaces a row-by-row .loc loop)
has_lowexp = artists[lowexp_artists].sum(axis=1) > 0
artists['Low-Exp Artist'] = has_lowexp.astype('int64')

# drop the individual columns of the low-experience artists
artists.drop(columns=lowexp_artists, inplace=True)

In [None]:
# save the reduced artists file: only artists credited on more than 3
# games keep their own column (those with 3 or fewer were dropped above)
artists.to_pickle('data_cleaned_new_scraper/artists_reduced.pkl')

## Publishers

In [None]:
# load publishers
publishers = pd.read_pickle('data_dirty_new_scraper/publishers.pkl')
publishers.info()

In [None]:
# one row per BGGId, with missing flags read as 0
publishers = publishers.drop_duplicates(subset='BGGId', keep='first').fillna(0)

# BGGId is an identifier, so it is set aside while every flag is cast to int8
# and then re-attached (which places it last)
bgg_ids = publishers.pop('BGGId')
publishers = publishers.astype('int8')
publishers['BGGId'] = bgg_ids

publishers = publishers.reset_index(drop=True) # reset index
publishers.info() # get memory usage

In [None]:
# save all publishers to file
publishers.to_pickle('data_cleaned_new_scraper/publishers_all.pkl')

In [None]:
# Collapse rarely seen publishers into one "Low-Exp Publisher" flag.

# a publisher credited on this many games or fewer counts as low experience;
# change the 3 to whatever desired for more or less experience
max_lowexp_games = 3

# games per publisher (BGGId is an identifier, not a flag, so it is left out).
# Computed once, before the flag column exists, so the flag column can never
# be mistaken for a low-experience publisher and dropped.
games_per_publisher = publishers.drop(columns='BGGId').sum(axis=0)
lowexp_publishers = games_per_publisher[games_per_publisher <= max_lowexp_games].index

# a game is flagged when at least one of its publishers is low experience
# (vectorised; replaces a row-by-row .loc loop)
has_lowexp = publishers[lowexp_publishers].sum(axis=1) > 0
publishers['Low-Exp Publisher'] = has_lowexp.astype('int64')

# drop the individual columns of the low-experience publishers
publishers.drop(columns=lowexp_publishers, inplace=True)

In [None]:
# save the reduced publishers file: only publishers credited on more than 3
# games keep their own column (those with 3 or fewer were dropped above)
publishers.to_pickle('data_cleaned_new_scraper/publishers_reduced.pkl')

## Ratings Distribution

In [None]:
# Load the storage dictionary for this block: a JSON object whose values are
# used below as per-game lists of ratings
# NOTE(review): this reads from 'data_cleaned/', whereas this notebook writes
# raw_ratings.json to 'data_cleaned_new_scraper/' - confirm which copy is meant
with open('data_cleaned/raw_ratings.json') as json_file:
    raw_ratings = json.load(json_file) 

In [None]:
# Build one single-row frame of rating counts per game and combine them with
# a single concat. DataFrame.append was removed in pandas 2.0, and appending
# inside the loop re-copied every earlier row on each pass.
per_game_counts = []

for item in raw_ratings.keys():
    
    print(item)
    
    # round to one decimal so near-identical ratings share a bucket
    ratings_temp = pd.DataFrame(raw_ratings[item]).round(1)
    # counts per rating value, laid out as one row with a column per value
    ratings_counts = pd.DataFrame(ratings_temp.value_counts()).sort_index().T
    
    per_game_counts.append(ratings_counts)

ratings_distribution = pd.concat(per_game_counts) if per_game_counts else pd.DataFrame()

In [None]:
# label each row with its game id, in the same order the rows were built
# (set_axis no longer accepts inplace=True as of pandas 2.0)
ratings_distribution.index = list(raw_ratings.keys())

In [None]:
ratings_distribution.fillna(0, inplace=True)

In [None]:
ratings_distribution['total_ratings'] = ratings_distribution.sum(axis=1)

In [None]:
# transpose, turn the (former column) labels into data with reset_index, and
# transpose back
# NOTE(review): this appears to move the column labels into a leading row and
# renumber the columns 0..n; the next cell renames column 0 to 'BGGId' -
# confirm that column 0 really holds the game ids at that point
ratings_distribution = ratings_distribution.T.reset_index().T

In [None]:
ratings_distribution.rename(columns={0:'BGGId'}, inplace=True)

In [None]:
ratings_distribution['BGGId'] = ratings_distribution['BGGId'].astype('int64')

In [None]:
ratings_distribution.to_pickle('data_cleaned/ratings_distribution.pkl')

# Ratings - by Item, User, & Comments

In [None]:
# collect the names of the raw comment files in the dirty-data folder
files = []

# NOTE: `item` stays bound after this loop and the "Test Code" cell below
# reads it, so that cell loads whichever entry os.listdir returned last
for item in os.listdir('data_dirty_new_scraper/'):
    if item.startswith('user_ratings_comments'):
        files.append(item)

## Test Code

In [None]:
master_comments = pd.DataFrame()

comments_one = pd.read_pickle('data_dirty_new_scraper/'+item)

# call integer_reduce on the sets
comments_one['BGGId'] = comments_one['BGGId'].astype(int)
comments_one['Rating'] = comments_one['Rating'].astype(float)

comments_one.head()

In [None]:
master_comments = master_comments.append(comments_one)
master_comments.head()

In [None]:
ratings = comments_one[['BGGId', 'Rating', 'Username']]
ratings.head()

In [None]:
ratings[:10].apply(lambda x: process_dataframe_ratings(x, user_ratings, raw_ratings), axis=1)

In [None]:
user_ratings

In [None]:
raw_ratings[68448]

## Deploy - Master_Comments

In [None]:
# Load every raw comment file, combine them once, drop exact repeats and save.
# (DataFrame.append was removed in pandas 2.0; appending and re-pickling the
# growing frame on every iteration also repeated work for each file.)
comment_frames = []

for item in files:
    
    print("Deploying "+str(item))
    comments_one = pd.read_pickle('data_dirty_new_scraper/'+item)
    
    # cast ids to int and ratings to float
    comments_one['BGGId'] = comments_one['BGGId'].astype(int)
    comments_one['Rating'] = comments_one['Rating'].astype(float)

    comment_frames.append(comments_one)

master_comments = pd.concat(comment_frames) if comment_frames else pd.DataFrame()

# release the per-file frames before deduplicating the combined one
del comment_frames
gc.collect()

master_comments.drop_duplicates(keep='first', inplace=True)
master_comments.to_pickle('data_cleaned_new_scraper/master_comments.pkl')

In [None]:
master_comments.info()

In [None]:
master_comments.drop_duplicates(keep='first', inplace=True)

In [None]:
master_comments.to_pickle('data_cleaned_new_scraper/master_comments.pkl')

## Deploy - User and Item Ratings

In [None]:
game_ids_current = pd.read_pickle('data_dirty_new_scraper/game_ids_current')
game_ids = list(game_ids_current)

In [None]:
user_ratings = {}

raw_ratings = {}

for item in game_ids:
    raw_ratings[item] = []

In [None]:
def process_dataframe_ratings(x, user_ratings, raw_ratings):
    '''
    Records one rating row in both lookup dictionaries (mutates them in place)
    
    Inputs:
    x: row offering 'Username', 'BGGId' and 'Rating' by key
    user_ratings: dict of {username: {BGGId: rating as float}}
    raw_ratings: dict of {BGGId: list of ratings}; must already hold the row's BGGId
    
    Raises:
    ValueError/TypeError if the rating cannot be converted to float
    KeyError if the row's BGGId is not a key of raw_ratings
    '''
    
    # convert first: a bad rating must fail before anything is modified
    # (a catch-all except here used to reset the user's stored ratings to {}
    # before the error was raised again)
    rating = float(x['Rating'])
    
    # create the user's dictionary on first sight, then store the rating
    user_ratings.setdefault(x['Username'], {})[x['BGGId']] = rating
    
    raw_ratings[x['BGGId']].append(x['Rating'])

In [None]:
# only these three columns are needed; release the full comments frame
ratings = master_comments[['BGGId', 'Rating', 'Username']]
del master_comments
gc.collect() 
    
# feed every row into the two dictionaries
ratings.apply(process_dataframe_ratings, axis=1, args=(user_ratings, raw_ratings))
    
# save both dictionaries as JSON
json_outputs = (
    ('data_cleaned_new_scraper/raw_ratings.json', raw_ratings),
    ('data_cleaned_new_scraper/user_ratings.json', user_ratings),
)
for json_path, payload in json_outputs:
    with open(json_path, 'w') as convert_file:
        json.dump(payload, convert_file)
    
del ratings
gc.collect() 

In [None]:
user_ratings['Torsten']

In [None]:
len(raw_ratings[91671])

### Winnow Users to 5+ Ratings

In [None]:
# Opening JSON file
with open('data_cleaned_new_scraper/user_ratings.json') as json_file:
    user_ratings = json.load(json_file)

In [None]:
all_users = list(user_ratings.keys())

In [None]:
# remove every user with fewer than 5 recorded ratings; iterating the
# all_users list allows deleting from the dictionary while looping
for user in all_users:

    if len(user_ratings[user]) >= 5:
        continue

    print("Removing user "+user)
    del user_ratings[user]

In [None]:
with open('data_cleaned_new_scraper/user_ratings.json', 'w') as convert_file:
    convert_file.write(json.dumps(user_ratings))

# DEPRECATED

## Ratings Dist

In [None]:
# open ratings distribution
ratings_dist = pd.read_pickle('data_dirty_new_scraper/ratings_dist.pkl')
ratings_dist.info()

In [None]:
# DEPRECATED cell - cleans the old scraped ratings-distribution table.
# NOTE(review): `empty_games` and `not_released` are only assigned in cells
# whose code is commented out earlier in this notebook, so this cell fails
# with a NameError unless those cells are restored.
ratings_dist.drop_duplicates(subset='BGGId', keep='first', inplace=True) # drop duplicates
ratings_dist.fillna(0, inplace=True) # fill nan
ratings_dist.drop(ratings_dist.index[empty_games], inplace=True) # drop empty games
ratings_dist.reset_index(inplace=True, drop=True) # reset index
ratings_dist.drop(ratings_dist.index[not_released], inplace=True) # drop unreleased games
ratings_dist.reset_index(inplace=True, drop=True) # reset index

# for each column after the first:
for column in ratings_dist.columns[1:]:
    # fix numbers as needed (a trailing "k" means thousands, e.g. "10k" -> 10000)
    ratings_dist[column] = ratings_dist[column].apply(lambda x: fix_numbers(x))

# add a column with the number of user ratings from the games frame
# (assignment aligns on the row index of the two frames)
ratings_dist['num_votes'] = games['NumUserRatings']

# convert types to int32; BGGId is set aside during the cast and re-attached
temp_id = ratings_dist['BGGId']
ratings_dist.drop('BGGId', axis=1, inplace=True)
ratings_dist = ratings_dist.astype('int32')
ratings_dist['BGGId'] = temp_id

ratings_dist.info() # check memory usage

In [None]:
# save to file
ratings_dist.to_pickle('data_cleaned_new_scraper/ratings_dist.pkl')

## Comments

In [None]:
# load comments
comments = pd.read_pickle('data_dirty_new_scraper/comments.pkl')
comments.info()

In [None]:
comments.drop_duplicates(subset=['BGGId', 'Username'], keep='first', inplace=True) # drop duplicates
comments.reset_index(inplace=True, drop=True) # rest index
comments.fillna(0, inplace=True) # fill na

# clean the comments text
comments['cleaned'] = comments['Value'].apply(lambda x: text_block_processor(x))
    
# drop the description field 
comments.drop('Value', axis=1, inplace=True)

In [None]:
# save file
comments.to_pickle('data_cleaned_new_scraper/comments.pkl')

## Ratings Matrix

In [None]:
# Build the BGGId -> Name lookup from the cleaned games file

games = pd.read_pickle('data_cleaned_new_scraper/games.pkl')

# parallel lists of ids and names (game_ids is reused by later cells)
game_ids = list(games['BGGId'])
game_names = list(games['Name'])

# pair them up; for a repeated id the last name wins
game_id_lookup = dict(zip(game_ids, game_names))

# the frame itself is no longer needed
del games
gc.collect()

game_id_lookup

In [None]:
# create a ratings matrix file from a set of files on disk
ratings_matrix = create_ratings_file(51,57, game_ids)

In [None]:
# save file to pickle, specify filename carefully!
ratings_matrix.to_pickle('data_cleaned_new_scraper/ratings_matrix_cleaned_06.pkl')

### Data Validation

In [None]:
ratings_matrix_1 = pd.read_pickle('userid/user_ratings31.pkl')
ratings_matrix_2  = pd.read_pickle('userid/user_ratings32.pkl')

In [None]:
this_user = pd.DataFrame(ratings_matrix_1.T['Threnody'].dropna(axis=0))
this_user.rename(columns={'Threnody':'Rating'}, inplace=True)
this_user.reset_index(inplace=True)
this_user['Game'] = this_user['index'].astype('int32').map(game_id_lookup)
this_user.drop('index', axis=1, inplace=True)
this_user.sort_values('Rating', ascending=False).head(30)

In [None]:
# create a ratings matrix file from a set of files on disk
ratings_matrix = create_ratings_file(31,32, game_ids)

In [None]:
ratings_matrix.head()

In [None]:
this_user = pd.DataFrame(ratings_matrix.T['Threnody'].dropna(axis=0))
this_user.rename(columns={'Threnody':'Rating'}, inplace=True)
this_user.reset_index(inplace=True)
this_user['Game'] = this_user['index'].astype('int32').map(game_id_lookup)
this_user.drop('index', axis=1, inplace=True)
this_user.sort_values('Rating', ascending=False).head(30)

In [None]:
# save file to pickle, specify filename carefully!
# NOTE(review): this is the same filename the main Ratings Matrix cell writes
# for files 51-57; here ratings_matrix holds the validation range 31-32, so
# running this cell overwrites that output - confirm the intended name
ratings_matrix.to_pickle('data_cleaned_new_scraper/ratings_matrix_cleaned_06.pkl')

In [None]:
del ratings_matrix
gc.collect()