# Notebook Objective and Setup

BGG03 is the scrubbing and cleaning of the various data obtained in notebooks BGG01 and BGG02. The following datasets are cleaned, constructed, or otherwise prepared for EDA and modeling.

    * Games
    * Mechanics
    * Subcategories
    * Designers
    * Artists
    * Publishers
    * Awards
    * Ratings Distribution
    * Comments
    * Ratings Matrix

## Package Imports

In [None]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import json
from statistics import mean

# ignore warnings (gets rid of Pandas copy warnings)
import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 30)

# NLP tools
# import spacy

# nlp = spacy.load("en_core_web_sm")
# import re
# import nltk
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from nltk.tokenize import word_tokenize

filepath = "../../data/games/game_dfs_dirty/"

## Notebook Functions

In [None]:
def integer_reduce(data: pd.DataFrame, columns: list[str], fill_value: int = 0):
    """
    Convert columns to the smallest nullable integer dtype that fits them.

    String entries first have every character other than 0-9 removed, so
    minus signs and decimal points are discarded ("-5" -> 5, "1.5" -> 15).
    Entries that are missing, or that have no digits left after stripping,
    are replaced with fill_value.

    Inputs:
    data: dataframe to reduce (its columns are reassigned in place)
    columns: columns to reduce
    fill_value: value used for missing or non-numeric entries

    Returns:
    data: the same dataframe with memory reduced data types
    """
    for column in columns:
        # strip all non-digit characters from string entries
        stripped = data[column].replace(r"[^0-9]", "", regex=True)

        # Coerce before filling: an entry such as "n/a" is an empty string
        # after stripping and only turns into NaN during coercion, so filling
        # first would leave it missing instead of set to fill_value.
        numeric = pd.to_numeric(stripped, errors="coerce").fillna(fill_value)
        numeric = pd.to_numeric(numeric, downcast="integer")

        low, high = numeric.min(), numeric.max()
        if (high <= 127) and (low >= -128):
            numeric = numeric.astype("Int8")
        elif (high <= 32767) and (low >= -32768):
            numeric = numeric.astype("Int16")
        elif (high <= 2147483647) and (low >= -2147483648):
            numeric = numeric.astype("Int32")
        # anything wider keeps the dtype chosen by pd.to_numeric

        data[column] = numeric

    return data

In [None]:
def text_block_processor(text):
    """Clean a block of text and return its lemmatized, stop-word-free words.

    The input is cast to str, stripped of every character that is not an
    ASCII letter or whitespace, lower-cased, and has each run of whitespace
    collapsed to a single space. The result is passed to the global ``nlp``
    callable; the lemma of every token that is not a stop word is kept, and
    the lemmas are joined with single spaces.

    ARGUMENTS:
    text: block of text (any object; it is converted with str)

    RETURNS:
    string of cleaned and processed words from the text block
    """
    # NOTE(review): ``nlp`` must exist at module level before this is called;
    # the spaCy setup in the imports cell is currently commented out.

    text = str(text)
    # removes all special characters and numbers, and makes lower case
    letters_only = re.sub(r"[^a-zA-Z\s]", "", text).lower()
    # Collapse whitespace runs so they do not reach the tokenizer as separate
    # tokens. (The earlier version computed a cleaned string but never used
    # it, and its pattern deleted double spaces outright, gluing words
    # together.)
    normalized = re.sub(r"\s+", " ", letters_only).strip()

    words = []
    for token in nlp(normalized):
        if token.is_stop:
            continue
        lemma = token.lemma_
        if lemma.strip():  # only keeps lemmas that are not empty/whitespace
            words.append(lemma)

    return " ".join(words)

In [None]:
def fix_numbers(x):
    """
    Convert a count that may be abbreviated with a trailing "k" into an int.

    Integers (Python or NumPy) are returned as int and floats are truncated
    with int(). Strings have surrounding whitespace and thousands commas
    removed; a trailing "k"/"K" is dropped and the number multiplied by 1000
    ("1.5k" -> 1500). Any other string is passed to int().

    Raises ValueError if the text cannot be parsed as a number.
    """

    if isinstance(x, (int, np.integer)):
        return int(x)

    if isinstance(x, (float, np.floating)):
        return int(x)

    text = str(x).strip().replace(",", "")

    if text.lower().endswith("k"):
        # remove only the trailing suffix, and round instead of truncating so
        # binary float error cannot drop the result by one
        return int(round(float(text[:-1]) * 1000))

    return int(text)

In [None]:
def clean_ratings(id_num, game_ids, min_ratings=5):
    """
    Loads and cleans a raw user ratings file
    Drops game ids not present in games file
    Drops users with fewer than min_ratings ratings among the kept games

    Inputs:
    id_num: the appendation of the file to find the path
        ("userid/user_ratings<id_num>.pkl")
    game_ids: list of game ids in the games file
    min_ratings: minimum number of non-missing ratings a user (row) needs in
        order to be kept; defaults to 5

    Outputs:
    Cleaned user ratings file (rows are users, columns are int game ids)
    """

    print("\nCleaning Frame #" + str(id_num))

    # load in raw users file according to id_num inputted
    path = "userid/user_ratings" + str(id_num) + ".pkl"
    users = pd.read_pickle(path)

    # convert all datatypes to float
    float_converted = users.astype("float")

    # delete and clean up raw users file
    del users
    gc.collect()

    # keep only the columns whose game id appears in game_ids
    float_converted.columns = float_converted.columns.astype("int32")
    kept_columns = float_converted.columns.intersection(game_ids)
    cleaned = float_converted[kept_columns]

    # delete and clean up
    del float_converted
    gc.collect()

    # Keep users with enough ratings. Selecting with a mask builds a new
    # frame instead of dropping in place on a slice of another frame.
    has_enough_ratings = cleaned.count(axis=1) >= min_ratings
    cleaned = cleaned.loc[has_enough_ratings]

    # print memory usage (info() prints by itself and returns None)
    cleaned.info()

    # return cleaned file
    return cleaned

In [None]:
def create_ratings_file(start_file, end_file, game_ids):
    """
    Puts together dataframes from a range of files
    Each file calls the clean_ratings function
    Then all files in range are concatenated and duplicate rows are dropped

    Inputs:
    start_file: start of file name appendation
    end_file: end file name appendation (inclusive)
    game_ids: list of game ids in the games file

    Outputs:
    Cleaned and concatenated master file; an empty dataframe when the range
    contains no files

    """

    # Collect the cleaned frames and concatenate once at the end; growing a
    # dataframe inside the loop re-copies all earlier rows on every pass.
    cleaned_frames = []

    # for each number in the range from start to end:
    for id_num in np.arange(start_file, end_file + 1, 1):
        print(id_num)
        # clean the file calling clean_ratings
        cleaned_frames.append(clean_ratings(id_num, game_ids))

    if not cleaned_frames:
        # nothing was loaded (end_file < start_file)
        return pd.DataFrame()

    master_file = pd.concat(cleaned_frames, axis=0)

    # clean up
    del cleaned_frames
    gc.collect()

    # NOTE(review): drop_duplicates compares row values only, not the index,
    # so two different users with identical ratings collapse into one row.
    # Confirm that is intended.
    master_file = master_file.drop_duplicates(keep="first")

    return master_file

In [None]:
def process_dataframe_ratings(x, user_ratings, raw_ratings):
    """
    Record one rating row in both lookup dictionaries.

    Inputs:
    x: mapping-like row with "Username", "BGGId" and "Rating" entries
    user_ratings: dict of {username: {BGGId: rating as float}}, updated in place
    raw_ratings: dict of {BGGId: list of ratings}, updated in place; it must
        already contain a list for the row's BGGId

    Raises:
    ValueError / TypeError if the rating cannot be converted to float
    KeyError if the row's BGGId is not a key of raw_ratings
    """
    # Convert before touching either dictionary: the previous bare ``except``
    # replaced a user's existing ratings with {} whenever this conversion
    # failed.
    rating = float(x["Rating"])

    user_ratings.setdefault(x["Username"], {})[x["BGGId"]] = rating

    raw_ratings[x["BGGId"]].append(x["Rating"])

# Games Files

In [None]:
games = pd.read_pickle(f"{filepath}games.pkl")
# drop duplicate entries, keeping the first row seen for each BGGId
games = games.drop_duplicates(subset="BGGId", keep="first")
# Get info, make note of datatypes and memory usage
games.info()

In [None]:
# columns holding non-boardgame related information
drop_columns = [
    "NumAwards",
    "NumFans",
    "NumPageViews",
    "RulesPosts",
    "TotalPosts",
    "Category",
    "IsExpansion",
    "Rank:rpgitem",
    "Rank:boardgameaccessory",
    "Rank:videogame",
    "Rank:amiga",
    "Rank:commodore64",
    "Rank:arcade",
    "Rank:atarist",
    "Setting",
    "Mechanism",
]

# remove whichever of those columns are actually present in this data pull
present_drop_columns = [name for name in drop_columns if name in games.columns]
games = games.drop(columns=present_drop_columns)

In [None]:
# Remove every non-digit character from games["BestPlayers"] using regex.
# NOTE(review): digits on either side of a removed character are joined
# ("2, 4" would become "24") — confirm the raw column holds a single number.
games["BestPlayers"] = games["BestPlayers"].str.replace(r"\D", "", regex=True)

# parse to numbers; entries that cannot be parsed become NaN
games["BestPlayers"] = pd.to_numeric(
    games["BestPlayers"], errors="coerce", downcast="integer"
)

# fill in missing values with 0
games["BestPlayers"] = games["BestPlayers"].fillna(0)

# with no missing values left, store the column as int8
games["BestPlayers"] = games["BestPlayers"].astype("int8")

In [None]:
# Add Categories with binary flags: a game that has a rank in a category
# list gets a 1 in the matching "Cat:" column
rank_to_category_flag = {
    "Rank:thematic": "Cat:Thematic",
    "Rank:strategygames": "Cat:Strategy",
    "Rank:wargames": "Cat:War",
    "Rank:familygames": "Cat:Family",
    "Rank:cgs": "Cat:CGS",
    "Rank:abstracts": "Cat:Abstract",
    "Rank:partygames": "Cat:Party",
    "Rank:childrensgames": "Cat:Childrens",
}

for rank_column, flag_column in rank_to_category_flag.items():
    is_ranked = games[rank_column].notna()
    games.loc[is_ranked, flag_column] = 1

In [None]:
# prepare different column sets for memory integer reduction

# integer reduction with fill_values of 0
int_columns = [
    "BGGId",
    "YearPublished",
    "MinPlayers",
    "MaxPlayers",
    "NumOwned",
    "NumWant",
    "NumWish",
    "NumWeightVotes",
    "MfgPlaytime",
    "ComMinPlaytime",
    "ComMaxPlaytime",
    "MfgAgeRec",
    "NumUserRatings",
    "NumComments",
    "NumAlternates",
    "NumExpansions",
    "NumImplementations",
    "IsReimplementation",
    "Kickstarted",
    "Cat:Thematic",
    "Cat:Strategy",
    "Cat:War",
    "Cat:Family",
    "Cat:CGS",
    "Cat:Abstract",
    "Cat:Party",
    "Cat:Childrens",
]

# integer reduction with fill_values of 21926 (lower is better on these)
ranks = [
    "Rank:boardgame",
    "Rank:thematic",
    "Rank:strategygames",
    "Rank:wargames",
    "Rank:familygames",
    "Rank:cgs",
    "Rank:abstracts",
    "Rank:partygames",
    "Rank:childrensgames",
]

# call integer_reduce on the sets
# counts, years and flags: missing values are filled with 0
games = integer_reduce(games, int_columns, fill_value=0)

# rank columns: missing values are filled with 21926
# NOTE(review): 21926 is presumably a value worse than any real rank in this
# data pull (lower is better) — confirm it when the data is refreshed.
games = integer_reduce(games, ranks, fill_value=21926)

games.info()  # recheck data types and memory usage

In [None]:
# Drop all games that are not yet released, then reset index
# not_released = list(games.loc[games['YearPublished']>2021].index)
# games.drop(games.index[not_released], inplace=True)
# games.reset_index(inplace=True, drop=True)

In [None]:
# move the "Theme" column out of games into its own one-column dataframe
themes = games.pop("Theme").to_frame()

In [None]:
# clean and lemmatize every entry of the Description column
games["Description"] = games["Description"].apply(text_block_processor)

In [None]:
games.reset_index(inplace=True, drop=True)

In [None]:
# save to file
# NOTE(review): this CSV is written before the MaxPlayers / MinPlayers
# adjustments in the cells that follow, so it can differ from the pickle
# saved after them — confirm that is intended.
games.to_csv("data_kaggle/games.csv", index=False)

In [None]:
# All games with over 12 players are set at 13 players.
games.loc[games["MaxPlayers"] > 12, "MaxPlayers"] = 13

In [None]:
# Games with min players of 0, we will set their min players = 2
games.loc[games["MinPlayers"] < 1, "MinPlayers"] = 2

In [None]:
# save to file
games.to_pickle("data_store/data_cleaned/games.pkl")

In [None]:
games.shape

## Game ID Lookup

In [None]:
# Build a lookup dictionary from game ID to game name

# Load games
games = pd.read_pickle(f"{filepath}/games.pkl")

# parallel lists of game ids and game names
game_ids = list(games["BGGId"])
game_names = list(games["Name"])

# pair every id with its name; a repeated id keeps the last name seen
game_id_lookup = dict(zip(game_ids, game_names))

game_id_lookup

In [None]:
# save dictionary
# with open("data_store/data_cleaned/game_id_lookup.json", "w") as convert_file:
#     convert_file.write(json.dumps(game_id_lookup))

## Mechanics and Subcategories

### Clean Mechanics

In [None]:
# Load mechanics, drop exact duplicate rows and add a helper column of 1s
# that the pivot below can aggregate
mechanics = pd.read_pickle(f"{filepath}/mechanics.pkl")
mechanics = mechanics.drop_duplicates(keep="first")
mechanics["Count_Column"] = 1
mechanics = mechanics.sort_values(by="BGGId").reset_index(drop=True)

# one row per game and one column per mechanic
# (1 where the game has the mechanic, NaN where it does not)
mechanics = mechanics.pivot_table(
    index="BGGId", columns="mechanic", values="Count_Column"
)

# Clean up mechanics
# Here we are using our domain knowledge to compact several different categories into one

auction_list = [x for x in mechanics.columns if "auction" in x.lower()]

drafting = [x for x in mechanics.columns if "drafting" in x.lower()]

worker_placement = [x for x in mechanics.columns if "worker" in x.lower()]

compacting_categories = {
    "Auction or Bidding": auction_list,
    "Drafting": drafting,
    "Worker Placement": worker_placement,
}

# Merge first, drop afterwards. Dropping inside the loop had two problems:
# a source column named exactly like its target (e.g. "Worker Placement")
# was merged into itself and then deleted, and a column matching two keyword
# lists raised KeyError on its second visit.
merged_columns = set()
for category, members in compacting_categories.items():
    for item in members:
        mechanics.loc[mechanics[item] == 1, category] = 1
        merged_columns.add(item)

# the merged target columns themselves must survive
columns_to_drop = [
    name
    for name in mechanics.columns
    if name in merged_columns and name not in compacting_categories
]
mechanics = mechanics.drop(columns=columns_to_drop)

# fold "Legacy" into "Legacy Game" when the raw data has that column
if "Legacy" in mechanics.columns:
    mechanics.loc[mechanics["Legacy"] == 1, "Legacy Game"] = 1
    mechanics = mechanics.drop(columns="Legacy")

# turn order mechanics are removed entirely
turn_order_list = [x for x in mechanics.columns if "turn order" in x.lower()]

mechanics = mechanics.drop(columns=turn_order_list)

columns = mechanics.columns

# call integer_reduce on the sets
mechanics = integer_reduce(mechanics, columns, fill_value=0)

# back to long format: one row per (BGGId, mechanic) pair the game has
mechanics = mechanics.reset_index().melt(
    id_vars="BGGId", var_name="mechanic", value_name="value"
)
mechanics = (
    mechanics[mechanics["value"] == 1]
    .drop("value", axis=1)
    .sort_values(by="BGGId")
    .reset_index(drop=True)
)
mechanics.head()

### Clean/Combine Mechanics and Subcategories

Manually cleaning up Subcategories. This section on BGG contains a lot of "catch-all" concepts: themes, mechanics (which belong in the mechanics section), and large subcategories that should stand alone. We use our domain knowledge to clean this section.

In [None]:
# picking the items that will go under "themes"
actually_themes = [
    "Adventure",
    "Age of Reason",
    "American Civil War",
    "American Indian Wars",
    "American Revolutionary War",
    "American West",
    "Ancient",
    "Animals",
    "Arabian",
    "Aviation / Flight",
    "City Building",
    "Civil War",
    "Civilization",
    "Comic Book / Strip",
    "Economic",
    "Environmental",
    "Fantasy",
    "Farming",
    "Fighting",
    "Horror",
    "Humor",
    "Industry / Manufacturing",
    "Korean War",
    "Mafia",
    "Math",
    "Mature / Adult",
    "Maze",
    "Medical",
    "Medieval",
    "Modern Warfare",
    "Movies / TV / Radio theme",
    "Murder/Mystery",
    "Music",
    "Mythology",
    "Napoleonic",
    "Nautical",
    "Novel-based",
    "Number",
    "Pike and Shot",
    "Pirates",
    "Political",
    "Post-Napoleonic",
    "Prehistoric",
    "Racing",
    "Religious",
    "Renaissance",
    "Science Fiction",
    "Space Exploration",
    "Spies/Secret Agents",
    "Sports",
    "Trains",
    "Transportation",
    "Travel",
    "Trivia",
    "Video Game Theme",
    "Vietnam War",
    "World War I",
    "World War II",
    "Zombies",
]

In [None]:
# picking out mechanics
actually_mechanics = {
    "Real-time": "Real-Time",
    "Bluffing": "Betting and Bluffing",
    "Deduction": "Deduction",
    "Dice": "Dice Rolling",
    "Memory": "Memory",
    "Negotiation": "Negotiation",
    "Exploration": "Exploration",
    "Territory Building": "Territory Building",
}

# picking the items that will stay as subcategories
# each name is listed once ("Collectible Components" and "Word Game" were
# previously repeated)
actually_subcategories = [
    "Abstract Strategy",
    "Miniatures",
    "Card Game",
    "Educational",
    "Puzzle",
    "Collectible Components",
    "Word Game",
    "Print & Play",
    "Electronic",
    "Children's Game",
    "Wargame",
]

actually_major_categories = {
    "Wargame": "Cat:War",
    "Children's Game": "Cat:Childrens",
    "Party Game": "Cat:Party",
    "Abstract Strategy": "Cat:Abstract",
}

drop_subcategories = ["Expansion for Base-game", "Game System", "Book"]

### Clean Subcategories that go in Mechanics

In [None]:
# load subcategories file and check memory usage

# indices = list(games["BGGId"])
subcategories = pd.read_pickle(f"{filepath}/subcategories.pkl")
subcategories = (
    subcategories.dropna(subset=["boardgamecategory"])
    .sort_values("BGGId")
    .reset_index(drop=True)
)
subcategories.head()

In [None]:
# Split the raw category rows by where each category really belongs.
# rows whose category is really a mechanic
mechanics_in_subcats_df = subcategories[
    subcategories["boardgamecategory"].isin(actually_mechanics.keys())
].reset_index(drop=True)

# rows whose category is really a theme
themes_in_subcats_df = subcategories[
    subcategories["boardgamecategory"].isin(actually_themes)
].reset_index(drop=True)

# rows that also set one of the major "Cat:" flags on the game
big_cats_in_subcats_df = subcategories[
    subcategories["boardgamecategory"].isin(actually_major_categories.keys())
].reset_index(drop=True)

# Keep only the categories chosen to stay as subcategories. The previous
# filter was negated (~isin), so it removed exactly those rows and kept the
# themes instead. Keeping just this list also excludes the mechanics and the
# entries in drop_subcategories, since none of them appear in it.
subcategories = subcategories[
    subcategories["boardgamecategory"].isin(actually_subcategories)
].reset_index(drop=True)

mechanics_in_subcats_df.head()

In [None]:
# use the dictionary "actually_mechanics" to replace each category name in
# column boardgamecategory with the mechanic name it maps to
mechanics_in_subcats_df["boardgamecategory"] = mechanics_in_subcats_df[
    "boardgamecategory"
].map(actually_mechanics)

# rename the column "boardgamecategory" to "mechanic"
mechanics_in_subcats_df = mechanics_in_subcats_df.rename(
    columns={"boardgamecategory": "mechanic"}
)

mechanics_in_subcats_df.head()

In [None]:
mechanics.shape, mechanics_in_subcats_df.shape

In [None]:
# Append the mechanics recovered from the subcategories. The same
# (BGGId, mechanic) pair can arrive from both sources when a mapped name such
# as "Memory" or "Deduction" already exists as a mechanic, and re-running
# this cell would append the rows again, so identical rows are dropped.
mechanics = (
    pd.concat([mechanics, mechanics_in_subcats_df], axis=0)
    .drop_duplicates()
    .sort_values(by="BGGId")
    .reset_index(drop=True)
)

In [None]:
mechanics.shape

In [None]:
mechanics.head()

### Clean Subcategories that go in Themes

In [None]:
themes = pd.read_pickle(f"{filepath}themes.pkl")
themes = themes.dropna(subset=["Theme"]).sort_values("BGGId")
themes.head()

In [None]:
themes_in_subcats_df.head()

In [None]:
# rename the column "boardgamecategory" to "Theme" so it lines up with the
# themes dataframe
themes_in_subcats_df = themes_in_subcats_df.rename(
    columns={"boardgamecategory": "Theme"}
)

themes_in_subcats_df.head()

In [None]:
themes_in_subcats_df.shape, themes.shape

In [None]:
# Append the themes recovered from the subcategories. A (BGGId, Theme) pair
# present in both sources, or appended again by re-running this cell, would
# otherwise be duplicated, so identical rows are dropped.
themes = (
    pd.concat([themes, themes_in_subcats_df], axis=0)
    .drop_duplicates()
    .sort_values(by="BGGId")
    .reset_index(drop=True)
)

In [None]:
themes.shape

### Clean Subcategories that go in LARGE theme categories

In [None]:
games = pd.read_pickle("../../data/games/game_dfs_clean/games.pkl")
games.head()

In [None]:
big_cats_in_subcats_df.head()

In [None]:
# subcategory name -> major category flag column in games
big_category_mapper = {
    "Wargame": "Cat:War",
    "Children's Game": "Cat:Childrens",
    "Party Game": "Cat:Party",
    "Abstract Strategy": "Cat:Abstract",
}

# For each subcategory, set the matching flag column to 1 on every game that
# carries that subcategory. (The lookup previously used an undefined name,
# `mapper`, and raised NameError.)
for category_name, flag_column in big_category_mapper.items():
    flagged_ids = big_cats_in_subcats_df.loc[
        big_cats_in_subcats_df["boardgamecategory"] == category_name, "BGGId"
    ]
    games.loc[games["BGGId"].isin(flagged_ids), flag_column] = 1

Save all the files we just cleaned or created!

In [None]:
mechanics.to_pickle("data_store/data_cleaned/mechanics.pkl")

In [None]:
subcategories.to_pickle("data_store/data_cleaned/subcategories.pkl")

In [None]:
themes.to_pickle("data_store/data_cleaned/themes.pkl")

In [None]:
mechanics.to_csv("data_kaggle/mechanics.csv", index=False)
subcategories.to_csv("data_kaggle/subcategories.csv", index=False)
themes.to_csv("data_kaggle/themes.csv", index=False)

In [None]:
mechanics = pd.read_pickle("data_store/data_cleaned/mechanics.pkl")
subcategories = pd.read_pickle("data_store/data_cleaned/subcategories.pkl")
themes = pd.read_pickle("data_store/data_cleaned/themes.pkl")

In [None]:
mechanics.head()

In [None]:
subcategories.head()

In [None]:
themes.head()

## Designers

In [None]:
# Load up our designers file!
designers = pd.read_pickle(f"{filepath}designers.pkl")
designers = designers.loc[designers["boardgamedesigner"] != "(Uncredited)"]
designers = designers.reset_index(drop=True)
designers.head()

In [None]:
# collapse designers whose one-hot column sums to 3 or less
# one-hot encode the designer names (one indicator column per designer)
df_onehot = pd.get_dummies(
    designers, columns=["boardgamedesigner"], drop_first=False
).astype(int)

# Step 1: Identify columns where the sum of their values is <= 3
# NOTE(review): the sum runs over every column of df_onehot, including any
# non-designer column such as BGGId — confirm none of those can total <= 3.
cols_with_low_sum = df_onehot.columns[df_onehot.sum(axis=0) <= 3]

# Step 2: Create the "Low Entries" column
# A row gets 1 when any of the low-sum columns is set in that row
df_onehot["Low Entries"] = df_onehot[cols_with_low_sum].gt(0).any(axis=1).astype(int)

# Step 3: Drop columns where the sum is <= 3
df = df_onehot.drop(columns=cols_with_low_sum).reset_index(drop=True)

In [None]:
df.columns = df.columns.str.replace("boardgamedesigner_", "")
df.head()

In [None]:
# Back to long format: one row per (BGGId, designer) indicator that is set.
# The name column is labelled for designers; it was previously called
# "mechanic", copied over from the mechanics cell.
df = df.melt(id_vars="BGGId", var_name="boardgamedesigner", value_name="value")
df = (
    df[df["value"] == 1]
    .drop("value", axis=1)
    .sort_values(by="BGGId")
    .reset_index(drop=True)
)
df.head()

## Artists

In [None]:
# load artists file
artists = pd.read_pickle(f"{filepath}artists.pkl")
artists.head()

In [None]:
artists = artists.loc[artists["boardgameartist"] != "(Uncredited)"]
artists = artists.reset_index(drop=True)
artists.head()

In [None]:
# Collapse low experience artists (3 or fewer rows) into one shared label.
#
# `artists` has a "boardgameartist" column holding names, so experience is
# the number of rows per name. The previous column-sum approach only works on
# a one-hot frame and fails on this layout, because summing the name column
# and comparing the result with 3 is not possible.
# NOTE(review): assumes one row per game/artist credit — confirm against the
# raw file. The output keeps that layout, like the designers result above.

# change the 3 to whatever desired for more or less experience
max_low_exp_credits = 3

# number of rows carrying each row's artist name
credits_per_artist = artists["boardgameartist"].map(
    artists["boardgameartist"].value_counts()
)

# replace every low experience artist name with the shared label
artists.loc[
    credits_per_artist <= max_low_exp_credits, "boardgameartist"
] = "Low-Exp Artist"

# several low experience artists on one game now produce identical rows
artists = artists.drop_duplicates().reset_index(drop=True)

## Publishers

In [None]:
# load publishers
publishers = pd.read_pickle(f"{filepath}/publishers.pkl")
publishers.head()

In [None]:
publishers = publishers.loc[publishers["boardgamepublisher"] != "(Uncredited)"]
publishers = publishers.reset_index(drop=True)

In [None]:
# save all publishers to file
publishers.to_pickle("data_store/data_cleaned/publishers_all.pkl")
publishers.to_csv("data_kaggle/publishers_all.csv", index=False)

In [None]:
# Collapse low experience publishers (3 or fewer rows) into one shared label.
#
# `publishers` has a "boardgamepublisher" column holding names, so experience
# is the number of rows per name. The previous column-sum approach only works
# on a one-hot frame and fails on this layout, because summing the name
# column and comparing the result with 3 is not possible.
# NOTE(review): assumes one row per game/publisher credit — confirm against
# the raw file. The output keeps that layout.

# change the 3 to whatever desired for more or less experience
max_low_exp_credits = 3

# number of rows carrying each row's publisher name
credits_per_publisher = publishers["boardgamepublisher"].map(
    publishers["boardgamepublisher"].value_counts()
)

# replace every low experience publisher name with the shared label
publishers.loc[
    credits_per_publisher <= max_low_exp_credits, "boardgamepublisher"
] = "Low-Exp Publisher"

# several low experience publishers on one game now produce identical rows
publishers = publishers.drop_duplicates().reset_index(drop=True)

In [None]:
# save publishers reduced to 3 or more works
publishers.to_pickle("data_store/data_cleaned/publishers_reduced.pkl")
publishers.to_csv("data_kaggle/publishers_reduced.csv", index=False)

In [None]:
publishers = pd.read_pickle("data_store/data_cleaned/publishers_reduced.pkl")
publishers.head()

# Ratings - by Item, User, & Comments

## Test Code

In [None]:
master_comments = pd.read_pickle("data_store/data_dirty/raw_game_ratings.pkl")

# integer_reduce
master_comments["BGGId"] = master_comments["BGGId"].astype(int)
master_comments["Rating"] = master_comments["Rating"].astype(float)

master_comments.head()

In [None]:
master_comments.drop_duplicates(keep="first", inplace=True)

In [None]:
master_comments.info()

In [None]:
nodrops = list(master_comments.loc[master_comments.Comments != ""].index)

In [None]:
# `nodrops` holds index labels, and the index has gaps once duplicate rows
# have been dropped, so select by label (.loc) rather than by position
comments_only = master_comments.loc[nodrops]

In [None]:
comments_only.head(10)

In [None]:
comments_only.to_pickle("data_store/data_cleaned/master_comments_file.pkl")

In [None]:
del comments_only
gc.collect()

## Deploy - User and Item Ratings

In [None]:
raw_game_ratings = pd.read_pickle("data_store/data_dirty/raw_game_ratings.pkl")

In [None]:
ratings = raw_game_ratings[["BGGId", "Rating", "Username"]]
ratings.head()

In [None]:
del raw_game_ratings
gc.collect()

In [None]:
ratings.drop_duplicates(keep="first", inplace=True)

In [None]:
ratings.head()

In [None]:
ratings.to_csv("data_kaggle/user_ratings.csv", index=False)

In [None]:
ratings.Username.nunique()

In [None]:
game_ids_current = pd.read_pickle("data_store/data_cleaned/game_ids_current.pkl")
game_ids = list(game_ids_current)

In [None]:
user_ratings = {}

raw_ratings = {}

for item in game_ids:
    raw_ratings[item] = []

In [None]:
ratings.apply(lambda x: process_dataframe_ratings(x, user_ratings, raw_ratings), axis=1)

In [None]:
user_ratings["Threnody"]

In [None]:
raw_ratings[213788]

In [None]:
# save the per-game raw ratings dictionary as JSON
with open("data_store/data_cleaned/game_raw_ratings.json", "w") as convert_file:
    convert_file.write(json.dumps(raw_ratings))

# save the per-user ratings dictionary as JSON
# NOTE(review): the "Winnow Users" step below reads
# "data_store/data_cleaned/user_ratings_unscaled.json", not this path —
# confirm which location is intended.
with open("real_ratings/user_ratings_unscaled.json", "w") as convert_file:
    convert_file.write(json.dumps(user_ratings))

# the ratings dataframe is no longer needed; free its memory
del ratings
gc.collect()

### Winnow Users to 5+ Ratings

In [None]:
# Opening JSON file
with open("data_store/data_cleaned/user_ratings_unscaled.json") as json_file:
    user_ratings = json.load(json_file)

In [None]:
all_users = list(user_ratings.keys())

In [None]:
# remove every user who has fewer than 5 ratings, iterating over the
# separate key list so the dictionary can shrink during the loop
for user in all_users:
    if len(user_ratings[user]) >= 5:
        continue
    print("Removing user " + user)
    user_ratings.pop(user)

In [None]:
with open("data_store/data_cleaned/user_ratings_unscaled.json", "w") as convert_file:
    convert_file.write(json.dumps(user_ratings))

## Ratings Distribution

In [None]:
# Load the storage dictionary for this block
with open("data_store/data_cleaned/game_raw_ratings.json") as json_file:
    raw_ratings = json.load(json_file)

In [None]:
len(raw_ratings)

In [None]:
# Build one row of rating counts per game, then concatenate once.
# DataFrame.append no longer exists in pandas 2.x, and appending inside the
# loop re-copied every earlier row on each pass.
distribution_rows = []

for item in raw_ratings.keys():

    print(item)

    # round ratings to one decimal and count how often each value occurs
    ratings_temp = pd.DataFrame(raw_ratings[item]).round(1)
    ratings_counts = pd.DataFrame(ratings_temp.value_counts()).sort_index().T

    distribution_rows.append(ratings_counts)

ratings_distribution = (
    pd.concat(distribution_rows) if distribution_rows else pd.DataFrame()
)

In [None]:
# ratings_distribution = pd.read_pickle('data_store/data_cleaned/ratings_distribution.pkl')

In [None]:
# label each row with its game id; set_axis returns a new frame (its
# `inplace` argument no longer exists in pandas 2.x), so reassign the result
ratings_distribution = ratings_distribution.set_axis(
    list(raw_ratings.keys()), axis=0
)

In [None]:
ratings_distribution.head()

In [None]:
ratings_distribution.fillna(0, inplace=True)

In [None]:
ratings_distribution.head()

In [None]:
ratings_distribution["total_ratings"] = ratings_distribution.sum(axis=1)

In [None]:
# ratings_distribution = ratings_distribution.T.reset_index().T

In [None]:
ratings_distribution.reset_index(inplace=True)

In [None]:
ratings_distribution.head()

In [None]:
ratings_distribution.rename(columns={"index": "BGGId"}, inplace=True)

In [None]:
ratings_distribution["BGGId"] = ratings_distribution["BGGId"].astype("int64")

In [None]:
ratings_distribution.head()

In [None]:
ratings_distribution.to_pickle("data_store/data_cleaned/ratings_distribution.pkl")

In [None]:
ratings_distribution.to_csv("data_kaggle/ratings_distribution.csv", index=False)

## Item Means



In [None]:
ratings = pd.read_pickle("real_ratings/real_user_ratings_unscaled_fullmatrix.pkl")

In [None]:
ratings.head()

In [None]:
# treat 0 as "not rated" so it is excluded from the means below
# (np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0)
ratings = ratings.replace(0, np.nan)

In [None]:
item_means = ratings.mean().to_dict()

In [None]:
# save catalog to file
with open("data_store/data_cleaned/item_means.json", "w") as convert_file:
    convert_file.write(json.dumps(item_means))

## User Means

In [None]:
# Opening JSON file
with open("real_ratings/real_user_ratings_unscaled.json") as json_file:
    user_ratings = json.load(json_file)

In [None]:
len(user_ratings)

In [None]:
user_means = {}

In [None]:
# mean rating per user, rounded to one decimal place
for person in user_ratings:
    user_items = list(user_ratings[person].values())
    user_mean = round(mean(user_items), 1)
    user_means[person] = user_mean

In [None]:
user_means["Threnody"]

In [None]:
user_means["moosh21"]

In [None]:
user_means["Shade92008"]

In [None]:
user_means["Torsten"]

In [None]:
# save dictionary
with open("data_store/data_cleaned/user_means.json", "w") as convert_file:
    convert_file.write(json.dumps(user_means))

In [None]:
del user_means
gc.collect()

In [None]:
# Opening JSON file
with open("data_store/data_cleaned/user_means.json") as json_file:
    user_means_dict = json.load(json_file)

In [None]:
user_means = pd.DataFrame.from_dict(user_means_dict, orient="index")
user_means.rename(columns={0: "Mean"}, inplace=True)
user_means.head()

In [None]:
user_means.to_pickle("data_store/data_cleaned/user_means.pkl")