# Notebook Objective and Setup

In BGG07 we build several large datasets, notably:

* Real Ratings Dictionaries and Longform (melted) Dataframes in both unscaled and scaled versions
* Synthetic Ratings Dictionaries and Longform Dataframes in both unscaled and scaled versions, for 100, 250 and 500
* User Means lookup dictionary

These files are used in the Collaborative Filter (BGG08).

## Notebook Preparation

### Package Imports

In [1]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import json
from statistics import mean
import copy

# ignore warnings (gets rid of Pandas copy warnings)
import warnings

# NOTE: this silences every warning category, not only the Pandas copy warnings
warnings.filterwarnings("ignore")

# display settings: show all columns, cap printed rows at 30
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 30)

### Notebook Functions

In [2]:
def get_user(user_items, user, game_ids, scaled=True):
    """
    Build one user's {game_id: rating} dictionary, limited to known games.

    The user's ratings are optionally mean-centred (the user's own mean is
    subtracted from every rating), keyed by the index labels of user_items,
    and then restricted to the ids that also appear in game_ids.

    Inputs:
    user_items: pandas Series of one user's ratings, indexed by game id
        (the caller in this notebook drops unrated games before passing it in)
    user: the user being processed; accepted for interface compatibility,
        not used inside this function
    game_ids: all valid game ids
    scaled: default True; when True the user's mean is removed from each
        rating, when False the ratings are stored unchanged

    Outputs:
    overall_user: dictionary of game id -> (optionally mean-centred) rating
    """

    # the mean is computed up front in both modes
    user_mean = user_items.mean()

    ratings = list(user_items - user_mean) if scaled else list(user_items)
    rated_ids = list(user_items.index)

    # pair every rated id with its rating (a repeated id keeps its last rating)
    ratings_by_id = dict(zip(rated_ids, ratings))

    # keep only the ids that belong to the overall game list
    shared_ids = set(game_ids).intersection(set(rated_ids))
    overall_user = {game_id: ratings_by_id[game_id] for game_id in shared_ids}

    return overall_user

In [3]:
def make_user_dictionaries(path_item, scaled=True):
    """Loads a user ratings matrix, cleans, and returns as ratings dictionary

    Reads data_cleaned_new_scraper/ratings_matrix_cleaned_<path_item>.pkl,
    transposes it so that each column is one user, casts the row labels
    (game ids) to int32, and builds one dictionary entry per user with
    get_user.

    Relies on the notebook-level game_ids list existing before the call.

    Inputs:
    path_item: suffix of the ratings matrix file to load
    scaled: default True, passed to get_user (subtract each user's mean)

    Outputs: ratings dictionary {user: {game_id: rating}} for this file
    """

    # NOTE(review): get_user_means reads the same file family from
    # "data_store/data_cleaned/" - confirm which directory is current
    path = "data_cleaned_new_scraper/ratings_matrix_cleaned_" + path_item + ".pkl"

    # after the transpose the columns are iterated as users below
    user_matrix = pd.read_pickle(path).T
    user_matrix.index = user_matrix.index.astype("int32")

    temp_dictionary = {}

    for user in user_matrix.columns:

        # str() keeps the progress message working for non-string labels
        print("Starting user " + str(user))

        # keep only the games this user actually rated
        user_items = user_matrix[user].dropna(axis=0)

        temp_dictionary[user] = get_user(user_items, user, game_ids, scaled)

    return temp_dictionary

In [4]:
def make_long_df(path, scaled=True):
    """Builds a user ratings dictionary and its longform (melted) dataframe

    Inputs:
    path: passed to make_user_dictionaries to select the ratings matrix
    scaled: default True, passed to make_user_dictionaries then get_user

    Outputs:
    tuple of (ratings dictionary, longform dataframe with columns
    UserID, BGGId and Rating, sorted by UserID, missing ratings dropped)
    """
    temp_dictionary = make_user_dictionaries(path, scaled)

    # wide frame: one row per user, one column per game id
    wide_ratings = pd.DataFrame.from_dict(temp_dictionary).T
    wide_ratings = wide_ratings.reset_index().rename(columns={"index": "UserID"})

    # one row per (user, game) pair, keeping only the rated pairs
    long_ratings = wide_ratings.melt(
        id_vars="UserID", var_name="BGGId", value_name="Rating"
    )
    long_ratings = long_ratings.dropna()
    long_ratings = long_ratings.sort_values("UserID").reset_index(drop=True)

    # release the wide frame before handing the results back
    del wide_ratings
    gc.collect()

    return temp_dictionary, long_ratings

In [5]:
# load and make game id list
# (game_ids is read as a global by make_user_dictionaries)
games = pd.read_pickle("data_store/data_cleaned/games.pkl")

game_ids = list(games["BGGId"])

# Make Scaled dictionary

In [71]:
# Opening JSON file
# Loads the full unscaled ratings; the scaling cell below indexes it as
# raw_ratings[user][game id] -> rating
with open("data_store/data_cleaned/user_ratings.json") as json_file:
    raw_ratings = json.load(json_file)

In [73]:
# Mean-centre every user's ratings: subtract the user's own mean from each
# rating and keep two decimals
raw_ratings_scaled = {}

for person, person_ratings in raw_ratings.items():
    user_mean = mean(person_ratings.values())

    raw_ratings_scaled[person] = {
        item: round((rating - user_mean), 2)
        for item, rating in person_ratings.items()
    }

In [74]:
# Persist the mean-centred ratings dictionary as JSON
with open("data_store/data_cleaned/scaled_user_ratings.json", "w") as convert_file:
    convert_file.write(json.dumps(raw_ratings_scaled))

In [78]:
# Free the raw ratings dictionary.
# NOTE(review): a later cell (`len(raw_ratings)`) still reads this name, so the
# notebook cannot be run top-to-bottom as laid out; the execution counters
# suggest the cells were originally run in a different order.
del raw_ratings
gc.collect()

84

# Build Real Ratings Dictionaries

Build the various data sets

In [51]:
# Opening JSON file
# Loads a single ratings block (block 7) for the spot checks below
with open("data_store/data_cleaned/user_ratings_block_7.json") as json_file:
    raw_ratings_small = json.load(json_file)

In [52]:
list(raw_ratings_small.keys())[:10]

['Balanced1',
 'Looted',
 'erp_lsf',
 'tredilxy',
 'aaronfoz',
 'dr4gonbl4z3r',
 'sinbad78',
 'KyleBrown',
 'TheFlyingNothing',
 'Halfstache']

In [53]:
raw_ratings_small["Balanced1"]

{'147151': 7.0,
 '124708': 7.5,
 '148203': 7.0,
 '216091': 7.0,
 '216092': 7.0,
 '186751': 8.0,
 '146508': 8.0,
 '129437': 7.0,
 '143741': 6.0,
 '172225': 5.0,
 '172': 4.0,
 '174973': 7.0,
 '157969': 7.5,
 '170216': 8.0,
 '31481': 6.0,
 '174430': 10.0,
 '150376': 8.0,
 '65244': 6.5,
 '39856': 7.0,
 '155068': 10.0,
 '112686': 6.0,
 '91872': 6.0,
 '192153': 7.0,
 '55690': 8.0,
 '70323': 10.0,
 '148228': 8.0,
 '40692': 7.0,
 '169786': 7.5,
 '178900': 7.0}

In [54]:
# Spot check: mean of one sample user's raw ratings
user_mean = mean(raw_ratings_small["Balanced1"].values())
user_mean

7.241379310344827

In [72]:
# NOTE(review): the cells that follow (`len(raw_ratings_scaled)`, the
# "Balanced1" lookup and the longform build) still read raw_ratings_scaled,
# so running the notebook top-to-bottom fails after this delete; the
# execution counters suggest this cell ran before the dictionary was rebuilt.
del raw_ratings_scaled
gc.collect()

21

In [75]:
len(raw_ratings)

263503

In [76]:
len(raw_ratings_scaled)

263503

In [77]:
raw_ratings_scaled["Balanced1"]

{'147151': -0.24,
 '124708': 0.26,
 '148203': -0.24,
 '216091': -0.24,
 '216092': -0.24,
 '186751': 0.76,
 '146508': 0.76,
 '129437': -0.24,
 '143741': -1.24,
 '172225': -2.24,
 '172': -3.24,
 '174973': -0.24,
 '157969': 0.26,
 '170216': 0.76,
 '31481': -1.24,
 '174430': 2.76,
 '150376': 0.76,
 '65244': -0.74,
 '39856': -0.24,
 '155068': 2.76,
 '112686': -1.24,
 '91872': -1.24,
 '192153': -0.24,
 '55690': 0.76,
 '70323': 2.76,
 '148228': 0.76,
 '40692': -0.24,
 '169786': 0.26,
 '178900': -0.24}

In [42]:
# create longform

# wide frame: one row per user, one column per game id
real_user_ratings = pd.DataFrame.from_dict(raw_ratings_scaled).T

# game ids arrive as JSON string keys; cast the column labels to integers
# (plain assignment replaces set_axis(..., inplace=True), removed in pandas 2.0)
real_user_ratings.columns = real_user_ratings.columns.astype("int")

real_user_ratings.reset_index(inplace=True)
real_user_ratings.rename(columns={"index": "UserID"}, inplace=True)

# melt to one row per (user, game) pair and drop the unrated pairs
real_user_ratings_long = real_user_ratings.melt(
    id_vars="UserID", var_name="BGGId", value_name="Rating"
).dropna()
real_user_ratings_long.sort_values("UserID", inplace=True)
real_user_ratings_long.reset_index(drop=True, inplace=True)
real_user_ratings_long.head()

## Unscaled Ratings - Longform DF

In [5]:
# Accumulator for the per-block longform frames added by the loop below
real_user_ratings_long_unscaled = pd.DataFrame()

In [6]:
# Convert each of the seven unscaled rating blocks to longform and stack them
for appendation in np.arange(1, 8, 1):

    print(appendation)

    with open(
        "data_store/data_cleaned/user_ratings_block_" + str(appendation) + ".json"
    ) as json_file:
        raw_ratings = json.load(json_file)

    # create longform: start from a wide frame with one row per user
    real_user_ratings = pd.DataFrame.from_dict(raw_ratings).T

    # release the raw dictionary before the melt
    del raw_ratings
    gc.collect()

    # game ids arrive as JSON string keys; cast the column labels to integers
    # (plain assignment replaces set_axis(..., inplace=True), removed in pandas 2.0)
    real_user_ratings.columns = real_user_ratings.columns.astype("int")
    real_user_ratings.reset_index(inplace=True)
    real_user_ratings.rename(columns={"index": "UserID"}, inplace=True)
    real_user_ratings_long = real_user_ratings.melt(
        id_vars="UserID", var_name="BGGId", value_name="Rating"
    ).dropna()

    del real_user_ratings
    gc.collect()

    real_user_ratings_long.sort_values("UserID", inplace=True)
    real_user_ratings_long.reset_index(drop=True, inplace=True)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the drop-in
    # replacement and, like append, keeps each block's own 0..n-1 index
    real_user_ratings_long_unscaled = pd.concat(
        [real_user_ratings_long_unscaled, real_user_ratings_long]
    )

1
2
3
4
5
6
7


In [7]:
real_user_ratings_long_unscaled.shape

(17001178, 3)

In [8]:
# Persist the combined unscaled longform ratings
real_user_ratings_long_unscaled.to_pickle(
    "real_ratings/real_user_ratings_long_unscaled.pkl"
)

In [9]:
# Free the combined frame now that it has been pickled
del real_user_ratings_long_unscaled
gc.collect()

63

## Scaled Ratings - Full Matrix

In [4]:
# Accumulator for the per-block user x game matrices added by the loop below
users_real_fullmatrix = pd.DataFrame()

In [5]:
for appendation in np.arange(1, 8, 1):

    print(appendation)

    with open(
        "data_store/data_cleaned/user_ratings_block_scaled_" + str(appendation) + ".json"
    ) as json_file:
        raw_ratings = json.load(json_file)

    %time real_user_ratings = pd.DataFrame.from_dict(raw_ratings).T

    del raw_ratings
    gc.collect()

    real_user_ratings_index = list(real_user_ratings.index)
    real_user_ratings_columns = list(real_user_ratings.columns)

    %time ratings_array = real_user_ratings.values

    %time ratings_array = np.round(ratings_array, 1)
    %time ratings_array = ratings_array*10
    %time ratings_array = ratings_array.astype('int8')

    %time scaled_df = pd.DataFrame(ratings_array, columns=real_user_ratings_columns)
    %time scaled_df.set_axis(real_user_ratings_index, axis=0, inplace=True)

    users_real_fullmatrix = users_real_fullmatrix.append(scaled_df)

1
Wall time: 1min 20s
Wall time: 0 ns
Wall time: 4.58 s
Wall time: 1.35 s
Wall time: 1.16 s
Wall time: 21 ms
Wall time: 2 ms
2
Wall time: 1min 22s
Wall time: 0 ns
Wall time: 4.88 s
Wall time: 1.41 s
Wall time: 1.26 s
Wall time: 21 ms
Wall time: 2 ms
3
Wall time: 1min 20s
Wall time: 0 ns
Wall time: 4.71 s
Wall time: 1.3 s
Wall time: 1.11 s
Wall time: 21 ms
Wall time: 1 ms
4
Wall time: 1min 22s
Wall time: 1 ms
Wall time: 4.79 s
Wall time: 1.42 s
Wall time: 1.11 s
Wall time: 22 ms
Wall time: 2 ms
5
Wall time: 1min 20s
Wall time: 0 ns
Wall time: 4.55 s
Wall time: 1.32 s
Wall time: 1.11 s
Wall time: 26 ms
Wall time: 1 ms
6
Wall time: 51.8 s
Wall time: 0 ns
Wall time: 2.55 s
Wall time: 731 ms
Wall time: 629 ms
Wall time: 22 ms
Wall time: 2 ms
7
Wall time: 24.2 s
Wall time: 0 ns
Wall time: 1.02 s
Wall time: 259 ms
Wall time: 196 ms
Wall time: 14 ms
Wall time: 1 ms


In [7]:
# Fill the gaps left after stacking the blocks (presumably games missing from
# a given block) with 0, then cast the whole matrix back to int8.
# NOTE(review): 0 is also a legitimate scaled rating (a rating equal to the
# user's mean), so "not rated" and "exactly average" look the same - confirm
# this is intended.
users_real_fullmatrix.fillna(0, inplace=True)
users_real_fullmatrix = users_real_fullmatrix.astype("int8")

In [8]:
# Persist the full int8 user x game matrix
users_real_fullmatrix.to_pickle("real_ratings/users_real_fullmatrix.pkl")

## Scaled Ratings - Longform DF

In [16]:
# Accumulator for the per-block scaled longform frames added by the loop below
real_user_ratings_long_scaled = pd.DataFrame()

In [17]:
# Convert each of the seven scaled rating blocks to longform and stack them
for appendation in np.arange(1, 8, 1):

    print(appendation)

    with open(
        "data_store/data_cleaned/user_ratings_block_scaled_" + str(appendation) + ".json"
    ) as json_file:
        raw_ratings = json.load(json_file)

    # create longform: start from a wide frame with one row per user
    real_user_ratings = pd.DataFrame.from_dict(raw_ratings).T

    # release the raw dictionary before the melt
    del raw_ratings
    gc.collect()

    # game ids arrive as JSON string keys; cast the column labels to integers
    # (plain assignment replaces set_axis(..., inplace=True), removed in pandas 2.0)
    real_user_ratings.columns = real_user_ratings.columns.astype("int")
    real_user_ratings.reset_index(inplace=True)
    real_user_ratings.rename(columns={"index": "UserID"}, inplace=True)
    real_user_ratings_long = real_user_ratings.melt(
        id_vars="UserID", var_name="BGGId", value_name="Rating"
    ).dropna()

    del real_user_ratings
    gc.collect()

    real_user_ratings_long.sort_values("UserID", inplace=True)
    real_user_ratings_long.reset_index(drop=True, inplace=True)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the drop-in
    # replacement and, like append, keeps each block's own 0..n-1 index
    real_user_ratings_long_scaled = pd.concat(
        [real_user_ratings_long_scaled, real_user_ratings_long]
    )

1
2
3
4
5
6
7


In [18]:
real_user_ratings_long_scaled.shape

(17001178, 3)

In [19]:
# Persist the combined scaled longform ratings
real_user_ratings_long_scaled.to_pickle(
    "real_ratings/real_user_ratings_long_scaled.pkl"
)

In [20]:
# Free the combined frame now that it has been pickled
del real_user_ratings_long_scaled
gc.collect()

63

# Build Synthetic Ratings Dictionaries

## Scaled Ratings

In [8]:
def make_lookup(appendation):
    """Merge the seven synthetic ratings part files into one JSON dictionary

    Reads synthetic_ratings/users_synthetic_<appendation>_<n>.json for
    n = 1..7 and merges them in that order into a single dictionary (when a
    key appears in several parts, the highest-numbered part wins). The result
    is written to
    synthetic_ratings/synth_user_ratings_dictionary_scaled_<appendation>.json

    Inputs:
    appendation: string tag of the synthetic data set to merge, e.g. "500"

    Outputs:
    None, the merged dictionary is written to disk
    """
    synth_user_ratings_dictionary_scaled = {}

    # load one part at a time so that only the merged result and the current
    # part are held in memory together
    for part_number in range(1, 8):
        part_path = (
            "synthetic_ratings/users_synthetic_"
            + appendation
            + "_"
            + str(part_number)
            + ".json"
        )
        # Opening JSON file
        with open(part_path) as json_file:
            synth_user_ratings_dictionary_scaled.update(json.load(json_file))

    with open(
        "synthetic_ratings/synth_user_ratings_dictionary_scaled_"
        + appendation
        + ".json",
        "w",
    ) as convert_file:
        convert_file.write(json.dumps(synth_user_ratings_dictionary_scaled))

In [11]:
# Only the "500" variant is built in this run; the other sizes are left
# commented out
# make_lookup('50')
# make_lookup('100')
# make_lookup('250')
make_lookup("500")
# make_lookup('1k')
# make_lookup('2k')

# Validate Data

In [2]:
# Opening JSON file
# NOTE(review): the file loaded here is the *scaled* synthetic dictionary, but
# it is bound to a name ending in "_unscaled"; the cells below use that name,
# so it is left unchanged here.
with open(
    "synthetic_ratings/synth_user_ratings_dictionary_scaled_500.json"
) as json_file:
    synth_user_ratings_dictionary_unscaled = json.load(json_file)

# Opening JSON file
# NOTE(review): no cell in this notebook writes this file - presumably it is
# produced elsewhere; confirm.
with open("real_ratings/real_user_ratings_dictionary_unscaled.json") as json_file:
    real_user_ratings_dictionary_unscaled = json.load(json_file)

In [3]:
# dictionary of game IDs-Names

# Load games
games = pd.read_pickle("data_store/data_cleaned/games.pkl")

# parallel lists of game ids and game names
game_ids = list(games["BGGId"])
game_names = list(games["Name"])

# lookup dictionary mapping each game id to its name
game_id_lookup = dict(zip(game_ids, game_names))

# the games frame is not needed once the lookup exists
del games
gc.collect()

# game_id_lookup

0

In [4]:
len(real_user_ratings_dictionary_unscaled["Threnody"])

156

In [5]:
len(synth_user_ratings_dictionary_unscaled["Threnody"])

624

In [6]:
# User inspected in the two comparison cells below
user = "Threnody"

In [7]:
# Tabulate the selected user's real ratings (game id keys become the index)
this_user = pd.DataFrame(
    real_user_ratings_dictionary_unscaled[user].values(),
    index=real_user_ratings_dictionary_unscaled[user].keys(),
)
this_user.reset_index(inplace=True)
# cast the ids to int32 before mapping them to names through game_id_lookup
this_user["Game"] = this_user["index"].astype("int32").map(game_id_lookup)
this_user.tail(30)

Unnamed: 0,index,0,Game
126,228341,8.0,Pulsar 2849
127,201921,7.0,Tiny Epic Quest
128,246784,8.0,Cryptid
129,192135,9.0,Too Many Bones
130,49,6.0,Mamma Mia!
131,17223,6.0,World of Warcraft: The Boardgame
132,17329,7.0,Animal Upon Animal
133,2381,6.0,Scattergories
134,22345,7.0,Yspahan
135,163068,8.0,Trickerion: Legends of Illusion


In [8]:
# Same table for the synthetic ratings.
# NOTE(review): despite the "_unscaled" name, this dictionary was loaded from
# the scaled synthetic file above.
this_user = pd.DataFrame(
    synth_user_ratings_dictionary_unscaled[user].values(),
    index=synth_user_ratings_dictionary_unscaled[user].keys(),
)
this_user.reset_index(inplace=True)
# cast the ids to int32 before mapping them to names through game_id_lookup
this_user["Game"] = this_user["index"].astype("int32").map(game_id_lookup)
this_user.tail(30)

Unnamed: 0,index,0,Game
594,172278,8,Ether Wars
595,170799,-1,Lost Woods
596,31887,7,AmuseAmaze
597,149241,14,Assault on Doomrock
598,1105,-9,Titus
599,10640,-9,Doom: The Boardgame
600,84464,-1,Animal Upon Animal: Balancing Bridge
601,67877,-9,Anomia
602,202896,-1,Package!?
603,97207,6,Dungeon Petz


# Build User Means Dictionary

In [4]:
def get_user_means(path_item):
    """Adds the mean rating of every user in one ratings matrix to user_means

    Reads data_store/data_cleaned/ratings_matrix_cleaned_<path_item>.pkl,
    transposes it so that each column is one user, and stores each column's
    mean (pandas skips missing values by default) in the notebook-level
    user_means dictionary, which must exist before this is called.

    Inputs:
    path_item: suffix of the ratings matrix file to load

    Outputs:
    None, results are written into the global user_means dictionary
    """
    path = "data_store/data_cleaned/ratings_matrix_cleaned_" + path_item + ".pkl"

    # after the transpose the columns are iterated as users below
    user_matrix = pd.read_pickle(path).T
    user_matrix.index = user_matrix.index.astype("int32")

    print("Processing " + str(path_item))
    for user in user_matrix.columns:

        # get the mean rating for that user
        user_means[user] = user_matrix[user].mean()

In [5]:
# Shared dictionary that get_user_means fills in, one matrix file at a time
user_means = {}

for matrix_suffix in ("01", "02", "03", "04", "05", "06"):
    get_user_means(matrix_suffix)

Processing 01
Processing 02
Processing 03
Processing 04
Processing 05
Processing 06


In [6]:
user_means["Threnody"]

7.222485207100592

In [7]:
# Persist the user means dictionary as JSON
with open("users_means.json", "w") as convert_file:
    convert_file.write(json.dumps(user_means))

In [8]:
# Rebinds user_means from the dictionary to a one-column DataFrame
# ("user_mean") indexed by the dictionary keys
user_means = pd.DataFrame.from_dict(user_means, columns=["user_mean"], orient="index")

In [9]:
# Persist the DataFrame form of the user means
user_means.to_pickle("user_means.pkl")

In [None]:
# Reload the user means dictionary from JSON (this cell has no execution
# count, so it was not run in the saved notebook)
with open("users_means.json") as json_file:
    user_means = json.load(json_file)