# Notebook Objective and Setup

BGG03 scrubs and cleans the data obtained in notebooks BGG01 and BGG02. The following datasets are cleaned, constructed, or otherwise prepared for EDA and modeling:

    * Games
    * Mechanics
    * Subcategories
    * Designers
    * Artists
    * Publishers
    * Awards
    * Ratings Distribution
    * Comments
    * Ratings Matrix

## Package Imports

In [None]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import json
from statistics import mean

# ignore warnings (gets rid of Pandas copy warnings)
import warnings

warnings.filterwarnings(action="ignore")

# Display settings: never truncate columns, show at most 30 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 30

# NLP tools
# import spacy

# nlp = spacy.load("en_core_web_sm")
# import re
# import nltk
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# from nltk.tokenize import word_tokenize

filepath = "../../data/prod/games/game_dfs_dirty/"

# Ratings - by Item, User, & Comments

## Ratings Distribution

In [None]:
# Load the raw ratings dictionary used throughout this section.
# JSON text is UTF-8 by specification, so the encoding is pinned explicitly
# rather than inherited from the platform default (e.g. cp1252 on Windows).
with open(
    "data_store/data_cleaned/game_raw_ratings.json", encoding="utf-8"
) as json_file:
    raw_ratings = json.load(json_file)

In [None]:
len(raw_ratings)

In [None]:
# Build one single-row frame of rating counts per item, then combine them
# with a single concat.  ``DataFrame.append`` was deprecated in pandas 1.4
# and removed in 2.0, and calling it inside the loop re-copied the whole
# accumulated frame on every iteration (quadratic time).
per_item_counts = []

for item in raw_ratings:

    print(item)

    # Round to one decimal so the counts are bucketed at 0.1 resolution.
    ratings_temp = pd.DataFrame(raw_ratings[item]).round(1)
    ratings_counts = pd.DataFrame(ratings_temp.value_counts()).sort_index().T

    per_item_counts.append(ratings_counts)

# Seeding with an empty frame mirrors the original "append onto an empty
# DataFrame" chain and keeps the result an empty frame (instead of raising)
# when there are no items at all.
ratings_distribution = pd.concat([pd.DataFrame(), *per_item_counts])

In [None]:
# ratings_distribution = pd.read_pickle('data_store/data_cleaned/ratings_distribution.pkl')

In [None]:
# Label each row with its item key.  The ``inplace`` argument of
# ``set_axis`` was removed in pandas 2.0, so assign the index directly;
# this still mutates the existing frame and still raises on a length mismatch.
ratings_distribution.index = list(raw_ratings.keys())

In [None]:
ratings_distribution.head()

In [None]:
ratings_distribution.fillna(0, inplace=True)

In [None]:
ratings_distribution.head()

In [None]:
ratings_distribution["total_ratings"] = ratings_distribution.sum(axis=1)

In [None]:
# ratings_distribution = ratings_distribution.T.reset_index().T

In [None]:
ratings_distribution.reset_index(inplace=True)

In [None]:
ratings_distribution.head()

In [None]:
ratings_distribution.rename(columns={"index": "BGGId"}, inplace=True)

In [None]:
ratings_distribution["BGGId"] = ratings_distribution["BGGId"].astype("int64")

In [None]:
ratings_distribution.head()

In [None]:
ratings_distribution.to_pickle("data_store/data_cleaned/ratings_distribution.pkl")

In [None]:
ratings_distribution.to_csv("data_kaggle/ratings_distribution.csv", index=False)

## Item Means



In [None]:
ratings = pd.read_pickle("real_ratings/real_user_ratings_unscaled_fullmatrix.pkl")

In [None]:
ratings.head()

In [None]:
# Replace 0 with NaN so those cells are skipped by NaN-aware aggregations
# such as ``DataFrame.mean`` (0 presumably marks "not rated" — confirm).
# ``np.nan`` is the canonical spelling; the ``np.NaN`` alias was removed in
# NumPy 2.0.
ratings = ratings.replace(0, np.nan)

In [None]:
item_means = ratings.mean().to_dict()

In [None]:
# Persist the item-means dictionary as JSON.
with open("data_store/data_cleaned/item_means.json", "w") as out_file:
    serialized = json.dumps(item_means)
    out_file.write(serialized)

## User Means

In [None]:
# Load the unscaled user ratings dictionary.
# The encoding is pinned to UTF-8 (the JSON standard) so that any non-ASCII
# text in the file decodes the same way regardless of the platform default.
with open(
    "real_ratings/real_user_ratings_unscaled.json", encoding="utf-8"
) as json_file:
    user_ratings = json.load(json_file)

In [None]:
len(user_ratings)

In [None]:
user_means = {}

In [None]:
# Average each user's ratings, rounded to one decimal place.
for person, rated in user_ratings.items():
    user_means[person] = round(mean(rated.values()), 1)

In [None]:
user_means["Threnody"]

In [None]:
user_means["moosh21"]

In [None]:
user_means["Shade92008"]

In [None]:
user_means["Torsten"]

In [None]:
# Persist the user-means dictionary as JSON.
with open("data_store/data_cleaned/user_means.json", "w") as out_file:
    serialized = json.dumps(user_means)
    out_file.write(serialized)

In [None]:
del user_means
gc.collect()

In [None]:
# Opening JSON file
with open("data_store/data_cleaned/user_means.json") as json_file:
    user_means_dict = json.load(json_file)

In [None]:
# Turn the reloaded mapping into a frame indexed by its keys, with the
# single value column named "Mean".
user_means = pd.DataFrame.from_dict(user_means_dict, orient="index").rename(
    columns={0: "Mean"}
)
user_means.head()

In [None]:
user_means.to_pickle("data_store/data_cleaned/user_means.pkl")