# Notebook Objective and Setup

BGG05 builds a content-based item filter. Using category weights that I tune with my domain expertise, I construct an item similarity matrix for the game IDs selected from the games file.

This content-based filter could be used as-is to find similar games to a user's catalog and predict ratings.

## Notebook Preparation

### Package Imports

In [1]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import copy
import json

# ignore warnings (gets rid of Pandas copy warnings)
import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

pd.set_option("display.max_columns", 30)
pd.set_option("display.max_rows", 30)

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# # NLP tools
import spacy

nlp = spacy.load("en_core_web_sm")
# import re
# import nltk
# import fasttext
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.python.keras.preprocessing import sequence, text

### Notebook Functions

##### Processing Functions

In [2]:
def clean_text(text):
    """Normalise raw description text before tokenisation.

    Removes every literal "&amp" occurrence, deletes digit runs, strips every
    character that is not a word character, whitespace or a period, then
    lower-cases the result and trims surrounding whitespace.
    """
    without_entities = text.replace("&amp","")
    without_digits = re.sub(r"\d+", "", without_entities)
    # periods are deliberately kept; all other punctuation is dropped
    without_punctuation = re.sub(r"[^\w\s\.]", "", without_digits)
    return without_punctuation.lower().strip()

def filter_stopwords(text):
    """Return *text* with English stop words removed.

    The text is split with NLTK's ``word_tokenize``; tokens whose lower-cased
    form is in NLTK's English stop-word list are dropped and the remaining
    tokens are joined back together with single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stops:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def evaluate_quality_words_over_thresh(text):
    """Report whether *text* tokenises into more than five tokens."""
    token_count = len(word_tokenize(text))
    if token_count > 5:
        return True
    return False

In [3]:
def processing_pipeline_games(weight_groups, df):
    """
    !!!Hard-coded processor!!!
    Min-max scales fixed groups of game columns, each group into its own range.

    Inputs:
    weight_groups: sequence of (min, max) tuples, read by position in this order:
        GameWeight, AvgRating, BayesAvgRating, BestPlayers, Playtime,
        then one shared range for the eight "Cat:*" columns
    df: dataframe holding every column named above

    Returns:
    The output of ColumnTransformer.fit_transform on df, with the scaled
    columns in the order listed above (the caller wraps it in a DataFrame)"""

    category_columns = [
        "Cat:Thematic",
        "Cat:Strategy",
        "Cat:Family",
        "Cat:War",
        "Cat:CGS",
        "Cat:Abstract",
        "Cat:Party",
        "Cat:Childrens",
    ]

    # (transformer name, columns it scales), in the same order as weight_groups
    column_groups = [
        ("games_weight_weight", ["GameWeight"]),
        ("avgrating", ["AvgRating"]),
        ("bayes_weight", ["BayesAvgRating"]),
        ("players_weight", ["BestPlayers"]),
        ("playtime_weight", ["Playtime"]),
        ("remainder_weight", category_columns),
    ]

    transformers = [
        (name, MinMaxScaler(feature_range=weight_groups[position]), columns)
        for position, (name, columns) in enumerate(column_groups)
    ]

    # The scalers are fitted on, and applied to, the same dataframe
    return ColumnTransformer(transformers).fit_transform(df)

# Content Based Filtering

## Set Weights

These are the scales for each of these categories. All entries in the category will be scaled to this tuple range by the MinMaxScaler.

In [4]:
### DO NOT TOUCH THESE ARE THE PRODUCTION WEIGHTS!!!!!
# Every value is a (min, max) tuple.
# - The first five tuples and categories_weight are handed to MinMaxScaler as
#   `feature_range` (see processing_pipeline_games and weight_groups).
# - For the one-hot ("binary") groups only the upper bound, index 1, is read:
#   it replaces the 1 that marks presence of a feature.
# NOTE(review): a fractional upper bound (e.g. 0.75) only survives if the
# scaled frame is kept in a float dtype - confirm wherever these are cast.

games_weight_weight = (-1, 1)  # game weight (complexity); symmetric range around 0
rating_weight = (-0.3, 0.3)  # average rating; symmetric range around 0
bayes_weight = (-0.5, 0.5)  # Bayesian (weighted) average rating; symmetric range around 0
players_weight = (0, 1)  # best player count. Is a problematic range due to outliers
playtime_weight = (0, 2)  # playtime; has high outliers, so it is capped before scaling

designers_weight = (0, 0.75)  # designer, binary
mechanics_weight = (0, 1)  # mechanics, binary
subcategories_weight = (0, 1)  # other mechanics like card game, print&play. binary
family_weights = (0, 1)  # game families like pandemic, century. binary
categories_weight = (0, 1.5)  # the eight overarching "Cat:*" category columns, binary
themes_weight = (0, 0.75)  # themes like space, western. binary

In [5]:
# Lookup table from a feature-group key to its (min, max) weight tuple, so
# helpers can fetch a weight by name instead of by variable.
weights = dict(
    games_weight=games_weight_weight,
    rating=rating_weight,
    bayes=bayes_weight,
    players=players_weight,
    playtime=playtime_weight,
    designers=designers_weight,
    mechanics=mechanics_weight,
    subcategories=subcategories_weight,
    family=family_weights,
    categories=categories_weight,
    themes=themes_weight,
)

In [None]:
"""games_weight_weight = (-1, 1) # game weight. Is a range, so (-, )
rating_weight = (-.5, .5)
bayes_weight = (-.001, .001) # game weighted rating. Is a range, so (-, )
players_weight = (0, 1) # best players. Is a problematic range due to outliers
playtime_weight = (0, 2) # playtime. Is a range so (-, ). Has high outliers
families_weight = (0, 0.5) # families, binary
mechanics_weight = (0, .75) # mechanics, binary
subcategories_weight = (0, .75) # other mechanics like card game, print&play. binary
family_weights = (0, 0.5) # game families like pandemic, century. binary
categories_weight = (0, 1) # the five large overarching categories, binary
themes_weight = (-.15, 0.15) # themes like space, western. binary"""

## Load and Prep Data

In [None]:
how_many_games = 5000

game_data_dir = "../data/prod/games/game_dfs_clean"

# Load the cleaned games table
games = pd.read_pickle(f"{game_data_dir}/games_clean.pkl")

# I don't want to deal with every game ever, so keep only the
# `how_many_games` games with the highest Bayesian average rating, then order
# the survivors by BGGId. The feature frames built later are joined to this
# table by row position, so this ordering matters.
games = (
    games.sort_values("BayesAvgRating", ascending=False)
    .head(how_many_games)
    .sort_values("BGGId")
    .reset_index(drop=True)
)

bgg_ids = games["BGGId"].tolist()
bgg_names = games["Name"].tolist()
# lower-cased name -> BGGId; if two games share a name the later one wins
game_lookup = {name.lower(): game_id for game_id, name in zip(bgg_ids, bgg_names)}

for rounded_column in ("AvgRating", "BayesAvgRating", "GameWeight"):
    games[rounded_column] = games[rounded_column].round(2)

# Community playtime = midpoint of the community min and max playtimes.
# The earlier `np.mean(min + max)` averaged a single scalar, which is just
# the sum, so every playtime came out doubled.
games["Playtime"] = (games["ComMinPlaytime"] + games["ComMaxPlaytime"]) / 2

# Cap playtime so extreme outliers do not dominate the min-max scaling.
# NOTE(review): the cap value is unchanged (480, presumably minutes), but it
# now applies to the midpoint instead of the min+max sum, and the old variable
# name said "over 6 hours" - confirm the intended cap.
max_playtime = 480
games["Playtime"] = games["Playtime"].clip(upper=max_playtime)
games.head()

### Weight Scale Datasets

In [8]:
def refine_binary_subset(filename, column, weight_type, thresh):
    """Build a weighted one-hot feature frame that is row-aligned with bgg_ids.

    Loads ``{game_data_dir}/{filename}.pkl``, one-hot encodes ``column``,
    collapses the result to one row per BGGId, drops features that occur in
    fewer than ``round(n_games * thresh)`` games of the loaded file, and
    replaces each presence flag (1) with the upper bound of
    ``weights[weight_type]``.

    Inputs:
    filename: pickle file stem inside ``game_data_dir``
    column: name of the categorical column to one-hot encode
    weight_type: key into the module-level ``weights`` dict
    thresh: minimum share of games (0.003 means 0.3%) a feature must
        appear in to be kept

    Returns:
    DataFrame with exactly one row per entry of ``bgg_ids``, in that order,
    on a fresh RangeIndex and without a BGGId column. A game that has no
    rows in the source file gets an all-zero row.
    """
    df = pd.read_pickle(f"{game_data_dir}/{filename}.pkl")
    df = (
        pd.get_dummies(df, columns=[column], prefix="", prefix_sep="")
        .groupby("BGGId")
        .sum()
    )

    # minimum number of games a feature must appear in to be kept
    df_floor = round(df.shape[0] * thresh)
    feature_counts = df.sum()
    # positional mask, so the selection does not depend on label alignment
    df = df.loc[:, (feature_counts >= df_floor).to_numpy()]

    # Reindex onto bgg_ids instead of filtering with isin(): filtering drops
    # games that are absent from this file, which shortens the frame and
    # shifts every later row when the frames are concatenated by position.
    df = df.reindex(bgg_ids, fill_value=0)

    df = df.replace(1, weights[weight_type][1])

    return df.reset_index(drop=True)

In [None]:
# Weighted one-hot mechanics features (kept when present in >= 0.3% of games).
# Stored as float32 rather than int8: an integer cast silently truncates any
# fractional weight to 0, which would erase the whole feature group.
scaled_mechanics = refine_binary_subset(
    filename="mechanics_clean",
    column="mechanic",
    weight_type="mechanics",
    thresh=0.003,
).astype("float32")
scaled_mechanics.head()

In [None]:
# Weighted one-hot theme features (kept when present in >= 0.3% of games).
# The themes weight is fractional (upper bound 0.75), so the previous int8
# cast truncated every theme value to 0; float32 keeps the weight intact.
scaled_themes = refine_binary_subset(
    filename="themes_clean",
    column="Theme",
    weight_type="themes",
    thresh=0.003,
).astype("float32")
scaled_themes.head()

In [None]:
# Weighted one-hot subcategory features (kept when present in >= 0.3% of games).
# Stored as float32 rather than int8: an integer cast silently truncates any
# fractional weight to 0, which would erase the whole feature group.
scaled_subcategories = refine_binary_subset(
    filename="subcategories_clean",
    column="boardgamecategory",
    weight_type="subcategories",
    thresh=0.003,
).astype("float32")
scaled_subcategories.head()

In [None]:
# Weighted one-hot designer features (kept when present in >= 0.3% of games).
# The designers weight is fractional (upper bound 0.75), so the previous int8
# cast truncated every designer value to 0; float32 keeps the weight intact.
scaled_designers = refine_binary_subset(
    filename="designers_clean",
    column="boardgamedesigner",
    weight_type="designers",
    thresh=0.003,
).astype("float32")
scaled_designers.head()

In [None]:
# scaled game families: one-hot encode each game's Family value
# (rows stay in the same order as `games`)
families = pd.get_dummies(games["Family"]).astype(int)

# minimum number of games a family must appear in (0.1% of the selected games)
families_floor = round(families.shape[0] * 0.001)

# keep only the families that reach the floor
family_counts = families.sum()
keep_families = family_counts.index[family_counts >= families_floor]

families = families[keep_families]

# Swap the presence flag for the family weight. Kept as float32 because an
# int8 cast would truncate a fractional weight to 0.
scaled_families = families.replace(1, weights["family"][1]).astype("float32")

scaled_families.head()

### Master CBF Frame

In [None]:
# (disabled) KNN imputation of the NaNs in scaled_games
# imputer = KNNImputer(n_neighbors=5)
# scaled_games = pd.DataFrame(
#     imputer.fit_transform(scaled_games), columns=games_included_columns
# )

# Columns that feed the hard-coded scaling pipeline, in pipeline order:
# five numeric columns first, then the category flags.
numeric_columns = ["GameWeight", "AvgRating", "BayesAvgRating", "BestPlayers", "Playtime"]
category_flag_columns = [
    f"Cat:{category}"
    for category in (
        "Thematic",
        "Strategy",
        "Family",
        "War",
        "CGS",
        "Abstract",
        "Party",
        "Childrens",
    )
]
games_included_columns = numeric_columns + category_flag_columns

games_reduced = games[games_included_columns]

games_reduced.head()

In [None]:
# (min, max) ranges in the exact positional order the hard-coded pipeline reads them
weight_groups = [
    games_weight_weight,  # GameWeight
    rating_weight,  # AvgRating
    bayes_weight,  # BayesAvgRating
    players_weight,  # BestPlayers
    playtime_weight,  # Playtime
    categories_weight,  # all "Cat:*" columns
]

# run the pipeline, then restore the column labels on its output
scaled_values = processing_pipeline_games(weight_groups, games_reduced)
scaled_games = pd.DataFrame(scaled_values, columns=games_included_columns)

scaled_games.head()

In [None]:
# Assemble the master feature matrix by placing every scaled frame side by side.
# NOTE(review): the frames are joined on their positional index, so each one
# is assumed to hold exactly one row per game, in bgg_ids order - confirm.
feature_frames = [
    scaled_games,
    scaled_themes,
    scaled_mechanics,
    scaled_families,
    scaled_designers,
    scaled_subcategories,
]  # , description_vectors, word_vectors
master_games = pd.concat(feature_frames, axis=1)

# label every row with its game id, index by it, and zero-fill missing values
master_games["BGGId"] = bgg_ids
master_games = master_games.set_index("BGGId").fillna(0)

master_games.head()

In [None]:
master_games

In [18]:
# Checkpoint the scaled feature matrix.
# NOTE: the path is relative, so the file lands in the current working
# directory rather than in game_data_dir where the similarity pickles go.
master_games.to_pickle("master_games_scaled.pkl")

In [None]:
# Drop the per-group feature frames and ask the garbage collector to
# reclaim their memory.
del (
    scaled_mechanics,
    scaled_families,
    scaled_designers,
    scaled_games,
    scaled_subcategories,
    scaled_themes,
)

gc.collect()

## Item Similarity via Cosine Distance

In [None]:
# Ad-hoc probe: every lower-cased game name containing "haven", with its BGGId
{name: game_id for name, game_id in game_lookup.items() if "haven" in name}

In [21]:
# Reload the feature-matrix checkpoint from the current working directory
# (presumably so the similarity step can be run without rebuilding it).
master_games = pd.read_pickle("master_games_scaled.pkl")

In [None]:
# Pairwise cosine similarity between the feature vectors of all games
cosine_sims = cosine_similarity(master_games)

# Zero the self-similarity diagonal so a game is never its own best match.
# This used to be done with `replace(1.00, 0)` after rounding, which also
# wiped every other pair whose similarity rounded to 1.00 - the closest
# matches - and missed diagonal entries that did not round to exactly 1.00.
np.fill_diagonal(cosine_sims, 0)

# similarities labelled by game id on both axes, rounded to two decimals
sims_byid = pd.DataFrame(
    cosine_sims,
    index=pd.Index(bgg_ids, name="Game_Id"),
    columns=bgg_ids,
).round(2)
sims_byid.head()

In [23]:
# Strip everything except ASCII letters, digits and whitespace from each name
# so the names can serve as simple lookup labels. The pattern is a raw string
# because "\s" in a normal string literal is an invalid escape sequence.
# Slice assignment updates the existing list object in place.
# NOTE: game_lookup was built from the unstripped names and is not updated here.
bgg_names[:] = [re.sub(r"[^A-Za-z0-9\s]+", "", name) for name in bgg_names]

In [None]:
# Same similarity matrix, but labelled on both axes by lower-cased game name
lowercase_bgg_names = [name.lower() for name in bgg_names]

sims_byname = sims_byid.copy()
sims_byname.columns = lowercase_bgg_names
sims_byname.index = lowercase_bgg_names
sims_byname.head()

In [25]:
# Persist both matrices; only the id-labelled one is really needed downstream
byid_path = f"{game_data_dir}/game_cosine_similarity_byid.pkl"
byname_path = f"{game_data_dir}/game_cosine_similarity_byname.pkl"
sims_byid.to_pickle(byid_path)
sims_byname.to_pickle(byname_path)

In [None]:
# Drop both similarity matrices and trigger a garbage-collection pass
del sims_byname, sims_byid
gc.collect()

### CHECK GAME HERE

In [None]:
# Reload the name-labelled similarity matrix from disk (the id-labelled one
# is not needed for the manual checks below)
# sims_byid = pd.read_pickle(f"{game_data_dir}/game_cosine_similarity_byid.pkl")
byname_pickle = f"{game_data_dir}/game_cosine_similarity_byname.pkl"
sims_byname = pd.read_pickle(byname_pickle)

# the labels were already lower-cased before the matrix was saved

sims_byname.head()

In [28]:
# # find any entries with a particular string in the index or column name
# def find_string_in_index_or_column(df, string):
#     df =  df[df.index.str.contains(string, case=False, na=False)]
#     columns=list(df.columns[df.columns.str.contains(string, case=False, na=False)])
#     return df[columns]

# sims_byname = find_string_in_index_or_column(sims_byname, "gloomhaven")
# sims_byname.head()

This is why we built the name-indexed similarity matrix: it lets us look games up by title instead of by ID.

In [None]:
# Spot check: for a hand-picked set of games, list the 15 most similar titles
# next to their similarity scores.

# (column holding the similar titles, lookup label in sims_byname, column holding the scores)
probe_games = [
    ("Dominion", "dominion", "D_Sim"),
    ("Gloomhaven", "gloomhaven", "G_Sim"),
    ("Pandemic", "pandemic", "Pa_Sim"),
    ("Splendor", "splendor", "Sp_Sim"),
    ("Viticulture Essential Edition", "viticulture essential edition", "V_Sim"),
    ("Agricola", "agricola", "Ag_Sim"),
    ("Space Base", "space base", "Spa_Sim"),
    ("Terraforming Mars", "terraforming mars", "Te_Sim"),
    ("Chess", "chess", "Ch_Sim"),
    # ("Sagrada", "sagrada", "Sa_Sim"),
    ("Azul", "azul", "Az_Sim"),
    ("Codenames", "codenames", "Co_Sim"),
    ("Lords of Waterdeep", "lords of waterdeep", "L_Sim"),
    ("Century: Spice Road", "century spice road", "Ce_Sim"),
    ("Power Grid", "power grid", "P_Grid"),
]

test_dict = {}
for titles_column, lookup_label, scores_column in probe_games:
    # sort once and reuse the result for both the titles and the scores
    closest = sims_byname[lookup_label].sort_values(ascending=False)[:15]
    test_dict[titles_column] = list(closest.index)
    test_dict[scores_column] = list(closest)

pd.DataFrame(test_dict)

In [None]:
# Deliberate stop so a full run does not continue into the cells below.
# A bare `break` outside a loop is a SyntaxError; raising explicitly halts
# the run the same way while keeping the file syntactically valid.
raise RuntimeError("Intentional stop: run the cells below this point manually.")

# Clean game descriptions for critical components

In [None]:
# gather entire corpus of game descriptions

# games = pd.read_pickle(f"{game_data_dir}/games_clean.pkl")
# games.head()

In [None]:
len(games)

In [9]:
# stemmed word -> number of occurrences across every description processed so far
description_freq = {}

def tokenize_description(one_row):
    """Clean, tokenise and stem one game's description.

    Words that also appear in the game's own name are removed so that the
    title does not dominate the description. Every stemmed word that is kept
    is also counted in the module-level ``description_freq`` dict.

    Inputs:
    one_row: mapping or row with "Description" and "Name" entries

    Returns:
    The stemmed tokens joined by single spaces, or None when the
    description is not a string (for example a missing value).
    """
    one_row_desc = one_row["Description"]
    # a missing description arrives as a non-string value such as NaN
    if not isinstance(one_row_desc, str):
        return None

    # lower-cased alphabetic title words; a set gives cheap membership tests
    title_tokens = {
        word.lower() for word in word_tokenize(one_row["Name"]) if word.isalpha()
    }

    cleaned = filter_stopwords(clean_text(one_row_desc))
    # drop the stand-alone periods left by tokenisation, then collapse double spaces
    cleaned = cleaned.replace(" ."," ").replace("  "," ").strip()

    # one stemmer per call instead of one per word
    stemmer = PorterStemmer()
    description = [
        stemmer.stem(word)
        for word in word_tokenize(cleaned)
        if word not in title_tokens
    ]

    for word in description:
        description_freq[word] = description_freq.get(word, 0) + 1

    return " ".join(description)


In [None]:
# Add a 'cleaned_description' column holding each row's tokenize_description
# result (None where that function returns None).
# NOTE: tokenize_description also increments the module-level description_freq,
# so re-running this cell counts every word again.
games['cleaned_description'] = games.apply(tokenize_description, axis=1)
games.head()

In [None]:
# Keep only the games that have a cleaned description; the rows keep their
# original labels from `games`.
has_description = games['cleaned_description'].notna()
descriptions = games.loc[has_description, ['BGGId','cleaned_description']]
descriptions.head()

In [None]:


# TF-IDF settings chosen for the description corpus
tfidf_settings = dict(
    sublinear_tf=True,
    use_idf=True,
    max_df=0.1,
    min_df=0.01,
    ngram_range=(1, 2),
    max_features=1000,
)
tfid_proc = TfidfVectorizer(**tfidf_settings)

# learn the vocabulary and vectorise every cleaned description in one step
word_vectors = tfid_proc.fit_transform(descriptions["cleaned_description"])

# dense frame with one column per feature selected by the vectorizer
selected_features = tfid_proc.get_feature_names_out()
word_vectors_df = pd.DataFrame(word_vectors.toarray(), columns=selected_features)

word_vectors_df.head()

In [None]:
len(word_vectors_df)

In [None]:
# Attach each description's TF-IDF vector to its BGGId.
# `descriptions` keeps the row labels of `games` (with gaps wherever a
# description was missing) while word_vectors_df may be numbered 0..n-1, and
# concat(axis=1) joins on labels. Relabel the vectors with the description
# labels first so that row i of the vectors lands on description i.
aligned_word_vectors = word_vectors_df.set_axis(descriptions.index, axis=0)
games_df_with_word_vectors = pd.concat(
    [descriptions, aligned_word_vectors], axis=1
).drop(columns=['cleaned_description'])
games_df_with_word_vectors.head()

In [None]:
# Left-join the word vectors onto the games table by BGGId; games without a
# match keep their row and get NaN in the vector columns.
games = pd.merge(games, games_df_with_word_vectors, how='left', on='BGGId')
games.head()