# Notebook Objective and Setup

BGG05 builds a content-based item filter. Using per-category weights that I tuned from my own domain expertise, it produces an item similarity matrix covering every game ID in the games file.

This content-based filter could be used as-is to find similar games to a user's catalog and predict ratings.

## Notebook Preparation

### Package Imports

In [None]:
import copy
import gc
import json
import os
import time

# ignore warnings (gets rid of Pandas copy warnings)
import warnings

import numpy as np
import pandas as pd
import regex as re
import requests

# # NLP tools
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Still disabled: these packages are not imported anywhere in this notebook.
# MissForest (imputation cell) and Tokenizer / sequence (appendix) need the
# matching lines below re-enabled before those cells can run.
# from missingpy import MissForest
# import re
# import nltk
# import fasttext
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.python.keras.preprocessing import sequence, text

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

pd.set_option("display.max_columns", 30)
pd.set_option("display.max_rows", 30)

nlp = spacy.load("en_core_web_sm")

### Notebook Functions

##### Processing Functions

In [None]:
def clean_text(text):
    """Normalise a description string before vectorisation.

    Runs of digits are removed first, then every character that is not a
    word character, whitespace or a period. The result is lower-cased and
    stripped of leading and trailing whitespace.

    Inputs:
    text: string to clean

    Returns:
    Cleaned string
    """
    without_digits = re.sub(r"\d+", "", text)
    without_punctuation = re.sub(r"[^\w\s\.]", "", without_digits)
    return without_punctuation.lower().strip()

# English stop words, loaded on first use and then reused by every call
_ENGLISH_STOP_WORDS = None


def filter_stopwords(text):
    """Remove English stop words from a string.

    The text is tokenised with NLTK's word_tokenize; tokens whose lower-cased
    form is an NLTK English stop word are dropped and the remaining tokens
    are joined with single spaces.

    The stop-word list is loaded once and cached, because this function is
    applied to every description row and the list never changes.

    Inputs:
    text: string to filter

    Returns:
    String containing the surviving tokens separated by single spaces
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))

    return " ".join(
        token
        for token in word_tokenize(text)
        if token.lower() not in _ENGLISH_STOP_WORDS
    )

def evaluate_quality_words_over_thresh(text):
    """Return True when the text tokenises into more than five tokens."""
    token_count = len(word_tokenize(text))
    return token_count > 5

In [None]:
def processing_pipeline(weight_groups, df):
    """
    !!!Hard-coded processor!!!
    Min-max scales fixed groups of dataframe columns, each to the
    (low, high) range found at a fixed position in weight_groups.

    Inputs:
    weight_groups: sequence of weight tuples (low, high). Positions 0-5 are
        read, in this order: GameWeight, AvgRating, BayesAvgRating,
        BestPlayers, Playtime, and the five "Cat:*" columns together.
        Any further entries are ignored.
    df: dataframe containing the columns named below

    Returns:
    The output of ColumnTransformer.fit_transform: one scaled column per
    input column, in the order the groups are listed below. The caller is
    expected to wrap it in a DataFrame if it needs column labels.
    """

    # (transformer name, position in weight_groups, columns scaled together)
    column_groups = [
        ("games_weight_weight", 0, ["GameWeight"]),
        ("avgrating", 1, ["AvgRating"]),
        ("bayes_weight", 2, ["BayesAvgRating"]),
        ("players_weight", 3, ["BestPlayers"]),
        ("playtime_weight", 4, ["Playtime"]),
        (
            "remainder_weight",
            5,
            ["Cat:War", "Cat:CGS", "Cat:Abstract", "Cat:Party", "Cat:Childrens"],
        ),
    ]

    # one MinMaxScaler per group, each with its own target range
    total_pipeline = ColumnTransformer(
        [
            (name, MinMaxScaler(feature_range=weight_groups[position]), columns)
            for name, position, columns in column_groups
        ]
    )

    # there is no train/test split here: fit and transform the same frame
    return total_pipeline.fit_transform(df)

In [None]:
def scale_dataset(dataset, weights, tfidf=False):
    """
    Set each item either according to Term Frequency or TF-IDF
    Then scale dataset according to provided weights

    tfidf flag:
    Use TRUE when RARE entries are more important than FREQUENT entries
    Use FALSE when COMMON entries are more important than RARE entries

    The input frame is never modified. The returned frame has a fresh
    0..n-1 index (the input index is not carried over), so callers that
    concatenate results rely on row order.

    Inputs:
    dataset: dataset to scale; a "BGGId" column is dropped if present
    weights: tuple (x, y) to weight dataset
    tfidf: set flag to True for TF-IDF

    Outputs:
    Scaled data as a DataFrame with the input's non-BGGId columns
    """

    # drop BGG Id when present; drop() always returns a new frame, so the
    # caller's data is left untouched even when there is no such column
    dataset_pared = dataset.drop(columns="BGGId", errors="ignore")

    # get list of titles to reapply to DF after transformation
    titles = list(dataset_pared.columns)

    # set up weighted scaler
    scaler = MinMaxScaler(feature_range=weights)

    if tfidf:
        # convert matrix to tfidf
        transformer = TfidfTransformer()
        weighted = transformer.fit_transform(dataset_pared).toarray()
    else:
        # work on a float copy so fractional frequencies can be stored
        values = dataset_pared.to_numpy(dtype=float)

        # total number of entries, counted over the feature columns only.
        # The id column is excluded; the row-wise min-max scaling below is
        # unaffected by a constant factor, so this only removes noise.
        column_totals = np.nansum(values, axis=0)
        total_entries = column_totals.sum()

        if total_entries:
            term_frequencies = column_totals / total_entries
        else:
            # nothing to count: avoid dividing by zero
            term_frequencies = np.zeros_like(column_totals)

        # every positive entry becomes its column's term frequency;
        # other entries keep their original value
        weighted = np.where(values > 0, term_frequencies, values)

    # run scaler on transpose (scale by row not column)
    transpose_scaled = scaler.fit_transform(weighted.T)

    # rebuild dataframe
    return pd.DataFrame(transpose_scaled.T, columns=titles)

# Content Based Filtering

## Set Weights

These are the scales for each of these categories. All entries in the category will be scaled to this tuple range by the MinMaxScaler.

In [None]:
### DO NOT TOUCH THESE ARE THE PRODUCTION WEIGHTS!!!!!
# Each tuple is the (min, max) feature_range handed to a MinMaxScaler for one
# feature group; a wider range gives that group more pull in the similarity.

games_weight_weight = (-1.5, 1.5)  # game weight. Is a range, so (-, )
rating_weight = (-0.3, 0.3)  # average rating (applied to the AvgRating column)
bayes_weight = (-0.5, 0.5)  # game weighted rating. Is a range, so (-, )
players_weight = (0, 1)  # best players. Is a problematic range due to outliers
playtime_weight = (0, 2)  # playtime. Is a range so (-, ). Has high outliers
designers_weight = (0, 0.5)  # designers, binary
mechanics_weight = (0, 0.75)  # mechanics, binary
subcategories_weight = (0, 0.75)  # other mechanics like card game, print&play. binary
family_weights = (0, 0.5)  # game families like pandemic, century. binary
categories_weight = (0, 1)  # the five large overarching categories, binary
themes_weight = (-0.15, 0.15)  # themes like space, western. binary

In [None]:
"""games_weight_weight = (-1, 1) # game weight. Is a range, so (-, )
rating_weight = (-.5, .5)
bayes_weight = (-.001, .001) # game weighted rating. Is a range, so (-, )
players_weight = (0, 1) # best players. Is a problematic range due to outliers
playtime_weight = (0, 2) # playtime. Is a range so (-, ). Has high outliers
designers_weight = (0, 0.5) # designers, binary
mechanics_weight = (0, .75) # mechanics, binary
subcategories_weight = (0, .75) # other mechanics like card game, print&play. binary
family_weights = (0, 0.5) # game families like pandemic, century. binary
categories_weight = (0, 1) # the five large overarching categories, binary
themes_weight = (-.15, 0.15) # themes like space, western. binary"""

## Load and Prep Data

In [None]:
game_data_dir = "../../data/prod/games/game_dfs_clean"

# Load games
games = pd.read_pickle(f"{game_data_dir}/games_clean.pkl")

# determine playtime for each game according to community: the midpoint of
# the community minimum and maximum playtimes. (np.mean of the already-summed
# scalar returned the sum itself, i.e. twice the intended value.)
# A missing min or max still yields NaN, exactly as the addition did before.
games["Playtime"] = (games["ComMinPlaytime"] + games["ComMaxPlaytime"]) / 2

# set upper cap on playtime (360, i.e. six hours); NaN values stay NaN
games["Playtime"] = games["Playtime"].clip(upper=360)

# load other files to use

designers = pd.read_pickle(f"{game_data_dir}/designers_clean.pkl")
subcategories = pd.read_pickle(f"{game_data_dir}/subcategories_clean.pkl")
themes = pd.read_pickle(f"{game_data_dir}/themes_clean.pkl")

# games.head()

In [None]:
themes.head()

### TF-IDF

In [None]:
# Load spaCy with English language processor
nlp = spacy.load("en_core_web_sm")

In [None]:
# string-typed copy of the description column only
descriptions = games[["Description"]].astype(str)

# normalise the text first, then strip English stop words
descriptions["Description"] = (
    descriptions["Description"].apply(clean_text).apply(filter_stopwords)
)
descriptions.head()

In [None]:
# prepare the vectorizer with the chosen parameters
tfid_proc = TfidfVectorizer(
    sublinear_tf=True,
    use_idf=True,
    max_df=0.8,
    min_df=0.005,
    ngram_range=(1, 3),
    max_features=5000,
)

# fit the vectorizer to the descriptions
word_vectors = tfid_proc.fit_transform(descriptions["Description"])

In [None]:
# cast the vector array to a data frame with columns named by the features selected by the vectorizer
# densify the TF-IDF matrix and label each column with the feature the
# vectorizer selected for it
feature_names = tfid_proc.get_feature_names_out()
word_vectors_df = pd.DataFrame(word_vectors.toarray(), columns=feature_names)

In [None]:
word_vectors_df.head()

In [None]:
word_vectors_df.shape

Load in the description vectors (produced and saved in the appendix)

In [None]:
# f-string so that game_data_dir is actually substituted into the path
description_vectors = pd.read_pickle(f"{game_data_dir}/description_vectors.pkl")

In [None]:
description_vectors.shape

### Weight Scale Datasets

##### TF-IDF Mechanics

In [None]:
mechanics = pd.read_pickle(f"{game_data_dir}/mechanics_clean.pkl")

# one-hot encode the mechanic names, then collapse to one row per game
mechanics = (
    pd.get_dummies(mechanics, columns=["mechanic"], prefix="", prefix_sep="")
    .groupby("BGGId")
    .sum()
)

# floor of mechanics presence in catalog: 0.5% of the rows in this frame
mechanics_floor = round(len(mechanics) * 0.005)

# one boolean per mechanic: does its total reach the floor?
sums = (mechanics.sum() >= mechanics_floor).to_frame()

# labels of the mechanics being kept
keep_mechanics = sums.index[sums[0].to_numpy()]

mechanics = mechanics[keep_mechanics]

mechanics.head()

In [None]:
# scaled mechanics
scaled_mechanics = scale_dataset(mechanics, mechanics_weight, tfidf=True)

# make new column for games without any mechanics information
no_mechanics_index = list(scaled_mechanics.loc[scaled_mechanics.sum(axis=1) == 0].index)
scaled_mechanics["No Mechanics"] = 0
scaled_mechanics.loc[no_mechanics_index, "No Mechanics"] = mechanics_weight[1]

##### TF-Scale Only Datasets

In [None]:
# scale themes
scaler = MinMaxScaler(feature_range=themes_weight)
scaled_themes = scaler.fit_transform(themes)

# get list of titles to reapply to DF after transformation
titles = list(themes.columns)

scaled_themes = pd.DataFrame(scaled_themes, columns=titles)
scaled_themes.drop("BGGId", axis=1, inplace=True)

# scaled designers
scaled_designers = scale_dataset(designers, designers_weight)

# scaled game families
game_families = pd.get_dummies(games["Family"])
scaled_families = scale_dataset(game_families, family_weights)

# scaled subcategories
scaled_subcategories = scale_dataset(subcategories, subcategories_weight)

### Master CBF Frame

In [None]:
# include these columns for comparison
games_included_columns = [
    "GameWeight",
    "AvgRating",
    "BayesAvgRating",
    "BestPlayers",
    "Playtime",
    "Cat:War",
    "Cat:CGS",
    "Cat:Abstract",
    "Cat:Party",
    "Cat:Childrens",
]

# make smaller df of the included columns
scaled_games = games[games_included_columns]

# get list of game names
game_names = list(games["Name"])
# get list of game ids
game_ids = list(games["BGGId"])

# create game lookup table
game_lookup = {}
for key, value in zip(game_ids, game_names):
    game_lookup[key] = value

# instantiate MissForest imputer and fill all nans in scaled_games
imputer = MissForest()
scaled_games = pd.DataFrame(
    imputer.fit_transform(scaled_games), columns=games_included_columns
)

In [None]:
# set up weight groups for hard coded pipeline
weight_groups = [
    games_weight_weight,
    rating_weight,
    bayes_weight,
    players_weight,
    playtime_weight,
    categories_weight,
    family_weights,
]

# process scaled_games with pipeline
scaled_games = pd.DataFrame(
    processing_pipeline(weight_groups, scaled_games), columns=games_included_columns
)

# make list of games and ids (is this used anywhere?)
game_and_id = list(zip(game_names, game_ids))

In [None]:
# put together master dataframe with other already processed dataframes
master_games = pd.concat(
    (
        scaled_games,
        scaled_themes,
        scaled_mechanics,
        scaled_families,
        scaled_designers,
        scaled_subcategories,
        description_vectors,
        word_vectors,
    ),
    axis=1,
)  # , description_vectors, word_vectors

# put game id on master_games DF
master_games["BGGId"] = game_ids

# set index to id
master_games.set_index("BGGId", inplace=True)

# master_games.head()

In [None]:
# f-string so that game_data_dir is actually substituted into the path
master_games.to_pickle(f"{game_data_dir}/master_games_scaled.pkl")

In [None]:
"""# clean up

del mechanics
del designers
del subcategories
del themes
del scaled_mechanics
del scaled_families
del scaled_designers
del scaled_games
del scaled_subcategories
del scaled_themes
del description_vectors

gc.collect()"""

## Item Similarity via Cosine Distance

In [None]:
# f-string so that game_data_dir is actually substituted into the path
master_games = pd.read_pickle(f"{game_data_dir}/master_games_scaled.pkl")

In [None]:
# Load games
games = pd.read_pickle("{game_data_dir}/games.pkl")

# get list of game ids
game_ids = list(games["BGGId"])

game_names = list(games["Name"].str.lower())

In [None]:
# get cosine similarities!
cosine_sims = cosine_similarity(master_games)

# do similarities by game id
sims_byid = pd.DataFrame(cosine_sims, columns=game_ids)
sims_byid["Game_Id"] = game_ids
sims_byid.set_index("Game_Id", inplace=True, drop=True)

In [None]:
# SCALE IF NEEDED

threshold = 0.95

sims_byid = sims_byid.round(6)

for column in list(sims_byid.columns):
    lower = sims_byid[column].min()
    sims_byid[column].replace(1.0, lower, inplace=True)
    if sims_byid[column].max() > threshold:
        continue
    else:
        scaler = MinMaxScaler(feature_range=(lower, threshold))
        scaled_values = scaler.fit_transform(np.array(sims_byid[column]).reshape(-1, 1))
        sims_byid[column] = scaled_values

In [None]:
# keep two decimal places in the similarity matrix, then preview it
sims_byid = sims_byid.round(2)
sims_byid.head()

In [None]:
# strip everything except ASCII letters, digits and whitespace from each name.
# The pattern is a raw string: "\s" inside a normal string literal is an
# invalid escape sequence. Slice assignment updates the existing list in place.
# NOTE(review): distinct titles can collapse to the same cleaned name, which
# would give the name-labelled matrix duplicate labels -- confirm none do.
game_names[:] = [re.sub(r"[^A-Za-z0-9\s]+", "", name) for name in game_names]

In [None]:
# same similarity values, relabelled by cleaned game name on both axes.
# Labels are assigned directly: set_axis(..., inplace=True) is no longer
# accepted by current pandas versions.
sims_byname = sims_byid.copy()
sims_byname.columns = game_names
sims_byname.index = game_names

In [None]:
# save to pickles, we really only need the id one
sims_byid.to_pickle("{game_data_dir}/game_cosine_similarity_byid.pkl")
sims_byname.to_pickle("{game_data_dir}/game_cosine_similarity_byname.pkl")

### CHECK GAME HERE

In [None]:
# save to pickles, we really only need the id one
sims_byid = pd.read_pickle("{game_data_dir}/game_cosine_similarity_byid.pkl")
sims_byname = pd.read_pickle(
    "{game_data_dir}/game_cosine_similarity_byname.pkl"
)

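This is why we also built the name-labelled similarity matrix: it lets us look up a game's nearest neighbours by title instead of by ID.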

In [None]:
# test my specific game set here

test_dict = {
    "Dominion": list(sims_byname["dominion"].sort_values(ascending=False)[:15].index),
    "D_Sim": list(sims_byname["dominion"].sort_values(ascending=False)[:15]),
    "Gloomhaven": list(
        sims_byname["gloomhaven"].sort_values(ascending=False)[:15].index
    ),
    "G_Sim": list(sims_byname["gloomhaven"].sort_values(ascending=False)[:15]),
    "Pandemic": list(sims_byname["pandemic"].sort_values(ascending=False)[:15].index),
    "Pa_Sim": list(sims_byname["pandemic"].sort_values(ascending=False)[:15]),
    "Splendor": list(sims_byname["splendor"].sort_values(ascending=False)[:15].index),
    "Sp_Sim": list(sims_byname["splendor"].sort_values(ascending=False)[:15]),
    "Viticulture Essential Edition": list(
        sims_byname["viticulture essential edition"]
        .sort_values(ascending=False)[:15]
        .index
    ),
    "V_Sim": list(
        sims_byname["viticulture essential edition"].sort_values(ascending=False)[:15]
    ),
    "Agricola": list(sims_byname["agricola"].sort_values(ascending=False)[:15].index),
    "Ag_Sim": list(sims_byname["agricola"].sort_values(ascending=False)[:15]),
    "Space Base": list(
        sims_byname["space base"].sort_values(ascending=False)[:15].index
    ),
    "Spa_Sim": list(sims_byname["space base"].sort_values(ascending=False)[:15]),
    "Terraforming Mars": list(
        sims_byname["terraforming mars"].sort_values(ascending=False)[:15].index
    ),
    "Te_Sim": list(sims_byname["terraforming mars"].sort_values(ascending=False)[:15]),
    #'Puerto Rico':list(sims_byname['Puerto Rico'].sort_values(ascending=False)[:15].index), 'Pu_Sim':list(sims_byname['Puerto Rico'].sort_values(ascending=False)[:15]),
    "Chess": list(sims_byname["chess"].sort_values(ascending=False)[:15].index),
    "Ch_Sim": list(sims_byname["chess"].sort_values(ascending=False)[:15]),
    #'Backgammon':list(sims_byname['Backgammon'].sort_values(ascending=False)[:15].index), 'B_Sim':list(sims_byname['Backgammon'].sort_values(ascending=False)[:15]),
    #'Sagrada':list(sims_byname['Sagrada'].sort_values(ascending=False)[:15].index), 'Sa_Sim':list(sims_byname['Sagrada'].sort_values(ascending=False)[:15]),
    "Azul": list(sims_byname["azul"].sort_values(ascending=False)[:15].index),
    "Az_Sim": list(sims_byname["azul"].sort_values(ascending=False)[:15]),
    "Codenames": list(sims_byname["codenames"].sort_values(ascending=False)[:15].index),
    "Co_Sim": list(sims_byname["codenames"].sort_values(ascending=False)[:15]),
    "Secret Hitler": list(
        sims_byname["secret hitler"].sort_values(ascending=False)[:15].index
    ),
    "Se_Sim": list(sims_byname["secret hitler"].sort_values(ascending=False)[:15]),
    "Monopoly": list(sims_byname["monopoly"].sort_values(ascending=False)[:15].index),
    "M_Sim": list(sims_byname["monopoly"].sort_values(ascending=False)[:15]),
    "Lords of Waterdeep": list(
        sims_byname["lords of waterdeep"].sort_values(ascending=False)[:15].index
    ),
    "L_Sim": list(sims_byname["lords of waterdeep"].sort_values(ascending=False)[:15]),
    "Stone Age": list(sims_byname["stone age"].sort_values(ascending=False)[:15].index),
    "St_Sim": list(sims_byname["stone age"].sort_values(ascending=False)[:15]),
    "Century: Spice Road": list(
        sims_byname["century spice road"].sort_values(ascending=False)[:15].index
    ),
    "Ce_Sim": list(sims_byname["century spice road"].sort_values(ascending=False)[:15]),
    #'Scrabble':list(sims_byname['Scrabble'].sort_values(ascending=False)[:15].index), 'Sc_Sim':list(sims_byname['Scrabble'].sort_values(ascending=False)[:15]),
    "18MS: The Railroads Come to Mississippi": list(
        sims_byname["18ms the railroads come to mississippi"]
        .sort_values(ascending=False)[:15]
        .index
    ),
    "18xx": list(
        sims_byname["18ms the railroads come to mississippi"].sort_values(
            ascending=False
        )[:15]
    ),
    #'Roads to Gettysburg II: Lee Strikes North':list(sims_byname['Roads to Gettysburg II: Lee Strikes North'].sort_values(ascending=False)[:15].index), 'War3':list(sims_byname['Roads to Gettysburg II: Lee Strikes North'].sort_values(ascending=False)[:15]),
    "Power Grid": list(
        sims_byname["power grid"].sort_values(ascending=False)[:15].index
    ),
    "P_Grid": list(sims_byname["power grid"].sort_values(ascending=False)[:15]),
}

pd.DataFrame(test_dict)

In [None]:
# test specific games here
game = "mariposas"
game = game.lower()

results = pd.DataFrame(
    data={"Similarity": sims_byname[game].sort_values(ascending=False)[0:21]}
)
results.index = results.index.str.title()
results

# Appendix

## GloVe Word Embeddings

In [None]:
# amount of vocabulary to use, will pick the top 10000 words seen in the corpus
features = 5000

# max text sequence length, must match tokens in transfer file, we are using glove 300d so it is 300
max_words = 300

In [None]:
# instantiate our word tokenizer
tokenizer = Tokenizer(num_words=features)

# Create vocabulary with training texts
tokenizer.fit_on_texts(
    games["Description"]
)  # nltk method which creates a vocab index based on the word frequency, every word gets own integer value

# pad the train text to 300, or cut off if over
tokenized_train = tokenizer.texts_to_sequences(
    games["Description"]
)  # transforms each text to a sequence of integers
tokenized_train = sequence.pad_sequences(
    tokenized_train, maxlen=max_words, truncating="post", padding="post"
)  # truncates or pads the vector to the max_words

In [None]:
# identify the embedding filename; we are using the Glove 42B 300d embeddings
glove_file = "glove.42B.300d.txt"

# create the embeddings index dictionary
embeddings_index = {}  # create a lookup dictionary to store words and their vectors
f = open(glove_file, errors="ignore")  # open our embedding file
for line in f:  # for each line in the file
    values = line.split(
        " "
    )  # split the line on spaces between the word and its vectors
    word = values[0]  # the word is the first entry
    if (
        word in tokenizer.word_index.keys()
    ):  # we check if the word is in our tokenizer word index
        coefs = np.asarray(values[1:], dtype="float32")  # if so, get the word's vectors
        embeddings_index[word] = (
            coefs  # add the word and its vectors to the embeddings_index dictionary
        )
f.close()

print(
    "Found %s word vectors." % len(embeddings_index)
)  # report how many words in our corpus were found in the GloVe words

In [None]:
# one row per vocabulary entry, plus one extra row for the pad token
num_tokens = len(tokenizer.word_index) + 1

# NOTE(review): max_words (the padded sequence length) doubles as the number
# of columns here; this only works while it equals the GloVe vector size.
embedding_matrix = np.zeros((num_tokens, max_words))

hits = 0
misses = 0
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # words not found in the embedding index keep their all-zero row
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1

print("Converted %d words (%d misses)" % (hits, misses))

### Single Testing

In [None]:
temp = games[:5]

In [None]:
vector_storage = {}

In [None]:
# run vectorize on each sample row (it fills vector_storage as a side effect)
temp.apply(vectorize, axis=1)

In [None]:
pd.DataFrame(vector_storage).T

### Real Word Vectors

In [None]:
def vectorize(x):
    """Store, per word, the mean embedding value for one game's description.

    Reads the notebook-level nlp, tokenizer and embedding_matrix, and writes
    into the notebook-level vector_storage: vector_storage[x["BGGId"]] becomes
    a dict mapping each token text to the mean of its embedding row.

    A token is skipped when it is not in tokenizer.word_index or when its
    embedding row contains any zero (rows that were never filled are all
    zeros).

    Inputs:
    x: row (or mapping) with "BGGId" and "Description" entries

    Returns:
    None; results are stored in vector_storage
    """
    tokens = nlp(x["Description"])

    word_means = {}
    vector_storage[x["BGGId"]] = word_means

    for token in tokens:
        word = str(token)

        # explicit vocabulary lookup instead of catching every exception
        row_number = tokenizer.word_index.get(word)
        if row_number is None:
            continue

        embedding = embedding_matrix[row_number]
        if np.all(embedding):
            word_means[word] = np.mean(embedding)

In [None]:
vector_storage = {}

In [None]:
descriptions = games[["BGGId", "Description"]]

In [None]:
# run vectorize on every description row (it fills vector_storage as a side effect)
descriptions.apply(vectorize, axis=1)

In [None]:
description_vectors = pd.DataFrame(vector_storage).T

In [None]:
description_vectors.shape

In [None]:
# Get only vectors that show up more than once!
refined_vectors = description_vectors.loc[
    :, (description_vectors.isnull().sum(axis=0) <= 21923)
]

In [None]:
# missing word values become 0 and the old index is discarded in favour of a
# fresh 0..n-1 index. reset_index(drop=True) discards it directly; dropping a
# "level_0" column afterwards only worked when a column named "index" existed.
refined_vectors = refined_vectors.fillna(0).reset_index(drop=True)

In [None]:
# f-string so that game_data_dir is actually substituted into the path
refined_vectors.to_pickle(f"{game_data_dir}/description_vectors.pkl")

In [None]:
# release the large appendix objects, then ask the garbage collector to run
del (
    vector_storage,
    refined_vectors,
    embedding_matrix,
    tokenizer,
    descriptions,
    embedding_vector,
)

gc.collect()