# Notebook Objective and Setup

BGG05 is the building of a content-based item filter. Using category weights, I use my domain expertise to tune an item similarity matrix for all game IDs in the games file.

This content-based filter could be used as-is to find similar games to a user's catalog and predict ratings.

## Notebook Preparation

### Package Imports

In [1]:
import pandas as pd
import numpy as np
import requests
import regex as re
import time
import os
import gc
import copy
import json

# ignore warnings (gets rid of Pandas copy warnings)
import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = None

pd.set_option("display.max_columns", 30)
pd.set_option("display.max_rows", 30)

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# # NLP tools
import spacy

nlp = spacy.load("en_core_web_sm")
# import re
# import nltk
# import fasttext
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences
# from tensorflow.python.keras.preprocessing import sequence, text

### Notebook Functions

##### Processing Functions

In [2]:
def clean_text(text):
    # remove numbers
    text = text.replace("&amp","")
    text = re.sub(r"\d+", "", text)
    # remove punctuation except periods
    text = re.sub(r"[^\w\s\.]", "", text)
    text = text.lower()
    text = text.strip()
    return text

def filter_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    return " ".join(filtered_sentence)

def evaluate_quality_words_over_thresh(text):
    word_tokens = word_tokenize(text)
    return len(word_tokens) > 5

In [3]:
def processing_pipeline_games(weight_groups, df):
    """
    !!!Hard-coded processor!!!
    Takes in weight tuples and a dataframe
    Scales specific dataframe columns to tuples

    Inputs:
    weight_groups: list of weight tuples (x, y)
    df: df to be scaled

    Returns:
    Processed Dataframe"""

    # Whole pipeline with continuous then categorical transformers
    total_pipeline = ColumnTransformer(
        [
            (
                "games_weight_weight",
                MinMaxScaler(feature_range=weight_groups[0]),
                ["GameWeight"],
            ),
            ("avgrating", MinMaxScaler(feature_range=weight_groups[1]), ["AvgRating"]),
            (
                "bayes_weight",
                MinMaxScaler(feature_range=weight_groups[2]),
                ["BayesAvgRating"],
            ),
            (
                "players_weight",
                MinMaxScaler(feature_range=weight_groups[3]),
                ["BestPlayers"],
            ),
            (
                "playtime_weight",
                MinMaxScaler(feature_range=weight_groups[4]),
                ["Playtime"],
            ),
            (
                "remainder_weight",
                MinMaxScaler(feature_range=weight_groups[5]),
                ["Cat:Thematic",
    "Cat:Strategy",
    "Cat:Family",
    "Cat:War",
    "Cat:CGS",
    "Cat:Abstract",
    "Cat:Party",
    "Cat:Childrens",],
            ),
        ]
    )

    # Fit and tranform the pipeline on x_train, then transform x_test
    processed = total_pipeline.fit_transform(df)

    return processed

# Content Based Filtering

## Set Weights

These are the scales for each of these categories. All entries in the category will be scaled to this tuple range by the MinMaxScaler.

In [4]:
### DO NOT TOUCH THESE ARE THE PRODUCTION WEIGHTS!!!!!

games_weight_weight = (-1, 1) # game weight. Is a range, so (-, )
rating_weight = (-.5, .5)
bayes_weight = (-.5, .5) # game weighted rating. Is a range, so (-, )
players_weight = (0, 1) # best players. Is a problematic range due to outliers
playtime_weight = (0, 1) # playtime. Is a range so (-, ). Has high outliers

designers_weight = (0, 1) # designers, binary
families_weight = (0, 0.5) # families, binary
mechanics_weight = (0, .75) # mechanics, binary
subcategories_weight = (0, .75) # other mechanics like card game, print&play. binary
family_weights = (0, 0.5) # game families like pandemic, century. binary
categories_weight = (0, 1) # the five large overarching categories, binary
themes_weight = (0, 0.5) # themes like space, western. binary

In [5]:
weights = {"games_weight": games_weight_weight,
           "rating": rating_weight,
           "bayes": bayes_weight,
           "players": players_weight,
           "playtime": playtime_weight,
           "designers": designers_weight,
           "mechanics": mechanics_weight,
           "subcategories": subcategories_weight,
           "family": family_weights,
           "categories": categories_weight,
           "themes": themes_weight}

In [6]:
"""games_weight_weight = (-1, 1) # game weight. Is a range, so (-, )
rating_weight = (-.5, .5)
bayes_weight = (-.001, .001) # game weighted rating. Is a range, so (-, )
players_weight = (0, 1) # best players. Is a problematic range due to outliers
playtime_weight = (0, 2) # playtime. Is a range so (-, ). Has high outliers
families_weight = (0, 0.5) # families, binary
mechanics_weight = (0, .75) # mechanics, binary
subcategories_weight = (0, .75) # other mechanics like card game, print&play. binary
family_weights = (0, 0.5) # game families like pandemic, century. binary
categories_weight = (0, 1) # the five large overarching categories, binary
themes_weight = (-.15, 0.15) # themes like space, western. binary"""

'games_weight_weight = (-1, 1) # game weight. Is a range, so (-, )\nrating_weight = (-.5, .5)\nbayes_weight = (-.001, .001) # game weighted rating. Is a range, so (-, )\nplayers_weight = (0, 1) # best players. Is a problematic range due to outliers\nplaytime_weight = (0, 2) # playtime. Is a range so (-, ). Has high outliers\nfamilies_weight = (0, 0.5) # families, binary\nmechanics_weight = (0, .75) # mechanics, binary\nsubcategories_weight = (0, .75) # other mechanics like card game, print&play. binary\nfamily_weights = (0, 0.5) # game families like pandemic, century. binary\ncategories_weight = (0, 1) # the five large overarching categories, binary\nthemes_weight = (-.15, 0.15) # themes like space, western. binary'

## Load and Prep Data

In [7]:
how_many_games = 5000

game_data_dir = "data/prod/games/game_dfs_clean"

# Load games
games = pd.read_pickle(f"{game_data_dir}/games_clean.pkl")

# I don't want to deal with every game ever to be honest, so let's reduce.
# Let's just take the top 5000 games by average rating
games = games.sort_values("BayesAvgRating", ascending=False).head(how_many_games).reset_index(drop=True)

games = games.sort_values("BGGId").reset_index(drop=True)

bgg_ids = games["BGGId"].tolist()
bgg_names = games["Name"].tolist()
game_lookup = {value.lower():key for key, value in zip(bgg_ids, bgg_names)}

games['AvgRating'] = games['AvgRating'].round(2)
games['BayesAvgRating'] = games['BayesAvgRating'].round(2)
games['GameWeight'] = games['GameWeight'].round(2)

# determine playtime for each game according to community
games["Playtime"] = 0
games["Playtime"] = games.apply(
    lambda x: np.mean(x["ComMinPlaytime"] + x["ComMaxPlaytime"]), axis=1
)

# set upper cap on playtime
over_6_hours = list(games.loc[games["Playtime"] > 480].index)
games.loc[over_6_hours, "Playtime"] = 480
games.head()

Unnamed: 0,BGGId,Name,Description,ImagePath,NumAlternates,NumExpansions,NumImplementations,IsReimplementation,Rank:boardgame,BestPlayers,GoodPlayers,YearPublished,MinPlayers,MaxPlayers,AvgRating,...,Rank:partygames,Rank:wargames,Rank:thematic,Rank:familygames,Rank:childrensgames,Rank:cgs,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens,Playtime
0,1,Die Macher,Die Macher is a game about seven sequential po...,https://cf.geekdo-images.com/rpwCZAjYLD940NWwP...,2,0,0,0,441,5,"['4', '5']",1986,3,5,7.59,...,28017,28017,28017,28017,28017,28017,0,1,0,0,0,0,0,0,480.0
1,3,Samurai,Samurai is set in medieval Japan. Players comp...,https://cf.geekdo-images.com/o9-sNXmFS_TLAb7Zl...,8,0,1,0,276,3,"['2', '3', '4']",1998,2,4,7.47,...,28017,28017,28017,28017,28017,28017,0,1,0,0,0,1,0,0,90.0
2,5,Acquire,"In Acquire, each player strategically invests ...",https://cf.geekdo-images.com/FfguJeknahk88vKT7...,11,2,0,0,348,4,"['3', '4', '5']",1963,2,6,7.35,...,28017,28017,28017,28017,28017,28017,0,1,0,0,0,0,0,0,180.0
3,7,Cathedral,"In Cathedral, each player has a set of pieces ...",https://cf.geekdo-images.com/78Dgd-b_1_xEv4OXV...,5,0,0,0,2396,2,[],1979,2,2,6.55,...,28017,28017,28017,28017,28017,28017,0,0,0,0,0,1,0,1,40.0
4,9,El Caballero,"Although referred to as a sequel to El Grande,...",https://cf.geekdo-images.com/lWotCtplnl0sI3bS_...,1,0,0,0,3767,3,"['2', '3', '4']",1998,2,4,6.47,...,28017,28017,28017,28017,28017,28017,0,1,0,0,0,0,0,0,180.0


### Weight Scale Datasets

In [8]:
def refine_binary_subset(filename, column, weight_type, thresh):
    df = pd.read_pickle(f"{game_data_dir}/{filename}.pkl")
    df = pd.get_dummies(df, columns=[column], prefix="", prefix_sep="").groupby("BGGId").sum().reset_index()

    # get floor of mechanics presence in catalog (.03% of games)
    df_floor = round(df.shape[0] * thresh)
    # make a list of mechanics more than the floor
    sums = pd.DataFrame(df.sum() >= df_floor)

    # get indices for the mechanics keeping
    keep_df = sums.loc[sums[0] == True].index

    df = df[keep_df]

    df = df[df['BGGId'].isin(bgg_ids)].set_index("BGGId")

    df = df.replace(1, weights[weight_type][1])

    return df.reset_index(names="BGGId").drop(columns=["BGGId"])

In [9]:
scaled_mechanics = refine_binary_subset(filename="mechanics_clean",
                                        column="mechanic",
                                        weight_type="mechanics",
                                        thresh=0.003).astype('int8')
scaled_mechanics.head()

Unnamed: 0,Acting,Action / Dexterity,Action / Event,Action Points,Action Queue,Action Retrieval,Alliances,Area Majority / Influence,Area Movement,Area-Impulse,Auction or Bidding,Betting and Bluffing,Bingo,Campaign / Battle Card Driven,Card Play Conflict Resolution,...,Tech Trees / Tech Tracks,Territory Building,Three Dimensional Movement,Tile Placement,Track Movement,Trading,Traitor Game,Trick-taking,Variable Phase Order,Variable Player Powers,Variable Set-up,Victory Points as a Resource,Voting,Worker Placement,Zone of Control
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
scaled_themes = refine_binary_subset(filename="themes_clean",
                                        column="Theme",
                                        weight_type="themes",
                                        thresh=0.003).astype('int8')
scaled_themes.head()

Unnamed: 0,Adventure,Age of Reason,Alternate History,American Civil War,American Revolutionary War,American West,Ancient,Animals,Anime / Manga,Anthropomorphic Animals,Arabian,Archaeology / Paleontology,Art,Aviation / Flight,Burglary and Heists,...,Spies / Secret Agents,Sports,Superheroes,Time Travel,Trains,Transportation,Travel,Trivia,Video Game Theme,Vietnam War,Vikings,Witches,World War I,World War II,Zombies
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
scaled_subcategories = refine_binary_subset(filename="subcategories_clean",
                                        column="boardgamecategory",
                                        weight_type="subcategories",
                                        thresh=0.003).astype('int8')
scaled_subcategories.head()

Unnamed: 0,Card Game,Collectible Components,Educational,Electronic,Miniatures,Print & Play,Puzzle,Word Game
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0


In [12]:
scaled_designers = refine_binary_subset(filename="designers_clean",
                                        column="boardgamedesigner",
                                        weight_type="designers",
                                        thresh=0.003).astype('int8')
scaled_designers.head()

Unnamed: 0,Alan R. Moon,Alex Randolph,Andrew Looney,Antoine Bauza,Bruno Cathala,Bruno Faidutti,Dean Essig,Eric M. Lang,Frank Chadwick,Friedemann Friese,Frédéric Bey,Günter Burkhardt,Inka Brand,James Ernest,Jim Dunnigan,...,Matt Hyra,Matthew Dunstan,Michael Kiesling,Michael Schacht,Mike Elliott,Prospero Hall,Reiner Knizia,Reinhard Staupe,Richard H. Berg,Rob Daviau,Scott Almes,Steve Jackson (I),Ty Bomba,Uwe Rosenberg,Wolfgang Kramer
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# scaled game families
families = pd.get_dummies(games["Family"]).astype(int)

# get floor of mechanics presence in catalog (.01% of games)
families_floor = round(families.shape[0] * 0.001)

# make a list of themes more than the floor
sums = pd.DataFrame(families.sum() >= families_floor)

# get indices for the mechanics keeping
keep_families = sums.loc[sums[0] == True].index

families = families[keep_families]

scaled_families = families.replace(1, weights["family"][1]).astype('int8')

scaled_families.head()

Unnamed: 0,18xx,3M Bookshelf,6 nimmt!,Aeon's End,Agricol,Alea Big Box,Alea Medium Box,Alhambr,Ascension Deck Building,Axis & Allies (Avalon Hill),Azul,BANG!,Betrayal (Avalon Hill),Big in Japan (AEG),Blood Bowl,...,Tiny Epic (Gamelyn Games),Treefrog Line (Warfrog),Two-player games (Kosmos),Two-player games (Space Cowboys),Ugly Animals (Drei Magier),Undaunted (Osprey Games),Unlock! (Space Cowboys),Unmatched,War of the Ring (Nexus/Ares Games),Werewolf / Mafi,Wings of Glory,Yellow Wave Box (Queen),Ystari original,ZOC Bond System,Zombicid
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Master CBF Frame

In [14]:
# # instantiate MissForest imputer and fill all nans in scaled_games
# imputer = KNNImputer(n_neighbors=5)
# scaled_games = pd.DataFrame(
#     imputer.fit_transform(scaled_games), columns=games_included_columns
# )

# include these columns for comparison
games_included_columns = [
    "GameWeight",
    "AvgRating",
    "BayesAvgRating",
    "BestPlayers",
    "Playtime",
    "Cat:Thematic",
    "Cat:Strategy",
    "Cat:Family",
    "Cat:War",
    "Cat:CGS",
    "Cat:Abstract",
    "Cat:Party",
    "Cat:Childrens",
]

games_reduced = games[games_included_columns]

games_reduced.head()

Unnamed: 0,GameWeight,AvgRating,BayesAvgRating,BestPlayers,Playtime,Cat:Thematic,Cat:Strategy,Cat:Family,Cat:War,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
0,4.31,7.59,7.03,5,480.0,0,1,0,0,0,0,0,0
1,2.45,7.47,7.22,3,90.0,0,1,0,0,0,1,0,0
2,2.49,7.35,7.13,4,180.0,0,1,0,0,0,0,0,0
3,1.79,6.55,6.12,2,40.0,0,0,0,0,0,1,0,1
4,2.98,6.47,5.88,3,180.0,0,1,0,0,0,0,0,0


In [15]:
# set up weight groups for hard coded pipeline
weight_groups = [
    games_weight_weight,
    rating_weight,
    bayes_weight,
    players_weight,
    playtime_weight,
    categories_weight,
]

# process scaled_games with pipeline
scaled_games = pd.DataFrame(
    processing_pipeline_games(weight_groups, games_reduced), columns=games_included_columns
)

scaled_games.head()

Unnamed: 0,GameWeight,AvgRating,BayesAvgRating,BestPlayers,Playtime,Cat:Thematic,Cat:Strategy,Cat:Family,Cat:War,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
0,0.732984,0.003145,-0.022727,0.3125,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.240838,-0.034591,0.049242,0.1875,0.1875,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.219895,-0.072327,0.015152,0.25,0.375,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.586387,-0.323899,-0.367424,0.125,0.083333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.036649,-0.349057,-0.458333,0.1875,0.375,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# put together master dataframe with other already processed dataframes
master_games = pd.concat(
    (
        scaled_games,
        scaled_themes,
        scaled_mechanics,
        scaled_families,
        scaled_designers,
        scaled_subcategories
    ),
    axis=1,
)  # , description_vectors, word_vectors

# put game id on master_games DF
master_games["BGGId"] = bgg_ids

# set index to id
master_games = master_games.set_index("BGGId")

# fill nans with 0
master_games = master_games.fillna(0)

master_games.head()

Unnamed: 0_level_0,GameWeight,AvgRating,BayesAvgRating,BestPlayers,Playtime,Cat:Thematic,Cat:Strategy,Cat:Family,Cat:War,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens,Adventure,Age of Reason,...,Richard H. Berg,Rob Daviau,Scott Almes,Steve Jackson (I),Ty Bomba,Uwe Rosenberg,Wolfgang Kramer,Card Game,Collectible Components,Educational,Electronic,Miniatures,Print & Play,Puzzle,Word Game
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
1,0.732984,0.003145,-0.022727,0.3125,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.240838,-0.034591,0.049242,0.1875,0.1875,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.219895,-0.072327,0.015152,0.25,0.375,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.586387,-0.323899,-0.367424,0.125,0.083333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.036649,-0.349057,-0.458333,0.1875,0.375,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
master_games

Unnamed: 0_level_0,GameWeight,AvgRating,BayesAvgRating,BestPlayers,Playtime,Cat:Thematic,Cat:Strategy,Cat:Family,Cat:War,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens,Adventure,Age of Reason,...,Richard H. Berg,Rob Daviau,Scott Almes,Steve Jackson (I),Ty Bomba,Uwe Rosenberg,Wolfgang Kramer,Card Game,Collectible Components,Educational,Electronic,Miniatures,Print & Play,Puzzle,Word Game
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
1,0.732984,0.003145,-0.022727,0.3125,1.000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.240838,-0.034591,0.049242,0.1875,0.187500,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.219895,-0.072327,0.015152,0.2500,0.375000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.586387,-0.323899,-0.367424,0.1250,0.083333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.036649,-0.349057,-0.458333,0.1875,0.375000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420087,-1.000000,-0.094340,-0.473485,0.3125,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
420805,0.204188,0.031447,-0.431818,0.1875,0.375000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
421006,-0.460733,0.286164,0.075758,0.1250,0.156250,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
422332,-0.303665,0.194969,-0.469697,0.1250,0.125000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
master_games.to_pickle("master_games_scaled.pkl")

In [19]:
# clean up

del scaled_mechanics
del scaled_families
del scaled_designers
del scaled_games
del scaled_subcategories
del scaled_themes

gc.collect()

1330

## Item Similarity via Cosine Distance

In [20]:
{x:y for x, y in game_lookup.items() if "haven" in x}

{'gloomhaven': 174430,
 'haven': 205127,
 'founders of gloomhaven': 214032,
 'gloomhaven: jaws of the lion': 291457,
 'frosthaven': 295770,
 'hidden games crime scene: the new haven case': 304847,
 'gloomhaven: buttons & bugs': 393672}

In [21]:
master_games = pd.read_pickle("exploratory_notebooks/master_games_scaled.pkl")
master_games.head()

Unnamed: 0_level_0,GameWeight,AvgRating,BayesAvgRating,BestPlayers,Playtime,Cat:Thematic,Cat:Strategy,Cat:Family,Cat:War,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens,Adventure,Age of Reason,...,Richard H. Berg,Rob Daviau,Scott Almes,Steve Jackson (I),Ty Bomba,Uwe Rosenberg,Wolfgang Kramer,Card Game,Collectible Components,Educational,Electronic,Miniatures,Print & Play,Puzzle,Word Game
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
1,0.732984,0.003145,-0.022727,0.3125,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.240838,-0.034591,0.049242,0.1875,0.1875,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,-0.219895,-0.072327,0.015152,0.25,0.375,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,-0.586387,-0.323899,-0.367424,0.125,0.083333,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.036649,-0.349057,-0.458333,0.1875,0.375,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# get cosine similarities!
cosine_sims = cosine_similarity(master_games)

# do similarities by game id
sims_byid = pd.DataFrame(cosine_sims, columns=bgg_ids)
sims_byid["Game_Id"] = bgg_ids
sims_byid = sims_byid.set_index("Game_Id", drop=True)

sims_byid = sims_byid.round(2)

sims_byid = sims_byid.replace(1.00, 0)
sims_byid.head()

Unnamed: 0_level_0,1,3,5,7,9,10,11,12,13,14,15,16,18,19,20,...,414317,415147,415776,415945,416851,417197,418059,419195,419704,420077,420087,420805,421006,422332,422732
Game_Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
1,0.0,0.37,0.71,-0.1,0.57,0.02,-0.07,0.44,0.41,0.3,0.5,0.53,0.43,-0.08,0.81,...,-0.09,-0.15,0.58,-0.14,-0.17,0.49,0.9,-0.14,0.81,-0.01,-0.29,0.57,0.46,-0.05,-0.31
3,0.37,0.0,0.59,0.35,0.39,0.07,0.44,0.44,0.37,0.42,0.43,0.54,0.37,0.09,0.51,...,0.08,0.1,0.38,0.09,0.08,0.54,0.47,0.14,0.47,0.06,0.14,0.03,0.57,0.08,0.12
5,0.71,0.59,0.0,0.1,0.67,0.15,0.17,0.72,0.63,0.68,0.73,0.92,0.62,0.18,0.89,...,0.15,0.21,0.69,0.17,0.14,0.89,0.81,0.28,0.82,0.13,0.25,0.19,0.9,0.18,0.2
7,-0.1,0.35,0.1,0.0,0.1,0.16,0.17,0.06,0.09,0.19,0.13,0.21,0.09,0.27,0.07,...,0.1,0.37,0.11,0.24,0.21,0.19,-0.07,0.38,0.03,0.18,0.38,0.07,0.08,0.27,0.36
9,0.57,0.39,0.67,0.1,0.0,0.14,0.07,0.44,0.43,0.52,0.56,0.72,0.44,0.21,0.74,...,-0.02,0.28,0.5,0.11,0.09,0.67,0.64,0.24,0.71,0.15,0.17,0.37,0.51,0.21,0.14


In [11]:
for item in np.arange(0, len(bgg_names), 1):
    bgg_names[item] = re.sub("[^A-Za-z0-9\s]+", "", bgg_names[item])

In [12]:
sims_byname = sims_byid.copy()

lowercase_bgg_names = [x.lower() for x in bgg_names]
sims_byname = sims_byname.set_axis(lowercase_bgg_names, axis=1).set_axis(lowercase_bgg_names, axis=0)
sims_byname.head()

Unnamed: 0,die macher,samurai,acquire,cathedral,el caballero,elfenland,bohnanza,ra,catan,basari,cosmic encounter,marracash,roborally,wacky wacky west,full metal plante,...,harmonies,spectacular,kingdom legacy feudal kingdom,unmatched slings and arrows,castle combo,rebirth,seti search for extraterrestrial intelligence,fishing,phoenix new horizon,the mandalorian adventures,flip 7,black forest,the lord of the rings duel for middleearth,star wars the deckbuilding game clone wars edition,agent avenue
die macher,0.0,0.37,0.71,-0.1,0.57,0.02,-0.07,0.44,0.41,0.3,0.5,0.53,0.43,-0.08,0.81,...,-0.09,-0.15,0.58,-0.14,-0.17,0.49,0.9,-0.14,0.81,-0.01,-0.29,0.57,0.46,-0.05,-0.31
samurai,0.37,0.0,0.59,0.35,0.39,0.07,0.44,0.44,0.37,0.42,0.43,0.54,0.37,0.09,0.51,...,0.08,0.1,0.38,0.09,0.08,0.54,0.47,0.14,0.47,0.06,0.14,0.03,0.57,0.08,0.12
acquire,0.71,0.59,0.0,0.1,0.67,0.15,0.17,0.72,0.63,0.68,0.73,0.92,0.62,0.18,0.89,...,0.15,0.21,0.69,0.17,0.14,0.89,0.81,0.28,0.82,0.13,0.25,0.19,0.9,0.18,0.2
cathedral,-0.1,0.35,0.1,0.0,0.1,0.16,0.17,0.06,0.09,0.19,0.13,0.21,0.09,0.27,0.07,...,0.1,0.37,0.11,0.24,0.21,0.19,-0.07,0.38,0.03,0.18,0.38,0.07,0.08,0.27,0.36
el caballero,0.57,0.39,0.67,0.1,0.0,0.14,0.07,0.44,0.43,0.52,0.56,0.72,0.44,0.21,0.74,...,-0.02,0.28,0.5,0.11,0.09,0.67,0.64,0.24,0.71,0.15,0.17,0.37,0.51,0.21,0.14


In [25]:
# save to pickles, we really only need the id one
sims_byid.to_pickle(f"{game_data_dir}/game_cosine_similarity_byid.pkl")
sims_byname.to_pickle(f"{game_data_dir}/game_cosine_similarity_byname.pkl")

In [None]:
del sims_byname
del sims_byid
gc.collect()

### CHECK GAME HERE

In [40]:
# save to pickles, we really only need the id one
# sims_byid = pd.read_pickle("{game_data_dir}/game_cosine_similarity_byid.pkl")
sims_byname = pd.read_pickle(
    f"{game_data_dir}/game_cosine_similarity_byname.pkl"
)

In [None]:
# find any entries with a particular string in the index or column name
def find_string_in_index_or_column(df, string):
    df =  df[df.index.str.contains(string, case=False, na=False)]
    columns=list(df.columns[df.columns.str.contains(string, case=False, na=False)])
    return df[columns]

find_string_in_index_or_column(sims_byname, "haven")


This is why we made the name one

In [None]:
test_games = ["dominion", "gloomhaven", "pandemic", "splendor", "viticulture essential edition", "great western trail", "terraforming mars", "chess", "azul", "codenames"]

test_dict = {}

for item in test_games:
    test_dict[item] = list(sims_byname[item].sort_values(ascending=False)[:10].index)
    test_dict[f"{item[:3]}_sim"] = list(sims_byname[item].sort_values(ascending=False)[:10])

pd.DataFrame(test_dict)

In [None]:
break

# Clean game descriptions for critical components

In [None]:
# gather entire corpus of game descriptions

# games = pd.read_pickle(f"{game_data_dir}/games_clean.pkl")
# games.head()

In [None]:
len(games)

In [9]:
description_freq = {}

def tokenize_description(one_row):

    if type(one_row['Description']) == float:
        return None
    one_row_desc = one_row["Description"]
    one_row_title = one_row["Name"]
    title_tokens = word_tokenize(one_row_title)
    title_tokens = [word.lower() for word in title_tokens if word.isalpha()]

    

    description = filter_stopwords(clean_text(one_row_desc)).replace(" ."," ").replace("  "," ").strip()
    description = word_tokenize(description)
    description = [word for word in description if word not in title_tokens]

    description = [PorterStemmer().stem(word) for word in description]

    for word in description:
        if word in description_freq:
            description_freq[word] += 1
        else:
            description_freq[word] = 1

    return " ".join(description)


In [None]:
games['cleaned_description'] = games.apply(tokenize_description, axis=1)
games.head()

In [None]:
descriptions = games[games['cleaned_description'].notna()][['BGGId','cleaned_description']]
descriptions.head()

In [None]:


# prepare the vectorizer with the chosen parameters
tfid_proc = TfidfVectorizer(
    sublinear_tf=True,
    use_idf=True,
    max_df=0.1,
    min_df=0.01,
    ngram_range=(1, 2),
    max_features=1000,
)

# fit the vectorizer to the descriptions
word_vectors = tfid_proc.fit_transform(descriptions["cleaned_description"])

# cast the vector array to a data frame with columns named by the features selected by the vectorizer
word_vectors_df = pd.DataFrame(
    word_vectors.toarray(), columns=tfid_proc.get_feature_names_out())

word_vectors_df.head()

In [None]:
len(word_vectors_df)

In [None]:
games_df_with_word_vectors = pd.concat([descriptions, word_vectors_df], axis=1).drop(columns=['cleaned_description'])
games_df_with_word_vectors.head()

In [None]:
games = games.merge(games_df_with_word_vectors, on='BGGId', how='left')
games.head()