In [11]:
import pandas as pd
import numpy as np
from config import CONFIGS
import os
import os

from utils.processing_functions import explode_columnar_df
from utils.weaviate_client import WeaviateClient

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_distances

ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]
IS_LOCAL = True if os.environ.get("IS_LOCAL", "False").lower() == "true" else False

read_dir = "data/prod/games/game_dfs_clean/"
sims_dir = "data/prod/games/similarity_files/"

weaviate_client = WeaviateClient()
weaviate_client.connect_weaviate_client_docker()


            Please make sure to close the connection using `client.close()`.


<weaviate.client.WeaviateClient at 0x12ea6eba0>

In [12]:
def get_closet_picks(df, game_id, model, n_picks=10):
    uuid = df.loc[df['bggid']==game_id]['UUID'].values[0]

    similars = weaviate_client.find_near_objects(collection_name=model, uuid=uuid, limit=50)

    picks = {}

    for item in similars:
        picks[str(item.uuid)]=item.metadata.distance
        
    picks = pd.DataFrame.from_dict(picks, columns=['distance'], orient='index').sort_values(by='distance', ascending=True).reset_index().rename(columns={'index':'UUID'})

    picks = picks.merge(df, on='UUID', how='inner')[1:]

    return picks[['bggid','distance']].head(n_picks)

In [13]:
def build_results_for_game(game_name):
    game_id = game_id_map[game_name]
    print(f"\nEvaluation: {game_name} {game_id}")

    single_game_results = {}

    for model, model_df in arrays_df_storage.items():
        print(f"Model: {model}")

        picks = model_df[game_id].sort_values(ascending=True)[1:500]

        single_game_results[model] = picks

    for collection_name, model_df in models_df_storage.items():
        print(f"Model: {collection_name}")

        picks = get_closet_picks(df=model_df, game_id=game_id, model=collection_name, n_picks=50)

        single_game_results[collection_name] = {x:y for x,y in zip(picks['bggid'], picks['distance'])}

    results_df = pd.DataFrame(single_game_results)

    scaler = MinMaxScaler(feature_range=(0,1))
    for col in results_df.columns:
        results_df[col] = scaler.fit_transform(results_df[[col]])

    results_df = results_df.fillna(1).round(3)
    results_df['average_score'] = results_df.mean(axis=1).round(2)

    results_df['name'] = results_df.index.map(id_game_map)

    return results_df.sort_values("average_score", ascending=True).head(20)

# Content Similarity Models

### File Setup for All Models

In [14]:
games_df = pd.read_pickle(f"{read_dir}games_clean.pkl")
games_df = games_df.sort_values("BayesAvgRating", ascending=False)[
            0 : 1000
        ].reset_index(drop=True)
games_df.head(2)

Unnamed: 0,BGGId,Name,Description,ImagePath,NumAlternates,NumExpansions,NumImplementations,IsReimplementation,Rank:boardgame,BestPlayers,...,Rank:childrensgames,Rank:cgs,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
0,224517,Brass: Birmingham,Brass: Birmingham is an economic strategy game...,https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg...,6,0,1,1,1,3,...,28017,28017,0,1,0,0,0,0,0,0
1,161936,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,https://cf.geekdo-images.com/-Qer2BBPG7qGGDu6K...,11,0,2,1,2,4,...,28017,28017,1,1,0,0,0,0,0,0


In [15]:
id_game_map = {x:y for x,y in zip(games_df['BGGId'],games_df['Name'])}
game_id_map = {y:x for x,y in zip(games_df['BGGId'],games_df['Name'])}
bgg_ids = games_df['BGGId'].values
relevant_df = games_df[['BGGId']]
relevant_df.head(2)

Unnamed: 0,BGGId
0,224517
1,161936


In [16]:
test_games = ["Dominion", "Gloomhaven", "Pandemic", "Splendor", "Viticulture Essential Edition", "Great Western Trail", "Terraforming Mars", "Chess", "Azul", "Codenames", "Pandemic Legacy: Season 1"]

models_df_storage = {}
arrays_df_storage = {}

game_name = test_games[-1]
game_id = int(games_df[games_df['Name']==game_name]['BGGId'].values[0])
game_name, game_id

('Pandemic Legacy: Season 1', 161936)

In [17]:
rag_cleaned_sentences_df = pd.read_pickle(f'{sims_dir}top_1000_cleaned_rag.pkl')
rag_cleaned_sentences_df = relevant_df.merge(rag_cleaned_sentences_df, on="BGGId", how="left")
rag_cleaned_sentences_df = rag_cleaned_sentences_df.drop(columns=["Positive_Components","Positive_Sentences","Negative_Components","Negative_Sentences"], axis=1)
rag_cleaned_sentences_df.columns = [x.lower() for x in rag_cleaned_sentences_df.columns]
rag_cleaned_sentences_df.head(2)

Unnamed: 0,bggid,description,about
0,224517,brass birmingham economic strategy game sequel...,game strategic economic simulation set industr...
1,161936,pandemic legacy cooperative campaign game over...,game cooperative board game players work toget...


In [18]:
all_rag_attributes_df = pd.read_pickle(f"{sims_dir}top_1000_cleaned_rag_with_ratings_extrap.pkl")
all_rag_attributes_df = all_rag_attributes_df.drop(columns=["Description","About","Positive_Components","Negative_Components","Positive_Sentences","Negative_Sentences"])
all_rag_attributes_df.columns = all_rag_attributes_df.columns.str.replace(" ","_")
all_rag_attributes_df.columns = [x.lower() for x in all_rag_attributes_df.columns]
all_rag_attributes_df = rag_cleaned_sentences_df.merge(all_rag_attributes_df, on="bggid", how="left")
all_rag_attributes_df.head(2)

Unnamed: 0,bggid,description,about,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_thematic_integration,positive_component_quality,negative_steep_learning_curve,...,negative_anticlimactic_endings,positive_excellent_production_values,positive_attractive_artwork,positive_accessible_non_gamers,positive_variable_experiences,negative_catch_up_mechanics,positive_simple_to_learning,negative_action_limitations,positive_good_themes,positive_good_interactions
0,224517,brass birmingham economic strategy game sequel...,game strategic economic simulation set industr...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,161936,pandemic legacy cooperative campaign game over...,game cooperative board game players work toget...,0.834,0.739,0.848,0.951,0.746,0.683,0.843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
rag_pros_cons_only_df = all_rag_attributes_df.drop(columns=["about","description"],axis=1)
rag_pros_cons_only_df = rag_pros_cons_only_df.fillna(0).set_index("bggid")
rag_pros_cons_only_df.columns = [x.lower() for x in rag_pros_cons_only_df.columns]
rag_pros_cons_only_df.head(2)

Unnamed: 0_level_0,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_thematic_integration,positive_component_quality,negative_steep_learning_curve,negative_fiddly_mechanics,negative_variable_enjoyment,negative_lengthy_playtime,...,negative_anticlimactic_endings,positive_excellent_production_values,positive_attractive_artwork,positive_accessible_non_gamers,positive_variable_experiences,negative_catch_up_mechanics,positive_simple_to_learning,negative_action_limitations,positive_good_themes,positive_good_interactions
bggid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
224517,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161936,0.834,0.739,0.848,0.951,0.746,0.683,0.843,0.78,0.729,0.789,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
positive_columns = [x for x in rag_pros_cons_only_df.columns if x.startswith('positive_')]
negative_columns = [x for x in rag_pros_cons_only_df.columns if x.startswith('negative_')]

all_columns_to_produce = positive_columns + negative_columns
all_columns_to_produce[:5]

['positive_strategic_depth',
 'positive_excellent_design',
 'positive_player_interaction',
 'positive_replayability',
 'positive_thematic_integration']

## Array Models - Themes, Categories, Mechanics, Mechanics, Pros-Cons

### Themes

In [21]:
model = "themes"

themes_df = pd.read_pickle(f"{read_dir}themes_clean.pkl")
themes_df = explode_columnar_df(themes_df)
themes_df = relevant_df.merge(themes_df, on='BGGId', how='left').set_index('BGGId')
themes_df = themes_df.astype(float).replace(0,0.01)
themes_df = themes_df.fillna(0.01)

cosine_sims = cosine_distances(themes_df).round(3)

sims_byid = pd.DataFrame(cosine_sims, columns=bgg_ids, index=bgg_ids)

arrays_df_storage[model] = sims_byid

results = [id_game_map[x] for x in (sims_byid[game_id].sort_values(ascending=True)[:10].index)]
results

['Pandemic',
 'Pandemic: The Cure',
 'Pandemic: Rising Tide',
 'Dice Hospital',
 'Codex Naturalis',
 'Iberia',
 'Pandemic Legacy: Season 2',
 'Terraforming Mars',
 'Tang Garden']

Sanity Checks

In [22]:
results_ids = [game_id_map[x] for x,y in game_id_map.items() if x in results]
tester_df = themes_df[themes_df.index.isin(results_ids)]
matching_columns = [x for x in tester_df.columns if tester_df[x].sum()>(tester_df.shape[0]*.01 + .001)]
only_matches = tester_df[matching_columns]
only_matches['name'] = only_matches.index.map(id_game_map)
names = only_matches['name'].values
only_matches

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_matches['name'] = only_matches.index.map(id_game_map)


Unnamed: 0_level_0,Economic,Environmental,Gardening,Industry / Manufacturing,Medical,Post-Apocalyptic,Post-Napoleonic,Scienc,Science Fiction,Space Exploration,Trains,name
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
167791,1.0,1.0,0.01,1.0,0.01,0.01,0.01,1.0,1.0,1.0,0.01,Terraforming Mars
221107,0.01,1.0,0.01,0.01,1.0,1.0,0.01,0.01,1.0,0.01,0.01,Pandemic Legacy: Season 2
198928,0.01,0.01,0.01,0.01,1.0,0.01,1.0,1.0,0.01,0.01,1.0,Iberia
30549,0.01,0.01,0.01,0.01,1.0,0.01,0.01,1.0,0.01,0.01,0.01,Pandemic
150658,0.01,1.0,0.01,0.01,1.0,0.01,0.01,0.01,0.01,0.01,0.01,Pandemic: The Cure
218121,0.01,0.01,0.01,0.01,1.0,0.01,0.01,0.01,0.01,0.01,0.01,Dice Hospital
252153,0.01,1.0,1.0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,Tang Garden
234671,0.01,1.0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,Pandemic: Rising Tide
314503,0.01,1.0,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,0.01,Codex Naturalis


In [23]:
pd.DataFrame(cosine_distances(tester_df[matching_columns]), columns=names, index=names)

Unnamed: 0,Terraforming Mars,Pandemic Legacy: Season 2,Iberia,Pandemic,Pandemic: The Cure,Dice Hospital,Tang Garden,Pandemic: Rising Tide,Codex Naturalis
Terraforming Mars,0.0,0.579497,0.779534,0.69397,0.69397,0.971275,0.69397,0.571367,0.571367
Pandemic Legacy: Season 2,0.579497,0.0,0.734946,0.632207,0.285798,0.484953,0.632207,0.484953,0.484953
Iberia,0.779534,0.734946,0.0,0.285798,0.632207,0.484953,0.978617,0.974715,0.974715
Pandemic,0.69397,0.632207,0.285798,0.0,0.48983,0.285704,0.979659,0.978237,0.978237
Pandemic: The Cure,0.69397,0.285798,0.632207,0.48983,0.0,0.285704,0.48983,0.285704,0.285704
Dice Hospital,0.971275,0.484953,0.484953,0.285704,0.285704,0.0,0.978237,0.979121,0.979121
Tang Garden,0.69397,0.632207,0.978617,0.979659,0.48983,0.978237,0.0,0.285704,0.285704
Pandemic: Rising Tide,0.571367,0.484953,0.974715,0.978237,0.285704,0.979121,0.285704,0.0,0.0
Codex Naturalis,0.571367,0.484953,0.974715,0.978237,0.285704,0.979121,0.285704,0.0,0.0


### Categories Only

In [24]:
# TO DO: Add this to the game cleaning script

# category_df = df[['BGGId','Cat:Thematic', 'Cat:Strategy', 'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract', 'Cat:Party', 'Cat:Childrens']]
# subcats_df = pd.read_pickle(f"{read_dir}subcategories_clean.pkl")
# subcats_df = explode_columnar_df(subcats_df).reset_index()
# category_df = category_df.merge(subcats_df, on="BGGId", how="left")
# category_df = category_df.set_index("BGGId")
# category_df.to_pickle(f"{read_dir}categories_clean.pkl")
# category_df.head()

In [25]:
model = "categories"

cat_df = pd.read_pickle(f"{read_dir}categories_clean.pkl")
cat_df = relevant_df.merge(cat_df, on='BGGId', how='left').set_index('BGGId')
cat_df = cat_df.astype(float).replace(0,0.01)
cat_df = cat_df.fillna(0.01)

cosine_sims = cosine_distances(cat_df).round(3)

sims_byid = pd.DataFrame(cosine_sims, columns=bgg_ids, index=bgg_ids)

arrays_df_storage[model] = sims_byid

results = [id_game_map[x] for x in (sims_byid[game_id].sort_values(ascending=True)[:10].index)]
results


['Too Many Bones',
 'Too Many Bones: Undertow',
 'Abomination: The Heir of Frankenstein',
 'Dungeon Petz',
 'Tiny Epic Dungeons',
 'Reign of Cthulhu',
 'Pandemic Legacy: Season 0',
 'Dune',
 'Pandemic Legacy: Season 2']

Sanity Checks

In [26]:
results_ids = [game_id_map[x] for x,y in game_id_map.items() if x in results]
tester_df = cat_df[cat_df.index.isin(results_ids)]
matching_columns = [x for x in tester_df.columns if tester_df[x].sum()>(tester_df.shape[0]*.01 + .001)]
only_matches = tester_df[matching_columns]
only_matches['name'] = only_matches.index.map(id_game_map)
names = only_matches['name'].values
only_matches

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_matches['name'] = only_matches.index.map(id_game_map)


Unnamed: 0_level_0,Cat:Thematic,Cat:Strategy,name
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
192135,1.0,1.0,Too Many Bones
314040,1.0,1.0,Pandemic Legacy: Season 0
221107,1.0,1.0,Pandemic Legacy: Season 2
235802,1.0,1.0,Too Many Bones: Undertow
97207,1.0,1.0,Dungeon Petz
121,1.0,1.0,Dune
192153,1.0,1.0,Reign of Cthulhu
331787,1.0,1.0,Tiny Epic Dungeons
239472,1.0,1.0,Abomination: The Heir of Frankenstein


In [27]:
pd.DataFrame(cosine_distances(tester_df[matching_columns]), columns=names, index=names)

Unnamed: 0,Too Many Bones,Pandemic Legacy: Season 0,Pandemic Legacy: Season 2,Too Many Bones: Undertow,Dungeon Petz,Dune,Reign of Cthulhu,Tiny Epic Dungeons,Abomination: The Heir of Frankenstein
Too Many Bones,0.0,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16
Pandemic Legacy: Season 0,2.220446e-16,0.0,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16
Pandemic Legacy: Season 2,2.220446e-16,2.220446e-16,0.0,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16
Too Many Bones: Undertow,2.220446e-16,2.220446e-16,2.220446e-16,0.0,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16
Dungeon Petz,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,0.0,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16
Dune,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,0.0,2.220446e-16,2.220446e-16,2.220446e-16
Reign of Cthulhu,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,0.0,2.220446e-16,2.220446e-16
Tiny Epic Dungeons,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,0.0,2.220446e-16
Abomination: The Heir of Frankenstein,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,2.220446e-16,0.0


### Mechanics Only

In [28]:
model = "mechanics"

mech_df = pd.read_pickle(f"{read_dir}mechanics_clean.pkl")
mech_df = explode_columnar_df(mech_df)
mech_df = relevant_df.merge(mech_df, on='BGGId', how='left').set_index('BGGId')
mech_df = mech_df.astype(float).replace(0,0.01)
mech_df = mech_df.fillna(0.01)

cosine_sims = cosine_distances(mech_df).round(3)

sims_byid = pd.DataFrame(cosine_sims, columns=bgg_ids, index=bgg_ids)

arrays_df_storage[model] = sims_byid

results = [id_game_map[x] for x in (sims_byid[game_id].sort_values(ascending=True)[:10].index)]
results

['Pandemic Legacy: Season 0',
 'Pandemic Legacy: Season 2',
 'Reign of Cthulhu',
 'Pandemic',
 'Iberia',
 'Star Wars: The Clone Wars',
 'Pandemic: Rising Tide',
 'Defenders of the Realm',
 'Fall of Rome']

Sanity Checks

In [29]:
results_ids = [game_id_map[x] for x,y in game_id_map.items() if x in results]
tester_df = mech_df[mech_df.index.isin(results_ids)]
matching_columns = [x for x in tester_df.columns if tester_df[x].sum()>(tester_df.shape[0]*.01 + .001)]
only_matches = tester_df[matching_columns]
only_matches['name'] = only_matches.index.map(id_game_map)
names = only_matches['name'].values
only_matches

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_matches['name'] = only_matches.index.map(id_game_map)


Unnamed: 0_level_0,Action Points,Area Movement,Cooperative Game,Dice Rolling,Die Icon Resolution,Events,Exploration,Hand Management,Legacy Game,Network and Route Building,Pick-up and Deliver,Point to Point Movement,Scenario / Mission / Campaign Game,Set Collection,Solo / Solitaire Game,Tags,Trading,Variable Player Powers,name
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
314040,1.0,0.01,1.0,0.01,0.01,0.01,0.01,1.0,1.0,0.01,0.01,1.0,1.0,1.0,0.01,1.0,1.0,1.0,Pandemic Legacy: Season 0
221107,1.0,0.01,1.0,0.01,0.01,0.01,1.0,1.0,1.0,0.01,1.0,1.0,1.0,1.0,0.01,1.0,1.0,1.0,Pandemic Legacy: Season 2
198928,1.0,0.01,1.0,0.01,0.01,0.01,0.01,1.0,0.01,1.0,0.01,1.0,0.01,1.0,1.0,0.01,1.0,1.0,Iberia
30549,1.0,0.01,1.0,0.01,0.01,0.01,0.01,1.0,0.01,0.01,0.01,1.0,0.01,1.0,0.01,1.0,0.01,1.0,Pandemic
260428,1.0,0.01,1.0,1.0,0.01,1.0,0.01,1.0,0.01,0.01,0.01,1.0,0.01,1.0,1.0,0.01,0.01,1.0,Fall of Rome
192153,1.0,0.01,1.0,0.01,0.01,0.01,0.01,1.0,0.01,0.01,0.01,1.0,0.01,1.0,0.01,0.01,1.0,1.0,Reign of Cthulhu
370913,1.0,0.01,1.0,0.01,1.0,0.01,0.01,1.0,0.01,0.01,0.01,1.0,0.01,1.0,1.0,0.01,0.01,1.0,Star Wars: The Clone Wars
65532,1.0,0.01,1.0,1.0,0.01,0.01,0.01,1.0,0.01,0.01,0.01,1.0,0.01,0.01,0.01,0.01,0.01,1.0,Defenders of the Realm
234671,1.0,1.0,1.0,0.01,0.01,0.01,0.01,1.0,0.01,0.01,0.01,0.01,0.01,1.0,1.0,0.01,1.0,1.0,Pandemic: Rising Tide


In [30]:
pd.DataFrame(cosine_distances(tester_df[matching_columns]), columns=names, index=names)

Unnamed: 0,Pandemic Legacy: Season 0,Pandemic Legacy: Season 2,Iberia,Pandemic,Fall of Rome,Reign of Cthulhu,Star Wars: The Clone Wars,Defenders of the Realm,Pandemic: Rising Tide
Pandemic Legacy: Season 0,0.0,0.085308,0.256868,0.159758,0.360171,0.159758,0.322474,0.346758,0.322474
Pandemic Legacy: Season 2,0.085308,0.0,0.319701,0.230796,0.414005,0.230796,0.379476,0.401332,0.379476
Iberia,0.256868,0.319701,0.0,0.239029,0.217778,0.115563,0.171505,0.312776,0.171505
Pandemic,0.159758,0.230796,0.239029,0.0,0.239029,0.139992,0.194201,0.223838,0.325154
Fall of Rome,0.360171,0.414005,0.217778,0.239029,0.0,0.239029,0.171505,0.179422,0.286998
Reign of Cthulhu,0.159758,0.230796,0.115563,0.139992,0.239029,0.0,0.194201,0.223838,0.194201
Star Wars: The Clone Wars,0.322474,0.379476,0.171505,0.194201,0.171505,0.194201,0.0,0.272527,0.244994
Defenders of the Realm,0.346758,0.401332,0.312776,0.223838,0.179422,0.223838,0.272527,0.0,0.413969
Pandemic: Rising Tide,0.322474,0.379476,0.171505,0.325154,0.286998,0.194201,0.244994,0.413969,0.0


### Pros Cons

In [31]:
model = "pros_cons_only"

cosine_sims = cosine_distances(rag_pros_cons_only_df).round(3)

sims_byid = pd.DataFrame(cosine_sims, columns=bgg_ids, index=bgg_ids)

arrays_df_storage[model] = sims_byid

results = [id_game_map[x] for x in (sims_byid[game_id].sort_values(ascending=True)[:10].index)]
results

['Pandemic Legacy: Season 1',
 'Civilization',
 'Root',
 'Sherlock Holmes Consulting Detective: The Thames Murders & Other Cases',
 'The Resistance',
 'Sentinels of the Multiverse',
 'Shadows of Brimstone: City of the Ancients',
 'T.I.M.E Stories',
 'Zombicide: Black Plague',
 'Blood Bowl: Team Manager – The Card Game']

Sanity Checks

In [32]:
results_ids = [game_id_map[x] for x,y in game_id_map.items() if x in results]
tester_df = rag_pros_cons_only_df[rag_pros_cons_only_df.index.isin(results_ids)]
matching_columns = [x for x in tester_df.columns if tester_df[x].sum()>(tester_df.shape[0]*.01 + .001)]
only_matches = tester_df[matching_columns]
only_matches['name'] = only_matches.index.map(id_game_map)
names = only_matches['name'].values
only_matches

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  only_matches['name'] = only_matches.index.map(id_game_map)


Unnamed: 0_level_0,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_thematic_integration,positive_component_quality,negative_steep_learning_curve,negative_fiddly_mechanics,negative_variable_enjoyment,negative_lengthy_playtime,...,negative_anticlimactic_raids,negative_box_size,negative_length_of_campaign,negative_cold_feel,positive_engaging_for_groups,positive_complex_deductions,negative_lengthy_delivery,negative_complex_growth,negative_initial_impressions,name
bggid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
161936,0.834,0.739,0.848,0.951,0.746,0.683,0.843,0.78,0.729,0.789,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Pandemic Legacy: Season 1
237182,0.916,0.76,0.832,0.876,0.503,0.904,0.908,0.776,0.875,0.78,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Root
2511,0.817,0.739,0.882,1.0,0.0,0.694,0.806,0.852,0.935,0.825,...,0.0,0.0,0.0,0.599,0.856,1.0,0.0,0.0,0.0,Sherlock Holmes Consulting Detective: The Tham...
176189,0.8,0.7,0.86,0.906,0.604,0.974,0.723,0.782,0.907,0.746,...,0.613,0.542,0.624,0.0,0.0,0.0,0.713,0.0,0.0,Zombicide: Black Plague
146508,0.846,0.804,0.865,0.906,0.758,0.761,0.811,0.82,0.863,0.686,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,T.I.M.E Stories
41114,0.849,0.784,0.865,1.0,0.758,0.771,0.737,0.796,0.833,0.767,...,0.0,0.0,0.0,0.0,0.734,0.0,0.0,0.0,0.0,The Resistance
90137,0.857,0.763,0.809,1.0,0.715,0.8,0.748,0.785,0.65,0.92,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Blood Bowl: Team Manager – The Card Game
71,1.0,0.674,0.886,0.86,0.632,0.74,0.799,0.784,0.789,0.848,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Civilization
146791,0.768,0.731,0.9,1.0,0.689,0.881,0.912,0.845,0.793,0.871,...,0.0,0.573,0.0,0.0,0.0,0.0,0.0,0.561,0.0,Shadows of Brimstone: City of the Ancients
102652,0.805,0.82,0.907,1.0,0.759,0.767,0.849,0.812,0.762,0.902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sentinels of the Multiverse


In [33]:
pd.DataFrame(cosine_distances(tester_df[matching_columns]), columns=names, index=names)

Unnamed: 0,Pandemic Legacy: Season 1,Root,Sherlock Holmes Consulting Detective: The Thames Murders & Other Cases,Zombicide: Black Plague,T.I.M.E Stories,The Resistance,Blood Bowl: Team Manager – The Card Game,Civilization,Shadows of Brimstone: City of the Ancients,Sentinels of the Multiverse
Pandemic Legacy: Season 1,0.0,0.111021,0.113168,0.118809,0.117744,0.115339,0.119808,0.107789,0.116765,0.114523
Root,0.111021,0.0,0.105557,0.099437,0.106628,0.112583,0.092057,0.099136,0.118204,0.110933
Sherlock Holmes Consulting Detective: The Thames Murders & Other Cases,0.113168,0.105557,0.0,0.101491,0.107547,0.117561,0.106506,0.109479,0.10738,0.111333
Zombicide: Black Plague,0.118809,0.099437,0.101491,0.0,0.104505,0.111056,0.079401,0.11104,0.102189,0.110696
T.I.M.E Stories,0.117744,0.106628,0.107547,0.104505,0.0,0.10702,0.105418,0.113355,0.122146,0.112817
The Resistance,0.115339,0.112583,0.117561,0.111056,0.10702,0.0,0.114434,0.115712,0.11604,0.127946
Blood Bowl: Team Manager – The Card Game,0.119808,0.092057,0.106506,0.079401,0.105418,0.114434,0.0,0.111704,0.105801,0.10109
Civilization,0.107789,0.099136,0.109479,0.11104,0.113355,0.115712,0.111704,0.0,0.113079,0.107564
Shadows of Brimstone: City of the Ancients,0.116765,0.118204,0.10738,0.102189,0.122146,0.11604,0.105801,0.113079,0.0,0.102174
Sentinels of the Multiverse,0.114523,0.110933,0.111333,0.110696,0.112817,0.127946,0.10109,0.107564,0.102174,0.0


### Weight

In [34]:
model="weight"
weight_df = games_df[['BGGId', 'GameWeight']].set_index("BGGId").round(2)

# Create the matrix of absolute differences
matrix = pd.DataFrame(
    np.abs(weight_df['GameWeight'].values[:, None] - weight_df['GameWeight'].values),
    index=weight_df.index,
    columns=weight_df.index
)

arrays_df_storage[model] = matrix

results = [id_game_map[x] for x in (matrix[game_id].sort_values(ascending=True)[:10].index)]
results

['Pandemic Legacy: Season 1',
 "Everdell: Collector's Edition",
 'Egizia',
 "Aeon's End: Legacy",
 'Galaxy Trucker: Anniversary Edition',
 'Thunderstone Quest',
 'Endeavor: Age of Sail',
 'Tash-Kalar: Arena of Legends',
 'In the Hall of the Mountain King',
 'Five Tribes: The Djinns of Naqala']

Sanity Checks

In [35]:
results_ids = [game_id_map[x] for x,y in game_id_map.items() if x in results]
tester_df = weight_df[weight_df.index.isin(results_ids)]
matching_columns = [x for x in tester_df.columns if tester_df[x].sum()>(tester_df.shape[0]*.01 + .001)]
only_matches = tester_df[matching_columns]
only_matches['name'] = only_matches.index.map(id_game_map)
names = only_matches['name'].values
only_matches.sort_values("GameWeight")

Unnamed: 0_level_0,GameWeight,name
BGGId,Unnamed: 1_level_1,Unnamed: 2_level_1
161936,2.83,Pandemic Legacy: Season 1
241451,2.83,Aeon's End: Legacy
140068,2.83,Galaxy Trucker: Anniversary Edition
319422,2.83,Everdell: Collector's Edition
58421,2.83,Egizia
157354,2.84,Five Tribes: The Djinns of Naqala
233398,2.84,Endeavor: Age of Sail
215341,2.84,Thunderstone Quest
265402,2.84,In the Hall of the Mountain King
146278,2.84,Tash-Kalar: Arena of Legends


## Vector DB Models

### About Only

In [36]:
# collection_name = "about_only"

# about_df = rag_cleaned_sentences_df[['bggid','about']].copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True)

# about_df = weaviate_client.add_bgg_collection_batch(df=about_df, collection_name=collection_name, use_about=True)

# about_df.to_pickle(f'{sims_dir}about_df.pkl')

In [37]:
collection_name = "about_only"

about_df = pd.read_pickle(f'{sims_dir}about_df.pkl')
models_df_storage[collection_name] = about_df

picks = get_closet_picks(about_df, game_id, model=collection_name, n_picks=10)

picks['name'] = [id_game_map[x] for x in picks['bggid']]

picks

Unnamed: 0,bggid,distance,name
1,221107,0.154367,Pandemic Legacy: Season 2
2,240196,0.208618,Betrayal Legacy
3,257499,0.219105,Arkham Horror (Third Edition)
4,30549,0.225171,Pandemic
5,12493,0.231488,Twilight Imperium: Third Edition
6,291572,0.232008,Oath
7,253344,0.234579,Cthulhu: Death May Die
8,370913,0.237479,Star Wars: The Clone Wars
9,314040,0.238463,Pandemic Legacy: Season 0
10,146508,0.242157,T.I.M.E Stories


### About and Description

In [38]:
# collection_name = "about_and_desc"

# desc_df = rag_cleaned_sentences_df[['bggid','about', 'description']].copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, use_description=True)

# about_df = weaviate_client.add_bgg_collection_batch(df=desc_df, collection_name=collection_name, use_about=True, use_description=True)

# desc_df.to_pickle(f'{sims_dir}desc_df.pkl')

In [39]:
# collection_name = "about_and_desc"

# desc_df = pd.read_pickle(f'{sims_dir}desc_df.pkl')
# models_df_storage[collection_name] = desc_df

# picks = get_closet_picks(desc_df, game_id, model=collection_name, n_picks=10)

# picks['name'] = [id_game_map[x] for x in picks['bggid']]

# picks

### About and Keywords

In [40]:
# collection_name = "about_and_pros_cons"

# about_and_pros_cons = all_rag_attributes_df.drop(columns=['description']).copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, attributes=all_columns_to_produce)

# about_df = weaviate_client.add_bgg_collection_batch(df=about_and_pros_cons, collection_name=collection_name, use_about=True, attributes=all_columns_to_produce)

# about_and_pros_cons.to_pickle(f'{sims_dir}about_and_pros_cons.pkl')

In [41]:
collection_name = "about_and_pros_cons"

about_and_pros_cons = pd.read_pickle(f'{sims_dir}about_and_pros_cons.pkl')
models_df_storage[collection_name] = about_and_pros_cons

picks = get_closet_picks(about_and_pros_cons, game_id, model=collection_name, n_picks=10)

picks['name'] = [id_game_map[x] for x in picks['bggid']]

picks

Unnamed: 0,bggid,distance,name
1,221107,0.148028,Pandemic Legacy: Season 2
2,240196,0.206159,Betrayal Legacy
3,257499,0.218998,Arkham Horror (Third Edition)
4,291572,0.224447,Oath
5,30549,0.225073,Pandemic
6,286063,0.230958,The 7th Citadel
7,12493,0.231088,Twilight Imperium: Third Edition
8,253344,0.234771,Cthulhu: Death May Die
9,167355,0.237707,Nemesis
10,205059,0.239126,Mansions of Madness: Second Edition


### With Keywords, About, and Desc

In [42]:
# collection_name = "all_attributes"

# # Optional scaling of attributes
# # scaler = MinMaxScaler(feature_range=(0,1))
# # for col in all_columns_to_produce:
# #     df[col] = scaler.fit_transform(df[[col]])

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, use_description=True, attributes=all_columns_to_produce)

# all_attributes_df = weaviate_client.add_bgg_collection_batch(df=all_rag_attributes_df, collection_name=collection_name, use_about=True, use_description=True, attributes=all_columns_to_produce)

# all_attributes_df.to_pickle(f'{sims_dir}all_attributes_df.pkl')

In [43]:
# collection_name = "all_attributes"

# all_attributes_df = pd.read_pickle(f'{sims_dir}all_attributes_df.pkl')
# models_df_storage[collection_name] = all_attributes_df

# picks = get_closet_picks(all_attributes_df, game_id, model=collection_name, n_picks=10)

# picks['name'] = [id_game_map[x] for x in picks['bggid']]

# picks

### All Comparisons

In [44]:
models_df_storage.keys(), arrays_df_storage.keys()

(dict_keys(['about_only', 'about_and_pros_cons']),
 dict_keys(['themes', 'categories', 'mechanics', 'pros_cons_only', 'weight']))

In [45]:
test_games

['Dominion',
 'Gloomhaven',
 'Pandemic',
 'Splendor',
 'Viticulture Essential Edition',
 'Great Western Trail',
 'Terraforming Mars',
 'Chess',
 'Azul',
 'Codenames',
 'Pandemic Legacy: Season 1']

In [46]:
all_game_results = {}
for game_name in test_games:
    all_game_results[game_name] = build_results_for_game(game_name)



Evaluation: Dominion 36218
Model: themes
Model: categories
Model: mechanics
Model: pros_cons_only
Model: weight
Model: about_only
Model: about_and_pros_cons

Evaluation: Gloomhaven 174430
Model: themes
Model: categories
Model: mechanics
Model: pros_cons_only
Model: weight
Model: about_only
Model: about_and_pros_cons

Evaluation: Pandemic 30549
Model: themes
Model: categories
Model: mechanics
Model: pros_cons_only
Model: weight
Model: about_only
Model: about_and_pros_cons

Evaluation: Splendor 148228
Model: themes
Model: categories
Model: mechanics
Model: pros_cons_only
Model: weight
Model: about_only
Model: about_and_pros_cons

Evaluation: Viticulture Essential Edition 183394
Model: themes
Model: categories
Model: mechanics
Model: pros_cons_only
Model: weight
Model: about_only
Model: about_and_pros_cons

Evaluation: Great Western Trail 193738
Model: themes
Model: categories
Model: mechanics
Model: pros_cons_only
Model: weight
Model: about_only
Model: about_and_pros_cons

Evaluation: T

In [47]:
all_game_results[game_name]

Unnamed: 0,themes,categories,mechanics,pros_cons_only,weight,about_only,about_and_pros_cons,average_score,name
221107,0.295,0.0,0.104,0.465,0.694,0.0,0.0,0.22,Pandemic Legacy: Season 2
30549,0.0,1.0,0.194,0.395,0.694,0.546,0.577,0.49,Pandemic
314040,1.0,0.0,0.0,0.977,0.516,0.648,0.685,0.55,Pandemic Legacy: Season 0
198928,0.295,0.584,0.312,0.442,0.387,1.0,1.0,0.57,Iberia
367966,0.593,0.0,1.0,0.884,0.016,1.0,1.0,0.64,Endeavor: Deep Sea
334986,0.499,0.584,0.73,0.581,0.177,0.92,1.0,0.64,Daybreak
234671,0.288,0.584,0.392,1.0,0.484,1.0,1.0,0.68,Pandemic: Rising Tide
240196,1.0,1.0,0.836,1.0,0.097,0.418,0.435,0.68,Betrayal Legacy
163602,1.0,0.584,0.73,0.837,0.129,0.763,0.77,0.69,XCOM: The Board Game
241451,1.0,1.0,0.564,0.279,0.0,1.0,1.0,0.69,Aeon's End: Legacy
