In [1]:
import pandas as pd
import numpy as np
from config import CONFIGS
import os
import os
from datetime import datetime

from utils.processing_functions import explode_columnar_df
from utils.weaviate_client import WeaviateClient

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_distances

# visualization packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# turn off jupyter warnings
import warnings
warnings.filterwarnings('ignore')

ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]
IS_LOCAL = True if os.environ.get("IS_LOCAL", "False").lower() == "true" else False

read_dir = "data/prod/games/game_dfs_clean/"
sims_dir = "data/prod/games/similarity_files/"

weaviate_client = WeaviateClient()
weaviate_client.connect_weaviate_client_docker()


IS_LOCAL: True

Checking for local config file and evaluating for updates from S3.
Loading config from local


  warn(


<weaviate.client.WeaviateClient at 0x12c915580>

# Content Similarity Models

### File Setup for All Models

In [2]:
games_df = pd.read_pickle(f"{read_dir}games_clean.pkl")
games_df = games_df.sort_values("BayesAvgRating", ascending=False)[
            0 : 1000
        ].reset_index(drop=True)

id_game_map = {x:y for x,y in zip(games_df['BGGId'],games_df['Name'])}
game_id_map = {y:x for x,y in zip(games_df['BGGId'],games_df['Name'])}
bgg_ids = games_df['BGGId'].values
relevant_df = games_df[['BGGId']]

rag_cleaned_sentences_df = pd.read_pickle(f'{sims_dir}top_1000_cleaned_rag.pkl')
rag_cleaned_sentences_df = relevant_df.merge(rag_cleaned_sentences_df, on="BGGId", how="left")
rag_cleaned_sentences_df = rag_cleaned_sentences_df.drop(columns=["Positive_Components","Positive_Sentences","Negative_Components","Negative_Sentences"], axis=1)
rag_cleaned_sentences_df.columns = [x.lower() for x in rag_cleaned_sentences_df.columns]

rag_cleaned_sentences_df.head()

Unnamed: 0,bggid,description,about
0,224517,brass birmingham economic strategy game sequel...,game strategic economic simulation set industr...
1,161936,pandemic legacy cooperative campaign game over...,game cooperative board game players work toget...
2,174430,gloomhavenis game euroinspired tactical combat...,game cooperative tactical adventure combines e...
3,342942,ark nova plan design modern scientifically man...,game strategic board game players manage zoo f...
4,363622,castles burgundy legendary board game design c...,game strategic board game revolves around deve...


In [3]:
all_rag_attributes_df = pd.read_pickle(f"{sims_dir}top_1000_cleaned_rag_with_ratings_extrap.pkl")
all_rag_attributes_df = all_rag_attributes_df.drop(columns=["Description","About","Positive_Components","Negative_Components","Positive_Sentences","Negative_Sentences"])
all_rag_attributes_df.columns = all_rag_attributes_df.columns.str.replace(" ","_")
all_rag_attributes_df.columns = [x.lower() for x in all_rag_attributes_df.columns]
all_rag_attributes_df = rag_cleaned_sentences_df.merge(all_rag_attributes_df, on="bggid", how="left")

all_rag_attributes_df.head()

Unnamed: 0,bggid,description,about,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_thematic_integration,positive_component_quality,negative_steep_learning_curve,...,negative_anticlimactic_endings,positive_excellent_production_values,positive_attractive_artwork,positive_accessible_non_gamers,positive_variable_experiences,negative_catch_up_mechanics,positive_simple_to_learning,negative_action_limitations,positive_good_themes,positive_good_interactions
0,224517,brass birmingham economic strategy game sequel...,game strategic economic simulation set industr...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,161936,pandemic legacy cooperative campaign game over...,game cooperative board game players work toget...,0.834,0.739,0.848,0.951,0.746,0.683,0.843,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,174430,gloomhavenis game euroinspired tactical combat...,game cooperative tactical adventure combines e...,1.0,0.853,0.785,1.0,0.61,0.893,0.943,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,342942,ark nova plan design modern scientifically man...,game strategic board game players manage zoo f...,1.0,0.774,1.0,0.904,0.734,0.974,0.832,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,363622,castles burgundy legendary board game design c...,game strategic board game revolves around deve...,0.855,0.784,0.862,1.0,0.758,0.922,0.835,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
rag_pros_cons_only_df = all_rag_attributes_df.drop(columns=["about","description"],axis=1)
rag_pros_cons_only_df = rag_pros_cons_only_df.fillna(0).set_index("bggid")
rag_pros_cons_only_df.columns = [x.lower() for x in rag_pros_cons_only_df.columns]

positive_columns = [x for x in rag_pros_cons_only_df.columns if x.startswith('positive_')]
negative_columns = [x for x in rag_pros_cons_only_df.columns if x.startswith('negative_')]
all_columns_to_produce = positive_columns + negative_columns

models_df_storage = {}
arrays_df_storage = {}

rag_pros_cons_only_df.head()

Unnamed: 0_level_0,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_thematic_integration,positive_component_quality,negative_steep_learning_curve,negative_fiddly_mechanics,negative_variable_enjoyment,negative_lengthy_playtime,...,negative_anticlimactic_endings,positive_excellent_production_values,positive_attractive_artwork,positive_accessible_non_gamers,positive_variable_experiences,negative_catch_up_mechanics,positive_simple_to_learning,negative_action_limitations,positive_good_themes,positive_good_interactions
bggid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
224517,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
161936,0.834,0.739,0.848,0.951,0.746,0.683,0.843,0.78,0.729,0.789,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
174430,1.0,0.853,0.785,1.0,0.61,0.893,0.943,0.834,0.773,0.824,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
342942,1.0,0.774,1.0,0.904,0.734,0.974,0.832,0.822,0.769,0.902,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
363622,0.855,0.784,0.862,1.0,0.758,0.922,0.835,0.78,0.775,0.798,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
test_games = ["Dominion", "Gloomhaven", "Too Many Bones", "Pandemic", "Splendor", "Viticulture Essential Edition", "Great Western Trail", "Everdell", "Chess", "Azul", "Codenames", "Pandemic Legacy: Season 1"]

game_name = "Pandemic"
game_id = int(games_df[games_df['Name']==game_name]['BGGId'].values[0])
print(game_name, game_id)

Pandemic 30549


In [6]:
def get_closet_picks_weaviate(df, game_id, model, n_picks=10):
    uuid = df.loc[df['bggid']==game_id]['UUID'].values[0]

    similars = weaviate_client.find_near_objects(collection_name=model, uuid=uuid, limit=n_picks)

    picks = {}

    for item in similars:
        picks[str(item.uuid)]=item.metadata.distance
        
    picks = pd.DataFrame.from_dict(picks, columns=['distance'], orient='index').sort_values(by='distance', ascending=True).reset_index().rename(columns={'index':'UUID'})

    picks = picks.merge(df, on='UUID', how='inner')[1:]

    return picks[['bggid','distance']].head(n_picks)

In [7]:
def build_results_for_game(game_name):
    game_id = game_id_map[game_name]
    print(f"\nEvaluation: {game_name} {game_id}")

    single_game_results = {}

    for model, model_df in arrays_df_storage.items():
        print(f"Model: {model}")

        mask = model_df[game_id].sort_values(ascending=True) < 1.0
        picks = model_df[game_id].sort_values(ascending=True)[mask]

        # remove the entry that is the same as the game_id, by index. It is not necessarily entry 0
        picks = picks.drop(game_id)

        single_game_results[model] = picks

    for collection_name, model_df in models_df_storage.items():
        print(f"Model: {collection_name}")

        picks = get_closet_picks_weaviate(df=model_df, game_id=game_id, model=collection_name, n_picks=100)

        single_game_results[collection_name] = {x:y for x,y in zip(picks['bggid'], picks['distance'])}

    results_df = pd.DataFrame(single_game_results)

    results_df = results_df.fillna(1).round(3)
    results_df['average_score'] = results_df.mean(axis=1).round(2)

    results_df['name'] = results_df.index.map(id_game_map)

    return results_df.sort_values("average_score", ascending=True).head(50)

In [8]:
def explode_merge_fill(df, reference_df, fill=False, explode=False):
    if explode:
        df = explode_columnar_df(df)
    df = reference_df.merge(df, on='BGGId', how='left').set_index('BGGId')

    if fill==True:
        df = df.astype(float).replace(0,0.01)
        df = df.fillna(0.01)
    else:
        df = df.fillna(0)
    return df

def get_cosine_distances_df(df):
    cosine_sims = cosine_distances(df).round(3)
    return pd.DataFrame(cosine_sims, columns=bgg_ids, index=bgg_ids)

## Array Models - Themes, Categories, Mechanics, Mechanics, Pros-Cons, Game Family

### Weight

In [9]:
# model="weight"
# weight_df = games_df[['BGGId', 'GameWeight']].set_index("BGGId").round(2)

# # Create the matrix of absolute differences
# matrix = pd.DataFrame(
#     np.abs(weight_df['GameWeight'].values[:, None] - weight_df['GameWeight'].values),
#     index=weight_df.index,
#     columns=weight_df.index
# ).round(4)

# arrays_df_storage[model] = matrix

# {id_game_map[x]:y for x,y in zip(matrix[game_id].sort_values(ascending=True)[:15].index, matrix[game_id].sort_values(ascending=True)[:15])}

### Game Family

In [10]:
model="game_family"

family_df = explode_merge_fill(games_df[['BGGId', 'Family']], relevant_df, explode=True, fill=False)
sims_by_id = get_cosine_distances_df(family_df)

arrays_df_storage[model] = sims_by_id

results = [id_game_map[x] for x in (sims_by_id[game_id].sort_values(ascending=True)[:10].index)]

{id_game_map[x]:y for x,y in zip(sims_by_id[game_id].sort_values(ascending=True)[:15].index, sims_by_id[game_id].sort_values(ascending=True)[:15])}

{'Fall of Rome': 0.0,
 'Pandemic': 0.0,
 'Star Wars: The Clone Wars': 0.0,
 'Iberia': 0.0,
 'Pandemic: Rising Tide': 0.0,
 'Pandemic Legacy: Season 0': 0.0,
 'Pandemic: The Cure': 0.0,
 'Reign of Cthulhu': 0.0,
 'Pandemic Legacy: Season 1': 0.0,
 'Pandemic Legacy: Season 2': 0.0,
 'Horizons of Spirit Island': 1.0,
 'Pax Porfiriana': 1.0,
 'Baseball Highlights: 2045': 1.0,
 'Pathfinder Adventure Card Game: Rise of the Runelords – Base Set': 1.0,
 'Nexus Ops': 1.0}

### Themes

In [11]:
model = "themes"

themes_df = pd.read_pickle(f"{read_dir}themes_clean.pkl")
themes_df = explode_merge_fill(themes_df, relevant_df, explode=True, fill=False)
sims_by_id = get_cosine_distances_df(themes_df)

arrays_df_storage[model] = sims_by_id

{id_game_map[x]:y for x,y in zip(sims_by_id[game_id].sort_values(ascending=True)[:15].index, sims_by_id[game_id].sort_values(ascending=True)[:15])}


{'Pandemic': 0.0,
 'Pandemic Legacy: Season 1': 0.184,
 'Iberia': 0.293,
 'Dice Hospital': 0.293,
 'Pandemic: The Cure': 0.5,
 'Terraforming Mars: Ares Expedition': 0.592,
 'Pandemic Legacy: Season 2': 0.646,
 'Messina 1347': 0.646,
 'Clinic: Deluxe Edition': 0.646,
 'Terraforming Mars': 0.711,
 'Vinhos': 1.0,
 "Warp's Edge": 1.0,
 'Colosseum': 1.0,
 'Pathfinder Adventure Card Game: Rise of the Runelords – Base Set': 1.0,
 'Nexus Ops': 1.0}

### Categories Only

In [12]:
# TO DO: Add this to the game cleaning script

# category_df = df[['BGGId','Cat:Thematic', 'Cat:Strategy', 'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract', 'Cat:Party', 'Cat:Childrens']]
# subcats_df = pd.read_pickle(f"{read_dir}subcategories_clean.pkl")
# subcats_df = explode_columnar_df(subcats_df).reset_index()
# category_df = category_df.merge(subcats_df, on="BGGId", how="left")
# category_df = category_df.set_index("BGGId")
# category_df.to_pickle(f"{read_dir}categories_clean.pkl")
# category_df.head()

In [13]:
model = "categories"

cat_df = pd.read_pickle(f"{read_dir}categories_clean.pkl")

cat_df = explode_merge_fill(cat_df, relevant_df)

sims_by_id = get_cosine_distances_df(cat_df)

arrays_df_storage[model] = sims_by_id

{id_game_map[x]:y for x,y in zip(sims_by_id[game_id].sort_values(ascending=True)[:15].index, sims_by_id[game_id].sort_values(ascending=True)[:15])}

{'Small World Underground': 0.0,
 'Kingsburg': 0.0,
 'First Rat': 0.0,
 'Chinatown': 0.0,
 'Raccoon Tycoon': 0.0,
 'My City': 0.0,
 'Small World': 0.0,
 'Kingdom Builder': 0.0,
 'Stone Age: Anniversary': 0.0,
 'Dice Hospital': 0.0,
 'Fresco': 0.0,
 'Libertalia: Winds of Galecrest': 0.0,
 'Isle of Skye: From Chieftain to King': 0.0,
 'Clank!: Catacombs': 0.0,
 'Blue Moon City': 0.0}

### Mechanics Only

In [14]:
model = "mechanics"

mech_df = pd.read_pickle(f"{read_dir}mechanics_clean.pkl")

mech_df = explode_merge_fill(mech_df, relevant_df, explode=True)
sims_by_id = get_cosine_distances_df(mech_df)

arrays_df_storage[model] = sims_by_id

{id_game_map[x]:y for x,y in zip(sims_by_id[game_id].sort_values(ascending=True)[:15].index, sims_by_id[game_id].sort_values(ascending=True)[:15])}

{'Pandemic': 0.0,
 'Reign of Cthulhu': 0.143,
 'Pandemic Legacy: Season 0': 0.163,
 'Pandemic Legacy: Season 1': 0.163,
 'Star Wars: The Clone Wars': 0.198,
 'Defenders of the Realm': 0.228,
 'Pandemic Legacy: Season 2': 0.236,
 'Fall of Rome': 0.244,
 'Iberia': 0.244,
 'Pandemic: The Cure': 0.283,
 'Cerebria: The Inside World': 0.332,
 'Pandemic: Rising Tide': 0.332,
 'Black Orchestra': 0.345,
 'Zombicide Season 2: Prison Outbreak': 0.383,
 'Unmatched Adventures: Tales to Amaze': 0.402}

### Pros Cons

In [15]:
model = "pros_cons_only"

sims_by_id = get_cosine_distances_df(rag_pros_cons_only_df)

arrays_df_storage[model] = sims_by_id

{id_game_map[x]:y for x,y in zip(sims_by_id[game_id].sort_values(ascending=True)[:15].index, sims_by_id[game_id].sort_values(ascending=True)[:15])}

{'Pandemic': 0.0,
 'Anachrony': 0.091,
 'Sub Terra': 0.097,
 'Forbidden Island': 0.098,
 'Western Legends': 0.099,
 'Dice Forge': 0.1,
 'Slay the Spire: The Board Game': 0.102,
 'Gears of War: The Board Game': 0.102,
 'Escape: The Curse of the Temple': 0.102,
 'The Lord of the Rings: The Card Game': 0.103,
 'Viticulture Essential Edition': 0.103,
 'Keep the Heroes Out!': 0.104,
 'SpaceCorp: 2025-2300AD': 0.104,
 "Andromeda's Edge": 0.104,
 'This War of Mine: The Board Game': 0.104}

### Various Numerical Attributes

In [16]:
various_df = games_df[['BGGId', 'Name',
    'BestPlayers',# 'YearPublished',
       'MfgPlaytime',
       'BayesAvgRating',
       'Rank:strategygames', 'Rank:abstracts', 'Rank:partygames',
       'Rank:wargames', 'Rank:thematic', 'Rank:familygames',
       'Rank:childrensgames', 'Rank:cgs', 'GameWeight']].copy()

# Find the mask where 'Rank:strategygames' is NA
mask = various_df['Rank:strategygames'].isna()

# Assign the index positions only to the rows matching the mask
various_df.loc[mask, 'Rank:strategygames'] = various_df.index[mask]

# various_df['GameAgeYears'] = datetime.now().year - various_df['YearPublished']
# various_df = various_df.drop(columns=['Name','YearPublished'], axis=1)

various_df = various_df.drop(columns=['Name'], axis=1)

various_df.columns = [x.lower() for x in various_df.columns]

various_df = various_df.set_index('bggid')

scaler = MinMaxScaler(feature_range=(0,1))

for col in various_df.columns:
    various_df[col] = scaler.fit_transform(various_df[[col]])

various_df = various_df.fillna(0)

various_df.head()

Unnamed: 0_level_0,bestplayers,mfgplaytime,bayesavgrating,rank:strategygames,rank:abstracts,rank:partygames,rank:wargames,rank:thematic,rank:familygames,rank:childrensgames,rank:cgs,gameweight
bggid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
224517,0.181818,0.1,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.751198
161936,0.272727,0.05,0.979446,3.6e-05,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.479042
174430,0.181818,0.1,0.961011,0.000107,1.0,1.0,1.0,3.6e-05,1.0,1.0,1.0,0.762142
342942,0.090909,0.125,0.959305,7.1e-05,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.724519
363622,0.090909,0.1,0.911909,0.000107,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.490902


In [17]:
# sns.set(font_scale=1)
# games_c = various_df.drop("Name", axis=1).set_index("BGGId").corr()
# # plot a heat map for all correlations in our data set

# # make our figure
# fig, ax = plt.subplots(figsize=(10, 10))

# # we want our heatmap to not show the upper triangle, which is redundant data
# games_c_mask = np.triu(np.ones_like(games_c, dtype=bool))

# # adjust mask and df to hide center diagonal
# games_c_mask = games_c_mask[1:, :-1]
# corr = games_c.iloc[1:, :-1].copy()

# # color map
# cmap = sns.diverging_palette(220, 20, as_cmap=True)

# # plot heatmap
# sns.heatmap(
#     corr,
#     mask=games_c_mask,
#     annot=True,
#     fmt=".2f",
#     cmap=cmap,
#     vmin=-1,
#     vmax=1,
#     cbar_kws={"shrink": 0.8},
#     square=True,
# )

# # yticks rotate
# plt.yticks(rotation=1)

# # title
# title = "CORRELATION MATRIX\nRanking Categories\n"
# plt.title(title, fontsize=14)

# # plt.savefig('images/heatmap.png')

# plt.show()

In [18]:
model = "various_attributes"

sims_byid = get_cosine_distances_df(various_df)

arrays_df_storage[model] = sims_byid

{id_game_map[x]:y for x,y in zip(sims_by_id[game_id].sort_values(ascending=True)[:15].index, sims_by_id[game_id].sort_values(ascending=True)[:15])}

{'Pandemic': 0.0,
 'Anachrony': 0.091,
 'Sub Terra': 0.097,
 'Forbidden Island': 0.098,
 'Western Legends': 0.099,
 'Dice Forge': 0.1,
 'Slay the Spire: The Board Game': 0.102,
 'Gears of War: The Board Game': 0.102,
 'Escape: The Curse of the Temple': 0.102,
 'The Lord of the Rings: The Card Game': 0.103,
 'Viticulture Essential Edition': 0.103,
 'Keep the Heroes Out!': 0.104,
 'SpaceCorp: 2025-2300AD': 0.104,
 "Andromeda's Edge": 0.104,
 'This War of Mine: The Board Game': 0.104}

## Vector DB Models

### About Only

In [19]:
collection_name = "about"

about_df = rag_cleaned_sentences_df[['bggid','about']].copy()

weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True)

about_df = weaviate_client.add_bgg_collection_batch(df=about_df, collection_name=collection_name, use_about=True)

about_df.to_pickle(f'{sims_dir}about_df.pkl')

In [20]:
collection_name = "about"

about_df = pd.read_pickle(f'{sims_dir}about_df.pkl')
models_df_storage[collection_name] = about_df

picks = get_closet_picks_weaviate(about_df, game_id, model=collection_name, n_picks=10)

picks['name'] = [id_game_map[x] for x in picks['bggid']]

picks

Unnamed: 0,bggid,distance,name
1,161936,0.216663,Pandemic Legacy: Season 1
2,43528,0.2266,World Without End
3,314040,0.22933,Pandemic Legacy: Season 0
4,214029,0.234133,SpaceCorp: 2025-2300AD
5,265402,0.236717,In the Hall of the Mountain King
6,269511,0.239793,Cooper Island
7,150658,0.240081,Pandemic: The Cure
8,234671,0.241333,Pandemic: Rising Tide
9,286751,0.243859,Zombicide: 2nd Edition


### Description

In [21]:
# collection_name = "description"

# desc_df = rag_cleaned_sentences_df[['bggid','description']].copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=False, use_description=True)

# about_df = weaviate_client.add_bgg_collection_batch(df=desc_df, collection_name=collection_name, use_about=False, use_description=True)

# desc_df.to_pickle(f'{sims_dir}desc_df.pkl')

In [22]:
collection_name = "description"

only_desc_df = pd.read_pickle(f'{sims_dir}desc_df.pkl')
models_df_storage[collection_name] = only_desc_df

picks = get_closet_picks_weaviate(only_desc_df, game_id, model=collection_name, n_picks=10)

picks['name'] = [id_game_map[x] for x in picks['bggid']]

picks

WeaviateQueryError: Query call with protocol GRPC search failed with message <AioRpcError of RPC that terminated with:
	status = StatusCode.UNKNOWN
	details = "explorer: get class: concurrentTargetVectorSearch): explorer: get class: vectorize search vector: nearObject params: vector not found"
	debug_error_string = "UNKNOWN:Error received from peer  {grpc_message:"explorer: get class: concurrentTargetVectorSearch): explorer: get class: vectorize search vector: nearObject params: vector not found", grpc_status:2, created_time:"2025-01-02T13:04:20.882134-08:00"}"
>.

### About and Description

In [None]:
collection_name = "about_and_desc"

about_and_desc_df = rag_cleaned_sentences_df[['bggid','about', 'description']].copy()

weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, use_description=True)

about_df = weaviate_client.add_bgg_collection_batch(df=about_and_desc_df, collection_name=collection_name, use_about=True, use_description=True)

about_and_desc_df.to_pickle(f'{sims_dir}about_and_desc_df.pkl')

In [None]:
collection_name = "about_and_desc"

desc_df = pd.read_pickle(f'{sims_dir}about_and_desc_df.pkl')
models_df_storage[collection_name] = desc_df

picks = get_closet_picks_weaviate(desc_df, game_id, model=collection_name, n_picks=10)

picks['name'] = [id_game_map[x] for x in picks['bggid']]

picks

### About and Keywords

In [None]:
# collection_name = "about_and_pros_cons"

# about_and_pros_cons = all_rag_attributes_df.drop(columns=['description']).copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, attributes=all_columns_to_produce)

# about_df = weaviate_client.add_bgg_collection_batch(df=about_and_pros_cons, collection_name=collection_name, use_about=True, attributes=all_columns_to_produce)

# about_and_pros_cons.to_pickle(f'{sims_dir}about_and_pros_cons.pkl')

In [None]:
collection_name = "about_and_pros_cons"

about_and_pros_cons = pd.read_pickle(f'{sims_dir}about_and_pros_cons.pkl')
models_df_storage[collection_name] = about_and_pros_cons

picks = get_closet_picks_weaviate(about_and_pros_cons, game_id, model=collection_name, n_picks=30)

picks['name'] = [id_game_map[x] for x in picks['bggid']]

picks

In [None]:
weird_matches = about_and_pros_cons[about_and_pros_cons['bggid'].isin([game_id,43528,269511,161936,265402,231733])][['bggid','about']]
weird_dict = {id_game_map[x]:y for x,y in weird_matches.values}

for key, value in weird_dict.items():
    print(f"\n{key}: {value}")

### With Keywords, About, and Desc

In [None]:
# collection_name = "all_attributes"

# # Optional scaling of attributes
# # scaler = MinMaxScaler(feature_range=(0,1)) 
# # for col in all_columns_to_produce:
# #     df[col] = scaler.fit_transform(df[[col]])

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, use_description=True, attributes=all_columns_to_produce)

# all_attributes_df = weaviate_client.add_bgg_collection_batch(df=all_rag_attributes_df, collection_name=collection_name, use_about=True, use_description=True, attributes=all_columns_to_produce)

# all_attributes_df.to_pickle(f'{sims_dir}all_attributes_df.pkl')

In [None]:
collection_name = "all_attributes"

all_attributes_df = pd.read_pickle(f'{sims_dir}all_attributes_df.pkl')
models_df_storage[collection_name] = all_attributes_df

picks = get_closet_picks_weaviate(all_attributes_df, game_id, model=collection_name, n_picks=10)

picks['name'] = [id_game_map[x] for x in picks['bggid']]

picks

### Scaling Work on Results

In [57]:
def build_results_for_game(game_name):
    game_id = game_id_map[game_name]
    print(f"\nEvaluation: {game_name} {game_id}")

    single_game_results = {}

    for model, model_df in arrays_df_storage.items():
        print(f"Model: {model}")

        mask = model_df[game_id].sort_values(ascending=True) < 1.0
        picks = model_df[game_id].sort_values(ascending=True)[mask]

        # remove the entry that is the same as the game_id, by index. It is not necessarily entry 0
        picks = picks.drop(game_id)

        single_game_results[model] = picks

    for collection_name, model_df in models_df_storage.items():
        print(f"Model: {collection_name}")

        picks = get_closet_picks_weaviate(df=model_df, game_id=game_id, model=collection_name, n_picks=100)

        single_game_results[collection_name] = {x:y for x,y in zip(picks['bggid'], picks['distance'])}

    results_df = pd.DataFrame(single_game_results)

    results_df = results_df.fillna(1).round(3)

    # for col in results_df.columns:
    #     scaler = MinMaxScaler(feature_range=(0,results_df[col].max()))
    #     results_df[col] = scaler.fit_transform(results_df[[col]]).round(3)

    results_df['average_score'] = results_df.mean(axis=1).round(2)

    results_df['name'] = results_df.index.map(id_game_map)
    results_df = results_df.sort_values("average_score", ascending=True).head(20)

    return results_df

In [None]:
test_df = build_results_for_game("Pandemic")

test_df

In [None]:
about_and_pros_cons.head()

In [31]:
pandemic_ids = [int(x) for x in games_df[games_df['Name'].str.lower().str.contains("pandemic")]['BGGId'].values]
pandemic_names = [id_game_map[i] for i in pandemic_ids]

In [None]:
pandemic_cats = cat_df[cat_df.index.isin(pandemic_ids)]
cosine_sims = cosine_distances(pandemic_cats).round(3)
pd.DataFrame(cosine_sims, columns=pandemic_names, index=pandemic_names)

In [None]:
pandemic_cats = mech_df[mech_df.index.isin(pandemic_ids)]
cosine_sims = cosine_distances(pandemic_cats).round(3)
pd.DataFrame(cosine_sims, columns=pandemic_names, index=pandemic_names)

In [None]:
pandemic_cats = rag_pros_cons_only_df[rag_pros_cons_only_df.index.isin(pandemic_ids)]
cosine_sims = cosine_distances(pandemic_cats).round(3)
pd.DataFrame(cosine_sims, columns=pandemic_names, index=pandemic_names)

In [None]:
pandemic_cats = various_df[various_df.index.isin(pandemic_ids)]
cosine_sims = cosine_distances(pandemic_cats).round(3)
pd.DataFrame(cosine_sims, columns=pandemic_names, index=pandemic_names)

### All Comparisons

In [None]:
break

In [None]:
models_df_storage.keys(), arrays_df_storage.keys()

In [None]:
test_games

In [None]:
build_results_for_game("Too Many Bones")

In [None]:
build_results_for_game("Gloomhaven")

In [None]:
all_game_results = {}
for game_name in test_games:
    all_game_results[game_name] = build_results_for_game(game_name)

In [None]:
all_game_results["Dominion"]

In [None]:
all_game_results["Gloomhaven"]

In [None]:
all_game_results["Everdell"]

In [None]:
all_game_results["Viticulture Essential Edition"]

In [None]:
all_game_results["Pandemic"]

In [None]:
all_game_results["Splendor"]

In [None]:
all_game_results["Great Western Trail"]

In [None]:
all_game_results["Chess"]

In [None]:
all_game_results["Azul"]

In [None]:
all_game_results["Codenames"]

In [None]:
all_game_results["Pandemic Legacy: Season 1"]

In [None]:
game_id = game_id_map["Gloomhaven"]
print(game_id)
this_game_family = games_df.loc[games_df["BGGId"]==game_id]['Family']
ids_in_this_family = games_df.loc[games_df["Family"]==this_game_family.values[0]]['BGGId']
games_df[games_df["BGGId"].isin(ids_in_this_family)]

In [None]:
test_df = build_results_for_game("Gloomhaven")
test_df