In [None]:
import pandas as pd
import numpy as np
from config import CONFIGS
import os
import os
from datetime import datetime

from utils.processing_functions import explode_columnar_df
from utils.weaviate_client import WeaviateClient

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_distances

# visualization packages
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


# Runtime settings taken from the environment and the project config.
ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]
# The comparison already evaluates to a bool; any value other than "true"
# (case-insensitive) leaves the flag off.
IS_LOCAL = os.environ.get("IS_LOCAL", "False").lower() == "true"

# Input directories, relative to the notebook's working directory.
read_dir = "data/prod/games/game_dfs_clean/"
sims_dir = "data/prod/games/similarity_files/"

# Shared Weaviate handle used by the vector-DB models further down.
weaviate_client = WeaviateClient()
weaviate_client.connect_weaviate_client_docker()


# Content Similarity Models

### File Setup for All Models

In [None]:
# Keep the 1000 games with the highest Bayesian average rating and renumber
# the rows from 0 in that order.
games_df = (
    pd.read_pickle(f"{read_dir}games_clean.pkl")
    .sort_values("BayesAvgRating", ascending=False)
    .iloc[:1000]
    .reset_index(drop=True)
)
games_df.head(2)

In [None]:
# Lookup tables between BGG ids and game names. If two games share a name,
# the later row wins in `game_id_map`.
id_game_map = dict(zip(games_df['BGGId'], games_df['Name']))
game_id_map = dict(zip(games_df['Name'], games_df['BGGId']))

# Row order of `bgg_ids` / `relevant_df` is the order every similarity matrix
# below is labelled with.
bgg_ids = games_df['BGGId'].values
relevant_df = games_df[['BGGId']]
relevant_df.head(2)

In [None]:
# Titles used to spot-check every model in this notebook.
test_games = [
    "Dominion",
    "Gloomhaven",
    "Pandemic",
    "Splendor",
    "Viticulture Essential Edition",
    "Great Western Trail",
    "Everdell",
    "Chess",
    "Azul",
    "Codenames",
    "Pandemic Legacy: Season 1",
]

# Game used by the per-model cells below (index 4 of the list above).
game_name = test_games[4]
game_id = int(games_df.loc[games_df['Name'] == game_name, 'BGGId'].iloc[0])
game_name, game_id

In [None]:
# Left-join the RAG text onto the selected games (keeps their row order),
# drop the pros/cons source columns and lower-case the column names.
rag_cleaned_sentences_df = (
    relevant_df
    .merge(pd.read_pickle(f'{sims_dir}top_1000_cleaned_rag.pkl'), on="BGGId", how="left")
    .drop(columns=["Positive_Components", "Positive_Sentences", "Negative_Components", "Negative_Sentences"])
    .rename(columns=str.lower)
)
rag_cleaned_sentences_df.head(2)

In [None]:
# Load the attribute ratings, drop the text columns, and normalise the column
# names to lower snake case before joining them onto the cleaned sentences.
all_rag_attributes_df = (
    pd.read_pickle(f"{sims_dir}top_1000_cleaned_rag_with_ratings_extrap.pkl")
    .drop(columns=["Description", "About", "Positive_Components", "Negative_Components", "Positive_Sentences", "Negative_Sentences"])
    .rename(columns=lambda label: label.replace(" ", "_").lower())
)
all_rag_attributes_df = rag_cleaned_sentences_df.merge(all_rag_attributes_df, on="bggid", how="left")
all_rag_attributes_df.head(2)

In [None]:
# Attribute-only view: no free text, missing values as 0, indexed by game id.
rag_pros_cons_only_df = (
    all_rag_attributes_df
    .drop(columns=["about", "description"])
    .fillna(0)
    .set_index("bggid")
    .rename(columns=str.lower)
)
rag_pros_cons_only_df.head(2)

In [None]:
# Attribute columns, grouped by their `positive_` / `negative_` name prefix.
positive_columns = rag_pros_cons_only_df.filter(regex=r'^positive_').columns.tolist()
negative_columns = rag_pros_cons_only_df.filter(regex=r'^negative_').columns.tolist()

all_columns_to_produce = [*positive_columns, *negative_columns]
all_columns_to_produce[:5]

In [None]:
# Registries filled by the model cells below and read by build_results_for_game:
#   models_df_storage: collection name -> frame passed to get_closet_picks
#   arrays_df_storage: model name -> square distance matrix labelled by game id
# Re-running this cell empties both registries.
models_df_storage = dict()
arrays_df_storage = dict()

In [None]:
def get_closet_picks(df, game_id, model, n_picks=10):
    """Return the nearest neighbours of a game in a Weaviate collection.

    Args:
        df: Frame for the collection with at least ``bggid`` and ``UUID`` columns.
        game_id: ``bggid`` of the query game.
        model: Name of the Weaviate collection to search.
        n_picks: Maximum number of neighbours to return.

    Returns:
        DataFrame with columns ``bggid`` and ``distance``, nearest first, with
        the query game itself removed and a fresh 0-based index.

    Raises:
        IndexError: If ``game_id`` has no row in ``df``.
    """
    query_rows = df.loc[df['bggid'] == game_id, 'UUID']
    if query_rows.empty:
        raise IndexError(f"bggid {game_id!r} not found in the frame for '{model}'")
    uuid = query_rows.iloc[0]

    # Ask for one object more than requested (never fewer than the previous
    # fixed 50) so that removing the query game still leaves n_picks rows.
    similars = weaviate_client.find_near_objects(
        collection_name=model, uuid=uuid, limit=max(50, n_picks + 1)
    )

    distances = {str(item.uuid): item.metadata.distance for item in similars}

    picks = (
        pd.DataFrame(list(distances.items()), columns=['UUID', 'distance'])
        .sort_values(by='distance', ascending=True, kind='stable')
        .merge(df, on='UUID', how='inner')
    )

    # Remove the query game by id rather than by position: with tied or
    # slightly noisy distances it is not guaranteed to be the first row.
    picks = picks[picks['bggid'] != game_id]

    return picks[['bggid', 'distance']].head(n_picks).reset_index(drop=True)

In [None]:
def build_results_for_game(game_name):
    """Combine every registered model into one ranked table for a game.

    Distances come from each matrix in ``arrays_df_storage`` and from up to 50
    Weaviate neighbours per collection in ``models_df_storage``. Each model's
    distances are min-max scaled to [0, 1], games a model did not return get
    the worst score (1), and the per-model scores are averaged.

    Args:
        game_name: Name of the query game; must be a key of ``game_id_map``.

    Returns:
        DataFrame indexed by game id with one column per model plus
        ``average_score`` and ``name``: the 20 rows with the lowest average
        score, excluding the query game itself.

    Raises:
        KeyError: If ``game_name`` is not in ``game_id_map``.
    """
    game_id = game_id_map[game_name]
    print(f"\nEvaluation: {game_name} {game_id}")

    single_game_results = {}

    for model, model_df in arrays_df_storage.items():
        print(f"Model: {model}")
        single_game_results[model] = model_df[game_id].sort_values(ascending=True)

    for collection_name, model_df in models_df_storage.items():
        print(f"Model: {collection_name}")
        picks = get_closet_picks(df=model_df, game_id=game_id, model=collection_name, n_picks=50)
        single_game_results[collection_name] = dict(zip(picks['bggid'], picks['distance']))

    results_df = pd.DataFrame(single_game_results)

    # Remove the query game by label. Dropping "the first row after sorting"
    # is wrong whenever another game ties at distance 0 (e.g. equal weights).
    results_df = results_df.drop(index=game_id, errors="ignore")

    scaler = MinMaxScaler(feature_range=(0, 1))
    for col in results_df.columns:
        # The scaler returns an (n, 1) array; flatten it for column assignment.
        results_df[col] = np.asarray(scaler.fit_transform(results_df[[col]])).ravel()

    # A game missing from a model's results is treated as maximally distant.
    results_df = results_df.fillna(1).round(3)
    results_df['average_score'] = results_df.mean(axis=1).round(2)

    results_df['name'] = results_df.index.map(id_game_map)

    return results_df.sort_values("average_score", ascending=True).head(20)

## Array Models - Themes, Categories, Mechanics, Pros-Cons, Weight, Game Family, Various Numerical Attributes

### Themes

In [None]:
model = "themes"

# One row per selected game, in `relevant_df` order. Zeros and missing values
# both become 0.01 (presumably so no game is an all-zero vector).
themes_df = explode_columnar_df(pd.read_pickle(f"{read_dir}themes_clean.pkl"))
themes_df = (
    relevant_df
    .merge(themes_df, on='BGGId', how='left')
    .set_index('BGGId')
    .astype(float)
    .replace(0, 0.01)
    .fillna(0.01)
)

# Despite the name, `cosine_sims` holds cosine *distances* (smaller = closer).
cosine_sims = cosine_distances(themes_df).round(3)
sims_byid = pd.DataFrame(cosine_sims, index=bgg_ids, columns=bgg_ids)
arrays_df_storage[model] = sims_byid

# Ten smallest distances for the spot-check game.
results = [id_game_map[bgg_id] for bgg_id in sims_byid[game_id].sort_values(ascending=True).iloc[:10].index]
results

Sanity Checks

In [None]:
# Ids of the games named in `results`.
results_ids = [bgg_id for name, bgg_id in game_id_map.items() if name in results]
tester_df = themes_df[themes_df.index.isin(results_ids)]
# Keep only columns whose total exceeds n_rows * 0.01 + 0.001.
matching_columns = [col for col in tester_df.columns if tester_df[col].sum() > (tester_df.shape[0]*.01 + .001)]
# assign() builds a new frame; writing the column into a slice of tester_df
# raised SettingWithCopyWarning.
only_matches = tester_df[matching_columns].assign(name=lambda frame: frame.index.map(id_game_map))
names = only_matches['name'].values
only_matches

In [None]:
# Pairwise cosine distances between the sampled games, labelled by name.
pd.DataFrame(
    cosine_distances(tester_df[matching_columns]),
    index=names,
    columns=names,
).round(3)

### Categories Only

In [14]:
# TO DO: Add this to the game cleaning script

# category_df = df[['BGGId','Cat:Thematic', 'Cat:Strategy', 'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract', 'Cat:Party', 'Cat:Childrens']]
# subcats_df = pd.read_pickle(f"{read_dir}subcategories_clean.pkl")
# subcats_df = explode_columnar_df(subcats_df).reset_index()
# category_df = category_df.merge(subcats_df, on="BGGId", how="left")
# category_df = category_df.set_index("BGGId")
# category_df.to_pickle(f"{read_dir}categories_clean.pkl")
# category_df.head()

In [None]:
# Preview the categories table straight from disk. `cat_df` is only created in
# the next cell, so referencing it here raised NameError on Restart & Run All.
pd.read_pickle(f"{read_dir}categories_clean.pkl").head(2)

In [None]:
model = "categories"

# One row per selected game, in `relevant_df` order; only missing values are
# replaced with 0.01 here.
cat_df = (
    relevant_df
    .merge(pd.read_pickle(f"{read_dir}categories_clean.pkl"), on='BGGId', how='left')
    .set_index('BGGId')
    .fillna(0.01)
)
# Left disabled, unlike the themes and mechanics models:
# cat_df = cat_df.astype(float).replace(0,0.01)

# Despite the name, `cosine_sims` holds cosine *distances* (smaller = closer).
cosine_sims = cosine_distances(cat_df).round(3)
sims_byid = pd.DataFrame(cosine_sims, index=bgg_ids, columns=bgg_ids)
arrays_df_storage[model] = sims_byid

# Ten smallest distances for the spot-check game.
results = [id_game_map[bgg_id] for bgg_id in sims_byid[game_id].sort_values(ascending=True).iloc[:10].index]
results


Sanity Checks

In [None]:
# Ids of the games named in `results`.
results_ids = [bgg_id for name, bgg_id in game_id_map.items() if name in results]
tester_df = cat_df[cat_df.index.isin(results_ids)]
# Keep only columns whose total exceeds n_rows * 0.01 + 0.001.
matching_columns = [col for col in tester_df.columns if tester_df[col].sum() > (tester_df.shape[0]*.01 + .001)]
# assign() builds a new frame; writing the column into a slice of tester_df
# raised SettingWithCopyWarning.
only_matches = tester_df[matching_columns].assign(name=lambda frame: frame.index.map(id_game_map))
names = only_matches['name'].values
only_matches

In [None]:
# Pairwise cosine distances between the sampled games, labelled by name and
# rounded to 3 decimals like the themes check and the stored matrices.
pd.DataFrame(cosine_distances(tester_df[matching_columns]), columns=names, index=names).round(3)

### Mechanics Only

In [None]:
model = "mechanics"

# One row per selected game, in `relevant_df` order. Zeros and missing values
# both become 0.01 (presumably so no game is an all-zero vector).
mech_df = explode_columnar_df(pd.read_pickle(f"{read_dir}mechanics_clean.pkl"))
mech_df = (
    relevant_df
    .merge(mech_df, on='BGGId', how='left')
    .set_index('BGGId')
    .astype(float)
    .replace(0, 0.01)
    .fillna(0.01)
)

# Despite the name, `cosine_sims` holds cosine *distances* (smaller = closer).
cosine_sims = cosine_distances(mech_df).round(3)
sims_byid = pd.DataFrame(cosine_sims, index=bgg_ids, columns=bgg_ids)
arrays_df_storage[model] = sims_byid

# Ten smallest distances for the spot-check game.
results = [id_game_map[bgg_id] for bgg_id in sims_byid[game_id].sort_values(ascending=True).iloc[:10].index]
results

Sanity Checks

In [None]:
# Ids of the games named in `results`.
results_ids = [bgg_id for name, bgg_id in game_id_map.items() if name in results]
tester_df = mech_df[mech_df.index.isin(results_ids)]
# Keep only columns whose total exceeds n_rows * 0.01 + 0.001.
matching_columns = [col for col in tester_df.columns if tester_df[col].sum() > (tester_df.shape[0]*.01 + .001)]
# assign() builds a new frame; writing the column into a slice of tester_df
# raised SettingWithCopyWarning.
only_matches = tester_df[matching_columns].assign(name=lambda frame: frame.index.map(id_game_map))
names = only_matches['name'].values
only_matches

In [None]:
# Pairwise cosine distances between the sampled games, labelled by name and
# rounded to 3 decimals like the themes check and the stored matrices.
pd.DataFrame(cosine_distances(tester_df[matching_columns]), columns=names, index=names).round(3)

### Pros Cons

In [None]:
model = "pros_cons_only"

# Despite the name, `cosine_sims` holds cosine *distances* (smaller = closer).
cosine_sims = cosine_distances(rag_pros_cons_only_df).round(3)
sims_byid = pd.DataFrame(cosine_sims, index=bgg_ids, columns=bgg_ids)
arrays_df_storage[model] = sims_byid

# Ten smallest distances for the spot-check game.
results = [id_game_map[bgg_id] for bgg_id in sims_byid[game_id].sort_values(ascending=True).iloc[:10].index]
results

Sanity Checks

In [None]:
# Ids of the games named in `results`.
results_ids = [bgg_id for name, bgg_id in game_id_map.items() if name in results]
tester_df = rag_pros_cons_only_df[rag_pros_cons_only_df.index.isin(results_ids)]
# Keep only columns whose total exceeds n_rows * 0.01 + 0.001.
matching_columns = [col for col in tester_df.columns if tester_df[col].sum() > (tester_df.shape[0]*.01 + .001)]
# assign() builds a new frame; writing the column into a slice of tester_df
# raised SettingWithCopyWarning.
only_matches = tester_df[matching_columns].assign(name=lambda frame: frame.index.map(id_game_map))
names = only_matches['name'].values
only_matches

In [None]:
# Pairwise cosine distances between the sampled games, labelled by name and
# rounded to 3 decimals like the themes check and the stored matrices.
pd.DataFrame(cosine_distances(tester_df[matching_columns]), columns=names, index=names).round(3)

### Weight

In [None]:
model = "weight"
weight_df = games_df[['BGGId', 'GameWeight']].set_index("BGGId").round(2)

# Entry (i, j) is |weight_i - weight_j|, labelled by game id on both axes.
matrix = pd.DataFrame(
    np.abs(
        np.subtract.outer(
            weight_df['GameWeight'].to_numpy(),
            weight_df['GameWeight'].to_numpy(),
        )
    ),
    index=weight_df.index,
    columns=weight_df.index,
)

arrays_df_storage[model] = matrix

# Ten smallest weight differences for the spot-check game.
results = [id_game_map[bgg_id] for bgg_id in matrix[game_id].sort_values(ascending=True).iloc[:10].index]
results

Sanity Checks

In [None]:
# Ids of the games named in `results`.
results_ids = [bgg_id for name, bgg_id in game_id_map.items() if name in results]
tester_df = weight_df[weight_df.index.isin(results_ids)]
# Keep only columns whose total exceeds n_rows * 0.01 + 0.001.
matching_columns = [col for col in tester_df.columns if tester_df[col].sum() > (tester_df.shape[0]*.01 + .001)]
# assign() builds a new frame; writing the column into a slice of tester_df
# can raise SettingWithCopyWarning.
only_matches = tester_df[matching_columns].assign(name=lambda frame: frame.index.map(id_game_map))
names = only_matches['name'].values
only_matches.sort_values("GameWeight")

### Game Family

In [None]:
model = "game_family"

# Zeros become 0.01, as in the themes and mechanics models.
family_df = explode_columnar_df(games_df[['BGGId', 'Family']]).replace(0, 0.01)

# This matrix is labelled with the exploded frame's own index, not `bgg_ids`.
family_bgg_ids = family_df.index

# Despite the name, `cosine_sims` holds cosine *distances* (smaller = closer).
cosine_sims = cosine_distances(family_df).round(3)
sims_byid = pd.DataFrame(cosine_sims, index=family_bgg_ids, columns=family_bgg_ids)
arrays_df_storage[model] = sims_byid

# Ten smallest distances for the spot-check game.
results = [id_game_map[bgg_id] for bgg_id in sims_byid[game_id].sort_values(ascending=True).iloc[:10].index]
results

Sanity Checks

In [None]:
# Ids of the games named in `results`.
results_ids = [bgg_id for name, bgg_id in game_id_map.items() if name in results]
tester_df = family_df[family_df.index.isin(results_ids)]
# Keep only columns whose total exceeds n_rows * 0.01 + 0.001.
matching_columns = [col for col in tester_df.columns if tester_df[col].sum() > (tester_df.shape[0]*.01 + .001)]
# assign() builds a new frame; writing the column into a slice of tester_df
# raised SettingWithCopyWarning.
only_matches = tester_df[matching_columns].assign(name=lambda frame: frame.index.map(id_game_map))
names = only_matches['name'].values
only_matches

### Various Numerical Attributes

In [None]:
# Numeric attributes used for the "various attributes" model. 'Name' is not
# selected because the original dropped it again; 'YearPublished' stays
# excluded (a GameAgeYears feature derived from it was previously disabled).
various_df = games_df[[
    'BGGId',
    'BestPlayers',
    'MfgPlaytime',
    'BayesAvgRating',
    'Rank:strategygames', 'Rank:abstracts', 'Rank:partygames',
    'Rank:wargames', 'Rank:thematic', 'Rank:familygames',
    'Rank:childrensgames', 'Rank:cgs',
    'GameWeight',
]].copy()

# Rows without a strategy-games rank get their own row position instead
# (games_df is sorted by BayesAvgRating and re-indexed from 0).
mask = various_df['Rank:strategygames'].isna()
various_df.loc[mask, 'Rank:strategygames'] = various_df.index[mask]

various_df = various_df.rename(columns=str.lower)

various_df.head()

In [29]:
# sns.set(font_scale=1)
# games_c = various_df.drop("Name", axis=1).set_index("BGGId").corr()
# # plot a heat map for all correlations in our data set

# # make our figure
# fig, ax = plt.subplots(figsize=(10, 10))

# # we want our heatmap to not show the upper triangle, which is redundant data
# games_c_mask = np.triu(np.ones_like(games_c, dtype=bool))

# # adjust mask and df to hide center diagonal
# games_c_mask = games_c_mask[1:, :-1]
# corr = games_c.iloc[1:, :-1].copy()

# # color map
# cmap = sns.diverging_palette(220, 20, as_cmap=True)

# # plot heatmap
# sns.heatmap(
#     corr,
#     mask=games_c_mask,
#     annot=True,
#     fmt=".2f",
#     cmap=cmap,
#     vmin=-1,
#     vmax=1,
#     cbar_kws={"shrink": 0.8},
#     square=True,
# )

# # yticks rotate
# plt.yticks(rotation=1)

# # title
# title = "CORRELATION MATRIX\nRanking Categories\n"
# plt.title(title, fontsize=14)

# # plt.savefig('images/heatmap.png')

# plt.show()

In [None]:
# Guarded so the cell can be re-run: after the first run `bggid` is already
# the index and an unconditional set_index raised KeyError.
if 'bggid' in various_df.columns:
    various_df = various_df.set_index('bggid')

scaler = MinMaxScaler(feature_range=(0,1))

# Rescale every attribute to [0, 1] independently of the others.
for col in various_df.columns:
    various_df[col] = scaler.fit_transform(various_df[[col]])

# NOTE(review): values still missing after scaling become 0, i.e. the same as
# the column minimum (for the rank columns, the lowest rank number) -- confirm
# that this is the intended treatment of unranked games.
various_df = various_df.fillna(0)

various_df.head(2)

In [None]:
model = "various_attributes"

# Despite the name, `cosine_sims` holds cosine *distances* (smaller = closer).
cosine_sims = cosine_distances(various_df).round(3)
sims_byid = pd.DataFrame(cosine_sims, index=bgg_ids, columns=bgg_ids)
arrays_df_storage[model] = sims_byid

# Ten smallest distances for the spot-check game.
results = [id_game_map[bgg_id] for bgg_id in sims_byid[game_id].sort_values(ascending=True).iloc[:10].index]
results

Sanity Checks

In [None]:
# Ids of the games named in `results`.
results_ids = [bgg_id for name, bgg_id in game_id_map.items() if name in results]
tester_df = various_df[various_df.index.isin(results_ids)]
# Keep only columns whose total exceeds n_rows * 0.01 + 0.001.
matching_columns = [col for col in tester_df.columns if tester_df[col].sum() > (tester_df.shape[0]*.01 + .001)]
# assign() builds a new frame; writing the column into a slice of tester_df
# raised SettingWithCopyWarning.
only_matches = tester_df[matching_columns].assign(name=lambda frame: frame.index.map(id_game_map))
names = only_matches['name'].values
only_matches

In [None]:
# Pairwise cosine distances between the sampled games, labelled by name and
# rounded to 3 decimals like the themes check and the stored matrices.
pd.DataFrame(cosine_distances(tester_df[matching_columns]), columns=names, index=names).round(3)

## Vector DB Models

### About Only

In [34]:
# collection_name = "about_only"

# about_df = rag_cleaned_sentences_df[['bggid','about']].copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True)

# about_df = weaviate_client.add_bgg_collection_batch(df=about_df, collection_name=collection_name, use_about=True)

# about_df.to_pickle(f'{sims_dir}about_df.pkl')

In [None]:
collection_name = "about_only"

# Frame saved when the collection was populated; get_closet_picks reads its
# `bggid` and `UUID` columns.
about_df = pd.read_pickle(f'{sims_dir}about_df.pkl')
models_df_storage[collection_name] = about_df

picks = get_closet_picks(about_df, game_id, model=collection_name, n_picks=10)

# assign() returns a new frame, so no column is written into the slice that
# get_closet_picks returns (which raised SettingWithCopyWarning). Ids missing
# from id_game_map show up as NaN instead of raising KeyError.
picks = picks.assign(name=picks['bggid'].map(id_game_map))

picks

### About and Description

In [36]:
# collection_name = "about_and_desc"

# desc_df = rag_cleaned_sentences_df[['bggid','about', 'description']].copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, use_description=True)

# about_df = weaviate_client.add_bgg_collection_batch(df=desc_df, collection_name=collection_name, use_about=True, use_description=True)

# desc_df.to_pickle(f'{sims_dir}desc_df.pkl')

In [37]:
# collection_name = "about_and_desc"

# desc_df = pd.read_pickle(f'{sims_dir}desc_df.pkl')
# models_df_storage[collection_name] = desc_df

# picks = get_closet_picks(desc_df, game_id, model=collection_name, n_picks=10)

# picks['name'] = [id_game_map[x] for x in picks['bggid']]

# picks

### About and Keywords

In [38]:
# collection_name = "about_and_pros_cons"

# about_and_pros_cons = all_rag_attributes_df.drop(columns=['description']).copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, attributes=all_columns_to_produce)

# about_df = weaviate_client.add_bgg_collection_batch(df=about_and_pros_cons, collection_name=collection_name, use_about=True, attributes=all_columns_to_produce)

# about_and_pros_cons.to_pickle(f'{sims_dir}about_and_pros_cons.pkl')

In [None]:
collection_name = "about_and_pros_cons"

# Frame saved when the collection was populated; get_closet_picks reads its
# `bggid` and `UUID` columns.
about_and_pros_cons = pd.read_pickle(f'{sims_dir}about_and_pros_cons.pkl')
models_df_storage[collection_name] = about_and_pros_cons

picks = get_closet_picks(about_and_pros_cons, game_id, model=collection_name, n_picks=10)

# assign() returns a new frame, so no column is written into the slice that
# get_closet_picks returns (which raised SettingWithCopyWarning). Ids missing
# from id_game_map show up as NaN instead of raising KeyError.
picks = picks.assign(name=picks['bggid'].map(id_game_map))

picks

### With Keywords, About, and Desc

In [40]:
# collection_name = "all_attributes"

# # Optional scaling of attributes
# # scaler = MinMaxScaler(feature_range=(0,1))
# # for col in all_columns_to_produce:
# #     df[col] = scaler.fit_transform(df[[col]])

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, use_description=True, attributes=all_columns_to_produce)

# all_attributes_df = weaviate_client.add_bgg_collection_batch(df=all_rag_attributes_df, collection_name=collection_name, use_about=True, use_description=True, attributes=all_columns_to_produce)

# all_attributes_df.to_pickle(f'{sims_dir}all_attributes_df.pkl')

In [41]:
# collection_name = "all_attributes"

# all_attributes_df = pd.read_pickle(f'{sims_dir}all_attributes_df.pkl')
# models_df_storage[collection_name] = all_attributes_df

# picks = get_closet_picks(all_attributes_df, game_id, model=collection_name, n_picks=10)

# picks['name'] = [id_game_map[x] for x in picks['bggid']]

# picks

### All Comparisons

In [None]:
models_df_storage.keys(), arrays_df_storage.keys()

In [None]:
test_games

In [44]:
# Select the last spot-check game. The loop in the next cell rebinds
# `game_name` on every iteration and also ends on this same value.
game_name = test_games[-1]

In [None]:
# Run the combined comparison for every spot-check game, keyed by game name.
# The loop variable is the notebook-level `game_name`, so it is rebound here.
all_game_results = {}
for game_name in test_games:
    all_game_results[game_name] = build_results_for_game(game_name)

In [None]:
all_game_results["Dominion"]

In [None]:
all_game_results["Gloomhaven"]

In [None]:
all_game_results["Everdell"]

In [None]:
all_game_results["Pandemic"]

In [None]:
all_game_results["Splendor"]

In [None]:
all_game_results["Viticulture Essential Edition"]

In [None]:
all_game_results["Great Western Trail"]

In [None]:
all_game_results["Chess"]

In [None]:
all_game_results["Azul"]

In [None]:
all_game_results["Codenames"]

In [None]:
all_game_results["Pandemic Legacy: Season 1"]