In [None]:
import pandas as pd
import numpy as np
from config import CONFIGS
import os
import os

from utils.processing_functions import explode_columnar_df
from utils.weaviate_client import WeaviateClient

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

# Runtime settings: environment variables first, then values from the project config.
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]
# Any casing of "true" enables local mode; every other value (or unset) disables it.
IS_LOCAL = os.getenv("IS_LOCAL", "False").lower() == "true"

# Input folders used by the cells below (both are fixed to the prod data tree).
read_dir = "data/prod/games/game_dfs_clean/"
sims_dir = "data/prod/games/similarity_files/"

# Shared vector-DB client used by get_closet_picks.
weaviate_client = WeaviateClient()
weaviate_client.connect_weaviate_client_docker()


In [2]:
def get_closet_picks(df, game_id, model="pros_cons_only", n_picks=10):
    """Return the nearest neighbours of one game from a Weaviate collection.

    Uses the module-level ``weaviate_client`` to run the near-object query.

    Args:
        df: DataFrame whose index holds game ids and which has a ``UUID``
            column (the object id stored in the collection) and a ``bggid``
            column.
        game_id: Index label of the query game in ``df``.
        model: Name of the collection to search.
        n_picks: Maximum number of neighbours to return.

    Returns:
        DataFrame with the columns ``bggid`` and ``distance``, ordered from
        nearest to farthest and indexed from 0. The query game itself is
        never part of the result.

    Raises:
        IndexError: If ``game_id`` is not present in ``df.index``.
    """
    matches = df.loc[df.index == game_id, "UUID"]
    if matches.empty:
        raise IndexError(f"game_id {game_id!r} was not found in the index of df")
    uuid = matches.iloc[0]

    # Ask for one more hit than requested because the query object is expected
    # to be returned as its own nearest neighbour; never go below the previous
    # fixed limit of 50 so existing calls see the same candidate pool.
    similars = weaviate_client.find_near_objects(
        collection_name=model, uuid=uuid, limit=max(50, n_picks + 1)
    )

    distance_by_uuid = {str(item.uuid): item.metadata.distance for item in similars}
    picks = pd.DataFrame(
        {
            "UUID": list(distance_by_uuid.keys()),
            "distance": list(distance_by_uuid.values()),
        }
    )

    # Remove the query game by identity instead of dropping "the first row":
    # with tied distances (or a query object missing from the hits) the first
    # row is not guaranteed to be the query game.
    picks = picks[picks["UUID"] != str(uuid)].sort_values("distance", kind="stable")

    picks = picks.merge(df, on="UUID", how="inner")

    return picks[["bggid", "distance"]].head(n_picks)

# Content Similarity Models

### File Setup for All Models

In [None]:
# Keep only the 1000 games with the highest Bayesian average rating.
games_df = (
    pd.read_pickle(f"{read_dir}games_clean.pkl")
    .sort_values("BayesAvgRating", ascending=False)
    .head(1000)
    .reset_index(drop=True)
)
games_df.head(2)

In [None]:
# Lookup from BGG id to game name, plus the id ordering shared by every matrix below.
id_name_map = dict(zip(games_df['BGGId'], games_df['Name']))
bgg_ids = games_df['BGGId'].values
# One-column frame used as the left side of merges so every model covers the same games.
relevant_df = games_df[['BGGId']]
relevant_df.head(3)


In [None]:
# Games used to spot-check each similarity model.
test_games = [
    "Dominion",
    "Gloomhaven",
    "Pandemic",
    "Splendor",
    "Viticulture Essential Edition",
    "Great Western Trail",
    "Terraforming Mars",
    "Chess",
    "Azul",
    "Codenames",
    "Pandemic Legacy: Season 1",
]

# Results are collected here: vector-DB frames by collection, pairwise matrices by model.
models_df_storage = {}
arrays_df_storage = {}

game_name = test_games[7]  # index 7 -> "Chess"
game_id = int(games_df.loc[games_df['Name'] == game_name, 'BGGId'].values[0])
game_name, game_id

In [None]:
# Attach the cleaned RAG output to the selected games and drop the raw text columns.
rag_cleaned_sentences_df = (
    relevant_df
    .merge(pd.read_pickle(f'{sims_dir}top_1000_cleaned_rag.pkl'), on="BGGId", how="left")
    .drop(
        columns=[
            "Positive_Components",
            "Positive_Sentences",
            "Negative_Components",
            "Negative_Sentences",
        ]
    )
)
rag_cleaned_sentences_df.head(2)

In [None]:
# Load the rated attribute columns, normalise their names, then join them onto the cleaned frame.
all_rag_attributes_df = rag_cleaned_sentences_df.merge(
    pd.read_pickle(f"{sims_dir}top_1000_cleaned_rag_with_ratings_extrap.pkl")
    .drop(
        columns=[
            "Description",
            "About",
            "Positive_Components",
            "Negative_Components",
            "Positive_Sentences",
            "Negative_Sentences",
        ]
    )
    # Spaces in column names become underscores.
    .pipe(lambda frame: frame.set_axis(frame.columns.str.replace(" ", "_"), axis=1)),
    on="BGGId",
    how="left",
)
all_rag_attributes_df.head(2)

In [None]:
# Numeric-only view: drop the free-text columns, treat missing attributes as 0, index by game id.
rag_pos_neg_df = (
    all_rag_attributes_df
    .drop(columns=["About", "Description"])
    .fillna(0)
    .set_index("BGGId")
)
rag_pos_neg_df.head(2)

In [None]:
# Split the attribute columns by their prefix.
positive_columns = [col for col in rag_pos_neg_df.columns if col.startswith('positive_')]
negative_columns = [col for col in rag_pos_neg_df.columns if col.startswith('negative_')]

# Positive attributes first, then negative ones.
all_columns_to_produce = [*positive_columns, *negative_columns]
all_columns_to_produce[:5]

## Array Models

### Themes

In [None]:
model = "themes"

# One row per selected game; zeros and missing values both become 0.01.
themes_df = (
    relevant_df
    .merge(
        explode_columnar_df(pd.read_pickle(f"{read_dir}themes_clean.pkl")),
        on='BGGId',
        how='left',
    )
    .set_index('BGGId')
    .astype(float)
    .replace(0, 0.01)
    .fillna(0.01)
)
themes_df.head()

In [None]:
# Ids of the spot-check games, used to inspect their theme rows.
check_items = [bgg_id for bgg_id, name in id_name_map.items() if name in test_games]
themes_df.loc[themes_df.index.isin(check_items)]

In [None]:
# Store cosine *distances* (1 - similarity), like every other entry in
# arrays_df_storage. The comparison cells sort each stored matrix ascending and
# treat small values as "close"; a similarity matrix stored here would make the
# themes model contribute its least similar games instead.
cosine_sims = cosine_distances(themes_df).round(3)

sims_byid = pd.DataFrame(cosine_sims, columns=bgg_ids, index=bgg_ids)

arrays_df_storage[model] = sims_byid

# Nine nearest games by theme; position 0 is skipped because it is expected to be the query game.
[id_name_map[x] for x in (sims_byid[game_id].sort_values(ascending=True)[1:10].index)]

### Categories Only

In [24]:
# TODO: Add this to the game cleaning script

# category_df = df[['BGGId','Cat:Thematic', 'Cat:Strategy', 'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract', 'Cat:Party', 'Cat:Childrens']]
# subcats_df = pd.read_pickle(f"{read_dir}subcategories_clean.pkl")
# subcats_df = explode_columnar_df(subcats_df).reset_index()
# category_df = category_df.merge(subcats_df, on="BGGId", how="left")
# category_df = category_df.set_index("BGGId")
# category_df.to_pickle(f"{read_dir}categories_clean.pkl")
# category_df.head()

In [None]:
model = "categories"

# One row per selected game; zeros and missing values both become 0.01.
cat_df = (
    relevant_df
    .merge(pd.read_pickle(f"{read_dir}categories_clean.pkl"), on='BGGId', how='left')
    .set_index('BGGId')
    .astype(float)
    .replace(0, 0.01)
    .fillna(0.01)
)
cat_df.head()

In [None]:
# Pairwise cosine distance between category vectors, labelled by BGG id on both axes.
cosine_sims = cosine_distances(cat_df).round(3)
sims_byid = pd.DataFrame(data=cosine_sims, index=bgg_ids, columns=bgg_ids)
arrays_df_storage[model] = sims_byid

# Names of the ten smallest-distance entries for the query game.
[id_name_map[bgg_id] for bgg_id in sims_byid[game_id].sort_values().head(10).index]


### Mechanics Only

In [None]:
model = "mechanics"

# One row per selected game; zeros and missing values both become 0.01.
mech_df = (
    relevant_df
    .merge(
        explode_columnar_df(pd.read_pickle(f"{read_dir}mechanics_clean.pkl")),
        on='BGGId',
        how='left',
    )
    .set_index('BGGId')
    .astype(float)
    .replace(0, 0.01)
    .fillna(0.01)
)
mech_df.head()

In [None]:
# Pairwise cosine distance between mechanics vectors, labelled by BGG id on both axes.
cosine_sims = cosine_distances(mech_df).round(3)
sims_byid = pd.DataFrame(data=cosine_sims, index=bgg_ids, columns=bgg_ids)
arrays_df_storage[model] = sims_byid

# Names of the ten smallest-distance entries for the query game.
[id_name_map[bgg_id] for bgg_id in sims_byid[game_id].sort_values().head(10).index]


### Pos/Neg

In [None]:
# Key under which the positive/negative attribute matrix is stored.
model = "pos_neg"

rag_pos_neg_df.head(5)

In [None]:
# Pairwise cosine distance between positive/negative attribute vectors.
cosine_sims = cosine_distances(rag_pos_neg_df).round(3)
sims_byid = pd.DataFrame(data=cosine_sims, index=bgg_ids, columns=bgg_ids)
arrays_df_storage[model] = sims_byid

# Names of the ten smallest-distance entries for the query game.
[id_name_map[bgg_id] for bgg_id in sims_byid[game_id].sort_values().head(10).index]

### Weight

In [None]:
# Single-column frame of game weight, indexed by BGG id.
weight_df = games_df.set_index("BGGId")[["GameWeight"]]

weight_df.head(10)

In [None]:
# Give this model its own key. Without it `model` still holds the value set in
# the Pos/Neg section, and the weight matrix would overwrite that entry.
model = "weight"

# Pairwise absolute difference in GameWeight: entry (i, j) is |w_i - w_j|.
weights = weight_df['GameWeight'].to_numpy()
matrix = pd.DataFrame(
    np.abs(weights[:, None] - weights),
    index=weight_df.index,
    columns=weight_df.index
)

arrays_df_storage[model] = matrix

# Names of the ten games closest in weight to the query game.
[id_name_map[x] for x in (matrix[game_id].sort_values(ascending=True)[:10].index)]

## Vector DB Models

### About Only

In [None]:
# collection_name = "about_only"

# about_df = rag_df[['bggid','about']].copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True)

# about_df = weaviate_client.add_bgg_collection_batch(df=about_df, collection_name=collection_name, use_about=True)

# about_df.to_pickle(f'{sims_dir}about_df.pkl')

In [None]:
collection_name = "about_only"

# Frame saved when the collection was built; kept for the comparison section.
about_df = pd.read_pickle(f'{sims_dir}about_df.pkl')
models_df_storage[collection_name] = about_df

picks = get_closet_picks(about_df, game_id, model=collection_name, n_picks=10)

# Add readable names next to the ids.
picks = picks.assign(name=[id_name_map[bgg_id] for bgg_id in picks['bggid']])

picks

### About and Description

In [None]:
# collection_name = "about_and_desc"

# desc_df = rag_df[['bggid','about', 'description']].copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, use_description=True)

# about_df = weaviate_client.add_bgg_collection_batch(df=desc_df, collection_name=collection_name, use_about=True, use_description=True)

# desc_df.to_pickle(f'{sims_dir}desc_df.pkl')

In [None]:
collection_name = "about_and_desc"

# Frame saved when the collection was built; kept for the comparison section.
desc_df = pd.read_pickle(f'{sims_dir}desc_df.pkl')
models_df_storage[collection_name] = desc_df

picks = get_closet_picks(desc_df, game_id, model=collection_name, n_picks=10)

# Add readable names next to the ids.
picks = picks.assign(name=[id_name_map[bgg_id] for bgg_id in picks['bggid']])

picks

### About and Keywords

In [49]:
# collection_name = "about_and_pros_cons"

# about_and_pros_cons = rag_df.drop(columns=['description']).copy()

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, attributes=all_columns_to_produce)

# about_df = weaviate_client.add_bgg_collection_batch(df=about_and_pros_cons, collection_name=collection_name, use_about=True, attributes=all_columns_to_produce)

# about_and_pros_cons.to_pickle(f'{sims_dir}about_and_pros_cons.pkl')

In [None]:
collection_name = "about_and_pros_cons"

# Frame saved when the collection was built; kept for the comparison section.
about_and_pros_cons = pd.read_pickle(f'{sims_dir}about_and_pros_cons.pkl')
models_df_storage[collection_name] = about_and_pros_cons

picks = get_closet_picks(about_and_pros_cons, game_id, model=collection_name, n_picks=10)

# Add readable names next to the ids.
picks = picks.assign(name=[id_name_map[bgg_id] for bgg_id in picks['bggid']])

picks

### With Keywords, About, and Desc

In [None]:
# collection_name = "all_attributes"

# Optional scaling of attributes
# scaler = MinMaxScaler(feature_range=(0,1))
# for col in all_columns_to_produce:
#     df[col] = scaler.fit_transform(df[[col]])

# weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, use_description=True, attributes=all_columns_to_produce)

# all_attributes_df = weaviate_client.add_bgg_collection_batch(df=rag_df, collection_name=collection_name, use_about=True, use_description=True, attributes=all_columns_to_produce)

# all_attributes_df.to_pickle(f'{sims_dir}all_attributes_df.pkl')

In [None]:
collection_name = "all_attributes"

# Frame saved when the collection was built; kept for the comparison section.
all_attributes_df = pd.read_pickle(f'{sims_dir}all_attributes_df.pkl')
models_df_storage[collection_name] = all_attributes_df

picks = get_closet_picks(all_attributes_df, game_id, model=collection_name, n_picks=10)

# Add readable names next to the ids.
picks = picks.assign(name=[id_name_map[bgg_id] for bgg_id in picks['bggid']])

picks

### All Comparisons

In [None]:
# Collection names of the vector-DB models loaded above.
models_df_storage.keys()

In [None]:
# Model names that have a pairwise matrix stored above.
arrays_df_storage.keys()

In [None]:
# Spot-check games available for the comparison below.
test_games

In [35]:
# Query game for the comparison (index 7 of test_games).
game_name = test_games[7]

# Nested results: game name -> model name -> distances.
test_results = {game_name: {}}

In [None]:
# Record {bggid: distance} of the ten nearest picks from every vector-DB model.
for collection_name, model_df in models_df_storage.items():
    print(f"Model {collection_name}")

    name_matches = games_df.loc[games_df['Name'] == game_name, 'BGGId']
    game_id = int(name_matches.values[0])

    picks = get_closet_picks(model_df, game_id, model=collection_name, n_picks=10)

    test_results[game_name][collection_name] = dict(zip(picks['bggid'], picks['distance']))

# Shows the picks of the last model processed.
picks

In [None]:
# Record the 29 smallest values after position 0 from every pairwise-matrix model.
for model, model_df in arrays_df_storage.items():
    print(f"Model {model}")

    name_matches = games_df.loc[games_df['Name'] == game_name, 'BGGId']
    game_id = int(name_matches.values[0])

    # Position 0 of the ascending sort is skipped; it is expected to be the query game itself.
    picks = model_df[game_id].sort_values().iloc[1:30]

    test_results[game_name][model] = picks

In [None]:
# One column per model, one row per candidate game id.
this_game_results = test_results[game_name]
results_df = pd.DataFrame(this_game_results)

# Rescale every model's distances to [0, 1] independently so they can be averaged.
scaler = MinMaxScaler(feature_range=(0, 1))
for col in results_df.columns:
    results_df[col] = scaler.fit_transform(results_df[[col]])

# A game a model did not return gets the worst scaled distance (1) for that model.
results_df = results_df.fillna(1).round(3)
results_df = results_df.assign(average_score=results_df.mean(axis=1).round(2))

results_df['name'] = results_df.index.map(id_name_map)

# Lowest average distance first.
results_df.sort_values("average_score").head(20)

In [None]:
# Intentional stop before the supplementary cells. A bare `break` outside a loop
# is a SyntaxError; raising explicitly halts execution with a readable reason.
raise RuntimeError("Stop here: the cells below are supplementary and are not part of the main run.")

### Supplementary Data

In [None]:
# NOTE(review): `df` is not defined in any cell visible here — confirm which frame is meant before running.
family_df = df.loc[:, ['BGGId', 'Family']]
family_df.to_pickle(f"{read_dir}family.pkl")


# Columns removed from the name frame.
drop_columns = [
    'Description',
    'ImagePath',
    'NumAlternates',
    'NumExpansions',
    'NumImplementations',
    'IsReimplementation',
    'Rank:boardgame',
    'GoodPlayers',
    'YearPublished',
    'StdDev',
    'NumOwned',
    'NumWant',
    'NumWish',
    'NumWeightVotes',
    'ComMinPlaytime',
    'ComMaxPlaytime',
    'MfgAgeRec',
    'NumUserRatings',
    'ComAgeRec',
    'Family',
    'LanguageEase',
    'Rank:strategygames',
    'Rank:abstracts',
    'Rank:partygames',
    'Rank:wargames',
    'Rank:thematic',
    'Rank:familygames',
    'Rank:childrensgames',
    'Rank:cgs',
    'AvgRating',
    'Kickstarted',
    'Cat:Thematic',
    'Cat:Strategy',
    'Cat:War',
    'Cat:Family',
    'Cat:CGS',
    'Cat:Abstract',
    'Cat:Party',
    'Cat:Childrens',
    'BestPlayers',
    'MinPlayers',
    'MaxPlayers',
    'MfgPlaytime',
]
# NOTE(review): `name_df` is not defined in any cell visible here — confirm its source before running.
name_df = name_df.drop(columns=drop_columns)

name_df.head()