In [1]:
import pandas as pd
from config import CONFIGS
import os
import re
import os
import json
import gc
from typing import Tuple

from utils.processing_functions import load_file_local_first, save_file_local_first, explode_columnar_df
import weaviate
import weaviate.classes as wvc
from pydantic import BaseModel, ConfigDict
from weaviate.classes.config import Configure
from weaviate.classes.query import Filter
from weaviate.classes.query import MetadataQuery
from weaviate.util import generate_uuid5

from sklearn.preprocessing import MinMaxScaler


ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]
IS_LOCAL = True if os.environ.get("IS_LOCAL", "False").lower() == "true" else False

read_dir = "data/prod/games/game_dfs_clean/"

In [2]:
class WeaviateClient(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    weaviate_client: weaviate.client = None
    collection: weaviate.collections.Collection = None

    def model_post_init(self, __context):
        self.weaviate_client = self.connect_weaviate_client_docker()

    def connect_weaviate_client_docker(self) -> weaviate.client:
        if not IS_LOCAL:
            client = weaviate.connect_to_local(
                host="127.0.0.1",
                port=8081,
                grpc_port=50051,
                headers={
                    "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"],
                },
            )
            return client

        return weaviate.connect_to_local(
            port=8081,
            headers={
                "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"],
            },
        )
    
    def find_near_objects(self, collection_name, uuid, limit:int =20):
        self.collection = self.weaviate_client.collections.get(collection_name)
        response = self.collection.query.near_object(
            near_object=uuid,
            limit=limit,
            return_metadata=MetadataQuery(distance=True),
        )
        return response.objects
    
    def close_client(self):
        self.weaviate_client.close()
    
    def create_bgg_collection(self, collection_name: str, reset=True, use_about=True, attributes:list=[]) -> None:

        if self.weaviate_client.collections.exists(collection_name):
            print("Collection already exists for this block")
            if reset:
                self.weaviate_client.collections.delete(collection_name)
                print("Deleted and recreating collection")
            return
        
        build_properties = [
                wvc.config.Property(
                    name="bggid",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
                wvc.config.Property(
                    name="name",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
            ]
        if use_about:
            build_properties.append(wvc.config.Property(name="about", data_type=wvc.config.DataType.TEXT))
        if len(attributes):
            build_properties+=[wvc.config.Property(
                    name=x, data_type=wvc.config.DataType.NUMBER, vectorize_property_name=False, skip_vectorization=True
                ) for x in attributes]


        self.weaviate_client.collections.create(
            name=collection_name,
            vectorizer_config=[
                Configure.NamedVectors.text2vec_transformers(
                    name="title_vector",
                    source_properties=["title"],
                )
            ],
            properties=build_properties,
        )

    def add_bgg_collection_batch(self, df:pd.DataFrame, collection_name:str, use_about=True, attributes:list=[]) -> None:

        self.collection = self.weaviate_client.collections.get(collection_name)
        uuids = []

        with self.collection.batch.dynamic() as batch:

            for index,item in df.iterrows():

                game_object = {
                    "bggid": str(item["bggid"]),
                    "name": str(item["name"]).lower()
                    }
                if use_about:
                    game_object.update({"about": str(item["about"]).lower()})
                if len(attributes):
                    game_object.update({x: float(item[x]) for x in attributes})
                
                uuid = generate_uuid5(game_object)
                uuids.append(uuid)

                if self.collection.data.exists(uuid):
                    continue
                else:
                    batch.add_object(properties=game_object, uuid=uuid)               
        
        df["UUID"] = uuids
        return df
    

    def close_client(self):
        self.weaviate_client.close()


weaviate_client = WeaviateClient()

  warn(


### Content Similarity

In [3]:
name_df = pd.read_pickle(f"{read_dir}games_clean.pkl")
name_df.columns

Index(['BGGId', 'Name', 'Description', 'ImagePath', 'NumAlternates',
       'NumExpansions', 'NumImplementations', 'IsReimplementation',
       'Rank:boardgame', 'BestPlayers', 'GoodPlayers', 'YearPublished',
       'MinPlayers', 'MaxPlayers', 'AvgRating', 'BayesAvgRating', 'StdDev',
       'NumOwned', 'NumWant', 'NumWish', 'NumWeightVotes', 'GameWeight',
       'MfgPlaytime', 'ComMinPlaytime', 'ComMaxPlaytime', 'MfgAgeRec',
       'NumUserRatings', 'ComAgeRec', 'LanguageEase', 'Family', 'Kickstarted',
       'Rank:strategygames', 'Rank:abstracts', 'Rank:partygames',
       'Rank:wargames', 'Rank:thematic', 'Rank:familygames',
       'Rank:childrensgames', 'Rank:cgs', 'Cat:Thematic', 'Cat:Strategy',
       'Cat:War', 'Cat:Family', 'Cat:CGS', 'Cat:Abstract', 'Cat:Party',
       'Cat:Childrens'],
      dtype='object')

In [4]:
mech_df = pd.read_pickle(f"{read_dir}mechanics_clean.pkl")
mech_df.head()

Unnamed: 0,BGGId,mechanic
0,1,Simultaneous Action Selection
1,1,Area Majority / Influence
2,1,Alliances
3,1,Negotiation
4,1,Dice Rolling


In [5]:
mech_df = explode_columnar_df(mech_df)
mech_df.head()

mechanic,BGGId,Acting,Action / Dexterity,Action / Event,Action Points,Action Queue,Action Retrieval,Action Timer,Advantage Token,Alliances,Area Majority / Influence,Area Movement,Area-Impulse,Auction or Bidding,Automatic Resource Growth,Betting and Bluffing,Bias,Bids As Wagers,Bingo,Bribery,Campaign / Battle Card Driven,Card Play Conflict Resolution,Catch the Leader,Chaining,Chit-Pull System,Command Cards,Commodity Speculation,Communication Limits,Connections,Constrained Bidding,Contracts,Cooperative Game,Crayon Rail System,Critical Hits and Failures,Cube Tower,Deck Construction,"Deck, Bag, and Pool Building",Deduction,Delayed Purchase,Dice Rolling,...,Secret Unit Deployment,Selection Order Bid,Semi-Cooperative Game,Set Collection,Simulation,Simultaneous Action Selection,Singing,Single Loser Game,Slide / Push,Solo / Solitaire Game,Speed Matching,Spelling,Square Grid,Stacking and Balancing,Stat Check Resolution,Static Capture,Stock Holding,Storytelling,Sudden Death Ending,Tableau Building,Tags,Take That,Targeted Clues,Team-Based Game,Tech Trees / Tech Tracks,Territory Building,Three Dimensional Movement,Tile Placement,Track Movement,Trading,Traitor Game,Trick-taking,Tug of War,Variable Phase Order,Variable Player Powers,Variable Set-up,Victory Points as a Resource,Voting,Worker Placement,Zone of Control
0,1,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0


In [9]:
name_df = pd.read_pickle(f"{read_dir}games_clean.pkl")
name_df = name_df[['BGGId','Name']]
df = pd.read_pickle("top_1000_cleaned_rag_with_ratings_extrap.pkl")
df = name_df.merge(df, on="BGGId", how="inner")
df = df.drop(columns=["Description","Positive_Components","Negative_Components","Positive_Sentences","Negative_Sentences"])
df = df.rename(columns={x:x.lower().replace(" ","_") for x in df.columns.tolist()})
df = df.fillna(0)
small_df = df[['bggid',	'name',	'about']]
del name_df
gc.collect()

354

### Match Keywords Only

In [15]:
collection_name = "attributes"

In [19]:
keyword_df = df.copy().drop(columns=['about'])
keyword_df.head()

Unnamed: 0,bggid,name,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_theme_integration,positive_component_quality,negative_steep_learning_curve,negative_fiddly_mechanics,...,positive_high_production_values,negative_catch_mechanics,positive_fast_playtime,negative_interaction_levels,positive_aesthetics,negative_betting_mechanics,positive_positive_interactions,positive_simple_ruleset,positive_theme_artwork,positive_fun_interactions
0,1,Die Macher,1.0,0.679171,0.852776,0.954712,0.564833,0.681429,0.841166,0.757878,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100423,Elder Sign,0.783073,0.642909,0.660516,0.909084,0.566755,0.914048,0.713301,0.742889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100901,Flash Point: Fire Rescue,0.753824,0.666693,0.788914,0.928217,0.0,0.77739,0.725883,0.949124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,101721,Mage Wars Arena,1.0,0.699487,0.776928,1.0,0.564833,0.706544,0.871574,0.705961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,102652,Sentinels of the Multiverse,0.718493,0.654972,0.845987,1.0,0.0,0.712531,0.749143,0.70932,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
positive_columns = [x for x in keyword_df.columns if x.startswith('positive_')]
negative_columns = [x for x in keyword_df.columns if x.startswith('negative_')]

all_columns_to_produce = positive_columns + negative_columns

# scaler = MinMaxScaler(feature_range=(0,10))

# for col in all_columns_to_produce:
#     keyword_df[col] = scaler.fit_transform(keyword_df[[col]])

all_columns_to_produce[:5]

keyword_df.head()

Unnamed: 0,bggid,name,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_theme_integration,positive_component_quality,negative_steep_learning_curve,negative_fiddly_mechanics,...,positive_high_production_values,negative_catch_mechanics,positive_fast_playtime,negative_interaction_levels,positive_aesthetics,negative_betting_mechanics,positive_positive_interactions,positive_simple_ruleset,positive_theme_artwork,positive_fun_interactions
0,1,Die Macher,1.0,0.679171,0.852776,0.954712,0.564833,0.681429,0.841166,0.757878,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100423,Elder Sign,0.783073,0.642909,0.660516,0.909084,0.566755,0.914048,0.713301,0.742889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100901,Flash Point: Fire Rescue,0.753824,0.666693,0.788914,0.928217,0.0,0.77739,0.725883,0.949124,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,101721,Mage Wars Arena,1.0,0.699487,0.776928,1.0,0.564833,0.706544,0.871574,0.705961,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,102652,Sentinels of the Multiverse,0.718493,0.654972,0.845987,1.0,0.0,0.712531,0.749143,0.70932,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
weaviate_client.connect_weaviate_client_docker()
weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=False, attributes=all_columns_to_produce)

keyword_df = weaviate_client.add_bgg_collection_batch(df=keyword_df, collection_name=collection_name, use_about=False, attributes=all_columns_to_produce)

            Please make sure to close the connection using `client.close()`.


Collection already exists for this block
Deleted and recreating collection




In [22]:
# uuid = keyword_df.loc[keyword_df['name']=='Pandemic Legacy: Season 1']['UUID'].values[0]
uuid = keyword_df.loc[keyword_df['name']=='Gloomhaven']['UUID'].values[0]
# uuid = keyword_df.loc[keyword_df['name']=='Great Western Trail']['UUID'].values[0]
similars = weaviate_client.find_near_objects(collection_name=collection_name, uuid=uuid, limit=20)

picks = {}

for item in similars:
    picks[str(item.uuid)]=item.metadata.distance
    
picks = pd.DataFrame.from_dict(picks, columns=['distance'], orient='index').sort_values(by='distance', ascending=True).reset_index().rename(columns={'index':'UUID'})

picks = picks.merge(keyword_df, on='UUID', how='inner')[1:]
picks.head()

Unnamed: 0,UUID,distance,bggid,name,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_theme_integration,positive_component_quality,...,positive_high_production_values,negative_catch_mechanics,positive_fast_playtime,negative_interaction_levels,positive_aesthetics,negative_betting_mechanics,positive_positive_interactions,positive_simple_ruleset,positive_theme_artwork,positive_fun_interactions
1,ea348ab1-2290-5e76-b40c-414dbaa0f2fa,0.474042,156336,Onirim (Second Edition),0.724251,0.683562,0.649789,0.869044,0.0,0.748381,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2de99977-63af-59d7-87a3-6bacd1dabd7e,0.484817,163412,Patchwork,0.782137,0.633349,0.777539,1.0,0.625905,0.676372,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,d3bf25a5-14ce-5f55-9863-b123de29f4bd,0.488493,155987,Abyss,1.0,0.683562,0.784182,1.0,0.506414,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,45014bbd-c876-585f-b5e4-2b17d460895c,0.489124,156546,Monikers,0.758232,0.0,0.830274,0.846207,0.0,0.647151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2fdb5cd7-c121-5f1c-8f9b-db786edc508a,0.495072,148228,Splendor,0.760423,0.718305,1.0,1.0,0.711721,0.775869,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Without attributes

In [None]:
collection_name = "similarities"

small_df.head()

In [None]:
weaviate_client.connect_weaviate_client_docker()
weaviate_client.create_bgg_collection(collection_name=collection_name, reset=False, use_about=True)

small_df = weaviate_client.add_bgg_collection_batch(df=small_df, collection_name=collection_name, use_about=True)

In [None]:
uuid = small_df.loc[small_df['name']=='Pandemic Legacy: Season 1']['UUID'].values[0]
uuid = small_df.loc[small_df['name']=='Gloomhaven']['UUID'].values[0]
uuid = small_df.loc[small_df['name']=='Great Western Trail']['UUID'].values[0]
similars = weaviate_client.find_near_objects(collection_name=collection_name, uuid=uuid, limit=20)

picks = {}

for item in similars:
    picks[str(item.uuid)]=item.metadata.distance
    
picks = pd.DataFrame.from_dict(picks, columns=['distance'], orient='index').sort_values(by='distance', ascending=True).reset_index().rename(columns={'index':'UUID'})

picks = picks.merge(small_df, on='UUID', how='inner')[1:]
picks.head()

### With all attributes

In [None]:
collection_name = "all_attributes"
df.head()

In [None]:
positive_columns = [x for x in df.columns if x.startswith('positive_')]
negative_columns = [x for x in df.columns if x.startswith('negative_')]

all_columns_to_produce = positive_columns + negative_columns

scaler = MinMaxScaler(feature_range=(0,.5))

for col in all_columns_to_produce:
    df[col] = scaler.fit_transform(df[[col]])

all_columns_to_produce[:5]

In [None]:
weaviate_client.connect_weaviate_client_docker()
weaviate_client.create_bgg_collection(collection_name=collection_name, reset=True, use_about=True, attributes=all_columns_to_produce)

df = weaviate_client.add_bgg_collection_batch(df=df, collection_name=collection_name, use_about=True, attributes=all_columns_to_produce)

In [None]:
uuid = df.loc[df['name']=='Pandemic Legacy: Season 1']['UUID'].values[0]
uuid = df.loc[df['name']=='Gloomhaven']['UUID'].values[0]
similars = weaviate_client.find_near_objects(collection_name=collection_name, uuid=uuid, limit=20)

picks = {}

for item in similars:
    picks[str(item.uuid)]=item.metadata.distance
    
picks = pd.DataFrame.from_dict(picks, columns=['distance'], orient='index').sort_values(by='distance', ascending=True).reset_index().rename(columns={'index':'UUID'})

picks = picks.merge(df, on='UUID', how='inner')
picks.head()