In [3]:
import pandas as pd
from config import CONFIGS
import os
import re
import os
import json
import gc
from typing import Tuple

from utils.processing_functions import load_file_local_first, save_file_local_first
import weaviate
import weaviate.classes as wvc
from pydantic import BaseModel, ConfigDict
from weaviate.classes.config import Configure
from weaviate.classes.query import Filter
from weaviate.classes.query import MetadataQuery
from weaviate.util import generate_uuid5

from sklearn.preprocessing import MinMaxScaler


ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]
IS_LOCAL = True if os.environ.get("IS_LOCAL", "False").lower() == "true" else False

In [4]:


class WeaviateClient(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)
    weaviate_client: weaviate.client = None
    collection: weaviate.collections.Collection = None

    def model_post_init(self, __context):
        self.weaviate_client = self.connect_weaviate_client_docker()

    def connect_weaviate_client_docker(self) -> weaviate.client:
        if not IS_LOCAL:
            client = weaviate.connect_to_local(
                host="127.0.0.1",
                port=8081,
                grpc_port=50051,
                headers={
                    "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"],
                },
            )
            return client

        return weaviate.connect_to_local(
            port=8081,
            headers={
                "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"],
            },
        )
    
    def find_near_objects(self, collection_name, uuid, limit:int =20):
        self.collection = self.weaviate_client.collections.get(collection_name)
        response = self.collection.query.near_object(
            near_object=uuid,
            limit=limit,
            return_metadata=MetadataQuery(distance=True),
        )
        return response.objects
    
    def close_client(self):
        self.weaviate_client.close()
    
    def create_collection(self, collection_name: str, reset=True, use_about=True, attributes:list=[]) -> None:

        if self.weaviate_client.collections.exists(collection_name):
            print("Collection already exists for this block")
            if reset:
                self.weaviate_client.collections.delete(collection_name)
                print("Deleted and recreating collection")
            return
        
        build_properties = [
                wvc.config.Property(
                    name="bggid",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
                wvc.config.Property(
                    name="name",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
            ]
        if use_about:
            build_properties.append(wvc.config.Property(name="about", data_type=wvc.config.DataType.TEXT))
        if len(attributes):
            build_properties+=[wvc.config.Property(
                    name=x, data_type=wvc.config.DataType.NUMBER, vectorize_property_name=False, skip_vectorization=True
                ) for x in attributes]


        self.weaviate_client.collections.create(
            name=collection_name,
            vectorizer_config=[
                Configure.NamedVectors.text2vec_transformers(
                    name="title_vector",
                    source_properties=["title"],
                )
            ],
            properties=build_properties,
        )

    def add_collection_item(self, item:pd.Series, collection_name:str, use_about=True, attributes:list=[]) -> None:

        self.collection = self.weaviate_client.collections.get(collection_name)

        game_object = {
            "bggid": str(item["bggid"]),
            "name": str(item["name"]).lower()
            }

        if use_about:
            game_object.update({"about": str(item["about"]).lower()})
        if len(attributes):
            game_object.update({x: float(item[x]) for x in attributes})
        
        uuid = generate_uuid5(game_object)

        # if self.collection.data.exists(uuid):
        #     return uuid
        # else:
        #     self.collection.data.insert(properties=game_object, uuid=uuid)
        
        self.collection.data.insert(properties=game_object, uuid=uuid)

        return uuid

    def close_client(self):
        self.weaviate_client.close()


weaviate_client = WeaviateClient()

            Please make sure to close the connection using `client.close()`.


### Content Similarity

In [5]:
name_df = pd.read_pickle("data/prod/games/game_dfs_clean/games_clean.pkl")
name_df = name_df[['BGGId','Name']]
df = pd.read_pickle("top_1000_cleaned_rag_with_ratings_extrap.pkl")
df = name_df.merge(df, on="BGGId", how="inner")
df = df.drop(columns=["Description","Positive_Components","Negative_Components","Positive_Sentences","Negative_Sentences"])
df = df.rename(columns={x:x.lower().replace(" ","_") for x in df.columns.tolist()})
df = df.fillna(0)
small_df = df[['bggid',	'name',	'about']]
del name_df
gc.collect()



198

### Match Keywords Only

In [6]:
collection_name = "attributes_only"

In [7]:
keyword_df = df.copy().drop(columns=['about'])

positive_columns = [x for x in keyword_df.columns if x.startswith('positive_')]
negative_columns = [x for x in keyword_df.columns if x.startswith('negative_')]

all_columns_to_produce = positive_columns + negative_columns

scaler = MinMaxScaler(feature_range=(0,.5))

for col in all_columns_to_produce:
    keyword_df[col] = scaler.fit_transform(keyword_df[[col]])

all_columns_to_produce[:5]

['positive_strategic_depth',
 'positive_excellent_design',
 'positive_player_interaction',
 'positive_replayability',
 'positive_theme_integration']

In [8]:
weaviate_client.connect_weaviate_client_docker()
weaviate_client.create_collection(collection_name=collection_name, reset=True, use_about=False, attributes=all_columns_to_produce)

keyword_df['UUID'] = keyword_df.apply(lambda x: weaviate_client.add_collection_item(item=x, collection_name=collection_name, use_about=False, attributes=all_columns_to_produce), axis=1)

            Please make sure to close the connection using `client.close()`.


Collection already exists for this block
Deleted and recreating collection


  keyword_df['UUID'] = keyword_df.apply(lambda x: weaviate_client.add_collection_item(item=x, collection_name=collection_name, use_about=False, attributes=all_columns_to_produce), axis=1)


In [9]:
# uuid = keyword_df.loc[keyword_df['name']=='Pandemic Legacy: Season 1']['UUID'].values[0]
uuid = keyword_df.loc[keyword_df['name']=='Gloomhaven']['UUID'].values[0]
# uuid = keyword_df.loc[keyword_df['name']=='Great Western Trail']['UUID'].values[0]
similars = weaviate_client.find_near_objects(collection_name=collection_name, uuid=uuid, limit=20)

picks = {}

for item in similars:
    picks[str(item.uuid)]=item.metadata.distance
    
picks = pd.DataFrame.from_dict(picks, columns=['distance'], orient='index').sort_values(by='distance', ascending=True).reset_index().rename(columns={'index':'UUID'})

picks = picks.merge(keyword_df, on='UUID', how='inner')[1:]
picks.head()

Unnamed: 0,UUID,distance,bggid,name,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_theme_integration,positive_component_quality,...,positive_high_production_values,negative_catch_mechanics,positive_fast_playtime,negative_interaction_levels,positive_aesthetics,negative_betting_mechanics,positive_positive_interactions,positive_simple_ruleset,positive_theme_artwork,positive_fun_interactions
1,1d67fd86-5e19-5ac5-addc-293f36eb075f,0.250801,295770,Frosthaven,0.059685,0.338486,0.198812,0.261751,0.307074,0.279081,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,fb9a30f4-d1e9-5772-b273-e4334f2c218a,0.352217,156336,Onirim (Second Edition),0.128706,0.341781,0.087924,0.317437,0.0,0.212475,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0caabfb1-a16e-5ceb-8d9f-9af2616f3ee7,0.353354,361545,Twilight Inscription,0.27633,0.0,0.5,0.175675,0.2634,0.204351,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,b7c8d345-b0f5-5f7d-b65e-620dbefdb6f6,0.355034,262211,Cloudspire,0.5,0.295573,0.21225,0.251952,0.335822,0.454335,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,425e3fae-e509-5749-8a40-bbf5cb31b343,0.355323,12333,Twilight Struggle,0.5,0.292255,0.207746,0.3104,0.5,0.097917,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Without attributes

In [10]:
collection_name = "similarities"

small_df.head()

Unnamed: 0,bggid,name,about
0,1,Die Macher,This game revolves around political strategy a...
1,100423,Elder Sign,This game is a cooperative board game that imm...
2,100901,Flash Point: Fire Rescue,This game is a cooperative board game where pl...
3,101721,Mage Wars Arena,This game is a tactical strategy experience th...
4,102652,Sentinels of the Multiverse,This game is a cooperative card game where pla...


In [11]:
weaviate_client.connect_weaviate_client_docker()
weaviate_client.create_collection(collection_name=collection_name, reset=True, use_about=True)

small_df['UUID'] = small_df.apply(lambda x: weaviate_client.add_collection_item(item=x, collection_name=collection_name, use_about=True), axis=1)

            Please make sure to close the connection using `client.close()`.


Collection already exists for this block
Deleted and recreating collection


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_df['UUID'] = small_df.apply(lambda x: weaviate_client.add_collection_item(item=x, collection_name=collection_name, use_about=True), axis=1)


In [12]:
uuid = small_df.loc[small_df['name']=='Pandemic Legacy: Season 1']['UUID'].values[0]
uuid = small_df.loc[small_df['name']=='Gloomhaven']['UUID'].values[0]
uuid = small_df.loc[small_df['name']=='Great Western Trail']['UUID'].values[0]
similars = weaviate_client.find_near_objects(collection_name=collection_name, uuid=uuid, limit=20)

picks = {}

for item in similars:
    picks[str(item.uuid)]=item.metadata.distance
    
picks = pd.DataFrame.from_dict(picks, columns=['distance'], orient='index').sort_values(by='distance', ascending=True).reset_index().rename(columns={'index':'UUID'})

picks = picks.merge(small_df, on='UUID', how='inner')[1:]
picks.head()

Unnamed: 0,UUID,distance,bggid,name,about
1,ea29573d-8e39-5502-b05b-dc639e8d0625,0.132097,341169,Great Western Trail: Second Edition,This game is a strategic board game that combi...
2,158b9223-9652-5128-9d4d-f1cb28fa1c50,0.149783,364011,Great Western Trail: Argentina,This game is a strategic board game that revol...
3,32417ccd-d076-5be2-9392-a907dcb7f1c4,0.208928,380607,Great Western Trail: New Zealand,This game is a strategic board game that revol...
4,b2c4cca7-ba95-539d-8fba-f3c6c139dc8a,0.286462,140620,Lewis & Clark: The Expedition,This game is a strategic adventure set during ...
5,938b010e-c5da-5692-beb4-bf07e0cb0c20,0.29101,390092,Ticket to Ride Legacy: Legends of the West,This game is a legacy-style board game that bu...


### With all attributes

In [13]:
collection_name = "all_attributes"
df.head()

Unnamed: 0,bggid,name,about,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_theme_integration,positive_component_quality,negative_steep_learning_curve,...,positive_high_production_values,negative_catch_mechanics,positive_fast_playtime,negative_interaction_levels,positive_aesthetics,negative_betting_mechanics,positive_positive_interactions,positive_simple_ruleset,positive_theme_artwork,positive_fun_interactions
0,1,Die Macher,This game revolves around political strategy a...,1.0,0.679171,0.852776,0.954712,0.564833,0.681429,0.841166,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,100423,Elder Sign,This game is a cooperative board game that imm...,0.783073,0.642909,0.660516,0.909084,0.566755,0.914048,0.713301,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,100901,Flash Point: Fire Rescue,This game is a cooperative board game where pl...,0.753824,0.666693,0.788914,0.928217,0.0,0.77739,0.725883,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,101721,Mage Wars Arena,This game is a tactical strategy experience th...,1.0,0.699487,0.776928,1.0,0.564833,0.706544,0.871574,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,102652,Sentinels of the Multiverse,This game is a cooperative card game where pla...,0.718493,0.654972,0.845987,1.0,0.0,0.712531,0.749143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
positive_columns = [x for x in df.columns if x.startswith('positive_')]
negative_columns = [x for x in df.columns if x.startswith('negative_')]

all_columns_to_produce = positive_columns + negative_columns

scaler = MinMaxScaler(feature_range=(0,.5))

for col in all_columns_to_produce:
    df[col] = scaler.fit_transform(df[[col]])

all_columns_to_produce[:5]

['positive_strategic_depth',
 'positive_excellent_design',
 'positive_player_interaction',
 'positive_replayability',
 'positive_theme_integration']

In [15]:
weaviate_client.connect_weaviate_client_docker()
weaviate_client.create_collection(collection_name=collection_name, reset=True, use_about=True, attributes=all_columns_to_produce)

df['UUID'] = df.apply(lambda x: weaviate_client.add_collection_item(item=x, collection_name=collection_name, use_about=True, attributes=all_columns_to_produce), axis=1)

            Please make sure to close the connection using `client.close()`.


Collection already exists for this block
Deleted and recreating collection


  df['UUID'] = df.apply(lambda x: weaviate_client.add_collection_item(item=x, collection_name=collection_name, use_about=True, attributes=all_columns_to_produce), axis=1)


In [19]:
uuid = df.loc[df['name']=='Pandemic Legacy: Season 1']['UUID'].values[0]
uuid = df.loc[df['name']=='Gloomhaven']['UUID'].values[0]
similars = weaviate_client.find_near_objects(collection_name=collection_name, uuid=uuid, limit=20)

picks = {}

for item in similars:
    picks[str(item.uuid)]=item.metadata.distance
    
picks = pd.DataFrame.from_dict(picks, columns=['distance'], orient='index').sort_values(by='distance', ascending=True).reset_index().rename(columns={'index':'UUID'})

picks = picks.merge(df, on='UUID', how='inner')
picks.head()

Unnamed: 0,UUID,distance,bggid,name,about,positive_strategic_depth,positive_excellent_design,positive_player_interaction,positive_replayability,positive_theme_integration,...,positive_high_production_values,negative_catch_mechanics,positive_fast_playtime,negative_interaction_levels,positive_aesthetics,negative_betting_mechanics,positive_positive_interactions,positive_simple_ruleset,positive_theme_artwork,positive_fun_interactions
0,2f84f2f0-d994-5325-b074-6f85d21462d7,-1.192093e-07,174430,Gloomhaven,This game is a cooperative tactical adventure ...,0.317556,0.333521,0.111487,0.5,0.336718,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,c989fd18-38a2-5faf-b32d-f63932b847f7,0.1609258,295770,Frosthaven,This game is a cooperative dungeon crawler tha...,0.059685,0.338486,0.198812,0.261751,0.307074,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,e2f0181a-4c20-5abd-aff1-cfd4923474a3,0.1839179,291457,Gloomhaven: Jaws of the Lion,This game is a cooperative dungeon crawler tha...,0.161335,0.277698,0.219685,0.5,0.298665,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,73253b0c-d129-52e6-a05b-2e69a7d67543,0.1891032,150997,Shadows of Brimstone: Swamps of Death,This game is a cooperative dungeon crawler set...,0.169594,0.283825,0.236671,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8f173b94-8b6a-56d0-b3ea-324c2cac6421,0.2011019,286063,The 7th Citadel,This game is a narrative-driven adventure that...,0.123985,0.287421,0.100394,0.335205,0.268172,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
