In [None]:
import pandas as pd
from config import CONFIGS
import os
import re
import os
import json
from typing import Tuple

# from utils.processing_functions import load_file_local_first, save_file_local_first
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer


# Runtime settings: the environment name and the local-run flag come from
# environment variables; bucket and per-domain settings come from CONFIGS.
ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")
IS_LOCAL = os.environ.get("IS_LOCAL", "False").lower() == "true"

S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]

In [2]:
def get_major_and_all_components(component_type:str, df:pd.DataFrame) -> Tuple[list, list, list]:
    """Collect the unique component keywords and sentences for one sentiment.

    Reads the ``{component_type}_Components`` and ``{component_type}_Sentences``
    columns of ``df``; every cell is iterated and its elements are treated as
    strings.

    Args:
        component_type: Column prefix, e.g. ``"Positive"`` or ``"Negative"``.
        df: Frame holding both columns.

    Returns:
        ``(all_components, major_components, sentence_components)``. Each is a
        de-duplicated list with empty strings removed, ordered by word count
        (most words first) and alphabetically within the same word count.
        ``all_components`` is the union of the other two.

    Raises:
        KeyError: If either column is missing from ``df``.
    """

    def _unique_sorted(values: set) -> list:
        # Most words first; the alphabetical tie-break keeps the order stable
        # between runs (plain set iteration order is not).
        return sorted((v for v in values if v != ''), key=lambda v: (-len(v.split()), v))

    unique_components = set()
    for components in df[f'{component_type}_Components']:
        unique_components.update(components)

    # Scan the sentence column once and reuse the set for the combined list.
    unique_sentences = set()
    for sentences in df[f'{component_type}_Sentences']:
        unique_sentences.update(sentences)

    major_components = _unique_sorted(unique_components)
    sentence_components = _unique_sorted(unique_sentences)
    all_components = _unique_sorted(unique_components | unique_sentences)

    return all_components, major_components, sentence_components

### Clean up positives and negatives into vectors

In [None]:
# Load the object stored in top_1000_cleaned_rag.pkl (used as a DataFrame below) and preview it.
# NOTE: unpickling can execute arbitrary code — only read pickle files you trust.
df = pd.read_pickle("top_1000_cleaned_rag.pkl")
df.head()

In [None]:
# Build the three component lists for each sentiment.
all_positives, major_positives, sentence_positives = get_major_and_all_components("Positive", df)
all_negatives, major_negatives, sentence_negatives = get_major_and_all_components("Negative", df)

# Display the size of every list: positives first, then negatives.
tuple(
    len(group)
    for group in (
        all_positives, major_positives, sentence_positives,
        all_negatives, major_negatives, sentence_negatives,
    )
)

In [5]:
# Load the stored positive matches; the context manager closes the file handle,
# which the previous open(...).read() one-liner left open.
with open("positive_matches.json") as matches_file:
    positive_attributes_store = json.load(matches_file)
# negative_attributes_store = json.loads(open("negative_matches.json").read())

In [None]:
# NOTE(review): in the visible notebook keywords_to_iterate is first assigned inside the
# matching loop of a later cell, so this cell raises NameError on a fresh kernel.
len(keywords_to_iterate)

In [None]:
df_as_dict = df.to_dict(orient='records')


def _first_major_key(phrase, major_keys):
    """Return the first entry of ``major_keys`` that ``phrase`` starts with, else ``None``."""
    return next((key for key in major_keys if phrase.startswith(key)), None)


# NOTE: both [:1] slices limit this run to the first game and the first attribute.
for game_entry in df_as_dict[:1]:
    print(f"Preparing BGG ID: {game_entry['BGGId']}")

    tester_initial_positive_components = game_entry['Positive_Components']
    tester_initial_positive_sentences = game_entry['Positive_Sentences']

    # Components already attached to the game get the maximum score.
    for positive_component in tester_initial_positive_components:
        game_entry[f"positive {positive_component}"] = 1

    for evaluation_item in all_positives[:1]:

        # Items the game already lists (as sentence or component) need no inference.
        if (evaluation_item in tester_initial_positive_sentences
                or evaluation_item in tester_initial_positive_components):
            print(f"Skipping {evaluation_item}")
            continue

        print(f"\nEval item or phrase: {evaluation_item}")
        current_major_key = _first_major_key(evaluation_item, major_positives)
        if current_major_key is None:
            # Previously an IndexError from `[...][0]`; report and move on instead.
            print(f"No major component found for {evaluation_item}; skipping")
            continue

        # Create matches based on sentence similarities
        this_item_sentence_matches = positive_attributes_store[evaluation_item]
        sentence_matches_excluding_current_major_key = {
            phrase: distance
            for phrase, distance in this_item_sentence_matches.items()
            if not phrase.startswith(current_major_key)
        }

        keywords_to_iterate = list(sentence_matches_excluding_current_major_key.keys())

        while keywords_to_iterate:
            key_or_phrase = keywords_to_iterate.pop(0)
            this_item_major_key = _first_major_key(key_or_phrase, major_positives)
            if this_item_major_key is None:
                print(f"No major component found for {key_or_phrase}; skipping")
                continue
            column = f"positive {this_item_major_key}"
            # Only the first phrase seen for a major key sets its score.
            if column not in game_entry:
                game_entry[column] = 1 - sentence_matches_excluding_current_major_key[key_or_phrase]
            # Drop every remaining phrase that falls under the same major key.
            keys_to_delete = {x for x in all_positives if x.startswith(this_item_major_key)}
            keywords_to_iterate = [x for x in keywords_to_iterate if x not in keys_to_delete]

        if evaluation_item not in major_positives:
            continue

        # Create matches based on keyword similarities
        this_item_keyword_matches = positive_attributes_store[current_major_key]

        for this_item_major_key, distance in this_item_keyword_matches.items():
            column = f"positive {this_item_major_key}"
            score = 1 - distance
            # Keep the larger of the existing score (if any) and the new one.
            game_entry[column] = max(game_entry.get(column, score), score)



In [None]:
# Debug: show the value key_or_phrase was left with by the matching loop.
key_or_phrase

In [None]:
# Debug: list every major positive component that key_or_phrase starts with.
# An empty list means no major component prefixes this phrase.
this_item_major_key = [x for x in major_positives if key_or_phrase.startswith(x)]
this_item_major_key

In [None]:
# Debug: major positive components containing the substring "collect".
[x for x in major_positives if "collect" in x]

In [None]:
# Inspect the first game record, including any "positive ..." keys the matching loop added to it.
df_as_dict[0]

In [None]:
# Presumably a deliberate stop for "Run All". A bare `break` outside a loop is a
# SyntaxError, so raise explicitly to halt execution before the cells below.
raise SystemExit("Stopping here: run the cells below manually.")

# Create attribute embeddings with Weaviate

In [None]:
import weaviate
import weaviate.classes as wvc
from pydantic import BaseModel, ConfigDict
from weaviate.classes.config import Configure
from weaviate.classes.query import Filter
from weaviate.classes.query import MetadataQuery
from weaviate.util import generate_uuid5

class WeaviateClient(BaseModel):
    """Wrapper around a local Weaviate connection used by this notebook.

    A connection is opened as soon as the model is constructed (see
    ``model_post_init``). The helper methods create collections, insert objects
    under UUIDs produced by ``generate_uuid5`` and run near-object queries.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    # Live client; populated in model_post_init.
    weaviate_client: weaviate.WeaviateClient = None
    # Collection handle most recently fetched by one of the helper methods.
    collection: weaviate.collections.Collection = None

    def model_post_init(self, __context):
        """Open the Weaviate connection right after model construction."""
        self.weaviate_client = self.connect_weaviate_client_docker()

    def connect_weaviate_client_docker(self) -> weaviate.WeaviateClient:
        """Open and return a new connection to the local Weaviate instance on port 8081.

        When ``IS_LOCAL`` is false the host and gRPC port are passed explicitly;
        otherwise the client's defaults are used for both.

        Raises:
            KeyError: If ``OPENAI_API_KEY`` is not set in the environment.
        """
        headers = {
            "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"],
        }
        if not IS_LOCAL:
            return weaviate.connect_to_local(
                host="127.0.0.1",
                port=8081,
                grpc_port=50051,
                headers=headers,
            )

        return weaviate.connect_to_local(
            port=8081,
            headers=headers,
        )

    def find_near_objects(self, collection_name, uuid, limit:int =20):
        """Return up to ``limit`` objects nearest to the object ``uuid``.

        Args:
            collection_name: Collection to query.
            uuid: UUID of the reference object.
            limit: Maximum number of objects to return.

        Returns:
            The ``objects`` of the query response; distance metadata is requested.
        """
        self.collection = self.weaviate_client.collections.get(collection_name)
        response = self.collection.query.near_object(
            near_object=uuid,
            limit=limit,
            return_metadata=MetadataQuery(distance=True),
        )
        return response.objects

    def close_client(self):
        """Close the underlying Weaviate connection."""
        self.weaviate_client.close()

    def add_similarity_collection_item(self, item:pd.Series, collection_name: str = "similarity_collection") -> str:
        """Insert one game into ``collection_name`` unless it is already stored.

        Args:
            item: Row providing ``BGGId``, ``Name``, ``About``, ``Positive`` and
                ``Negative``.
            collection_name: Target collection.

        Returns:
            The UUID generated from the object's properties. It is returned for
            existing objects as well, so callers always get an identifier.
        """
        self.collection = self.weaviate_client.collections.get(collection_name)

        # Single quotes inside the f-string keep this valid before Python 3.12.
        print(f"Adding data for game {item['BGGId']}")

        game_object = {
            "bggid": str(item["BGGId"]),
            "name": str(item["Name"]).lower(),
            # "description": str(item["Description"]).lower(),
            "about": str(item["About"]).lower(),
            "positive": str(item["Positive"]).lower(),
            "negative": str(item["Negative"]).lower(),
        }

        uuid = generate_uuid5(game_object)

        if not self.collection.data.exists(uuid):
            self.collection.data.insert(properties=game_object, uuid=uuid)

        return uuid

    def create_similarity_collection(self, collection_name: str = "similarity_collection") -> None:
        """Create the game similarity collection, replacing any existing one.

        An existing collection of the same name is deleted first and then
        recreated (previously the method returned right after the delete, so
        the collection was never rebuilt).
        """
        if self.weaviate_client.collections.exists(collection_name):
            print("Collection already exists for this block")
            self.weaviate_client.collections.delete(collection_name)
            print("Deleted and recreating collection")

        self.weaviate_client.collections.create(
            name=collection_name,
            vectorizer_config=[
                # NOTE(review): no "title" property is defined on this collection —
                # confirm the named vector is built from the intended properties.
                Configure.NamedVectors.text2vec_transformers(
                    name="title_vector",
                    source_properties=["title"],
                )
            ],
            properties=[
                wvc.config.Property(
                    name="bggid",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
                wvc.config.Property(
                    name="name",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
                # wvc.config.Property(
                #     name="description", data_type=wvc.config.DataType.TEXT
                # ),
                wvc.config.Property(name="about", data_type=wvc.config.DataType.TEXT),
                wvc.config.Property(
                    name="positive", data_type=wvc.config.DataType.TEXT
                ),
                wvc.config.Property(
                    name="negative", data_type=wvc.config.DataType.TEXT
                ),
            ],
        )

    def add_attributes_collection_item(self, attribute:str, collection_name: str = "game_attributes") -> str:
        """Insert one attribute string into ``collection_name`` unless already stored.

        Args:
            attribute: Attribute text, stored as both ``attribute_name`` and
                ``attribute``.
            collection_name: Target collection.

        Returns:
            The UUID generated from the object's properties.
        """
        self.collection = self.weaviate_client.collections.get(collection_name)

        print(f"Adding data for attribute {attribute}")

        attribute_object = {
            "attribute_name": attribute,
            "attribute": attribute,
        }

        uuid = generate_uuid5(attribute_object)

        if not self.collection.data.exists(uuid):
            self.collection.data.insert(properties=attribute_object, uuid=uuid)

        return uuid

    def create_attributes_collection(self, collection_name: str = "game_attributes") -> None:
        """Create an attribute collection, replacing any existing one.

        An existing collection of the same name is deleted first and then
        recreated (previously the method returned right after the delete, so
        the collection was never rebuilt).
        """
        if self.weaviate_client.collections.exists(collection_name):
            print("Collection already exists for this block")
            self.weaviate_client.collections.delete(collection_name)
            print("Deleted and recreating collection")

        self.weaviate_client.collections.create(
            name=collection_name,
            vectorizer_config=[
                # NOTE(review): no "title" property is defined on this collection —
                # confirm the named vector is built from the intended properties.
                Configure.NamedVectors.text2vec_transformers(
                    name="title_vector",
                    source_properties=["title"],
                )
            ],
            properties=[
                wvc.config.Property(
                    name="attribute_name",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
                wvc.config.Property(
                    name="attribute", data_type=wvc.config.DataType.TEXT, vectorize_property_name=False
                ),
            ],
        )


# Constructing the wrapper connects immediately: model_post_init calls
# connect_weaviate_client_docker(), which reads OPENAI_API_KEY from the environment.
weaviate_client = WeaviateClient()

In [None]:
# Re-read the same pickle that is loaded earlier in the notebook and preview it.
# NOTE: unpickling can execute arbitrary code — only read pickle files you trust.
df = pd.read_pickle("top_1000_cleaned_rag.pkl")
df.head()

### Create Positives Storage

In [5]:
# Unique positive attributes: the combined list, the component keywords, and the sentences.
all_positives, major_positives, sentence_positives = get_major_and_all_components("Positive", df)

In [None]:
# Set up the "positives" collection, then insert every positive attribute and
# record the UUID returned for it.
weaviate_client.create_attributes_collection(collection_name="positives")

positive_attributes_store = {
    attribute: weaviate_client.add_attributes_collection_item(attribute, collection_name="positives")
    for attribute in all_positives
}

In [None]:
# For each positive attribute, query its nearest stored objects and keep those with
# distance <= 0.40 whose name does not start with the attribute itself.
match_storage = {}
total_entries = len(all_positives)
entries_completed = 0

for entries_completed, major_component in enumerate(all_positives, start=1):
    neighbours = weaviate_client.find_near_objects(
        collection_name="positives",
        uuid=positive_attributes_store[major_component],
        limit=2000,
    )
    match_storage[major_component] = {
        neighbour.properties['attribute_name']: neighbour.metadata.distance
        for neighbour in neighbours
        if neighbour.metadata.distance <= .40
        and not neighbour.properties['attribute_name'].startswith(major_component)
    }
    # Report progress after every 100 attributes.
    if entries_completed % 100 == 0:
        print(f"Completed {entries_completed} of {total_entries}")
    

In [8]:
# Write the positive match table to disk as JSON.
with open('positive_matches.json', 'w') as matches_file:
    json.dump(match_storage, matches_file)

### Create Negatives Storage

In [9]:
# Unique negative attributes: the combined list, the component keywords, and the sentences.
all_negatives, major_negatives, sentence_negatives = get_major_and_all_components("Negative", df)

In [None]:
# Set up the "negatives" collection, then insert every negative attribute and
# record the UUID returned for it.
weaviate_client.create_attributes_collection(collection_name="negatives")

negative_attributes_store = {
    attribute: weaviate_client.add_attributes_collection_item(attribute, collection_name="negatives")
    for attribute in all_negatives
}

In [None]:
# For each negative attribute, query its nearest stored objects and keep those with
# distance <= 0.40 whose name does not start with the attribute itself.
match_storage = {}
total_entries = len(all_negatives)
entries_completed = 0

for entries_completed, major_component in enumerate(all_negatives, start=1):
    neighbours = weaviate_client.find_near_objects(
        collection_name="negatives",
        uuid=negative_attributes_store[major_component],
        limit=2000,
    )
    match_storage[major_component] = {
        neighbour.properties['attribute_name']: neighbour.metadata.distance
        for neighbour in neighbours
        if neighbour.metadata.distance <= .40
        and not neighbour.properties['attribute_name'].startswith(major_component)
    }
    # Report progress after every 100 attributes.
    if entries_completed % 100 == 0:
        print(f"Completed {entries_completed} of {total_entries}")
    

In [12]:
# Write the negative match table to disk as JSON.
with open('negative_matches.json', 'w') as matches_file:
    json.dump(match_storage, matches_file)

### Content Similarity

In [None]:
# load_file_local_first is not imported anywhere in the visible notebook (the
# top-of-file import is commented out), so import it here to avoid a NameError.
from utils.processing_functions import load_file_local_first

# Only BGGId and Name are needed from the attached-RAG frame.
top_1000 = load_file_local_first(
    path=GAME_CONFIGS["clean_dfs_directory"], file_name="top_1000_with_attached_rag.pkl"
)
top_1000 = top_1000[['BGGId','Name']]
df = load_file_local_first(
    path=GAME_CONFIGS["clean_dfs_directory"], file_name="top_1000_cleaned_rag.pkl"
)
# Left-merge so every row of the cleaned frame is kept and gains a Name column.
df = df.merge(top_1000, on='BGGId', how='left')
df.head()

In [None]:

# The wrapper already connects when it is constructed. A bare
# connect_weaviate_client_docker() call opens another connection and throws the
# handle away, so reconnect only when needed and keep the new client.
if not weaviate_client.weaviate_client.is_connected():
    weaviate_client.weaviate_client = weaviate_client.connect_weaviate_client_docker()
weaviate_client.create_similarity_collection()


def add_objects_to_db(row):
    """Pass one game row to add_similarity_collection_item and return its result."""
    return weaviate_client.add_similarity_collection_item(row)


# One insert call per row; the returned value is stored in the UUID column.
df['UUID'] = df.apply(add_objects_to_db, axis=1)

In [38]:
# Save the frame, now including the UUID column, to a local pickle.
df.to_pickle('temp_top_1000_with_uuid.pkl')

In [None]:
# get gloomhaven uuid
uuid = df.iloc[2]['UUID']
uuid

In [None]:
# Pass uuid by keyword: a positional argument after a keyword argument is a SyntaxError.
similars = weaviate_client.find_near_objects(collection_name="similarity_collection", uuid=uuid)

In [None]:
# Collect the UUIDs of the returned objects as strings, then show the matching rows.
picks = [str(neighbour.uuid) for neighbour in similars]

df[df['UUID'].isin(picks)]