In [None]:
import pandas as pd
from config import CONFIGS
import os
import re
import os
import json
from typing import Tuple

# from utils.processing_functions import load_file_local_first, save_file_local_first
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer


# Runtime settings: environment variables first, then values from the shared CONFIGS mapping.
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
IS_LOCAL = os.getenv("IS_LOCAL", "False").lower() == "true"  # only a case-insensitive "true" enables it

S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]

In [2]:
def get_major_and_all_components(component_type:str, df:pd.DataFrame) -> Tuple[list, list, list]:
    """Collect the unique component and sentence strings for one sentiment.

    Reads the columns ``f"{component_type}_Components"`` and
    ``f"{component_type}_Sentences"`` of ``df``; every cell is iterated and
    its elements are added to a set, so each cell must be an iterable of
    strings (e.g. a list).

    Args:
        component_type: Column prefix, e.g. ``"Positive"`` or ``"Negative"``.
        df: Frame holding the two columns named above.

    Returns:
        A 3-tuple ``(all_components, major_components, sentence_components)``:
        the union of both columns, the unique components, and the unique
        sentences. Empty strings are dropped. Each list is ordered by word
        count, longest first; ties are broken alphabetically so the order is
        identical on every run (set iteration order alone is not stable
        across interpreter sessions).

    Raises:
        KeyError: If either expected column is missing from ``df``.
    """

    def _longest_first(values) -> list:
        # Negated word count gives "most words first"; the string itself is
        # the deterministic tie-breaker.
        return sorted((v for v in values if v != ''), key=lambda v: (-len(v.split()), v))

    # Unique entries of the <component_type>_Components column
    unique_components = set()
    for components in df[f'{component_type}_Components']:
        unique_components.update(components)

    # Unique entries of the <component_type>_Sentences column
    unique_sentences = set()
    for sentences in df[f'{component_type}_Sentences']:
        unique_sentences.update(sentences)

    major_components = _longest_first(unique_components)
    sentence_components = _longest_first(unique_sentences)
    # The combined list is just the union of the two sets built above
    all_components = _longest_first(unique_components | unique_sentences)

    return all_components, major_components, sentence_components

# Create attribute embeddings in Weaviate

In [None]:
import weaviate
import weaviate.classes as wvc
from pydantic import BaseModel, ConfigDict
from weaviate.classes.config import Configure
from weaviate.classes.query import Filter
from weaviate.classes.query import MetadataQuery
from weaviate.util import generate_uuid5

class WeaviateClient(BaseModel):
    """Wrapper around a locally reachable Weaviate instance.

    The connection is opened as soon as the model is constructed (see
    ``model_post_init``). The helpers create collections, insert objects
    under content-derived UUIDs, and run near-object similarity queries.
    Call ``close_client`` when finished to release the connection.
    """

    model_config = ConfigDict(arbitrary_types_allowed=True)
    # Both fields start as None: ``weaviate_client`` is filled in by
    # model_post_init, ``collection`` by whichever method last fetched one.
    # NOTE(review): ``weaviate.client`` looks like a module rather than a class;
    # the annotation is kept as-is to leave model construction unchanged — confirm.
    weaviate_client: weaviate.client = None
    collection: weaviate.collections.Collection = None

    def model_post_init(self, __context):
        """Connect to Weaviate immediately after the model is built."""
        self.weaviate_client = self.connect_weaviate_client_docker()

    def connect_weaviate_client_docker(self) -> weaviate.client:
        """Open a connection to the Weaviate instance on port 8081.

        Both branches use ``weaviate.connect_to_local``; when ``IS_LOCAL`` is
        false the host and gRPC port are passed explicitly.

        Raises:
            KeyError: If the ``OPENAI_API_KEY`` environment variable is unset.
        """
        headers = {
            "X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"],
        }

        if not IS_LOCAL:
            return weaviate.connect_to_local(
                host="127.0.0.1",
                port=8081,
                grpc_port=50051,
                headers=headers,
            )

        return weaviate.connect_to_local(
            port=8081,
            headers=headers,
        )

    def find_near_objects(self, collection_name, uuid, limit:int =20):
        """Return up to ``limit`` objects nearest to the object ``uuid``.

        Each returned object carries its distance in ``metadata.distance``.
        """
        self.collection = self.weaviate_client.collections.get(collection_name)
        response = self.collection.query.near_object(
            near_object=uuid,
            limit=limit,
            return_metadata=MetadataQuery(distance=True),
        )
        return response.objects

    def close_client(self):
        """Close the underlying Weaviate connection."""
        self.weaviate_client.close()

    def add_similarity_collection_item(self, item:pd.Series, collection_name: str = "similarity_collection") -> str:
        """Insert one game into ``collection_name`` unless it is already there.

        Args:
            item: Row providing ``BGGId``, ``Name``, ``About``, ``Positive``
                and ``Negative``.
            collection_name: Target collection.

        Returns:
            The UUID derived from the object's properties, whether the object
            was inserted now or already existed (same contract as
            ``add_attributes_collection_item``).
        """
        self.collection = self.weaviate_client.collections.get(collection_name)

        # Single quotes inside the f-string keep this valid before Python 3.12.
        print(f"Adding data for game {item['BGGId']}")

        game_object = {
            "bggid": str(item["BGGId"]),
            "name": str(item["Name"]).lower(),
            "about": str(item["About"]).lower(),
            "positive": str(item["Positive"]).lower(),
            "negative": str(item["Negative"]).lower(),
        }

        # The UUID is generated from the properties, so identical content maps
        # to the same UUID and is not inserted twice.
        uuid = generate_uuid5(game_object)

        if not self.collection.data.exists(uuid):
            self.collection.data.insert(properties=game_object, uuid=uuid)

        return uuid

    def create_similarity_collection(self, collection_name: str = "similarity_collection") -> None:
        """Create the game-similarity collection, replacing any existing one.

        An existing collection with this name is deleted first (its data is
        lost) and then created again, so the collection exists when this
        method returns.
        """
        if self.weaviate_client.collections.exists(collection_name):
            print("Collection already exists for this block")
            self.weaviate_client.collections.delete(collection_name)
            print("Deleted and recreating collection")

        self.weaviate_client.collections.create(
            name=collection_name,
            # NOTE(review): the named vector reads from a "title" property, but
            # no property with that name is declared below — confirm this is intended.
            vectorizer_config=[
                Configure.NamedVectors.text2vec_transformers(
                    name="title_vector",
                    source_properties=["title"],
                )
            ],
            properties=[
                wvc.config.Property(
                    name="bggid",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
                wvc.config.Property(
                    name="name",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
                wvc.config.Property(name="about", data_type=wvc.config.DataType.TEXT),
                wvc.config.Property(
                    name="positive", data_type=wvc.config.DataType.TEXT
                ),
                wvc.config.Property(
                    name="negative", data_type=wvc.config.DataType.TEXT
                ),
            ],
        )

    def add_attributes_collection_item(self, attribute:str, collection_name: str = "game_attributes") -> str:
        """Insert one attribute string into ``collection_name`` if missing.

        The string is stored under both ``attribute_name`` and ``attribute``.

        Returns:
            The UUID derived from the object's properties, whether the object
            was inserted now or already existed.
        """
        self.collection = self.weaviate_client.collections.get(collection_name)

        print(f"Adding data for attribute {attribute}")

        attribute_object = {
            "attribute_name": attribute,
            "attribute": attribute,
        }

        uuid = generate_uuid5(attribute_object)

        if not self.collection.data.exists(uuid):
            self.collection.data.insert(properties=attribute_object, uuid=uuid)

        return uuid

    def create_attributes_collection(self, collection_name: str = "game_attributes") -> None:
        """Create an attribute collection, replacing any existing one.

        An existing collection with this name is deleted first (its data is
        lost) and then created again, so the collection exists when this
        method returns.
        """
        if self.weaviate_client.collections.exists(collection_name):
            print("Collection already exists for this block")
            self.weaviate_client.collections.delete(collection_name)
            print("Deleted and recreating collection")

        self.weaviate_client.collections.create(
            name=collection_name,
            # NOTE(review): the named vector reads from a "title" property, but
            # no property with that name is declared below — confirm this is intended.
            vectorizer_config=[
                Configure.NamedVectors.text2vec_transformers(
                    name="title_vector",
                    source_properties=["title"],
                )
            ],
            properties=[
                wvc.config.Property(
                    name="attribute_name",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
                wvc.config.Property(
                    name="attribute", data_type=wvc.config.DataType.TEXT, vectorize_property_name=False
                ),
            ],
        )


# Build the shared client; WeaviateClient.model_post_init opens the connection right away.
weaviate_client = WeaviateClient()

In [None]:
# Load the game DataFrame from a pickle in the working directory.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by a trusted pipeline.
df = pd.read_pickle("top_1000_cleaned_rag.pkl")
df.head()

### Create Positives Storage

In [5]:
# Return order is (all, major components, sentences) — see get_major_and_all_components.
all_positives, major_positives, sentence_positives = get_major_and_all_components("Positive", df)

In [None]:
# Prepare the "positives" collection, then register every positive attribute in it.
weaviate_client.create_attributes_collection(collection_name="positives")

# attribute text -> UUID of the stored object (used for the near-object lookups)
positive_attributes_store = dict()

for item in all_positives:
    uuid = weaviate_client.add_attributes_collection_item(
        item,
        collection_name="positives",
    )
    positive_attributes_store.update({item: uuid})

In [None]:
# For each positive attribute, keep the neighbours within distance 0.50 whose
# text does not start with the attribute itself: {attribute: {neighbour: distance}}.
match_storage = dict()
total_entries = len(all_positives)
entries_completed = 0

for major_component in all_positives:
    uuid = positive_attributes_store[major_component]
    pos_similars = weaviate_client.find_near_objects(
        collection_name="positives",
        uuid=uuid,
        limit=2000,
    )

    matches_without_major_component = {}
    for neighbour in pos_similars:
        # Too far away to count as a match
        if not neighbour.metadata.distance <= .50:
            continue
        # Skip the attribute itself and anything that merely extends it
        if neighbour.properties['attribute_name'].startswith(major_component):
            continue
        matches_without_major_component[neighbour.properties['attribute_name']] = neighbour.metadata.distance
    match_storage[major_component] = matches_without_major_component

    # Progress report after every 100th attribute
    entries_completed = entries_completed + 1
    if not entries_completed % 100:
        print(f"Completed {entries_completed} of {total_entries}")
    

In [8]:
# Persist the positive neighbour map so later cells can reload it without querying again.
with open('positive_matches.json', mode='w') as f:
    f.write(json.dumps(match_storage))

### Create Negatives Storage

In [9]:
# Return order is (all, major components, sentences) — see get_major_and_all_components.
all_negatives, major_negatives, sentence_negatives = get_major_and_all_components("Negative", df)

In [None]:
# Prepare the "negatives" collection, then register every negative attribute in it.
weaviate_client.create_attributes_collection(collection_name="negatives")

# attribute text -> UUID of the stored object (used for the near-object lookups)
negative_attributes_store = dict()

for item in all_negatives:
    uuid = weaviate_client.add_attributes_collection_item(
        item,
        collection_name="negatives",
    )
    negative_attributes_store.update({item: uuid})

In [None]:
# For each negative attribute, keep the neighbours within distance 0.50 whose
# text does not start with the attribute itself: {attribute: {neighbour: distance}}.
# `match_storage` is rebuilt from scratch here, replacing any earlier contents.
match_storage = dict()
total_entries = len(all_negatives)
entries_completed = 0

for major_component in all_negatives:
    uuid = negative_attributes_store[major_component]
    # Name kept from the positives cell; these are neighbours in the "negatives" collection.
    pos_similars = weaviate_client.find_near_objects(
        collection_name="negatives",
        uuid=uuid,
        limit=2000,
    )

    matches_without_major_component = {}
    for neighbour in pos_similars:
        # Too far away to count as a match
        if not neighbour.metadata.distance <= .50:
            continue
        # Skip the attribute itself and anything that merely extends it
        if neighbour.properties['attribute_name'].startswith(major_component):
            continue
        matches_without_major_component[neighbour.properties['attribute_name']] = neighbour.metadata.distance
    match_storage[major_component] = matches_without_major_component

    # Progress report after every 100th attribute
    entries_completed = entries_completed + 1
    if not entries_completed % 100:
        print(f"Completed {entries_completed} of {total_entries}")
    

In [12]:
# Persist the negative neighbour map so later cells can reload it without querying again.
with open('negative_matches.json', mode='w') as f:
    f.write(json.dumps(match_storage))

### Clean up positives and negatives into vectors

In [None]:
# Reload the same pickle read earlier in the notebook, rebinding `df`.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by a trusted pipeline.
df = pd.read_pickle("top_1000_cleaned_rag.pkl")
df.head()

In [None]:
# Number of rows in the loaded frame
len(df)

In [None]:
# Rebuild the (all, major, sentence) lists for both moods from the current `df`.
(all_positives, major_positives, sentence_positives) = get_major_and_all_components("Positive", df)
(all_negatives, major_negatives, sentence_negatives) = get_major_and_all_components("Negative", df)

# Sizes of the six lists, in the order they were unpacked above
tuple(map(len, (
    all_positives, major_positives, sentence_positives,
    all_negatives, major_negatives, sentence_negatives,
)))

In [16]:
# Per mood: [all entries, major components, sentences] — index 1 is the major-component list.
keys_storage = dict(
    positive=[all_positives, major_positives, sentence_positives],
    negative=[all_negatives, major_negatives, sentence_negatives],
)

In [None]:
# Reload the neighbour maps written by the earlier cells: mood -> {attribute: {neighbour: distance}}.
# Context managers close each file promptly instead of leaving the handle to the garbage collector.
attributes_store = {}
for mood, matches_path in (("positive", "positive_matches.json"), ("negative", "negative_matches.json")):
    with open(matches_path) as matches_file:
        attributes_store[mood] = json.load(matches_file)

In [None]:
# One plain dict per game, so "<mood> <attribute>" rating entries can be added freely.
df_as_dict = df.to_dict(orient='records')

for game_entry in df_as_dict:
    print(f"Preparing BGG ID: {game_entry['BGGId']}")

    # Map each component of this game to the mood it was listed under.
    # NOTE(review): a component present in both lists ends up "negative" only,
    # because the update below overwrites the positive entry — confirm intended.
    keyword_mood_bar = {x:"positive" for x in game_entry['Positive_Components']}
    keyword_mood_bar.update({x:"negative" for x in game_entry['Negative_Components']})

    # Components listed for the game itself get the top rating of 1.
    for component in game_entry['Positive_Components'] + game_entry['Negative_Components']:
        game_entry[f"{keyword_mood_bar[component]} {component}"] = 1

    sentence_mood_bar = {x:"positive" for x in game_entry['Positive_Sentences']}
    sentence_mood_bar.update({x:"negative" for x in game_entry['Negative_Sentences']})

    # --- Pass 1: extrapolate ratings from the neighbours of each component ---
    for review_key, mood in keyword_mood_bar.items():

        attribute_store = attributes_store[mood]
        key_store = keys_storage[mood]
        this_items_matches = attribute_store[review_key]

        keywords_to_iterate = list(this_items_matches.keys())

        while keywords_to_iterate:
            key_or_phrase = keywords_to_iterate.pop(0)
            # key_store[1] is the major-component list; take the first entry
            # that prefixes this neighbour (IndexError if none does).
            this_item_major_key = [x for x in key_store[1] if key_or_phrase.startswith(x)][0]
            rating_column = f"{mood} {this_item_major_key}"
            # Stored values are distances, so 1 - distance is used as the rating.
            # NOTE(review): first value wins here, whereas the sentence pass
            # below keeps the maximum — confirm the difference is intended.
            if rating_column not in game_entry:
                game_entry[rating_column] = 1 - this_items_matches[key_or_phrase]

            # Drop every remaining neighbour under the same major key. A new
            # list is built because removing from a list while iterating over
            # it skips the element after each removal.
            keywords_to_iterate = [
                item for item in keywords_to_iterate
                if not item.startswith(this_item_major_key)
            ]

    # --- Pass 2: extrapolate ratings from the neighbours of each sentence ---
    for review_sentence, mood in sentence_mood_bar.items():
        attribute_store = attributes_store[mood]
        key_store = keys_storage[mood]

        # Major component this sentence itself belongs to (IndexError if none matches)
        current_major_key = [x for x in key_store[1] if review_sentence.startswith(x)][0]

        this_item_sentence_matches = attribute_store[review_sentence]
        # Neighbours under the sentence's own major key are ignored.
        sentence_matches_excluding_current_major_key = {x:y for x,y in this_item_sentence_matches.items() if not x.startswith(current_major_key)}

        while sentence_matches_excluding_current_major_key:
            # First remaining neighbour, in the dict's insertion order
            key_or_phrase = next(iter(sentence_matches_excluding_current_major_key))

            this_item_major_key = [x for x in key_store[1] if key_or_phrase.startswith(x)][0]

            all_entries_starting_with_key = [x for x in sentence_matches_excluding_current_major_key if x.startswith(this_item_major_key)]

            # Rating comes from the first neighbour found under this major key.
            ratings_for_this_item_major_key = 1 - sentence_matches_excluding_current_major_key[all_entries_starting_with_key[0]]

            rating_column = f"{mood} {this_item_major_key}"
            if rating_column not in game_entry:
                game_entry[rating_column] = ratings_for_this_item_major_key
            else:
                # Keep the strongest rating seen so far for this attribute
                game_entry[rating_column] = max(game_entry[rating_column], ratings_for_this_item_major_key)

            # This major key is handled: remove all of its neighbours so the loop terminates.
            for item in all_entries_starting_with_key:
                del sentence_matches_excluding_current_major_key[item]
            
            # print("\n")

In [None]:
# Back to a DataFrame; missing values (e.g. rating entries a game never received) are filled with 0.
compare_df = pd.DataFrame(df_as_dict).fillna(0)
compare_df.head()

In [20]:
# Save the frame with the added rating columns to a pickle in the working directory.
compare_df.to_pickle("top_1000_cleaned_rag_with_ratings_extrap.pkl")

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError in Python; presumably a
# deliberate stopper so "Run All" halts here — confirm, or delete this cell.
break