In [1]:
import pandas as pd
from config import CONFIGS
import os
import re
import os
import json
from typing import Tuple
from utils.weaviate_client import WeaviateClient

ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]
IS_LOCAL = True if os.environ.get("IS_LOCAL", "False").lower() == "true" else False

weaviate_client = WeaviateClient()
weaviate_client.connect_weaviate_client_docker()

IS_LOCAL: True

Checking for local config file and evaluating for updates from S3.
Loading config from local


  warn(


<weaviate.client.WeaviateClient at 0x1416dac00>

In [2]:
def get_major_and_all_components(component_type:str, df:pd.DataFrame) -> Tuple[list, list]:

    # Extract unique components from components
    unique_positive_components = set()
    for components in df[f'{component_type}_Components']:
        unique_positive_components.update(components)
    major_components = [x for x in list(unique_positive_components) if x != ''] 
    # sort major components by number of words in the entry, highest to lowest
    major_components = sorted(major_components, key=lambda x: len(x.split()), reverse=True)

    # Extract unique elements from both Positive_Components and Positive_Sentences
    unique_sentence_components = set()  # Start with Positive_Components
    for sentences in df[f'{component_type}_Sentences']:
        unique_sentence_components.update(sentences)
    sentence_components = [x for x in list(unique_sentence_components) if x != ''] 
    sentence_components = sorted(sentence_components, key=lambda x: len(x.split()), reverse=True)

    # Extract unique elements from both Positive_Components and Positive_Sentences
    # unique_all_components = set(unique_positive_components)  # Start with Positive_Components
    # for sentences in df[f'{component_type}_Sentences']:
    #     unique_all_components.update(sentences)
    # all_components = [x for x in list(unique_all_components) if x != ''] 
    # all_components = sorted(all_components, key=lambda x: len(x.split()), reverse=True)
    all_components = major_components + sentence_components


    return all_components, major_components, sentence_components

def determine_major_component(row, major_components):
    for component in major_components:
        if row.startswith(component):
            return component
    return None

# Weaviate create attribute embeddings

In [None]:
df = pd.read_pickle("top_1000_cleaned_rag.pkl")

df.head()

### Create Positives Storage

In [4]:
all_positives, major_positives, sentence_positives = get_major_and_all_components("Positive", df)

In [None]:
len(all_positives), len(major_positives), len(sentence_positives)

In [None]:
all_positives[:5]

In [None]:
major_positives[50:55]

In [None]:
sentence_positives[:5]

In [None]:
collection_name = "positive_attributes"
weaviate_client.create_attributes_collection(collection_name=collection_name, reset=False)

positive_attributes_store = weaviate_client.add_attributes_collection_batch(attributes=all_positives, collection_name=collection_name)

In [10]:
with open("major_positives.json", 'w') as f:
    json.dump(sorted(major_positives), f)

In [None]:
match_storage = {}
total_entries = len(all_positives)
entries_completed = 0

for positive_component in all_positives:
    uuid = positive_attributes_store[positive_component]
    major_component = determine_major_component(positive_component, major_components=major_positives)
    pos_similars = weaviate_client.find_near_objects(collection_name=collection_name, uuid=uuid, limit=1000)
    matches_without_major_component = {x.properties['attribute_name']:x.metadata.distance for x in pos_similars if x.metadata.distance <= .50 and not x.properties['attribute_name'].startswith(major_component)}
    match_storage[positive_component] = matches_without_major_component
    # report back every 100 entries completed
    entries_completed += 1
    if entries_completed % 100 == 0:
        print(f"Completed {entries_completed} of {total_entries}")
    

In [None]:
pos_similars

In [12]:
with open('positive_matches.json', 'w') as f:
    json.dump(match_storage, f)

### Create Negatives Storage

In [13]:
all_negatives, major_negatives, sentence_negatives = get_major_and_all_components("Negative", df)

In [None]:
major_negatives[:50]

In [None]:
collection_name = "negatives_attributes"

weaviate_client.create_attributes_collection(collection_name=collection_name, reset=False)

negative_attributes_store = weaviate_client.add_attributes_collection_batch(all_negatives, collection_name=collection_name)

In [None]:
sentence_negatives[:5]

In [None]:
match_storage = {}
total_entries = len(all_negatives)
entries_completed = 0

for negative_component in all_negatives:
    uuid = negative_attributes_store[negative_component]
    major_component = determine_major_component(negative_component, major_components=major_negatives)
    pos_similars = weaviate_client.find_near_objects(collection_name=collection_name, uuid=uuid, limit=1000)
    matches_without_major_component = {x.properties['attribute_name']:x.metadata.distance for x in pos_similars if x.metadata.distance <= .50 and not x.properties['attribute_name'].startswith(major_component)}
    match_storage[negative_component] = matches_without_major_component
    entries_completed += 1
    if entries_completed % 100 == 0:
        print(f"Completed {entries_completed} of {total_entries}")
    

In [20]:
with open('negative_matches.json', 'w') as f:
    json.dump(match_storage, f)

In [None]:
break

### Clean up positives and negatives into vectors

In [3]:
df = pd.read_pickle("top_1000_cleaned_rag.pkl")
df.head()

Unnamed: 0,BGGId,Description,About,Positive_Components,Positive_Sentences,Negative_Components,Negative_Sentences
0,224517,Brass: Birmingham is an economic strategy game...,This game is a strategic economic simulation s...,"[strategic depth, excellent design, player int...",[strategic depth games complexity multitude de...,"[steep learning curve, fiddly mechanics, varia...",[steep learning curve players find rules diffi...
1,161936,Pandemic Legacy is a co-operative campaign gam...,This game is a cooperative board game where pl...,"[engaging storyline, cooperative, replayabilit...",[engaging storyline evolving narrative keeps p...,"[complex, time commitment, group dependency, f...",[complex players find numerous choices rules o...
2,174430,Gloomhaven is a game of Euro-inspired tactica...,This game is a cooperative tactical adventure ...,"[content variety, replayability, strategic dep...",[content variety game offers immense amount co...,"[setup, complex, pacing issues, excellent comm...",[setup many reviews mention game requires sign...
3,342942,"In Ark Nova, you will plan and design a modern...",This game is a strategic board game where play...,"[engaging theme, creative mechanics, excellent...",[engaging theme zoo management theme appealing...,"[lengthy gameplay, luck factor, complex, analy...",[lengthy gameplay many reviews highlight game ...
4,363622,Castles of Burgundy is a legendary Board Game ...,This game is a strategic board game that revol...,"[excellent production quality, engaging mechan...",[excellent production quality game boasts exce...,"[complex for new players, setup, potential for...",[complex for new players reviews mention game ...


In [4]:
len(df)

1000

In [5]:
all_positives, major_positives, sentence_positives = get_major_and_all_components("Positive", df)
all_negatives, major_negatives, sentence_negatives = get_major_and_all_components("Negative", df)

len(all_positives), len(major_positives), len(sentence_positives), len(all_negatives), len(major_negatives), len(sentence_negatives)

(6428, 716, 5712, 6619, 1336, 5283)

In [6]:
keys_storage = {"positive":[major_positives, sentence_positives], "negative":[major_negatives, sentence_negatives]}

In [7]:
attributes_store = {"positive":json.loads(open("positive_matches.json").read()),
                    "negative":json.loads(open("negative_matches.json").read())}

  attributes_store = {"positive":json.loads(open("positive_matches.json").read()),
  "negative":json.loads(open("negative_matches.json").read())}


In [8]:
df_as_dict = df.to_dict(orient='records') # convert dataframe to list of dictionaries

total_entries = len(df_as_dict)
completed_entries = 0

for game_entry in df_as_dict:
    print(f"Preparing BGG ID: {game_entry['BGGId']}")

    # attach a positive or negative sentiment to each major key phrase
    keyword_mood_bar = {x:"positive" for x in game_entry['Positive_Components']} # start with positive components
    keyword_mood_bar.update({x:"negative" for x in game_entry['Negative_Components']}) # add negative components

    for component in game_entry['Positive_Components'] + game_entry['Negative_Components']:
        game_entry[f"{keyword_mood_bar[component]} {component}"] = 1 # create positive_ and negative_ headers for all major key phrase and set = 1 as the default "self" component rating
    
    # attach a positive or negative sentiment to each sentence
    sentence_mood_bar = {x:"positive" for x in game_entry['Positive_Sentences']}
    sentence_mood_bar.update({x:"negative" for x in game_entry['Negative_Sentences']})
    
    # process the major key phrases first
    for review_key, sentiment in keyword_mood_bar.items():
        # print(f"Processing major key {sentiment} {review_key}")

        matches_dictionary = attributes_store[sentiment][review_key] # get the matches dictionary for this item
        matches_to_iterate = list(matches_dictionary.keys()) # get the keywords to iterate over from the matches dictionary
        
        keyword_values = {}

        while(len(matches_to_iterate)):
            key_or_phrase = matches_to_iterate.pop(0) # pop a match from the list to check
            match_major_key = [x for x in keys_storage[sentiment][0] if key_or_phrase.startswith(x)][0] # get the match's major key

            ratings_for_this_item_major_key = round(1 - matches_dictionary[key_or_phrase], 3) # invert the match value to get the rating for this item

            if f"{sentiment} {match_major_key}" not in game_entry: # check if the major key is already in the game entry
                game_entry[f"{sentiment} {match_major_key}"] = ratings_for_this_item_major_key # if not, add it to the game entry
                # print(f"Added {sentiment} {match_major_key} {ratings_for_this_item_major_key} to game entry")
            else: # if the major key is already in the game entry, check if the new rating is higher than the existing rating
                old_entry = game_entry[f"{sentiment} {match_major_key}"] # get the existing rating
                if ratings_for_this_item_major_key > old_entry:
                    game_entry[f"{sentiment} {match_major_key}"] = ratings_for_this_item_major_key # if the new rating is higher, update the game entry
                    # print(f"Updated {sentiment} {match_major_key} in game entry from {old_entry} to {ratings_for_this_item_major_key}")

            for item in matches_to_iterate:
                if item.startswith(match_major_key):
                    matches_to_iterate.remove(item) # remove all items that start with the major key from the list of items to iterate over
    
    for review_sentence, sentiment in sentence_mood_bar.items():

        # print(f"\nProcessing sentence {sentiment} {review_sentence}")

        matches_dictionary = attributes_store[sentiment][review_sentence] # get the matches dictionary for this item
        matches_to_iterate = list(matches_dictionary.keys()) # get the keywords to iterate over from the matches dictionary

        current_major_key = [x for x in keys_storage[sentiment][0] if review_sentence.startswith(x)][0] # get the match's major key

        sentence_matches_excluding_current_major_key = {x:y for x,y in matches_dictionary.items() if not x.startswith(current_major_key)} # remove the current major key from the matches dictionary, because we already matched the major key

        while len(sentence_matches_excluding_current_major_key):

            top_match = list(sentence_matches_excluding_current_major_key.keys())[0] # get the first item in the matches dictionary
            match_value = sentence_matches_excluding_current_major_key[top_match] # get the value of the first item in the matches dictionary

            match_major_key = [x for x in keys_storage[sentiment][0] if top_match.startswith(x)][0] # get the match's major key so we can apply the match_value to this major key

            ratings_for_this_item_major_key = round(1 - match_value, 3) # invert the match value to get the rating for this item

            if f"{sentiment} {match_major_key}" not in game_entry: # check if the major key is already in the game entry
                game_entry[f"{sentiment} {match_major_key}"] = ratings_for_this_item_major_key # if not, add it to the game entry
                # print(f"Added {sentiment} {match_major_key} {ratings_for_this_item_major_key} to game entry")
            else: # if the major key is already in the game entry, check if the new rating is higher than the existing rating
                old_entry = game_entry[f"{sentiment} {match_major_key}"] # get the existing rating
                game_entry[f"{sentiment} {match_major_key}"] = max(old_entry, ratings_for_this_item_major_key) # if the new rating is higher, update the game entry
                # print(f"Updated {sentiment} {match_major_key} in game entry from {old_entry} to {ratings_for_this_item_major_key}")
            
            all_entries_starting_with_key = [x for x in sentence_matches_excluding_current_major_key if x.startswith(match_major_key)] # get all items in the matches dictionary that start with the match's major key
            # delete all things in sentence_matches_excluding_current_major_key that start with this_item_major_key
            for item in all_entries_starting_with_key:
                del sentence_matches_excluding_current_major_key[item]
    completed_entries += 1
    
    print(f"Completed {completed_entries} of {total_entries}")
    print("\n")



Preparing BGG ID: 224517
Completed 1 of 1000


Preparing BGG ID: 161936
Completed 2 of 1000


Preparing BGG ID: 174430
Completed 3 of 1000


Preparing BGG ID: 342942
Completed 4 of 1000


Preparing BGG ID: 363622
Completed 5 of 1000


Preparing BGG ID: 233078
Completed 6 of 1000


Preparing BGG ID: 316554
Completed 7 of 1000


Preparing BGG ID: 167791
Completed 8 of 1000


Preparing BGG ID: 115746
Completed 9 of 1000


Preparing BGG ID: 187645
Completed 10 of 1000


Preparing BGG ID: 291457
Completed 11 of 1000


Preparing BGG ID: 162886
Completed 12 of 1000


Preparing BGG ID: 220308
Completed 13 of 1000


Preparing BGG ID: 12333
Completed 14 of 1000


Preparing BGG ID: 182028
Completed 15 of 1000


Preparing BGG ID: 84876
Completed 16 of 1000


Preparing BGG ID: 193738
Completed 17 of 1000


Preparing BGG ID: 169786
Completed 18 of 1000


Preparing BGG ID: 246900
Completed 19 of 1000


Preparing BGG ID: 173346
Completed 20 of 1000


Preparing BGG ID: 28720
Completed 21 of 1000


Prep

In [9]:
compare_df = pd.DataFrame(df_as_dict).fillna(0)
compare_df.head()

Unnamed: 0,BGGId,Description,About,Positive_Components,Positive_Sentences,Negative_Components,Negative_Sentences,positive strategic depth,positive excellent design,positive player interaction,...,negative anticlimactic endings,positive excellent production values,positive attractive artwork,positive accessible non gamers,positive variable experiences,negative catch up mechanics,positive simple to learning,negative action limitations,positive good themes,positive good interactions
0,224517,Brass: Birmingham is an economic strategy game...,This game is a strategic economic simulation s...,"[strategic depth, excellent design, player int...",[strategic depth games complexity multitude de...,"[steep learning curve, fiddly mechanics, varia...",[steep learning curve players find rules diffi...,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,161936,Pandemic Legacy is a co-operative campaign gam...,This game is a cooperative board game where pl...,"[engaging storyline, cooperative, replayabilit...",[engaging storyline evolving narrative keeps p...,"[complex, time commitment, group dependency, f...",[complex players find numerous choices rules o...,0.834,0.739,0.848,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,174430,Gloomhaven is a game of Euro-inspired tactica...,This game is a cooperative tactical adventure ...,"[content variety, replayability, strategic dep...",[content variety game offers immense amount co...,"[setup, complex, pacing issues, excellent comm...",[setup many reviews mention game requires sign...,1.0,0.853,0.785,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,342942,"In Ark Nova, you will plan and design a modern...",This game is a strategic board game where play...,"[engaging theme, creative mechanics, excellent...",[engaging theme zoo management theme appealing...,"[lengthy gameplay, luck factor, complex, analy...",[lengthy gameplay many reviews highlight game ...,1.0,0.774,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,363622,Castles of Burgundy is a legendary Board Game ...,This game is a strategic board game that revol...,"[excellent production quality, engaging mechan...",[excellent production quality game boasts exce...,"[complex for new players, setup, potential for...",[complex for new players reviews mention game ...,0.855,0.784,0.862,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
compare_df.to_pickle("top_1000_cleaned_rag_with_ratings_extrap.pkl")

In [None]:
break