In [None]:
import pandas as pd
import weaviate.classes as wvc
from weaviate.classes.config import Configure
from dotenv import load_dotenv
import warnings
import gc
import time
import json
import weaviate.classes as wvc
import boto3

from modules.rag_description_generation.rag_functions import connect_weaviate_client_docker, add_collection_batch, generate_aggregated_review, refine_df_for_specific_game, divide_and_process_generated_summary, remove_collection_items

# --- Notebook-wide configuration -------------------------------------------
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Read settings from the .env file one directory above the notebook.
load_dotenv("../.env")

# Cap DataFrame rendering at 30 columns and 30 rows.
pd.set_option("display.max_columns", 30)
pd.set_option("display.max_rows", 30)

# Model name handed to the generative configuration of the collection.
ai_generator = "gpt-4o-mini"
# Name of the collection that is (re)created and filled below.
collection_name = "Reviews"
# Forwarded as `sample_pct` to refine_df_for_specific_game.
sample_pct = 0.05

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Create the client shared by the cells below (project helper; presumably
# connects to a Docker-hosted Weaviate instance — confirm in rag_functions).
client = connect_weaviate_client_docker()

# Display the "modules" entry of the server metadata.
meta_info = client.get_meta()
meta_info["modules"]

In [None]:
# Start from a clean slate: drop any existing collection with this name.
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

# NOTE(review): this named vector is built from a "title" property, but only
# "review_text" and "product_id" are declared below — confirm this is intended.
title_vector_config = Configure.NamedVectors.text2vec_transformers(
    name="title_vector",
    source_properties=["title"],
)

review_text_property = wvc.config.Property(
    name="review_text",
    data_type=wvc.config.DataType.TEXT,
)

# product_id is stored but excluded from vectorization (value and name).
product_id_property = wvc.config.Property(
    name="product_id",
    data_type=wvc.config.DataType.TEXT,
    skip_vectorization=True,
    vectorize_property_name=False,
)

client.collections.create(
    name=collection_name,
    vectorizer_config=[title_vector_config],
    generative_config=wvc.config.Configure.Generative.openai(model=ai_generator),
    properties=[review_text_property, product_id_property],
)

In [None]:
game_df = pd.read_pickle("../data/prod/games/game_dfs_clean/games_clean.pkl")

# Summary statistics of AvgRating over every game in the frame.
rating_summary = game_df["AvgRating"].describe()
game_mean = rating_summary["mean"]
game_std = rating_summary["std"]

# Reference points around the overall mean, expressed in standard deviations.
# The thresholds are rounded to two decimals; mean and std are kept as-is.
overall_stats: dict = {
    "overall_mean": game_mean,
    "overall_std": game_std,
    "two_under": round(game_mean - 2 * game_std, 2),
    "one_under": round(game_mean - game_std, 2),
    "half_over": round(game_mean + 0.5 * game_std, 2),
    "one_over": round(game_mean + game_std, 2),
}

In [25]:
# Keep the 25 rows with the highest BayesAvgRating.
top_25_df = game_df.sort_values(by="BayesAvgRating", ascending=False).head(25)

# The ids of those games, converted to strings.
game_ids = top_25_df["BGGId"].astype(str).to_list()
top_25_df.head()

Unnamed: 0,BGGId,Name,Description,ImagePath,NumAlternates,NumExpansions,NumImplementations,IsReimplementation,Rank:boardgame,BestPlayers,GoodPlayers,YearPublished,MinPlayers,MaxPlayers,AvgRating,...,Rank:abstracts,Rank:partygames,Rank:wargames,Rank:thematic,Rank:familygames,Rank:childrensgames,Rank:cgs,Cat:Thematic,Cat:Strategy,Cat:War,Cat:Family,Cat:CGS,Cat:Abstract,Cat:Party,Cat:Childrens
10174,224517,Brass: Birmingham,Brass: Birmingham is an economic strategy game...,https://cf.geekdo-images.com/x3zxjr-Vw5iU4yDPg...,6,0,1,1,1.0,3,"['2', '3', '4']",2018,2,4,8.58922,...,28017,28017,28017,28017,28017,28017,28017,0,1,0,0,0,0,0,0
4846,161936,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,https://cf.geekdo-images.com/-Qer2BBPG7qGGDu6K...,11,0,2,1,2.0,4,"['2', '3', '4']",2015,2,4,8.52361,...,28017,28017,28017,1,28017,28017,28017,1,1,0,0,0,0,0,0
5989,174430,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,https://cf.geekdo-images.com/sZYp_3BTDGjh2unaZ...,6,17,1,0,3.0,3,"['1', '2', '3', '4']",2017,1,4,8.57783,...,28017,28017,28017,2,28017,28017,28017,1,1,0,0,0,0,0,0
19051,342942,Ark Nova,"In Ark Nova, you will plan and design a modern...",https://cf.geekdo-images.com/SoU8p28Sk1s8MSvoM...,8,4,0,0,4.0,2,"['1', '2', '3']",2021,1,4,8.53243,...,28017,28017,28017,28017,28017,28017,28017,0,1,0,0,0,0,0,0
20469,363622,The Castles of Burgundy: Special Edition,Castles of Burgundy is a legendary Board Game ...,https://cf.geekdo-images.com/JUrmY8GgFPQlENiPT...,2,0,0,0,,2,"['2', '3', '4']",2023,1,4,9.14907,...,28017,28017,28017,28017,28017,28017,28017,0,1,0,0,0,0,0,0


In [27]:
user_df = pd.read_pickle("../data/prod/users/user_dfs_clean/complete_user_ratings.pkl")

# The inner join keeps only user rows whose BGGId is among the top-25 games
# and attaches the selected game columns to each of them.
game_columns = ["BGGId", "Name", "Description", "AvgRating", "BayesAvgRating"]
all_games_df = user_df.merge(top_25_df[game_columns], on="BGGId", how="inner")
all_games_df["BGGId"] = all_games_df["BGGId"].astype("string")

# The source frames are not needed past this point; drop the references and
# trigger a garbage-collection pass.
del game_df, user_df
gc.collect()

CPU times: user 425 ms, sys: 920 ms, total: 1.35 s
Wall time: 1.56 s


(989112, 9)

### Produce synopses

In [None]:
# Empty container for generated summaries.
# NOTE(review): no cell visible in this notebook reads or fills it — confirm it is still needed.
overall_summary: dict = {}

In [None]:
# Load the prompt templates. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection, and
# the explicit encoding avoids depending on the platform default.
with open('prompt.json', encoding='utf-8') as prompt_file:
    all_prompts = json.load(prompt_file)

# Template used to generate the structured summary for each game.
generate_prompt = all_prompts['gpt4o_mini_generate_prompt_structured']

In [None]:
def prompt_replacement(generate_prompt, overall_stats, game_name, game_mean):
    """Fill the placeholders of a prompt template with concrete values.

    Args:
        generate_prompt: Template text that may contain the placeholders
            ``{GAME_NAME_HERE}``, ``{GAME_AVERAGE_HERE}``, ``{TWO_UNDER}``,
            ``{ONE_UNDER}``, ``{ONE_OVER}``, ``{HALF_OVER}`` and
            ``{OVERALL_MEAN}``.
        overall_stats: Mapping providing the keys ``two_under``,
            ``one_under``, ``one_over``, ``half_over`` and ``overall_mean``.
            It is not modified.
        game_name: Value substituted for ``{GAME_NAME_HERE}``.
        game_mean: Value substituted for ``{GAME_AVERAGE_HERE}``; may be a
            number or a string.

    Returns:
        The template with every placeholder occurrence replaced.

    Raises:
        KeyError: If ``overall_stats`` lacks one of the required keys.
    """
    # Substitution order matches the placeholder order listed below.
    replacements = {
        "{GAME_NAME_HERE}": game_name,
        "{GAME_AVERAGE_HERE}": game_mean,
        "{TWO_UNDER}": overall_stats['two_under'],
        "{ONE_UNDER}": overall_stats['one_under'],
        "{ONE_OVER}": overall_stats['one_over'],
        "{HALF_OVER}": overall_stats['half_over'],
        "{OVERALL_MEAN}": overall_stats['overall_mean'],
    }

    current_prompt = generate_prompt
    for placeholder, value in replacements.items():
        # str.replace only accepts strings; converting here lets callers pass
        # numeric values (e.g. a float average) without a TypeError.
        current_prompt = current_prompt.replace(placeholder, str(value))
    return current_prompt

In [None]:
# Generate and store a summary for one hard-coded game id, skipping it when
# check_dynamo_db_key reports that the id is already present.
# NOTE(review): check_dynamo_db_key and remove_collection_batch are neither
# defined nor imported in the cells shown here (the import cell brings in
# remove_collection_items instead) — confirm which names are intended.
for game_id in ['318009']:
    if not check_dynamo_db_key(game_id):
        df, game_name, avg_rating = refine_df_for_specific_game(all_games_df, game_id, sample_pct=sample_pct)
        game_id = df['BGGId'].iloc[0]
        reviews = df['combined_review'].to_list()
        add_collection_batch(client, collection_name, game_id, reviews)
        # Use this game's own average for the game-average placeholder; the
        # overall mean across all games is already supplied via overall_stats.
        # str() keeps the substitution valid if the helper returns a number.
        current_prompt = prompt_replacement(generate_prompt, overall_stats, game_name, str(avg_rating))
        summary = generate_aggregated_review(client, collection_name, game_id, current_prompt)
        divide_and_process_generated_summary(game_id, summary=summary.generated)
        print(f"\n\n{summary.generated}")
        # Remove this game's reviews from the collection once the summary is done.
        remove_collection_batch(client, collection_name, game_id, reviews)

In [None]:
# Generate and store summaries for the top-rated games (currently only the
# first id of the list), skipping ids that check_dynamo_db_key reports as
# already present.
# NOTE(review): check_dynamo_db_key and remove_collection_batch are neither
# defined nor imported in the cells shown here (the import cell brings in
# remove_collection_items instead) — confirm which names are intended.
# `game_ids` is the list of string ids built from top_25_df; the previous
# iterable `top_25` is not defined anywhere in the notebook.
for game_id in game_ids[:1]:
    if not check_dynamo_db_key(game_id):
        df, game_name, avg_rating = refine_df_for_specific_game(all_games_df, game_id, sample_pct=sample_pct)
        game_id = df['BGGId'].iloc[0]
        reviews = df['combined_review'].to_list()
        add_collection_batch(client, collection_name, game_id, reviews)
        # Use this game's own average for the game-average placeholder; the
        # overall mean across all games is already supplied via overall_stats.
        # str() keeps the substitution valid if the helper returns a number.
        current_prompt = prompt_replacement(generate_prompt, overall_stats, game_name, str(avg_rating))
        summary = generate_aggregated_review(client, collection_name, game_id, current_prompt)
        divide_and_process_generated_summary(game_id, summary=summary.generated)
        print(f"\n\n{summary.generated}")
        # Remove this game's reviews from the collection once the summary is done.
        remove_collection_batch(client, collection_name, game_id, reviews)

In [None]:
# Look up the stored item for `game_id` in DynamoDB and display it.
# NOTE(review): `game_id` is whatever value the last loop cell left behind.
table_name = 'game_generated_descriptions'
dynamodb_client = boto3.client('dynamodb')

# The key uses DynamoDB's typed attribute format: 'S' marks a string value.
item_key = {'game_id': {'S': game_id}}
# Indexing with ['Item'] raises KeyError when no matching item exists.
response = dynamodb_client.get_item(TableName=table_name, Key=item_key)['Item']
response
