In [None]:
import pandas as pd
import weaviate.classes as wvc
from weaviate.classes.config import Configure
from dotenv import load_dotenv
import warnings
import gc
import time
import json
import weaviate.classes as wvc
import boto3

from modules.rag_description_generation.rag_functions import connect_weaviate_client_docker, add_collection_batch, generate_aggregated_review, get_single_game_row, divide_and_process_generated_summary, remove_collection_items

# Run parameters read by the cells below.
ai_generator = "gpt-4o-mini"   # model name given to the collection's generative config
collection_name = "Reviews"    # name of the Weaviate collection that is created and queried
sample_pct = 0.05              # forwarded to get_single_game_row as sample_pct

# Silence all warnings and load environment variables from the .env file one level up.
warnings.filterwarnings('ignore')
load_dotenv("../.env")

# Cap pandas display output at 30 rows and 30 columns.
pd.set_option("display.max_rows", 30)
pd.set_option("display.max_columns", 30)

In [None]:
# Open the Weaviate client through the project's docker connection helper.
client = connect_weaviate_client_docker()

# Fetch the server metadata; its 'modules' entry is shown as the cell output.
meta_info = client.get_meta()
meta_info["modules"]

In [None]:
# Start from a clean slate: drop the collection if it already exists, then recreate it.
if client.collections.exists(collection_name):
    client.collections.delete(collection_name)

# Named vector built by the text2vec-transformers vectorizer.
# NOTE(review): source_properties points at "title", but the only properties declared
# below are "review_text" and "product_id" — confirm this is the intended source.
title_vector_config = Configure.NamedVectors.text2vec_transformers(
    name="title_vector",
    source_properties=["title"],
)

# Review body, stored as text.
review_text_property = wvc.config.Property(
    name="review_text",
    data_type=wvc.config.DataType.TEXT,
)

# Product identifier, stored as text and excluded from vectorization.
product_id_property = wvc.config.Property(
    name="product_id",
    data_type=wvc.config.DataType.TEXT,
    skip_vectorization=True,
    vectorize_property_name=False,
)

# The create() call stays last so its return value remains the cell output.
client.collections.create(
    name=collection_name,
    vectorizer_config=[title_vector_config],
    generative_config=wvc.config.Configure.Generative.openai(model=ai_generator),
    properties=[review_text_property, product_id_property],
)

In [None]:
# Load the cleaned games table.
# NOTE(review): unpickling can execute code embedded in the file — only load trusted pickles.
game_df = pd.read_pickle("../data/prod/games/game_dfs_clean/games_clean.pkl")

# Summarise AvgRating once and read the mean and standard deviation from that summary.
avg_rating_summary = game_df["AvgRating"].describe()
game_mean = avg_rating_summary["mean"]
game_std = avg_rating_summary["std"]

# Rating thresholds at fixed multiples of the standard deviation around the mean,
# rounded to two decimals; the raw mean and std are stored unrounded.
overall_stats: dict = {
    "overall_mean": game_mean,
    "overall_std": game_std,
    "two_under": round(game_mean - 2 * game_std, 2),
    "one_under": round(game_mean - game_std, 2),
    "half_over": round(game_mean + 0.5 * game_std, 2),
    "one_over": round(game_mean + game_std, 2),
}

In [None]:
# Keep the 25 rows with the highest BayesAvgRating.
top_25_df = game_df.sort_values("BayesAvgRating", ascending=False).head(25)

# BGGId values as strings, in the same ranking order.
game_ids = top_25_df["BGGId"].astype(str).tolist()

# Preview the first rows as the cell output.
top_25_df.head()

In [None]:
# Load the complete user ratings table.
# NOTE(review): unpickling can execute code embedded in the file — only load trusted pickles.
user_df = pd.read_pickle("../data/prod/users/user_dfs_clean/complete_user_ratings.pkl")

# Attach game details to each rating row; the inner join on BGGId keeps only rows
# whose game is present in top_25_df.
game_detail_columns = ['BGGId', 'Name', 'Description', 'AvgRating', 'BayesAvgRating']
all_games_df = user_df.merge(top_25_df[game_detail_columns], on="BGGId", how="inner")

# Store BGGId with the pandas "string" dtype.
all_games_df["BGGId"] = all_games_df["BGGId"].astype("string")

# Drop the two source frames and run a garbage collection pass; gc.collect() stays
# last so its return value remains the cell output.
del game_df, user_df
gc.collect()

### Produce synopses

In [None]:
# Empty dict created ahead of the generation cells.
# NOTE(review): no other visible cell reads or writes this name — confirm it is still needed.
overall_summary: dict = {}

In [None]:
# Load the prompt templates from prompt.json (resolved against the current working
# directory). The context manager closes the file as soon as it has been parsed, and
# the explicit UTF-8 encoding keeps the read independent of the platform's locale.
with open('prompt.json', encoding='utf-8') as prompt_file:
    all_prompts = json.load(prompt_file)

# Template used to generate the structured summary for each game.
generate_prompt = all_prompts['gpt4o_mini_generate_prompt_structured']

In [None]:
def prompt_replacement(generate_prompt, overall_stats, game_name, game_mean):
    """Fill the placeholders of a prompt template with one game's details.

    Args:
        generate_prompt: Template text. Every occurrence of ``{GAME_NAME_HERE}``,
            ``{GAME_AVERAGE_HERE}``, ``{TWO_UNDER}``, ``{ONE_UNDER}``, ``{ONE_OVER}``,
            ``{HALF_OVER}`` and ``{OVERALL_MEAN}`` is replaced.
        overall_stats: Mapping that provides the keys ``two_under``, ``one_under``,
            ``one_over``, ``half_over`` and ``overall_mean``. Values may be of any
            type; the mapping itself is not modified.
        game_name: Value substituted for ``{GAME_NAME_HERE}``.
        game_mean: Value substituted for ``{GAME_AVERAGE_HERE}``. Numbers are
            accepted as well as strings.

    Returns:
        The template with all placeholders replaced.

    Raises:
        KeyError: If ``overall_stats`` lacks one of the required keys.
    """
    # Substitutions are applied in this fixed order.
    placeholder_values = (
        ("{GAME_NAME_HERE}", game_name),
        ("{GAME_AVERAGE_HERE}", game_mean),
        ("{TWO_UNDER}", overall_stats['two_under']),
        ("{ONE_UNDER}", overall_stats['one_under']),
        ("{ONE_OVER}", overall_stats['one_over']),
        ("{HALF_OVER}", overall_stats['half_over']),
        ("{OVERALL_MEAN}", overall_stats['overall_mean']),
    )

    current_prompt = generate_prompt
    for placeholder, value in placeholder_values:
        # str.replace raises TypeError for non-string arguments, so every value
        # (including a numeric game_mean) is converted to text first.
        current_prompt = current_prompt.replace(placeholder, str(value))
    return current_prompt

In [None]:
# Generate a summary for each selected game; the [:1] slice limits the run to the
# first id in game_ids.
for game_id in game_ids[:1]:

    # NOTE(review): check_dynamo_db_key is neither imported nor defined in the visible
    # cells — confirm where it comes from. Presumably a truthy result means this
    # game_id already has a stored description, so the game is skipped.
    if check_dynamo_db_key(game_id):
        continue

    df, game_name, avg_rating = get_single_game_row(all_games_df, game_id, sample_pct=sample_pct)
    game_id = df['BGGId'].iloc[0]
    reviews = df['combined_review'].to_list()

    # Load this game's reviews into the collection before generating.
    add_collection_batch(client, collection_name, game_id, reviews)

    # Use the value returned for this game, not the all-games mean (which the prompt
    # already receives through overall_stats['overall_mean']). It is converted to text
    # because the placeholder substitution works on strings.
    # NOTE(review): assumes avg_rating is this game's average rating — confirm against
    # get_single_game_row.
    current_prompt = prompt_replacement(generate_prompt, overall_stats, game_name, str(avg_rating))

    summary = generate_aggregated_review(client, collection_name, game_id, current_prompt)
    divide_and_process_generated_summary(game_id, summary=summary.generated)
    print(f"\n\n{summary.generated}")

    # Clean the collection up again using the helper that is actually imported
    # (the previous name, remove_collection_batch, is not defined anywhere visible).
    # NOTE(review): argument list carried over unchanged — confirm it matches the
    # signature of remove_collection_items.
    remove_collection_items(client, collection_name, game_id, reviews)

In [None]:
# Read one item back from DynamoDB, keyed by whatever game_id currently holds
# (in the visible cells it is only assigned by the generation loop).
table_name = 'game_generated_descriptions'
dynamodb_client = boto3.client('dynamodb')

item_key = {'game_id': {'S': game_id}}
lookup_result = dynamodb_client.get_item(TableName=table_name, Key=item_key)

# Indexing 'Item' raises KeyError when the lookup result carries no item.
response = lookup_result['Item']
response
