In [None]:
import pandas as pd
import weaviate.classes as wvc
from weaviate.classes.config import Configure
from dotenv import load_dotenv
import warnings
import gc
import time
import json

from exploratory_functions import create_weaviate_client, add_collection_batch, generate_aggregated_review, refine_df_for_specific_game

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')
# Load environment variables from ../.env (relative to the working directory);
# presumably the credentials needed by the Weaviate / OpenAI calls -- confirm.
load_dotenv("../.env")

# Cap pandas display output at 30 columns and 30 rows.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 30)

# Run configuration.
#   ai_generator    -> model name passed to the collection's generative config
#   word_vec        -> label for the vectorizer, used in the output file name
#   collection_name -> Weaviate collection that holds the reviews
#   sample_pct      -> sampling fraction forwarded to refine_df_for_specific_game
ai_generator, word_vec, collection_name = "gpt-4o-mini", "mpnet", "Reviews_MPNet"
sample_pct = 0.05

In [None]:
# Open the Weaviate connection through the project helper.
client = create_weaviate_client()

# Create the collection only when it does not exist yet; an existing collection
# is reused untouched. To rebuild it from scratch, delete it first with
# client.collections.delete(collection_name).
if not client.collections.exists(collection_name):
    # Named vector produced by the HuggingFace all-mpnet-base-v2 model.
    # (An OpenAI "ada"/"002" text2vec vectorizer appears to have been used for
    # earlier runs instead.)
    # NOTE(review): the vector is built from a "title" property, but only
    # review_text and product_id are declared below -- confirm that
    # add_collection_batch actually writes a "title" field.
    title_vector_config = Configure.NamedVectors.text2vec_huggingface(
        name="title_vector",
        source_properties=["title"],
        model="sentence-transformers/all-mpnet-base-v2",
    )

    # Free-text review body.
    review_text_property = wvc.config.Property(
        name="review_text",
        data_type=wvc.config.DataType.TEXT,
    )
    # Game identifier; explicitly excluded from vectorization.
    product_id_property = wvc.config.Property(
        name="product_id",
        data_type=wvc.config.DataType.TEXT,
        skip_vectorization=True,
        vectorize_property_name=False,
    )

    client.collections.create(
        name=collection_name,
        vectorizer_config=[title_vector_config],
        generative_config=wvc.config.Configure.Generative.openai(model=ai_generator),
        properties=[review_text_property, product_id_property],
    )

In [None]:
# Load the cleaned user-ratings frame and the cleaned games frame.
# (pickle files execute code on load -- only read files produced by this project.)
user_df = pd.read_pickle("../data/prod/users/user_dfs_clean/complete_user_ratings.pkl")
game_df = pd.read_pickle("../data/prod/games/game_dfs_clean/games_clean.pkl")

# Mean and standard deviation of AvgRating over all games, rounded to 2 decimals.
avg_rating_stats = game_df["AvgRating"].describe()
game_mean = avg_rating_stats['mean'].round(2)
game_std = avg_rating_stats['std'].round(2)

# Rating thresholds derived from the (already rounded) mean and std; these are
# substituted into the generation prompt by prompt_replacement.
two_under = round(game_mean - 2*game_std, 2)
one_under = round(game_mean - game_std, 2)
half_over = round(game_mean + .5*game_std, 2)
one_over = round(game_mean + game_std, 2)

# Attach the game metadata to every user-rating row (left join keeps all ratings).
game_columns = ['BGGId', 'Name', 'Description', 'AvgRating', 'BayesAvgRating']
all_games_df = user_df.merge(game_df[game_columns], on="BGGId", how="left")
# Ids are handled as strings from here on.
all_games_df["BGGId"] = all_games_df["BGGId"].astype("string")

# Drop the source frames and reclaim their memory.
del game_df, user_df
gc.collect()

In [None]:
# Ids of the 25 best-ranked games by Bayesian average rating.
# all_games_df is the user-ratings frame left-joined with game metadata, so a
# game presumably appears once per rating row. Without de-duplicating on BGGId,
# the first 25 rows of the sort can all belong to the same one or two games.
games_best_first = (
    all_games_df[["BGGId", "BayesAvgRating"]]
    .sort_values("BayesAvgRating", ascending=False)
    .drop_duplicates(subset="BGGId")  # keep each game's first (best-ranked) row
)
top_25 = games_best_first['BGGId'][:25].to_list()
# Plain Python strings, matching the keys used in overall_summary.
top_25 = [str(x) for x in top_25]

In [None]:
# Ids of the 10 worst-ranked games by Bayesian average rating.
# all_games_df is the user-ratings frame left-joined with game metadata, so a
# game presumably appears once per rating row. Without de-duplicating on BGGId,
# the first 10 rows of the sort can all belong to the same game.
games_worst_first = (
    all_games_df[["BGGId", "BayesAvgRating"]]
    .sort_values("BayesAvgRating", ascending=True)
    .drop_duplicates(subset="BGGId")  # keep each game's first (worst-ranked) row
)
bottom_10 = games_worst_first['BGGId'][:10].to_list()
# Plain Python strings, matching the keys used in overall_summary.
bottom_10 = [str(x) for x in bottom_10]

### Produce synopses

In [None]:
# Accumulator for the generated summaries, keyed by BGGId. The loops below skip
# any game id already present here; re-running this cell resets it to empty.
overall_summary = {}

In [None]:
# Read the prompt templates. The context manager closes the file handle
# deterministically (the previous open(...).read() left it to the garbage
# collector), and the encoding is pinned to UTF-8, the standard JSON encoding,
# so the result does not depend on the platform default.
with open('prompt.json', encoding='utf-8') as prompt_file:
    all_prompts = json.load(prompt_file)

# Template for the structured gpt-4o-mini summary; its {PLACEHOLDER} tokens are
# filled in per game by prompt_replacement.
generate_prompt = all_prompts['gpt4o_mini_generate_prompt_structured']
print(generate_prompt)

In [None]:
def prompt_replacement(generate_prompt, game_name, avg_rating):
    """Fill the placeholders of a prompt template for one game.

    Args:
        generate_prompt: Template text containing ``{GAME_NAME_HERE}``,
            ``{GAME_AVERAGE_HERE}`` and the threshold placeholders.
        game_name: Name substituted for ``{GAME_NAME_HERE}`` (must be a str).
        avg_rating: The game's average rating; converted with ``str()``.

    Returns:
        The template with every placeholder replaced. The threshold values
        (``two_under``, ``one_under``, ``one_over``, ``half_over``,
        ``game_mean``) are read from module-level variables at call time.
    """
    # Applied strictly in this order, one after another, so text inserted by an
    # earlier substitution is still visible to the later ones.
    substitutions = (
        ("{GAME_NAME_HERE}", game_name),
        ("{GAME_AVERAGE_HERE}", str(avg_rating)),
        ("{TWO_UNDER}", str(two_under)),
        ("{ONE_UNDER}", str(one_under)),
        ("{ONE_OVER}", str(one_over)),
        ("{HALF_OVER}", str(half_over)),
        ("{OVERALL_AVERAGE}", str(game_mean)),
    )
    filled_prompt = generate_prompt
    for placeholder, value in substitutions:
        filled_prompt = filled_prompt.replace(placeholder, value)
    return filled_prompt

In [None]:
# Generate a summary for a single hand-picked game (BGG id 318009).
for game_id in ['318009']:
    # A game that already has a summary is skipped entirely (no upload, no sleep).
    if game_id in overall_summary:
        continue

    # Narrow the merged frame to this game's reviews (sampled at sample_pct).
    df, game_name, avg_rating = refine_df_for_specific_game(
        all_games_df, game_id, sample_pct=sample_pct
    )
    # From here on use the id exactly as it is stored in the refined frame.
    game_id = df['BGGId'].iloc[0]
    reviews = df['combined_review'].to_list()

    # Upload the reviews, then ask the generative model for one aggregated review.
    add_collection_batch(client, collection_name, game_id, reviews)
    current_prompt = prompt_replacement(generate_prompt, game_name, avg_rating)
    summary = generate_aggregated_review(client, collection_name, game_id, current_prompt)

    generated_text = summary.generated
    overall_summary[game_id] = generated_text
    print(f"\n\n{generated_text}")

    # Pause between games (presumably to stay within API rate limits).
    time.sleep(5)

In [None]:
# Generate summaries for the lowest-ranked games collected in bottom_10.
for game_id in bottom_10:
    # A game that already has a summary is skipped entirely (no upload, no sleep).
    if game_id in overall_summary:
        continue

    # Narrow the merged frame to this game's reviews (sampled at sample_pct).
    df, game_name, avg_rating = refine_df_for_specific_game(
        all_games_df, game_id, sample_pct=sample_pct
    )
    # From here on use the id exactly as it is stored in the refined frame.
    game_id = df['BGGId'].iloc[0]
    reviews = df['combined_review'].to_list()

    # Upload the reviews, then ask the generative model for one aggregated review.
    add_collection_batch(client, collection_name, game_id, reviews)
    current_prompt = prompt_replacement(generate_prompt, game_name, avg_rating)
    summary = generate_aggregated_review(client, collection_name, game_id, current_prompt)

    generated_text = summary.generated
    overall_summary[game_id] = generated_text
    print(f"\n\n{generated_text}")

    # Pause between games (presumably to stay within API rate limits).
    time.sleep(5)

In [None]:
# Generate summaries for the highest-ranked games collected in top_25.
for game_id in top_25:
    # A game that already has a summary is skipped entirely (no upload, no sleep).
    if game_id in overall_summary:
        continue

    # Narrow the merged frame to this game's reviews (sampled at sample_pct).
    df, game_name, avg_rating = refine_df_for_specific_game(
        all_games_df, game_id, sample_pct=sample_pct
    )
    # From here on use the id exactly as it is stored in the refined frame.
    game_id = df['BGGId'].iloc[0]
    reviews = df['combined_review'].to_list()

    # Upload the reviews, then ask the generative model for one aggregated review.
    add_collection_batch(client, collection_name, game_id, reviews)
    current_prompt = prompt_replacement(generate_prompt, game_name, avg_rating)
    summary = generate_aggregated_review(client, collection_name, game_id, current_prompt)

    generated_text = summary.generated
    overall_summary[game_id] = generated_text
    print(f"\n\n{generated_text}")

    # Pause between games (presumably to stay within API rate limits).
    time.sleep(5)


In [None]:
# Turn the {BGGId: summary} dict into a two-column frame: the dict keys become
# the "BGGId" column and the values the "summary" column.
games_with_summaries = (
    pd.DataFrame.from_dict(overall_summary, orient='index')
    .reset_index()
    .rename(columns={"index": "BGGId", 0: "summary"})
)

# Percentage label for the file name. round() is used instead of int() because
# int() truncates binary floating-point error (0.29 * 100 == 28.999999999999996
# would be labelled "28pct"); for the current 0.05 the label is still "5".
sample_pct_label = round(sample_pct * 100)
games_with_summaries.to_pickle(
    f"ai_summaries_{word_vec}_{ai_generator}_{sample_pct_label}pct_sample_structured.pkl"
)

### Join all saved summaries so far

In [None]:
# Load the summary tables saved by this and by earlier runs of the notebook
# (the "ada" files are not written anywhere in this notebook as configured).
# pickle files execute code on load -- only read files produced by this project.

# Percentage label for the file name. round() is used instead of int() because
# int() truncates binary floating-point error (0.29 * 100 == 28.999999999999996
# would resolve to "28pct"); for the current 0.05 the label is still "5".
sample_pct_label = round(sample_pct * 100)

gpt4o_mini_05_results = pd.read_pickle(f"ai_summaries_ada_{ai_generator}_5pct_sample.pkl")
gpt4o_mini_05_structured_results = pd.read_pickle(
    f"ai_summaries_ada_{ai_generator}_{sample_pct_label}pct_sample_structured.pkl"
)
gpt4o_mini_1_results = pd.read_pickle(f"ai_summaries_ada_{ai_generator}_10pct_sample.pkl")
gpt4_results = pd.read_pickle("ai_summaries_ada_gpt4.pkl")
gpt4o_mini_05_mpnet = pd.read_pickle("ai_summaries_mpnet_gpt-4o-mini_5pct_sample_structured.pkl")

In [None]:
# First pass at lining the summaries up per game: 5% sample vs. 10% sample vs. GPT-4.
# Overlapping column names from the two gpt-4o-mini frames are told apart by suffix.
mini_suffixes = ("_4mini_5pct", "_4mini_10pct")
merged_one = gpt4o_mini_05_results.merge(
    gpt4o_mini_1_results,
    on="BGGId",
    how="left",
    suffixes=mini_suffixes,
)

# Add the GPT-4 summaries and label their column.
merged_two = merged_one.merge(gpt4_results, on="BGGId", how="left")
merged_two = merged_two.rename(columns={"summary": "summary_gpt4"})
merged_two.head()

In [None]:
# Build the comparison table: one row per game of the 5%-sample run, with one
# column per summary variant. Every step is a left join on BGGId followed by a
# rename of the freshly added "summary" column.

# gpt-4o-mini, 5% sample vs. 10% sample.
mini_column_names = {"summary_x": "mini_05", "summary_7": "mini_1"}
merged_one = gpt4o_mini_05_results.merge(
    gpt4o_mini_1_results,
    on="BGGId",
    how="left",
    suffixes=("_x", "_7"),
)
merged_one = merged_one.rename(columns=mini_column_names)

# GPT-4.
merged_two = merged_one.merge(gpt4_results, on="BGGId", how="left")
merged_two = merged_two.rename(columns={"summary": "gpt4"})

# gpt-4o-mini with the structured prompt.
merged_three = merged_two.merge(gpt4o_mini_05_structured_results, on="BGGId", how="left")
merged_three = merged_three.rename(columns={"summary": "mini_05_struct"})

# gpt-4o-mini on the MPNet-vectorized collection.
df = merged_three.merge(gpt4o_mini_05_mpnet, on="BGGId", how="left")
df = df.rename(columns={"summary": "mini_mpnet"})
df.head()

In [None]:
# Write the side-by-side comparison table to CSV, without the row index.
comparison_csv_path = f"games_with_ai_summaries_{ai_generator}_comparison.csv"
df.to_csv(comparison_csv_path, index=False)