In [None]:
import pandas as pd
import weaviate
import weaviate.classes as wvc
from weaviate.util import generate_uuid5
from weaviate.classes.query import Filter
import os
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import time
import json
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings raised by the cells below — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Load environment variables from the parent directory's .env file
# (OPENAI_API_KEY is read from os.environ when the Weaviate client is created).
load_dotenv("../.env")

# Model name handed to the Weaviate OpenAI generative module; also embedded in output file names.
ai_generator = "gpt-4o-mini"
# Fraction of a game's ratings used as the target sample size (.05 == 5%); also embedded in output file names.
sample_pct=.05

In [None]:
def filter_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is split with NLTK's ``word_tokenize``; tokens whose lower-cased
    form appears in NLTK's English stop-word list are dropped and the
    remaining tokens are joined back together with single spaces.

    Args:
        text: The string to filter.

    Returns:
        A space-joined string of the surviving tokens (empty if none survive).
    """
    # This function is applied once per review, so build the stop-word set on
    # first use and reuse it instead of reloading the corpus on every call.
    stop_words = getattr(filter_stopwords, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        filter_stopwords._stop_words = stop_words

    return " ".join(
        token for token in word_tokenize(text) if token.lower() not in stop_words
    )

def evaluate_quality_words_over_thresh(text, min_words=5):
    """Return whether ``text`` is long enough to count as a quality review.

    Args:
        text: The review text to evaluate.
        min_words: Token count that must be exceeded. Defaults to 5, the
            threshold this notebook has always used.

    Returns:
        True when ``word_tokenize(text)`` yields strictly more than
        ``min_words`` tokens, otherwise False.
    """
    return len(word_tokenize(text)) > min_words

In [None]:


def create_weaviate_client():
    """Connect to the local Weaviate instance and (re)create the "Reviews" collection.

    Any existing "Reviews" collection is deleted first, so every call starts
    from an empty collection. The collection is configured with the OpenAI
    text2vec vectorizer and the OpenAI generative module (model taken from the
    module-level ``ai_generator``).

    Returns:
        The connected Weaviate client, left open for the caller to use.

    Raises:
        KeyError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    client = weaviate.connect_to_local(
        headers={"X-OpenAI-Api-Key": os.environ["OPENAI_API_KEY"]}
    )

    try:
        # Drop any previous collection so stale reviews never leak into a new run.
        if client.collections.exists("Reviews"):
            client.collections.delete("Reviews")

        client.collections.create(
            name="Reviews",
            # Set the vectorizer to "text2vec-openai" to use the OpenAI API for vector-related operations
            vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(
                model="ada",
                model_version="002",
                type_="text",
                vectorize_collection_name=False,
            ),
            # Set the generative module to OpenAI (model from `ai_generator`) for RAG
            generative_config=wvc.config.Configure.Generative.openai(model=ai_generator),
            properties=[
                wvc.config.Property(
                    name="review_text",
                    data_type=wvc.config.DataType.TEXT,
                ),
                # product_id is only used for filtering, so keep it out of the embedding.
                wvc.config.Property(
                    name="product_id",
                    data_type=wvc.config.DataType.TEXT,
                    skip_vectorization=True,
                    vectorize_property_name=False,
                ),
            ],
        )
    except Exception:
        # Don't leave the connection open when collection setup fails.
        client.close()
        raise

    return client


# Module-level client shared by the cells below.
# Running this cell deletes and recreates the "Reviews" collection.
client = create_weaviate_client()

In [None]:
def add_collection(client, game_id, reviews):
    """Insert a game's reviews into the "Reviews" collection, skipping duplicates.

    Each object's UUID is generated from its properties (review text plus
    game id), so a review that is already stored for the same game is skipped
    rather than inserted again.

    Args:
        client: Connected Weaviate client.
        game_id: Identifier stored in each object's ``product_id`` property.
        reviews: Iterable of review strings stored as ``review_text``.
    """
    collection = client.collections.get("Reviews")
    print(f"Adding reviews for game {game_id}")

    with collection.batch.dynamic() as batch:
        for review in reviews:
            review_item = {
                "review_text": review,
                "product_id": game_id,
            }
            uuid = generate_uuid5(review_item)

            # The UUID is derived from the properties themselves, so an object
            # with this UUID already holds the same data: nothing to update.
            if collection.data.exists(uuid):
                continue

            batch.add_object(properties=review_item, uuid=uuid)

    # Surface import errors instead of silently dropping reviews.
    failed = getattr(collection.batch, "failed_objects", None)
    if failed:
        print(f"{len(failed)} reviews failed to import for game {game_id}")

In [None]:
def generate_aggregated_review(game_id, generate_prompt):
    """Run a grouped generative query over one game's stored reviews.

    Uses the module-level ``client`` to run a ``near_text`` search in the
    "Reviews" collection, restricted to objects whose ``product_id`` equals
    ``game_id``, and passes ``generate_prompt`` as the grouped task.

    No ``limit`` is passed to the query.
    NOTE(review): confirm the default result limit covers the intended sample.

    Args:
        game_id: Value matched against the ``product_id`` property.
        generate_prompt: Prompt text used as the grouped generation task.

    Returns:
        The response object returned by ``collection.generate.near_text``.
    """
    print(f"Generating aggregated review for game {game_id}")

    reviews_collection = client.collections.get("Reviews")
    only_this_game = Filter.by_property("product_id").equal(game_id)

    return reviews_collection.generate.near_text(
        query="aggregate_review",
        return_properties=["review_text", "product_id"],
        filters=only_this_game,
        grouped_task=generate_prompt,
    )

In [None]:
def refine_for_specific_game(df, game_id):
    """Build the cleaned, sampled review frame for one game.

    Steps:
      1. Keep only rows whose ``BGGId`` equals ``game_id``.
      2. Derive a target sample size (``sample_pct`` of the game's ratings) and
         a per-rounded-rating quota from the distribution of all of the game's
         ratings, with or without a comment.
      3. Drop ratings without a comment, strip every character that is not a
         letter, digit or space, lower-case, and remove stop words.
      4. Drop comments rejected by ``evaluate_quality_words_over_thresh``.
      5. If at least ``sample_size`` quality reviews remain, draw a stratified
         sample per rounded rating (seeded, so repeatable); otherwise keep all
         quality reviews.
      6. Prefix each cleaned comment with its rating.

    Args:
        df: Ratings frame with at least the columns ``BGGId``, ``Name``,
            ``Description``, ``AvgRating``, ``rating`` and ``value``.
        game_id: ``BGGId`` value to select.

    Returns:
        Tuple ``(reviews_df, game_name, avg_rating)``. ``reviews_df`` has the
        columns ``BGGId``, ``Description`` and ``combined_review`` and may be
        empty when no review survives cleaning/sampling.

    Raises:
        ValueError: If ``df`` contains no rows for ``game_id``.
    """

    # immediately filter to only the game_id we're interested in
    df = df[df['BGGId']==game_id].reset_index(drop=True)
    if df.empty:
        raise ValueError(f"No ratings found for game id {game_id!r}")

    game_name = df['Name'].iloc[0]
    # read the game-level average up front, like the name, so it is still
    # available when no review survives the filters below
    # (assumes AvgRating is identical on every row of a game - TODO confirm)
    avg_rating = round(df['AvgRating'].iloc[0], 1)

    print(f"\n\nBuilding review data frame for game {game_name}: {game_id}")

    # target sample size is `sample_pct` of the game's ratings, split across
    # rounded-rating buckets in proportion to how often each rating occurs
    df['rounded_rating'] = df['rating'].round(0).astype(int)
    sample_size = int(len(df) * sample_pct)  # Desired total sample size
    group_sizes = round(df['rounded_rating'].value_counts(normalize=True) * sample_size, 0).astype(int)
    print(f"Desired sample size: {sample_size}")

    # refine to only ratings with comments and clean all comments
    # (explicit copies: columns are assigned on the filtered frames below)
    df = df[df['value'].notna()].copy()
    count_reviews_all_comments = len(df)
    print(f"Total reviews with comments: {count_reviews_all_comments}")
    df['value'] = df['value'].replace(r'[^A-Za-z0-9 ]+', '', regex=True)
    df['value'] = df['value'].str.lower().apply(filter_stopwords)

    # astype(bool) keeps the mask boolean even when there are no rows left
    quality_mask = df['value'].apply(evaluate_quality_words_over_thresh).astype(bool)
    df = df[quality_mask].copy()
    removed_reviews = count_reviews_all_comments - len(df)
    print(f"Total quality reviews: {len(df)}. {removed_reviews} reviews removed due to quality threshold")

    if len(df) < sample_size:
        print("Not enough quality reviews to sample from; using all reviews")
    else:
        print(f"Stratified sampling to {sample_size} reviews")
        rating_counts = df['rounded_rating'].value_counts()
        # Ensure we don't sample more than the available values in each group
        adjusted_group_sizes = group_sizes.clip(upper=rating_counts)
        df = (
            df.groupby('rounded_rating', group_keys=False)
            .apply(lambda x: x.sample(n=int(adjusted_group_sizes[x.name]), random_state=42))
        )

    # prefix each cleaned comment with its rating so the score travels with the text
    df['combined_review'] = (df['rating'].astype("string") + " " + df['value']).astype("string")

    df = df[['BGGId','Description','combined_review']]

    return df, game_name, avg_rating

In [None]:
# NOTE(review): unpickling can execute arbitrary code — only load these files from a trusted source.
user_df = pd.read_pickle("../data/prod/users/user_dfs_clean/complete_user_ratings.pkl")
game_df = pd.read_pickle("../data/prod/games/game_dfs_clean/games_clean.pkl")
# Attach each game's name, description and average rating to every user rating row
# (left join: rating rows without a matching game are kept).
all_games_df = user_df.merge(game_df[['BGGId','Name','Description','AvgRating']], on="BGGId", how="left")
# Store ids as strings so they compare equal to the string game ids used in the cells below.
all_games_df["BGGId"] = all_games_df["BGGId"].astype("string")

In [None]:
# Ids (as strings) of the 25 games with the highest BayesAvgRating, best first.
top_25 = [
    str(bgg_id)
    for bgg_id in game_df.sort_values("BayesAvgRating", ascending=False)['BGGId'][:25].to_list()
]

In [None]:
# Ids (as strings) of the 10 games with the lowest BayesAvgRating, worst first.
bottom_10 = [
    str(bgg_id)
    for bgg_id in game_df.sort_values("BayesAvgRating", ascending=True)['BGGId'][:10].to_list()
]

In [None]:
# Generated summaries keyed by game id. The loops below skip ids already present,
# so re-running this cell discards every summary collected so far.
overall_summary = {}

In [None]:
# Load the prompt template; the loops below substitute GAME_NAME_HERE and
# GAME_AVERAGE_HERE per game. The context manager closes the file handle.
with open('prompt.json', encoding='utf-8') as prompt_file:
    generate_prompt = json.load(prompt_file)['gpt4o_mini_generate_prompt']
generate_prompt

In [None]:
# Summarize one hand-picked game: sample its reviews, load them into Weaviate,
# then ask the generative module for an aggregated review.
for game_id in ['318009']:
    # Games that already have a summary are left untouched.
    if game_id not in overall_summary:
        df, game_name, avg_rating = refine_for_specific_game(all_games_df, game_id)
        game_id = df['BGGId'].iloc[0]
        reviews = df['combined_review'].to_list()
        add_collection(client, game_id, reviews)

        # Fill the template placeholders with this game's name and average rating.
        current_prompt = (
            generate_prompt
            .replace("GAME_NAME_HERE", game_name)
            .replace("GAME_AVERAGE_HERE", str(avg_rating))
        )

        summary = generate_aggregated_review(game_id, current_prompt)
        overall_summary[game_id] = summary.generated
        print(f"\n\n{summary.generated}")

        # Pause between games before issuing the next generation request.
        time.sleep(5)

In [None]:
# Summarize the lowest-rated games: sample each game's reviews, load them into
# Weaviate, then ask the generative module for an aggregated review.
for game_id in bottom_10:
    # Games that already have a summary are left untouched.
    if game_id not in overall_summary:
        df, game_name, avg_rating = refine_for_specific_game(all_games_df, game_id)
        game_id = df['BGGId'].iloc[0]
        reviews = df['combined_review'].to_list()
        add_collection(client, game_id, reviews)

        # Fill the template placeholders with this game's name and average rating.
        current_prompt = (
            generate_prompt
            .replace("GAME_NAME_HERE", game_name)
            .replace("GAME_AVERAGE_HERE", str(avg_rating))
        )

        summary = generate_aggregated_review(game_id, current_prompt)
        overall_summary[game_id] = summary.generated
        print(f"\n\n{summary.generated}")

        # Pause between games before issuing the next generation request.
        time.sleep(5)

In [None]:
# Summarize the highest-rated games: sample each game's reviews, load them into
# Weaviate, then ask the generative module for an aggregated review.
for game_id in top_25:
    # Games that already have a summary are left untouched.
    if game_id not in overall_summary:
        df, game_name, avg_rating = refine_for_specific_game(all_games_df, game_id)
        game_id = df['BGGId'].iloc[0]
        reviews = df['combined_review'].to_list()
        add_collection(client, game_id, reviews)

        # Fill the template placeholders with this game's name and average rating.
        current_prompt = (
            generate_prompt
            .replace("GAME_NAME_HERE", game_name)
            .replace("GAME_AVERAGE_HERE", str(avg_rating))
        )

        summary = generate_aggregated_review(game_id, current_prompt)
        overall_summary[game_id] = summary.generated
        print(f"\n\n{summary.generated}")

        # Pause between games before issuing the next generation request.
        time.sleep(5)


In [None]:
# One row per summarized game: the dict key becomes BGGId, the generated text becomes summary.
games_with_summaries = (
    pd.DataFrame.from_dict(overall_summary, orient='index')
    .rename_axis("BGGId")
    .reset_index()
    .rename(columns={0: "summary"})
)
len(games_with_summaries)

In [None]:
# Persist the summaries; the file name records the model and the sample_pct value used for this run.
games_with_summaries.to_pickle(f"games_with_ai_summaries_{ai_generator}_{sample_pct}pct_sample.pkl")

In [None]:
# Reload the summaries saved above for the current model / sample_pct.
gpt4o_mini_05_results = pd.read_pickle(f"games_with_ai_summaries_{ai_generator}_{sample_pct}pct_sample.pkl")
# NOTE(review): the "_1pct_sample" and "gpt4" files below are not written anywhere in this
# notebook — presumably left over from earlier runs with other settings; confirm they exist
# before running, otherwise these reads fail.
gpt4o_mini_1_results = pd.read_pickle(f"games_with_ai_summaries_{ai_generator}_1pct_sample.pkl")
gpt4_results = pd.read_pickle("games_with_ai_summaries_gpt4.pkl")

In [None]:
# Line the three result sets up side by side, one row per game from the current run.
# The two gpt-4o-mini result sets share column names, so suffixes tell them apart.
merged_one = pd.merge(
    gpt4o_mini_05_results,
    gpt4o_mini_1_results,
    how="left",
    on="BGGId",
    suffixes=("_4mini_5pct", "_4mini_1pct"),
)
merged_two = pd.merge(merged_one, gpt4_results, how="left", on="BGGId")
# After the first merge no plain "summary" column remains, so this rename only
# affects the column contributed by the GPT-4 results.
df = merged_two.rename(columns={"summary": "summary_gpt4_all"})
df.head()

In [None]:
# Export the side-by-side comparison without the index column.
df.to_csv(f"games_with_ai_summaries_{ai_generator}_comparison.csv", index=False)

## Single Game Study

In [None]:
# BGGId (as a string) of the game examined in this section.
single_game = "318009"

In [None]:
# Build the cleaned, sampled review frame for the chosen game.
# Note: this rebinds `df`, which previously held the comparison table.
df, game_name, avg_rating = refine_for_specific_game(all_games_df, single_game)
df.head()

In [None]:
# Take the id from the refined frame and load the game's reviews into the "Reviews" collection.
game_id = df['BGGId'].iloc[0]
reviews = df['combined_review'].to_list()
add_collection(client, game_id, reviews)

In [None]:
# NOTE(review): this replaces the prompt template loaded from prompt.json with an empty
# string, so the generation below runs with an empty grouped task, and re-running the
# earlier loops afterwards would use the empty prompt too. Presumably a placeholder —
# fill in the intended prompt before running.
generate_prompt = ""

In [None]:
# Generate the aggregated review for the single game using the prompt set in the previous cell.
summary = generate_aggregated_review(game_id, generate_prompt)

In [None]:
# Show the generated text returned by the grouped task.
review = summary.generated
print(review)