In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas and
# sentence-transformers; consider narrowing the filter.
warnings.filterwarnings('ignore')
# Read variables from the .env file one directory above the notebook.
load_dotenv("../.env")

# Cap DataFrame previews at 30 rows / 30 columns.
pd.options.display.max_columns = 30
pd.options.display.max_rows = 30

# Experiment settings.
ai_generator, word_vec = "gpt-4o-mini", "mpnet"
collection_name = "Reviews_MPNet"
sample_pct = 0.05

In [None]:
# BGG id -> game name lookup.
games = pd.read_pickle("../data/prod/games/game_dfs_clean/games_clean.pkl")
games = games[['BGGId', 'Name']]

# Attach the game name to every AI summary and keep only the columns used below.
# The previous `Name.isin(names)` filter was dropped: `names` was built from that
# same column, so the filter could never remove a row.
summaries = pd.read_csv('./ai_summaries_comparison.csv')
summaries = summaries.merge(games, on='BGGId', how='left')[['BGGId', 'mini_mpnet', 'Name']]

# Taken from the final frame so that names[i] always labels row i of `summaries`.
# NOTE(review): with how='left', a BGGId missing from `games` yields a NaN name.
names = summaries['Name'].tolist()
summaries.head()

In [None]:
def _about_paragraph(raw_summary, name):
    """Return the "What is this game about?" text of one AI summary.

    Keeps everything before the Pros section, drops the section heading and all
    quote characters, then removes the "<name> is " lead-in.
    """
    about = raw_summary.split("\n\n### Pros")[0].replace("### What is this game about?\n", "")
    about = about.replace('"', '').replace("'", "")
    # Quotes were just removed from the text, so remove them from the name too;
    # otherwise a name containing ' or " could never match its own lead-in.
    bare_name = str(name).replace('"', '').replace("'", "")
    return about.replace(f"{bare_name} is ", "")


full_descriptions = [
    _about_paragraph(raw_summary, name)
    for raw_summary, name in zip(summaries['mini_mpnet'].tolist(), names)
]

# Split each description once and derive the three sentence variants from it.
_sentence_parts = [description.split(". ") for description in full_descriptions]

# Variant 1: first sentence only.
sentences1 = [parts[0] for parts in _sentence_parts]
# Variant 2: second sentence only. A one-sentence description has no second
# sentence; use "" there instead of raising IndexError.
sentences2 = [parts[1] if len(parts) > 1 else "" for parts in _sentence_parts]
# Variant 3: first two sentences joined back together.
sentences3 = [". ".join(parts[:2]) for parts in _sentence_parts]

# Preview only; the full list is long.
sentences3[:10]

In [None]:
# Embed the two-sentence descriptions with a single model and build the
# game-by-game similarity matrix.
model_comparison = {}
model_name = "BAAI/bge-m3"
# model_name = "all-mpnet-base-v2"
model = SentenceTransformer(model_name)

print("\n\nRunning with model: {}".format(model_name))
print("Model loaded")

# One embedding per description.
embeddings = model.encode(sentences3)
print("Embeddings generated")

# Every description scored against every other description.
similarities = model.similarity(embeddings, embeddings)
print("Similarities generated")

# Label both axes with the game names.
similarity_df = pd.DataFrame(data=similarities, index=names, columns=names)
print("Similarity dataframe created")


In [None]:
# Game-by-game similarity matrix; the preview is truncated by the
# display.max_rows / display.max_columns options set at the top.
similarity_df

In [None]:
# Make a list of all the pairs of games and their similarity scores.
# Only the upper triangle (j > i) is visited, so each unordered pair appears
# once and a game is never paired with itself.

# Index a plain array instead of calling .iloc once per pair, and take the
# labels from the frame itself so they cannot drift out of sync with it.
_sim_matrix = similarity_df.to_numpy()
_game_labels = list(similarity_df.index)

similarity_list = [
    [_game_labels[i], _game_labels[j], _sim_matrix[i, j]]
    for i in range(len(_game_labels))
    for j in range(i + 1, len(_game_labels))
]

# Most similar pairs first.
sim_df = pd.DataFrame(
    similarity_list, columns=['Game1', 'Game2', 'Similarity']
).sort_values('Similarity', ascending=False)

sim_df.head(30)

In [None]:
# sim_df is sorted by Similarity descending, so the tail holds the 30 least similar pairs.
sim_df.tail(30)

### Test multiple embedding models

In [None]:
# Candidate sentence-embedding models for the comparison.
model_names = [
    "all-mpnet-base-v2",
    "multi-qa-mpnet-base-cos-v1",
    "all-MiniLM-L6-v2",
    "BAAI/bge-m3",
]

In [None]:
# Compare how the choice of input text (first sentence, second sentence, first
# two sentences) changes the similarity score of four reference game pairs.
model_comparison = {}
model_name = "BAAI/bge-m3"
model = SentenceTransformer(model_name)
# The model is loaded once, so this is reported once rather than per iteration.
print("Model loaded")

# To compare several models, wrap the loop below in `for model_name in model_names:`
# and load the model inside it.

sentence_sets = {
    "first sentence": sentences1,
    "second sentence": sentences2,
    "first two sentences": sentences3,
}

for i, (label, sentences) in enumerate(sentence_sets.items()):
    # Name the variant being run; the old message printed sentences[i], an
    # arbitrary sentence selected by the variant counter.
    print(f"\n\nRunning with model: {model_name} on {label}")

    embeddings = model.encode(sentences)
    print("Embeddings generated")
    similarities = model.similarity(embeddings, embeddings)
    print("Similarities generated")

    similarity_df = pd.DataFrame(similarities, columns=names, index=names)
    print("Similarity dataframe created")

    # Hand-picked (row, column) positions in the similarity matrix.
    # NOTE(review): presumably pairs of related games; they depend on the row
    # order of `names` and need at least 10 games. Confirm they still apply.
    gh_sim = similarity_df.iloc[1, 4]
    burg_sim = similarity_df.iloc[2, 9]
    brass_sim = similarity_df.iloc[0, 8]
    space_sim = similarity_df.iloc[3, 6]

    print(f"Scores: GH: {gh_sim}, Burg: {burg_sim}, Brass: {brass_sim}, Space: {space_sim}")

    # Keys end in the variant counter: 0, 1, 2.
    model_comparison.update({
        f"gh_{model_name}_{i}": gh_sim,
        f"burg_{model_name}_{i}": burg_sim,
        f"brass_{model_name}_{i}": brass_sim,
        f"space_{model_name}_{i}": space_sim,
    })

In [None]:
# NOTE(review): this cell runs the same experiment as the cell before it and
# rebuilds model_comparison with the same keys; consider deleting one of them.
#
# Compare how the choice of input text (first sentence, second sentence, first
# two sentences) changes the similarity score of four reference game pairs.
model_comparison = {}
model_name = "BAAI/bge-m3"
model = SentenceTransformer(model_name)
# The model is loaded once, so this is reported once rather than per iteration.
print("Model loaded")

# To compare several models, wrap the loop below in `for model_name in model_names:`
# and load the model inside it.

sentence_sets = {
    "first sentence": sentences1,
    "second sentence": sentences2,
    "first two sentences": sentences3,
}

for i, (label, sentences) in enumerate(sentence_sets.items()):
    # Name the variant being run; the old message printed sentences[i], an
    # arbitrary sentence selected by the variant counter.
    print(f"\n\nRunning with model: {model_name} on {label}")

    embeddings = model.encode(sentences)
    print("Embeddings generated")
    similarities = model.similarity(embeddings, embeddings)
    print("Similarities generated")

    similarity_df = pd.DataFrame(similarities, columns=names, index=names)
    print("Similarity dataframe created")

    # Hand-picked (row, column) positions in the similarity matrix.
    # NOTE(review): presumably pairs of related games; they depend on the row
    # order of `names` and need at least 10 games. Confirm they still apply.
    gh_sim = similarity_df.iloc[1, 4]
    burg_sim = similarity_df.iloc[2, 9]
    brass_sim = similarity_df.iloc[0, 8]
    space_sim = similarity_df.iloc[3, 6]

    print(f"Scores: GH: {gh_sim}, Burg: {burg_sim}, Brass: {brass_sim}, Space: {space_sim}")

    # Keys end in the variant counter: 0, 1, 2.
    model_comparison.update({
        f"gh_{model_name}_{i}": gh_sim,
        f"burg_{model_name}_{i}": burg_sim,
        f"brass_{model_name}_{i}": brass_sim,
        f"space_{model_name}_{i}": space_sim,
    })

In [None]:
# One row per model_comparison entry: the key goes in 'game', the value in
# 'Similarity Score', ordered by key.
scores = (
    pd.DataFrame.from_dict(model_comparison, orient='index')
    .reset_index()
    .rename(columns={'index': 'game', 0: 'Similarity Score'})
    .sort_values("game")
)
scores