In [68]:
import os
import re
import warnings
from functools import lru_cache
from typing import Tuple

import pandas as pd
from nltk.corpus import stopwords
from pydantic import BaseModel, ConfigDict

from config import CONFIGS
from utils.processing_functions import load_file_local_first, save_file_local_first

# hide warnings
warnings.filterwarnings("ignore")

# Notebook-wide settings, resolved once from environment variables and the project CONFIGS mapping.
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]
# Local mode is on only when IS_LOCAL is the string "true" (any casing); unset or anything else is False.
IS_LOCAL = os.getenv("IS_LOCAL", "False").lower() == "true"

In [161]:
def clean_weird_characters(text):
    """Normalise free text to a single line of plain ASCII-ish characters.

    Line breaks and tabs (together with any whitespace around them) become one
    space, so words on adjacent lines stay separate instead of being fused.
    Every character that is not an ASCII letter, digit, colon, comma, period,
    hyphen or whitespace is then removed, and the result is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Replace (not delete) line breaks/tabs: deleting them glued "dice\nmove" into "dicemove".
    text = re.sub(r"\s*[\n\r\t]\s*", " ", text)
    cleaned_text = re.sub(r"[^a-zA-Z0-9:,.\-\s]", "", text).strip()
    return cleaned_text

@lru_cache(maxsize=None)
def _stopword_set(language: str = "english") -> frozenset:
    """Return NLTK's stopword list for ``language`` as a frozenset, built once and then cached."""
    return frozenset(stopwords.words(language))


def filter_stopwords(text: str) -> str:
    """Remove English stopwords from ``text``.

    The text is split on single spaces, tokens whose lowercase form is in
    NLTK's English stopword list are dropped, and the remaining tokens are
    re-joined with single spaces. Tokens keep their original casing and any
    attached punctuation (so "not," is not treated as the stopword "not").

    Args:
        text: The string to filter.

    Returns:
        The text with stopword tokens removed.
    """
    # This function is applied once per DataFrame row; the cached set avoids
    # rebuilding the stopword list from the corpus on every call.
    stop_words = _stopword_set("english")
    return " ".join(word for word in text.split(" ") if word.lower() not in stop_words)

def strip_common_gpt_text(text: str) -> str:
    """Remove recurring boilerplate phrases from generated text and tidy spacing.

    Matching is case-sensitive and all phrases are lowercase, so the input is
    expected to be lowercased already.

    Args:
        text: The (lowercased) string to clean.

    Returns:
        The text with the boilerplate phrases removed, stray " s " / " p " /
        " 's" fragments re-attached to the preceding word, one pass of
        double-space collapsing applied, and "vs." spelled out as "versus".
    """
    # Order matters: a phrase must be removed before any shorter phrase it
    # contains. Otherwise the shorter one matches first and the longer one can
    # never match, leaving " that" / " games" behind.
    boilerplate_phrases = (
        "many players appreciate games",
        "many players appreciate",
        "players appreciate games",
        "players appreciate",
        "players noted that",
        "players noted",
        "game offers",
        "offers",
    )
    for phrase in boilerplate_phrases:
        text = text.replace(phrase, "")

    # Re-attach detached single-letter fragments to the preceding word.
    text = text.replace(" s ", "s ")
    text = text.replace(" p ", "p ")
    text = text.replace(" 's", "s")
    # Single pass: each pair of spaces becomes one (longer runs are only shortened).
    text = text.replace("  ", " ")
    text = text.replace("vs.", "versus")
    return text

def initial_components_processing(row: str) -> list:
    """Normalise one generated text field and split it into sentence fragments.

    Steps, in order:
      1. Insert a space wherever a lowercase letter or digit is directly
         followed by an uppercase letter (splits camelCase-style joins).
      2. Lowercase the text.
      3. Remove common generated boilerplate via ``strip_common_gpt_text``.
      4. Drop every character that is not a word character, whitespace,
         period, comma or colon.
      5. Split on ". " (period followed by a space).

    Args:
        row: The raw text of a single field.

    Returns:
        A list of fragments. Fragments may be empty and may still contain
        periods, commas and colons; callers do the final clean-up.
    """
    row = re.sub('([a-z0-9])([A-Z])', r'\1 \2', row)  # split fused words at lower/digit -> upper boundaries
    row = row.lower()  # must happen before strip_common_gpt_text, whose phrases are lowercase
    row = strip_common_gpt_text(row)  # get rid of common GPT text
    row = re.sub(r'[^\w\s.,:]', '', row)  # keep only word chars, whitespace, periods, commas and colons
    return row.split(". ")  # split into sentences on period + space

def clean_field_to_sentences(row):
    """Turn one generated text field into a list of punctuation-free sentences.

    Stopwords are removed first, the text is then normalised and split by
    ``initial_components_processing``, and finally each fragment has all
    remaining punctuation removed, one pass of double-space collapsing
    applied, and surrounding whitespace stripped.
    """
    without_stopwords = filter_stopwords(row)
    sentences = []
    for fragment in initial_components_processing(without_stopwords):
        no_punctuation = re.sub(r'[^\w\s]', '', fragment)
        sentences.append(no_punctuation.replace("  ", " ").strip())
    return sentences

def clean_field_to_integral_components(row):
    """Extract the short heading of each sentence in a generated text field.

    The field is normalised and split by ``initial_components_processing``.
    For every non-blank fragment, only the text before the first colon is
    kept; punctuation is removed, whitespace tidied, and the result is
    truncated to its first five space-separated tokens.
    """
    headings = []
    for fragment in initial_components_processing(row):
        if fragment.strip() == "":
            continue  # skip blank fragments
        heading = fragment.split(":")[0]
        heading = re.sub(r'[^\w\s]', '', heading).replace("  ", " ").replace("''", "").strip()
        headings.append(" ".join(heading.split(' ')[:5]))
    return headings


In [162]:
# Load the source frame and keep the id/name/description columns plus every
# column whose name contains "generated".
df = load_file_local_first(
    path=GAME_CONFIGS["clean_dfs_directory"],
    file_name="top_1000_with_attached_rag.pkl",
)
desired_columns = ['BGGId', 'Name', 'Description'] + [x for x in df.columns if 'generated' in x]

# Take an explicit copy: later cells assign new columns onto desc_df, and doing
# that on a bare selection of df is the SettingWithCopyWarning pattern (which
# the global warnings filter would otherwise hide).
desc_df = df[desired_columns].copy()
# desc_df.head()

In [163]:
# Clean the description in place and derive component/sentence lists from the
# generated pros and cons. Built in one .assign call so the cell produces a new
# frame rather than writing column-by-column into the existing one.
desc_df = desc_df.assign(
    Description=desc_df['Description'].apply(clean_weird_characters),
    Positive_Components=desc_df['generated_pros'].apply(clean_field_to_integral_components),
    Negative_Components=desc_df['generated_cons'].apply(clean_field_to_integral_components),
    Positive_Sentences=desc_df['generated_pros'].apply(clean_field_to_sentences),
    Negative_Sentences=desc_df['generated_cons'].apply(clean_field_to_sentences),
)

# desc_df.head()

### Test problematic entries

In [164]:
# Positional row index (used with .iloc in the cells below) of the game to spot-check.
entry = 10
# Alternate row to inspect — presumably another problematic entry; uncomment to switch.
# entry = 136

In [165]:
# Raw generated "cons" text for the selected row, before any cleaning.
desc_df['generated_cons'].iloc[entry]

' Complex Rules: Some players find the rules to be fiddly and not clearly explained, which can lead to confusion during gameplay. Pacing Issues: A few reviews mention that the game can feel slow or cumbersome, particularly in terms of setup and execution of turns. Not for Everyone: The cardbased mechanics and narrative elements may not appeal to all players, with some feeling that it lacks depth. Initial Hook: Several users noted that the game did not immediately capture their interest, requiring more time to fully appreciate. Solo Play Limitations: While some enjoy playing solo, others feel that the game is better suited for group play, which can limit its appeal for solo gamers.'

In [166]:
# Short component headings extracted from the same row's cons.
desc_df['Negative_Components'].iloc[entry]

['complex rules',
 'pacing issues',
 'not for everyone',
 'initial hook',
 'solo play limitations']

In [167]:
# Stopword-filtered, punctuation-free sentences derived from the same row's cons.
desc_df['Negative_Sentences'].iloc[entry]

['complex rules players find rules fiddly clearly explained lead confusion gameplay',
 'pacing issues reviews mention game feel slow cumbersome particularly terms setup execution turns',
 'everyone cardbased mechanics narrative elements may appeal players feeling lacks depth',
 'initial hook several users noted game immediately capture interest requiring time fully appreciate',
 'solo play limitations enjoy playing solo others feel game better suited group play limit appeal solo gamers']

In [168]:
# Drop the raw inputs that have been replaced by derived columns, expose the
# generated description as "About", then persist and preview the result.
# NOTE(review): this writes to the current working directory; save_file_local_first
# is imported but not used here — confirm which is intended.
desc_df = (
    desc_df
    .drop(columns=['Name', 'generated_pros', 'generated_cons'])
    .rename(columns={'generated_description': 'About'})
)
desc_df.to_pickle("top_1000_cleaned_rag.pkl")
desc_df.head()

Unnamed: 0,BGGId,Description,About,Positive_Components,Negative_Components,Positive_Sentences,Negative_Sentences
0,224517,Brass: Birmingham is an economic strategy game...,This game is a strategic economic simulation s...,"[strategic depth, excellent design, player int...","[steep learning curve, fiddly mechanics, varia...",[strategic depth games complexity multitude de...,[steep learning curve players find rules diffi...
1,161936,Pandemic Legacy is a co-operative campaign gam...,This game is a cooperative board game where pl...,"[engaging storyline, cooperative gameplay, rep...","[complexity, time commitment, group dependency...",[engaging storyline evolving narrative keeps p...,[complexity players find numerous choices rule...
2,174430,Gloomhaven is a game of Euro-inspired tactica...,This game is a cooperative tactical adventure ...,"[content variety, replayability, tactical dept...","[setup time, complexity, pacing issues, high c...",[content variety immense amount content ensuri...,[setup time many reviews mention game requires...
3,342942,"In Ark Nova, you will plan and design a modern...",This game is a strategic board game where play...,"[engaging theme, unique mechanics, high replay...","[lengthy gameplay, luck factor, complexity, an...",[engaging theme zoo management theme appealing...,[lengthy gameplay many reviews highlight game ...
4,363622,Castles of Burgundy is a legendary Board Game ...,This game is a strategic board game that revol...,"[high production quality, engaging mechanics, ...","[complexity for new players, setup time, poten...",[high production quality game boasts exception...,[complexity new players reviews mention game c...


In [None]:
# Scan every Negative_Sentences entry for the highest whitespace-separated word
# count, printing each new running maximum (row index, word count, sentence).

max_len = 0
max_entry = ""

for i, row in desc_df.iterrows():
    for sentence in row['Negative_Sentences']:
        word_count = len(sentence.split())
        if word_count <= max_len:
            continue  # not a new record
        max_len = word_count
        max_entry = i
        print(i, max_len, sentence)


0 15 steep learning curve players find rules difficult grasp barrier new players unfamiliar heavy strategy games
0 16 lengthy playtime game take significant amount time play may appeal everyone especially looking quicker gaming sessions
10 17 solo play limitations enjoy playing solo others feel game better suited group play limit appeal solo gamers
99 18 player count limitations game may scale well different player counts leading less enjoyable experience played right number participants
112 20 mixed experiences player counts game shines certain player counts reviews suggest may perform well fewer players leading less engaging experience
127 21 limited thematic depth reviews mention game lacks strong connection thematic elements expected source material feeling like mashup mechanics rather cohesive journey
