In [1]:
import pandas as pd
from config import CONFIGS
import os
import re
import json

from utils.nlp_functions import initial_components_processing, build_sentiment_elements, filter_stopwords, strip_special_chars_and_lower
 
from typing import Tuple

import json

# hide warnings
import warnings
warnings.filterwarnings("ignore")

# Environment name read from the ENVIRONMENT variable; defaults to "dev" when unset.
ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")

# Sections of the project-wide CONFIGS mapping that this notebook pulls out.
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]

# True only when the IS_LOCAL variable is, case-insensitively, the string "true";
# any other value (or an unset variable) yields False.
IS_LOCAL = os.environ.get("IS_LOCAL", "False").lower() == "true"

# Directory that holds the similarity input/output files used by the cells below.
data_dir = 'data/prod/games/similarity_files/'

# JSON file with the text-cleaning rules; built from data_dir so the directory is
# spelled out in exactly one place (the resulting path string is unchanged).
replacement_dictionary_path = f'{data_dir}replacement_dictionary.json'

# Open through a context manager so the file handle is closed as soon as the JSON
# has been parsed, instead of lingering until garbage collection.
with open(replacement_dictionary_path) as replacement_file:
    replacement_dictionary = json.load(replacement_file)

# The three rule sets consumed by the nlp helper functions in later cells.
replace_dict = replacement_dictionary['direct_replacements']
major_category_dict = replacement_dictionary['major_category_cleaning']
strip_list = replacement_dictionary['words_to_strip_all_right']

IS_LOCAL: True

Checking for local config file and evaluating for updates from S3.
Loading config from local


In [2]:
# Load the source frame from the similarity files directory.
# NOTE: unpickling can execute arbitrary code; only load pickles from a trusted source.
df = pd.read_pickle(f'{data_dir}top_1000_with_attached_rag.pkl')

# Keep the id/name/description columns plus every column whose name contains 'generated'.
desired_columns = ['BGGId', 'Name', 'Description'] + [x for x in df.columns if 'generated' in x]

# Take an explicit copy: later cells assign new and modified columns on desc_df, and
# doing that on a bare column selection of df triggers pandas' SettingWithCopyWarning
# (silenced here by the global warnings filter) with ambiguous write semantics.
desc_df = df[desired_columns].copy()

In [3]:
# Both free-text columns go through the same three cleaning passes, in the same order:
# replacement-dictionary processing, stopword filtering, then special-character
# stripping and lower-casing. 'Description' is fully processed before
# 'generated_description'.
for text_column in ('Description', 'generated_description'):
    desc_df[text_column] = desc_df[text_column].apply(
        lambda text: initial_components_processing(text, replace_dict)
    )
    desc_df[text_column] = desc_df[text_column].apply(filter_stopwords)
    desc_df[text_column] = desc_df[text_column].apply(strip_special_chars_and_lower)

desc_df.head()

Unnamed: 0,BGGId,Name,Description,generated_pros,generated_cons,generated_description
10174,224517,Brass: Birmingham,brass birmingham economic strategy game sequel...,Strategic Depth: Many players appreciate the ...,Steep Learning Curve: Some players find the r...,game strategic economic simulation set industr...
4846,161936,Pandemic Legacy: Season 1,pandemic legacy cooperative campaign game over...,Engaging Storyline: The evolving narrative ke...,Complexity: Some players find the numerous ch...,game cooperative board game players work toget...
5989,174430,Gloomhaven,gloomhavenis game euroinspired tactical combat...,Content Variety: The game offers an immense a...,Setup Time: Many reviews mention that the gam...,game cooperative tactical adventure combines e...
19051,342942,Ark Nova,ark nova plan design modern scientifically man...,Engaging Theme: The zoo management theme is a...,Lengthy Gameplay: Many reviews highlight that...,game strategic board game players manage zoo f...
20469,363622,The Castles of Burgundy: Special Edition,castles burgundy legendary board game design c...,High Production Quality: The game boasts exce...,Complexity for New Players: Some reviews ment...,game strategic board game revolves around deve...


In [4]:
# Run the sentiment extraction over the generated pros and cons. Each resulting
# element is indexed below as [0] -> components and [1] -> sentences.
positive_elements = desc_df['generated_pros'].apply(
    lambda text: build_sentiment_elements(text, major_category_dict, strip_list)
)
negative_elements = desc_df['generated_cons'].apply(
    lambda text: build_sentiment_elements(text, major_category_dict, strip_list)
)

# Keep the original notebook-level names: one-element lists wrapping each result.
components_positive = [positive_elements]
components_negative = [negative_elements]

# Split every extracted element into its components part and its sentences part.
desc_df['Positive_Components'] = [elements[0] for elements in positive_elements]
desc_df['Positive_Sentences'] = [elements[1] for elements in positive_elements]
desc_df['Negative_Components'] = [elements[0] for elements in negative_elements]
desc_df['Negative_Sentences'] = [elements[1] for elements in negative_elements]

desc_df.head()

Unnamed: 0,BGGId,Name,Description,generated_pros,generated_cons,generated_description,Positive_Components,Positive_Sentences,Negative_Components,Negative_Sentences
10174,224517,Brass: Birmingham,brass birmingham economic strategy game sequel...,Strategic Depth: Many players appreciate the ...,Steep Learning Curve: Some players find the r...,game strategic economic simulation set industr...,"[strategic depth, excellent design, player int...",[strategic depth many players appreciate games...,"[steep learning curve, fiddly mechanics, varia...",[steep learning curve players find rules diffi...
4846,161936,Pandemic Legacy: Season 1,pandemic legacy cooperative campaign game over...,Engaging Storyline: The evolving narrative ke...,Complexity: Some players find the numerous ch...,game cooperative board game players work toget...,"[engaging storyline, cooperative, replayabilit...",[engaging storyline evolving narrative keeps p...,"[complex, time commitment, group dependency, f...",[complex players find numerous choices rules o...
5989,174430,Gloomhaven,gloomhavenis game euroinspired tactical combat...,Content Variety: The game offers an immense a...,Setup Time: Many reviews mention that the gam...,game cooperative tactical adventure combines e...,"[content variety, replayability, strategic dep...",[content variety game offers immense amount co...,"[setup, complex, pacing issues, excellent comm...",[setup many reviews mention game requires sign...
19051,342942,Ark Nova,ark nova plan design modern scientifically man...,Engaging Theme: The zoo management theme is a...,Lengthy Gameplay: Many reviews highlight that...,game strategic board game players manage zoo f...,"[engaging theme, creative mechanics, excellent...",[engaging theme zoo management theme appealing...,"[lengthy gameplay, luck factor, complex, analy...",[lengthy gameplay many reviews highlight game ...
20469,363622,The Castles of Burgundy: Special Edition,castles burgundy legendary board game design c...,High Production Quality: The game boasts exce...,Complexity for New Players: Some reviews ment...,game strategic board game revolves around deve...,"[excellent production quality, engaging mechan...",[excellent production quality game boasts exce...,"[complex for new players, setup, potential for...",[complex for new players reviews mention game ...


### Research problematic entries

In [5]:
# Positional (iloc) row numbers of the entries to inspect by hand.
problematic_entries = [373, 10, 136, 334, 214]

# Columns printed for each entry, in display order: raw generated text first,
# then the extracted components, then the extracted sentences.
inspected_columns = (
    'generated_pros',
    'generated_cons',
    'Positive_Components',
    'Negative_Components',
    'Positive_Sentences',
    'Negative_Sentences',
)

for problematic_entry in problematic_entries:
    print(f"\n\nEntry {problematic_entry}")
    # Look the row up once and reuse it for every printed column.
    entry_row = desc_df.iloc[problematic_entry]
    for inspected_column in inspected_columns:
        print(entry_row[inspected_column])
    



Entry 373
 Decisionmaking: Players appreciate the numerous decisions that need to be made, enhancing engagement and strategic depth. Replayability: Many reviews highlight the game's replayability, with players enjoying multiple sessions and finding new strategies each time. Artwork and Components: The quality of the artwork and game components is often praised, contributing positively to the overall experience. Engaging Theme: The historical theme is noted as engaging, providing a backdrop that enhances the gameplay. Tactical Depth: Players find the tactical decisions meaningful, adding to the satisfaction of gameplay. Solid Mechanics: The mechanics are generally wellreceived, with some players noting a lightweight feel that makes it accessible.
 Luck Factor: Several reviews mention a significant luck element, which can lead to frustrating experiences, especially if players draw unfavorable cards. Randomness: The game can feel random at times, which may detract from the strategic exp

In [6]:
# Find the row whose negative components include the one with the most
# whitespace-separated words. Every time a new maximum is found, print the row's
# index label, the new word count, and the component itself.

max_len = 0
max_entry = ""

for row_label, negative_components in desc_df['Negative_Components'].items():
    for component in negative_components:
        word_count = len(component.split())
        if word_count > max_len:
            max_len = word_count
            max_entry = row_label
            print(row_label, max_len, component)


10174 3 steep learning curve
20469 4 complex for new players
16860 5 mixed experiences with player counts


### Save File

In [7]:
# Drop the columns that are no longer needed, rename the generated description to
# 'About', and persist the cleaned frame.
#
# errors='ignore' keeps this cell re-runnable: desc_df is reassigned here, so on a
# second execution the dropped columns are already gone and a strict drop would raise
# KeyError. rename() already skips labels that are absent, so it needs no such guard.
desc_df = (
    desc_df
    .drop(columns=['Name', 'generated_pros', 'generated_cons'], errors='ignore')
    .rename(columns={'generated_description': 'About'})
)
desc_df.to_pickle(f"{data_dir}top_1000_cleaned_rag.pkl")
desc_df.head()

Unnamed: 0,BGGId,Description,About,Positive_Components,Positive_Sentences,Negative_Components,Negative_Sentences
10174,224517,brass birmingham economic strategy game sequel...,game strategic economic simulation set industr...,"[strategic depth, excellent design, player int...",[strategic depth many players appreciate games...,"[steep learning curve, fiddly mechanics, varia...",[steep learning curve players find rules diffi...
4846,161936,pandemic legacy cooperative campaign game over...,game cooperative board game players work toget...,"[engaging storyline, cooperative, replayabilit...",[engaging storyline evolving narrative keeps p...,"[complex, time commitment, group dependency, f...",[complex players find numerous choices rules o...
5989,174430,gloomhavenis game euroinspired tactical combat...,game cooperative tactical adventure combines e...,"[content variety, replayability, strategic dep...",[content variety game offers immense amount co...,"[setup, complex, pacing issues, excellent comm...",[setup many reviews mention game requires sign...
19051,342942,ark nova plan design modern scientifically man...,game strategic board game players manage zoo f...,"[engaging theme, creative mechanics, excellent...",[engaging theme zoo management theme appealing...,"[lengthy gameplay, luck factor, complex, analy...",[lengthy gameplay many reviews highlight game ...
20469,363622,castles burgundy legendary board game design c...,game strategic board game revolves around deve...,"[excellent production quality, engaging mechan...",[excellent production quality game boasts exce...,"[complex for new players, setup, potential for...",[complex for new players reviews mention game ...


In [8]:
# Flatten the per-row component lists into one flat list per sentiment,
# preserving row order and the order of components within each row.
all_positive_components = [
    component
    for row_components in desc_df['Positive_Components']
    for component in row_components
]
all_negative_components = [
    component
    for row_components in desc_df['Negative_Components']
    for component in row_components
]

# Write each flat list out as a JSON array next to the other similarity files.
with open(f"{data_dir}major_positives.json", "w") as positives_file:
    json.dump(all_positive_components, positives_file)

with open(f"{data_dir}major_negatives.json", "w") as negatives_file:
    json.dump(all_negative_components, negatives_file)