In [None]:
import pandas as pd
from config import CONFIGS
import os
import re

from utils.processing_functions import load_file_local_first, save_file_local_first
from pydantic import BaseModel, ConfigDict
from nltk.corpus import stopwords

from typing import Tuple

# hide warnings
import warnings
warnings.filterwarnings("ignore")

# Environment name; falls back to "dev" when the variable is unset.
ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")
# True only when the IS_LOCAL variable equals "true" (case-insensitive).
IS_LOCAL = os.environ.get("IS_LOCAL", "False").lower() == "true"

# Entries of the project configuration referenced by this notebook.
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]

In [3]:
def get_major_and_all_components(component_type:str, df:pd.DataFrame) -> Tuple[list, list, list]:
    """Collect the unique, non-empty component strings for one component type.

    Reads the ``{component_type}_Components`` and ``{component_type}_Sentences``
    columns of ``df``. Every cell in those columns must be an iterable of
    strings (for example a list).

    Args:
        component_type: Column prefix, e.g. ``"Positive"`` for the
            ``Positive_Components`` / ``Positive_Sentences`` columns.
        df: Frame holding both columns.

    Returns:
        A 3-tuple ``(all_components, major_components, sentence_components)``:
        the union of both columns, the values of the ``_Components`` column,
        and the values of the ``_Sentences`` column. Each list is
        de-duplicated, has empty strings removed, and is ordered by word
        count (highest first), with ties broken alphabetically.

    Raises:
        KeyError: If either column is missing from ``df``.
    """

    def _ordered(values: set) -> list:
        # Ties are broken alphabetically so the result does not depend on
        # set iteration order, which changes between interpreter runs.
        return sorted(
            (value for value in values if value != ''),
            key=lambda value: (-len(value.split()), value),
        )

    unique_major_components = set()
    for components in df[f'{component_type}_Components']:
        unique_major_components.update(components)

    unique_sentence_components = set()
    for sentences in df[f'{component_type}_Sentences']:
        unique_sentence_components.update(sentences)

    # The combined set is just the union; no need to walk the column again.
    unique_all_components = unique_major_components | unique_sentence_components

    return (
        _ordered(unique_all_components),
        _ordered(unique_major_components),
        _ordered(unique_sentence_components),
    )

In [None]:
# Load the pickled games frame via the project helper.
df = load_file_local_first(
            path=GAME_CONFIGS["clean_dfs_directory"], file_name="top_1000_with_attached_rag.pkl"
        )
# Keep the identifying columns plus every column whose name contains "generated".
desired_columns = ['BGGId','Name','Description']+[x for x in df.columns if 'generated' in x]

# Take an explicit copy: later cells add and overwrite columns on desc_df, and
# doing that on a bare selection of df is flagged by pandas as chained assignment.
desc_df = df[desired_columns].copy()
desc_df.head()

In [5]:
def clean_weird_characters(text):
    """Strip line breaks, tabs and unexpected symbols from ``text``.

    Newlines, carriage returns and tabs are deleted outright (not replaced by
    a space). After that, only ASCII letters, digits, ``:``, ``,``, ``.``,
    ``-`` and whitespace are kept, and the result is trimmed at both ends.
    """
    for control_char in ("\n", "\r", "\t"):
        text = text.replace(control_char, "")
    return re.sub(r"[^a-zA-Z0-9:,.\-\s]", "", text).strip()

# Cache of stop-word sets keyed by language, so the corpus is read only once.
_STOP_WORDS_BY_LANGUAGE: dict = {}

def _get_stop_words(language: str = "english") -> frozenset:
    """Return the NLTK stop-word set for ``language``, loading it on first use."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]

def filter_stopwords(text: str) -> str:
    """Remove English stop words from ``text``.

    The text is split on single spaces, tokens whose lower-cased form is an
    NLTK English stop word are dropped, and the remaining tokens are joined
    back with single spaces. Tokens with attached punctuation (e.g. ``"the,"``)
    are compared as-is and therefore kept.

    Args:
        text: Space-separated input text.

    Returns:
        The text with stop-word tokens removed.
    """
    # Reuse the cached set: this function is applied once per row per column,
    # and rebuilding the set from the corpus on every call is wasted work.
    stop_words = _get_stop_words()
    return " ".join(w for w in text.split(" ") if w.lower() not in stop_words)

def strip_common_gpt_text(text: str) -> str:
    """Remove recurring boilerplate phrases and repair detached suffixes.

    Matching is case-sensitive against the lower-case phrases listed below,
    so callers should lower-case the text first.

    Args:
        text: Text to clean.

    Returns:
        The text with the boilerplate phrases deleted, detached ``" s "``,
        ``" p "`` and ``" 's"`` fragments re-attached to the preceding word,
        and double spaces reduced by a single replacement pass.
    """
    # Longer phrases must come before their own prefixes. If the prefix is
    # removed first, the longer phrase can never match and its tail
    # (" that", " games") is left behind in the text.
    boilerplate_phrases = (
        "many players appreciate games",
        "many players appreciate",
        "players noted that",
        "players noted",
        "game offers",
        "offers",
        "players appreciate games",
        "players appreciate",
    )
    for phrase in boilerplate_phrases:
        text = text.replace(phrase, "")

    # Re-attach suffix fragments that were separated from their word.
    text = text.replace(" s ", "s ")
    text = text.replace(" p ", "p ")
    text = text.replace(" 's", "s")
    # Single pass: each pair of spaces becomes one space.
    text = text.replace("  ", " ")
    return text

def initial_components_processing(row) -> list:
    """Normalise one generated text field and split it into raw pieces.

    Steps, in order: insert a space at every lowercase-or-digit to uppercase
    boundary (camelCase joins), lower-case the text, strip common boilerplate
    phrases, drop stop words, then split on ``". "``.
    """
    spaced = re.sub('([a-z0-9])([A-Z])', r'\1 \2', row)
    without_boilerplate = strip_common_gpt_text(spaced.lower())
    without_stopwords = filter_stopwords(without_boilerplate)
    return without_stopwords.split(". ")

def clean_field_to_sentences(row):
    """Turn one generated text field into a list of punctuation-free sentences.

    Each piece from ``initial_components_processing`` has every character that
    is neither a word character nor whitespace removed, one pass of
    double-space reduction applied, and is trimmed. Empty pieces are kept.
    """
    sentences = []
    for piece in initial_components_processing(row):
        without_punctuation = re.sub(r'[^\w\s]', '', piece)
        sentences.append(without_punctuation.replace("  ", " ").strip())
    return sentences

def clean_field_to_integral_components(row):
    """Extract the leading label of each piece of a generated text field.

    For every non-blank piece from ``initial_components_processing``, keeps
    only the text before the first ``":"``, removes characters that are
    neither word characters nor whitespace, applies one pass of double-space
    reduction and trims the result.
    """
    labels = []
    for piece in initial_components_processing(row):
        if piece.strip() == "":
            continue  # blank pieces are skipped entirely
        head = piece.split(":")[0]
        head = re.sub(r'[^\w\s]', '', head)
        labels.append(head.replace("  ", " ").replace("''", "").strip())
    return labels


In [None]:
# Build the cleaned/derived columns in one step. `assign` returns a new frame
# instead of writing into a frame derived from a selection of `df`, which
# pandas flags as chained assignment.
desc_df = desc_df.assign(
    Description=desc_df['Description'].apply(clean_weird_characters),
    Positive_Components=desc_df['generated_pros'].apply(clean_field_to_integral_components),
    Negative_Components=desc_df['generated_cons'].apply(clean_field_to_integral_components),
    Positive_Sentences=desc_df['generated_pros'].apply(clean_field_to_sentences),
    Negative_Sentences=desc_df['generated_cons'].apply(clean_field_to_sentences),
)

desc_df.head()

In [None]:
# Peek at the first row by position; `[0]` would be a label lookup and fails
# when the frame's index does not contain the label 0.
desc_df['Positive_Sentences'].iloc[0]

In [None]:
# Drop the raw inputs, expose the generated description as "About",
# then write the result to a pickle in the working directory.
desc_df = (
    desc_df
    .drop(columns=['Name','generated_pros','generated_cons'])
    .rename(columns={'generated_description':'About'})
)
desc_df.to_pickle("top_1000_cleaned_rag.pkl")
desc_df.head()