In [75]:
import pandas as pd
from config import CONFIGS
import os
import re

from utils.processing_functions import load_file_local_first, save_file_local_first
from pydantic import BaseModel, ConfigDict
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# hide warnings
import warnings
warnings.filterwarnings("ignore")

# Runtime settings: environment variables first, then sections of the project CONFIGS mapping.
ENVIRONMENT = os.getenv("ENVIRONMENT", "dev")
S3_SCRAPER_BUCKET = CONFIGS["s3_scraper_bucket"]
GAME_CONFIGS = CONFIGS["games"]
RATINGS_CONFIGS = CONFIGS["ratings"]
# Only the (case-insensitive) string "true" switches local mode on; anything else is False.
IS_LOCAL = os.getenv("IS_LOCAL", "False").lower() == "true"

In [None]:
# Load the source frame through the project's load_file_local_first helper.
df = load_file_local_first(
    path=GAME_CONFIGS["clean_dfs_directory"],
    file_name="top_1000_with_attached_rag.pkl",
)

# Keep the identifying columns plus every column whose name contains "generated".
desired_columns = ['BGGId', 'Name', 'Description'] + [
    column for column in df.columns if 'generated' in column
]

# Take an explicit copy: later cells add and overwrite columns on desc_df, and
# assigning into a bare column selection of df triggers SettingWithCopyWarning
# (currently hidden by the blanket warnings filter). The copy also guarantees
# that df itself is never touched by those assignments.
desc_df = df[desired_columns].copy()
desc_df.head()

In [77]:
def clean_weird_characters(text: str) -> str:
    """Flatten line breaks and tabs, then drop characters outside a whitelist.

    Each run of newline, carriage-return and tab characters is replaced by a
    single space, so that words on adjacent lines stay separated instead of
    being glued into one token. After that, every character that is not an
    ASCII letter, a digit, a colon, comma, period, hyphen or whitespace is
    removed.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string. Ordinary spaces are left as they were, so existing
        runs of spaces are not collapsed.
    """
    # Substitute a space (not the empty string) so word boundaries survive.
    text = re.sub(r"[\n\r\t]+", " ", text)
    return re.sub(r"[^a-zA-Z0-9:,.\-\s]", "", text)

# Cache for the NLTK English stop-word set; filled on the first lookup.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOP_WORDS


def filter_stopwords(text: str) -> str:
    """Remove English stop words from ``text``.

    The text is tokenised with ``word_tokenize``; tokens whose lower-cased form
    is in the NLTK English stop-word list are dropped, and the remaining tokens
    (original casing kept) are joined with single spaces.

    Args:
        text: The string to filter.

    Returns:
        The surviving tokens joined by a single space.
    """
    # The stop-word list never changes, so it is loaded once and reused rather
    # than being re-read and rebuilt into a set on every call.
    stop_words = _english_stop_words()
    kept_tokens = [
        token for token in word_tokenize(text) if token.lower() not in stop_words
    ]
    return " ".join(kept_tokens)

def strip_common_gpt_text(text: str) -> str:
    """Delete a fixed set of stock phrases from ``text``.

    The phrases are removed one after another in the order listed, so the
    longer "many players appreciate games" is handled before its shorter
    prefix. Matching is exact and case-sensitive, and the whitespace that
    surrounded a removed phrase is left in place.

    Args:
        text: The string to scrub.

    Returns:
        ``text`` with every occurrence of each phrase removed.
    """
    stock_phrases = (
        "many players appreciate games",
        "many players appreciate",
        "players noted",
    )
    for phrase in stock_phrases:
        text = text.replace(phrase, "")
    return text

def clean_field_to_sentences(row):
    """Split a text field into cleaned, lower-cased sentence strings.

    The field is split on ". ". Each piece then has every character that is
    neither a word character nor whitespace removed, is passed through
    ``filter_stopwords``, lower-cased, and finally passed through
    ``strip_common_gpt_text``. Pairs of spaces are replaced by a single space
    after the punctuation step and again after the phrase-stripping step.

    Args:
        row: The text of one field (a string).

    Returns:
        A list with one cleaned string per piece, in the original order.
    """
    sentences = []
    for piece in row.split(". "):
        without_punctuation = re.sub(r'[^\w\s]', '', piece).replace("  ", " ")
        lowered = filter_stopwords(without_punctuation).lower()
        sentences.append(strip_common_gpt_text(lowered).replace("  ", " "))
    return sentences

def clean_field_to_integral_components(row):
    """Extract the short heading of each period-separated part of a field.

    The whole field is first passed through ``filter_stopwords`` and then split
    on ".". Parts that are blank are skipped. For every remaining part, only
    the text before the first ":" is kept; it is lower-cased and stripped, and
    every character other than word characters, whitespace and commas is
    removed. Pairs of spaces are replaced by a single space.

    Args:
        row: The text of one field (a string).

    Returns:
        A list of the extracted heading strings, in the original order.
    """
    without_stopwords = filter_stopwords(row)
    components = []
    for chunk in without_stopwords.split("."):
        if chunk.strip() == "":
            continue
        heading = chunk.split(":")[0].lower().strip()
        components.append(re.sub(r'[^\w\s,]', '', heading).replace("  ", " "))
    return components


In [None]:
# Spot-check: run the sentence cleaner on a single generated_pros entry.
# NOTE(review): `[1]` is a label lookup when the index holds integers —
# confirm desc_df has a default integer index, otherwise prefer .iloc/.loc.
sample_text = desc_df['generated_pros'][1]

clean_field_to_sentences(sample_text)

In [None]:
# Derive the cleaned text columns in one step. `assign` returns a new frame
# instead of writing into desc_df column by column, which avoids
# SettingWithCopyWarning on a frame that was selected out of df. The
# Description column is replaced in place; the four new columns are appended
# in the order given.
desc_df = desc_df.assign(
    Description=desc_df['Description'].apply(clean_weird_characters),
    Positive_Sentences=desc_df['generated_pros'].apply(clean_field_to_sentences),
    Negative_Sentences=desc_df['generated_cons'].apply(clean_field_to_sentences),
    Positive_Components=desc_df['generated_pros'].apply(clean_field_to_integral_components),
    Negative_Components=desc_df['generated_cons'].apply(clean_field_to_integral_components),
)

desc_df.head()

In [None]:
# Drop the columns that are no longer needed, expose the generated description
# under the name "About", then persist the result via save_file_local_first.
desc_df = (
    desc_df
    .drop(columns=['Name','generated_pros','generated_cons'])
    .rename(columns={'generated_description':'About'})
)
save_file_local_first(
    path=GAME_CONFIGS["clean_dfs_directory"],
    file_name="top_1000_cleaned_rag.pkl",
    data=desc_df,
)
desc_df.head()