In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import os
from pathlib import Path
from nltk.corpus import stopwords

# Load the master dataset generated in Notebook 1.
# NOTE(review): INPUT_FILE is not defined in any cell visible here; it has to be set
# (e.g. in a config cell) before this cell can run on a fresh kernel — confirm.
# SECURITY: unpickling can execute arbitrary code, so only load files produced by a
# trusted pipeline.
df = pd.read_pickle(INPUT_FILE)
# df.shape is a (rows, columns) tuple: report both parts explicitly instead of
# labelling the whole tuple as "rows".
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"Columns: {list(df.columns)}")

Dataset loaded: (76369, 16) rows
Columns: ['review_text', 'review_en', 'rating', 'date', 'user_total_reviews', 'user_id', 'is_local_guide', 'lang', 'park_name', 'text_length', 'year', 'month', 'quarter', 'month_name', 'season', 'gender']


In [None]:
# Language frequency analysis: count how many reviews carry each code in `lang`.
print("--- Language Distribution (Top 20) ---")

# value_counts() orders the codes from most to least frequent.
lang_counts = df["lang"].value_counts()

print(f"Total detected languages: {len(lang_counts)}")
print(lang_counts.head(20))

# Optional: inspect long-tail languages.
# 400 is the same cut-off later passed to filter_minority_languages (which keeps
# counts >= threshold), so this list previews the languages that will be dropped.
print("\n--- Languages with fewer than 400 reviews ---")
print(lang_counts[lang_counts < 400].index.tolist())

--- Language Distribution (Top 20) ---
Total detected languages: 37
lang
es    62439
fr     7331
ru     1641
ca     1596
pt     1249
de      503
nl      314
it      304
iw      213
pl      151
uk      142
ro       93
hu       64
ar       60
cs       27
sv       23
zh       22
bg       22
fi       21
ko       19
Name: count, dtype: int64

--- Languages with fewer than 400 reviews ---
['nl', 'it', 'iw', 'pl', 'uk', 'ro', 'hu', 'ar', 'cs', 'sv', 'zh', 'bg', 'fi', 'ko', 'gl', 'no', 'lt', 'sk', 'eu', 'tr', 'el', 'da', 'bs', 'lv', 'hr', 'zh-Hant', 'et', 'sl', 'gn', 'sr', 'ja']


In [None]:
def filter_minority_languages(df, threshold=400):
    """Keep only the reviews written in languages with sufficient review volume.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``lang`` column with one language code per review.
    threshold : int, default 400
        Minimum number of reviews (inclusive) a language needs to be kept.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose language meets the threshold.
        Rows with a missing ``lang`` are not counted by ``value_counts`` and are
        therefore filtered out as well.

    Notes
    -----
    Prints a short report: languages kept, languages removed and the share of
    reviews removed. An empty input frame is handled (0.00% removed) instead of
    raising ``ZeroDivisionError``.
    """
    print(f"\n--- Language Filtering (Threshold: {threshold} reviews) ---")

    # Reviews per language, most frequent first
    lang_counts = df["lang"].value_counts()

    # Split the languages with a single boolean mask (order of lang_counts is kept)
    meets_threshold = lang_counts >= threshold
    valid_langs = lang_counts[meets_threshold].index.tolist()
    removed_langs = lang_counts[~meets_threshold].index.tolist()

    # Apply filter; .copy() so that later column assignments do not touch `df`
    initial_len = len(df)
    df_filtered = df[df["lang"].isin(valid_langs)].copy()

    # Report (guard the percentage against an empty input frame)
    n_removed = initial_len - len(df_filtered)
    removed_share = n_removed / initial_len if initial_len else 0.0

    print(f"Languages kept ({len(valid_langs)}): {valid_langs}")
    print(f"Languages removed ({len(removed_langs)}): {removed_langs}")
    print(f"Reviews removed: {n_removed} ({removed_share:.2%})")

    return df_filtered


# Execution: keep only the languages with at least 400 reviews.
# filter_minority_languages returns a copy, so `df` itself is left unchanged.
df_filtered = filter_minority_languages(df, threshold=400)

print(f"\nTotal reviews after language filter: {len(df_filtered)}")


--- Language Filtering (Threshold: 400 reviews) ---
Languages kept (6): ['es', 'fr', 'ru', 'ca', 'pt', 'de']
Languages removed (31): ['nl', 'it', 'iw', 'pl', 'uk', 'ro', 'hu', 'ar', 'cs', 'sv', 'zh', 'bg', 'fi', 'ko', 'gl', 'no', 'lt', 'sk', 'eu', 'tr', 'el', 'da', 'bs', 'lv', 'hr', 'zh-Hant', 'et', 'sl', 'gn', 'sr', 'ja']
Reviews removed: 1610 (2.11%)

Total reviews after language filter: 74759


In [None]:
# NLTK setup (fixing missing resource error)
# NOTE(review): `nltk` and `stopwords` are already imported in the first cell;
# the repeated imports here are harmless but redundant.
import nltk

print("Downloading NLTK resources...")
# Fetch the "stopwords" corpus that stopwords.words(...) reads in the next cell.
nltk.download("stopwords", quiet=False)

from nltk.corpus import stopwords
from stop_words import get_stop_words  # Catalan stopwords

Downloading NLTK resources...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# ==============================================================================
# Stopword management
# ==============================================================================
print("Configuring stopword lists...")

try:
    # NLTK-supported languages
    sw_es = set(stopwords.words("spanish"))
    sw_fr = set(stopwords.words("french"))
    sw_ru = set(stopwords.words("russian"))
    sw_pt = set(stopwords.words("portuguese"))
    sw_de = set(stopwords.words("german"))
    sw_en = set(stopwords.words("english"))
except LookupError as e:
    # The NLTK corpus is missing. Explain how to fix it, then re-raise: continuing
    # would leave `stop_words_agg` undefined and the next cell would fail with a
    # far less helpful NameError.
    print(f"NLTK error: {e}")
    print("Try running nltk.download('stopwords') in a separate cell.")
    raise

# Catalan (not provided by NLTK)
try:
    sw_ca = set(get_stop_words("catalan"))
except Exception as e:
    # Best-effort fallback, but no longer silent and no longer a bare `except:`
    # (which would also swallow KeyboardInterrupt / SystemExit).
    print(f"Catalan stopword list unavailable ({e!r}); using built-in fallback list.")
    sw_ca = {
        "el", "la", "els", "les", "i", "o", "que", "de", "a", "amb",
        "un", "una", "uns", "unes", "és", "son", "per", "perquè",
        "ja", "hi", "ho", "molt", "molta", "molts", "moltes"
    }

# Merge all stopword sets
stop_words_agg = sw_es | sw_fr | sw_ru | sw_pt | sw_de | sw_ca | sw_en

print(f"Stopwords loaded successfully. Total words ignored: {len(stop_words_agg)}")

Configuring stopword lists...
Stopwords loaded successfully. Total words ignored: 1276


In [None]:
# ==============================================================================
# Domain-specific stopwords
# ==============================================================================

# Static domain stopwords: theme-park vocabulary that appears in almost every review
# and would otherwise dominate word frequencies. Entries repeated across languages
# are harmless because this is a set.
domain_stopwords = {
    # Spanish ("dia" covers unaccented typing and Catalan; "día" is the accented form)
    "parque", "atracciones", "atracción", "espectáculos", "visita", "familiar",
    "experiencia", "temático", "entrada", "sitio", "ir", "dia", "día", "vez",
    # French
    "parc", "attractions", "spectacles", "visite", "expérience", "familial", "thème",
    # Catalan
    "parc", "atraccions", "espectacles", "visita", "experiència", "temàtic",
    # German
    "park", "attraktionen", "shows", "erlebnis", "familie", "besuch",
    # Portuguese
    "parque", "atrações", "experiência", "familiar",
    # Russian: the correct spelling has a double "т" ("аттракцион"); the single-"т"
    # variants are kept as well because they match a common misspelling.
    "парк", "аттракционы", "аттракцион", "шоу", "опыт",
    "атракционы", "атракцион",
}

# Proper names related to the parks (lowercase)
park_proper_names = {
    "portaventura", "port", "aventura", "warner", "madrid", "por"
}

# Combine all domain-related words (lower-cased so they match lower-cased review text)
all_domain_words = {w.lower() for w in domain_stopwords}
all_domain_words.update(park_proper_names)

# Final stopword set: universal + domain-specific
final_combined_stopwords = stop_words_agg.union(all_domain_words)

print(f"Total stopwords: {len(final_combined_stopwords)}")

Total stopwords: 1318


In [None]:
def clean_for_bert(text):
    """Lightly normalise a review for Transformer models.

    Case, accents and punctuation are preserved. URLs and @mentions are dropped,
    the punctuation marks ``. , ! ? ; ( ) [ ]`` are padded with spaces, and runs of
    whitespace are collapsed to a single space. Non-string input yields ``""``.
    """
    if not isinstance(text, str):
        return ""

    # Ordered (pattern, replacement) pipeline; each step feeds the next one.
    substitutions = (
        (r"http\S+|www\S+|https\S+", ""),   # URLs
        (r"@\w+", ""),                      # mentions
        (r"([.,!?;()\[\]])", r" \1 "),      # space out punctuation
        (r"\s+", " "),                      # normalize whitespace
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


def clean_for_stats(text, stopword_set=None):
    """Aggressive cleaning for EDA (word frequencies, word clouds).

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. NaN) yields ``""``.
    stopword_set : collection of str, optional
        Lower-case tokens to discard. Defaults to the notebook-level
        ``final_combined_stopwords`` when omitted, so existing calls such as
        ``Series.apply(clean_for_stats)`` behave as before.

    Returns
    -------
    str
        Lower-cased text without URLs, @mentions, punctuation (underscores
        included), digits or stopwords; tokens are joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    # Resolve the default lazily so the global is only needed when actually used
    if stopword_set is None:
        stopword_set = final_combined_stopwords

    text = text.lower()

    # Remove URLs and mentions
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"@\w+", "", text)

    # Replace punctuation with spaces. "_" is listed explicitly because \w matches
    # it, so [^\w\s] alone would leave tokens such as "muy_bueno" glued together.
    text = re.sub(r"[^\w\s]|_", " ", text)

    # Remove numbers
    text = re.sub(r"\d+", "", text)

    # Tokenize on whitespace and drop stopwords
    return " ".join(w for w in text.split() if w not in stopword_set)


print("Generating NLP columns (text_bert and text_stats)...")

# Each entry: (new column, source column, cleaning function)
#   text_bert  -> light cleaning for Transformer models
#   text_stats -> aggressive cleaning for statistical visualizations
derived_columns = [
    ("text_bert", "review_text", clean_for_bert),
    ("text_stats", "review_text", clean_for_stats),
]

# Light cleaning for English translations, only when that column is present
if "review_en" in df_filtered.columns:
    derived_columns.append(("text_en_clean", "review_en", clean_for_bert))

for target_col, source_col, cleaner in derived_columns:
    df_filtered[target_col] = df_filtered[source_col].apply(cleaner)

print("Processing completed.")

Generating NLP columns (text_bert and text_stats)...
Processing completed.


In [None]:
print("\n--- Quality Check ---")

# Show up to 5 random examples to inspect cleaning outputs.
# - random_state pins the sample so the displayed rows survive Restart & Run All.
# - min(...) avoids the ValueError that sample(5) raises on frames with < 5 rows.
pd.set_option("display.max_colwidth", 150)
display(
    df_filtered[["review_text", "text_bert", "text_stats"]].sample(
        n=min(5, len(df_filtered)), random_state=42
    )
)


--- Quality Check ---


Unnamed: 0,review_text,text_bert,text_stats
16867,"He estado ya 6 veces, super buen mantenimiento, y atracciones entretenidas para todos los gustos y edades. Mis favoritas las más fuertes. Además s...","He estado ya 6 veces , super buen mantenimiento , y atracciones entretenidas para todos los gustos y edades . Mis favoritas las más fuertes . Adem...",veces super buen mantenimiento entretenidas gustos edades favoritas fuertes además filas opción coger fast past ticket pasas hacer apenas minutos ...
46609,ESTÁ BONITO PERO ES MUY CARO TODO PRECIOS MUY ELEVADOS Y LO PEOR ES LAS LARGAS COLAS QUÉ HAY PARA ENTRAR EN LAS ATRACCIONES ES INSOPORTABLE HASTA ...,ESTÁ BONITO PERO ES MUY CARO TODO PRECIOS MUY ELEVADOS Y LO PEOR ES LAS LARGAS COLAS QUÉ HAY PARA ENTRAR EN LAS ATRACCIONES ES INSOPORTABLE HASTA ...,bonito caro precios elevados peor largas colas entrar insoportable horas entrar tema comida pobre pocos sitios comer bien creo vaya
56028,"Me parece fatal que esté permitido fumar en el parque. Todos los adultos fumando como energumenos dando mal ejemplo. Y además a mi, personalmente,...","Me parece fatal que esté permitido fumar en el parque . Todos los adultos fumando como energumenos dando mal ejemplo . Y además a mi , personalmen...",parece fatal permitido fumar adultos fumando energumenos dando mal ejemplo además personalmente molestaba humo cara fatal fatal
1705,"UN ROBO. Colas interminables , Mas de 3H para montar en una atracción cada 30min sonaba por megafonía ( La atracción está cerrada por mantenimient...","UN ROBO . Colas interminables , Mas de 3H para montar en una atracción cada 30min sonaba por megafonía ( La atracción está cerrada por mantenimien...",robo colas interminables h montar min sonaba megafonía cerrada mantenimiento paraban min volvía funcionar cabo min volver pararla así todas
46932,"Vergonzoso, estamos hoy en el Parque Warner, y hay 4 atracciones cerradas de las más importante: río bravo, correcaminos, coaster express, superma...","Vergonzoso , estamos hoy en el Parque Warner , y hay 4 atracciones cerradas de las más importante: río bravo , correcaminos , coaster express , su...",vergonzoso hoy cerradas importante río bravo correcaminos coaster express superman además subir generado colas muchas largas pocas abiertas supues...


In [None]:
# ==============================================================================
# Cleaning function validation
# ==============================================================================

def test_cleaning_functions(case_name, text):
    """Show a sample text next to both of its cleaned variants.

    Prints a header with ``case_name``, the original ``text``, then the result of
    ``clean_for_bert`` and of ``clean_for_stats`` — intended for eyeballing only,
    nothing is returned or asserted.
    """
    print(f"\n--- Case: {case_name} ---")
    print(f"Original:   {text}")

    # Labels are padded so the three values line up in the printed output.
    labelled_cleaners = (
        ("For BERT:   ", clean_for_bert),
        ("For Stats:  ", clean_for_stats),
    )
    for label, cleaner in labelled_cleaners:
        print(f"{label}{cleaner(text)}")


# ------------------------------------------------------------------
# A. Stress test (technical noise): URL, mention, digits and punctuation
# ------------------------------------------------------------------
noisy_text = "¡Hola! Visita https://web.com. @usuario El parque es 100% genial, pero caro..."

# ------------------------------------------------------------------
# B. Multilingual samples (one sentence per language)
# ------------------------------------------------------------------
text_es = "El parque es muy divertido y fui con mi familia. ¡Genial!"
text_ca = "El parc és molt gran i vaig anar amb els meus amics."
text_fr = "C'est un très beau parc et j'ai adoré l'expérience."
text_de = "Der Park ist sehr groß und wir haben es genossen!"
text_pt = "O parque é muito legal e fui com a família."
text_ru = "Парк очень красивый и большой. Супер!"
text_en = "The park is amazing and very big. I loved it!"

# Run every case in the same order as listed: (case label, sample text)
cleaning_cases = (
    ("Technical Noise", noisy_text),
    ("Spanish", text_es),
    ("Catalan", text_ca),
    ("French", text_fr),
    ("German", text_de),
    ("Portuguese", text_pt),
    ("Russian", text_ru),
    ("English", text_en),
)
for case_label, sample_text in cleaning_cases:
    test_cleaning_functions(case_label, sample_text)


--- Case: Technical Noise ---
Original:   ¡Hola! Visita https://web.com. @usuario El parque es 100% genial, pero caro...
For BERT:   ¡Hola ! Visita El parque es 100% genial , pero caro . . .
For Stats:  hola genial caro

--- Case: Spanish ---
Original:   El parque es muy divertido y fui con mi familia. ¡Genial!
For BERT:   El parque es muy divertido y fui con mi familia . ¡Genial !
For Stats:  divertido familia genial

--- Case: Catalan ---
Original:   El parc és molt gran i vaig anar amb els meus amics.
For BERT:   El parc és molt gran i vaig anar amb els meus amics .
For Stats:  gran amics

--- Case: French ---
Original:   C'est un très beau parc et j'ai adoré l'expérience.
For BERT:   C'est un très beau parc et j'ai adoré l'expérience .
For Stats:  très beau adoré

--- Case: German ---
Original:   Der Park ist sehr groß und wir haben es genossen!
For BERT:   Der Park ist sehr groß und wir haben es genossen !
For Stats:  groß genossen

--- Case: Portuguese ---
Original:   O parque é