# Part 2: Data Cleaning

This notebook contains the code for cleaning and preprocessing the NexusMods data.

## Overview
1. **Description Translation** - Detect language and translate non-English mod descriptions
2. **Patreon URL Extraction** - Extract Patreon URLs from descriptions
3. **Category Normalization** - Use fuzzy matching to normalize category names

## Output
- `TranslatedModData` table with cleaned, translated descriptions (produced here, but ultimately not used downstream)
- `GameCategories` table with normalized category groups

## 1. Setup and Configuration

In [None]:
import pandas as pd
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import text
from sqlalchemy import create_engine
import re
from tqdm import tqdm
from fuzzywuzzy import process, fuzz
from langdetect import detect
from deep_translator import GoogleTranslator
from bs4 import BeautifulSoup
import os
import joblib

In [None]:
# SQLAlchemy connection setup: one module-level engine used by every
# read/write helper in this notebook.
# NOTE(review): the URL embeds credentials (placeholders here) — presumably
# real values are substituted before running; consider reading them from the
# environment rather than committing them.
# NOTE(review): "Connect Timeout=60" contains a literal space inside the URL
# query string — confirm the driver receives it as intended.
engine = create_engine(
    "mssql+pyodbc://username:password@server.database.windows.net/NexusModsDB?driver=ODBC+Driver+17+for+SQL+Server&Connect Timeout=60"
)

In [None]:
# Processing configuration
CHUNK_SIZE: int = 500  # descriptions processed between saves in process_translations
BATCH_SIZE: int = 500  # rows sent per committed batch in upload_translations_to_sql
SAVE_FILE: str = "processed_data.parquet"  # parquet file holding the processed translation rows
CHECKPOINT_FILE: str = "checkpoint.pkl"  # joblib checkpoint written after each processed chunk

---
## Part 1: Description Translation & Patreon URL Extraction

In [None]:
# Matches http(s) Patreon links (case-insensitive, optional "www.").
# The path stops at whitespace and at BBCode/HTML delimiters ([, ], <, >, quotes)
# so surrounding markup such as "[/url]" or '">label</a>' is not captured as
# part of the URL.
PATREON_REGEX = re.compile(r"https?://(?:www\.)?patreon\.com/[^\s\[\]<>\"']+", re.IGNORECASE)

In [None]:
def clean_detect_translate(text):
    '''
    Clean a mod description, detect its language, translate it to English
    when needed, and extract the first Patreon URL it contains.

    Args:
        text: Raw description (may contain BBCode, HTML and URLs).

    Returns:
        tuple: (detected_language, translated_text, patreon_url)
            - On success: the detected language code, the cleaned (and, for
              non-English input, translated) text, and the Patreon URL or None.
            - On failure (empty input, nothing left after cleaning, language
              detection or translation error): (None, <original text>, patreon_url).
              The Patreon URL is still returned because its extraction does
              not depend on detection/translation succeeding.
    '''
    # Nothing to analyse: keep the failure contract without touching any service.
    if not isinstance(text, str) or not text.strip():
        return None, text, None

    # Extract the Patreon URL up front so a later detection/translation error
    # cannot discard it. Trailing sentence punctuation is not part of the link.
    patreon_match = PATREON_REGEX.search(text)
    patreon_url = patreon_match.group(0).rstrip(".,;:!?)") if patreon_match else None

    try:
        # Remove BBCode tags and URLs
        text_no_bbcode = re.sub(r"\[.*?\]", "", text)
        text_no_bbcode = re.sub(r"https?://\S+", "", text_no_bbcode)

        # Parse HTML and collapse whitespace
        clean_text = BeautifulSoup(text_no_bbcode, "lxml").get_text(separator=" ")
        clean_text = re.sub(r"\s+", " ", clean_text).strip()

        # Markup-only descriptions leave nothing to detect.
        if not clean_text:
            return None, text, patreon_url

        lang = detect(clean_text)

        # NOTE(review): langdetect reports some languages with lower-case
        # region codes (e.g. 'zh-cn') — confirm the translator accepts them.
        if lang != "en":
            translated_text = GoogleTranslator(source=lang, target="en").translate(clean_text)
        else:
            translated_text = clean_text

        return lang, translated_text, patreon_url

    except Exception:
        # Deliberate best-effort: one bad description (undetectable language,
        # translator/network error) must not abort a whole processing run.
        return None, text, patreon_url

In [None]:
def load_mods_for_translation():
    '''
    Load mod descriptions that haven't been translated yet.

    Selects rows of dbo.CleanedModData that have a description and no
    detected language recorded in dbo.TranslatedModData (matched on
    game_id and mod_id).

    Returns:
        pandas.DataFrame with columns game_id, domain_name, mod_id,
        description and detected_language, ordered by (game_id, mod_id).
    '''
    # Without ORDER BY the database may return rows in any order; a fixed
    # order keeps row positions stable between runs of the same pending set.
    query = """
    SELECT
        c.game_id,
        c.domain_name,
        c.mod_id,
        c.[description],
        t.[detected_language]
    FROM [dbo].[CleanedModData] AS c
    LEFT JOIN [dbo].[TranslatedModData] AS t
        ON t.game_id = c.game_id AND t.mod_id = c.mod_id
    WHERE c.description IS NOT NULL
        AND t.[detected_language] IS NULL
    ORDER BY c.game_id, c.mod_id
    """
    return pd.read_sql(query, engine)

In [None]:
def process_translations():
    '''
    Process all pending mod descriptions: detect language, translate, and
    extract Patreon URLs, saving progress to SAVE_FILE after every chunk.

    Resuming is keyed on (game_id, mod_id): any mod already present in
    SAVE_FILE is skipped. Row positions are not used to decide what to skip,
    because the set and order of pending rows can change between runs (for
    example after an upload), which would make positional offsets point at
    the wrong rows.

    Returns:
        pandas.DataFrame: everything processed so far (previous runs plus
        this one) with columns game_id, mod_id, detected_language,
        translated_description and patreon_url.
    '''
    key_cols = ['game_id', 'mod_id']
    result_cols = ['detected_language', 'translated_description', 'patreon_url']

    df = load_mods_for_translation()
    print(f"Loaded {len(df)} mods for translation.")

    # Load existing processed data if available
    if os.path.exists(SAVE_FILE):
        df_processed = pd.read_parquet(SAVE_FILE)
    else:
        df_processed = pd.DataFrame(columns=key_cols + result_cols)

    # Drop mods that were already processed in an earlier run.
    if not df_processed.empty and set(key_cols).issubset(df_processed.columns):
        done_keys = set(zip(df_processed['game_id'], df_processed['mod_id']))
        is_pending = [key not in done_keys for key in zip(df['game_id'], df['mod_id'])]
        pending = df[is_pending]
    else:
        pending = df
    pending = pending.reset_index(drop=True)

    # Process in chunks
    for start in tqdm(range(0, len(pending), CHUNK_SIZE), desc="Processing chunks"):
        chunk = pending.iloc[start:start + CHUNK_SIZE]

        # Build the result columns as a separate frame instead of assigning
        # into a slice of `pending` (avoids SettingWithCopy issues).
        results = pd.DataFrame(
            [clean_detect_translate(description) for description in chunk['description']],
            columns=result_cols,
            index=chunk.index,
        )
        final_chunk = pd.concat([chunk[key_cols], results], axis=1)

        # Append to processed data and persist immediately so an interrupted
        # run loses at most one chunk of work.
        if df_processed.empty:
            df_processed = final_chunk.reset_index(drop=True)
        else:
            df_processed = pd.concat([df_processed, final_chunk], ignore_index=True)
        df_processed.to_parquet(SAVE_FILE, index=True, engine="pyarrow", allow_truncated_timestamps=True)

        # Progress marker only: resume decisions come from SAVE_FILE keys above.
        joblib.dump(start + CHUNK_SIZE, CHECKPOINT_FILE)

    print(f"Processing complete. Data saved to: {SAVE_FILE}")
    return df_processed

In [None]:
def upload_translations_to_sql():
    '''
    Upload processed translations from SAVE_FILE to the TranslatedModData table.

    Rows without a translated description are skipped; missing language and
    Patreon values are stored as empty strings. Each batch of BATCH_SIZE rows
    is merged (insert, or update when the existing translated_description is
    NULL) and committed as one transaction. A failing batch is rolled back,
    reported, and the upload continues with the next batch.
    '''
    df_processed = pd.read_parquet(SAVE_FILE)
    columns = ["game_id", "mod_id", "detected_language", "translated_description", "patreon_url"]
    # .copy() so the fillna assignments below act on an independent frame.
    df = df_processed[columns].dropna(subset=["translated_description"]).copy()
    df["detected_language"] = df["detected_language"].fillna("")
    df["patreon_url"] = df["patreon_url"].fillna("")

    # Built once; reused for every batch.
    merge_stmt = text("""
        MERGE INTO TranslatedModData AS target
        USING (SELECT :game_id AS game_id, :mod_id AS mod_id, :lang AS detected_language, 
                      :desc AS translated_description, :patreon AS patreon_url) AS source
        ON target.game_id = source.game_id AND target.mod_id = source.mod_id
        WHEN MATCHED AND target.translated_description IS NULL THEN 
            UPDATE SET detected_language = source.detected_language, 
                       translated_description = source.translated_description, 
                       patreon_url = source.patreon_url
        WHEN NOT MATCHED THEN 
            INSERT (game_id, mod_id, detected_language, translated_description, patreon_url)
            VALUES (source.game_id, source.mod_id, source.detected_language, 
                    source.translated_description, source.patreon_url);
    """)

    failed_batches = 0
    with engine.connect() as conn:
        for start in tqdm(range(0, len(df), BATCH_SIZE), desc="Inserting batches"):
            batch_df = df.iloc[start:start + BATCH_SIZE]
            batch_params = [
                {
                    "game_id": row["game_id"],
                    "mod_id": row["mod_id"],
                    "lang": row["detected_language"],
                    "desc": row["translated_description"],
                    "patreon": row["patreon_url"],
                }
                for row in batch_df.to_dict(orient="records")
            ]

            try:
                # A list of parameter dicts runs the statement once per row.
                conn.execute(merge_stmt, batch_params)
                conn.commit()
            except Exception as e:
                # Discard the partial batch; otherwise its already-executed
                # rows would be committed together with the next batch.
                conn.rollback()
                failed_batches += 1
                print(f"Error inserting batch: {e}")

    if failed_batches:
        print(f"TranslatedModData upload finished with {failed_batches} failed batch(es).")
    else:
        print("TranslatedModData table populated successfully!")

---
## Part 2: Category Normalization

In [None]:
def load_categories():
    '''
    Read the category rows of dbo.GameCategories into a DataFrame.

    Returns:
        pandas.DataFrame with the game, category, aggregate-count and
        group columns listed in the SELECT below.
    '''
    categories_sql = """
    SELECT [game_id], [game_name], [domain_name], [category_id], [category_name],
           [total_mods], [total_endorsements], [total_unique_downloads],
           [group_category], [group_id]
    FROM [dbo].[GameCategories]
    """
    categories_df = pd.read_sql(categories_sql, engine)
    return categories_df

In [None]:
# Word-level substitutions, applied in insertion order (multi-word keys must
# come before the single words they contain, e.g. "textures and meshes"
# before "textures").
_CATEGORY_REPLACEMENTS = {
    "armor": "armour",
    "user interface": "ui",
    "abilities": "ability",
    "animations": "animation",
    "weapons": "weapon",
    "items": "item",
    "skins": "skin",
    "sounds": "sound",
    "maps": "map",
    "levels": "level",
    "vehicles": "vehicle",
    "textures and meshes": "textures",
    "texture pack": "textures",
    "textures": "texture",
    "retextures": "texture",
    "hud": "ui",
    "audio": "sound",
    "music": "sound",
    "sfx": "sound",
    "voice": "sound",
    "visuals": "visual",
    "graphics": "graphic",
    "miscellaneous": "misc",
    "miscallenous": "misc",
    "miscellanneous": "misc",
}

# Group name -> spellings that collapse into that group.
_CATEGORY_GROUPS = {
    "vehicle": ["vehicle", "vehicle - aeroplanes", "vehicle - aircraft", "vehicle - boat",
                "vehicle - buses", "vehicle - land", "vehicle - ship", "vehicle - train", "vehicle - truck"],
    "sound": ["sound", "sound - misc", "sound - music", "sound - sfx", "sound - voice", "sound pack"],
    "misc": ["misc", "misc item", "misc tool", "miscallenous"],
    "map": ["map", "map - adventure", "map - campaign", "map - multiplayer", "map - singleplayer"],
    "gameplay": ["gameplay", "gameplay changes", "gameplay effect", "gameplay mechanic", "gameplay tweak"],
    "cosmetic": ["clothes", "clothing", "hair", "jewelry", "jewellery", "apparel", "outfits", "fashion"],
    "item": ["item", "item - food/drinks/chems/etc", "item - misc", "item pack"],
    "magic": ["mage", "magic", "magic - alchemy", "magic - gameplay", "magic - spell & enchantment"],
}

# Compiled once at import time instead of on every call.
_REPLACEMENT_PATTERNS = [
    (re.compile(rf'\b{re.escape(key)}\b'), value)
    for key, value in _CATEGORY_REPLACEMENTS.items()
]
# A variation must start at a word boundary, so "mage" no longer matches
# inside "images"/"damage", while "hairstyles" still matches "hair".
_GROUP_PATTERNS = [
    (group, re.compile(r'\b(?:' + '|'.join(re.escape(v) for v in variations) + r')'))
    for group, variations in _CATEGORY_GROUPS.items()
]
_STOP_WORD_PATTERN = re.compile(r'\b(?:and|mods?)\b')
_TOOL_PATTERN = re.compile(r'\b(?:modding tools?|modding resources?|tools?)\b')
_PLURAL_EXCEPTIONS = ('ss', 'us', 'is', 'ous', 'ies', 'es')


def preprocess_category(name):
    '''
    Preprocess a category name for normalization.

    Steps, in order:
    - Lowercase and strip; missing/blank/"---" input yields "---"
    - Treat "&" as "and" and collapse whitespace
    - Apply standard replacements (spelling, plural and synonym fixes)
    - Remove common words (and, mod, mods)
    - Collapse names that start a word with a known variation into their group
    - Normalize tool categories to "modding tool"
    - Strip a simple trailing plural 's' from each remaining word

    Args:
        name: Raw category name (non-string values are treated as missing).

    Returns:
        str: The normalized category name, or "---" for missing input.
    '''
    if not isinstance(name, str):
        return "---"

    name = name.lower().strip()
    if not name or name == "---":
        return "---"

    # "&" is surrounded by non-word characters, so a word-boundary regex can
    # never match it; rewrite it to "and" and normalize spacing so multi-word
    # replacements such as "textures and meshes" can match.
    name = " ".join(name.replace("&", " and ").split())

    # Apply replacements before removing "and" (some keys contain it).
    for pattern, value in _REPLACEMENT_PATTERNS:
        name = pattern.sub(value, name)

    # Remove common filler words
    name = " ".join(_STOP_WORD_PATTERN.sub(" ", name).split())

    # Apply category groups (first matching group wins)
    for group, pattern in _GROUP_PATTERNS:
        if pattern.search(name):
            name = group
            break

    # Normalize modding tools (singular and plural forms alike)
    if _TOOL_PATTERN.search(name):
        name = "modding tool"

    # Remove trailing 's' from words (simple pluralization)
    processed_words = [
        word[:-1] if word.endswith('s') and not word.endswith(_PLURAL_EXCEPTIONS) else word
        for word in name.split()
    ]

    return ' '.join(processed_words)

In [None]:
def _canonicalize_category_names(names):
    '''
    Map each cleaned category name to a canonical spelling.

    Names are visited in the given order. A name whose fuzz.ratio score
    against an already-accepted canonical name is above 80 is mapped to that
    canonical name; otherwise it becomes a new canonical name itself. Because
    every name maps to an accepted canonical name, two similar names always
    end up with the same target instead of pointing at each other.
    '''
    canonical_names = []
    name_map = {}
    for name in tqdm(names, desc="Fuzzy matching categories"):
        match = None
        # "---" is the missing-value placeholder and is never fuzzy-matched.
        if canonical_names and name != "---":
            # Scores are integers, so a cutoff of 81 means "greater than 80".
            match = process.extractOne(name, canonical_names, scorer=fuzz.ratio, score_cutoff=81)
        if match is None:
            name_map[name] = name
            if name != "---":
                canonical_names.append(name)
        else:
            name_map[name] = match[0]
    return name_map


def normalize_categories():
    '''
    Main function to normalize all categories using fuzzy matching.

    Loads the category rows, cleans each group_category with
    preprocess_category, merges near-identical cleaned names (most frequent
    spelling wins), and assigns a numeric id to every resulting group.

    Returns:
        tuple: (df, mapping_df)
            - df: the loaded rows plus clean_category, new_group_category
              and new_group_id columns.
            - mapping_df: one row per group_id with columns group_id,
              new_group_category and new_group_id.
    '''
    df = load_categories()
    print(f"Loaded {len(df)} categories.")

    # Preprocess category names; NULLs are passed as None so they become the
    # "---" placeholder rather than the literal strings "None"/"nan".
    df['clean_category'] = df['group_category'].apply(
        lambda value: preprocess_category(value if isinstance(value, str) else None)
    )

    # Unique cleaned names, most frequent first (ties broken alphabetically)
    # so the most common spelling becomes the canonical one.
    name_counts = df['clean_category'].value_counts()
    ordered_names = sorted(name_counts.index, key=lambda n: (-name_counts[n], n))
    name_map = _canonicalize_category_names(ordered_names)

    # Create mapping dataframe: one row per group_id (last occurrence wins).
    df['group_id'] = df['group_id'].astype(int)
    group_names = df[['group_id', 'clean_category']].drop_duplicates(subset='group_id', keep='last')
    mapping_df = pd.DataFrame({
        'group_id': group_names['group_id'].to_numpy(),
        'new_group_category': group_names['clean_category'].map(name_map).to_numpy(),
    })
    mapping_df['new_group_id'] = mapping_df.groupby('new_group_category').ngroup()

    # Merge back
    df = df.merge(mapping_df[['group_id', 'new_group_category', 'new_group_id']], on='group_id', how='left')

    print(f"Normalized {len(df)} categories into {mapping_df['new_group_id'].nunique()} groups.")

    return df, mapping_df

In [None]:
def update_categories_in_sql(df):
    '''
    Update normalized categories in SQL database.

    For every distinct group_id in `df`, sets new_group_category and
    new_group_id on the matching rows of dbo.GameCategories. Values are sent
    as bound parameters, so category names containing quotes or colons are
    stored verbatim instead of breaking (or altering) the statement.

    Args:
        df: DataFrame with group_id, new_group_category and new_group_id
            columns (as returned by normalize_categories).
    '''
    df_dedup = df.sort_values(by='group_id').drop_duplicates(subset=['group_id'], keep='last')
    # Rows without ids cannot be matched or written.
    df_dedup = df_dedup.dropna(subset=['group_id', 'new_group_id'])

    update_params = [
        {
            "group_id": int(row.group_id),
            "new_group_category": str(row.new_group_category),
            "new_group_id": int(row.new_group_id),
        }
        for row in df_dedup.itertuples(index=False)
    ]

    if not update_params:
        print("No category updates to apply.")
        return

    update_stmt = text("""
    UPDATE [dbo].[GameCategories]
    SET new_group_category = :new_group_category,
        new_group_id = :new_group_id
    WHERE group_id = :group_id
    """)

    # All updates run in one transaction and are committed together.
    with engine.connect() as conn:
        conn.execute(update_stmt, update_params)
        conn.commit()

    print("Categories updated successfully!")