## Process Description 
 Consolidates and categorises game mods categories by using SQL. Data is collected from Nexus mods on mod categories within game domains. The data is then processed using fuzzy string matching to identify and group similar category names. This is then merged back into the dataset.

In [1]:
import pandas as pd
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import text
from sqlalchemy import create_engine, update
import matplotlib.pyplot as plt
import seaborn as sns
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime, timedelta
from fuzzywuzzy import process, fuzz
from langdetect import detect
from deep_translator import GoogleTranslator
from bs4 import BeautifulSoup
import os
import joblib

In [2]:
# SQLAlchemy connection setup
engine = create_engine(
    "mssql+pyodbc://admin4327:Tr3m3r3Pr1nc3!@nmntserver.database.windows.net/NexusModsDB?driver=ODBC+Driver+17+for+SQL+Server&Connect Timeout=60"
)

## Cleaning up Categories/Grouping Categories together

### Actual Clean

In [3]:
grouped_categories = """
SELECT [game_id]
      ,[game_name]
      ,[domain_name]
      ,[category_id]
      ,[category_name]
      ,[total_mods]
      ,[total_endorsements]
      ,[total_unique_downloads]
      ,[group_category]
      ,[group_id]
  FROM [dbo].[GameCategories] 


"""

df = pd.read_sql(grouped_categories, engine)
print("Checkpoint 1: Original Data Loaded")
df.head(30)

Checkpoint 1: Original Data Loaded


Unnamed: 0,game_id,game_name,domain_name,category_id,category_name,total_mods,total_endorsements,total_unique_downloads,group_category,group_id
0,180,Neverwinter Nights,neverwinter,26,Modules,44,1877.0,20120.0,Modules,665
1,180,Neverwinter Nights,neverwinter,31,Scripts,17,158.0,2882.0,Scripts,893
2,657,Wasteland 2,wasteland2,2,Miscellaneous,6,1176.0,16620.0,Miscellaneous,622
3,687,Legend of Grimrock 2,legendofgrimrock2,6,Tileset,8,256.0,2236.0,Tileset,1049
4,952,The Witcher 3,witcher3,35,Audio,89,19931.0,326579.0,Audio,88
5,1231,Hitman: Blood Money,hitmanbloodmoney,4,Gameplay,12,399.0,12302.0,Gameplay,403
6,1468,Star Trek: Elite Force II,startrekeliteforceII,20,Utilities,0,,,Utilities,1099
7,1515,Final Fantasy X/X-2 HD Remaster,finalfantasyxx2hdremaster,9,Visuals and Graphics,6,1655.0,30820.0,Visuals and Graphics,1127
8,2736,Enderal,enderal,121,Models and Textures,11,1025.0,38426.0,Models and Textures,649
9,3272,Mafia II: Definitive Edition,mafia2definitiveedition,4,Gameplay,12,314.0,9530.0,Gameplay,403


In [4]:
import re
import pandas as pd

def preprocess_category(name):
    if not name or name.strip() == "---":
        return "---"  # Preserve missing values as "---"

    name = name.lower().strip()  # Convert to lowercase and remove leading/trailing spaces
    name = re.sub(r'\b(and|&)\b', '', name)
    name = re.sub(r'\bmod(s)?\b', '', name)

    # Standard replacements
    replacements = {
        "armor": "armour",
        "user interface": "ui",
        "abilities": "ability",
        "animations": "animation",
        "weapons": "weapon",
        "items": "item",
        "skins": "skin",
        "sounds": "sound",
        "maps": "map",
        "levels": "level",
        "vehicles": "vehicle",
        "cars": "car",
        "creatures": "creature",
        "monsters": "monster",
        "npcs": "npc",
        "companions": "companion",
        "followers": "follower",
        "classes": "class",
        "buildings": "building",
        "structures": "structure",
        "expansions": "expansion",
        "campaigns": "campaign",
        "quests": "quest",
        "achievements": "achievement",
        "patches": "patch",
        "fixes": "fix",
        "tweaks": "tweak",
        "tools": "tool",
        "utilities": "utility",
        "resources": "resource",
        "textures and meshes": "textures",
        "texture pack": "textures",
        "textures": "texture",
        "retextures": "texture",
        "hud": "ui",
        "audio": "sound",
        "music": "sound",
        "sfx": "sound",
        "voice": "sound",
        "visuals": "visual",
        "graphics": "graphic",
        "game modes": "gameplay",
        "multiplayer": "multiplayer",
        "single player": "singleplayer",
        "co-op": "cooperative",
        "co-operative": "cooperative",
        "mmo": "multiplayer",
        "online": "multiplayer",
        "miscellaneous": "misc",
        "miscallenous": "misc",
        "miscellanneous": "misc"
    }

    # Grouped category normalization
    category_groups = {
        "vehicle": [
            "vehicle", "vehicle - aeroplanes", "vehicle - aircraft", "vehicle - boat", 
            "vehicle - buses", "vehicle - busses", "vehicle - land", "vehicle - other",
            "vehicle - ship", "vehicle - train", "vehicle - truck", "vehicle ctf"
        ],
        "sound": [
            "sound", "sound - misc", "sound - music", "sound - sound", "sound - sound - music",
            "sound - sound - sfx", "sound - sound - sfx voices", 
            "sound - sound - sfx, sound - music, sound - voice", "sound - sound - voice",
            "sound - voice chat", "sound - voice comm script", "sound pack", 
            "sound sound", "sound sound - music", "sound sound - sfx"
        ],
        "planes": [
            "planes - britain", "planes - france", "planes - germany", "planes - japan",
            "planes - miscellaneous", "planes - soviet", "planes - sweden", "planes - usa"
        ],
        "misc": [
            "misc", "misc item", "misc tool", "miscallenous", "miscellanneous script",
            "miscellanneous tool"
        ],
        "map": [
            "map", "map - adventure", "map - bot", "map - campaign", "map - castle defense",
            "map - challenge", "map - hero arena", "map - hero defense", "map - instant action",
            "map - melee", "map - mini-game/sport", "map - misc", "map - modified",
            "map - multiplayer", "map - new", "map - other", "map - power struggle",
            "map - role playing (rpg)", "map - singleplayer", "map - survival",
            "map - tower defense (cooperative)", "map - tower defense (survivor)", 
            "map - tower war", "map change", "map compass", "map level", "map mission", "map pack"
        ],
        "gameplay": [
            "gameplay", "gameplay changes", "gameplay effect", "gameplay effect changes",
            "gameplay mechanic", "gameplay tweak"
        ],
        "cosmetic": [
            "clothes", "clothing", "hair", "jewelry", "jewellery", "apparel", "outfits", "fashion"
        ],
        "iso": [
            "iso - air vehicle", "iso - building & colony", "iso - energy weapon",
            "iso - gun", "iso - object", "iso - shield", "iso - sword", "iso - water vehicle"
        ],
        "item": [
            "item", "item - food/drinks/chems/etc", "item - misc", 
            "item (food, drinks, chems, etc)", "item object - player", 
            "item object - world", "item pack"
        ],
        "magic": [
            "mage", "magic", "magic - alchemy", "magic - gameplay", "magic - spell & enchantment", 
            "magic - spell &amp; enchantment"
        ]
    }

    for key, value in replacements.items():
        name = re.sub(rf'\b{re.escape(key)}\b', value, name)

    for category, variations in category_groups.items():
        if any(variation in name for variation in variations):
            name = category
            break 

    if re.search(r'\b(tool|modding tool|modding resources)\b', name):
        name = "modding tool"

    words = name.split()
    processed_words = []
    for word in words:
        if word.endswith('s') and not word.endswith(('ss', 'us', 'is', 'ous', 'ies', 'es')):
            word = word[:-1]  
        processed_words.append(word)

    return ' '.join(processed_words).strip()


# Preprocess category names
df['clean_category'] = df['group_category'].astype(str).apply(preprocess_category)

In [5]:
categories = df[['group_id', 'clean_category']].astype(str).values.tolist()

category_mapping = {}


for cat_id, cat_name in categories:
    matches = process.extract(cat_name, [c[1] for c in categories], limit=5, scorer=fuzz.ratio)  
    best_match = next((m[0] for m in matches if m[1] > 80 and m[0] != cat_name), cat_name)
    category_mapping[cat_id] = best_match


mapping_df = pd.DataFrame(category_mapping.items(), columns=['group_id','new_group_category'])
mapping_df['new_group_id'] = mapping_df.groupby('new_group_category').ngroup()

print("\nCheckpoint 2: Fuzzy Matching Applied")
mapping_df.head(100)




Checkpoint 2: Fuzzy Matching Applied


Unnamed: 0,group_id,new_group_category,new_group_id
0,665,modes,491
1,893,script,689
2,622,misc,472
3,1049,tileset,823
4,88,sound,740
...,...,...,...
95,565,mandown,449
96,189,character skin,147
97,95,sound,740
98,448,heads-up display,363


In [7]:
print("\nColumns in df before merging:", df.columns)
print("\nColumns in mapping_df before merging:", mapping_df.columns)

df['group_id'] = df['group_id'].astype(int)
mapping_df['group_id'] = mapping_df['group_id'].astype(int)

if 'new_group_category' not in mapping_df.columns or 'new_group_id' not in mapping_df.columns:
    raise KeyError(" 'new_group_category' or 'new_group_id' missing from mapping_df before merging!")

df = df.merge(mapping_df[['group_id', 'new_group_category', 'new_group_id']], 
              on='group_id', how='left')

print("\nColumns in df after merging:", df.columns)

if 'new_group_category' not in df.columns or 'new_group_id' not in df.columns:
    raise KeyError(" Merge failed: 'new_group_category' or 'new_group_id' not in df!")
print("\nCheckpoint 4: Merged Data with Grouped Categories")
df[['group_id', 'group_category', 'new_group_category', 'new_group_id']].head(100)



Columns in df before merging: Index(['game_id', 'game_name', 'domain_name', 'category_id', 'category_name',
       'total_mods', 'total_endorsements', 'total_unique_downloads',
       'group_category', 'group_id', 'clean_category'],
      dtype='object')

Columns in mapping_df before merging: Index(['group_id', 'new_group_category', 'new_group_id'], dtype='object')

Columns in df after merging: Index(['game_id', 'game_name', 'domain_name', 'category_id', 'category_name',
       'total_mods', 'total_endorsements', 'total_unique_downloads',
       'group_category', 'group_id', 'clean_category', 'new_group_category',
       'new_group_id'],
      dtype='object')

Checkpoint 4: Merged Data with Grouped Categories


Unnamed: 0,group_id,group_category,new_group_category,new_group_id
0,665,Modules,modes,491
1,893,Scripts,script,689
2,622,Miscellaneous,misc,472
3,1049,Tileset,tileset,823
4,88,Audio,sound,740
...,...,...,...,...
95,1099,Utilities,utility,863
96,881,Saved Games,saved games,679
97,622,Miscellaneous,misc,472
98,649,Models and Textures,model texture,489


In [8]:
df_dedup = df.sort_values(by='group_id').drop_duplicates(subset=['group_id'], keep='last')

print("\nSQL Merge Preview:")
merge_query = text(f"""
WITH DeduplicatedSource AS (
    SELECT DISTINCT group_id, new_group_category, new_group_id
    FROM (VALUES {', '.join(f"({row.group_id}, '{row.new_group_category}', {row.new_group_id})" 
                             for _, row in df_dedup.iterrows())}
    ) AS tmp (group_id, new_group_category, new_group_id)
)
MERGE INTO [dbo].[GameCategories] AS gc
USING DeduplicatedSource AS tmp
ON gc.group_id = tmp.group_id
WHEN MATCHED THEN
    UPDATE SET 
        gc.new_group_category = tmp.new_group_category,
        gc.new_group_id = tmp.new_group_id;
""")

with engine.connect() as conn:
    conn.execute(merge_query)  
    conn.commit()  



SQL Merge Preview:


### Creating Parent Categories Based on the mod categories

In [None]:
mod_data_query = """
SELECT a.mod_id, a.description, a.summary, a.domain_name,b.group_category,b.group_id
FROM [dbo].[CleanedModData] as a 
LEFT JOIN [dbo].[GameCategories] as b on a.domain_name = b.domain_name
WHERE description IS NOT NULL or a.summary IS NOT NULL 
"""
df_mods = pd.read_sql(mod_data_query, engine)

In [69]:
def match_category(description, category_list):
    best_match, score = process.extractOne(description, category_list, scorer=fuzz.token_sort_ratio)
    return best_match if score > 70 else "Uncategorized"
