In [44]:
import pandas as pd
from sqlalchemy.orm import sessionmaker
from sqlalchemy.sql import text
from sqlalchemy import create_engine, update
import matplotlib.pyplot as plt
import seaborn as sns
import re
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime, timedelta
from fuzzywuzzy import process, fuzz
from langdetect import detect
from deep_translator import GoogleTranslator
from bs4 import BeautifulSoup
import os
import joblib

In [56]:
# SQLAlchemy connection setup
engine = create_engine(
    "mssql+pyodbc://admin4327:Tr3m3r3Pr1nc3!@nmntserver.database.windows.net/NexusModsDB?driver=ODBC+Driver+17+for+SQL+Server&Connect Timeout=60"
)

## Translator for non-english mods, fix html and add patreon urls as own field

In [57]:
CHUNK_SIZE = 500  
BATCH_SIZE = 500 
SAVE_FILE = "processed_data_2.parquet"
CHECKPOINT_FILE = "checkpoint_2.pkl"

In [47]:
PATREON_REGEX = re.compile(r"https?://(?:www\.)?patreon\.com/[^\s\]]+", re.IGNORECASE)

In [48]:
query = """SELECT 
c.game_id,
c.domain_name, 
c.mod_id, 
c.[description], 
t.[detected_language]
FROM [dbo].[CleanedModData] as c 
Left join [dbo].[TranslatedModData] as t on t.game_id = c.game_id and t.mod_id = c.mod_id where description is not null AND t.[detected_language] IS NULL"""
df = pd.read_sql(query, engine)

In [49]:
def clean_detect_translate(text):
    try:
        # Extract Patreon URL
        patreon_match = PATREON_REGEX.findall(text)
        patreon_url = patreon_match[0].split("]")[0] if patreon_match else None  

        # Remove BBCode and filter out links
        text_no_bbcode = re.sub(r"\[.*?\]", "", text)
        text_no_bbcode = re.sub(r"https?://\S+", "", text_no_bbcode)  # Remove URLs
        clean_text = BeautifulSoup(text_no_bbcode, "lxml").get_text(separator=" ")  # Use lxml for better handling

        # Remove excessive whitespace
        clean_text = re.sub(r"\s+", " ", clean_text).strip()

        # Detect language
        lang = detect(clean_text)

        # Translate if not English
        translated_text = GoogleTranslator(source=lang, target="en").translate(clean_text) if lang != "en" else clean_text

        return lang, translated_text, patreon_url
    except Exception as e:
        return None, text, None  


In [52]:
# Load existing data if available
if os.path.exists(SAVE_FILE):
    df_processed = pd.read_parquet(SAVE_FILE)
    processed_ids = set(df_processed.index)
else:
    df_processed = pd.DataFrame(columns=['detected_language', 'translated_description', 'patreon_url'])
    processed_ids = set()

if os.path.exists(CHECKPOINT_FILE):
    last_index = joblib.load(CHECKPOINT_FILE)
else:
    last_index = 0

# Process in chunks
total_rows = len(df)
tqdm_bar = tqdm(total=total_rows, desc="Processing rows", initial=last_index)

for i in range(last_index, total_rows, CHUNK_SIZE):
    chunk = df.iloc[i:i + CHUNK_SIZE]
    chunk = chunk[~chunk.index.isin(processed_ids)]  # Skip already processed rows

    if chunk.empty:
        continue

    # Process
    tqdm.pandas(desc="Processing rows")
    results = chunk['description'].progress_apply(lambda x: pd.Series(clean_detect_translate(x)))
    chunk[['detected_language', 'translated_description', 'patreon_url']] = results
    final_chunk = chunk[['game_id', 'mod_id', 'detected_language', 'translated_description', 'patreon_url']]

    # Append to processed file
    df_processed = pd.concat([df_processed, final_chunk])
    df_processed.to_parquet(SAVE_FILE, index=True, engine="pyarrow", allow_truncated_timestamps=True)


    # Save checkpoint
    joblib.dump(i + CHUNK_SIZE, CHECKPOINT_FILE)
    tqdm_bar.update(CHUNK_SIZE)

tqdm_bar.close()
print("Processing complete. Data saved to:", SAVE_FILE)

Processing rows:   0%|                                                                       | 0/26622 [00:00<?, ?it/s]
Processing rows:   0%|                                                                         | 0/500 [00:00<?, ?it/s][A
Processing rows:   0%|▎                                                                | 2/500 [00:00<01:44,  4.76it/s][A
Processing rows:   1%|▍                                                                | 3/500 [00:02<06:40,  1.24it/s][A
Processing rows:   1%|▌                                                                | 4/500 [00:03<09:19,  1.13s/it][A
  clean_text = BeautifulSoup(text_no_bbcode, "lxml").get_text(separator=" ")  # Use lxml for better handling

Processing rows:   7%|████▋                                                           | 37/500 [00:06<01:04,  7.23it/s][A
Processing rows:   8%|█████▍                                                          | 42/500 [00:07<01:19,  5.77it/s][A
Processing rows:  11%|███████▏  

Processing complete. Data saved to: processed_data_2.parquet





In [10]:
#tqdm.pandas(desc="Processing rows")
#df[['detected_language', 'translated_description', 'patreon_url']] = df['description'].progress_apply(lambda x: pd.Series(clean_detect_translate(x)))

In [None]:
#df.head(1000)

In [58]:
df_processed = pd.read_parquet(SAVE_FILE)
df = df_processed[["game_id", "mod_id", "detected_language", "translated_description", "patreon_url"]]
df = df.dropna(subset=["translated_description"])
df["detected_language"] = df["detected_language"].fillna("")
df["patreon_url"] = df["patreon_url"].fillna("")

In [59]:
merge_sql = """
    MERGE INTO TranslatedModData AS target
    USING (SELECT :game_id AS game_id, :mod_id AS mod_id, :lang AS detected_language, 
                  :desc AS translated_description, :patreon AS patreon_url) AS source
    ON target.game_id = source.game_id AND target.mod_id = source.mod_id
    WHEN MATCHED AND target.translated_description IS NULL THEN 
        UPDATE SET detected_language = source.detected_language, 
                   translated_description = source.translated_description, 
                   patreon_url = source.patreon_url
    WHEN NOT MATCHED THEN 
        INSERT (game_id, mod_id, detected_language, translated_description, patreon_url)
        VALUES (source.game_id, source.mod_id, source.detected_language, source.translated_description, source.patreon_url);
"""


In [60]:
insert_sql = """
    INSERT INTO TranslatedModData (game_id, mod_id, detected_language, translated_description, patreon_url)
    SELECT :game_id, :mod_id, :lang, :desc, :patreon
    WHERE NOT EXISTS (
        SELECT 1 FROM TranslatedModData WHERE game_id = :game_id AND mod_id = :mod_id
    );
"""

In [61]:
with engine.connect() as conn:
    for i in tqdm(range(0, len(df), BATCH_SIZE), desc="Inserting in batches"):
        batch_df = df.iloc[i : i + BATCH_SIZE]
        batch_data = batch_df.to_dict(orient="records")

        try:
            for row in batch_data:
                conn.execute(text(merge_sql), {
                    "game_id": row["game_id"],
                    "mod_id": row["mod_id"],
                    "lang": row["detected_language"],
                    "desc": row["translated_description"],
                    "patreon": row["patreon_url"]
                })
            
            conn.commit()  # ✅ Force commit after each batch
            print(f"✅ Committed batch {i//BATCH_SIZE + 1}")
        except Exception as e:
            print(f"❌ Error inserting batch {i//BATCH_SIZE + 1}: {e}")

print("New table 'TranslatedModData' populated successfully!")

Inserting in batches:   2%|█▏                                                           | 1/54 [00:34<30:14, 34.23s/it]

✅ Committed batch 1


Inserting in batches:   4%|██▎                                                          | 2/54 [01:08<29:43, 34.30s/it]

✅ Committed batch 2


Inserting in batches:   6%|███▍                                                         | 3/54 [01:38<27:26, 32.28s/it]

✅ Committed batch 3


Inserting in batches:   7%|████▌                                                        | 4/54 [02:08<26:05, 31.31s/it]

✅ Committed batch 4


Inserting in batches:   9%|█████▋                                                       | 5/54 [02:37<25:05, 30.71s/it]

✅ Committed batch 5


Inserting in batches:  11%|██████▊                                                      | 6/54 [03:12<25:29, 31.86s/it]

✅ Committed batch 6


Inserting in batches:  13%|███████▉                                                     | 7/54 [03:42<24:35, 31.40s/it]

✅ Committed batch 7


Inserting in batches:  15%|█████████                                                    | 8/54 [04:16<24:39, 32.17s/it]

✅ Committed batch 8


Inserting in batches:  17%|██████████▏                                                  | 9/54 [04:53<25:12, 33.61s/it]

✅ Committed batch 9


Inserting in batches:  19%|███████████                                                 | 10/54 [05:26<24:42, 33.68s/it]

✅ Committed batch 10


Inserting in batches:  20%|████████████▏                                               | 11/54 [05:58<23:45, 33.16s/it]

✅ Committed batch 11


Inserting in batches:  22%|█████████████▎                                              | 12/54 [06:30<22:53, 32.70s/it]

✅ Committed batch 12


Inserting in batches:  24%|██████████████▍                                             | 13/54 [07:00<21:44, 31.81s/it]

✅ Committed batch 13


Inserting in batches:  26%|███████████████▌                                            | 14/54 [07:30<20:47, 31.18s/it]

✅ Committed batch 14


Inserting in batches:  28%|████████████████▋                                           | 15/54 [08:00<20:09, 31.01s/it]

✅ Committed batch 15


Inserting in batches:  30%|█████████████████▊                                          | 16/54 [08:32<19:53, 31.41s/it]

✅ Committed batch 16


Inserting in batches:  31%|██████████████████▉                                         | 17/54 [09:03<19:17, 31.27s/it]

✅ Committed batch 17


Inserting in batches:  33%|████████████████████                                        | 18/54 [09:36<18:57, 31.60s/it]

✅ Committed batch 18


Inserting in batches:  35%|█████████████████████                                       | 19/54 [10:08<18:31, 31.75s/it]

✅ Committed batch 19


Inserting in batches:  37%|██████████████████████▏                                     | 20/54 [10:38<17:42, 31.25s/it]

✅ Committed batch 20


Inserting in batches:  39%|███████████████████████▎                                    | 21/54 [11:09<17:08, 31.17s/it]

✅ Committed batch 21


Inserting in batches:  41%|████████████████████████▍                                   | 22/54 [11:39<16:30, 30.95s/it]

✅ Committed batch 22


Inserting in batches:  43%|█████████████████████████▌                                  | 23/54 [12:12<16:18, 31.57s/it]

✅ Committed batch 23


Inserting in batches:  44%|██████████████████████████▋                                 | 24/54 [12:45<15:52, 31.76s/it]

✅ Committed batch 24


Inserting in batches:  46%|███████████████████████████▊                                | 25/54 [13:15<15:11, 31.44s/it]

✅ Committed batch 25


Inserting in batches:  48%|████████████████████████████▉                               | 26/54 [13:43<14:11, 30.40s/it]

✅ Committed batch 26


Inserting in batches:  50%|██████████████████████████████                              | 27/54 [14:11<13:20, 29.65s/it]

✅ Committed batch 27


Inserting in batches:  52%|███████████████████████████████                             | 28/54 [14:46<13:34, 31.32s/it]

✅ Committed batch 28


Inserting in batches:  54%|████████████████████████████████▏                           | 29/54 [15:17<12:55, 31.02s/it]

✅ Committed batch 29


Inserting in batches:  56%|█████████████████████████████████▎                          | 30/54 [15:47<12:16, 30.69s/it]

✅ Committed batch 30


Inserting in batches:  57%|██████████████████████████████████▍                         | 31/54 [16:20<12:02, 31.41s/it]

✅ Committed batch 31


Inserting in batches:  59%|███████████████████████████████████▌                        | 32/54 [16:49<11:17, 30.80s/it]

✅ Committed batch 32


Inserting in batches:  61%|████████████████████████████████████▋                       | 33/54 [17:17<10:29, 29.98s/it]

✅ Committed batch 33


Inserting in batches:  63%|█████████████████████████████████████▊                      | 34/54 [17:45<09:48, 29.40s/it]

✅ Committed batch 34


Inserting in batches:  65%|██████████████████████████████████████▉                     | 35/54 [18:15<09:19, 29.46s/it]

✅ Committed batch 35


Inserting in batches:  67%|████████████████████████████████████████                    | 36/54 [18:43<08:45, 29.17s/it]

✅ Committed batch 36


Inserting in batches:  69%|█████████████████████████████████████████                   | 37/54 [19:11<08:09, 28.79s/it]

✅ Committed batch 37


Inserting in batches:  70%|██████████████████████████████████████████▏                 | 38/54 [19:39<07:35, 28.46s/it]

✅ Committed batch 38


Inserting in batches:  72%|███████████████████████████████████████████▎                | 39/54 [20:07<07:03, 28.21s/it]

✅ Committed batch 39


Inserting in batches:  74%|████████████████████████████████████████████▍               | 40/54 [20:35<06:34, 28.18s/it]

✅ Committed batch 40


Inserting in batches:  76%|█████████████████████████████████████████████▌              | 41/54 [21:03<06:06, 28.21s/it]

✅ Committed batch 41


Inserting in batches:  78%|██████████████████████████████████████████████▋             | 42/54 [21:30<05:35, 27.98s/it]

✅ Committed batch 42


Inserting in batches:  80%|███████████████████████████████████████████████▊            | 43/54 [21:58<05:07, 27.93s/it]

✅ Committed batch 43


Inserting in batches:  81%|████████████████████████████████████████████████▉           | 44/54 [22:26<04:38, 27.87s/it]

✅ Committed batch 44


Inserting in batches:  83%|██████████████████████████████████████████████████          | 45/54 [22:54<04:10, 27.79s/it]

✅ Committed batch 45


Inserting in batches:  85%|███████████████████████████████████████████████████         | 46/54 [23:26<03:54, 29.25s/it]

✅ Committed batch 46


Inserting in batches:  87%|████████████████████████████████████████████████████▏       | 47/54 [23:56<03:25, 29.36s/it]

✅ Committed batch 47


Inserting in batches:  89%|█████████████████████████████████████████████████████▎      | 48/54 [24:23<02:52, 28.78s/it]

✅ Committed batch 48


Inserting in batches:  91%|██████████████████████████████████████████████████████▍     | 49/54 [24:53<02:25, 29.09s/it]

✅ Committed batch 49


Inserting in batches:  93%|███████████████████████████████████████████████████████▌    | 50/54 [25:20<01:54, 28.57s/it]

✅ Committed batch 50


Inserting in batches:  94%|████████████████████████████████████████████████████████▋   | 51/54 [25:49<01:25, 28.54s/it]

✅ Committed batch 51


Inserting in batches:  96%|█████████████████████████████████████████████████████████▊  | 52/54 [26:16<00:56, 28.25s/it]

✅ Committed batch 52


Inserting in batches:  98%|██████████████████████████████████████████████████████████▉ | 53/54 [26:43<00:27, 27.89s/it]

✅ Committed batch 53


Inserting in batches: 100%|████████████████████████████████████████████████████████████| 54/54 [26:50<00:00, 29.83s/it]

✅ Committed batch 54
New table 'TranslatedModData' populated successfully!





In [None]:
with engine.begin() as conn:
    for row in tqdm(df.itertuples(index=False), total=len(df), desc="Inserting into SQL"):
        conn.execute(
            text("""
                MERGE INTO TranslatedModData AS target
                USING (SELECT :mod_id AS mod_id, :lang AS detected_language, :desc AS translated_description, :patreon AS patreon_url) AS source
                ON target.mod_id = source.mod_id
                WHEN MATCHED THEN 
                    UPDATE SET detected_language = source.detected_language, 
                               translated_description = source.translated_description, 
                               patreon_url = source.patreon_url
                WHEN NOT MATCHED THEN 
                    INSERT (mod_id, detected_language, translated_description, patreon_url)
                    VALUES (source.mod_id, source.detected_language, source.translated_description, source.patreon_url);
            """),
            {"mod_id": row.mod_id, "lang": row.detected_language, "desc": row.translated_description, "patreon": row.patreon_url}
        )

session.close()
print("New table 'TranslatedModData' updated successfully!")

Inserting into SQL:  34%|████████████████▋                                | 125422/368305 [4:39:04<10:46:52,  6.26it/s]

## Cleaning up Categories/Grouping Categories together

In [4]:
grouped_categories = """
SELECT 
    gc.category_name,
    gc.parent_category,
    STRING_AGG(gc.domain_name, ', ') AS game_list,
    COUNT(DISTINCT gc.domain_name) AS domain_count,  -- Count distinct games in this category
    SUM(gc.total_mods) AS total_mods,
    SUM(gc.total_endorsements) AS total_endorsements,
    SUM(gc.total_unique_downloads) AS total_unique_downloads
FROM [dbo].[GameCategoriesView] gc
GROUP BY 
    gc.category_name, 
    gc.parent_category
ORDER BY total_mods DESC;


"""
df = pd.read_sql(grouped_categories, engine)
print(len(df))
df.head(10)

2680


Unnamed: 0,category_name,parent_category,game_list,domain_count,total_mods,total_endorsements,total_unique_downloads
0,Armour,20,"skyrim, skyrimspecialedition, oblivion",3,15854,8665381.0,179153992.0
1,Models And Textures,20,"oblivion, skyrimspecialedition, skyrim",3,12759,15866825.0,485927415.0
2,"Body, Face, and Hair",20,"skyrim, oblivion, skyrimspecialedition",3,10076,6077426.0,153730605.0
3,Weapons,20,"skyrim, oblivion, skyrimspecialedition",3,9882,3559932.0,73360178.0
4,Miscellaneous,20,"skyrim, skyrimspecialedition, oblivion",3,9601,2064816.0,48688942.0
5,Player homes,20,"skyrim, skyrimspecialedition, oblivion",3,9321,2402661.0,39191200.0
6,Followers & Companions,20,skyrimspecialedition,1,8754,2196505.0,53406831.0
7,Patches,20,"skyrimspecialedition, oblivion, skyrim",3,8246,4953012.0,166190229.0
8,Weapons,55,"newvegas, fallout3",2,8234,1699297.0,25949953.0
9,Gameplay,20,"skyrimspecialedition, skyrim",2,7641,5594710.0,120160031.0


In [5]:
filtered_df = df.dropna(subset=['parent_category']) #removed due to comparision of title names like hitman,hitman2

categories = filtered_df['category_name'].astype(str).tolist()

similar_categories = {}
for cat in categories:
    matches = process.extract(cat, categories, limit=5, scorer=fuzz.ratio)  # Use a scorer
    similar_categories[cat] = [match[0] for match in matches if match[1] > 95 and match[0] != cat]

similar_df = pd.DataFrame([(k, v) for k, vals in similar_categories.items() for v in vals], 
                          columns=['Original', 'Similar'])



In [6]:
print(len(similar_df))
print(similar_df.head(20))

141
                                Original                              Similar
0                    Models And Textures                  Models and Textures
1                    Models And Textures                  Models and Textures
2                    Models And Textures                  Models and Textures
3                    Models And Textures                  Models and Textures
4                           Player homes                         Player Homes
5                           Player homes                         Player Homes
6                  Quests And Adventures                Quests and Adventures
7                  Quests And Adventures                Quests and Adventures
8                    Models and Textures                  Models And Textures
9                   Visuals and Graphics                 Visuals and graphics
10                    Weapons And Armour                   Weapons and Armour
11                    Weapons And Armour                   W