
# Refactored Nexus Mods Data Pipeline
This notebook connects to the Nexus Mods API, retrieves data, and writes it directly to an Azure SQL Server database using SQLAlchemy. CSV operations have been removed.


In [1]:
import requests
import json
import logging
import time
import pandas as pd
from sqlalchemy import create_engine
import os
from sqlalchemy.exc import SQLAlchemyError, OperationalError
import time

In [2]:
# Logging setup
logging.basicConfig(filename='API_Update.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def _require_env(name):
    """Return the value of environment variable `name`; fail fast if it is unset or empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(f"Environment variable {name} must be set before running this notebook.")
    return value


# API configuration
# Secrets are read from the environment so they never end up in the notebook file,
# its outputs, or version-control history.
# NOTE(review): the API keys and database password that used to be written in this
# cell should be treated as leaked and rotated.
API_KEY = _require_env("NEXUS_API_KEY")
HEADERS = {
    'apikey': API_KEY,
    'Accept': 'application/json'
}
BASE_URL = "https://api.nexusmods.com/v1/"
REQUEST_LIMIT = 2500
CHECKPOINT_FILE = "API_mods_checkpoint.json"
BATCH_SIZE = 100

# SQLAlchemy connection setup
# NEXUS_DB_URL holds the complete SQLAlchemy URL, in the form
#   mssql+pyodbc://<user>:<password>@<server>/<database>?driver=ODBC+Driver+17+for+SQL+Server
# Special characters in the password must be URL-encoded inside that URL.
engine = create_engine(_require_env("NEXUS_DB_URL"))


In [3]:
def rate_limited_request(url, params=None, timeout=30, retry_delay=60, max_retries=None):
    """GET `url` with the shared API headers and return the decoded JSON body.

    A 429 response is treated as "rate limited": the function sleeps and retries.

    Args:
        url: Full request URL.
        params: Optional query-string parameters passed through to requests.
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without it a stalled connection blocks forever.
        retry_delay: Seconds to sleep after a 429 before retrying.
        max_retries: Maximum number of retries after 429 responses. None
            (the default) retries indefinitely, as the function always did.

    Returns:
        The parsed JSON payload of the first non-429 response.

    Raises:
        requests.exceptions.HTTPError: for any other 4xx/5xx status, or for
            429 once `max_retries` retries have been used.
        requests.exceptions.RequestException: for connection errors/timeouts.
    """
    retries_used = 0
    while True:
        response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
        if response.status_code != 429:
            response.raise_for_status()
            return response.json()

        if max_retries is not None and retries_used >= max_retries:
            # Retry budget exhausted: surface the 429 as an HTTPError.
            response.raise_for_status()

        retries_used += 1
        logging.warning(f"Rate limit reached. Sleeping for {retry_delay} seconds.")
        time.sleep(retry_delay)


In [4]:
def load_checkpoint():
    """Load the per-game checkpoint mapping from CHECKPOINT_FILE.

    Returns:
        dict: The parsed JSON object. An empty dict is returned when the file
        does not exist, cannot be read or parsed (for example after an
        interrupted write), or does not contain a JSON object.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        return {}

    try:
        with open(CHECKPOINT_FILE, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        logging.warning(f"Could not read checkpoint file {CHECKPOINT_FILE}: {e}. Starting with an empty checkpoint.")
        return {}

    if not isinstance(data, dict):
        # Callers use .get(game_domain, 0), so anything but a mapping is unusable.
        logging.warning(f"Checkpoint file {CHECKPOINT_FILE} does not contain a JSON object. Starting with an empty checkpoint.")
        return {}

    return data

def save_checkpoint(checkpoint_data):
    """Persist `checkpoint_data` to CHECKPOINT_FILE as JSON.

    The data is written to a temporary sibling file and then moved over the
    real file with os.replace, so a failure while serialising never leaves a
    truncated checkpoint behind.

    Args:
        checkpoint_data: JSON-serialisable mapping. Values exposing `.item()`
            (such as the numpy integers pandas returns from `.iloc[0, 0]`)
            are converted to plain Python scalars.

    Raises:
        TypeError: if a value cannot be serialised; the existing checkpoint
            file is left untouched in that case.
        OSError: if the file cannot be written or replaced.
    """
    def _to_builtin(value):
        # Called by json only for objects it cannot serialise natively.
        if hasattr(value, "item"):
            return value.item()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(checkpoint_data, f, default=_to_builtin)
        os.replace(tmp_path, CHECKPOINT_FILE)
    except BaseException:
        # Do not leave a half-written temp file around; re-raise for the caller.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

In [5]:
def get_game_domains():
    """Read the Games table and map each domain name to its `mods` value.

    Returns:
        dict: {domain_name: mods}. Any failure while querying or reshaping
        the result is logged and an empty dict is returned instead.
    """
    try:
        games = pd.read_sql("SELECT domain_name, mods FROM Games", con=engine)
        mods_by_domain = games.set_index("domain_name")["mods"]
        return mods_by_domain.to_dict()
    except Exception as exc:
        logging.error(f"Error fetching game domains: {exc}")
    return {}

In [6]:
def get_existing_mod_ids():
    """Collect the mod IDs already stored for every game.

    Rows whose status is 'not found' are excluded. The game is taken from
    `game_domain`, falling back to `domain_name` when that column is NULL.

    Returns:
        dict: {game_domain: set of mod_id}. On any query/processing error the
        error is logged and an empty dict is returned.
    """
    query = (
        "SELECT mod_id, COALESCE(game_domain, domain_name) AS game_domain "
        "FROM Mods WHERE status != 'not found'"
    )

    try:
        mods_df = pd.read_sql(query, con=engine)
        # Group by the alias the query actually produces. Grouping by
        # "domain_name" raised KeyError, which the except below swallowed,
        # so this function always returned {}.
        return mods_df.groupby("game_domain")["mod_id"].apply(set).to_dict()
    except Exception as e:
        logging.error(f"Error fetching existing mod IDs: {e}")
        return {}

In [7]:
def handle_missing_mod(mod_id, game_domain):
    """Append a placeholder row with status 'not found' to the Mods table.

    The placeholder exists so that a mod the API could not return is not
    queried again. A failed insert is logged and printed, never raised.

    Args:
        mod_id: ID of the mod that could not be fetched.
        game_domain: Game the mod ID belongs to.
    """
    placeholder_row = {
        "mod_id": mod_id,
        "game_domain": game_domain,
        "status": "not found",
    }
    placeholder_df = pd.DataFrame([placeholder_row])
    try:
        placeholder_df.to_sql("Mods", con=engine, if_exists="append", index=False)
        notice = f"üö´ Mod {mod_id} not found for {game_domain}, marking as 'not found'."
        logging.warning(notice)
        print(notice)
    except Exception as err:
        failure = f"‚ùå Failed to insert missing mod {mod_id} for {game_domain}: {err}"
        logging.error(failure)
        print(failure)


In [8]:
def clean_mod_data(mod_data):
    """Flatten the nested parts of a mod record into JSON strings, in place.

    `user` is serialised into a new `user_info` key and then removed;
    `endorsement` is replaced by its JSON string. A missing key is
    serialised as an empty object.

    Args:
        mod_data: Mod record dict; it is modified in place.

    Returns:
        The same `mod_data` object.
    """
    # (destination key, source key) pairs, serialised in this order.
    for target_key, source_key in (("user_info", "user"), ("endorsement", "endorsement")):
        mod_data[target_key] = json.dumps(mod_data.get(source_key, {}))

    if "user" in mod_data:
        del mod_data["user"]

    return mod_data
def fetch_mod(mod_id, game_domain):
    """Fetch one mod from the API and append it to the Mods table.

    An empty payload or a 404 response records the mod through
    handle_missing_mod. Any other request error, and any insert error, is
    logged and printed; none of them is raised.

    Args:
        mod_id: ID of the mod to fetch.
        game_domain: Game domain used in the API path.

    Returns:
        dict | None: The cleaned mod record that was inserted, or None when
        the mod was missing or the request/insert failed.
    """
    mod_url = f"{BASE_URL}games/{game_domain}/mods/{mod_id}"

    try:
        mod_data = rate_limited_request(mod_url)
    except requests.exceptions.HTTPError as e:
        # HTTPError is listed first because it is a RequestException subclass.
        if getattr(e.response, "status_code", None) == 404:
            handle_missing_mod(mod_id, game_domain)
        else:
            logging.error(f"‚ùå Error fetching mod {mod_id} for {game_domain}: {e}")
            print(f"‚ùå Error fetching mod {mod_id} for {game_domain}: {e}")
        return None
    except (requests.exceptions.RequestException, ValueError) as e:
        # Connection errors, timeouts and undecodable bodies: skip this mod
        # instead of aborting the whole run.
        logging.error(f"‚ùå Error fetching mod {mod_id} for {game_domain}: {e}")
        print(f"‚ùå Error fetching mod {mod_id} for {game_domain}: {e}")
        return None

    if not mod_data:
        handle_missing_mod(mod_id, game_domain)
        return None

    mod_data = clean_mod_data(mod_data)
    try:
        pd.DataFrame([mod_data]).to_sql("Mods", con=engine, if_exists="append", index=False)
    except Exception as e:
        # Same best-effort policy handle_missing_mod applies to its insert.
        logging.error(f"‚ùå Failed to insert mod {mod_id} for {game_domain}: {e}")
        print(f"‚ùå Failed to insert mod {mod_id} for {game_domain}: {e}")
        return None

    logging.info(f"‚úÖ Added mod {mod_id} for {game_domain}.")
    print(f"‚úÖ Added mod {mod_id} for {game_domain}.")
    return mod_data


In [17]:
def fetch_and_update_mods(id_headroom=20000):
    """Fetch the mods that are still missing from the Mods table, game by game.

    For every game returned by get_game_domains() the function compares the
    number of distinct stored mod IDs (status other than 'not found') with the
    expected count. When the stored count is short it walks the mod IDs that
    have no row yet, in ascending order, calling fetch_mod for each until the
    expected count is reached or the candidate IDs run out.

    After every attempt `checkpoint[game_domain]` is set to the mod ID that was
    just attempted and the checkpoint is saved. The checkpoint is not used to
    skip IDs: rows already in the table (including 'not found' placeholders)
    are what exclude an ID from the candidates.

    Args:
        id_headroom: How far beyond the highest stored mod ID to look for
            mods. Mod IDs are probed up to
            max(expected count, highest stored ID + id_headroom).
    """
    game_domains = get_game_domains()
    checkpoint = load_checkpoint()

    for game_domain, expected_mod_count in game_domains.items():
        logging.info(f"Checking mods for game: {game_domain}")
        print(f"üïµÔ∏è Checking mods for game: {game_domain}")

        # NOTE(review): game_domain is interpolated into the SQL text. It comes
        # from get_game_domains(); switch to bound parameters if that source
        # can ever contain quotes or untrusted text.

        # Stored mod count (DISTINCT mod_id so duplicate rows are not counted).
        query = f"""
        SELECT COUNT(DISTINCT mod_id)
        FROM Mods
        WHERE COALESCE(game_domain, domain_name) = '{game_domain}'
        AND status != 'not found'
        """
        try:
            # int() turns the numpy scalar into a plain int.
            stored_mod_count = int(pd.read_sql(query, con=engine).iloc[0, 0])
        except Exception as e:
            logging.error(f"Error fetching mod count for {game_domain}: {e}")
            stored_mod_count = 0

        logging.info(f"Game: {game_domain} - Stored (distinct): {stored_mod_count}, Expected: {expected_mod_count}")
        print(f"üìù Game: {game_domain} - Stored (distinct): {stored_mod_count}, Expected: {expected_mod_count}")

        # Nothing to do when the table already holds the expected number of mods.
        if stored_mod_count >= expected_mod_count:
            logging.info(f"‚úÖ Mod count for {game_domain} is complete.")
            print(f"‚úÖ Mod count for {game_domain} is complete. No new mods needed.")
            continue

        # Highest mod_id stored for this game (0 when there are no rows).
        query_max_mod = f"""
        SELECT MAX(mod_id) FROM Mods
        WHERE COALESCE(game_domain, domain_name) = '{game_domain}'
        """
        try:
            max_stored_mod_id = pd.read_sql(query_max_mod, con=engine).iloc[0, 0]
            # MAX over zero rows is NULL, which pandas may return as None or NaN.
            max_stored_mod_id = 0 if pd.isna(max_stored_mod_id) else int(max_stored_mod_id)
        except Exception as e:
            logging.error(f"Error fetching max mod_id for {game_domain}: {e}")
            max_stored_mod_id = 0

        # Every mod_id that already has a row, 'not found' placeholders included.
        query_existing_mods = f"""
        SELECT DISTINCT mod_id FROM Mods
        WHERE COALESCE(game_domain, domain_name) = '{game_domain}'
        """
        try:
            existing_mod_ids = set(pd.read_sql(query_existing_mods, con=engine)['mod_id'])
        except Exception as e:
            logging.error(f"Error fetching existing mod IDs for {game_domain}: {e}")
            existing_mod_ids = set()

        # Look past the highest stored ID so newer mods are discovered too.
        highest_possible_mod_id = int(max(expected_mod_count, max_stored_mod_id + id_headroom))

        # Ascending candidate IDs; built in order, so no separate sort is needed.
        missing_mods = [
            candidate
            for candidate in range(1, highest_possible_mod_id + 1)
            if candidate not in existing_mod_ids
        ]

        print(f"üîç Total missing mods for {game_domain}: {len(missing_mods)} (including beyond stored mod IDs)")
        logging.info(f"Total missing mods for {game_domain}: {len(missing_mods)} (including beyond stored mod IDs)")

        if not missing_mods:
            print(f"‚ö†Ô∏è Warning: No missing mods found for {game_domain}, but count is lower than expected!")
            logging.warning(f"‚ö†Ô∏è Warning: No missing mods found for {game_domain}, but count is lower than expected!")
            continue

        for mod_id in missing_mods:
            if stored_mod_count >= expected_mod_count:
                break

            print(f"üîç Processing mod {mod_id} for {game_domain}")
            mod_data = fetch_mod(mod_id, game_domain)

            # Count only mods that were actually stored. `!=` is required here:
            # `not in ('not found')` is a substring test against a string and
            # raises TypeError when status is None.
            if mod_data and mod_data.get('status') != 'not found':
                stored_mod_count += 1
                if stored_mod_count % 100 == 0:
                    print(f"üìä Progress: {stored_mod_count}/{expected_mod_count} mods fetched for {game_domain}")

            # Record the last attempted mod ID (a plain int, so it serialises).
            checkpoint[game_domain] = mod_id
            save_checkpoint(checkpoint)

        if stored_mod_count < expected_mod_count:
            logging.warning(f"‚ö†Ô∏è No more mod IDs to check for {game_domain}, but count is still short!")

        print(f"‚úÖ Finished processing {game_domain}. Checkpoint saved.")


In [21]:
# Run the pipeline. In top-to-bottom execution the name refers to the definition in
# the cell directly above; later cells in this notebook redefine the same name.
fetch_and_update_mods()

üïµÔ∏è Checking mods for game: skyrimspecialedition
üìù Game: skyrimspecialedition - Stored (distinct): 102501, Expected: 102501
‚úÖ Mod count for skyrimspecialedition is complete. No new mods needed.
üïµÔ∏è Checking mods for game: skyrim
üìù Game: skyrim - Stored (distinct): 72459, Expected: 72459
‚úÖ Mod count for skyrim is complete. No new mods needed.
üïµÔ∏è Checking mods for game: fallout4
üìù Game: fallout4 - Stored (distinct): 63574, Expected: 63574
‚úÖ Mod count for fallout4 is complete. No new mods needed.
üïµÔ∏è Checking mods for game: newvegas
üìù Game: newvegas - Stored (distinct): 36140, Expected: 36140
‚úÖ Mod count for newvegas is complete. No new mods needed.
üïµÔ∏è Checking mods for game: cyberpunk2077
üìù Game: cyberpunk2077 - Stored (distinct): 13563, Expected: 13563
‚úÖ Mod count for cyberpunk2077 is complete. No new mods needed.
üïµÔ∏è Checking mods for game: stardewvalley
üìù Game: stardewvalley - Stored (distinct): 20846, Expected: 20846
‚úÖ Mod count

In [None]:
def fetch_and_update_mods():
    """Variant of the mod sync that re-counts stored mods at batch boundaries.

    NOTE(review): this cell redefines fetch_and_update_mods, so running it
    replaces the definition from the earlier cell. Its execution count is
    empty (In [None]), i.e. it had not been run when the notebook was saved.

    For every game from get_game_domains() the candidate mod IDs are
    1..expected count, minus the IDs already stored (get_existing_mod_ids)
    and minus the IDs recorded with status 'not found'. Candidates are fetched
    in ascending order. Whenever a processed mod ID is a multiple of
    BATCH_SIZE the stored count is re-read from the database and the
    checkpoint is saved; fetching stops once that count reaches the expected
    count. The checkpoint value is the last mod ID attempted for the game.
    """
    game_domains = get_game_domains()
    existing_mods = get_existing_mod_ids()
    checkpoint = load_checkpoint()

    for game_domain, expected_mod_count in game_domains.items():
        logging.info(f"Checking mods for game: {game_domain}")
        print(f"üïµÔ∏è Checking mods for game: {game_domain}")

        current_mods = existing_mods.get(game_domain, set())

        # NOTE(review): game_domain is interpolated into the SQL text; it comes
        # from get_game_domains(). Use bound parameters if that can ever hold
        # quotes or untrusted text.
        query = f"""
        SELECT COUNT(*) FROM Mods
        WHERE game_domain = '{game_domain}'
        AND (status != 'not found')
        """
        try:
            stored_mod_count = int(pd.read_sql(query, con=engine).iloc[0, 0])
        except Exception as e:
            logging.error(f"Error fetching mod count for {game_domain}: {e}")
            stored_mod_count = 0

        logging.info(f"Game: {game_domain} - Stored: {stored_mod_count}, Expected: {expected_mod_count}")
        print(f"üìù Game: {game_domain} - Stored: {stored_mod_count}, Expected: {expected_mod_count}")

        # Equivalent to the former `>= and > 0 and ==` chain: skip only on an
        # exact, non-zero match.
        if stored_mod_count == expected_mod_count and stored_mod_count > 0:
            logging.info(f"‚úÖ Mod count matches for {game_domain}, skipping...")
            print(f"‚úÖ Mod count matches for {game_domain}, skipping...")
            continue

        # IDs already known to be missing upstream; they must not be re-queried.
        query_missing = f"""
        SELECT mod_id FROM Mods
        WHERE game_domain = '{game_domain}'
        AND status = 'not found'
        """
        try:
            missing_mods_from_db = set(pd.read_sql(query_missing, con=engine)['mod_id'])
        except Exception as e:
            logging.error(f"Error fetching missing mods for {game_domain}: {e}")
            missing_mods_from_db = set()

        missing_mods = sorted(set(range(1, expected_mod_count + 1)) - current_mods - missing_mods_from_db)

        # Starts from the previously saved value so an empty pass keeps it.
        last_attempted_mod_id = checkpoint.get(game_domain, 0)

        for mod_id in missing_mods:
            if stored_mod_count >= expected_mod_count:
                break

            print(f"üîç Processing mod {mod_id} for {game_domain}")
            fetch_mod(mod_id, game_domain)
            last_attempted_mod_id = mod_id

            # Re-read the stored count and checkpoint at batch boundaries.
            if mod_id % BATCH_SIZE == 0:
                try:
                    stored_mod_count = int(pd.read_sql(query, con=engine).iloc[0, 0])
                except Exception as e:
                    # Keep the previous count; a failed recount must not abort the run.
                    logging.error(f"Error fetching mod count for {game_domain}: {e}")
                checkpoint[game_domain] = mod_id
                save_checkpoint(checkpoint)
                print(f"üíæ Checkpoint saved at mod {mod_id} for {game_domain}")

        if stored_mod_count < expected_mod_count:
            # The count is only refreshed at batch boundaries, so it may be stale here.
            logging.warning(f"‚ö†Ô∏è No more mod IDs to check for {game_domain}, but count is still short!")

        # Final checkpoint save: the last ID attempted, not the highest ID left over.
        checkpoint[game_domain] = last_attempted_mod_id
        save_checkpoint(checkpoint)
        print(f"‚úÖ Finished processing {game_domain}. Checkpoint saved.")



In [50]:
def fetch_and_update_mods():
    """Fetch every mod ID in 1..expected count that is not stored yet, per game.

    This cell redefines fetch_and_update_mods; once run, it replaces the
    definitions from the earlier cells.

    A game is skipped when the number of IDs reported by
    get_existing_mod_ids() already reaches the expected count. Otherwise the
    absent IDs are walked in ascending order, ignoring those at or below the
    game's saved checkpoint. The checkpoint is written whenever a processed ID
    is a multiple of BATCH_SIZE, and once more at the end with the highest
    candidate ID (0 when there were no candidates).
    """
    expected_counts = get_game_domains()
    known_ids_by_game = get_existing_mod_ids()
    checkpoint = load_checkpoint()

    for game_domain, expected_mod_count in expected_counts.items():
        logging.info(f"Checking mods for game: {game_domain}")
        print(f"üïµÔ∏è Checking mods for game: {game_domain}")
        known_ids = known_ids_by_game.get(game_domain, set())

        if len(known_ids) >= expected_mod_count:
            skip_note = f"‚úÖ Mod count matches for {game_domain}, skipping..."
            logging.info(skip_note)
            print(skip_note)
            continue

        every_id = set(range(1, expected_mod_count + 1))
        candidate_ids = sorted(every_id - known_ids)

        resume_after = checkpoint.get(game_domain, 0)

        for mod_id in candidate_ids:
            # Only IDs past the saved checkpoint are fetched.
            if mod_id > resume_after:
                print(f"üîç Processing mod {mod_id} for {game_domain}")
                fetch_mod(mod_id, game_domain)

                # Persist progress at every batch boundary.
                if mod_id % BATCH_SIZE == 0:
                    checkpoint[game_domain] = mod_id
                    save_checkpoint(checkpoint)
                    print(f"üíæ Checkpoint saved at mod {mod_id} for {game_domain}")

        # candidate_ids is sorted ascending, so its last element is its maximum.
        checkpoint[game_domain] = candidate_ids[-1] if candidate_ids else 0
        save_checkpoint(checkpoint)
        print(f"‚úÖ Finished processing {game_domain}. Checkpoint saved.")
