
# Refactored Nexus Mods Data Pipeline
This notebook connects to the Nexus Mods API, retrieves data, and writes it directly to an Azure SQL Server database using SQLAlchemy. CSV operations have been removed.


In [1]:
import json
import logging
import os
import time

import pandas as pd
import requests
from sqlalchemy import create_engine
from tqdm import tqdm

In [2]:
# Logging setup: everything at INFO and above goes to a local log file.
logging.basicConfig(filename='nexus_mods_log_extra.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# API configuration
# Secrets are read from the environment instead of being stored in the notebook.
# A missing variable raises KeyError naming the variable, so misconfiguration
# fails fast here rather than on the first request.
# NOTE(review): the key and database password that were previously hard-coded in
# this cell should be treated as leaked and rotated.
API_KEY = os.environ["NEXUS_MODS_API_KEY"]
HEADERS = {
    'apikey': API_KEY,
    'Accept': 'application/json'
}
BASE_URL = "https://api.nexusmods.com/v1/"
# Upper bound on the number of pages requested per game in fetch_and_update_mods.
REQUEST_LIMIT = 2500

# SQLAlchemy connection setup
# NEXUS_MODS_DB_URL must hold the full SQLAlchemy URL, e.g.
#   mssql+pyodbc://<user>:<password>@<server>/<database>?driver=ODBC+Driver+17+for+SQL+Server
# with any special characters in the user name or password URL-encoded.
engine = create_engine(os.environ["NEXUS_MODS_DB_URL"])


In [3]:
def rate_limited_request(url, params=None, timeout=30, retry_delay=60, max_retries=None):
    """GET ``url`` and return the decoded JSON body, waiting out HTTP 429 responses.

    Args:
        url: Absolute URL to request.
        params: Optional dict of query-string parameters.
        timeout: Seconds to wait for the server before giving up on a single
            request. Without a timeout a stalled connection would block forever.
        retry_delay: Seconds to sleep after a 429 response before retrying.
        max_retries: Maximum number of retries after 429 responses. ``None``
            (the default) retries indefinitely.

    Returns:
        The parsed JSON payload of the first non-429 response.

    Raises:
        requests.HTTPError: For any other 4xx/5xx status, or for a 429 once
            ``max_retries`` is exhausted.
        requests.RequestException: For connection errors and timeouts.
    """
    retries = 0
    while True:
        response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
        can_retry = max_retries is None or retries < max_retries
        if response.status_code == 429 and can_retry:  # Too Many Requests
            retries += 1
            logging.warning(f"Rate limit reached. Sleeping for {retry_delay} seconds.")
            time.sleep(retry_delay)
            continue
        # Any remaining error status (including a final 429) becomes an exception.
        response.raise_for_status()
        return response.json()


In [None]:
def get_game_domains():
    """Return the game domain names stored in the ``Games`` table.

    Returns:
        A list with the ``domain_name`` value of every row in ``Games``.
        If the query fails for any reason the error is logged and an empty
        list is returned, so callers simply have nothing to iterate over.
    """
    query = "SELECT domain_name FROM Games"
    try:
        games_df = pd.read_sql(query, con=engine)
        # Read back the column the query actually selects; looking up a
        # different name raises KeyError, which the handler below would
        # silently turn into an empty result.
        return games_df["domain_name"].tolist()
    except Exception as e:
        logging.error(f"Error fetching game domains: {e}")
        return []

In [None]:
def fetch_and_update_mods():
    """Fetch mods for every known game and append the unseen ones to the ``Mods`` table.

    For each domain returned by ``get_game_domains`` the function pages through
    ``{BASE_URL}games/<domain>/mods`` (at most ``REQUEST_LIMIT`` pages, stopping
    at the first page without mods), renames the ``user`` column to
    ``user_info``, drops rows whose ``mod_id`` is already known and appends the
    remainder to ``Mods``. Games that return no mods are skipped. Insert errors
    are logged per game and do not stop the loop; request errors raised by
    ``rate_limited_request`` propagate to the caller.
    """
    game_domains = get_game_domains()
    # NOTE(review): get_existing_mod_ids is not defined in the cells visible here;
    # confirm it is defined earlier in the notebook so a fresh-kernel run works.
    # NOTE(review): the filter below uses mod_id alone and the collection is not
    # refreshed after inserts; if mod ids are only unique per game this would
    # skip legitimate rows - confirm against the Mods table key.
    existing_mod_ids = get_existing_mod_ids()

    for game_domain in game_domains:
        logging.info(f"Fetching mods for game: {game_domain}")
        mods_url = f"{BASE_URL}games/{game_domain}/mods"
        all_mods = []

        for page in range(1, REQUEST_LIMIT + 1):
            data = rate_limited_request(mods_url, {'page': page})
            page_mods = data.get('mods')
            if not page_mods:
                break
            all_mods.extend(page_mods)

        # An empty result would produce a DataFrame without columns, and the
        # mod_id lookup below would raise KeyError and abort the whole run.
        if not all_mods:
            logging.info(f"No mods returned for {game_domain}; skipping.")
            continue

        # Convert to DataFrame and rename column "user" to "user_info"
        mods_df = pd.DataFrame(all_mods).rename(columns={"user": "user_info"})

        if "mod_id" not in mods_df.columns:
            logging.error(f"Response for {game_domain} has no mod_id field; skipping.")
            continue

        # Filter out mods that already exist
        new_mods_df = mods_df[~mods_df["mod_id"].isin(existing_mod_ids)]

        if not new_mods_df.empty:
            try:
                new_mods_df.to_sql("Mods", con=engine, if_exists="append", index=False)
                logging.info(f"Added {len(new_mods_df)} new mods for {game_domain}.")
            except Exception as e:
                logging.error(f"Error inserting new mods for {game_domain}: {e}")

# Run the pipeline: fetch mods for every game domain and append the new ones to the Mods table.
fetch_and_update_mods()