# Part 1: API Data Collection

This notebook contains the code for collecting data from the Nexus Mods API.

## Overview
1. **Mod Data Collection** - Fetching mod metadata using Nexus Mods REST API v1
2. **Author Data Collection** - Fetching author/user information using Nexus Mods GraphQL API v2

## Data Sources
- Nexus Mods REST API v1: `api.nexusmods.com/v1/games/{domain}/mods/{id}`
- Nexus Mods GraphQL API v2: `api.nexusmods.com/v2/graphql`

## Output
- Data is written directly to Azure SQL Server database
- Tables: `Mods`, `Authors`, `Games`

## 1. Setup and Configuration

In [None]:
import requests
import json
import logging
import time
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.sql import text
from tqdm import tqdm
import os

In [None]:
# Logging setup: messages at INFO level and above are written to
# API_Update.log with a timestamp and the level name.
logging.basicConfig(
    filename='API_Update.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Nexus Mods API key. It is read from the NEXUS_API_KEY environment variable
# so the real key never has to be written into the notebook; when the variable
# is unset the "API_KEY" placeholder is used and must be replaced by hand.
API_KEY = os.environ.get("NEXUS_API_KEY", "API_KEY")

# Headers attached to every API request.
HEADERS = {
    'apikey': API_KEY,
    'Accept': 'application/json'
}

# REST v1 base URL (request paths are appended to it) and the GraphQL v2 endpoint.
BASE_URL_V1 = "https://api.nexusmods.com/v1/"
BASE_URL_V2 = "https://api.nexusmods.com/v2/graphql"

In [None]:
# Rate limiting
# NOTE(review): REQUEST_LIMIT is not referenced by the cells shown here —
# presumably the API request quota; confirm before relying on it.
REQUEST_LIMIT = 2500
# fetch_and_update_mods() saves its checkpoint whenever the mod ID it just
# processed is a multiple of this value.
BATCH_SIZE = 100
# JSON file that stores the mod-collection checkpoint.
CHECKPOINT_FILE_MODS = "API_mods_checkpoint.json"
# NOTE(review): not referenced by the author-collection cells shown here.
CHECKPOINT_FILE_AUTHORS = "API_authors_checkpoint.json"

In [None]:
# SQLAlchemy connection setup
#
# Replace the connection string below with your own database. An Azure SQL
# Server instance was used here simply because it was a known quantity; you
# can change this to whatever you like, but note that upsert_user_data() issues
# a T-SQL MERGE statement, so a different backend needs that query adapted.
# The collected data grew too large to handle comfortably any other way, which
# is why it goes into a SQL server.
#
# To keep credentials out of the notebook, set the NEXUS_DB_URL environment
# variable to a full SQLAlchemy URL; the literal below is only the fallback.
engine = create_engine(
    os.environ.get(
        "NEXUS_DB_URL",
        "mssql+pyodbc://username:password@server.database.windows.net/NexusModsDB?driver=ODBC+Driver+17+for+SQL+Server&Connect Timeout=60",
    )
)

## 2. Utility Functions

In [None]:
def rate_limited_request(url, params=None, is_graphql=False, query=None, timeout=30):
    """Send one API request, waiting and retrying while the API rate-limits.

    Args:
        url: Full URL to request.
        params: Query-string parameters for a GET request. Not used when
            ``is_graphql`` is true.
        is_graphql: When true, POST ``query`` as the JSON body; otherwise
            issue a GET.
        query: JSON-serialisable payload for a GraphQL POST.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection, silently freezing a long collection run.

    Returns:
        The decoded JSON body of the first response that is not HTTP 429.

    Raises:
        requests.exceptions.HTTPError: For any non-429 error status.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    while True:
        if is_graphql:
            response = requests.post(url, headers=HEADERS, json=query, timeout=timeout)
        else:
            response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)

        if response.status_code != 429:
            response.raise_for_status()
            return response.json()

        # HTTP 429: back off for a minute, then repeat the same request.
        logging.warning("Rate limit reached. 60 secs then restart.")
        time.sleep(60)

If you are replicating this, using the checkpoint files is highly recommended; otherwise it is painful to fix things up after an interrupted run.

In [None]:
def load_checkpoint(checkpoint_file):
    """Load a checkpoint from a JSON file.

    Args:
        checkpoint_file: Path of the checkpoint file.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist or does not contain valid JSON (for example after a write was
        interrupted). An unreadable checkpoint is logged as a warning rather
        than aborting the run.
    """
    try:
        with open(checkpoint_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # No checkpoint yet: start from scratch.
        return {}
    except json.JSONDecodeError as e:
        logging.warning(f"Checkpoint file {checkpoint_file} is not valid JSON, ignoring it: {e}")
        return {}

def save_checkpoint(checkpoint_data, checkpoint_file):
    """Save a checkpoint to a JSON file without risking a half-written file.

    The data is written to a sibling ``<checkpoint_file>.tmp`` file first and
    then moved over the real file with ``os.replace``, so an interruption or a
    serialisation error during the write leaves the previous checkpoint intact.

    Args:
        checkpoint_data: JSON-serialisable checkpoint content.
        checkpoint_file: Path of the checkpoint file to create or overwrite.
    """
    tmp_path = f"{checkpoint_file}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(checkpoint_data, f)
    os.replace(tmp_path, checkpoint_file)

## 3. Mod Data Collection (API v1)

In [None]:
def get_game_domains():
    """Read every game's domain name and mod count from the Games table.

    Returns:
        A dict mapping ``domain_name`` to ``mods``; an empty dict if the query
        or the conversion fails (the error is logged).
    """
    query = "SELECT domain_name, mods FROM Games"
    try:
        games_df = pd.read_sql(query, con=engine)
        mods_by_domain = games_df.set_index("domain_name")["mods"]
        return mods_by_domain.to_dict()
    except Exception as e:
        logging.error(f"Error fetching game domains: {e}")
        return {}

In [None]:
def get_existing_mod_ids():
    """Collect the mod IDs stored in Mods, grouped by game domain.

    Rows whose status is 'not found' are excluded by the query.

    Returns:
        A dict mapping each game domain to the set of its stored mod IDs; an
        empty dict if the query or the grouping fails (the error is logged).
    """
    query = "SELECT mod_id, COALESCE(game_domain, domain_name) AS game_domain FROM Mods WHERE status != 'not found'"
    try:
        mods_df = pd.read_sql(query, con=engine)
        ids_per_domain = mods_df.groupby("game_domain")["mod_id"].apply(set)
        return ids_per_domain.to_dict()
    except Exception as e:
        logging.error(f"Error fetching existing mod IDs: {e}")
        return {}

In [None]:
def handle_missing_mod(mod_id, game_domain):
    """Record a mod that could not be fetched as a 'not found' row in Mods.

    Storing a placeholder row means the ID counts as already present, so it
    is not requested again on the next run.

    Args:
        mod_id: ID of the mod that could not be fetched.
        game_domain: Game domain the mod ID was requested for.
    """
    placeholder_row = {
        "mod_id": mod_id,
        "game_domain": game_domain,
        "status": "not found",
    }
    missing_mod_df = pd.DataFrame([placeholder_row])
    try:
        missing_mod_df.to_sql("Mods", con=engine, if_exists="append", index=False)
        logging.warning(f"Mod {mod_id} not found for {game_domain}, marking as 'not found'.")
    except Exception as e:
        # A failed insert is only logged; the caller carries on with the next mod.
        logging.error(f"Failed to insert missing mod {mod_id} for {game_domain}: {e}")

In [None]:
def clean_mod_data(mod_data):
    """Flatten the nested parts of a mod record into JSON strings, in place.

    ``user`` is serialised into a new ``user_info`` key and then removed;
    ``endorsement`` is replaced by its serialised form. A missing key is
    serialised as an empty object.

    Args:
        mod_data: Mod record as a dict; it is modified in place.

    Returns:
        The same ``mod_data`` dict.
    """
    # (destination key, source key) pairs, processed in this order.
    for target_key, source_key in (("user_info", "user"), ("endorsement", "endorsement")):
        mod_data[target_key] = json.dumps(mod_data.get(source_key, {}))
    mod_data.pop("user", None)
    return mod_data

def fetch_mod(mod_id, game_domain):
    """Fetch a single mod from the REST API and append it to the Mods table.

    A 404 response or an empty body is recorded through handle_missing_mod().
    Any other request failure (HTTP error, connection error, timeout) and any
    failure to insert the row are logged and the mod is skipped, so one bad
    mod does not abort the whole collection run.

    Args:
        mod_id: ID of the mod to fetch.
        game_domain: Game domain the mod belongs to.

    Returns:
        The cleaned mod record (see clean_mod_data()) when it was fetched and
        stored, otherwise None.
    """
    mod_url = f"{BASE_URL_V1}games/{game_domain}/mods/{mod_id}"
    try:
        mod_data = rate_limited_request(mod_url)
    except requests.exceptions.RequestException as e:
        # Compare against None explicitly: a requests Response is falsy for
        # error statuses, and connection errors carry no response at all.
        response = getattr(e, "response", None)
        if response is not None and response.status_code == 404:
            handle_missing_mod(mod_id, game_domain)
        else:
            logging.error(f"Error fetching mod {mod_id} for {game_domain}: {e}")
        return None

    if not mod_data:
        handle_missing_mod(mod_id, game_domain)
        return None

    mod_data = clean_mod_data(mod_data)
    try:
        pd.DataFrame([mod_data]).to_sql("Mods", con=engine, if_exists="append", index=False)
    except Exception as e:
        # Same log-and-continue policy as handle_missing_mod().
        logging.error(f"Failed to insert mod {mod_id} for {game_domain}: {e}")
        return None

    logging.info(f"Added mod {mod_id} for {game_domain}.")
    return mod_data

In [None]:
def fetch_and_update_mods():
    """Fetch every mod that is still missing from the database, game by game.

    For each game returned by get_game_domains():

    1. Compare the number of stored mods (excluding 'not found' rows) with the
       expected count; skip the game when nothing is missing.
    2. Build the list of candidate IDs from 1 up to the larger of the expected
       count and the highest stored ID plus a look-ahead margin, minus every
       ID already present in Mods (including 'not found' placeholders).
    3. Fetch each candidate with fetch_mod(), saving a checkpoint whenever the
       processed ID is a multiple of BATCH_SIZE and once more at the end.

    The checkpoint file is only written here; which mods get fetched is decided
    from the rows already present in the Mods table.

    Raises:
        Exception: A failure of the MAX(mod_id) lookup is not caught and
            propagates to the caller.
    """
    # Number of IDs to probe beyond the highest stored mod ID.
    id_lookahead = 20000

    # The game domain is passed as a bound parameter instead of being
    # interpolated into the SQL text, so unusual characters in a domain name
    # can neither break nor alter the statements.
    count_query = text("""
        SELECT COUNT(DISTINCT mod_id)
        FROM Mods
        WHERE COALESCE(game_domain, domain_name) = :game_domain
        AND status != 'not found'
    """)
    existing_query = text(
        "SELECT DISTINCT mod_id FROM Mods WHERE COALESCE(game_domain, domain_name) = :game_domain"
    )
    max_query = text(
        "SELECT MAX(mod_id) FROM Mods WHERE COALESCE(game_domain, domain_name) = :game_domain"
    )

    game_domains = get_game_domains()
    checkpoint = load_checkpoint(CHECKPOINT_FILE_MODS)

    for game_domain, expected_mod_count in game_domains.items():
        logging.info(f"Checking mods for game: {game_domain}")
        print(f"Checking mods for game: {game_domain}")
        sql_params = {"game_domain": game_domain}

        # Get stored mod count
        try:
            stored_mod_count = pd.read_sql(count_query, con=engine, params=sql_params).iloc[0, 0]
        except Exception as e:
            logging.error(f"Error fetching mod count for {game_domain}: {e}")
            stored_mod_count = 0

        print(f"Game: {game_domain} - Stored: {stored_mod_count}, Expected: {expected_mod_count}")

        if stored_mod_count >= expected_mod_count:
            print(f"Mod count for {game_domain} is complete.")
            continue

        # Get existing mod IDs ('not found' rows included so they are not re-requested)
        try:
            existing_mod_ids = set(pd.read_sql(existing_query, con=engine, params=sql_params)['mod_id'])
        except Exception as e:
            logging.error(f"Error fetching existing mod IDs for {game_domain}: {e}")
            existing_mod_ids = set()

        # Find missing mods. MAX() over no rows comes back as None or NaN;
        # NaN is truthy, so "value or 0" would let it through and break range().
        max_stored_mod_id = pd.read_sql(max_query, con=engine, params=sql_params).iloc[0, 0]
        max_stored_mod_id = 0 if pd.isna(max_stored_mod_id) else int(max_stored_mod_id)
        highest_possible_mod_id = max(expected_mod_count, max_stored_mod_id + id_lookahead)

        missing_mods = sorted(set(range(1, highest_possible_mod_id + 1)) - existing_mod_ids)
        print(f"Total missing mods for {game_domain}: {len(missing_mods)}")

        # Fetch missing mods
        for mod_id in tqdm(missing_mods, desc=f"Fetching mods for {game_domain}"):
            fetch_mod(mod_id, game_domain)

            if mod_id % BATCH_SIZE == 0:
                checkpoint[game_domain] = mod_id
                save_checkpoint(checkpoint, CHECKPOINT_FILE_MODS)

        checkpoint[game_domain] = max(missing_mods, default=0)
        save_checkpoint(checkpoint, CHECKPOINT_FILE_MODS)
        print(f"Finished processing {game_domain}.")

## 4. Author Data Collection (GraphQL API v2)

Permission to collect this data was granted by the Nexus Mods admin team.

In [None]:
def get_distinct_user_ids():
    """List the member IDs in CleanedModData that have no Authors row yet.

    Prints how many rows the query returned.

    Returns:
        A list of the distinct, non-null member IDs; an empty list if anything
        fails (the error is logged).
    """
    try:
        query = """
            SELECT DISTINCT cm.member_id 
            FROM dbo.CleanedModData cm
            LEFT JOIN Authors a ON cm.member_id = a.member_id
            WHERE cm.member_id IS NOT NULL AND a.member_id IS NULL
        """
        df = pd.read_sql(query, con=engine)
        print(f"Retrieved {len(df)} user IDs to fetch.")
        member_ids = df["member_id"].dropna()
        return member_ids.unique().tolist()
    except Exception as e:
        logging.error(f"Error retrieving user IDs: {str(e)}")
        return []

In [None]:
def get_existing_user_ids():
    """Return the set of member IDs that already have a row in Authors.

    Returns:
        A set of non-null member IDs; an empty set if the query fails (the
        error is logged).
    """
    query = "SELECT DISTINCT member_id FROM Authors"
    try:
        authors_df = pd.read_sql(query, con=engine)
        known_ids = authors_df["member_id"].dropna()
        return set(known_ids)
    except Exception as e:
        logging.error(f"Error fetching existing user IDs: {e}")
        return set()

In [None]:
def fetch_user_details(member_id):
    """Fetch one user's profile through the GraphQL API.

    Args:
        member_id: ID of the user; it is converted with ``int()`` before
            being sent as the ``$id`` variable.

    Returns:
        The ``user`` object from the response as a dict, with ``memberId``
        renamed to ``member_id``; None when the request fails, the response
        carries no user, or the user has no ``memberId``.

    Raises:
        ValueError, TypeError: If ``member_id`` cannot be converted to int.
    """
    query = {
        "query": """
            query user($id: Int!) {
              user(id: $id) {
                about
                name
                country
                joined
                lastActive
                avatar
                recognizedAuthor
                roles
                modCount
                collectionCount
                contributedModCount
                ownedModCount
                endorsementsGiven
                posts
                kudos
                uniqueModDownloads
                views
                videoCount
                imageCount
                banned
                deleted
                showActivityFeed
                showLastActive
                moderationHistoryCount
                memberId
                isBlocked
                isTracked
                moderationJwt
                donationsEnabled
                dpOptedIn
                blockedFromOptingInModsAt
              }
            }
        """,
        "variables": {"id": int(member_id)}
    }

    try:
        response = rate_limited_request(BASE_URL_V2, is_graphql=True, query=query)
    except requests.exceptions.RequestException as e:
        # Covers HTTP errors as well as connection failures and timeouts, so
        # one unreachable user does not abort the whole author run.
        logging.error(f"Error fetching user {member_id}: {e}")
        return None

    if not isinstance(response, dict):
        logging.error(f"Unexpected GraphQL response for user {member_id}: {response!r}")
        return None

    if response.get("errors"):
        logging.error(f"GraphQL errors for user {member_id}: {response['errors']}")

    # "data" (or "user") may be present with a null value; .get(key, {}) would
    # then return None and the chained .get() would raise AttributeError.
    user_data = (response.get("data") or {}).get("user") or {}

    if not user_data or user_data.get("memberId") is None:
        return None

    # Rename memberId to member_id for SQL consistency
    user_data["member_id"] = user_data.pop("memberId")
    return user_data

In [None]:
def sanitize_user_data(raw_data):
    """Map a raw user record onto the snake_case columns of the Authors table.

    Each output value is taken from the corresponding key of ``raw_data``. The
    listed default is used when the key is missing *or* its value is None:
    ``dict.get(key, default)`` alone only covers the missing-key case, so a key
    present with a null value would otherwise bypass the default.

    Args:
        raw_data: Dict with the user fields in camelCase; the member ID may be
            stored under ``member_id`` or ``memberId``.

    Returns:
        A dict with one entry per Authors column, or None when no member ID
        is available.
    """
    def pick(key, default=None):
        """Return raw_data[key], or ``default`` when missing or None."""
        value = raw_data.get(key)
        return default if value is None else value

    member_id = pick("member_id", pick("memberId"))

    if member_id is None:
        return None

    return {
        "member_id": member_id,
        "name": pick("name", ""),
        "about": pick("about", ""),
        "country": pick("country", "Unknown"),
        "joined": pick("joined", "1970-01-01T00:00:00Z"),
        "last_active": pick("lastActive"),
        "avatar": pick("avatar", ""),

        # Booleans
        "recognized_author": pick("recognizedAuthor", False),
        "banned": pick("banned", False),
        "deleted": pick("deleted", False),
        "show_activity_feed": pick("showActivityFeed", False),
        "show_last_active": pick("showLastActive", False),
        "is_blocked": pick("isBlocked", False),
        "is_tracked": pick("isTracked", False),
        "donations_enabled": pick("donationsEnabled", False),
        "dp_opted_in": pick("dpOptedIn", False),

        # Integers
        "mod_count": pick("modCount", 0),
        "collection_count": pick("collectionCount", 0),
        "contributed_mod_count": pick("contributedModCount", 0),
        "owned_mod_count": pick("ownedModCount", 0),
        "endorsements_given": pick("endorsementsGiven", 0),
        "unique_mod_downloads": pick("uniqueModDownloads", 0),
        "posts": pick("posts", 0),
        "kudos": pick("kudos", 0),
        "views": pick("views", 0),
        "video_count": pick("videoCount", 0),
        "image_count": pick("imageCount", 0),

        # Nullable
        "moderation_history_count": pick("moderationHistoryCount"),
        "blocked_from_opting_in_mods_at": pick("blockedFromOptingInModsAt"),
        "moderation_jwt": pick("moderationJwt"),
    }

In [None]:
def upsert_user_data(user_data):
    """Write one author record to the Authors table with a single MERGE.

    A row whose ``member_id`` matches is updated; otherwise a new row is
    inserted. Nothing is done when ``user_data`` has no usable ``member_id``.
    Database errors are logged, not raised.

    Args:
        user_data: Dict providing a value for every bind parameter named in
            the statement (the shape produced by sanitize_user_data()).
    """
    # Guard clause: a record without a member ID cannot be matched or inserted.
    if user_data.get("member_id") is None:
        return

    merge_statement = text("""
        MERGE INTO Authors AS target
        USING (SELECT :member_id AS member_id) AS source
        ON target.member_id = source.member_id
        WHEN MATCHED THEN 
            UPDATE SET name = :name, about = :about, country = :country, joined = :joined, 
                       last_active = :last_active, avatar = :avatar, recognized_author = :recognized_author, 
                       mod_count = :mod_count, collection_count = :collection_count, 
                       contributed_mod_count = :contributed_mod_count, owned_mod_count = :owned_mod_count, 
                       endorsements_given = :endorsements_given, posts = :posts, kudos = :kudos,
                       unique_mod_downloads = :unique_mod_downloads, views = :views, 
                       video_count = :video_count, image_count = :image_count, 
                       banned = :banned, deleted = :deleted, show_activity_feed = :show_activity_feed,
                       show_last_active = :show_last_active, moderation_history_count = :moderation_history_count,
                       is_blocked = :is_blocked, is_tracked = :is_tracked, moderation_jwt = :moderation_jwt, 
                       donations_enabled = :donations_enabled, dp_opted_in = :dp_opted_in, 
                       blocked_from_opting_in_mods_at = :blocked_from_opting_in_mods_at
        WHEN NOT MATCHED THEN 
            INSERT (member_id, name, about, country, joined, last_active, avatar, recognized_author, 
                    mod_count, collection_count, contributed_mod_count, owned_mod_count, endorsements_given, 
                    posts, kudos, unique_mod_downloads, views, video_count, image_count, banned, deleted, 
                    show_activity_feed, show_last_active, moderation_history_count, is_blocked, is_tracked, 
                    moderation_jwt, donations_enabled, dp_opted_in, blocked_from_opting_in_mods_at)
            VALUES (:member_id, :name, :about, :country, :joined, :last_active, :avatar, :recognized_author, 
                    :mod_count, :collection_count, :contributed_mod_count, :owned_mod_count, :endorsements_given, 
                    :posts, :kudos, :unique_mod_downloads, :views, :video_count, :image_count, :banned, :deleted, 
                    :show_activity_feed, :show_last_active, :moderation_history_count, :is_blocked, :is_tracked, 
                    :moderation_jwt, :donations_enabled, :dp_opted_in, :blocked_from_opting_in_mods_at);
    """)

    try:
        with engine.connect() as connection:
            connection.execute(merge_statement, user_data)
            connection.commit()
    except Exception as e:
        logging.error(f"Error inserting/updating user {user_data.get('member_id')}: {e}")

In [None]:
def fetch_all_authors():
    """Fetch and store author data for every user that is not in Authors yet.

    Candidate IDs come from get_distinct_user_ids() and are filtered against
    get_existing_user_ids(). Each remaining user is fetched, sanitised and
    upserted; users for which no data or no member ID comes back are skipped.
    Progress is printed after every 100 upsert calls and once at the end.
    """
    candidate_ids = get_distinct_user_ids()
    known_ids = get_existing_user_ids()

    new_user_ids = [uid for uid in candidate_ids if uid not in known_ids]
    print(f"Found {len(new_user_ids)} new users to fetch.")

    inserted_count = 0

    for user_id in tqdm(new_user_ids, desc="Fetching author data"):
        raw_data = fetch_user_details(user_id)

        # Sanitise only when something came back; either step may yield None.
        user_data = sanitize_user_data(raw_data) if raw_data else None
        if user_data is None:
            continue

        upsert_user_data(user_data)
        inserted_count += 1

        if inserted_count % 100 == 0:
            print(f"Inserted {inserted_count} authors.")

    print(f"Finished! Total authors inserted: {inserted_count}")