In [1]:
import requests
import json
import pandas as pd
import logging
import time
import os
from tqdm import tqdm

### General Setup


In [2]:
# Log to a file so that long scraping runs leave a trail that can be inspected afterwards.
logging.basicConfig(filename='nexus_mods_log.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# The API key is a secret and must not live in the notebook. Export NEXUS_MODS_API_KEY
# before starting the kernel. Any key that was previously committed here should be
# treated as compromised and rotated.
API_KEY = os.environ.get("NEXUS_MODS_API_KEY", "")
if not API_KEY:
    logging.warning("NEXUS_MODS_API_KEY is not set; requests will be sent without an API key.")

# Headers attached to every API request.
HEADERS = {
    'apikey': API_KEY,
    'Accept': 'application/json'
}
BASE_URL = "https://api.nexusmods.com/v1/"

# Request budget constants (not referenced by the functions visible in this notebook).
REQUEST_LIMIT = 2500
REQUESTS_PER_HOUR_LIMIT = 100

# Output artefacts, written relative to the current working directory.
OUTPUT_FILE = "nexus_mods_games.csv"
CHECKPOINT_FILE = "checkpoint.json"

#### Function for API limits

In [3]:
# Function to handle API rate limits
def rate_limited_request(url, params=None, max_retries=None, timeout=30):
    """GET ``url`` with the shared API headers, waiting and retrying on HTTP 429.

    Args:
        url: Full URL to request.
        params: Optional query parameters passed through to ``requests.get``.
        max_retries: Maximum number of retries after a 429 response. ``None``
            (the default) retries indefinitely.
        timeout: Seconds to wait for the server before ``requests`` gives up,
            so a stalled connection cannot hang the run forever.

    Returns:
        The first response whose status is not 429, or the last 429 response
        once ``max_retries`` has been exhausted.

    Raises:
        requests.RequestException: On network failures, including timeouts.
    """
    retries = 0
    while True:
        response = requests.get(url, headers=HEADERS, params=params, timeout=timeout)
        if response.status_code != 429:
            return response

        if max_retries is not None and retries >= max_retries:
            logging.error(f"Rate limit still in effect after {retries} retries for {url}; giving up.")
            return response
        retries += 1

        # Prefer the server's own hint when it is a plain number of seconds;
        # otherwise fall back to a fixed 60 second pause.
        retry_after = response.headers.get("Retry-After", "")
        wait_seconds = max(1, int(retry_after)) if retry_after.isdigit() else 60
        logging.warning(f"Rate limit reached. Sleeping for {wait_seconds} seconds.")
        time.sleep(wait_seconds)

#### Functions for checkpoints

In [4]:
# Load checkpoint
def load_checkpoint():
    """Return the saved checkpoint mapping, or an empty dict when none is usable.

    The checkpoint is read from ``CHECKPOINT_FILE``. A missing file, a file that
    is not valid JSON (for example one truncated by an interrupted write) or a
    file whose top-level value is not an object all yield ``{}`` so that the
    caller simply starts over instead of crashing.

    Returns:
        dict: The decoded checkpoint, possibly empty.
    """
    try:
        # Open directly rather than checking existence first: this avoids a
        # race if the file disappears between the check and the open.
        with open(CHECKPOINT_FILE, 'r') as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    except json.JSONDecodeError as exc:
        logging.warning(f"Checkpoint file {CHECKPOINT_FILE} is not valid JSON ({exc}); starting fresh.")
        return {}

    if not isinstance(data, dict):
        logging.warning(f"Checkpoint file {CHECKPOINT_FILE} does not contain a JSON object; starting fresh.")
        return {}
    return data

In [5]:
# Save checkpoint
def save_checkpoint(data):
    """Persist ``data`` as JSON to ``CHECKPOINT_FILE`` without exposing a partial file.

    The JSON is written to a sibling temporary file first and then moved over
    the real checkpoint with ``os.replace``. An interruption during the write
    therefore leaves the previous checkpoint intact instead of a truncated one.

    Args:
        data: JSON-serialisable checkpoint state.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(data, f)
    os.replace(tmp_path, CHECKPOINT_FILE)

#### Games List and Details

In [6]:
# Get the list of games
def get_games(top_n=10):
    """Fetch the games list and return the ``top_n`` entries with the most downloads.

    Args:
        top_n: How many games to keep after sorting by the ``downloads`` field,
            highest first.

    Returns:
        list: The leading ``top_n`` game records, or an empty list when the
        request does not come back with status 200.
    """
    response = rate_limited_request(f"{BASE_URL}games.json")

    # Guard clause: anything other than 200 is logged and reported as "no games".
    if response.status_code != 200:
        logging.error("Failed to fetch games.")
        return []

    ranked = sorted(response.json(), key=lambda game: game['downloads'], reverse=True)
    return ranked[:top_n]

In [7]:
# Get detailed game information
def get_game_details(game_domain):
    """Fetch the detail record for a single game.

    Args:
        game_domain: Domain name of the game, used as the path segment of the
            ``games/<domain>.json`` endpoint.

    Returns:
        The decoded JSON body on a 200 response, otherwise ``None`` (the
        failure is logged).
    """
    response = rate_limited_request(f"{BASE_URL}games/{game_domain}.json")

    # Guard clause: log and bail out on any non-200 status.
    if response.status_code != 200:
        logging.error(f"Failed to fetch game details for {game_domain}")
        return None

    return response.json()

#### Main Functions

In [8]:
def _load_saved_rows(checkpoint):
    """Return the rows already in OUTPUT_FILE for games recorded in ``checkpoint``."""
    if not checkpoint or not os.path.exists(OUTPUT_FILE):
        return []
    try:
        saved = pd.read_csv(OUTPUT_FILE)
    except pd.errors.EmptyDataError:
        # A file with no header or rows has nothing to restore.
        return []
    if 'domain_name' not in saved.columns:
        return []
    # Keep only rows for checkpointed games; anything else will be fetched again
    # and must not appear twice in the output.
    kept = saved[saved['domain_name'].isin(list(checkpoint))]
    return kept.to_dict(orient='records')


def collect_game_data(game_list):
    """Fetch details for each game in ``game_list``, resuming from the checkpoint.

    Games whose domain name is already in the checkpoint are skipped. Their
    rows are restored from ``OUTPUT_FILE`` first, so a resumed run extends the
    existing CSV instead of overwriting it with only the newly fetched games.

    Args:
        game_list: Iterable of game records, each with a ``domain_name`` key.

    Returns:
        list: One dict per game, made up of rows restored from a previous run
        followed by the detail records fetched in this run.
    """
    checkpoint = load_checkpoint()
    game_data = _load_saved_rows(checkpoint)
    if checkpoint and not game_data:
        logging.warning(
            f"Checkpoint lists {len(checkpoint)} processed games but no matching rows "
            f"were found in {OUTPUT_FILE}; those games will not be in the output."
        )
    start_time = time.time()

    for game in tqdm(game_list, desc="Fetching game data", unit="game"):
        game_domain = game['domain_name']
        if game_domain in checkpoint:
            logging.info(f"Skipping {game_domain}, already processed.")
            continue

        logging.info(f"Fetching data for {game_domain}")
        details = get_game_details(game_domain)
        if details:
            game_data.append(details)
            # Write the CSV before marking the game as done: a crash in between
            # then causes a harmless re-fetch rather than a missing row.
            pd.DataFrame(game_data).to_csv(OUTPUT_FILE, index=False)
            checkpoint[game_domain] = True
            save_checkpoint(checkpoint)

        time.sleep(2)  # Respect API limits

    elapsed_time = time.time() - start_time
    logging.info(f"Data collection completed in {elapsed_time:.2f} seconds.")
    return game_data

In [9]:
def main():
    """Prompt for how many top games to fetch, collect their details and save the CSV.

    An empty answer selects the default of 10. Input that is not a positive
    whole number is rejected with a message instead of raising. The final CSV
    write is skipped when nothing was collected, so an existing output file
    from an earlier run is not replaced by an empty one.
    """
    raw_count = input("Enter the number of top games to fetch (default 10): ").strip()
    try:
        top_n = int(raw_count) if raw_count else 10
    except ValueError:
        top_n = 0
    if top_n <= 0:
        logging.error(f"Invalid number of games requested: {raw_count!r}")
        print("Please enter a positive whole number.")
        return

    games = get_games(top_n)
    if not games:
        logging.error("No games retrieved.")
        return

    collected_data = collect_game_data(games)
    if collected_data:
        pd.DataFrame(collected_data).to_csv(OUTPUT_FILE, index=False)
    else:
        logging.info("No game data collected in this run; existing output file left untouched.")
    logging.info("Data collection complete.")

In [11]:
# Entry point: start the interactive collection run when this code is executed
# directly (the guard keeps it from running if the code is imported as a module).
if __name__ == "__main__":
    main()

Enter the number of top games to fetch (default 10):  1000


Fetching game data: 100%|████████████████████████████████████████████████████████| 1000/1000 [37:02<00:00,  2.22s/game]


In [13]:
# Preview the scraped data. Read it through the shared OUTPUT_FILE constant (the same
# path the scraper writes to) rather than an absolute path tied to one machine.
df = pd.read_csv(OUTPUT_FILE)
df.head()

Unnamed: 0,id,name,forum_url,nexusmods_url,genre,file_count,downloads,domain_name,approved_date,file_views,authors,file_endorsements,mods,categories
0,4309,Sifu,https://forums.nexusmods.com/index.php?/forum/,https://www.nexusmods.com/sifu,Fighting,2178,2700960,sifu,1644492357,0,216,77734,1050,"[{'category_id': 1, 'name': 'Sifu', 'parent_ca..."
1,198,Knights of the Old Republic II,https://forums.nexusmods.com/index.php?/forum/,https://www.nexusmods.com/kotor2,RPG,1871,2684769,kotor2,1386083220,7286093,198,68104,355,"[{'category_id': 1, 'name': 'Knights of the Ol..."
2,2388,BattleTech,https://forums.nexusmods.com/index.php?/forum/...,https://www.nexusmods.com/battletech,Strategy,5805,2633781,battletech,1524834925,7897768,296,66293,481,"[{'category_id': 1, 'name': 'BattleTech', 'par..."
3,3381,Horizon Zero Dawn,https://forums.nexusmods.com/index.php?/forum/,https://www.nexusmods.com/horizonzerodawn,ARPG,1089,2561590,horizonzerodawn,1596879070,1109504,154,103172,250,"[{'category_id': 1, 'name': 'Horizon Zero Dawn..."
4,2573,Jurassic World Evolution,https://forums.nexusmods.com/index.php?/forum/,https://www.nexusmods.com/jurassicworldevolution,Simulation,4248,2552992,jurassicworldevolution,1535961788,6278696,432,119225,1443,"[{'category_id': 1, 'name': 'Jurassic World Ev..."


### Add Games to SQL

In [27]:
import pandas as pd
import json
from sqlalchemy import create_engine, text
import ast
import re

import os
from sqlalchemy.engine import URL

# Location of the scraped CSV. Defaults to the file name the scraper writes in the
# working directory; set NEXUS_GAMES_CSV to point somewhere else.
csv_file = os.environ.get("NEXUS_GAMES_CSV", "nexus_mods_games.csv")
df = pd.read_csv(csv_file)
print(df["categories"].head(10))  # Check the first 10 raw category values

# Database credentials are secrets and must not be stored in the notebook: export
# NEXUS_DB_USER and NEXUS_DB_PASSWORD before starting the kernel (a missing variable
# raises KeyError here). Credentials that were previously committed should be rotated.
# URL.create also escapes any special characters in the password.
engine = create_engine(
    URL.create(
        "mssql+pyodbc",
        username=os.environ["NEXUS_DB_USER"],
        password=os.environ["NEXUS_DB_PASSWORD"],
        host=os.environ.get("NEXUS_DB_HOST", "nmntserver.database.windows.net"),
        database=os.environ.get("NEXUS_DB_NAME", "NexusModsDB"),
        query={"driver": "ODBC Driver 17 for SQL Server", "Connect Timeout": "60"},
    )
)

0    [{'category_id': 1, 'name': 'Sifu', 'parent_ca...
1    [{'category_id': 1, 'name': 'Knights of the Ol...
2    [{'category_id': 1, 'name': 'BattleTech', 'par...
3    [{'category_id': 1, 'name': 'Horizon Zero Dawn...
4    [{'category_id': 1, 'name': 'Jurassic World Ev...
5    [{'category_id': 1, 'name': 'Starbound', 'pare...
6    [{'category_id': 1, 'name': 'Nioh 2', 'parent_...
7    [{'category_id': 1, 'name': "Marvel's Spider-M...
8    [{'category_id': 1, 'name': 'Diablo II: Resurr...
9    [{'category_id': 1, 'name': 'Ghost Recon Break...
Name: categories, dtype: object


In [28]:
import re

def find_hidden_chars(json_data):
    """Collect the control characters found in ``json_data``.

    A character is reported when its code point is below 32 and it is not one
    of the tolerated characters (newline, tab, space). Characters are returned
    in order of appearance, with repeats preserved.
    """
    tolerated = ('\n', '\t', ' ')
    flagged = []
    for ch in json_data:
        if ord(ch) >= 32 or ch in tolerated:
            continue
        flagged.append(ch)
    return flagged

# Walk the categories column and report every row whose text contains control characters.
for row_label, raw_value in df["categories"].items():
    suspicious = find_hidden_chars(str(raw_value))  # non-string cells (e.g. NaN) are stringified first
    if suspicious:
        print(f"⚠️ Hidden characters found in row {row_label}: {suspicious}")


In [29]:
def clean_json(json_data):
    """Normalise one ``categories`` cell into a JSON string.

    The cell is expected to hold the text form of a list of category dicts,
    either as a Python literal (as written to the CSV) or as JSON (as produced
    by a previous call). Accepting both makes the function idempotent, so
    re-running the cleaning cell does not turn already-cleaned values into
    ``None``.

    Args:
        json_data: Cell value; anything that is not a non-blank string
            (NaN, None, numbers, ...) is treated as missing.

    Returns:
        str | None: JSON text for the cleaned list, or ``None`` when the value
        is missing or cannot be parsed into a list.
    """
    if not isinstance(json_data, str) or json_data.strip() == "":
        return None

    # Remove any hidden control characters before parsing
    json_data = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', json_data)

    try:
        # Python-literal form, e.g. [{'name': 'Sifu', 'parent_category': False}]
        json_obj = ast.literal_eval(json_data)
    except (ValueError, SyntaxError):
        try:
            # JSON form (null/true/false), e.g. output of an earlier run
            json_obj = json.loads(json_data)
        except ValueError:
            print(f"⚠️ Malformed JSON found: {json_data}")
            return None

    if not isinstance(json_obj, list):
        print(f"⚠️ Malformed JSON found: {json_data}")
        return None

    for category in json_obj:
        if not isinstance(category, dict):
            continue

        # A parent_category of False means "no parent": store it as null
        if category.get("parent_category") is False:
            category["parent_category"] = None

        # Turn a stray `"s` into `'s` and trim surrounding whitespace.
        # NOTE(review): this also rewrites legitimate `"s` sequences inside a
        # name; json.dumps below already escapes quotes, so confirm whether
        # this replacement is still needed.
        name = category.get("name")
        if isinstance(name, str):
            category["name"] = name.replace('"s', "'s").strip()

    return json.dumps(json_obj, ensure_ascii=False)  # Ensure it's valid JSON
# Run every categories cell through clean_json and store the result back in the frame
cleaned_categories = df["categories"].apply(clean_json)
df["categories"] = cleaned_categories


# Replace every remaining missing value in the frame (all columns) with an empty string
df = df.fillna(value="")
print(df["categories"].head(10))  # Preview the first 10 cleaned category values



0    [{"category_id": 1, "name": "Sifu", "parent_ca...
1    [{"category_id": 1, "name": "Knights of the Ol...
2    [{"category_id": 1, "name": "BattleTech", "par...
3    [{"category_id": 1, "name": "Horizon Zero Dawn...
4    [{"category_id": 1, "name": "Jurassic World Ev...
5    [{"category_id": 1, "name": "Starbound", "pare...
6    [{"category_id": 1, "name": "Nioh 2", "parent_...
7    [{"category_id": 1, "name": "Marvel's Spider-M...
8    [{"category_id": 1, "name": "Diablo II: Resurr...
9    [{"category_id": 1, "name": "Ghost Recon Break...
Name: categories, dtype: object


In [30]:
# SQL MERGE (upsert) statement for [dbo].[Games].
# - The USING clause builds a one-row source from the named bind parameters
#   (:id, :name, ...), one parameter per column of the games DataFrame.
# - Rows are matched on id: a match updates every other column, a miss inserts
#   a new row with all columns.
# The statement is wrapped in text() so the :name placeholders are bound by
# SQLAlchemy at execution time rather than interpolated into the SQL string.
sql_merge = text("""
MERGE INTO [dbo].[Games] AS Target
USING (
    SELECT :id AS id, :name AS name, :forum_url AS forum_url, :nexusmods_url AS nexusmods_url, 
           :genre AS genre, :file_count AS file_count, :downloads AS downloads, 
           :domain_name AS domain_name, :approved_date AS approved_date, 
           :file_views AS file_views, :authors AS authors, 
           :file_endorsements AS file_endorsements, :mods AS mods, :categories AS categories
) AS Source
ON Target.id = Source.id
WHEN MATCHED THEN 
    UPDATE SET 
        Target.name = Source.name,
        Target.forum_url = Source.forum_url,
        Target.nexusmods_url = Source.nexusmods_url,
        Target.genre = Source.genre,
        Target.file_count = Source.file_count,
        Target.downloads = Source.downloads,
        Target.domain_name = Source.domain_name,
        Target.approved_date = Source.approved_date,
        Target.file_views = Source.file_views,
        Target.authors = Source.authors,
        Target.file_endorsements = Source.file_endorsements,
        Target.mods = Source.mods,
        Target.categories = Source.categories
WHEN NOT MATCHED THEN
    INSERT (id, name, forum_url, nexusmods_url, genre, file_count, downloads, 
            domain_name, approved_date, file_views, authors, file_endorsements, mods, categories)
    VALUES (Source.id, Source.name, Source.forum_url, Source.nexusmods_url, Source.genre, Source.file_count, 
            Source.downloads, Source.domain_name, Source.approved_date, Source.file_views, 
            Source.authors, Source.file_endorsements, Source.mods, Source.categories);
""")

# Convert DataFrame to a list of dicts (one parameter set per row) for bulk execution
records = df.to_dict(orient="records")

if records:
    # Execute the query in a single transaction: engine.begin() commits when the
    # block exits normally and rolls back if an exception is raised.
    with engine.begin() as conn:
        conn.execute(sql_merge, records)

    print("✅ Data successfully merged (updated + new rows added).")
else:
    # With no parameter sets there is nothing to bind to the statement, so skip
    # the database call and do not report a merge that never happened.
    print("⚠️ No rows to merge; the Games table was left unchanged.")

✅ Data successfully merged (updated + new rows added).


In [None]:
# Re-run the upsert with the current contents of `df`.
# This cell used to carry a verbatim copy of the MERGE statement from the cell
# above; it now reuses that cell's `sql_merge`, so the SQL lives in one place.
# Run the cell that defines `sql_merge` first. Delete this cell if a second
# pass is not needed.
records = df.to_dict(orient="records")

if records:
    # Execute the query in a transaction (bulk execution)
    with engine.begin() as conn:
        conn.execute(sql_merge, records)

    print("✅ Data successfully merged (updated + new rows added).")
else:
    # Nothing to bind to the statement; skip the database call.
    print("⚠️ No rows to merge; the Games table was left unchanged.")


In [25]:
import json

# Sample categories payload for checking that a value parses as JSON
# (swap in the text of whichever row is causing trouble)
json_text = '''[{"category_id": 1, "name": "Borderlands 3", "parent_category": null}, 
{"category_id": 2, "name": "Miscellaneous", "parent_category": 1}, 
{"category_id": 3, "name": "Saved Games", "parent_category": 1}, 
{"category_id": 5, "name": "Audio", "parent_category": 1}, 
{"category_id": 6, "name": "Visuals", "parent_category": 1}, 
{"category_id": 7, "name": "Weapons and Gear", "parent_category": 1}, 
{"category_id": 8, "name": "Utilities", "parent_category": 1}, 
{"category_id": 9, "name": "Characters", "parent_category": 1}, 
{"category_id": 10, "name": "User Interface", "parent_category": 1}, 
{"category_id": 11, "name": "Gameplay", "parent_category": 1}]'''

try:
    parsed_json = json.loads(json_text)  # Attempt to decode the sample
except json.JSONDecodeError as err:
    print(f"⚠️ JSON Error: {err}")  # The exception text pinpoints where decoding failed
else:
    print("✅ JSON is valid!")


✅ JSON is valid!
