In [12]:
import os
import json
import logging
import json
import pandas as pd
import logging
import time
import requests
from tqdm import tqdm
import re

In [13]:
# Configuration
MODS_OUTPUT_FILE_PREFIX = "nexus_mods_mods"  # output CSVs are named <prefix>_<n>.csv
# NOTE(review): fetch_mod_data substitutes a *mod* ID into the {game_id}
# placeholder, and the recorded run of the fetch cell logged HTTP 404 for every
# request. Confirm this endpoint shape against the Nexus Mods API documentation.
API_URL = "https://api.nexusmods.com/v1/mods/{game_id}/files.json"
# The API key is read from the environment so that it is never stored in the
# notebook. The key that used to be hardcoded on this line has been exposed and
# should be revoked and replaced.
API_KEY = os.environ.get("NEXUS_API_KEY", "")
HEADERS = {"apikey": API_KEY, "Accept": "application/json"}
MAX_FILE_SIZE_KB = 100000  # 100MB limit
CHECKPOINT_FILE = "mods_checkpoint.json"
LOG_FILE = "nexus_mods_log.log"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
if not API_KEY:
    # Warn instead of raising so the offline analysis cells remain usable.
    logging.warning("NEXUS_API_KEY is not set; API requests will be sent without a key.")


In [29]:
import os
import json
import pandas as pd
import re
import logging

# Settings used by the log/CSV reconciliation functions defined below.
LOG_FILE = "nexus_mods_log.log"  # log that is scanned for fetched / not-found mods
MODS_OUTPUT_FILE_PREFIX = "nexus_mods_mods"  # file-name prefix of the CSVs to scan

# Timestamped "<time> - <LEVEL> - <message>" records at INFO level and above.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def get_processed_mod_ids(log_file=None):
    """Return the mod IDs the log reports as fetched and not subsequently missing.

    The log is read line by line, in file order:

    * a line matching ``INFO - Fetching mod <id> for ...`` adds ``<id>``;
    * a line matching ``WARNING - Mod <id> for ... not found`` removes ``<id>``.

    Because lines are applied in order, a later "Fetching" line re-adds an ID
    that an earlier "not found" line removed.

    Args:
        log_file: Path of the log to scan. When omitted, the module-level
            ``LOG_FILE`` is used.

    Returns:
        set[int]: The surviving mod IDs. Empty when the log file does not
        exist (an error is logged in that case instead of raising).
    """
    if log_file is None:
        log_file = LOG_FILE

    processed_ids = set()
    missing_pattern = re.compile(r"WARNING - Mod (\d+) for .* not found")
    processed_pattern = re.compile(r"INFO - Fetching mod (\d+) for .*")

    try:
        # Decode explicitly and tolerate bad bytes: only the numeric IDs are
        # extracted, so one undecodable character elsewhere in a line must not
        # abort the whole scan (the platform default codec may not be UTF-8).
        with open(log_file, 'r', encoding="utf-8", errors="replace") as f:
            for line in f:
                processed_match = processed_pattern.search(line)
                missing_match = missing_pattern.search(line)
                if processed_match:
                    processed_ids.add(int(processed_match.group(1)))
                if missing_match:
                    # Remove missing mods from the set
                    processed_ids.discard(int(missing_match.group(1)))

    except FileNotFoundError:
        logging.error(f"Log file {log_file} not found.")

    logging.info(f"Extracted {len(processed_ids)} successfully processed mod IDs.")
    return processed_ids

def get_collected_mod_ids(directory=".", prefix=None):
    """Collect the ``mod_id`` values present in the output CSV files.

    Every file in ``directory`` whose name starts with ``prefix`` and ends with
    ``.csv`` is read. Zero-byte files, files pandas cannot parse, and files
    without a ``mod_id`` column are skipped. Blank or non-numeric ``mod_id``
    cells are ignored rather than aborting the scan.

    Args:
        directory: Directory to scan. Defaults to the current working
            directory.
        prefix: Required file-name prefix. When omitted, the module-level
            ``MODS_OUTPUT_FILE_PREFIX`` is used.

    Returns:
        set[int]: All mod IDs found across the matching CSV files.
    """
    if prefix is None:
        prefix = MODS_OUTPUT_FILE_PREFIX

    collected_ids = set()
    for file in os.listdir(directory):
        if not (file.startswith(prefix) and file.endswith(".csv")):
            continue
        path = os.path.join(directory, file)
        if os.path.getsize(path) == 0:
            logging.warning(f"Skipping empty file: {file}")
            continue
        try:
            df = pd.read_csv(path, dtype={"mod_id": str})
        except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
            logging.warning(f"Skipping unreadable file {file}: {e}")
            continue
        if 'mod_id' not in df.columns:
            continue
        # A plain astype(int) raises on NaN / non-numeric text, and that error
        # is not one of the exceptions handled above; coerce and drop instead.
        numeric_ids = pd.to_numeric(df['mod_id'], errors="coerce").dropna()
        collected_ids.update(numeric_ids.astype(int).tolist())

    logging.info(f"Collected {len(collected_ids)} mod IDs from CSV files.")
    return collected_ids

def find_missing_ids():
    """Write the mod IDs that appear in the log but in no CSV to 'missing_mods.json'.

    The IDs returned by get_processed_mod_ids() minus those returned by
    get_collected_mod_ids() are stored, sorted ascending, under the
    "missing_ids" key of the JSON file. Nothing is returned.
    """
    logged_ids = get_processed_mod_ids()
    saved_ids = get_collected_mod_ids()

    absent = logged_ids.difference(saved_ids)
    logging.info(f"Identified {len(absent)} missing mods.")

    payload = {"missing_ids": sorted(absent)}
    with open("missing_mods.json", "w") as out:
        out.write(json.dumps(payload, indent=4))

    logging.info("Missing mod IDs saved to 'missing_mods.json'.")

# Run the log/CSV reconciliation when this code executes as the main module;
# the four log records captured after this cell are the output of that call.
if __name__ == "__main__":
    find_missing_ids()


2025-01-27 14:09:37,546 - INFO - Extracted 56822 successfully processed mod IDs.
2025-01-27 14:09:38,248 - INFO - Collected 16791 mod IDs from CSV files.
2025-01-27 14:09:38,260 - INFO - Identified 40031 missing mods.
2025-01-27 14:09:38,277 - INFO - Missing mod IDs saved to 'missing_mods.json'.


In [15]:
def load_missing_mods(path="missing_mods.json"):
    """Load the missing mod IDs from a JSON file.

    Args:
        path: JSON file to read. It must contain a "missing_ids" key; a
            "next_start_id" key is optional.

    Returns:
        tuple: ``(missing_ids, next_start_id)``. ``next_start_id`` is ``None``
        when the file has no such key, which is the case for the file written
        by ``find_missing_ids`` (it stores only "missing_ids").

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If the file has no "missing_ids" key.
    """
    with open(path, "r") as f:
        data = json.load(f)
    # .get(): indexing "next_start_id" directly raised KeyError on files that
    # only carry "missing_ids".
    return data["missing_ids"], data.get("next_start_id")

def fetch_mod_data(mod_id, max_retries=5, retry_wait=60, timeout=30):
    """Fetch the API response for one mod ID.

    ``mod_id`` is substituted into ``API_URL`` and requested with ``HEADERS``.
    HTTP 429 responses are retried after a pause, up to ``max_retries`` times;
    the retries run in a loop, so a long rate-limit period cannot exhaust the
    call stack.

    Args:
        mod_id: Value substituted for the ``{game_id}`` placeholder of
            ``API_URL``.
        max_retries: How many times a 429 response is retried before giving up.
        retry_wait: Seconds to sleep before each retry.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so a stalled
            connection cannot block the run indefinitely.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` (non-200 status,
        retries exhausted, or any error while requesting/decoding). Failures
        are logged, never raised.
    """
    attempts = max(0, max_retries) + 1
    for attempt in range(attempts):
        try:
            response = requests.get(
                API_URL.format(game_id=mod_id), headers=HEADERS, timeout=timeout
            )
            if response.status_code == 200:
                return response.json()
            if response.status_code != 429:
                logging.error(f"Failed to fetch mod ID {mod_id}: {response.status_code}")
                return None
        except Exception as e:
            # Deliberately broad: one bad mod must not stop a long collection
            # run. The error is logged and the mod is reported as not fetched.
            logging.error(f"Error fetching mod ID {mod_id}: {e}")
            return None

        # Only a 429 reaches this point.
        if attempt + 1 < attempts:
            logging.warning(f"Rate limit hit, sleeping for {retry_wait} seconds...")
            time.sleep(retry_wait)

    logging.error(f"Giving up on mod ID {mod_id}: still rate limited after {max_retries} retries")
    return None

def save_data_with_limit(mods_data, prefix=None, max_size_kb=None):
    """Append a batch of records to the first output CSV that is under the size limit.

    Output files are named ``<prefix>_<n>.csv`` with ``n`` starting at 1. The
    batch goes to the lowest-numbered file that either does not exist yet or is
    not larger than ``max_size_kb``. Rows are appended, so earlier batches are
    kept; the header is written only when the file is created.

    When appending, the batch is aligned to the columns already in the file.
    Columns the file does not have cannot be added to an existing CSV; they are
    left out and a warning names them.

    Args:
        mods_data: List of record dicts (anything ``pd.DataFrame`` accepts).
            An empty batch is a no-op.
        prefix: Output path prefix. When omitted, the module-level
            ``MODS_OUTPUT_FILE_PREFIX`` is used.
        max_size_kb: Size in KB above which a file is considered full. When
            omitted, the module-level ``MAX_FILE_SIZE_KB`` is used.

    Returns:
        str | None: Path of the file written, or ``None`` for an empty batch.
    """
    if len(mods_data) == 0:
        # Writing an empty frame would only create or clobber a file.
        return None
    if prefix is None:
        prefix = MODS_OUTPUT_FILE_PREFIX
    if max_size_kb is None:
        max_size_kb = MAX_FILE_SIZE_KB

    # Skip every file that is already over the limit.
    file_index = 1
    output_file = f"{prefix}_{file_index}.csv"
    while os.path.exists(output_file) and os.path.getsize(output_file) / 1024 > max_size_kb:
        file_index += 1
        output_file = f"{prefix}_{file_index}.csv"

    frame = pd.DataFrame(mods_data)

    existing_columns = None
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        try:
            existing_columns = pd.read_csv(output_file, nrows=0).columns
        except pd.errors.EmptyDataError:
            # File has bytes but no header row: start it over below.
            existing_columns = None

    if existing_columns is not None and len(existing_columns) > 0:
        dropped = [col for col in frame.columns if col not in existing_columns]
        if dropped:
            logging.warning(f"Columns not present in {output_file} were not saved: {dropped}")
        # Same column order as the file so appended rows line up with its header.
        frame = frame.reindex(columns=existing_columns)
        frame.to_csv(output_file, mode="a", header=False, index=False)
    else:
        frame.to_csv(output_file, index=False)
    return output_file

def collect_missing_mods(batch_size=100):
    """Fetch every missing mod and save the results in batches.

    The IDs come from ``load_missing_mods()``. Each ID is fetched with
    ``fetch_mod_data``; successful responses are buffered and handed to
    ``save_data_with_limit`` whenever the buffer holds ``batch_size`` records,
    and once more at the end for any remainder.

    Args:
        batch_size: Number of buffered records that triggers a save.
    """
    # The second value returned by load_missing_mods is not used here.
    missing_ids, _next_start_id = load_missing_mods()
    mods_data = []

    with tqdm(total=len(missing_ids), desc="Fetching missing mods") as pbar:
        for mod_id in missing_ids:
            mod_info = fetch_mod_data(mod_id)
            if mod_info:
                mods_data.append(mod_info)
                # Check only after an append: an empty buffer also satisfies
                # `len % 100 == 0`, which used to save an empty frame after
                # every failed fetch.
                if len(mods_data) >= batch_size:
                    save_data_with_limit(mods_data)
                    mods_data = []  # Clear buffer after saving
            pbar.update(1)

    if mods_data:
        save_data_with_limit(mods_data)  # Final save if remaining items exist

    logging.info("Mods data collection complete.")

# Start the collection run when this code executes as the main module. The call
# issues one API request per missing ID, so it can run for a long time (the
# captured progress bar estimated over two hours before it was interrupted).
if __name__ == "__main__":
    collect_missing_mods()


Fetching missing mods:   0%|                                                                 | 0/22953 [00:00<?, ?it/s]2025-01-27 13:55:51,841 - ERROR - Failed to fetch mod ID 1: 404
Fetching missing mods:   0%|                                                       | 1/22953 [00:00<2:50:26,  2.24it/s]2025-01-27 13:55:52,170 - ERROR - Failed to fetch mod ID 2: 404
Fetching missing mods:   0%|                                                       | 2/22953 [00:00<2:24:24,  2.65it/s]2025-01-27 13:55:52,506 - ERROR - Failed to fetch mod ID 3: 404
Fetching missing mods:   0%|                                                       | 3/22953 [00:01<2:17:07,  2.79it/s]2025-01-27 13:55:52,866 - ERROR - Failed to fetch mod ID 8: 404
Fetching missing mods:   0%|                                                       | 4/22953 [00:01<2:17:25,  2.78it/s]2025-01-27 13:55:53,302 - ERROR - Failed to fetch mod ID 9: 404
Fetching missing mods:   0%|                                                       | 

KeyboardInterrupt: 

In [35]:
# List every integer between the smallest and largest mod_id that is absent
# from df. `df` must already be loaded before this cell runs.
existing_ids = set(df['mod_id'])
all_ids = set(
    range(
        df['mod_id'].min(),
        df['mod_id'].max() + 1,
    )
)
missing_ids = sorted(candidate for candidate in all_ids if candidate not in existing_ids)

print("Missing mod_id values:", missing_ids)

Missing mod_id values: [82, 83, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 132, 135, 136, 137, 138, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 158, 162, 163, 165, 166, 170, 171, 174, 175, 176, 179, 181, 182, 187, 196, 199, 200, 202, 203, 204, 205, 206, 207, 208, 211, 212, 213, 215, 217, 220, 223, 226, 230, 231, 232, 236, 239, 240, 241, 242, 243, 244, 246, 247, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 267, 268, 270, 273, 278, 279, 280, 281, 282, 284, 285, 286, 287, 288, 289, 294, 296, 297, 298, 299, 300, 303, 304, 305, 307, 308, 309, 310, 311, 312, 313, 314, 316, 317, 319, 320, 322, 323, 326, 327, 328, 330, 331, 332, 333, 335, 337, 338, 341, 343, 345, 347, 349, 350, 351, 353, 354, 357, 358, 359, 360, 361, 362, 363, 364, 367, 368, 369, 370, 371, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396

In [38]:
# Load the collected mods and print the row(s) whose mod_id equals 81.
df = pd.read_csv("nexus_mods_mods.csv")
filtered_df = df.loc[df['mod_id'].eq(81)]

print(filtered_df)

                                     name  \
0  The Cracking City - Ayleid Player Base   

                                             summary  \
0  A completely functional player base added, ent...   

                                         description  \
0  The resources used here are from [url=http://w...   

                                         picture_url  mod_downloads  \
0  https://staticdelivery.nexusmods.com/mods/1704...           4377   

   mod_unique_downloads            uid  mod_id  game_id  allow_rating  ...  \
0                  2656  7318624272465      81     1704          True  ...   

                    updated_time     author uploaded_by  \
0  2016-10-30T02:50:37.000+00:00  Evittalex   evittalex   

                 uploaded_users_profile_url  contains_adult_content  \
0  https://www.nexusmods.com/users/16033119                   False   

      status  available                                               user  \
0  published       True  {'member_id': 1603

In [40]:
# Report every place where consecutive mod_id values (in sorted order) are
# more than GAP_THRESHOLD apart.
GAP_THRESHOLD = 10

df_sorted = df.sort_values(by='mod_id')  # Ensure sorting
gaps = df_sorted['mod_id'].diff()  # Difference to the previous ID; NaN for the first row

# Row *positions* in df_sorted where the gap is large. The positions are
# computed directly instead of taking index labels from df_sorted.index: after
# sort_values the labels are the original row labels, and passing them to
# .iloc selects the wrong rows unless df was already sorted with a default index.
gap_indices = [pos for pos, is_gap in enumerate(gaps > GAP_THRESHOLD) if is_gap]

# A gap ends at the flagged row and starts at the row just before it. The
# first row is never flagged (its diff is NaN), so pos - 1 is always >= 0.
gap_starts = df_sorted['mod_id'].iloc[[pos - 1 for pos in gap_indices]].tolist()
gap_ends = df_sorted['mod_id'].iloc[gap_indices].tolist()

# Print the results
for start, end in zip(gap_starts, gap_ends):
    print(f"Gap found between {start} and {end}, missing {end - start - 1} items.")

Gap found between 84 and 100, missing 15 items.
Gap found between 100 and 114, missing 13 items.
Gap found between 140 and 151, missing 10 items.
Gap found between 249 and 266, missing 16 items.
Gap found between 372 and 386, missing 13 items.
Gap found between 386 and 411, missing 24 items.
Gap found between 412 and 426, missing 13 items.
Gap found between 476 and 487, missing 10 items.
Gap found between 523 and 553, missing 29 items.
Gap found between 556 and 568, missing 11 items.
Gap found between 602 and 640, missing 37 items.
Gap found between 804 and 818, missing 13 items.
Gap found between 856 and 887, missing 30 items.
Gap found between 1554 and 1588, missing 33 items.
Gap found between 1599 and 1647, missing 47 items.
Gap found between 1678 and 1690, missing 11 items.
Gap found between 2089 and 2109, missing 19 items.
Gap found between 2520 and 2543, missing 22 items.
Gap found between 2546 and 2562, missing 15 items.
Gap found between 2573 and 2589, missing 15 items.
Gap fou