## Test Web Scrape of User Data and Mod Data from NexusMods

In [36]:
import json
import logging
import re
import time
import traceback
from urllib.parse import urljoin

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.firefox import GeckoDriverManager

In [37]:
#web driver for firefox 
def init_driver():
    options = webdriver.FirefoxOptions()
    options.add_argument('--headless')  # was recommended for dynamic JavaScript on web
    driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()), options=options)
    return driver

### Original Script to get an idea of what data was available

### Actual Code for Web Scraping

#### Built with Selenium and Beautiful Soup. Logging was added to help identify errors.

In [29]:
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

def clean_html(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()
    return soup

In [30]:

def extract_profile_data(cleaned_soup):
    profile_data = {}
    try:
        username_elem = cleaned_soup.find('h1', class_='typography-heading-sm sm:typography-heading-lg')
        if username_elem:
            profile_data['username'] = username_elem.text.strip()
        else:
            logging.warning("Username element not found")
        
        about_div = cleaned_soup.find('div', {'id': 'mainContent'})
        if about_div:
            profile_data['about'] = about_div.text.strip()
        else:
            logging.warning("About section not found")
        
        unique_downloads = cleaned_soup.find('p', {'data-e2eid': 'unique-download-count'})
        if unique_downloads:
            profile_data['unique_downloads'] = unique_downloads.text.strip()
        else:
            logging.warning("Unique downloads count not found")
        
        stats_divs = cleaned_soup.find_all('div', class_='bg-surface-translucent-mid flex items-center gap-x-2 rounded-lg px-4 py-2')
        for div in stats_divs:
            label = div.find('p', class_='typography-title-xs text-translucent-subdued')
            value = div.find('p', class_='typography-title-md text-translucent-strong')
            if label and value:
                key = label.text.strip().lower().replace(' ', '_')
                profile_data[key] = value.text.strip()
            else:
                logging.warning(f"Incomplete stat div: {div}")
        
        country_span = cleaned_soup.find('span', string=lambda text: text and "United States" in text)
        if country_span:
            profile_data['country'] = country_span.text.strip()
        else:
            logging.warning("Country not found")
        
        last_active = cleaned_soup.find('time', {'data-e2eid': 'last-active-date'})
        if last_active:
            profile_data['last_active'] = last_active['datetime']
        else:
            logging.warning("Last active date not found")
        
        joined_on = cleaned_soup.find('time', {'data-e2eid': 'joined-date'})
        if joined_on:
            profile_data['joined_on'] = joined_on['datetime']
        else:
            logging.warning("Joined date not found")

        logging.info(f"Extracted profile data: {profile_data}")
    except Exception as e:
        logging.error(f"Error extracting profile data: {e}")
        logging.error(traceback.format_exc())
        return None

    return profile_data



In [31]:
def extract_mods_data(cleaned_soup, username):
    mods_data = []
    try:
        mods_list = cleaned_soup.find_all('div', class_='group/mod')
        logging.info(f"Found {len(mods_list)} mod elements")
        for mod in mods_list:
            mod_url_elem = mod.find('a', {'data-e2eid': 'mod-tile-title'})
            mod_url = 'https://www.nexusmods.com' + mod_url_elem.get('href', '') if mod_url_elem else ''
            
            # Extract mod_id from the URL
            mod_id = ''
            if mod_url:
                match = re.search(r'/mods/(\d+)$', mod_url)
                if match:
                    mod_id = match.group(1)
                else:
                    logging.warning(f"Could not extract mod_id from URL: {mod_url}")
            
            mod_data = {
                'mod_id': mod_id,
                'mod_name': mod_url_elem.text.strip() if mod_url_elem else '',
                'mod_url': mod_url,
                'game': mod.find('a', {'data-e2eid': 'mod-tile-game'}).text.strip() if mod.find('a', {'data-e2eid': 'mod-tile-game'}) else '',
                'category': mod.find('a', {'data-e2eid': 'mod-tile-category'}).text.strip() if mod.find('a', {'data-e2eid': 'mod-tile-category'}) else '',
                'endorsements': mod.find('span', {'data-e2eid': 'mod-tile-endorsements'}).text.strip() if mod.find('span', {'data-e2eid': 'mod-tile-endorsements'}) else '',
                'downloads': mod.find('span', {'data-e2eid': 'mod-tile-downloads'}).text.strip() if mod.find('span', {'data-e2eid': 'mod-tile-downloads'}) else '',
                'file_size': mod.find('span', {'data-e2eid': 'mod-tile-file-size'}).text.strip() if mod.find('span', {'data-e2eid': 'mod-tile-file-size'}) else '',
                'updated_date': mod.find('time', attrs={'data-e2eid': 'mod-tile-updated'}).get('datetime') if mod.find('time', attrs={'data-e2eid': 'mod-tile-updated'}) else '',
                'uploaded_date': mod.find('time', attrs={'data-e2eid': 'mod-tile-uploaded'}).get('datetime') if mod.find('time', attrs={'data-e2eid': 'mod-tile-uploaded'}) else '',
                'description': mod.find('div', {'data-e2eid': 'mod-tile-summary'}).text.strip() if mod.find('div', {'data-e2eid': 'mod-tile-summary'}) else '',
                'mod_creator_username': username
            }
            
            update_available = mod.find('span', {'data-e2eid': 'mod-tile-update-available'})
            mod_data['update_available'] = 'Yes' if update_available else 'No'
            
            mods_data.append(mod_data)
        logging.info(f"Extracted {len(mods_data)} mods")
    except Exception as e:
        logging.error(f"Error extracting mods data: {e}")
        logging.error(traceback.format_exc())
    
    return mods_data


In [32]:
def fetch_cleaned_page_source(driver, url, max_retries=3):
    for attempt in range(max_retries):
        try:
            driver.get(url)
            WebDriverWait(driver, 30).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
            time.sleep(10)  # Increased delay
            page_source = driver.page_source
            cleaned_soup = clean_html(page_source)
            if cleaned_soup.find('h1'):  # Check if main content is loaded
                return cleaned_soup
            else:
                logging.warning(f"Main content not found, retrying... (Attempt {attempt + 1})")
        except TimeoutException:
            logging.warning(f"Timeout occurred, retrying... (Attempt {attempt + 1})")
        except Exception as e:
            logging.error(f"Error fetching page source: {e}")
            logging.error(traceback.format_exc())
    
    logging.error(f"Failed to fetch page after {max_retries} attempts")
    return None


In [38]:
def process_user(username):
    driver = init_driver()
    profile_url = f'https://next.nexusmods.com/profile/{username}'
    #print(profile_url)
    profile_soup = fetch_cleaned_page_source(driver, profile_url)
    #print(profile_soup)
    if profile_soup:
        profile_data = extract_profile_data(profile_soup)
        logging.info(f"Profile data for {username}: {profile_data}")
    else:
        profile_data = None
        logging.warning(f"Failed to fetch profile data for {username}")

    mods_url = f'https://next.nexusmods.com/profile/{username}/mods'
    mods_soup = fetch_cleaned_page_source(driver, mods_url)
    if mods_soup:
        mods_data = extract_mods_data(mods_soup, username)
        logging.info(f"Mods data for {username}: {len(mods_data)} mods found")
    else:
        mods_data = []
        logging.warning(f"Failed to fetch mods data for {username}")

    driver.quit()
    return profile_data, mods_data


In [39]:
def save_to_csv(user_data_list, mods_data_list):
    if user_data_list:
        user_df = pd.DataFrame(user_data_list)
        user_df.to_csv('user_profiles.csv', index=False)
        logging.info("User profile data saved to user_profiles.csv")
    else:
        logging.warning("No user profile data to save")

    if mods_data_list:
        mods_df = pd.DataFrame(mods_data_list)
        mods_df.to_csv('mods_data.csv', index=False)
        logging.info("Mods data saved to mods_data.csv")
    else:
        logging.warning("No mods data to save")

def process_multiple_users(usernames):
    user_data_list = []
    mods_data_list = []

    for username in usernames:
        logging.info(f"Processing {username}...")
        profile_data, mods_data = process_user(username)
        
        if profile_data:
            user_data_list.append(profile_data)
        if mods_data:
            mods_data_list.extend(mods_data)

    save_to_csv(user_data_list, mods_data_list)


In [40]:
if __name__ == "__main__":
    usernames = ['DungeonsAndSouls'] 
    process_multiple_users(usernames)

2024-09-24 11:05:01,364 - INFO - Processing DungeonsAndSouls...
2024-09-24 11:05:03,722 - INFO - Get LATEST geckodriver version for 130.0 firefox
2024-09-24 11:05:04,145 - INFO - Get LATEST geckodriver version for 130.0 firefox
2024-09-24 11:05:04,241 - INFO - Driver [C:\Users\Nico\.wdm\drivers\geckodriver\win64\v0.35.0\geckodriver.exe] found in cache
2024-09-24 11:05:22,190 - INFO - Extracted profile data: {'about': "PremiumDungeonsAndSoulsVerified mod author408,231 Unique downloadsTrackGive KudosMessageEndorsements Given358Profile Views69,507Kudos212United StatesLast active on 24 Sept 2024Joined on 28 Jun 2014BlockReportAbout MeMods11Collections0Media10AboutI solo develop mods as a creative outlet and to gain familiarity with other gaming engines outside of Unreal Engine 5. Currently, I'm pursuing a bachelor's degree in Game Development. I also Forever DM for D&D/Pathfinder 2E from time to time. Wondering what I'm working on? All modding WIPs get posted on my social media platform ac