In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os
import pandas as pd

def load_missing_urls(database_path):
    """Return the store URLs of database rows that lack SteamSpy values.

    The JSON file at *database_path* is read into a DataFrame. A row is
    selected when its ``stsp_owners`` or its ``stsp_mdntime`` value is null.

    Args:
        database_path: Path of the JSON database file; it must provide the
            columns ``stsp_owners``, ``stsp_mdntime`` and ``store_url``.

    Returns:
        A list with the ``store_url`` value of every selected row, in file
        order. Rows whose ``store_url`` is itself null are left out, because
        there is nothing to scrape for them.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    df = pd.read_json(database_path)
    needs_scrape = df['stsp_owners'].isna() | df['stsp_mdntime'].isna()
    # A null URL would only surface later as a non-string list entry, so it
    # is filtered out here.
    has_url = df['store_url'].notna()
    return df.loc[needs_scrape & has_url, 'store_url'].tolist()

def generate_steamspy_url(steam_url):
    """Build the SteamSpy app URL that corresponds to a Steam store URL.

    The app id is the text that follows ``/app/`` up to the next ``/``,
    ``?`` or ``#``, so a query string or fragment appended directly to the
    id (``.../app/440?snr=1``) does not end up in the generated URL.

    Args:
        steam_url: Steam store URL containing an ``/app/<id>`` segment.

    Returns:
        ``https://steamspy.com/app/<id>``.

    Raises:
        IndexError: If *steam_url* contains no ``/app/`` segment.
    """
    # Indexing [1] raises IndexError when '/app/' is absent.
    app_id = steam_url.split('/app/')[1]
    # Trim at whichever URL delimiter ends the id.
    for delimiter in ('/', '?', '#'):
        app_id = app_id.split(delimiter, 1)[0]
    return f'https://steamspy.com/app/{app_id}'

def _extract_label_value(soup, label):
    """Return the stripped text directly following the ``<b>`` tag *label*.

    Returns ``'N/A'`` when no ``<b>`` element with exactly that text exists
    or when nothing follows it.
    """
    label_tag = soup.find('b', string=label)
    if label_tag is None:
        return 'N/A'
    sibling = label_tag.next_sibling
    if not sibling:
        return 'N/A'
    # The sibling is normally a text node; it can also be an element, which
    # has no usable ``strip`` method, so its text is read instead.
    if isinstance(sibling, str):
        return sibling.strip()
    return sibling.get_text().strip()


def scrape_steamspy_page(url, timeout=30):
    """Download one SteamSpy page and extract name, owners and playtime.

    Args:
        url: URL of the page to request.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the run indefinitely.

    Returns:
        A dict with the keys ``name``, ``owners``, ``mdntime`` and
        ``store_url``. Values that cannot be found on the page are
        ``'N/A'``. ``None`` is returned when the request fails or the
        response status is not 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat network failures like a bad status code: report and skip.
        print(f"Fehler beim Laden der Seite: {exc}")
        return None

    if response.status_code != 200:
        print(f"Fehler beim Laden der Seite: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    title_element = soup.find('h3')

    return {
        'name': title_element.get_text().strip() if title_element else 'N/A',
        'owners': _extract_label_value(soup, 'Owners:'),
        # NOTE(review): the key is 'mdntime' but the label searched for is
        # an *average* playtime - confirm this is the intended metric.
        'mdntime': _extract_label_value(soup, 'Average playtime in last 2 weeks:'),
        # NOTE(review): this stores the requested URL itself under
        # 'store_url' - confirm that downstream code expects that.
        'store_url': url,
    }

def save_to_json(data, file_index, output_dir):
    """Write *data* as pretty-printed JSON into a numbered batch file.

    The file is named ``steamspy_data_<file_index>.json`` and is created in
    *output_dir*; an existing file of that name is overwritten. The output
    is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters unescaped. The path written is printed afterwards.

    Args:
        data: JSON-serialisable object to store.
        file_index: Number used in the file name.
        output_dir: Existing directory that receives the file.
    """
    batch_name = f'steamspy_data_{file_index}.json'
    target = os.path.join(output_dir, batch_name)
    with open(target, mode='w', encoding='utf-8') as handle:
        json.dump(data, handle, indent=4, ensure_ascii=False)
    print(f"Gespeichert: {target}")

# Paths and settings
database_path = r"C:/Users/gueld/Desktop/Portfolio Projekt/Daten/github/steamdb.json"
output_dir = r"C:/Users/gueld/Desktop/Portfolio Projekt/Daten/steam_scrape_json/neu/steamspy/stsp_ownerns_mdntime_neu2"
os.makedirs(output_dir, exist_ok=True)

# Store URLs of all database rows with a missing 'stsp_owners' or
# 'stsp_mdntime' value
urls = load_missing_urls(database_path)

# Results are buffered in memory and written out in numbered batch files
scraped_data = []
file_index = 1
batch_size = 500

try:
    for steam_url in urls:
        try:
            steamspy_url = generate_steamspy_url(steam_url)
        except (IndexError, AttributeError):
            # IndexError: the URL has no '/app/' segment.
            # AttributeError: the entry is not a string (e.g. a null value).
            # One bad entry should not abort the whole run.
            print(f"Überspringe ungültige URL: {steam_url!r}")
            continue

        print(f"Scrape URL: {steamspy_url}")
        data = scrape_steamspy_page(steamspy_url)
        if data:
            scraped_data.append(data)

        # Write a batch file as soon as the buffer is full
        if len(scraped_data) >= batch_size:
            save_to_json(scraped_data, file_index, output_dir)
            scraped_data = []  # start the next batch with an empty buffer
            file_index += 1
finally:
    # Write whatever is still buffered - after a normal finish as well as
    # after an error or a manual interrupt - so scraped records are not lost.
    if scraped_data:
        save_to_json(scraped_data, file_index, output_dir)