In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import os

# Load the SteamDB export into a DataFrame (no column is set as the index)
df = pd.read_json(r"C:/Users/gueld/Desktop/Portfolio Projekt/Daten/github/steamdb.json")

# Keep only the rows whose 'voiceovers' value is missing
missing_voiceovers = df.loc[df['voiceovers'].isna()]

# The 'store_url' values of the rows that lack a 'voiceovers' value
missing_urls = missing_voiceovers.loc[:, 'store_url']

# Collect the store URLs of all database entries that have no 'voiceovers' value
def load_missing_urls(database_path):
    """Return the 'store_url' values of rows whose 'voiceovers' entry is missing.

    Args:
        database_path: Path of the JSON file, passed to ``pd.read_json``.

    Returns:
        A list with the 'store_url' value of every row where 'voiceovers'
        is null, in the row order of the loaded DataFrame.
    """
    frame = pd.read_json(database_path)
    lacks_voiceovers = frame['voiceovers'].isnull()
    return frame.loc[lacks_voiceovers, 'store_url'].tolist()

# Scrape the voice-over (audio) language support from a Steam store page
def scrape_voiceovers(url, timeout=30):
    """Return the audio availability per language listed on a store page.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. Every
    data row of the table with class ``game_language_options`` contributes one
    entry: the text of the first cell is used as the language name, and the
    third cell (treated as the audio column) decides the value.

    Args:
        url: Address of the store page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so one stalled
            connection would block the whole scraping run.

    Returns:
        A dict mapping language name to ``'Available'`` or
        ``'Not Available'``. The dict is empty when the page contains no
        language table. ``None`` is returned when the page could not be
        loaded (network error, timeout, or a status code other than 200).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures the same way as a bad status code so a
        # single unreachable page does not abort the caller's loop.
        print(f"Fehler beim Laden der Seite {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Fehler beim Laden der Seite: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    voiceovers = {}

    # Find the table with class 'game_language_options'
    language_table = soup.find('table', class_='game_language_options')
    if language_table is None:
        return voiceovers

    for row in language_table.find_all('tr')[1:]:  # Skip the header row
        cols = row.find_all('td')
        if len(cols) <= 2:  # Not enough columns to contain the audio cell
            continue
        language = cols[0].get_text(strip=True)
        # Look the marker span up once; a cell without a span counts as empty.
        marker = cols[2].find('span')
        audio = marker.get_text(strip=True) if marker is not None else ''
        voiceovers[language] = 'Available' if audio == '✔' else 'Not Available'

    return voiceovers

# Write one batch of scraped data to its own numbered JSON file
def save_to_json(data, file_index, output_dir):
    """Serialize ``data`` to ``steam_voiceovers_<file_index>.json``.

    Args:
        data: JSON-serializable object to store.
        file_index: Number inserted into the file name.
        output_dir: Directory in which the file is created.

    The file is written as UTF-8 with a four-space indent and without
    escaping non-ASCII characters; the resulting path is printed afterwards.
    """
    filename = f'steam_voiceovers_{file_index}.json'
    destination = os.path.join(output_dir, filename)
    with open(destination, mode='w', encoding='utf-8') as out:
        json.dump(data, out, ensure_ascii=False, indent=4)
    print(f"Gespeichert: {destination}")

# Paths and settings
database_path = r"C:/Users/gueld/Desktop/Portfolio Projekt/Daten/github/steamdb.json"
output_dir = r"C:/Users/gueld/Desktop/Portfolio Projekt/Daten/steam_scrape_json/neu/voiceovers"
os.makedirs(output_dir, exist_ok=True)

# Load the store URLs of all database entries without a 'voiceovers' value
urls = load_missing_urls(database_path)

# Results are buffered and written out as one .json file per 500 entries
scraped_data = []
file_index = 1
batch_size = 500

# Scrape every URL. The final flush lives in `finally` so that the entries
# buffered since the last full batch are still written when the loop is
# interrupted or a request raises, instead of being lost.
try:
    for url in urls:
        print(f"Scraping URL: {url}")
        data = scrape_voiceovers(url)
        if data:
            scraped_data.append({'store_url': url, 'voiceovers': data})

        # Write the buffer to a JSON file once the batch size is reached
        if len(scraped_data) >= batch_size:
            save_to_json(scraped_data, file_index, output_dir)
            scraped_data = []  # Start an empty buffer for the next batch
            file_index += 1
finally:
    # Store whatever is left in the buffer in a last JSON file
    if scraped_data:
        save_to_json(scraped_data, file_index, output_dir)


Scraping URL: https://store.steampowered.com/app/20
Scraping URL: https://store.steampowered.com/app/30
Scraping URL: https://store.steampowered.com/app/40
Scraping URL: https://store.steampowered.com/app/50
Scraping URL: https://store.steampowered.com/app/60
Scraping URL: https://store.steampowered.com/app/80
Scraping URL: https://store.steampowered.com/app/130
Scraping URL: https://store.steampowered.com/app/240
Scraping URL: https://store.steampowered.com/app/280
Scraping URL: https://store.steampowered.com/app/300
Scraping URL: https://store.steampowered.com/app/320
Scraping URL: https://store.steampowered.com/app/340
Scraping URL: https://store.steampowered.com/app/360
Scraping URL: https://store.steampowered.com/app/630
Scraping URL: https://store.steampowered.com/app/1002
Scraping URL: https://store.steampowered.com/app/1200
Scraping URL: https://store.steampowered.com/app/1300
Scraping URL: https://store.steampowered.com/app/1500
Scraping URL: https://store.steampowered.com/app

In [4]:
# Dateien mergen: 

import os
import json

# Load every JSON file of a directory and merge their contents into one file
def merge_json_files(input_dir, output_file):
    """Concatenate the contents of all ``.json`` files in ``input_dir``.

    Each input file is parsed and its content appended to one list with
    ``list.extend``, so every file is expected to hold a JSON array. The
    combined list is written to ``output_file`` as UTF-8 with a four-space
    indent, and the output path is printed afterwards.

    Args:
        input_dir: Directory that is scanned (non-recursively) for files
            whose name ends with ``.json``.
        output_file: Path of the merged JSON file to create or overwrite.

    Files are processed in sorted (lexicographic) name order, because
    ``os.listdir`` itself returns names in arbitrary order. If
    ``output_file`` lies inside ``input_dir`` it is skipped as an input, so
    running the merge again does not fold the previous result into itself.
    """
    merged_data = []
    output_key = os.path.normcase(os.path.abspath(output_file))

    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(input_dir, filename)
        # Never read the merge target back in as one of its own sources.
        if os.path.normcase(os.path.abspath(file_path)) == output_key:
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            merged_data.extend(json.load(f))

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=4)
    print(f"Gespeichert: {output_file}")

# Directory holding the per-batch files and the path of the merged result
input_dir = r"C:\Users\gueld\Desktop\Portfolio Projekt\Daten\steam_scrape_json\neu\voiceovers"
output_file = r"C:\Users\gueld\Desktop\Portfolio Projekt\Daten\steam_scrape_json\neu\merged_voiceovers.json"

# Merge the JSON files
merge_json_files(input_dir=input_dir, output_file=output_file)


Gespeichert: C:\Users\gueld\Desktop\Portfolio Projekt\Daten\steam_scrape_json\neu\merged_voiceovers.json
