In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import xml.etree.ElementTree as ET
import re

# Entry point for the crawl: OpenCritic's top-level sitemap index
sitemap_url = "https://opencritic.com/sitemap.xml"

# Function to retrieve all sitemap links from the main sitemap
def get_sitemap_links(sitemap_url, timeout=10):
    """Return the URLs of the game sub-sitemaps listed in the main sitemap.

    Args:
        sitemap_url: URL of the main sitemap XML document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``<loc>`` URLs that contain ``'sitemap_games'``. An empty
        list is returned (after printing a message) when the request fails,
        the status code is not 200, or the response is not well-formed XML.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(sitemap_url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Could not retrieve sitemap. Error: {e}")
        return []

    if response.status_code != 200:
        print(f"Could not retrieve sitemap. Status code: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"Could not parse sitemap. Error: {e}")
        return []

    namespaces = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    # elem.text is None for an empty <loc/>, so test it before the substring check.
    return [
        elem.text
        for elem in root.findall('.//ns:loc', namespaces)
        if elem.text and 'sitemap_games' in elem.text
    ]

# Function to retrieve game links from a specific sitemap
def get_game_links_from_sitemap(sitemap_link, timeout=10):
    """Return the game page URLs listed in one game sitemap.

    Args:
        sitemap_link: URL of a sitemap XML document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``<loc>`` URLs, excluding any that contain ``"/media"`` or
        ``"/reviews"``. An empty list is returned (after printing a message)
        when the request fails, the status code is not 200, or the response
        is not well-formed XML.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(sitemap_link, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Could not retrieve sitemap: {sitemap_link}. Error: {e}")
        return []

    if response.status_code != 200:
        print(f"Could not retrieve sitemap: {sitemap_link}. Status code: {response.status_code}")
        return []

    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"Could not parse sitemap: {sitemap_link}. Error: {e}")
        return []

    namespaces = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
    # Skip empty <loc/> elements (text is None) and the unwanted
    # "/media" and "/reviews" links.
    return [
        elem.text
        for elem in root.findall('.//ns:loc', namespaces)
        if elem.text and "/media" not in elem.text and "/reviews" not in elem.text
    ]

# Function to parse data from a game page
def parse_game_page(url, timeout=10):
    """Fetch one game page and extract its details.

    Args:
        url: URL of the game page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys "Game Title", "Game Genre", "Pricing",
        "Game Library Size", "Publisher", "Release Date", "Platform",
        "Rating" and "Number of Rating". A field that cannot be found is the
        string "n/a"; "Publisher" and "Platform" are lists of strings when
        found, and "Number of Rating" is an int when found.
        Returns None (after printing a message) when the request fails or
        the status code is not 200.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from {url}: {e}")
        return None

    # Do not scrape error pages; this mirrors the status check done by the
    # sitemap functions.
    if response.status_code != 200:
        print(f"Error fetching data from {url}: status code {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the game title
    title_tag = soup.find('h1', class_='my-2 my-md-4')
    title = title_tag.get_text(strip=True) if title_tag else "n/a"

    # Genre, price and storage are each read from the element that follows
    # the matching label text.
    genre = _text_after_label(soup, "Genre")
    pricing = _text_after_label(soup, "Price")
    library_size = _text_after_label(soup, "Storage")

    # Extract publisher information
    publisher = _span_texts(soup.find('div', {'class': 'companies'}))

    # Extract platform information
    platform_div = soup.find('div', {'class': 'platforms'})
    platform = _span_texts(platform_div)

    # Extract release date: the text of the platforms block with the
    # "Release Date:" label removed, cut at the first '-'.
    release_date = "n/a"
    if platform_div and "Release Date:" in platform_div.get_text():
        release_text = platform_div.get_text(strip=True).replace("Release Date:", "").strip()
        release_date = release_text.split('-')[0].strip()

    # Extract rating
    rating_tag = soup.find('div', class_="inner-orb")
    rating = rating_tag.get_text(strip=True) if rating_tag else "n/a"

    # Extract number of reviews: first run of digits in the text of the
    # first link whose href contains "/reviews".
    num_reviews = "n/a"
    reviews_tag = soup.find('a', href=lambda href: href and "/reviews" in href)
    if reviews_tag:
        match = re.search(r'\d+', reviews_tag.get_text())
        if match:
            num_reviews = int(match.group())

    # Return parsed data as a dictionary
    return {
        "Game Title": title,
        "Game Genre": genre,
        "Pricing": pricing,
        "Game Library Size": library_size,
        "Publisher": publisher,
        "Release Date": release_date,
        "Platform": platform,
        "Rating": rating,
        "Number of Rating": num_reviews,
    }


def _text_after_label(soup, label):
    """Return the text of the element following the text node *label*, or "n/a"."""
    # `string=` is the current name of the deprecated `text=` keyword.
    label_node = soup.find(string=label)
    if not label_node:
        return "n/a"
    value_node = label_node.find_next()
    # find_next() returns None when the label is the last thing in the document.
    return value_node.get_text(strip=True) if value_node is not None else "n/a"


def _span_texts(container):
    """Return the text of every <span> inside *container*, or "n/a" if there are none."""
    spans = container.find_all('span') if container else None
    return [span.get_text(strip=True) for span in spans] if spans else "n/a"

# Step 1: Retrieve all sitemap links
sitemap_links = get_sitemap_links(sitemap_url)

# Step 2: Get game links from the first sitemap.
# get_sitemap_links returns [] when the main sitemap cannot be retrieved, so
# indexing [0] unconditionally would raise IndexError; fall back to an empty
# list of game links instead.
if sitemap_links:
    game_links = get_game_links_from_sitemap(sitemap_links[0])
else:
    print("No game sitemaps found; nothing to scrape.")
    game_links = []

# Step 3: Parse each game page to extract information
games_data = []
for link in game_links:
    game_data = parse_game_page(link)
    # Skip failed requests (None) and pages without a title
    if game_data and game_data['Game Title'] != "n/a":
        games_data.append(game_data)

# Step 4: Convert parsed data into a DataFrame
df_games = pd.DataFrame(games_data)

# Step 5: Drop rows in which every value is NaN.
# Missing fields are stored as the string "n/a", not NaN, so rows that only
# contain such placeholders are kept.
df_games = df_games.dropna(how='all')

# Step 6: Save the data to a CSV file
df_games.to_csv('game_data.csv', index=False, encoding='utf-8')



# Explanation of the Code

## **Imports**
- **`requests`**: Used for making HTTP requests to retrieve web pages or XML sitemaps.
- **`BeautifulSoup` (from `bs4`)**: Used to parse and extract data from HTML content.
- **`pandas`**: Used to organize extracted data into a structured format (DataFrame) for analysis and storage.
- **`xml.etree.ElementTree`**: Used to parse and navigate XML files (like sitemaps).
- **`re`**: Used for regular expressions, especially to extract numeric patterns (e.g., number of reviews).

---

## **Constants**
- **`sitemap_url`**:
  - Points to the main sitemap of `opencritic.com`.
  - This sitemap is an XML file containing links to sub-sitemaps or specific web pages.

---

## **Functions**

### **1. `get_sitemap_links`**
- **Purpose**: Extract links to all sub-sitemaps related to games from the main sitemap.
- **Logic**:
  1. Sends a GET request to `sitemap_url`.
  2. Checks if the request was successful (`status_code == 200`).
  3. Parses the XML content using `ElementTree`.
  4. Finds all `<loc>` tags using XML namespaces.
  5. Filters the links to only include those containing `'sitemap_games'`.
- **Returns**: A list of URLs pointing to game-related sitemaps.

---

### **2. `get_game_links_from_sitemap`**
- **Purpose**: Extract individual game page links from a specific game-related sitemap.
- **Logic**:
  1. Sends a GET request to a sitemap URL.
  2. Parses the XML content and extracts all `<loc>` tags.
  3. Filters out unwanted URLs containing `/media` or `/reviews`.
- **Returns**: A list of URLs pointing to game pages.

---

### **3. `parse_game_page`**
- **Purpose**: Extract detailed information about a game from its webpage.
- **Logic**:
  1. Sends a GET request to the game page URL.
  2. Parses the HTML using `BeautifulSoup`.
  3. Extracts specific details:
     - **Title**: From `<h1>` with specific classes.
     - **Genre**: Finds the text "Genre" and retrieves the following element.
     - **Pricing**: Finds the text "Price" and retrieves the following element.
     - **Library Size**: Finds the text "Storage" and retrieves the following element.
     - **Publisher**: Extracts all publishers listed under the `companies` section.
     - **Platform**: Extracts platforms listed under the `platforms` section.
     - **Release Date**: Extracts the release date if mentioned in the `platforms` section.
     - **Rating**: Extracts the rating text from the `<div>` with class `inner-orb` (kept as a string, not converted to a number).
     - **Number of Reviews**: Finds the first link whose `href` contains `/reviews` and uses a regex to extract the first integer from its text.
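  4. Catches request errors (`requests.exceptions.RequestException`), prints a message, and returns `None`. Other errors raised while parsing are not caught.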
- **Returns**: A dictionary containing the extracted game data (missing fields are set to `"n/a"`), or `None` if the request fails.

---

## **Main Logic**

### **Step 1: Retrieve Sitemap Links**
- Calls `get_sitemap_links` to fetch all game-related sitemap URLs.

### **Step 2: Retrieve Game Links**
- Uses the first sitemap URL from Step 1.
- Calls `get_game_links_from_sitemap` to extract all game page URLs.

### **Step 3: Parse Game Pages**
- Iterates over each game URL from Step 2.
- Calls `parse_game_page` to extract game details.
- Skips pages for which `parse_game_page` returned `None` and games with missing titles (`"n/a"`).

### **Step 4: Create a DataFrame**
- Converts the list of dictionaries (`games_data`) into a Pandas DataFrame.

### **Step 5: Drop Missing Rows**
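- Removes rows where all fields are `NaN` using `dropna(how='all')`. Missing fields are stored as the string `"n/a"`, not `NaN`, so rows containing only such placeholders are not removed.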

### **Step 6: Save to CSV**
- Exports the DataFrame to a CSV file named `game_data.csv`.

