In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

URL = "https://en.wikipedia.org/wiki/List_of_best-selling_video_games"

# Send a browser-like User-Agent header so the request presents itself as a
# regular browser instead of using the requests library's default value.
headers_req = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/130.0.0.0 Safari/537.36"
    )
}

# Fetch the page and keep the response object.
# The explicit timeout (in seconds) matters: requests applies no timeout by
# default, so a stalled connection would otherwise block this cell forever.
resp = requests.get(URL, headers=headers_req, timeout=30)
resp.raise_for_status()  # Raises requests.HTTPError if the request failed (4xx/5xx)
print(resp.status_code)
print(resp.url)

# Parse the downloaded HTML with Python's built-in "html.parser" backend
soup = BeautifulSoup(resp.text, "html.parser")

# Get the first <table> with class "wikitable"
table = soup.find("table", class_="wikitable")
if table is None:
    # soup.find returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError(f'No <table class="wikitable"> found at {resp.url}')

# The column names are the <th> cells of the table's first <tr> (table row)
header_row = table.find("tr")
if header_row is None:
    raise ValueError("The wikitable contains no <tr> rows")
headers = [th.get_text(strip=True) for th in header_row.find_all("th")]
if not headers:
    # Without header cells every data row below would be rejected by the
    # length check, silently producing an empty DataFrame.
    raise ValueError("The first row of the wikitable has no <th> header cells")

# Collect the text of every data row (all rows after the header row)
rows_data = []
for tr in table.find_all("tr")[1:]:
    # Data rows may mix <th> (row header) and <td> (table data) cells
    cells = tr.find_all(["th", "td"])
    if not cells:
        continue
    row = [cell.get_text(separator=" ", strip=True) for cell in cells]
    # Skip rows whose number of cells differs from the header's.
    # NOTE: this also drops rows that are shorter because a cell in an
    # earlier row spans into them (rowspan); those rows are not recovered.
    if len(row) != len(headers):
        continue
    rows_data.append(row)

# Build a pandas DataFrame with one column per header and one row per kept <tr>
df = pd.DataFrame(rows_data, columns=headers)

def _find_column(columns, keyword):
    """Return the first column name that contains *keyword*, ignoring case.

    Args:
        columns: iterable of column names (strings).
        keyword: lowercase substring to look for.

    Raises:
        KeyError: if no column name contains the keyword.
    """
    for name in columns:
        if keyword in name.lower():
            return name
    raise KeyError(f"No column containing {keyword!r} among {list(columns)}")


# Parsed sales figures at or above this value are treated as raw unit counts
# (e.g. "82,900,000") rather than as numbers already expressed in millions.
RAW_UNITS_THRESHOLD = 100_000

# Keep only the first 100 rows. The explicit copy makes df an independent
# frame, so adding columns below cannot raise pandas' SettingWithCopyWarning.
df = df.head(100).copy()

# Derive "Sales_million" from the first column whose name contains "sales"
sales_col = _find_column(df.columns, "sales")

# Take the first number in each cell, allowing thousands separators so that
# "82,900,000" is read as 82900000 rather than being cut off at the first
# comma. Unparseable cells become NaN instead of raising.
sales_number = pd.to_numeric(
    df[sales_col]
    .str.extract(r"(\d[\d,]*(?:\.\d+)?)", expand=False)
    .str.replace(",", "", regex=False),
    errors="coerce",
).astype(float)

# Normalise to millions: small values are assumed to be in millions already,
# large ones are raw unit counts and get scaled down.
df["Sales_million"] = sales_number.where(
    sales_number < RAW_UNITS_THRESHOLD, sales_number / 1_000_000
)

# Derive "Year" from the first column whose name contains "release":
# the first 4-digit run in each cell, kept as a string (NaN when absent).
date_col = _find_column(df.columns, "release")
df["Year"] = (
    df[date_col]
    .str.extract(r"(\d{4})", expand=False)
)

# Save the DataFrame to a CSV file, without the index column
df.to_csv("games.csv", index=False)

print("Done, saved to games.csv")


200
https://en.wikipedia.org/wiki/List_of_best-selling_video_games
Done, saved to games.csv
