In [None]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re

> The code block below is for running in Google Colab. It accesses your Google Drive so that outputs can be saved directly to the designated location.

In [None]:
# Colab-only: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths used in this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def _require(node, what):
  """Return `node` unchanged, or raise if the page lacks the expected element."""
  if node is None:
    raise Exception(f"Missing {what}")
  return node


def scrape_data(url, timeout=30):
  """Scrape a single Steam store page into a one-row DataFrame.

  Args:
    url: Address of the store page to download.
    timeout: Seconds to wait for the server before giving up (passed to
      `requests.get`). Without it a stalled connection blocks forever.

  Returns:
    A one-row `pd.DataFrame` with the columns "Game Title", "Game Genre",
    "Pricing", "Publisher", "Release Date", "Platform", "Rating" and
    "Number of Ratings". "Platform" is always 'n/a'. "Pricing" is None when
    the page has no non-demo purchase block.

  Raises:
    Exception (or a subclass): if the download fails, the page is a DLC,
      the page does not have enough user reviews, or an element the scraper
      relies on is missing.
  """
  # Missing values are None (not an empty list) so they become NaN / an empty
  # CSV field instead of the literal "[]".
  data = {"Game Title" : None,
          "Game Genre" : None,
          "Pricing" : None,
          "Publisher" : None,
          "Release Date" : None,
          "Platform" : 'n/a',
          "Rating" : None,
          "Number of Ratings" : None
        }
  try:
    response = requests.get(url, timeout=timeout)
  except Exception as e:
    # Keep the generic Exception type, but chain the original error so the
    # real cause stays visible in the traceback.
    raise Exception(e) from e

  soup = bs(response.text, 'html.parser')

  # whether it's a DLC
  if soup.find("div", {"class" : "game_area_bubble game_area_dlc_bubble"}):
    raise Exception("Not a game")

  # ratings
  review_section = _require(soup.find("div", id='userReviews'), "user review section")
  summary_rows = review_section.find_all("div", class_="user_reviews_summary_row")
  if not summary_rows:
    raise Exception("Missing user review summary")

  # With a single summary row use it; otherwise use the second row.
  target_row = summary_rows[0] if len(summary_rows) == 1 else summary_rows[1]

  tooltip_text = target_row.get("data-tooltip-html")

  if (not tooltip_text
      or tooltip_text == "Need more user reviews to generate a score"
      or tooltip_text == "No user reviews"):
    raise Exception("Not enough user reviews")

  # Prefer the explicit "<pct>% of the <count>" shape so that any other number
  # in the tooltip cannot be glued onto the review count.
  summary = re.search(r"(\d+)%\s+of\s+the\s+(\d[\d,]*)", tooltip_text)
  if summary:
    rating_text = summary.group(1)
    count_text = summary.group(2).replace(",", "")
  else:
    # Fallback for unexpected wording: first number is the rating, the
    # remaining digit groups form the (comma-separated) count.
    digits = re.findall(r"\d+", tooltip_text)
    if len(digits) < 2:
      raise Exception("Unrecognized review summary")
    rating_text = digits[0]
    count_text = "".join(digits[1:])

  data["Rating"] = float(rating_text)
  data["Number of Ratings"] = int(count_text)

  # price
  # omit demo, early access, etc. by raising so the caller skips the game
  for purchase_block in soup.find_all("div", {"class" : "game_area_purchase_game"}):
    if "demo_above_purchase" in purchase_block.get("class", []):
      continue
    price_div = (purchase_block.find("div", {"class" : "game_purchase_price price"})
                 or purchase_block.find("div", {"class" : "discount_original_price"}))
    data["Pricing"] = _require(price_div, "price").text.strip()
    break

  # release date
  release_div = _require(soup.find("div", {"class" : "date"}), "release date")
  data["Release Date"] = release_div.text.strip().replace(',','')

  # genre
  genre_span = _require(soup.find("span", {"data-panel" :'{"flow-children":"row"}'}), "genre")
  data["Game Genre"] = genre_span.text.strip()

  # publisher
  # NOTE(review): this reads the element with id "developers_list", so the
  # column may actually hold the developer rather than the publisher - confirm.
  publisher_div = _require(
      soup.find("div", {"id" : "developers_list", "class" : "summary column"}),
      "publisher")
  data["Publisher"] = publisher_div.text.strip()

  # title: the text node that directly follows the "Title:" label
  title_label = _require(soup.find("b", string="Title:"), "title label")
  data["Game Title"] = _require(title_label.next_sibling, "title text").strip()

  return pd.DataFrame([data])


In [None]:
# On the left toolbar, navigate to folder icon > drive/MyDrive/path/to/your/folder
# and point `path` at the text file that lists the game URLs, one per line.

path = "/content/drive/MyDrive/steam_scraping/data/game_urls_filtered.txt"

In [None]:
# Split one person's share of the URL list (indices 103322..154983) into
# three consecutive sub-runs. The share holds 51662 URLs, which is not a
# multiple of 3, so the last sub-run absorbs the remainder; otherwise the
# final URLs of the share would never be scraped.
part_start, part_stop = 103322, 154984
n_chunks = 3

step = (part_stop - part_start) // n_chunks
print(step)

chunks = []
for i in range(n_chunks):
  chunk_start = part_start + i * step
  chunk_stop = part_stop if i == n_chunks - 1 else chunk_start + step
  chunks.append(range(chunk_start, chunk_stop))
  print(f"range({chunk_start}, {chunk_stop})")

17220
range(103322, 120542)
range(120542, 137762)
range(137762, 154982)


In [None]:
# Read the URL list into memory, one entry per line of the file.
with open(path, "r") as f:
  urls = f.read().splitlines()

print(len(urls))

# Total: 154984 links
# Each person runs: 51661 links
range1 = range(51_661)
range2 = range(51_661, 103_322)
range3 = range(103_322, 154_984)

def run(r, index, save=True):
  """Scrape the URLs at the given indices and optionally save them as CSV.

  Args:
    r: Range (or other sized iterable) of indices into the notebook-level
      `urls` list.
    index: Label inserted into the output file name
      (`part_{index}_steam_data.csv`).
    save: When True, write the combined table to the Drive output folder.

  Returns:
    None. Progress, the number of skipped URLs, the table shape and a preview
    are printed. If no URL could be scraped, nothing is saved.
  """
  frames = []
  skipped = 0

  # len() is correct for stepped ranges too, unlike stop - start.
  total = len(r)
  bar_length = 40

  for done, i in enumerate(r, start=1):
    progress = done / total
    filled = int(progress * bar_length)
    bar = "#" * filled + "-" * (bar_length - filled)
    print(f"\rProgress: [{bar}] {int(progress * 100)}%", end="")
    try:
      frames.append(scrape_data(urls[i]))
    except Exception:
      # Best effort: pages that cannot be scraped are skipped, but counted
      # so a run that silently loses most of its URLs is noticeable.
      skipped += 1
      continue

  print()
  print(f"Skipped {skipped} of {total} URLs")

  if not frames:
    # pd.concat raises on an empty list; report instead of crashing.
    print("No data scraped; nothing to save")
    return

  # ignore_index gives rows a 0..n-1 index instead of every row being 0.
  dataset = pd.concat(frames, ignore_index=True)

  if save:

    output_dir = "/content/drive/MyDrive/steam_scraping/data/outputs/"

    filepath = output_dir + f"part_{index}_steam_data.csv"

    dataset.to_csv(filepath, index=False)

    print(f"data saved to {filepath}")

  print(f"Dimension: {dataset.shape}")
  print(dataset.head())

154984


In [None]:
# Smoke test: scrape the first 50 URLs and print a preview without writing a CSV.
run(range(0,50), "none", save=False)

Progress: [########################################] 100%
Dimension: (21, 8)
                           Game Title  \
0                            Mycelium   
0                       Relic Keepers   
0                              OUTBRK   
0         Whipseey and the Lost Atlas   
0  TT Isle Of Man: Ride on the Edge 3   

                                          Game Genre Pricing  \
0                    Adventure, Indie, RPG, Strategy   $5.99   
0                           Action, Adventure, Indie   $0.99   
0  Action, Adventure, Simulation, Strategy, Early...  $34.99   
0                           Action, Adventure, Indie   $5.99   
0                         Racing, Simulation, Sports  $49.99   

           Publisher Release Date Platform  Rating  Number of Ratings  
0          Alex Grim  Oct 22 2024      n/a   100.0                 12  
0         Idea Cabin  Sep 12 2017      n/a    13.0                 15  
0            Sublime  Jun 28 2024      n/a    78.0               1132  
0  

In [None]:
# run(range(0, 17220), "1_1")

In [None]:
# Scrape URL indices 51661-68881 and save them as part_2_1_steam_data.csv.
run(range(51661, 68882), "2_1")

Progress: [########################################] 100%
data saved to /content/drive/MyDrive/steam_scraping/data/outputs/part_2_1_steam_data.csv
Dimension: (6454, 8)
              Game Title                          Game Genre       Pricing  \
0  Sid Meier's Starships                            Strategy        $14.99   
0              Concealed                    Adventure, Indie        $12.99   
0             8th Heaven     Indie, Simulation, Early Access        $14.99   
0            cloudphobia                       Action, Indie         $5.99   
0              Miner Lou  Free To Play, Indie, RPG, Strategy  Free To Play   

               Publisher Release Date Platform  Rating  Number of Ratings  
0          Firaxis Games  Mar 12 2015      n/a    49.0               1898  
0           CASCHA GAMES   Nov 8 2023      n/a    78.0                 41  
0           Cottage Club  Jul 29 2024      n/a    79.0                 44  
0              Marsbound   Dec 7 2016      n/a    87.0     

In [None]:
# Scrape URL indices 103322-120541 and save them as part_3_1_steam_data.csv.
run(range(103322, 120542), "3_1")

Progress: [########################################] 100%
data saved to /content/drive/MyDrive/steam_scraping/data/outputs/part_3_1_steam_data.csv
Dimension: (6471, 8)
                         Game Title  \
0                        FOOTBALLER   
0  Dracula VS The Ninja On The Moon   
0                           Bermuda   
0   Hentai Day - Ringsel in Trouble   
0                              OESE   

                                          Game Genre       Pricing  \
0  Casual, Massively Multiplayer, RPG, Simulation...         $1.00   
0                              Adventure, Simulation         $0.99   
0                                   Adventure, Indie         $2.99   
0                                      Casual, Indie         $2.99   
0                          Casual, Indie, Simulation  Free To Play   

                    Publisher Release Date Platform  Rating  Number of Ratings  
0                       LAERT  Aug 11 2021      n/a    54.0                 11  
0  Let's Go to