In [86]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import re

> This code block below is for running in Google Colab, this will access your Goodle Drive and directly save outputs to designated location

In [87]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [88]:
def scrape_data(url):

  data = {"Game Title" : [],
          "Game Genre" : [],
          "Pricing" : [],
          "Publisher" : [],
          "Release Date" : [],
          "Platform" : 'n/a',
          "Rating" : [],
          "Number of Ratings" : []
        }
  try:
    response = requests.get(url)
  except Exception as e:
    raise Exception(e)

  soup = bs(response.text, 'html.parser')


  # whether it's a DLC
  isDLC = soup.find("div", {"class" : "game_area_bubble game_area_dlc_bubble"})
  if isDLC:
    raise Exception("Not a game")
  del isDLC

  # ratings

  reviewSection = soup.find("div", id='userReviews')
  divs = reviewSection.find_all("div", class_="user_reviews_summary_row")

  if len(divs) == 1:
    target_div = divs[0]
  else:
    target_div = divs[1]

  tooltip_text = target_div.get("data-tooltip-html")

  if tooltip_text == "Need more user reviews to generate a score" or tooltip_text == "No user reviews":
    raise Exception("Not enough user reviews")

  digits = re.findall(r"\d+", tooltip_text)

  data["Rating"] = float(digits[0])

  data["Number of Ratings"] = int("".join(digits[1:]))

  # price
  # omit demo, early access, etc. by letting it throw an exception error and skip the game
  price = soup.find("div", {"class" : "game_purchase_price price"}) or soup.find("div", {"class" : "discount_original_price"})
  data["Pricing"] = price.text.strip()

  # release date
  data["Release Date"] = soup.find("div", {"class" : "date"}).text.strip().replace(',','')

  # genre
  data["Game Genre"] = soup.find("span", {"data-panel" :'{"flow-children":"row"}'}).text.strip()

  # publisher
  data["Publisher"] = soup.find("div", {"id" : "developers_list", "class" : "summary column"}).text.strip()

  # title
  data["Game Title"] = soup.find("b", string="Title:").next_sibling.strip().strip()

  return pd.DataFrame([data])


In [92]:
# On the left toolbar, navigate to folder icon > drive/MyDrive/path/to/your/folder

path = "/content/drive/MyDrive/steam_scraping/data/game_urls_filtered.txt"

In [97]:
with open(path, "r") as f:
  urls = f.read().splitlines()

print(len(urls))

"""
Tổng: 154984 link
Mỗi đứa chạy: 51661 link
"""
range1 = range(0, 51661)
range2 = range(51661, 103322)
range3 = range(103322, 154984)

def run(r, index):


  datalist = []

  total = r.stop - r.start
  bar_length = 40

  for i in r:
    progress = (i - r.start + 1) / total
    bar = "#" * int(progress * bar_length) + "-" * (bar_length - int(progress * bar_length))
    print(f"\rProgress: [{bar}] {int(progress * 100)}%", end="")
    try:
      datalist.append(scrape_data(urls[i]))
    except Exception as e:
      # print(e)
      continue

  print()

  dataset =  pd.concat(datalist)

  path = "/content/drive/MyDrive/steam_scraping/data/outputs/"

  filepath = path + f"part_{index}_steam_data.csv"

  dataset.to_csv(filepath, index=False)

  print(f"data saved to {filepath}")

  print(f"Dimension: {dataset.shape}")
  dataset.head()

154984


In [98]:
run(range(0,50), 2)
# small range for game price testing

Progress: [########################################] 100%
(21, 8)
data saved to /content/drive/MyDrive/steam_scraping/data/outputs/part_2_steam_data.csv
