In [1]:
!pip install --quiet cornac==2.3.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.4/51.4 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.5/31.5 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os, sys
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from google.colab import drive
import cornac

In [3]:
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

In [4]:
# Retry policy: at most 5 retries in total, backoff factor 1, and the listed
# 5xx status codes are treated as retryable.
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504],
)

# One session for the notebook; each URL scheme gets its own adapter that
# carries the retry policy above.
session = requests.Session()
for scheme in ('http://', 'https://'):
    session.mount(scheme, HTTPAdapter(max_retries=retries))

In [5]:
# Notebook-wide settings.
SEED = 42
VERBOSE = True

# Record the interpreter and Cornac versions this run used.
print(
    f"System version: {sys.version}",
    f"Cornac version: {cornac.__version__}",
    sep="\n",
)

System version: 3.11.12 (main, Apr  9 2025, 08:55:54) [GCC 11.4.0]
Cornac version: 2.3.2


In [6]:
# Locations on the mounted Google Drive that this notebook refers to.
file_app_ids = "/content/drive/MyDrive/assignment_2/app_ids.csv"
file_recommender_training = "/content/drive/MyDrive/assignment_2/recommender_training_data.csv"
folder_steam_images = "/content/drive/MyDrive/steam_images_store"

# Mount Google Drive into the Colab filesystem so the paths above resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Colab / Google client libraries used for the Sheets connection.
from google.colab import auth
from google.auth import default
import gspread

# Authenticate the Colab user, then pick up the resulting default credentials.
auth.authenticate_user()
creds, _ = default()

# Open the output spreadsheet by key and work on its first sheet.
gc = gspread.authorize(creds)
spreadsheet = gc.open_by_key('1v70zQk_2i3M6XiLI3xXgXvMqbpxD7TH9pQcvSK_K9zU')
worksheet_output = spreadsheet.sheet1

# Show the first row of the sheet (its column headers).
headers = worksheet_output.row_values(1)
print(headers)

['app_id', 'description', 'tag']


In [8]:
# Load the list of Steam app IDs. The CSV also contains an unnamed saved-index
# column (it would load as "Unnamed: 0"); only `app_id` is read so that junk
# column never enters the frame.
df_app = pd.read_csv(file_app_ids, usecols=["app_id"])
df_app.head()

Unnamed: 0,app_id
0,6060
1,233980
2,204360
3,730
4,298630


In [9]:
# Distinct app IDs, kept in order of first appearance in df_app['app_id'].
unique_app_ids = pd.unique(df_app['app_id'])

# Report how many distinct IDs there are, then display the array itself.
print(unique_app_ids.size)
unique_app_ids

37610


array([   6060,  233980,  204360, ..., 2248870, 2251240, 2253290])

In [12]:
# App ID from which the scraping loop starts; every ID before it in
# unique_app_ids is skipped (presumably already scraped in an earlier run).
RESUME_FROM_APP_ID = 535890

# Positions of that ID within unique_app_ids (an empty array when it is absent).
index = np.where(unique_app_ids == RESUME_FROM_APP_ID)[0]

# Fail with an explicit message instead of an opaque out-of-bounds error on index[0].
if index.size == 0:
    raise IndexError(
        f"App ID {RESUME_FROM_APP_ID} not found in unique_app_ids; cannot determine where to resume."
    )

# Number of IDs still to process, followed by the IDs themselves.
print(len(unique_app_ids[index[0]:]), unique_app_ids[index[0]:])

10328 [ 535890 1013820  580570 ... 2248870 2251240 2253290]


In [13]:
# Scrape the Steam store page of every remaining app ID and write
# [app_id, description, tags] rows to the output worksheet in batches.

BATCH_SIZE = 40          # rows buffered before each write to the worksheet
MAX_CELL_CHARS = 50000   # Google Sheets cell limit (50,000 characters)
REQUEST_TIMEOUT = 30     # seconds to wait on a request before giving up

# Request headers are the same for every page, so build them once.
# Note: this rebinds the notebook-level name `headers`.
headers = {"User-Agent": "Mozilla/5.0"}

# rows collected since the last write to the worksheet
all_rows = []

try:
    for app_id in unique_app_ids[index[0]:]:

        print(f"\n\nApp ID: {app_id}")

        # get steam store page through the shared session so the configured
        # retry policy applies; the timeout keeps a stalled connection from
        # hanging the whole run
        url = f"https://store.steampowered.com/app/{app_id}/"
        response = session.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.content, 'html.parser')

        # get game tags: every text fragment inside each <a class="app_tag">,
        # joined with ", " (empty string when the page has no tags)
        list_tags = [
            text
            for tag in soup.find_all('a', class_='app_tag')
            for text in tag.stripped_strings
        ]
        tags_string = ", ".join(list_tags)

        print(f"Tags: {tags_string}")

        # get game description: text fragments of #game_area_description with
        # hyphens removed, joined by single spaces (empty when the element is absent)
        html_description = soup.find(id="game_area_description")
        if html_description:
            text_description = " ".join(
                text.replace("-", "") for text in html_description.stripped_strings
            )
        else:
            text_description = ""

        print(f"Description: {text_description}")

        # Truncate the description if it exceeds the Google Sheets cell limit
        if len(text_description) > MAX_CELL_CHARS:
            text_description = text_description[:MAX_CELL_CHARS]
            print(f"Description truncated to {MAX_CELL_CHARS} characters.")

        # int() converts the NumPy integer to a plain Python int before upload
        all_rows.append([int(app_id), text_description, tags_string])

        # append rows in batches of BATCH_SIZE
        if len(all_rows) >= BATCH_SIZE:
            worksheet_output.append_rows(all_rows, value_input_option="RAW")
            all_rows = []  # Clear the list after appending
finally:
    # append any remaining rows, both after a normal finish and when the loop
    # is interrupted (network error, manual stop), so scraped rows are not lost
    if all_rows:
        worksheet_output.append_rows(all_rows, value_input_option="RAW")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Tags: RPG, Visual Novel, Dating Sim, Adventure, Modern, Drama, Anime, Free to Play, Story Rich, 2D, Singleplayer, Indie, Early Access, Word Game, Romance, Text-Based
Description: About This Game 《第一千零二夜的童话》是我个人制作的视觉小说，本次上线的产品为试玩版，因为作者利用课余时间制作，所以制作的速度比较有限。 总之这是一首发生在日常生活中的、却又不那么平常的恋歌，讲述了刚刚经历巨大变故的高中男生内海智代和身世离奇的后辈一色咲夜间的恋爱故事，同时也是讲述不完美的人们如何恋上彼此、乃至爱上自己的故事。 完整版正在锐意制作中，demo包含约六分之一的文本量，流程大概在一小时。假如条件允许的话未来有计划加入其余四条女主的路线。 这次的demo主要是想招兵买马，看能不能召集到志同道合的小伙伴加入我们。 急缺懂krkr的程序员，程序白痴用nvl手搓已经累晕。 美术永远也不嫌多，欢迎有绘画能力和ps能力的美工和美术加入我们。 有意请联系qq号：1207485545


App ID: 1840700
Tags: Adventure, Exploration, Metroidvania, Platformer, Space, Action, Pixel Graphics, Sci-fi, Funny, Story Rich, 2D, Robots, Minimalist, 2D Platformer, Dark Humor, Relaxing, Controller, Open World, Investigation, Mystery
Description: About This Game Attraction is a game about the search for answers. You are a friendly explorer robot, searching for answers in a far away planet. Disc



Tags: Video Production, Software, Visual Novel, Word Game, Interactive Fiction, Moddable, Lovecraftian, Drama, Fantasy, Lore-Rich, Story Rich, Text-Based, Linear, Narration, 2D, Singleplayer, Tactical RPG
Description: About This Software 活字引擎是国内最早的 跑团Replay 制作软件，能直接导入 聊天记录 或各种来源的 Log 、提供 123种不同的配音 （每种皆可设置音调、语速等）、提供所有跑团Replay所需演出（从最基础的角色发言到十分酷炫的 掷骰演出 ）、 无需任何编程或视频制作基础 。 黏贴Log，自动生成剧本与角色 设置角色的立绘和配音 调整对话框 导入素材/h2] 大功告成！最后润色一下剧本，点击导出按钮就可以了！ 关于活字引擎 活字引擎是我为了帮助大家省去Replay制作过程中所有繁琐步骤（如合成配音、对轴等）从2020年开始开发的软件，活字1与活字2在前几年里都已免费发布，是国内最早的跑团Replay制作工具，在Replay制作者间以便捷性与低门槛闻名，已经有数千名制作者用活字1与活字2制作Replay并发布到B站等平台，如今与大家在steam上见面的是用了一年重新开发的活字3，在进一步降低门槛的同时极大地丰富了功能和演出效果。 活字引擎内置素材库的背景来源于�也皆来自于我或用户创作的素材以及免费可商用素材。 推荐使用爱发电购买（因为爱发电的抽成更少）↓ 爱发电链接： 活字引擎的QQ群：1055208320 （群内会发布免费预览版，遇到问题也可以在群内提问喔！）


App ID: 2124570
Tags: Hack and Slash, Action, Dungeon Crawler, Dark Fantasy, Action RPG, RPG, PvP, PvE, Third Person, 3D, Realistic, Fantasy, Class-Based, Combat, Multiplayer, Asynchronous Multiplayer, Free to Play, Singleplaye