Skip to content

Commit

Permalink
Add a notebook to download images
Browse files Browse the repository at this point in the history
  • Loading branch information
woctezuma committed Sep 28, 2021
1 parent dac66df commit 1a897a7
Showing 1 changed file with 148 additions and 0 deletions.
148 changes: 148 additions & 0 deletions download_steam_webdataset.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "download_Steam_data.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "UCM9BSLRuY7c"
},
"source": [
"%pip install --quiet gamedatacrunch img2dataset"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "FbQEScZfvR4g"
},
"source": [
"def get_cdn_url():\n",
" return 'https://cdn.cloudflare.steamstatic.com/steam/apps/'\n",
"\n",
"def get_app_url(app_id):\n",
" cdn_url = get_cdn_url()\n",
" return f'{cdn_url}{app_id}/library_600x900.jpg'\n",
"\n",
"def save_urls(app_ids, fname):\n",
" with open(fname, 'w') as f:\n",
" for app_id in sorted(app_ids):\n",
" url = get_app_url(app_id)\n",
" f.write(f'{url}\\n')\n",
" return"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "scrcdsULu0o6",
"outputId": "925092be-3f63-48ec-f0fc-ea769cde48f8"
},
"source": [
"import gamedatacrunch as gdc\n",
"\n",
"fname = 'urls.txt'\n",
"\n",
"app_ids = gdc.load_app_ids()\n",
"save_urls(app_ids, fname)"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"#appIDs = 60057\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "U3ZO7YqZxLaA",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "5b657c10-034a-4c5a-e078-904ca3d0ab53"
},
"source": [
"!img2dataset --url_list={fname} --output_format=webdataset \\\n",
" --image_size=256 --resize_mode=keep_ratio"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Downloading file number 1 of 1 called urls.txt\n",
"Loading the input file\n",
"Splitting the 60057 samples in 7 shards of size 10000\n",
"Done sharding the input file\n",
"Starting the downloading of this file\n",
" 0% 0/7 [00:00<?, ?it/s]worker - success: 0.501 - failed to download: 0.499 - failed to resize: 0.000 - images per sec: 129 - count: 10000\n",
"total - success: 0.501 - failed to download: 0.499 - failed to resize: 0.000 - images per sec: 129 - count: 10000\n",
" 14% 1/7 [01:17<07:43, 77.29s/it]worker - success: 0.389 - failed to download: 0.611 - failed to resize: 0.000 - images per sec: 138 - count: 10000\n",
"total - success: 0.445 - failed to download: 0.555 - failed to resize: 0.000 - images per sec: 133 - count: 20000\n",
" 29% 2/7 [02:29<06:12, 74.58s/it]worker - success: 0.427 - failed to download: 0.573 - failed to resize: 0.000 - images per sec: 142 - count: 10000\n",
"total - success: 0.439 - failed to download: 0.561 - failed to resize: 0.000 - images per sec: 136 - count: 30000\n",
" 43% 3/7 [03:40<04:51, 72.75s/it]worker - success: 0.827 - failed to download: 0.173 - failed to resize: 0.000 - images per sec: 91 - count: 10000\n",
"total - success: 0.536 - failed to download: 0.464 - failed to resize: 0.000 - images per sec: 121 - count: 40000\n",
" 57% 4/7 [05:30<04:22, 87.50s/it]worker - success: 1.000 - failed to download: 0.000 - failed to resize: 0.000 - images per sec: 80 - count: 10000\n",
"total - success: 0.629 - failed to download: 0.371 - failed to resize: 0.000 - images per sec: 110 - count: 50000\n",
" 71% 5/7 [07:36<03:22, 101.20s/it]worker - success: 1.000 - failed to download: 0.000 - failed to resize: 0.000 - images per sec: 78 - count: 10000\n",
"total - success: 0.691 - failed to download: 0.309 - failed to resize: 0.000 - images per sec: 103 - count: 60000\n",
" 86% 6/7 [09:44<01:50, 110.42s/it]worker - success: 1.000 - failed to download: 0.000 - failed to resize: 0.000 - images per sec: 76 - count: 57\n",
"total - success: 0.691 - failed to download: 0.309 - failed to resize: 0.000 - images per sec: 103 - count: 60057\n",
"100% 7/7 [09:45<00:00, 83.60s/it]\n"
]
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "y5xgxr8r0tDM"
},
"source": [
"!tar -cvf steam_webdataset.tar images/"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ij3LcyeI1vNY"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive') "
],
"execution_count": null,
"outputs": []
}
]
}

0 comments on commit 1a897a7

Please sign in to comment.