In [1]:
from collections import deque
from datetime import datetime
import os
import time
import requests
import json

import pickle
from pathlib import Path

import traceback

def print_log(*args):
    print(f"[{str(datetime.now())[:-3]}] ", end="")
    print(*args)

def get_all_app_id():
    # get all app id
    req = requests.get("https://api.steampowered.com/ISteamApps/GetAppList/v2/")

    if (req.status_code != 200):
        print_log("Failed to get all games on steam.")
        return
    
    try:
        data = req.json()
    except Exception as e:
        traceback.print_exc(limit=5)
        return {}
    
    apps_data = data['applist']['apps']

    apps_ids = []

    for app in apps_data:
        appid = app['appid']
        name = app['name']
        
        # skip apps that have empty name
        if not name:
            continue

        apps_ids.append(appid)

    return apps_ids



def save_checkpoints(checkpoint_folder, apps_dict_filename_prefix, exc_apps_filename_prefix, error_apps_filename_prefix, apps_dict, excluded_apps_list, error_apps_list):
    if not checkpoint_folder.exists():
        checkpoint_folder.mkdir(parents=True)

    save_path = checkpoint_folder.joinpath(
        apps_dict_filename_prefix + f'-ckpt-fin.p'
    ).resolve()

    save_path2 = checkpoint_folder.joinpath(
        exc_apps_filename_prefix + f'-ckpt-fin.p'
    ).resolve()
    
    save_path3 = checkpoint_folder.joinpath(
        error_apps_filename_prefix + f'-ckpt-fin.p'
    ).resolve()

    save_pickle(save_path, apps_dict)
    print_log(f'Successfully create app_dict checkpoint: {save_path}')

    save_pickle(save_path2, excluded_apps_list)
    print_log(f"Successfully create excluded apps checkpoint: {save_path2}")

    save_pickle(save_path3, error_apps_list)
    print_log(f"Successfully create error apps checkpoint: {save_path3}")

    print()


def load_pickle(path_to_load:Path) -> dict:
    obj = pickle.load(open(path_to_load, "rb"))
    
    return obj

def save_pickle(path_to_save:Path, obj):
    with open(path_to_save, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def check_latest_checkpoints(checkpoint_folder, apps_dict_filename_prefix, exc_apps_filename_prefix, error_apps_filename_prefix):
    # app_dict
    all_pkl = []

    # get all pickle files in the checkpoint folder    
    for root, dirs, files in os.walk(checkpoint_folder):
        all_pkl = list(map(lambda f: Path(root, f), files))
        all_pkl = [p for p in all_pkl if p.suffix == '.p']
        break
            
    # create a list to store all the checkpoint files
    # then sort them
    # the latest checkpoint file for each of the object is the last element in each of the lists
    apps_dict_ckpt_files = [f for f in all_pkl if apps_dict_filename_prefix in f.name and "ckpt" in f.name]
    exc_apps_list_ckpt_files = [f for f in all_pkl if exc_apps_filename_prefix in f.name and "ckpt" in f.name]
    error_apps_ckpt_files = [f for f in all_pkl if error_apps_filename_prefix in f.name and 'ckpt' in f.name]

    apps_dict_ckpt_files.sort()
    exc_apps_list_ckpt_files.sort()
    error_apps_ckpt_files.sort()

    latest_apps_dict_ckpt_path = apps_dict_ckpt_files[-1] if apps_dict_ckpt_files else None
    latest_exc_apps_list_ckpt_path = exc_apps_list_ckpt_files[-1] if exc_apps_list_ckpt_files else None
    latest_error_apps_list_ckpt_path = error_apps_ckpt_files[-1] if error_apps_ckpt_files else None

    return latest_apps_dict_ckpt_path, latest_exc_apps_list_ckpt_path, latest_error_apps_list_ckpt_path

def main():
    print_log("Started Steam scraper process", os.getpid())


    apps_dict_filename_prefix = 'apps_dict'
    exc_apps_filename_prefix = 'excluded_apps_list'
    error_apps_filename_prefix = 'error_apps_list'

    apps_dict = {}
    excluded_apps_list = []
    error_apps_list = []

    all_app_ids = get_all_app_id()

    print_log('Total number of apps on steam:', len(all_app_ids))

    # path = project directory (i.e. steam_data_scraping)/checkpoints
    checkpoint_folder = Path('checkpoints').resolve()

    print_log('Checkpoint folder:', checkpoint_folder)

    if not checkpoint_folder.exists():
        print_log(f'Fail to find checkpoint folder: {checkpoint_folder}')
        print_log(f'Start at blank.')

        checkpoint_folder.mkdir(parents=True)

    latest_apps_dict_ckpt_path, latest_exc_apps_list_ckpt_path, latest_error_apps_list_ckpt_path = check_latest_checkpoints(checkpoint_folder, apps_dict_filename_prefix, exc_apps_filename_prefix, error_apps_filename_prefix)

    if latest_apps_dict_ckpt_path:
        apps_dict = load_pickle(latest_apps_dict_ckpt_path)
        print_log('Successfully load apps_dict checkpoint:', latest_apps_dict_ckpt_path)
        print_log(f'Number of apps in apps_dict: {len(apps_dict)}')
    
    if latest_exc_apps_list_ckpt_path:
        excluded_apps_list = load_pickle(latest_exc_apps_list_ckpt_path)
        print_log("Successfully load excluded_apps_list checkpoint:", latest_exc_apps_list_ckpt_path)
        print_log(f'Number of apps in excluded_apps_list: {len(excluded_apps_list)}')

    if latest_error_apps_list_ckpt_path:
        error_apps_list = load_pickle(latest_error_apps_list_ckpt_path)
        print_log("Successfully load error_apps_list checkpoint:", latest_error_apps_list_ckpt_path)
        print_log(f'Number of apps in error_apps_list: {len(error_apps_list)}')

    # remove app_ids that already scrapped or excluded or error
    all_app_ids = set(all_app_ids) \
            - set(map(int, set(apps_dict.keys()))) \
            - set(map(int, excluded_apps_list)) \
            - set(map(int, error_apps_list))
        
    # first get remaining apps
    apps_remaining_deque = deque(set(all_app_ids))

    
    print('Number of remaining apps:', len(apps_remaining_deque))

    i = 0
    while len(apps_remaining_deque) > 0:
        appid = apps_remaining_deque.popleft()

        # test whether the game exists or not
        # by making request to get the details of the app
        try:
            appdetails_req = requests.get(f"https://store.steampowered.com/api/appdetails?appids={appid}")

            if appdetails_req.status_code == 200:
                appdetails = appdetails_req.json()
                appdetails = appdetails[str(appid)]

            elif appdetails_req.status_code == 429:
                print_log(f'Too many requests. Put App ID {appid} back to deque. Sleep for 10 sec')
                apps_remaining_deque.appendleft(appid)
                time.sleep(10)
                continue


            elif appdetails_req.status_code == 403:
                print_log(f'Forbidden to access. Put App ID {appid} back to deque. Sleep for 5 min.')
                apps_remaining_deque.appendleft(appid)
                time.sleep(5 * 60)
                continue

            else:
                print_log("ERROR: status code:", appdetails_req.status_code)
                print_log(f"Error in App Id: {appid}. Put the app to error apps list.")
                error_apps_list.append(appid)
                continue
                
        except:
            print_log(f"Error in decoding app details request. App id: {appid}")

            traceback.print_exc(limit=5)
            appdetails = {'success':False}
            print()

        # not success -> the game does not exist anymore
        # add the app id to excluded app id list
        if appdetails['success'] == False:
            excluded_apps_list.append(appid)
            print_log(f'No successful response. Add App ID: {appid} to excluded apps list')
            continue

        appdetails_data = appdetails['data']

        appdetails_data['appid'] = appid     

        apps_dict[appid] = appdetails_data
        print_log(f"Successfully get content of App ID: {appid}")

        i += 1
        # for each 2500, save a ckpt
        if i >= 2500:
            save_checkpoints(checkpoint_folder, apps_dict_filename_prefix, exc_apps_filename_prefix, error_apps_filename_prefix, apps_dict, excluded_apps_list, error_apps_list)
            i = 0

    # save checkpoints at the end
    save_checkpoints(checkpoint_folder, apps_dict_filename_prefix, exc_apps_filename_prefix, error_apps_filename_prefix, apps_dict, excluded_apps_list, error_apps_list)

    print_log(f"Total number of valid apps: {len(apps_dict)}")
    print_log(f"Total number of skipped apps: {len(excluded_apps_list)}")
    print_log(f"Total number of error apps: {len(error_apps_list)}")

    print_log('Successful run. Program Terminates.')

if __name__ == '__main__':
    main()

[2024-09-21 19:42:02.427] Started Steam scraper process 28020
[2024-09-21 19:42:03.440] Total number of apps on steam: 214587
[2024-09-21 19:42:03.444] Checkpoint folder: C:\Users\wesdu\GA_DAB_812_python\Capstone\checkpoints
[2024-09-21 19:42:17.919] Successfully load apps_dict checkpoint: C:\Users\wesdu\GA_DAB_812_python\Capstone\checkpoints\apps_dict-ckpt-fin.p
[2024-09-21 19:42:17.919] Number of apps in apps_dict: 137500
[2024-09-21 19:42:17.935] Successfully load excluded_apps_list checkpoint: C:\Users\wesdu\GA_DAB_812_python\Capstone\checkpoints\excluded_apps_list-ckpt-fin.p
[2024-09-21 19:42:17.935] Number of apps in excluded_apps_list: 13017
[2024-09-21 19:42:17.939] Successfully load error_apps_list checkpoint: C:\Users\wesdu\GA_DAB_812_python\Capstone\checkpoints\error_apps_list-ckpt-fin.p
[2024-09-21 19:42:17.939] Number of apps in error_apps_list: 47
Number of remaining apps: 64077
[2024-09-21 19:42:18.660] Successfully get content of App ID: 2344680
[2024-09-21 19:42:19.314

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 783, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
               ^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\util\ssl_.py", line 471, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, server_hostname)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ssl.SSLEOFError: [SSL: UNEXPECT

[2024-09-22 05:47:52.232] Successfully get content of App ID: 1468850
[2024-09-22 05:47:53.255] Successfully get content of App ID: 1993140
[2024-09-22 05:47:54.275] Successfully get content of App ID: 944570
[2024-09-22 05:47:55.210] Successfully get content of App ID: 1468860
[2024-09-22 05:47:56.115] Successfully get content of App ID: 1993150
[2024-09-22 05:47:57.387] Successfully get content of App ID: 2517440
[2024-09-22 05:47:58.139] Successfully get content of App ID: 420290
[2024-09-22 05:47:58.885] Successfully get content of App ID: 3041730
[2024-09-22 05:47:59.607] Successfully get content of App ID: 1468870
[2024-09-22 05:48:00.455] Successfully get content of App ID: 1993160
[2024-09-22 05:48:01.275] Successfully get content of App ID: 3041740
[2024-09-22 05:48:02.450] Successfully get content of App ID: 420300
[2024-09-22 05:48:03.331] Successfully get content of App ID: 944590
[2024-09-22 05:48:04.141] Successfully get content of App ID: 1468880
[2024-09-22 05:48:04.951

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 783, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
               ^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\util\ssl_.py", line 471, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, server_hostname)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ssl.SSLEOFError: [SSL: UNEXPECT

[2024-09-22 09:31:50.139] Successfully get content of App ID: 944630
[2024-09-22 09:31:51.366] Successfully get content of App ID: 1468920
[2024-09-22 09:31:52.296] Successfully get content of App ID: 1993210
[2024-09-22 09:31:53.266] Successfully get content of App ID: 2517500
[2024-09-22 09:31:54.112] Successfully get content of App ID: 944640
[2024-09-22 09:31:54.939] Successfully get content of App ID: 1468930
[2024-09-22 09:31:56.069] Successfully get content of App ID: 2517510
[2024-09-22 09:31:57.056] Successfully get content of App ID: 420360
[2024-09-22 09:31:57.975] Successfully get content of App ID: 944650
[2024-09-22 09:31:58.895] Successfully get content of App ID: 1468940
[2024-09-22 09:31:59.915] Successfully get content of App ID: 3041810
[2024-09-22 09:32:00.782] Successfully get content of App ID: 420370
[2024-09-22 09:32:01.512] Successfully get content of App ID: 944660
[2024-09-22 09:32:02.258] Successfully get content of App ID: 1468950
[2024-09-22 09:32:03.251] 

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 492, in _make_request
    raise new_e
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recen

[2024-09-23 00:05:42.661] Successfully get content of App ID: 443320
[2024-09-23 00:05:43.653] Successfully get content of App ID: 2016190
[2024-09-23 00:05:44.560] Successfully get content of App ID: 2540480
[2024-09-23 00:05:45.395] Successfully get content of App ID: 443330
[2024-09-23 00:05:46.191] Successfully get content of App ID: 1491910
[2024-09-23 00:05:47.018] Successfully get content of App ID: 2016200
[2024-09-23 00:05:47.792] Successfully get content of App ID: 2540490
[2024-09-23 00:06:25.215] Successfully create app_dict checkpoint: C:\Users\wesdu\GA_DAB_812_python\Capstone\checkpoints\apps_dict-ckpt-fin.p
[2024-09-23 00:06:25.232] Successfully create excluded apps checkpoint: C:\Users\wesdu\GA_DAB_812_python\Capstone\checkpoints\excluded_apps_list-ckpt-fin.p
[2024-09-23 00:06:25.232] Successfully create error apps checkpoint: C:\Users\wesdu\GA_DAB_812_python\Capstone\checkpoints\error_apps_list-ckpt-fin.p

[2024-09-23 00:06:26.204] Successfully get content of App ID: 1

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 492, in _make_request
    raise new_e
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recen

[2024-09-23 03:00:45.728] No successful response. Add App ID: 2541040 to excluded apps list
[2024-09-23 05:48:54.124] Successfully get content of App ID: 968180
[2024-09-23 05:48:55.457] Successfully get content of App ID: 1492470
[2024-09-23 05:48:56.533] Successfully get content of App ID: 2541050
[2024-09-23 05:48:57.525] Successfully get content of App ID: 3065340
[2024-09-23 05:48:58.516] No successful response. Add App ID: 443900 to excluded apps list
[2024-09-23 05:48:59.422] Successfully get content of App ID: 968190
[2024-09-23 05:49:00.318] Successfully get content of App ID: 1492480
[2024-09-23 05:49:01.142] Successfully get content of App ID: 3065350
[2024-09-23 05:49:01.897] Successfully get content of App ID: 968200
[2024-09-23 05:49:02.776] Successfully get content of App ID: 1492490
[2024-09-23 05:49:03.907] Successfully get content of App ID: 1492491
[2024-09-23 05:49:04.881] Successfully get content of App ID: 2541070
[2024-09-23 05:49:05.803] Successfully get content

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 783, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
               ^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\util\ssl_.py", line 471, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, server_hostname)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TimeoutError: [WinError 10060] 

[2024-09-23 05:53:10.732] Successfully get content of App ID: 443970
[2024-09-23 05:53:11.463] Successfully get content of App ID: 968260
[2024-09-23 05:53:12.230] Successfully get content of App ID: 1492550
[2024-09-23 05:53:13.025] Successfully get content of App ID: 2803270
[2024-09-23 05:53:13.860] Successfully get content of App ID: 706120
[2024-09-23 05:53:14.768] Successfully get content of App ID: 706121
[2024-09-23 05:53:15.513] Successfully get content of App ID: 706122
[2024-09-23 05:53:16.272] Successfully get content of App ID: 2016840
[2024-09-23 05:53:17.036] Successfully get content of App ID: 3065420
[2024-09-23 05:53:17.851] Successfully get content of App ID: 443980
[2024-09-23 05:53:18.665] Successfully get content of App ID: 968270
[2024-09-23 05:53:19.432] Successfully get content of App ID: 968271
[2024-09-23 05:53:20.193] Successfully get content of App ID: 1492560
[2024-09-23 05:53:20.996] Successfully get content of App ID: 968272
[2024-09-23 05:53:21.541] Suc

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 783, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
               ^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\util\ssl_.py", line 471, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, server_hostname)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TimeoutError: [WinError 10060] 

[2024-09-23 11:25:04.025] No successful response. Add App ID: 472861 to excluded apps list
[2024-09-23 11:25:04.747] Successfully get content of App ID: 1259300
[2024-09-23 11:25:05.472] Successfully get content of App ID: 997150
[2024-09-23 11:25:06.013] Successfully get content of App ID: 1783590
[2024-09-23 11:25:06.561] Successfully get content of App ID: 1521440
[2024-09-23 11:25:07.248] Successfully get content of App ID: 2307880
[2024-09-23 11:25:07.833] No successful response. Add App ID: 2570020 to excluded apps list
[2024-09-23 11:25:08.550] Successfully get content of App ID: 2832170
[2024-09-23 11:25:09.285] Successfully get content of App ID: 3094310
[2024-09-23 11:25:09.935] Successfully get content of App ID: 472870
[2024-09-23 11:25:10.488] No successful response. Add App ID: 997160 to excluded apps list
[2024-09-23 11:25:11.182] Successfully get content of App ID: 1259310
[2024-09-23 11:25:11.799] No successful response. Add App ID: 997163 to excluded apps list
[2024-0

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 783, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
               ^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\util\ssl_.py", line 471, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, server_hostname)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TimeoutError: [WinError 10060] 

[2024-09-23 17:55:30.918] Successfully get content of App ID: 1004919
[2024-09-23 17:55:31.764] Successfully get content of App ID: 2839930
[2024-09-23 17:55:32.471] Successfully get content of App ID: 1004920
[2024-09-23 17:55:33.428] Successfully get content of App ID: 742780
[2024-09-23 17:55:34.267] Successfully get content of App ID: 1529210
[2024-09-23 17:55:35.143] Successfully get content of App ID: 1267070
[2024-09-23 17:55:36.090] Successfully get content of App ID: 1004921
[2024-09-23 17:55:36.997] Successfully get content of App ID: 1791360
[2024-09-23 17:55:37.937] Successfully get content of App ID: 3102080
[2024-09-23 17:55:38.807] Successfully get content of App ID: 1004930
[2024-09-23 17:55:39.642] Successfully get content of App ID: 2839940
[2024-09-23 17:55:40.489] Successfully get content of App ID: 1529220
[2024-09-23 17:55:41.317] Successfully get content of App ID: 742790
[2024-09-23 17:55:42.145] Successfully get content of App ID: 1267080
[2024-09-23 17:55:42.8

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 492, in _make_request
    raise new_e
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recen

[2024-09-23 20:14:27.981] Successfully get content of App ID: 1271792
[2024-09-23 20:14:28.601] Successfully get content of App ID: 1271793
[2024-09-23 20:14:29.411] Successfully get content of App ID: 1271794
[2024-09-23 20:14:29.980] Successfully get content of App ID: 2844660
[2024-09-23 20:14:30.503] Successfully get content of App ID: 223220
[2024-09-23 20:14:31.147] Successfully get content of App ID: 485360
[2024-09-23 20:14:31.764] Successfully get content of App ID: 1533940
[2024-09-23 20:14:32.597] Successfully get content of App ID: 1271800
[2024-09-23 20:14:33.555] Successfully get content of App ID: 2058230
[2024-09-23 20:14:34.548] Successfully get content of App ID: 1796090
[2024-09-23 20:14:35.457] Successfully get content of App ID: 485370
[2024-09-23 20:14:36.850] Successfully get content of App ID: 747520
[2024-09-23 20:14:37.717] Successfully get content of App ID: 2582530
[2024-09-23 20:14:38.502] Successfully get content of App ID: 1796100
[2024-09-23 20:14:39.190

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 537, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 461, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\http\client.py", line 1386, in getresponse
    response.begin()
  File "C:\Users\wesdu\anaconda3\Lib\http\client.py", line 325, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
http.client.RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:

Traceback

[2024-09-23 20:46:56.685] Successfully get content of App ID: 1011176
[2024-09-23 20:46:59.179] Successfully get content of App ID: 749040
[2024-09-23 20:47:01.142] Successfully get content of App ID: 1011177
[2024-09-23 20:47:02.833] Successfully get content of App ID: 1011179
[2024-09-23 20:47:23.893] Error in decoding app details request. App id: 1011180

[2024-09-23 20:47:23.903] No successful response. Add App ID: 1011180 to excluded apps list


Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 203, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\util\connection.py", line 85, in create_connection
    raise err
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\util\connection.py", line 73, in create_connection
    sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\sit

[2024-09-23 20:47:30.751] Successfully get content of App ID: 1797620
[2024-09-23 20:47:53.664] No successful response. Add App ID: 1535470 to excluded apps list
[2024-09-23 20:47:54.510] Successfully get content of App ID: 2059760
[2024-09-23 20:47:55.213] Successfully get content of App ID: 2584050
[2024-09-23 20:47:56.099] Successfully get content of App ID: 224760
[2024-09-23 20:47:56.903] Successfully get content of App ID: 2846200
[2024-09-23 20:47:57.509] Successfully get content of App ID: 749050
[2024-09-23 20:47:58.329] Successfully get content of App ID: 1011190
[2024-09-23 20:47:59.163] Successfully get content of App ID: 2059770
[2024-09-23 20:48:00.053] Successfully get content of App ID: 2059771
[2024-09-23 20:48:00.755] Successfully get content of App ID: 1797630
[2024-09-23 20:48:01.586] Successfully get content of App ID: 2321920
[2024-09-23 20:48:02.357] Successfully get content of App ID: 1011200
[2024-09-23 20:48:03.266] Successfully get content of App ID: 2846210


Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 492, in _make_request
    raise new_e
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recen

[2024-09-24 05:50:53.638] Successfully get content of App ID: 751110
[2024-09-24 05:50:54.817] Successfully get content of App ID: 1537540
[2024-09-24 05:50:55.915] Successfully get content of App ID: 3110410
[2024-09-24 05:50:56.743] Successfully get content of App ID: 488970
[2024-09-24 05:50:57.697] Successfully get content of App ID: 2323980
[2024-09-24 05:50:58.496] Successfully get content of App ID: 2848270
[2024-09-24 05:50:59.304] Successfully get content of App ID: 1275410
[2024-09-24 05:51:00.208] Successfully get content of App ID: 2586130
[2024-09-24 05:51:01.008] Successfully get content of App ID: 3110420
[2024-09-24 05:51:01.851] No successful response. Add App ID: 488980 to excluded apps list
[2024-09-24 05:51:02.488] No successful response. Add App ID: 488981 to excluded apps list
[2024-09-24 05:51:03.164] Successfully get content of App ID: 226840
[2024-09-24 05:51:03.955] Successfully get content of App ID: 2061850
[2024-09-24 05:51:04.702] Successfully get content 

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 783, in _ssl_wrap_socket_and_match_hostname
    ssl_sock = ssl_wrap_socket(
               ^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\util\ssl_.py", line 471, in ssl_wrap_socket
    ssl_sock = _ssl_wrap_socket_impl(sock, context, tls_in_tls, server_hostname)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TimeoutError: [WinError 10060] 

[2024-09-24 06:00:37.251] Successfully get content of App ID: 2061900
[2024-09-24 06:00:38.110] Successfully get content of App ID: 1275470
[2024-09-24 06:00:38.899] No successful response. Add App ID: 2586190 to excluded apps list
[2024-09-24 06:00:39.644] Successfully get content of App ID: 1799760
[2024-09-24 06:00:40.468] Successfully get content of App ID: 1998966
[2024-09-24 06:00:41.176] No successful response. Add App ID: 1537620 to excluded apps list
[2024-09-24 06:00:41.986] No successful response. Add App ID: 1537621 to excluded apps list
[2024-09-24 06:00:42.804] No successful response. Add App ID: 2061910 to excluded apps list
[2024-09-24 06:00:43.378] No successful response. Add App ID: 1537623 to excluded apps list
[2024-09-24 06:00:43.953] Successfully get content of App ID: 1537624
[2024-09-24 06:00:44.688] No successful response. Add App ID: 1537622 to excluded apps list
[2024-09-24 06:00:45.514] Successfully get content of App ID: 1799770
[2024-09-24 06:00:46.319] Su

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 537, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 461, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\http\client.py", line 1386, in getresponse
    response.begin()
  File "C:\Users\wesdu\anaconda3\Lib\http\client.py", line 325, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
http.client.RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:

Traceback

[2024-09-24 11:11:51.242] Error in decoding app details request. App id: 2077910

[2024-09-24 11:11:51.246] No successful response. Add App ID: 2077910 to excluded apps list


Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 537, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 461, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\http\client.py", line 1386, in getresponse
    response.begin()
  File "C:\Users\wesdu\anaconda3\Lib\http\client.py", line 325, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
http.client.RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:

Traceback

[2024-09-24 11:12:26.237] Error in decoding app details request. App id: 1815770

[2024-09-24 11:12:26.244] No successful response. Add App ID: 1815770 to excluded apps list


Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 537, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 461, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\http\client.py", line 1386, in getresponse
    response.begin()
  File "C:\Users\wesdu\anaconda3\Lib\http\client.py", line 325, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
http.client.RemoteDisconnected: Remote end closed connection without response

During handling of the above exception, another exception occurred:

Traceback

[2024-09-24 11:12:47.310] Error in decoding app details request. App id: 2602200

[2024-09-24 11:12:47.325] No successful response. Add App ID: 2602200 to excluded apps list


Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 203, in _new_conn
    sock = connection.create_connection(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\util\connection.py", line 85, in create_connection
    raise err
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\util\connection.py", line 73, in create_connection
    sock.connect(sa)
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\sit

[2024-09-24 11:12:55.420] No successful response. Add App ID: 2340060 to excluded apps list
[2024-09-24 11:13:07.879] Successfully get content of App ID: 3126490
[2024-09-24 11:13:08.609] Successfully get content of App ID: 2864350
[2024-09-24 11:13:09.265] Successfully get content of App ID: 1029340
[2024-09-24 11:13:09.886] Successfully get content of App ID: 767200
[2024-09-24 11:13:10.520] Successfully get content of App ID: 2602210
[2024-09-24 11:13:11.108] Successfully get content of App ID: 1815780
[2024-09-24 11:13:11.774] Successfully get content of App ID: 505060
[2024-09-24 11:13:12.571] Successfully get content of App ID: 1029350
[2024-09-24 11:13:13.179] Successfully get content of App ID: 1029351
[2024-09-24 11:13:13.787] Successfully get content of App ID: 2864360
[2024-09-24 11:13:14.432] Successfully get content of App ID: 242920
[2024-09-24 11:13:15.193] Successfully get content of App ID: 1553640
[2024-09-24 11:13:15.950] Successfully get content of App ID: 1029352
[

Traceback (most recent call last):
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 791, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 492, in _make_request
    raise new_e
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 468, in _make_request
    self._validate_conn(conn)
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connectionpool.py", line 1097, in _validate_conn
    conn.connect()
  File "C:\Users\wesdu\anaconda3\Lib\site-packages\urllib3\connection.py", line 642, in connect
    sock_and_verified = _ssl_wrap_socket_and_match_hostname(
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host

During handling of the above exception, another exception occurred:

Traceback (most recen

[2024-09-24 14:40:31.518] Successfully get content of App ID: 1037370
[2024-09-24 14:40:32.754] Successfully get content of App ID: 1561660
[2024-09-24 14:40:33.638] Successfully get content of App ID: 1299520
[2024-09-24 14:40:34.492] No successful response. Add App ID: 2610240 to excluded apps list
[2024-09-24 14:40:35.409] Successfully get content of App ID: 1823810
[2024-09-24 14:40:36.550] Successfully get content of App ID: 513091
[2024-09-24 14:40:37.472] Successfully get content of App ID: 513090
[2024-09-24 14:40:38.867] Successfully get content of App ID: 1037380
[2024-09-24 14:40:40.119] Successfully get content of App ID: 513092
[2024-09-24 14:40:41.179] Successfully get content of App ID: 513095
[2024-09-24 14:40:42.696] Successfully get content of App ID: 513096
[2024-09-24 14:40:43.715] Successfully get content of App ID: 513093
[2024-09-24 14:40:44.500] Successfully get content of App ID: 513094
[2024-09-24 14:40:45.281] Successfully get content of App ID: 2348100
[2024

In [2]:
import pickle
from pathlib import Path
import os

checkpoint_folder = Path("checkpoints")

apps_dict_filename_prefix = 'apps_dict'
exc_apps_filename_prefix = 'excluded_apps_list'
error_apps_filename_prefix = 'error_apps_list'

apps_dict = {}
excluded_apps_list = []
error_apps_list = []

In [3]:
def check_latest_checkpoints(checkpoint_folder, apps_dict_filename_prefix, exc_apps_filename_prefix, error_apps_filename_prefix):
    # app_dict
    all_pkl = []
    
    for root, dirs, files in os.walk(checkpoint_folder):
        all_pkl = list(map(lambda f: Path(root, f), files))
        all_pkl = [p for p in all_pkl if p.suffix == '.p']
        break
            
    apps_dict_ckpt_files = [f for f in all_pkl if apps_dict_filename_prefix in f.name and "ckpt" in f.name]
    exc_apps_list_ckpt_files = [f for f in all_pkl if exc_apps_filename_prefix in f.name and "ckpt" in f.name]
    error_apps_ckpt_files = [f for f in all_pkl if error_apps_filename_prefix in f.name and 'ckpt' in f.name]

    apps_dict_ckpt_files.sort()
    exc_apps_list_ckpt_files.sort()
    error_apps_ckpt_files.sort()

    latest_apps_dict_ckpt_path = apps_dict_ckpt_files[-1] if apps_dict_ckpt_files else None
    latest_exc_apps_list_ckpt_path = exc_apps_list_ckpt_files[-1] if exc_apps_list_ckpt_files else None
    latest_error_apps_list_ckpt_path = error_apps_ckpt_files[-1] if error_apps_ckpt_files else None

    return latest_apps_dict_ckpt_path, latest_exc_apps_list_ckpt_path, latest_error_apps_list_ckpt_path

In [4]:
if not checkpoint_folder.exists():
    print(f'Fail to find checkpoint folder: {checkpoint_folder}')
    print(f'Start at blank.')

In [5]:
latest_apps_dict_ckpt_path, latest_exc_apps_list_ckpt_path, latest_error_apps_list_ckpt_path = check_latest_checkpoints(checkpoint_folder, apps_dict_filename_prefix, exc_apps_filename_prefix, error_apps_filename_prefix)

if latest_apps_dict_ckpt_path:
    apps_dict = load_pickle(latest_apps_dict_ckpt_path)
    print('Successfully load apps_dict checkpoint:', latest_apps_dict_ckpt_path)
    print(f'Number of apps in apps_dict: {len(apps_dict)}')

if latest_exc_apps_list_ckpt_path:
    excluded_apps_list = load_pickle(latest_exc_apps_list_ckpt_path)
    print("Successfully load excluded_apps_list checkpoint:", latest_exc_apps_list_ckpt_path)
    print(f'Number of apps in excluded_apps_list: {len(excluded_apps_list)}')

if latest_error_apps_list_ckpt_path:
    error_apps_list = load_pickle(latest_error_apps_list_ckpt_path)
    print("Successfully load error_apps_list checkpoint:", latest_error_apps_list_ckpt_path)
    print(f'Number of apps in error_apps_list: {len(error_apps_list)}')

Successfully load apps_dict checkpoint: checkpoints\apps_dict-ckpt-fin.p
Number of apps in apps_dict: 196724
Successfully load excluded_apps_list checkpoint: checkpoints\excluded_apps_list-ckpt-fin.p
Number of apps in excluded_apps_list: 17849
Successfully load error_apps_list checkpoint: checkpoints\error_apps_list-ckpt-fin.p
Number of apps in error_apps_list: 68
