In [2]:
import glob # get the list of file names in a directory
import os # create output folders
import random # randomize the time between requests
import re # strip the timestamp suffix from saved file names
import shutil # copy HTML files between folders
import time   # time.sleep to pause between requests to the server
from datetime import datetime

import requests # send HTTP requests to get the HTML code of a web page
from bs4 import BeautifulSoup # parse HTML files

WARNING: Running the code in this notebook may create or overwrite files on your system (for example, the downloaded HTML pages and the lists of URLs to re-download).

This notebook is still a work in progress and has not yet been fully checked. It was revised many times during the data collection phase, so some cells may contain errors or be out of date.

## Get the manufacturer list

In [28]:
def is_valid_html_text(html_text):
  """
  Tells whether a downloaded page is usable.

  Returns False when the text contains the marker 'HTTP 429', True otherwise.
  """
  rate_limit_marker = 'HTTP 429'
  if rate_limit_marker in html_text:
    return False
  return True


def send_request_and_wait(
    url,
    min_sleep_time_seconds=60,
    max_sleep_time_seconds=90,
    max_attempts=10,
    verbose=False
):
  """
  Downloads `url`, sleeping a random number of seconds before every request.

  The request is repeated while the response text contains 'HTTP 429' and
  attempts remain.

  Args:
    url: address of the page to download.
    min_sleep_time_seconds: lower bound (inclusive) of the sleep before each request.
    max_sleep_time_seconds: upper bound (inclusive) of the sleep before each request.
    max_attempts: maximum number of requests to send.
    verbose: when True, print progress messages.

  Returns:
    The text of the last response. If every attempt was rate limited (or
    `max_attempts` is not positive) the returned text still contains
    'HTTP 429', so callers should check it with `is_valid_html_text`.
  """
  # Start from an "invalid" text so that the loop sends at least one request
  # whenever max_attempts is positive.
  html_text = "HTTP 429"
  attempts = max_attempts
  if verbose:
    print(f"Getting {url}...")
  while not is_valid_html_text(html_text) and attempts > 0:
    sleep_time_seconds = random.randint(min_sleep_time_seconds, max_sleep_time_seconds)
    if verbose:
      print(f"{datetime.now()}: {attempts} attempts remaining. Sleeping for {sleep_time_seconds} seconds...")
    time.sleep(sleep_time_seconds)

    response = requests.get(url)
    try:
      html_text = response.text
    finally:
      # Release the connection even if decoding the body fails.
      response.close()
    attempts -= 1
  if verbose:
    if is_valid_html_text(html_text):
      print(f"Done getting {url}!\n")
    else:
      # Do not report success when all attempts were rate limited.
      print(f"Gave up on {url}: no valid page after {max_attempts} attempts.\n")
  return html_text

In [42]:
def get_brand_list(min_sleep_time_seconds=60, max_sleep_time_seconds=90, max_attempts=10, verbose=True):
  """
  Returns a list of brands and the count of SSDs by each brand from the root page "https://www.techpowerup.com/ssd-specs/"

  The sleep/attempt/verbose arguments are forwarded unchanged to
  `send_request_and_wait`.

  Returns:
    A list of tuples `(url_suffix_filtered_by_brand, brand_name, ssd_count)`,
    for example `('/ssd-specs/filter/?mfgr=SK Hynix', 'SK Hynix', 25)`.

  Raises:
    ValueError: if the downloaded page has no
      `<div data-title="Manufacturer">` element (for example because the
      request was still rate limited after all attempts).
  """
  # Get the HTML from the root page.
  ROOT_URL = 'https://www.techpowerup.com/ssd-specs/'
  html_root_text = send_request_and_wait(
    ROOT_URL,
    min_sleep_time_seconds=min_sleep_time_seconds,
    max_sleep_time_seconds=max_sleep_time_seconds,
    max_attempts=max_attempts,
    verbose=verbose
  )
  html_root_object = BeautifulSoup(html_root_text, 'html.parser')

  # Get the list of manufacturers and the count of SSDs by each manufacturer.
  div_manufacturer_object = html_root_object.find('div', {'data-title': 'Manufacturer'})
  if div_manufacturer_object is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError(f'No manufacturer filter found in the page downloaded from {ROOT_URL}')

  brand_list = []
  # Example 'a' object: <a  href="/ssd-specs/filter/?mfgr=SK Hynix" >SK Hynix <span class="filter-list-item-entry-count">(25)</span></a>
  for a_object in div_manufacturer_object.find_all('a'):
    url_suffix_filtered_by_brand = a_object['href']  # example: '/ssd-specs/filter/?mfgr=SK Hynix'
    brand_name_and_count = a_object.text.strip()  # example: 'SK Hynix (25)'
    # Split on the LAST '(' so that a brand name that itself contains
    # parentheses is still parsed correctly.
    brand_name, _, ssd_count_text = brand_name_and_count.rpartition('(')
    brand_name = brand_name.strip()  # example: 'SK Hynix'
    ssd_count = int(ssd_count_text.rstrip(')'))  # example: 25
    brand_list.append((
      url_suffix_filtered_by_brand,
      brand_name,
      ssd_count
    ))

  return brand_list


# brand_list = get_brand_list()
# print(brand_list)

## Get the list of URLs for the data pages of all SSDs from each manufacturer

In [43]:
def get_SSD_data_URLs_from_manufacturer(valid_html_object_by_manufacturer):
  """
  Collects the `href` of every `<a class="drive-capacity">` element in the page.

  Example element:
  <a class="drive-capacity" href="/ssd-specs/sk-hynix-bettle-x31-1-tb.d1726" style="white-space: nowrap">1 TB</a>

  Returns a set, so a link that appears several times is kept only once.
  """
  capacity_links = valid_html_object_by_manufacturer.find_all('a', {'class': 'drive-capacity'})
  return {capacity_link['href'] for capacity_link in capacity_links}

In [44]:
def get_all_html_file_paths(folder):
  """
  Lists the '*.html' files found directly inside `folder`.

  `folder` is concatenated with the pattern as-is, so it must already end
  with a path separator (for example 'data/manufacturer/').
  """
  html_pattern = folder + '*.html'
  return glob.glob(html_pattern)


def get_brand_name_from_file_path(file_path):
  """
  Returns the brand name encoded in the file name of a saved brand page.

  Example: "data/manufacturer/SK_Hynix-2024-01-14-14:36:45.html" -> "SK Hynix"

  The timestamp suffix is removed as a whole, so a brand name that contains
  a hyphen is returned in full. Underscores are turned back into spaces.
  A file name without a recognisable timestamp suffix falls back to
  "everything before the first hyphen".
  """
  file_name = file_path.split('/')[-1]      # example: "SK_Hynix-2024-01-14-14:36:45.html"
  # Trailing "-YYYY-MM-DD-HH:MM:SS.html"; '-' is also accepted between the
  # time fields.
  timestamped_name = re.match(
    r'^(.*)-\d{4}-\d{2}-\d{2}-\d{2}[:\-]\d{2}[:\-]\d{2}\.html$',
    file_name
  )
  if timestamped_name:
    brand_name = timestamped_name.group(1)  # example: "SK_Hynix"
  else:
    brand_name = file_name.split('-')[0]
  return brand_name.replace('_', ' ')       # example: "SK Hynix"


def get_data_URLs_by_brand(folder='data/manufacturer'):
  """
  Returns a dictionary mapping a brand to a set of data URLs for that brand.

  Every '*.html' file directly inside `folder` is read. Files whose text
  fails `is_valid_html_text` are reported and skipped. If several files map
  to the same brand name, the one processed last wins.

  Args:
    folder: folder that holds the saved brand pages, with or without a
      trailing '/'.
  """
  # get_all_html_file_paths concatenates the folder and the pattern, so make
  # sure the folder ends with a separator.
  folder_with_separator = folder if folder.endswith('/') else folder + '/'
  brand_html_file_paths = get_all_html_file_paths(folder=folder_with_separator)
  brand_to_data_URLs = dict()
  for brand_html_file_path in brand_html_file_paths:
    with open(brand_html_file_path, 'r') as f:
      html_text = f.read()
    # Check validity first: there is no point in parsing a rate-limit page.
    if not is_valid_html_text(html_text):
      print(f'Invalid HTML for {brand_html_file_path}')
      continue
    html_object = BeautifulSoup(html_text, 'html.parser')
    data_URLs = get_SSD_data_URLs_from_manufacturer(valid_html_object_by_manufacturer=html_object)
    brand_name = get_brand_name_from_file_path(brand_html_file_path)
    brand_to_data_URLs[brand_name] = data_URLs
  return brand_to_data_URLs

# Read and parse the saved brand pages once and reuse the result for both
# counts.
_data_URLs_by_brand = get_data_URLs_by_brand()
print(len(_data_URLs_by_brand['Acer']))  # number of data URLs found for Acer
print(len(_data_URLs_by_brand))          # number of brands with a valid saved page
# get_data_URLs_by_brand()

27
43


In [47]:
def download_html_of_queries_by_brand_and_get_data_URLs_by_brand(folder='data/manufacturer'):
  """
  Downloads the "filtered by brand" page of every brand that has no saved
  page yet and returns the data URLs of all brands.

  For each brand returned by `get_brand_list`:
    - if `get_data_URLs_by_brand` already has the brand, its URLs are reused;
    - otherwise the brand page is downloaded, saved as
      '{folder}/{brand_name_with_underscores}-{YYYY-MM-DD-HH:MM:SS}.html'
      and parsed for data URLs.
  Brands whose download is still invalid after all attempts are reported
  and left out of the result.

  Args:
    folder: folder where the brand pages are stored (created if missing).

  Returns:
    A dictionary mapping a brand name to the set of data URLs of that brand.
  """
  HOMEPAGE_URL = 'https://www.techpowerup.com'
  brand_list = get_brand_list(
    min_sleep_time_seconds=10,
    max_sleep_time_seconds=30,
    max_attempts=10,
    verbose=True
  )
  # Create the output folder up-front: each download waits for a long time,
  # so it should not be lost because the folder is missing.
  if folder:
    os.makedirs(folder, exist_ok=True)

  brand_to_data_URLs = {}
  downloaded_data_urls_by_brand = get_data_URLs_by_brand(folder=folder)

  for url_suffix_filtered_by_brand, brand_name, ssd_count in brand_list:
    if brand_name in downloaded_data_urls_by_brand:
      print(f'Brand "{brand_name}" already downloaded')
      brand_to_data_URLs[brand_name] = downloaded_data_urls_by_brand[brand_name]
      continue

    url_filtered_by_brand = HOMEPAGE_URL + url_suffix_filtered_by_brand

    html_text_filtered_by_brand = send_request_and_wait(
      url_filtered_by_brand,
      min_sleep_time_seconds=60,
      max_sleep_time_seconds=90,
      max_attempts=10,
      verbose=True
    )

    if not is_valid_html_text(html_text_filtered_by_brand):
      print(f'Invalid HTML for {brand_name}')
      continue

    # Save the page before parsing it, so that a parsing problem does not
    # throw away a page that took a long time to download.
    brand_name_underscored = brand_name.replace(' ', '_')
    current_datetime = datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
    with open(f'{folder}/{brand_name_underscored}-{current_datetime}.html', 'w') as f:
      f.write(html_text_filtered_by_brand)

    html_object_filtered_by_brand = BeautifulSoup(html_text_filtered_by_brand, 'html.parser')
    data_URLs = get_SSD_data_URLs_from_manufacturer(html_object_filtered_by_brand)
    # The number of URLs found is printed next to the count advertised on
    # the root page; the two are not required to match.
    print(f'{brand_name} ({ssd_count}): {len(data_URLs)}')
    brand_to_data_URLs[brand_name] = data_URLs

  print(f'len(brand_to_data_URLs): {len(brand_to_data_URLs)}')
  print(f'len(brand_list): {len(brand_list)}')
  return brand_to_data_URLs


def start_download():
  """
  Entry point of the brand-page download.

  Runs `download_html_of_queries_by_brand_and_get_data_URLs_by_brand` with
  its default folder; the mapping it returns is discarded.
  """
  download_html_of_queries_by_brand_and_get_data_URLs_by_brand()
  return None

start_download()

Getting https://www.techpowerup.com/ssd-specs/...
2024-01-15 10:13:38.602596: 10 attempts remaining. Sleeping for 28 seconds...
Done getting https://www.techpowerup.com/ssd-specs/!

Brand "Acer" already downloaded
Brand "ADATA" already downloaded
Brand "Addlink" already downloaded
Brand "AMD" already downloaded
Brand "Aorus" already downloaded
Brand "Apacer" already downloaded
Brand "Apple" already downloaded
Brand "Asgard" already downloaded
Brand "Asura" already downloaded
Brand "ASUS" already downloaded
Brand "ASUS ROG" already downloaded
Brand "Biostar" already downloaded
Brand "CFD Gaming" already downloaded
Brand "Colorful" already downloaded
Brand "Corsair" already downloaded
Brand "Crucial" already downloaded
Brand "DapuStor" already downloaded
Brand "Dera" already downloaded
Brand "Digifast" already downloaded
Brand "Digma" already downloaded
Brand "Drevo" already downloaded
Brand "Enmotus" already downloaded
Brand "Fanxiang" already downloaded
Brand "Galax" already downloaded

## (WORK IN PROGRESS) Get list of downloaded SSD HTML files and add brand name to the prefix

In [None]:
def get_downloaded_html_file_paths(html_input_folder='data/html/'):
  """
  Lists the '*.html' files found directly inside `html_input_folder`.

  The folder is concatenated with the pattern as-is, so it must end with a
  path separator.
  """
  html_pattern = html_input_folder + '*.html'
  return glob.glob(html_pattern)

def get_new_html_file_paths(html_input_folder='data/html/'):
  """
  Returns a list of paths to the new HTML files.

  For every manufacturer and every index from 1 to its SSD count, the path
  '{html_input_folder}{manufacturer_name}_{index}.html' is built; the paths
  that are not among the already downloaded files are returned.

  Args:
    html_input_folder: folder with the downloaded HTML files; it is
      concatenated with the file name as-is, so it must end with a path
      separator.
  """
  # A set makes each membership test below constant time instead of a scan
  # over the whole list of downloaded files.
  downloaded_html_file_paths = set(
    get_downloaded_html_file_paths(html_input_folder=html_input_folder)
  )
  new_html_file_paths = []
  # NOTE(review): get_manufacturer_and_count_list is not defined in the
  # visible part of this notebook - confirm where it comes from. It is
  # expected to yield (manufacturer_name, ssd_count) pairs.
  manufacturer_list = get_manufacturer_and_count_list()
  for manufacturer_name, ssd_count in manufacturer_list:
    for ssd_index in range(1, ssd_count + 1):
      html_file_name = f'{manufacturer_name}_{ssd_index}.html'
      html_file_path = html_input_folder + html_file_name
      if html_file_path not in downloaded_html_file_paths:
        new_html_file_paths.append(html_file_path)
  return new_html_file_paths

### (WORK IN PROGRESS) Get the page of an SSD

In [1]:
def get_html_object_from_file(html_file_path : str):
    """
    Reads the file at `html_file_path` and returns its content parsed by
    BeautifulSoup with the 'html.parser' parser.
    """
    with open(html_file_path, 'r') as source_file:
        return BeautifulSoup(source_file.read(), 'html.parser')


def is_valid_html(html_object):
    """
    Tells whether a parsed page is usable.

    Returns False when the text obtained from `html_object.get_text()`
    contains the marker 'HTTP 429', True otherwise.
    """
    page_text = html_object.get_text()
    if 'HTTP 429' in page_text:
        return False
    return True


def get_url_from_invalid_html_object(html_object):
    """
    Returns the `value` attribute of the element that
    `html_object.find('input')` yields.

    NOTE(review): presumably the 'HTTP 429' page carries the requested URL
    in an <input> element - confirm against a saved page.
    """
    # The original looked the value up but never returned it.
    return html_object.find('input')['value']


def get_url_from_valid_html_object(html_object):
    """
    Returns the `content` attribute of the `<meta property="og:url">`
    element of the page.
    """
    # The original looked the value up but never returned it.
    return html_object.find('meta', {'property': 'og:url'})['content']


def get_url_suffix_from_full_url(url : str):
    """
    Drops the leading 'https://www.techpowerup.com/ssd-specs/' of a full URL.

    Example:
    'https://www.techpowerup.com/ssd-specs/xpg-sx8200-pro-256-gb.d882'
    -> 'xpg-sx8200-pro-256-gb.d882'

    The cut is purely positional: the first 38 characters are removed
    without checking what they are.
    """
    homepage_length = len('https://www.techpowerup.com/')  # 28 characters
    section_length = len('ssd-specs/')                     # 10 characters
    return url[homepage_length + section_length:]


def get_url_suffix_from_partial_url(url : str):
    """
    Drops the leading '/ssd-specs/' of a site-relative URL.

    Example: '/ssd-specs/xpg-sx8200-pro-256-gb.d882' -> 'xpg-sx8200-pro-256-gb.d882'

    The original sliced off 10 characters, but '/ssd-specs/' is 11
    characters long, so the result kept a leading '/'. The prefix is now
    removed by name; 'ssd-specs/' without the leading slash is accepted too.
    A URL that starts with neither prefix is returned unchanged.
    """
    for prefix in ('/ssd-specs/', 'ssd-specs/'):
        if url.startswith(prefix):
            return url[len(prefix):]
    return url


def get_url_suffix_from_html_file_path(html_file_path : str):
    """
    Cuts the URL suffix out of the path of a saved SSD page.

    Example:
    'ssd_htmls/_ssd-specs_xpg-sx8200-pro-256-gb.d882-2024-01-01-12-52-14.html'
    -> 'xpg-sx8200-pro-256-gb.d882'

    The cut is purely positional: the first 21 characters (a 10-character
    folder such as 'ssd_htmls/' plus the 11-character '_ssd-specs_') and the
    last 25 characters (the datetime suffix, presumably of the form
    '-YYYY-MM-DD-HH-MM-SS.html') are removed without being checked.
    """
    leading_length = len('ssd_htmls/') + len('_ssd-specs_')  # 10 + 11
    trailing_length = 25                                     # datetime suffix
    return html_file_path[leading_length:-trailing_length]


def categorise_html_urls(
    html_file_paths : list,
    valid_html_url_suffixes : set,
    invalid_html_url_suffixes : set
):
    """
    Sorts the URL suffix of every saved page into one of two sets.

    Both sets are filled in place (they are output arguments); nothing is
    returned. A suffix can end up in both sets when it has a valid and an
    invalid saved file.

    Args:
        html_file_paths: paths of the saved HTML files to inspect.
        valid_html_url_suffixes: receives the suffixes of pages that pass
            `is_valid_html`.
        invalid_html_url_suffixes: receives the suffixes of pages that fail
            `is_valid_html`.
    """
    for html_file_path in html_file_paths:
        url_suffix = get_url_suffix_from_html_file_path(html_file_path)
        html_object = get_html_object_from_file(html_file_path)
        if is_valid_html(html_object):
            # The URL itself is not used; the lookup is kept because it
            # raises when the page lacks the expected element.
            get_url_from_valid_html_object(html_object)
            valid_html_url_suffixes.add(url_suffix)
        else:
            # Same as above: the result is unused, the lookup is kept.
            get_url_from_invalid_html_object(html_object)
            invalid_html_url_suffixes.add(url_suffix)


def get_html_file_paths(html_files_folder):
    """
    Lists the '*.html' files found directly inside `html_files_folder`.

    The folder is concatenated with the pattern as-is, so it must end with a
    path separator (for example 'data/html/').
    """
    html_pattern = html_files_folder + '*.html'
    return glob.glob(html_pattern)


def get_invalid_html_urls_file_path():
    """
    Builds a file name for the list of invalid URLs, stamped with the
    current local date and time:
    'invalid_htmls_urls-YYYY-MM-DD-HH-MM-SS.txt'.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    return 'invalid_htmls_urls-' + timestamp + '.txt'


def get_valid_htmls_new_folder_path():
    """
    Returns the (fixed, relative) folder that receives the copies of the
    valid HTML files. The trailing '/' is part of the value.
    """
    return 'valid_htmls-2/'


def copy_latest_valid_htmls_to_new_folder(
    valid_html_url_suffixes : set
):
    """
    Copies one saved file per valid URL suffix from 'data/html/' into the
    folder given by `get_valid_htmls_new_folder_path` (created if missing).

    The copy is named '{suffix with "." replaced by "-"}.html'. When several
    saved files share a suffix, the one whose path sorts last is copied;
    with the zero-padded datetime suffix of the file names that is the most
    recent one.

    NOTE(review): the copied file itself is not re-checked for validity; a
    suffix only needs one valid file to be in `valid_html_url_suffixes` -
    confirm that the most recent file is the valid one.

    Args:
        valid_html_url_suffixes: URL suffixes to copy. The set is NOT
            modified; the function tracks its progress on a private copy.
    """
    valid_htmls_new_folder_path = get_valid_htmls_new_folder_path()
    os.makedirs(valid_htmls_new_folder_path, exist_ok=True)

    # Private copy: removing entries from the caller's set would corrupt the
    # counts and set differences the caller computes afterwards.
    suffixes_left_to_copy = set(valid_html_url_suffixes)

    html_file_paths = get_html_file_paths(html_files_folder='data/html/')
    # Reverse order puts the most recent file of every suffix first; glob
    # itself returns paths in arbitrary order.
    for html_file_path in sorted(html_file_paths, reverse=True):
        url_suffix = get_url_suffix_from_html_file_path(html_file_path)
        if url_suffix not in suffixes_left_to_copy:
            continue
        url_suffix_without_dot = url_suffix.replace('.', '-')
        new_file_path = valid_htmls_new_folder_path + url_suffix_without_dot + '.html'
        shutil.copy(
            html_file_path,
            new_file_path
        )
        # One copy per suffix is enough.
        suffixes_left_to_copy.discard(url_suffix)



def check_all_htmls_and_record_urls_to_redownload(
    html_file_paths : list,
    invalid_htmls_urls_file_path : str
):
    """
    Classifies the saved pages, copies the valid ones and records the URL
    suffixes that still need to be downloaded again.

    Steps:
      1. `categorise_html_urls` splits the suffixes into valid and invalid.
      2. `copy_latest_valid_htmls_to_new_folder` copies the valid pages.
      3. Suffixes that have at least one valid page are removed from the
         invalid set.
      4. The remaining invalid suffixes are written, one per line and in
         sorted order, to `invalid_htmls_urls_file_path`.

    Args:
        html_file_paths: paths of the saved HTML files to inspect.
        invalid_htmls_urls_file_path: text file to (over)write with the
            suffixes to download again.
    """
    valid_html_url_suffixes = set()
    invalid_html_url_suffixes = set()
    categorise_html_urls(
        html_file_paths,
        valid_html_url_suffixes,
        invalid_html_url_suffixes
    )
    # Hand over a copy so that the counts and the set difference below see
    # the complete set of valid suffixes, whatever the callee does with its
    # argument.
    copy_latest_valid_htmls_to_new_folder(
        set(valid_html_url_suffixes)
    )
    invalid_html_url_suffixes = invalid_html_url_suffixes - valid_html_url_suffixes
    print(f'Number of valid htmls: {len(valid_html_url_suffixes)}')
    print(f'Number of invalid htmls: {len(invalid_html_url_suffixes)}')
    # A stray bare `return` used to make the block below unreachable, so
    # nothing was ever recorded.
    with open(invalid_htmls_urls_file_path, 'w') as invalid_htmls_urls_file_handle:
        for invalid_html_url_suffix in sorted(invalid_html_url_suffixes):
            invalid_htmls_urls_file_handle.write(invalid_html_url_suffix + '\n')


def check_all_htmls():
    """
    Runs `check_all_htmls_and_record_urls_to_redownload` on every HTML file
    of 'data/html/', with a freshly time-stamped name for the file of
    invalid URLs.
    """
    check_all_htmls_and_record_urls_to_redownload(
        get_html_file_paths(html_files_folder='data/html/'),
        get_invalid_html_urls_file_path()
    )


# check_all_htmls()


In [24]:
def send_request_and_wait(
    url : str,
    min_wait_time_seconds=30,
    max_wait_time_seconds=60,
    max_attempts=None,
    verbose=True,
    min_sleep_time_seconds=None,
    max_sleep_time_seconds=None
):
    """
    Downloads `url`, waiting a random number of seconds before every
    request, and repeats while the response text contains 'HTTP 429'.

    This definition replaces the earlier `send_request_and_wait` of the
    notebook once its cell runs, so it also accepts that definition's
    arguments (`min_sleep_time_seconds`, `max_sleep_time_seconds`,
    `max_attempts`, `verbose`); otherwise callers written against the
    earlier signature would fail with a TypeError.

    Args:
        url: address of the page to download.
        min_wait_time_seconds: lower bound (inclusive) of the wait.
        max_wait_time_seconds: upper bound (inclusive) of the wait.
        max_attempts: maximum number of requests; None (default) retries
            without limit, as before.
        verbose: print progress messages (default True, as before).
        min_sleep_time_seconds: alias of `min_wait_time_seconds`; wins when
            given.
        max_sleep_time_seconds: alias of `max_wait_time_seconds`; wins when
            given.

    Returns:
        The text of the last response. It still contains 'HTTP 429' only if
        `max_attempts` ran out.
    """
    if min_sleep_time_seconds is not None:
        min_wait_time_seconds = min_sleep_time_seconds
    if max_sleep_time_seconds is not None:
        max_wait_time_seconds = max_sleep_time_seconds

    html_text = 'HTTP 429'  # forces the first pass through the loop
    attempts_left = max_attempts
    if verbose:
        print(f'Getting {url}...')
    while 'HTTP 429' in html_text and (attempts_left is None or attempts_left > 0):
        wait_time_seconds = random.randint(min_wait_time_seconds, max_wait_time_seconds)
        if verbose:
            print(f'Waiting {wait_time_seconds} seconds...')
        time.sleep(wait_time_seconds)
        response = requests.get(url)
        try:
            html_text = response.text
        finally:
            # Release the connection even if decoding the body fails.
            response.close()
        if attempts_left is not None:
            attempts_left -= 1
    return html_text



def redownload_failed_attempts(
    failed_urls_suffixes : list,
    start_index : int = 249,
    output_folder : str = 'ssd_htmls-try-1/'
):
    """
    Downloads the SSD page of every URL suffix again and saves each page as
    '{output_folder}{suffix with "." replaced by "-"}.html'.

    Args:
        failed_urls_suffixes: URL suffixes (the part after '/ssd-specs/') to
            download.
        start_index: 0-based position in `failed_urls_suffixes` to start
            from; earlier entries are skipped. The default 249 is the value
            that used to be hard-coded; pass 0 to process the whole list.
        output_folder: folder for the saved pages. It is concatenated with
            the file name as-is, so it must end with a path separator. The
            folder is not created here.
    """
    for i, url_suffix in enumerate(failed_urls_suffixes):
        if i < start_index:
            continue
        url = f'https://www.techpowerup.com/ssd-specs/{url_suffix}'
        html_text = send_request_and_wait(url)
        print(f'Got {i+1} : {url_suffix}!\n')
        # Write to file
        url_suffix_without_dot = url_suffix.replace('.', '-')
        html_file_path = f'{output_folder}{url_suffix_without_dot}.html'
        with open(html_file_path, 'w') as html_file_handle:
            html_file_handle.write(html_text)


def redownload():
    """
    Reads the URL suffixes listed (one per line) in
    'invalid_htmls_urls-2024-01-01-12-52-14.txt' and hands them to
    `redownload_failed_attempts`.
    """
    invalid_htmls_url_file_path = 'invalid_htmls_urls-2024-01-01-12-52-14.txt'
    with open(invalid_htmls_url_file_path, 'r') as invalid_htmls_urls_file_handle:
        # Surrounding whitespace (including the newline) is removed.
        failed_urls_suffixes = [line.strip() for line in invalid_htmls_urls_file_handle]
    redownload_failed_attempts(failed_urls_suffixes)

redownload()

Getting https://www.techpowerup.com/ssd-specs/reletech-p400-evo-500-gb.d1515...
Waiting 58 seconds...
Got 250 : reletech-p400-evo-500-gb.d1515!

Getting https://www.techpowerup.com/ssd-specs/micron-9300-pro-3-8-tb.d1286...
Waiting 48 seconds...
Got 251 : micron-9300-pro-3-8-tb.d1286!

Getting https://www.techpowerup.com/ssd-specs/kingston-kc2000-500-gb.d278...
Waiting 32 seconds...
Got 252 : kingston-kc2000-500-gb.d278!

Getting https://www.techpowerup.com/ssd-specs/dapustor-haishen3-h3100-6-tb.d1650...
Waiting 56 seconds...
Got 253 : dapustor-haishen3-h3100-6-tb.d1650!

Getting https://www.techpowerup.com/ssd-specs/kingston-kc2500-250-gb.d262...
Waiting 49 seconds...
Got 254 : kingston-kc2500-250-gb.d262!

Getting https://www.techpowerup.com/ssd-specs/kodak-x100-120-gb.d1072...
Waiting 50 seconds...
Got 255 : kodak-x100-120-gb.d1072!

Getting https://www.techpowerup.com/ssd-specs/dera-d7436-7-5-tb.d1690...
Waiting 56 seconds...
Got 256 : dera-d7436-7-5-tb.d1690!

Getting https://www.t