In [1]:
from bs4 import BeautifulSoup # parse HTML files
import json # read and write json files
import glob # get the list of file names in a directory

## Convert and combine HTML files to one JSON file

In [2]:
def is_valid_html(html_text):
  """Report whether a page's text is usable.

  A page is treated as unusable when the marker 'HTTP 429' occurs anywhere
  in html_text; in that case False is returned, otherwise True.
  """
  if 'HTTP 429' in html_text:
    return False
  return True


def get_url_from_invalid_html_object(html_object):
  """Return the 'value' attribute of the first <input> tag of a page.

  Args:
    html_object: parsed page exposing a BeautifulSoup-style find().

  Returns:
    The 'value' attribute of the first <input> tag, or None when the page
    has no <input> tag.
    NOTE(review): presumably that attribute holds the originally requested
    URL on an error page -- confirm against a saved sample.
  """
  input_object = html_object.find('input')
  if input_object is None:
    return None
  # The original computed this value but never returned it.
  return input_object['value']


def get_url_from_valid_html_object(html_object):
  """Return the page URL stored in the <meta property="og:url"> tag.

  Args:
    html_object: parsed page exposing a BeautifulSoup-style find().

  Returns:
    The 'content' attribute of the first <meta property="og:url"> tag, or
    None when the page has no such tag.
  """
  meta_og_url_object = html_object.find('meta', {'property': 'og:url'})
  if meta_og_url_object is None:
    return None
  # The original computed this value but never returned it.
  return meta_og_url_object['content']
# ---------------------------------------------




# ---------------------------------------------
def get_table_object(valid_html_object, table_name):
  """Find the table that follows the <h1> heading matching table_name.

  Args:
    valid_html_object: parsed page exposing a BeautifulSoup-style find().
    table_name: text the <h1> heading must match.

  Returns:
    None when no matching <h1> exists; otherwise the result of asking that
    heading for its next sibling <table>.
  """
  heading = valid_html_object.find('h1', text=table_name)
  return None if heading is None else heading.find_next_sibling('table')



def get_table_row_objects(table_object):
  """Return every <tr> element found inside table_object."""
  return table_object.find_all('tr')


def get_row_name(row_object):
  """Return the header text of a table row without its trailing colon.

  Args:
    row_object: a row element exposing a BeautifulSoup-style find().

  Returns:
    The text of the row's first <th> with one trailing ':' removed if
    present, or None when the row has no <th>.
  """
  th_object = row_object.find('th')
  if th_object is None:
    return None
  row_name = th_object.text
  # Only drop the last character when it really is the colon; slicing
  # unconditionally would cut a letter off headers that have none.
  if row_name.endswith(':'):
    row_name = row_name[:-1]
  return row_name


def get_row_text(row_object):
  """Return the cleaned-up text of a row's first <td> cell.

  Cleaning steps, in order: tabs are deleted, newlines become spaces,
  leading and trailing whitespace is stripped, and every occurrence of
  'None' is replaced with 'No' to avoid confusion with NoneType.
  """
  # One pass handles both character-level substitutions.
  whitespace_map = str.maketrans({'\t': None, '\n': ' '})
  cell_text = row_object.find('td').text
  cleaned = cell_text.translate(whitespace_map).strip()
  return cleaned.replace('None', 'No')


def get_table_row_names(row_objects):
  """Collect the distinct row names found in row_objects.

  Each row is passed to get_row_name; rows for which it returns None are
  left out. The result is a set, so order and duplicates are not kept.
  """
  names = {get_row_name(row_object=row) for row in row_objects}
  names.discard(None)  # rows for which no name was found
  return names
# ---------------------------------------------



def insert_row_data(row_object, row_name_to_text_dict):
  """Store one table row in row_name_to_text_dict as name -> text.

  Args:
    row_object: the row element to read.
    row_name_to_text_dict: dict that is updated in place.

  Nothing is stored when get_row_name yields None for the row, or when the
  row name is one of the variable-length list rows below.
  """
  # Each of these rows holds a variable-length list with multiple values.
  # Skip these for now.
  ROW_NAMES_WITH_VARIANTS_LIST = (
    'Variants',
    'Hardware Versions',
  )
  row_name = get_row_name(row_object=row_object)
  if row_name is None:
    # A None key would be written as "null" by json.dump and be
    # overwritten by every later unnamed row, so skip such rows.
    return
  if row_name in ROW_NAMES_WITH_VARIANTS_LIST:
    return
  row_name_to_text_dict[row_name] = get_row_text(row_object=row_object)


def get_table_row_contents(row_objects):
  """Build a dict from the given rows by feeding each one to insert_row_data.

  Returns the dict that insert_row_data filled in; an empty iterable of
  rows gives an empty dict.
  """
  contents_by_name = {}
  for row in row_objects:
    insert_row_data(row_object=row, row_name_to_text_dict=contents_by_name)
  return contents_by_name




# ---------------------------------------------
def get_html_object(file_path):
  """Read an HTML file and parse it, unless it fails is_valid_html.

  Args:
    file_path: path of the HTML file to read.

  Returns:
    A BeautifulSoup object built with the 'html.parser' parser, or None
    when is_valid_html rejects the file's text.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the
  # platform's locale default encoding.
  with open(file_path, 'r', encoding='utf-8') as f:
    html_text = f.read()
  # The file is already closed here; parsing only needs the text.
  if not is_valid_html(html_text=html_text):
    return None
  return BeautifulSoup(html_text, 'html.parser')
# ---------------------------------------------



def get_table_data(html_object, table_name):
  """Return the row data of the table called table_name.

  The table is looked up with get_table_object; when that yields None,
  None is returned. Otherwise the table's rows are converted with
  get_table_row_contents and that dict is returned.
  """
  table = get_table_object(valid_html_object=html_object, table_name=table_name)
  if table is None:
    return None
  rows = get_table_row_objects(table_object=table)
  return get_table_row_contents(row_objects=rows)


def get_ssd_id(valid_html_object):
  """Return the last path segment of the page's og:url as the SSD id.

  Example: <meta property="og:url"
  content="https://www.techpowerup.com/ssd-specs/acer-fa100-1-tb.d333">
  gives "acer-fa100-1-tb.d333".
  """
  # The "content" attribute of the og:url meta tag carries the full URL.
  page_url = valid_html_object.find('meta', {'property': 'og:url'})['content']
  # Keep only what follows the final '/'.
  return page_url.rsplit('/', 1)[-1]


def get_ssd_name(valid_html_object):
  """Return the text of the page's <h1 class="drivename"> heading.

  Example: <h1 class="drivename">Acer FA100 1 TB (Micron B27B)</h1>
  gives "Acer FA100 1 TB (Micron B27B)".
  """
  return valid_html_object.find('h1', {'class': 'drivename'}).text


def get_all_tables_data_of_one_ssd(valid_html_object):
  """Gather the id, the name and all known tables of one SSD page.

  Returns a dict with the keys 'SSD ID' and 'SSD Name', plus one entry per
  table name listed below for which get_table_data returned something
  other than None.
  """
  TABLE_NAMES = (
    'Solid-State-Drive',
    'Performance',
    'Physical',
    'NAND Flash',
    'Controller',
    'DRAM Cache',
    'Features',
    'RAID Controller',
    # 'Same Drive',   # Not needed for now
    # 'Reviews',      # Not needed for now
    # 'Notes',        # Not needed for now
  )

  ssd_tables = {
    'SSD ID': get_ssd_id(valid_html_object=valid_html_object),
    'SSD Name': get_ssd_name(valid_html_object=valid_html_object),
  }
  for table_name in TABLE_NAMES:
    rows = get_table_data(html_object=valid_html_object, table_name=table_name)
    if rows is None:
      continue  # no data for this table name
    ssd_tables[table_name] = rows
  return ssd_tables



def write_data_dict_to_json(data_dict, file_path):
  """Serialize data_dict as JSON (2-space indent) into file_path.

  The file is opened in 'w' mode, so existing content is replaced.
  """
  with open(file_path, 'w') as json_file:
    json.dump(obj=data_dict, fp=json_file, indent=2)


def convert_html_to_json(
    html_input_folder='data/html/',
    json_output_file_path='data/ssd.json'
):
  """Convert every *.html file in a folder into one combined JSON file.

  Args:
    html_input_folder: folder holding the HTML files; a trailing slash is
      optional.
    json_output_file_path: path of the JSON file to write.

  Files for which get_html_object returns None are skipped. The remaining
  pages are stored under their 'SSD ID', so a later page with the same id
  replaces an earlier one. A progress line is printed before every 100th
  processed file.
  """
  import os  # local import: only needed to build the glob pattern

  # os.path.join keeps the pattern correct whether or not the folder ends
  # with a separator; plain concatenation turned 'data/html' into
  # 'data/html*.html', which matches nothing.
  html_input_files_paths = glob.glob(os.path.join(html_input_folder, '*.html'))
  all_ssd_drives = dict()
  processed_html_files_count = 0
  for html_file_path in html_input_files_paths:
    ssd_drive_html_object = get_html_object(file_path=html_file_path)
    if ssd_drive_html_object is None:
      continue
    if processed_html_files_count % 100 == 0:
      print(f'{processed_html_files_count} HTML files processed...  {html_file_path}')
    processed_html_files_count += 1
    ssd_data = get_all_tables_data_of_one_ssd(valid_html_object=ssd_drive_html_object)
    all_ssd_drives[ssd_data['SSD ID']] = ssd_data
  write_data_dict_to_json(
    data_dict=all_ssd_drives,
    file_path=json_output_file_path
  )

# Run the conversion with the default input folder and output file path.
convert_html_to_json()

0 HTML files processed...  data/html/kingston-nv2-250-gb-d1041.html
100 HTML files processed...  data/html/addlink-a93-1-tb-d1610.html
200 HTML files processed...  data/html/hyperx-savage-960-gb-d327.html
300 HTML files processed...  data/html/acer-gm3500-1-tb-d710.html
400 HTML files processed...  data/html/samsung-850-evo-2-tb-d32.html
500 HTML files processed...  data/html/patriot-viper-vp4300-2-tb-d903.html
600 HTML files processed...  data/html/kingspec-nx-series-128-gb-d1087.html
700 HTML files processed...  data/html/seagate-barracuda-1-tb-d239.html
800 HTML files processed...  data/html/corsair-mp600-500-gb-d372.html
900 HTML files processed...  data/html/seagate-barracuda-500-gb-d231.html
1000 HTML files processed...  data/html/msi-spatium-m450-500-gb-d705.html
1100 HTML files processed...  data/html/seagate-game-drive-ps5-nvme-2-tb-d1701.html
1200 HTML files processed...  data/html/seagate-barracuda-120-250-gb-d196.html
1300 HTML files processed...  data/html/pichau-aldrin-a1