In [4]:
import json
import re
import os
import time
from dicttoxml import dicttoxml
import xml.dom.minidom

# Step 1: Load the HTML file from disk
# html_file_path = r'C:\Users\mjsteenberg\Documents\VoiceMap\Viator\scraped_pages\Gjirokaster_d50367-ttd_sortType_rating.html'  # Replace with your actual HTML file path
html_file_path = os.path.join(os.getcwd(), 'scraped_pages', 'NAtours', 'Acapulco_d629-ttd_sortType_rating.html')


with open(html_file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

# Step 2: Extract JSON data between "productList" and "productFilters"
pattern = r'"productList"\s*:\s*(\{.*?\})\s*,\s*"productFilters"'
match = re.search(pattern, html_content, re.DOTALL)

if not match:
    raise ValueError('Could not find JSON data between "productList" and "productFilters".')

product_list_json_str = match.group(1)

# Step 3: Ensure the extracted JSON is valid
product_list_json_str = '{ "productList": ' + product_list_json_str + ' }'
product_list_json_str = product_list_json_str.replace('\\u002F', '/')

# Step 4: Parse the JSON to get the "productList" data
try:
    product_list_data = json.loads(product_list_json_str)
except json.JSONDecodeError as e:
    print("JSON decoding failed:", e)
    raise

product_list = product_list_data['productList']

# Optional: Print or process the extracted data
total_count = product_list.get('totalCount')
pages = product_list.get('pages')
current_page = product_list.get('currentPage')
products = product_list.get('products')

print(f"Total Count: {total_count}")
print(f"Pages: {pages}")
print(f"Current Page: {current_page}")
print(f"Number of Products: {len(products)}")

# Step 5: Extract the first part of the filename until the first underscore
# Get the base filename without the directory path
base_filename = os.path.basename(html_file_path)

# Remove the file extension
filename_without_extension = os.path.splitext(base_filename)[0]

# Extract the part before the first underscore
location_name = filename_without_extension.split('_')[0]

# Step 6: Add the extracted location name as a value under a tag called "Destination"
product_list['Destination'] = location_name

# Step 7: Add the filename and file creation datetime to the product list data

# Get the file creation time
creation_time = os.path.getctime(html_file_path)
# Convert it to a readable format
creation_datetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(creation_time))

# Add the filename and creation datetime to the product list
product_list['FileName'] = base_filename
product_list['FileCreationDateTime'] = creation_datetime

# Step 8: Convert the entire "productList" to XML using dicttoxml
xml_bytes = dicttoxml(product_list, custom_root='ProductList', attr_type=False)
xml_str = xml_bytes.decode('utf-8')

# Pretty-print the XML
dom = xml.dom.minidom.parseString(xml_str)
pretty_xml_str = dom.toprettyxml()

# Step 9: Save the XML data to a file
xml_file_path = 'product_list.xml'  # Replace with your desired output XML file path

with open(xml_file_path, 'w', encoding='utf-8') as file:
    file.write(pretty_xml_str)

print(f"XML data has been saved to '{xml_file_path}'")


Total Count: 131
Pages: 6
Current Page: 1
Number of Products: 24
XML data has been saved to 'product_list.xml'


In [7]:
import json
import re
import os
import time
from dicttoxml import dicttoxml
import xml.dom.minidom

# Directory containing HTML files
# html_directory = r'C:\Users\mjsteenberg\Documents\VoiceMap\Viator\scraped_pages'  # Replace with your directory path
html_directory = '/Users/mjsteenberg/Desktop/Desktop - MJ\'s MacBook Air - 1/VM/viator-scraper/scraped_pages/NAtours'

# Directory to save XML files
# xml_output_directory = r'C:\Users\mjsteenberg\Documents\VoiceMap\Viator\scraped_pages\XML'  # Replace with your desired output directory
xml_output_directory = '/Users/mjsteenberg/Desktop/Desktop - MJ\'s MacBook Air - 1/VM/viator-scraper/XML'

# Ensure output directory exists
if not os.path.exists(xml_output_directory):
    os.makedirs(xml_output_directory)

# Loop through all HTML files in the directory
for filename in os.listdir(html_directory):
    if filename.endswith('.html'):
        html_file_path = os.path.join(html_directory, filename)
        
        # Load the HTML file
        with open(html_file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()
        
        # Extract JSON data between "productList" and "productFilters"
        pattern = r'"productList"\s*:\s*(\{.*?\})\s*,\s*"productFilters"'
        match = re.search(pattern, html_content, re.DOTALL)
        
        if not match:
            print(f'Could not find JSON data in file: {filename}')
            continue  # Skip to the next file
        
        product_list_json_str = match.group(1)
        
        # Ensure the extracted JSON is valid
        product_list_json_str = '{ "productList": ' + product_list_json_str + ' }'
        product_list_json_str = product_list_json_str.replace('\\u002F', '/')
        
        # Parse the JSON to get the "productList" data
        try:
            product_list_data = json.loads(product_list_json_str)
        except json.JSONDecodeError as e:
            print(f"JSON decoding failed for file {filename}: {e}")
            continue  # Skip to the next file
        
        product_list = product_list_data['productList']
        
        # Extract metadata from filename and file properties
        # Get the base filename without the directory path
        base_filename = os.path.basename(html_file_path)
        
        # Remove the file extension
        filename_without_extension = os.path.splitext(base_filename)[0]
        
        # Extract the part before the first underscore
        location_name = filename_without_extension.split('_')[0]
        
        # Get the file creation time
        creation_time = os.path.getctime(html_file_path)
        # Convert it to a readable format
        creation_datetime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(creation_time))
        
        # Add the extracted metadata to the product list
        product_list['Destination'] = location_name
        product_list['FileName'] = base_filename
        product_list['FileCreationDateTime'] = creation_datetime
        
        # Convert the entire "productList" to XML
        xml_bytes = dicttoxml(product_list, custom_root='ProductList', attr_type=False)
        xml_str = xml_bytes.decode('utf-8')
        
        # Pretty-print the XML
        dom = xml.dom.minidom.parseString(xml_str)
        pretty_xml_str = dom.toprettyxml()
        
        # Save the XML data to a file
        xml_filename = filename_without_extension + '.xml'
        xml_file_path = os.path.join(xml_output_directory, xml_filename)
        
        with open(xml_file_path, 'w', encoding='utf-8') as file:
            file.write(pretty_xml_str)
        
        print(f"Processed and saved XML for file '{filename}' as '{xml_filename}'")


Processed and saved XML for file 'Homer_d4359-ttd_sortType_rating.html' as 'Homer_d4359-ttd_sortType_rating.xml'
Processed and saved XML for file 'Morelos_d23884-ttd_sortType_rating.html' as 'Morelos_d23884-ttd_sortType_rating.xml'
Processed and saved XML for file 'San-Diego_d736-ttd_sortType_rating.html' as 'San-Diego_d736-ttd_sortType_rating.xml'
Processed and saved XML for file 'Mismaloya_d51522-ttd_sortType_rating.html' as 'Mismaloya_d51522-ttd_sortType_rating.xml'
Processed and saved XML for file 'Morelia_d50525-ttd_sortType_rating.html' as 'Morelia_d50525-ttd_sortType_rating.xml'
Processed and saved XML for file 'California_d272-ttd_sortType_rating.html' as 'California_d272-ttd_sortType_rating.xml'
Processed and saved XML for file 'Temecula_d24203-ttd_sortType_rating.html' as 'Temecula_d24203-ttd_sortType_rating.xml'
Processed and saved XML for file 'Buena-Vista_d22370-ttd_sortType_rating.html' as 'Buena-Vista_d22370-ttd_sortType_rating.xml'
Processed and saved XML for file 'Sant