In [1]:
# scans all xml files from spcified folder Viator\scraped_pages\XML\NAtours and creates a single consolidated csv file for further analysis

import xml.etree.ElementTree as ET
import csv
import glob
from pathlib import Path

def flatten_duration(duration_elem):
    if duration_elem is None:
        return "", "", ""
    
    from_duration = duration_elem.find('fromDuration')
    if from_duration is not None:
        from_days = from_duration.findtext('days', '0')
        from_hours = from_duration.findtext('hours', '0')
        from_minutes = from_duration.findtext('minutes', '0')
        return from_days, from_hours, from_minutes
    return "", "", ""

def extract_xml_to_csv(input_folder, output_file):
    # Define the CSV headers based on the XML structure
    headers = [
        'total_count', 'total_pages', 'current_page',  # Added these fields
        'code', 'title', 'description', 'image_src', 'image_alt',
        'rating_score', 'rating_exact_score', 'rating_review_count',
        'location', 'category', 'retail_price_amount', 'retail_price_currency',
        'discounted_price_amount', 'discounted_price_currency',
        'url', 'is_private_tour', 'languages',
        'duration_days', 'duration_hours', 'duration_minutes',
        'has_free_cancellation', 'latitude', 'longitude',
        'max_travelers_allowed'
    ]

    # Track total items processed
    total_items = 0
    files_processed = 0

    # Open the CSV file for writing
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=headers)
        writer.writeheader()

        # Process each XML file in the input folder
        for xml_file in glob.glob(f"{input_folder}/*.xml"):
            try:
                tree = ET.parse(xml_file)
                root = tree.getroot()
                
                # Extract pagination information
                total_count = root.findtext('totalCount', '')
                total_pages = root.findtext('pages', '')
                current_page = root.findtext('currentPage', '')
                
                # Get total count for this file
                items_in_file = len(root.findall('.//products/item'))
                print(f"\nProcessing {xml_file}")
                print(f"Found {items_in_file} items in this file (Page {current_page} of {total_pages})")
                
                # Process each product item
                for index, item in enumerate(root.findall('.//products/item'), 1):
                    # Extract data into a dictionary
                    product_data = {
                        # Add pagination information to each row
                        'total_count': total_count,
                        'total_pages': total_pages,
                        'current_page': current_page,
                        
                        'code': item.findtext('code', ''),
                        'title': item.findtext('title', ''),
                        'description': item.findtext('description', ''),
                        'image_src': item.find('.//image/src').text if item.find('.//image/src') is not None else '',
                        'image_alt': item.find('.//image/alt').text if item.find('.//image/alt') is not None else '',
                        'rating_score': item.findtext('.//rating/score', ''),
                        'rating_exact_score': item.findtext('.//rating/exactScore', ''),
                        'rating_review_count': item.findtext('.//rating/reviewCount', ''),
                        'location': item.findtext('location', ''),
                        'category': item.findtext('category', ''),
                        'retail_price_amount': item.findtext('.//retailPrice/amount', ''),
                        'retail_price_currency': item.findtext('.//retailPrice/currencyCode', ''),
                        'discounted_price_amount': item.findtext('.//discountedPrice/amount', ''),
                        'discounted_price_currency': item.findtext('.//discountedPrice/currencyCode', ''),
                        'url': item.findtext('url', ''),
                        'is_private_tour': item.findtext('isPrivateTour', ''),
                        'languages': ','.join([lang.text for lang in item.findall('.//languages/item')]),
                        'has_free_cancellation': item.findtext('.//behaviours/hasFreeCancellation', ''),
                        'latitude': item.findtext('.//geolocation/latitude', ''),
                        'longitude': item.findtext('.//geolocation/longitude', ''),
                        'max_travelers_allowed': item.findtext('maxTravelersAllowed', '')
                    }
                    
                    # Extract duration information
                    days, hours, minutes = flatten_duration(item.find('.//displayDuration'))
                    product_data.update({
                        'duration_days': days,
                        'duration_hours': hours,
                        'duration_minutes': minutes
                    })
                    
                    # Write the row to CSV
                    writer.writerow(product_data)
                    
                    if index % 100 == 0:  # Progress update every 100 items
                        print(f"Processed {index}/{items_in_file} items in current file")
                
                total_items += items_in_file
                files_processed += 1
                print(f"Completed processing {xml_file}")
                
            except Exception as e:
                print(f"Error processing {xml_file}: {str(e)}")

    print(f"\nFinal Summary:")
    print(f"Files processed: {files_processed}")
    print(f"Total items processed: {total_items}")
    print(f"Output saved to: {output_file}")


if __name__ == "__main__":
    # Usage
    # input_folder =  r'C:\Users\mjsteenberg\Documents\VoiceMap\Viator\scraped_pages\XML\NAtours'  # Replace with your input folder path
    input_folder = '/Users/mjsteenberg/Desktop/Desktop - MJ\'s MacBook Air - 1/VM/viator-scraper/scraped_pages/XML'
    output_file = "NorthAmericaViatorProducts.csv"  # Replace with your desired output file path

    extract_xml_to_csv(input_folder, output_file)



Processing /Users/mjsteenberg/Desktop/Desktop - MJ's MacBook Air - 1/VM/viator-scraper/scraped_pages/XML/Whistler_d618-ttd_sortType_rating.xml
Found 24 items in this file (Page 1 of 5)
Completed processing /Users/mjsteenberg/Desktop/Desktop - MJ's MacBook Air - 1/VM/viator-scraper/scraped_pages/XML/Whistler_d618-ttd_sortType_rating.xml

Processing /Users/mjsteenberg/Desktop/Desktop - MJ's MacBook Air - 1/VM/viator-scraper/scraped_pages/XML/Ontario_d263-ttd_sortType_rating.xml
Found 24 items in this file (Page 1 of 36)
Completed processing /Users/mjsteenberg/Desktop/Desktop - MJ's MacBook Air - 1/VM/viator-scraper/scraped_pages/XML/Ontario_d263-ttd_sortType_rating.xml

Processing /Users/mjsteenberg/Desktop/Desktop - MJ's MacBook Air - 1/VM/viator-scraper/scraped_pages/XML/Mobile_d4377-ttd_sortType_rating.xml
Found 15 items in this file (Page 1 of 1)
Completed processing /Users/mjsteenberg/Desktop/Desktop - MJ's MacBook Air - 1/VM/viator-scraper/scraped_pages/XML/Mobile_d4377-ttd_sortTy