In [11]:
import os
from lxml import etree
import csv

# Directory that is scanned for the extracted XML files to parse.
xml_folder = (
    r"C:\Users\lenovo\Desktop\sementic search\extracted_epo_xml"
    r"\docdb_xml_202530_CreateDelete_001\Root\nested_xmls"
)

# Destination CSV; the path is relative, so it lands in the working directory.
output_csv = "epo_docdb_parsed.csv"

# Column order of the CSV; these are also the keys of each parsed record.
csv_headers = [
    "patent_number",
    "title",
    "abstract",
    "inventors",
    "filing_date",
    "publication_date",
    "classifications",
]

def _element_text(element):
    """Return all text contained in *element*, or '' if it is None or empty.

    ``itertext()`` is used instead of ``.text`` because ``.text`` stops at the
    first child element and would truncate content with inline markup.
    """
    if element is None:
        return ''
    return ''.join(element.itertext())


def _all_texts(root, path, ns):
    """Return the non-empty text of every element under *root* matching *path*."""
    texts = (_element_text(el) for el in root.findall(path, namespaces=ns))
    return [text for text in texts if text]


def parse_xml_file(xml_file):
    """Parse one XML file and return its bibliographic fields as a flat dict.

    Args:
        xml_file: Path or file object accepted by ``etree.parse``.

    Returns:
        A dict with the keys ``patent_number``, ``title``, ``abstract``,
        ``inventors``, ``filing_date``, ``publication_date`` and
        ``classifications``. Every value is a string; a field whose element
        is missing or empty is ``''``. Multi-valued fields (inventors,
        classifications) are joined with ``'; '``.

    Raises:
        Any error raised by ``etree.parse`` (unreadable file, malformed XML)
        propagates to the caller.

    Single-valued fields use the first matching element in the file, so a
    file that holds several documents contributes only one record.
    """
    root = etree.parse(xml_file).getroot()

    # NOTE(review): every path below assumes the elements live in this
    # namespace -- confirm against the actual files and adjust if needed.
    ns = {'ep': 'http://www.epo.org/exchange'}

    def first_text(path):
        # Text of the first element matching `path`, '' when there is none.
        return _element_text(root.find(path, namespaces=ns))

    # Abstract: join the paragraphs when present, otherwise fall back to the
    # text of the abstract element itself.
    abstract = ''
    abstract_el = root.find('.//ep:abstract', namespaces=ns)
    if abstract_el is not None:
        paragraph_els = abstract_el.findall('.//ep:p', namespaces=ns)
        if paragraph_els:
            abstract = ' '.join(
                text for text in map(_element_text, paragraph_els) if text
            )
        else:
            abstract = _element_text(abstract_el)

    # Inventor names. The previous path selected applicants/applicant, which
    # put applicant names into the 'inventors' column.
    # NOTE(review): the addressbook/name layout mirrors the former applicant
    # path -- verify it against the schema of the input files.
    inventors = _all_texts(
        root, './/ep:parties/ep:inventors/ep:inventor/ep:addressbook/ep:name', ns
    )

    classifications = _all_texts(
        root, './/ep:classifications-ipc/ep:classification-ipc/ep:text', ns
    )

    return {
        'patent_number': first_text(
            './/ep:publication-reference/ep:document-id/ep:doc-number'
        ),
        'title': first_text('.//ep:invention-title'),
        'abstract': abstract,
        'inventors': '; '.join(inventors),
        'filing_date': first_text(
            './/ep:application-reference/ep:document-id/ep:date'
        ),
        'publication_date': first_text(
            './/ep:publication-reference/ep:document-id/ep:date'
        ),
        'classifications': '; '.join(classifications),
    }

def main():
    """Parse every XML file in ``xml_folder`` and write one CSV row per file.

    The rows go to ``output_csv`` with the columns listed in ``csv_headers``.
    A file that fails to parse is reported on stdout and skipped, so one bad
    file does not abort the whole run.

    Raises:
        OSError: if ``xml_folder`` cannot be listed or ``output_csv`` cannot
            be opened for writing.
    """
    # List the inputs BEFORE opening the output: opening with 'w' truncates
    # the file, so a wrong or missing folder would otherwise wipe out the
    # result of a previous run. Sorting makes the row order reproducible
    # (os.listdir returns entries in arbitrary order), and the lower() call
    # also picks up files named *.XML.
    files = sorted(f for f in os.listdir(xml_folder) if f.lower().endswith('.xml'))
    print(f'Found {len(files)} XML files.')

    failed = 0
    with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_headers)
        writer.writeheader()

        for file in files:
            xml_path = os.path.join(xml_folder, file)
            try:
                data = parse_xml_file(xml_path)
                writer.writerow(data)
            except Exception as e:
                # Deliberately broad: this is the per-file boundary of a
                # best-effort batch job; report the problem and keep going.
                failed += 1
                print(f'Error parsing {file}: {e}')

    if failed:
        print(f'{failed} of {len(files)} files could not be parsed.')
    print(f'Parsing complete. Data saved to {output_csv}')

if __name__ == '__main__':
    main()


Parsing extracted_epo_xml/docdb_xml_202530_CreateDelete_001/Root/nested_xmls\DOCDB-202530-CreateDelete-PubDate20250718AndBefore-AT-0001.xml ...
Parsing extracted_epo_xml/docdb_xml_202530_CreateDelete_001/Root/nested_xmls\DOCDB-202530-CreateDelete-PubDate20250718AndBefore-AU-0001.xml ...
Parsing extracted_epo_xml/docdb_xml_202530_CreateDelete_001/Root/nested_xmls\DOCDB-202530-CreateDelete-PubDate20250718AndBefore-BE-0001.xml ...
Parsing extracted_epo_xml/docdb_xml_202530_CreateDelete_001/Root/nested_xmls\DOCDB-202530-CreateDelete-PubDate20250718AndBefore-CA-0001.xml ...
Parsing extracted_epo_xml/docdb_xml_202530_CreateDelete_001/Root/nested_xmls\DOCDB-202530-CreateDelete-PubDate20250718AndBefore-CN-0001.xml ...
Parsing extracted_epo_xml/docdb_xml_202530_CreateDelete_001/Root/nested_xmls\DOCDB-202530-CreateDelete-PubDate20250718AndBefore-CN-0002.xml ...
Parsing extracted_epo_xml/docdb_xml_202530_CreateDelete_001/Root/nested_xmls\DOCDB-202530-CreateDelete-PubDate20250718AndBefore-CN-0003.