In [None]:
# 📦 Install required packages if needed
!pip install lxml pandas

# 📚 Import libraries
from lxml import etree
import pandas as pd

# 🔧 Parsing Function
def parse_uspto_bulk_file(xml_file_path):
    """Parse a concatenated bulk XML file into a list of per-record dicts.

    The file is read as bytes and split on the ``<?xml`` declaration so that
    each piece can be parsed as its own XML document. Pieces that are not
    well-formed XML are reported on stdout and skipped.

    Args:
        xml_file_path: Path of the bulk XML file to read.

    Returns:
        A list of dicts with the keys ``patent_number``,
        ``application_number``, ``title``, ``abstract``, ``claims``,
        ``assignee`` and ``publication_date``. A field whose element is
        missing is ``None``, except ``claims``, which is ``""``.
    """
    with open(xml_file_path, 'rb') as f:
        content = f.read()

    # Everything before the first declaration is not a record, hence [1:].
    raw_docs = content.split(b"<?xml")[1:]
    # The chunks are copies; drop the original buffer so the whole file is
    # not held in memory twice while the records are being parsed.
    del content

    parsed_patents = []
    print(f"🔍 Total patent records found: {len(raw_docs)}")

    for i, doc in enumerate(raw_docs):
        single_xml = b"<?xml" + doc  # restore the header removed by split()

        # Only the parse step is guarded: a malformed record is skipped, but
        # an unexpected error in the extraction code is allowed to surface.
        try:
            root = etree.fromstring(single_xml)
        except etree.XMLSyntaxError as e:
            print(f"[!] Skipping record {i+1}: {e}")
            continue

        parsed_patents.append(_extract_patent_fields(root))

    return parsed_patents


def _normalized_text(element):
    """Return all text below *element* with whitespace collapsed, or None."""
    if element is None:
        return None
    # XPath string() concatenates every descendant text node, so text that
    # sits inside inline child elements is kept; findtext() would stop at
    # the first child element.
    return " ".join(element.xpath("string()").split())


def _extract_patent_fields(root):
    """Build the output dict for one parsed record rooted at *root*."""
    # Patent number and publication date share one document-id element.
    pub_doc_id = root.find('.//publication-reference/document-id')
    patent_number = pub_doc_id.findtext('doc-number') if pub_doc_id is not None else None
    pub_date = pub_doc_id.findtext('date') if pub_doc_id is not None else None

    app_doc_id = root.find('.//application-reference/document-id')
    application_number = app_doc_id.findtext('doc-number') if app_doc_id is not None else None

    title = _normalized_text(root.find('.//invention-title'))

    # Take the whole <abstract>, not just the leading text of its first <p>.
    abstract = _normalized_text(root.find('.//abstract'))

    # Select only the outermost claim-text elements: string() already pulls
    # in nested claim-text and inline references, so selecting the nested
    # ones as well would duplicate their text.
    claim_nodes = root.xpath('.//claims//claim-text[not(ancestor::claim-text)]')
    claims = " ".join(filter(None, (_normalized_text(node) for node in claim_nodes)))

    assignee = root.findtext('.//assignees/assignee/addressbook/orgname')

    return {
        'patent_number': patent_number,
        'application_number': application_number,
        'title': title,
        'abstract': abstract,
        'claims': claims,
        'assignee': assignee,
        'publication_date': pub_date,
    }

# 📁 Parse the bulk file, export the records to CSV, then preview them
csv_file = "uspto.csv"
xml_file_path = r"C:\Users\lenovo\Desktop\sementic search\ipg250722.xml"  # 👈 Change this to your XML path

# The parser returns one dict per record, which maps directly onto rows.
patent_data = parse_uspto_bulk_file(xml_file_path)
df = pd.DataFrame(data=patent_data)

# index=False keeps the positional row index out of the CSV.
df.to_csv(path_or_buf=csv_file, encoding='utf-8', index=False)

print(f"\n✅ Done! Extracted {len(df)} records and saved to '{csv_file}'")
df.head()  # Preview
