In [75]:
import xml.etree.ElementTree as ET
import pandas as pd

def _child_text(parent, tag):
    """Return the text of the direct child ``tag`` of ``parent``, or None if the parent or child is missing."""
    if parent is None:
        return None
    child = parent.find(tag)
    return child.text if child is not None else None


def parse_xml(file_path):
    """Parse an XML file into a DataFrame with one row per ``AuVehicle`` element.

    Every ``AuVehicle`` element found anywhere below the document root yields
    one row. Fields listed under the ``"AuVehicle"`` key are read from direct
    children of the vehicle element (this includes ``AuRegisteredStateCd``);
    fields listed under any other key are read from direct children of the
    like-named container element nested directly inside the vehicle.

    Parameters
    ----------
    file_path : str, path-like, or file object
        Anything accepted by ``xml.etree.ElementTree.parse``.

    Returns
    -------
    pandas.DataFrame
        One row per vehicle, columns in the order the fields are declared
        below. Values are the raw element text (``str``) or ``None`` when the
        element, or its container, is absent. If the document contains no
        ``AuVehicle`` element the frame is empty but still has every column.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If the file is not well-formed XML.
    OSError
        If the file cannot be opened.
    """
    root = ET.parse(file_path).getroot()

    # Container tag -> fields read from that container's direct children.
    # The "AuVehicle" entry refers to the vehicle element itself.
    field_mappings = {
        "AuVehicle": ["AuRegisteredStateCd", "AuVehicleNo", "AuCityCntyTx", "AuZipCd_201010", "AuPrincGarAutoTerrCd", "AuTownCd_201012",
                      "AuCountyCd", "AuTaxDistCd_201009", "AuVehYear", "AuVehDesc1Tx", "AuVehDesc2Tx",
                      "AuVehicleIdNo", "AuVehTypeCd", "CompGroupNo"],
        "AuVehClassInput": ["AuClassCd_202006", "AuSecondClassCd", "AuZoneTerritoryCd_202022", "AuFarthestTermZoneCd",
                            "AuMisc4Info_202036", "AuMisc2Info", "AuActualWeightAmt"],
        "AuVehSelCovgInput": ["AuPIPCovgInd", "AuAddlPIPNFltCovgInd", "AuMedPayCovgInd", "AuUMCovgInd_203016",
                              "AuVehUMPDCovgInd", "AuOTCCovgTypeCd", "AuOTCDedAmt_203024", "AuOTCValCd",
                              "AuNewVehCostAmt", "AuCollCovgInd", "AuCollDedAmt_203035", "AuMiscCollInfo",
                              "AuVehOptCovLst", "AuVehMiscPIPCovgInd", "AuLeasVehAddlInsInd", "AuTowingCovgInd"],
        "AuVehRentReimInput": ["AuRentalCovgCd", "AuRentalReimbMaxAmt", "AuMaxDaysNo"],
        "AuVehOvrdInput": ["AuCompDedOvrdPct", "AuCollDedOvrdPct"],
    }

    # Fixed column schema, so the result has the same columns even when the
    # document contains no vehicles (a bare DataFrame([]) would have none).
    columns = [field for fields in field_mappings.values() for field in fields]

    # ElementTree exposes element text as str (or None), so code-like values
    # such as zip or county codes keep their leading zeros without any
    # explicit string coercion.
    records = []
    for vehicle in root.findall(".//AuVehicle"):
        record = {}
        for tag, fields in field_mappings.items():
            # A missing nested container leaves all of its fields as None.
            container = vehicle if tag == "AuVehicle" else vehicle.find(tag)
            for field in fields:
                record[field] = _child_text(container, field)
        records.append(record)

    return pd.DataFrame(records, columns=columns)

# Example Usage
# df = parse_xml("sample.xml")
# print(df.head())


In [76]:
# Input XML file to parse.
# NOTE(review): absolute, machine-specific Windows path — consider a configurable
# data directory so the notebook can run on another machine.
path = r'C:\Users\ez4ke.KDAWG\Desktop\astrus\samples\HMS\2023 - Rev 002 - AOS.xml'
# Alternative sample file; uncomment to parse it instead of the one above.
# path = r'C:\Users\ez4ke.KDAWG\Desktop\astrus\samples\Dewitt\DeWittCustomInc2024Final-AutoSchedule.xml'

# Build the one-row-per-AuVehicle DataFrame and show it as the cell output.
df = parse_xml(path)
df


Unnamed: 0,AuRegisteredStateCd,AuVehicleNo,AuCityCntyTx,AuZipCd_201010,AuPrincGarAutoTerrCd,AuTownCd_201012,AuCountyCd,AuTaxDistCd_201009,AuVehYear,AuVehDesc1Tx,...,AuMiscCollInfo,AuVehOptCovLst,AuVehMiscPIPCovgInd,AuLeasVehAddlInsInd,AuTowingCovgInd,AuRentalCovgCd,AuRentalReimbMaxAmt,AuMaxDaysNo,AuCompDedOvrdPct,AuCollDedOvrdPct
0,CA,1,VISTA,92081,142,,073,,2018,CHEVROLET,...,N,NNNNNNNNNN,N,N,N,,,,,
1,CA,2,VISTA,92081,142,,073,,2018,CHEVROLET,...,N,NNNNNNNNNN,N,N,N,,,,,
2,CA,3,VISTA,92081,142,,073,,2018,CHEVROLET,...,N,NNNNNNNNNN,N,N,N,,,,,
3,CA,4,VISTA,92081,142,,073,,2023,FORD,...,N,NNNNNNNNNN,N,N,N,,,,,
4,CA,5,VISTA,92081,142,,073,,2012,FORD,...,N,NNNNNNNNNN,N,N,N,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,CA,104,VISTA,92081,142,,073,,2020,PJ,...,N,NNNNNNNNNN,N,N,N,,,,,
104,CA,105,VISTA,92081,142,,073,,2022,SOLAR GUIDE LITE,...,N,NNNNNNNNNN,N,N,N,,,,,
105,CA,106,VISTA,92081,142,,073,,2001,TOW DOLLY,...,N,NNNNNNNNNN,N,N,N,,,,,
106,CA,107,VISTA,92081,142,,073,,2000,TRAILEZE,...,N,NNNNNNNNNN,N,N,N,,,,,


In [77]:
# Print per-column dtypes and non-null counts to spot fields that are always empty.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108 entries, 0 to 107
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   AuRegisteredStateCd       108 non-null    object
 1   AuVehicleNo               108 non-null    object
 2   AuCityCntyTx              108 non-null    object
 3   AuZipCd_201010            108 non-null    object
 4   AuPrincGarAutoTerrCd      108 non-null    object
 5   AuTownCd_201012           0 non-null      object
 6   AuCountyCd                108 non-null    object
 7   AuTaxDistCd_201009        1 non-null      object
 8   AuVehYear                 108 non-null    object
 9   AuVehDesc1Tx              108 non-null    object
 10  AuVehDesc2Tx              108 non-null    object
 11  AuVehicleIdNo             108 non-null    object
 12  AuVehTypeCd               108 non-null    object
 13  CompGroupNo               0 non-null      object
 14  AuClassCd_202006          

In [78]:
# Copy the whole DataFrame to the system clipboard (side effect only; no cell output).
df.to_clipboard()