In [2]:
from lxml import etree as ET
import pandas as pd
import re

# Input parameters: namespace prefixes used in the xpaths, and the parsed XSD file.
xml_namespaces = {'xsd': 'http://www.w3.org/2001/XMLSchema'}
xml_data = ET.parse("./xsd/ares_datatypes_v_1.0.2.xsd")

def strip_replace_new_line(tag_value):
    """
    Normalise newlines in a tag value.

    Leading and trailing newline characters are removed; every newline left
    inside the value is replaced with a single space.

    Args:
        tag_value: string tag value (None and empty strings are passed through)
    Returns:
        tag_value without newline characters (string), or the falsy input unchanged
    """
    if not tag_value:
        # Nothing to clean: hand None / '' back exactly as received.
        return tag_value
    return tag_value.strip('\n').replace('\n', ' ')

def set_tag_value(attr_xpath, elem, xml_namespaces):
    """
    Evaluate an xpath relative to ``elem`` and return its first match as cleaned text.

    The xpath is evaluated once. Whether the match is an element or an
    attribute/string result is decided from the match itself, not from the
    presence of '@' in the expression. An xpath such as ``./xsd:element[@name]``
    ('@' only inside a predicate, element result) is therefore read through
    ``.text``, and an xpath ending in ``text()`` is returned as the string it is.

    Args:
        attr_xpath: xpath to evaluate; expected to select elements, attributes or text
        elem: current element in xml (context node of the xpath)
        xml_namespaces: prefix -> URI mapping for the prefixes used in attr_xpath
    Returns:
        value of the first match, cleaned by strip_replace_new_line (string),
        or None when the xpath matches nothing
    """
    matches = elem.xpath(attr_xpath, namespaces=xml_namespaces)
    if not matches:
        return None
    first_match = matches[0]
    # Elements keep their content in .text; attribute and text() results are already strings.
    tag_value = first_match.text if ET.iselement(first_match) else first_match
    return strip_replace_new_line(tag_value)

def populate_table(parent_tag, table_columns, namespaces=None):
    """
    Build a table with one row per element and one column per configured xpath.

    Args:
        parent_tag: iterable of xml elements; each element produces one row
        table_columns: sequence of [column_name, xpath] pairs; each xpath is
            evaluated relative to the current element
        namespaces: optional prefix -> URI mapping used to resolve the xpaths;
            when omitted, the notebook-level ``xml_namespaces`` is used
    Returns:
        table from xml data according to table_columns definition (pandas
        dataframe); its columns are named after the first item of each pair
    """
    if namespaces is None:
        # Fall back to the notebook-wide mapping so two-argument calls keep working.
        namespaces = xml_namespaces
    column_names = [column[0] for column in table_columns]
    rows = [
        [set_tag_value(column[1], elem, namespaces) for column in table_columns]
        for elem in parent_tag
    ]
    return pd.DataFrame(rows, columns=column_names)

Generuje xlsx se 3 záložkami:
1. complex types
1. atributy komplexních typů
1. simple typy

Ignorují se typy enumeration a choice.

In [19]:
# Columns of the complex-type sheet as [column_name, xpath] pairs.
# Each xpath is evaluated relative to one xsd:element selected below, so
# '../../' addresses the node two levels above that element.
xsd_complex_types = [
    ['name', '../../@name'],
    ['annotation', '../../xsd:annotation/xsd:documentation'],
    ['element_name', './@name'],
    ['type', './@type'],
    ['id', './@id'],
    ['minOccurs', './@minOccurs'],
    ['maxOccurs', './@maxOccurs'],
    ['default', './@default'],
]
# One row per xsd:element found under complexType/sequence.
parent_tag = xml_data.xpath(
    './xsd:complexType/xsd:sequence/xsd:element',
    namespaces=xml_namespaces,
)
xsd_complet_type_result = populate_table(parent_tag, xsd_complex_types)


In [20]:
# Columns of the simple-type sheet as [column_name, xpath] pairs,
# evaluated relative to each xsd:simpleType selected below.
xsd_simple_types = [
    ['name', './@name'],
    ['annotation', './xsd:annotation/xsd:documentation'],
]
# One row per xsd:simpleType.
parent_tag = xml_data.xpath(
    './xsd:simpleType',
    namespaces=xml_namespaces,
)
xsd_simple_result = populate_table(parent_tag, xsd_simple_types)

In [24]:
# Columns of the attribute sheet as [column_name, xpath] pairs. Each xpath is
# evaluated relative to one xsd:attribute, so '../@name' reads the name of its parent.
xsd_attributes = [
    ['complextype_name', '../@name'],
    ['name', './@name'],
    ['type', './@type'],
]
# One row per xsd:attribute found directly under a complexType.
parent_tag = xml_data.xpath(
    './xsd:complexType/xsd:attribute',
    namespaces=xml_namespaces,
)
xsd_attributes_result = populate_table(parent_tag, xsd_attributes)

Unnamed: 0,complextype_name,name,type
0,adresa_ARES,dod,xsd:date
1,adresa_ARES,ddo,xsd:date
2,adresa_ARES,zdroj,zdroj_type
3,fyzicka_osoba,dod,xsd:date
4,fyzicka_osoba,ddo,xsd:date
5,pravnicka_osoba,dod,xsd:date
6,pravnicka_osoba,ddo,xsd:date
7,obor_cinnosti,aktivni,ano_ne
8,adresa_dorucovaci,zdroj,zdroj_type
9,spisova_znacka,dod,xsd:date


In [27]:
# Columns for elements of nested complex types as [column_name, xpath] pairs.
# '../../../@name' reads the name attribute three levels above the nested element.
inner_complex_type = [
    ['complextype_name', '../../../@name'],
    ['name', './@name'],
    ['type', './@type'],
]
# One row per xsd:element inside a complexType that is itself declared inside an xsd:element.
parent_tag = xml_data.xpath(
    './xsd:complexType/xsd:sequence/xsd:element'
    '/xsd:complexType/xsd:sequence/xsd:element',
    namespaces=xml_namespaces,
)
inner_complex_type_result = populate_table(parent_tag, inner_complex_type)
inner_complex_type_result

Unnamed: 0,complextype_name,name,type
0,Emise,Druh_akcie,druh_akcie
1,Emise,Hodnota,hodnota_emise
2,Emise,Pocet,pocet_akcii
3,Emise,Text,textType
4,Emise,Podoba,podoba_akcii
5,Odstepny_zavod,ICO,ico
6,Odstepny_zavod,Obchodni_firma,obchodni_firma
7,Odstepny_zavod,Sidlo,adresa_ARES
8,Odstepny_zavod,Cinnosti,cinnosti
9,Odstepny_zavod,Vedouci,angazma


In [21]:
# Write the result tables to one Excel workbook, one sheet per table.
# ExcelWriter is used as a context manager so the workbook is saved and closed
# on exit, including when a to_excel call raises. The previous writer.save()
# call is not available in pandas >= 2.0.
with pd.ExcelWriter('xsd_xls.xlsx') as writer:
    xsd_complet_type_result.to_excel(writer, sheet_name='complex_types')
    xsd_attributes_result.to_excel(writer, sheet_name='complex_types_attributes')
    xsd_simple_result.to_excel(writer, sheet_name='simple_types')