XML is an inherently hierarchical data format, and the most natural way to represent it is with a tree. ET has two classes for this purpose - ElementTree represents the whole XML document as a tree, and Element represents a single node in this tree. Interactions with the whole document (reading and writing to/from files) are usually done on the ElementTree level. Interactions with a single XML element and its sub-elements are done on the Element level.

In [1]:
import xml.etree.ElementTree as ET
from xml import etree
import html

In [2]:
import os

relevant_path = '.'
included_extensions = ['xml']


def list_files_with_extensions(directory, extensions):
    """Return the sorted names of the files in *directory* with a wanted extension.

    Args:
        directory: Path of the directory to scan (not recursive).
        extensions: Iterable of extensions, with or without a leading dot.

    Returns:
        A sorted list of bare file names (no directory component).

    The real extension is compared rather than the tail of the name, so a
    file called ``notxml`` is not mistaken for an XML file.  Directories are
    left out because the names are later handed to a parser, and the result
    is sorted because ``os.listdir`` returns entries in arbitrary order.
    """
    wanted = {ext.lstrip('.') for ext in extensions}
    return sorted(
        name for name in os.listdir(directory)
        if os.path.splitext(name)[1].lstrip('.') in wanted
        and os.path.isfile(os.path.join(directory, name))
    )


file_names = list_files_with_extensions(relevant_path, included_extensions)
file_names

['MIATEmini.xml', 'Transgenicmini.xml']

In [3]:
# Destination of the merged configuration.
out_path = './outputs/studySample.xml'

# Prefix that is stripped from parsed tags to obtain their bare names.
ns = '{http://www.ebi.ac.uk/bii/isatab_configuration#}'

# Tags that are copied into the output only the first time they are seen.
exceptions = ['measurement', 'technology']

# Attributes of everything already copied, keyed by tag or by field key.
Dict = dict()

# Skeleton of the output document: the root element and the single
# 'isatab-configuration' child that receives the merged fields.
root_attributes = {'xmlns': 'http://www.ebi.ac.uk/bii/isatab_configuration#'}
table_attributes = {'table-name': 'studySample'}
new_xml = ET.Element('isatab-config-file', root_attributes)
config = ET.SubElement(new_xml, 'isatab-configuration', table_attributes)

def _append_field_copy(parent, field):
    """Append a flattened copy of *field* to *parent* and return the copy.

    The copy carries the bare (namespace-free) tag and the attributes of
    *field*.  Every descendant with text becomes a direct, attribute-less
    child of the copy holding the stripped text; 'description' and
    'default-values' text is wrapped in a literal CDATA marker.

    Args:
        parent: Element that receives the copy.
        field: Parsed element to copy.

    Returns:
        The newly created element.
    """
    copy = ET.SubElement(parent, field.tag.replace(ns, ''), field.attrib)
    for node in field.iter():
        node_tag = node.tag.replace(ns, '')
        # iter() yields the element itself first; copying it would nest the
        # element inside its own copy.
        if node is field or node_tag == 'field' or node.text is None:
            continue
        text = node.text.strip()
        sub = ET.SubElement(copy, node_tag)
        # NOTE(review): the ISA configuration schema appears to name this
        # element 'default-value' -- confirm 'default-values' is intended.
        if node_tag in ('description', 'default-values'):
            # The marker is plain text here, so the serializer will escape it.
            sub.text = "<![CDATA[" + text + "]]>"
        else:
            sub.text = text
    return copy


# The 'Sample Name' field is held back and appended after all other fields.
# The element itself is remembered (not its index), so it stays correct
# whichever input file it came from.
sample_name_field = None

for file in file_names:
    tree = ET.parse(file)
    root = tree.getroot()
    child = root[0]  # first child of the root holds the field definitions

    for field in child:
        mTag = field.tag.replace(ns, '')

        # Tags listed in `exceptions` are copied once, without their children.
        if mTag in exceptions:
            if mTag not in Dict:
                ET.SubElement(config, mTag, field.attrib)
                Dict[mTag] = field.attrib
            continue

        if mTag == 'structured-field':
            continue

        # The value of the first attribute identifies the element.
        mAttrib = list(field.attrib)[0]

        if mTag != 'unit-field':
            key = field.attrib[mAttrib]
        else:
            # Unit fields are told apart by the text of their second child.
            key = field[1].text

        if field.attrib[mAttrib] == 'Sample Name':
            sample_name_field = field
            continue

        # Copy each distinct field only the first time it is encountered.
        if key not in Dict:
            Dict[key] = field.attrib
            _append_field_copy(config, field)

# Nothing to append when no input defined a 'Sample Name' field.
if sample_name_field is not None:
    _append_field_copy(config, sample_name_field)

# Close the configuration with the two structured fields.  The tag is
# 'structured-field', the same tag that is skipped when reading the inputs.
ET.SubElement(config, 'structured-field', {'name': 'characteristics'})
ET.SubElement(config, 'structured-field', {'name': 'factors'})

# ElementTree.write() does not create missing directories.
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

tree = ET.ElementTree(new_xml)
tree.write(out_path)

import re


def restore_cdata_sections(xml_text):
    """Turn escaped CDATA placeholders in *xml_text* back into CDATA sections.

    The CDATA markers were stored as ordinary element text, so the serializer
    wrote them as ``&lt;![CDATA[ ... ]]&gt;`` and escaped their content.
    Only those placeholders and their content are unescaped; entities in
    attributes and in other text are left alone, because unescaping them
    would make the document malformed.

    Args:
        xml_text: Serialized XML document.

    Returns:
        The document with real ``<![CDATA[ ... ]]>`` sections.
    """
    escaped_cdata = re.compile(r'&lt;!\[CDATA\[(.*?)\]\]&gt;', re.DOTALL)
    # A function is used as replacement so backslashes in the content are
    # not interpreted as group references.
    return escaped_cdata.sub(
        lambda match: '<![CDATA[' + html.unescape(match.group(1)) + ']]>',
        xml_text,
    )


with open(out_path, 'r', encoding='utf-8') as fh:
    filedata = fh.read()

filedata = restore_cdata_sections(filedata)

# The file has no XML declaration, so it must be written as UTF-8.
with open(out_path, 'w', encoding='utf-8') as fh:
    fh.write(filedata)