In [1]:
from bs4 import BeautifulSoup

# XML (Extensible Markup Language) 

XML is a markup language used to store structured data. The pandas data analysis library provides functions to read and write data in most common file formats.

For example, it includes read_csv() and to_csv() for interacting with CSV files. However, pandas versions before 1.3 do not include any methods to read and write XML files (pandas 1.3 added read_xml() and DataFrame.to_xml()).

Without such a built-in method, there is no easy way in pandas to convert an XML file to a DataFrame, so you need to do it yourself. You can do it by using the etree module in Python.

You first need to parse the XML file and create a list of columns for the DataFrame, then extract the useful information from the XML file and add it to a pandas DataFrame.

# Python provides many libraries for working with xml files such as 

lxml: It is a clean, fast and strict library for dealing with xml files. It's also the most accepted library. It also supports xpath and xslt.

BeautifulSoup: It is flexible but a bit slower than lxml. The good thing is that if your XML markup is messed up, it will try to correct it. It's perfect for dealing with web-scraped data in HTML format. For clean XML, it might be too slow.


xml: It has native integration in Python and is fast and clean, but does not support full XPath or XSLT.

# Reading with xml.etree.ElementTree

The xml.etree.ElementTree module comes built-in with Python. It provides functionality for parsing and creating XML documents. ElementTree represents the XML document as a tree. We can move across the document using nodes which are elements and sub-elements of the XML file.

In this approach, we read the file content into a variable and use ET.XML() to parse the XML document from that string. We loop across each child and sub-child, maintaining a list of the data they contain, while recording the child tags as the DataFrame columns. Then we write this data into a DataFrame.

Note: When reading data from XML, we have to transpose the DataFrame, as the data list's sub-elements are written in columns.

In [2]:
#From file to XML object
from lxml import etree

# Open the file in binary mode so that the XML parser receives the raw bytes,
# rather than text already decoded by Python with the platform's default
# text encoding.
with open("C:\\Users\\hcluser1\\DataXML\\books.xml", "rb") as fxml:
    parsed = etree.parse(fxml)

# Show the parsed tree object together with the attributes it exposes.
print(parsed, dir(parsed))

<lxml.etree._ElementTree object at 0x000001F241BBBA88> ['__class__', '__copy__', '__deepcopy__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__pyx_vtable__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_setroot', 'docinfo', 'find', 'findall', 'findtext', 'getelementpath', 'getiterator', 'getpath', 'getroot', 'iter', 'iterfind', 'parse', 'parser', 'relaxng', 'write', 'write_c14n', 'xinclude', 'xmlschema', 'xpath', 'xslt']


In [6]:
#we obtained an instance of type lxml.etree._ElementTree
# New etree parser, with empty text nodes removed

# Parser configured with remove_blank_text=True (blank text nodes are dropped).
parser = etree.XMLParser(remove_blank_text=True)

# Binary mode hands the parser the raw bytes instead of text that Python has
# already decoded with the platform's default text encoding.
with open("C:\\Users\\hcluser1\\DataXML\\books.xml", "rb") as file:
    parsed = etree.parse(file, parser)

print(parsed)

<lxml.etree._ElementTree object at 0x000001F241C043C8>


# Traversing the Parsed Tree
 To visit all of the children in order, use iter() to create a generator that iterates over the ElementTree instance.

In [7]:
from xml.etree import ElementTree

    
# Read the XML as bytes so the parser, not Python's locale-dependent text
# layer, is responsible for decoding the document.
with open('C:\\Users\\hcluser1\\DataXML\\books.xml', 'rb') as f:
    tree = ElementTree.parse(f)

# tree.iter() yields the root and then every descendant in document order;
# print each element's tag and attribute dictionary.
for node in tree.iter():
    print(node.tag, node.attrib)
    print("-----")

catalog {}
-----
book {'id': 'bk101'}
-----
author {}
-----
title {}
-----
genre {}
-----
book {'part': '2'}
-----
price {}
-----
publish_date {}
-----
description {}
-----
book {'id': 'bk102'}
-----
author {}
-----
title {}
-----
genre {}
-----
price {}
-----
publish_date {}
-----
description {}
-----
book {'id': 'bk103'}
-----
author {}
-----
title {}
-----
genre {}
-----
price {}
-----
publish_date {}
-----
description {}
-----
book {'id': 'bk104'}
-----
author {}
-----
title {}
-----
genre {}
-----
price {}
-----
publish_date {}
-----
description {}
-----
book {'id': 'bk105'}
-----
author {}
-----
title {}
-----
genre {}
-----
price {}
-----
publish_date {}
-----
description {}
-----
book {'id': 'bk106'}
-----
author {}
-----
title {}
-----
genre {}
-----
price {}
-----
publish_date {}
-----
description {}
-----
book {'id': 'bk107'}
-----
author {}
-----
title {}
-----
genre {}
-----
price {}
-----
publish_date {}
-----
description {}
-----
book {'id': 'bk108'}
-----
author {}
----

In [8]:
# only getting 5 elements from the generator
from itertools import islice

# Cap the traversal at the first five elements produced by tree.iter().
for node in islice(tree.iter(), 5):
    # One print call per element: the tag/attribute pair, then the separator
    # line supplied through `end`.
    print(node.tag, node.attrib, end="\n-----\n")
   

catalog {}
-----
book {'id': 'bk101'}
-----
author {}
-----
title {}
-----
genre {}
-----


In [9]:
from xml.etree import ElementTree

    
# Build an ElementTree by parsing the XML file. Binary mode leaves decoding
# of the document to the XML parser.
with open('C:\\Users\\hcluser1\\DataXML\\books.xml', 'rb') as f:
    tree = ElementTree.parse(f)

# getroot() returns the root of the tree as an Element object.
root = tree.getroot()

# Print the author of every direct child of the root.
for item in root:
    # find() returns None when the child has no <author>; test that result
    # (not an unrelated variable) before reading .text.
    author = item.find("author")
    Author_name = author.text if author is not None else None
    print(Author_name)

Gambardella, Matthew
Ralls, Kim
Corets, Eva
Corets, Eva
Corets, Eva
Randall, Cynthia
Thurman, Paula
Knorr, Stefan
Kress, Peter
O'Brien, Tim
O'Brien, Tim
Galos, Mike


In [10]:
import pandas as pd 
import xml.etree.ElementTree as et 

# Parsing of "Students.xml" starts at the root of the tree, which contains
# the entire data structure.
xtree = et.parse("C:\\Users\\hcluser1\\DataXML\\Students.xml")
xroot = xtree.getroot()

df_cols = ["name", "email", "grade", "age"]
rows = []

# Each child of the root becomes one DataFrame row. The student's name is an
# attribute of the element, read through .attrib like a dictionary; email,
# grade and age are sub-elements whose text is read after find().
for node in xroot:
    record = {"name": node.attrib.get("name")}
    for field in ("email", "grade", "age"):
        # find() returns None when the sub-element is absent; guard on that
        # result so a missing field becomes None instead of raising.
        child = node.find(field)
        record[field] = child.text if child is not None else None
    rows.append(record)

out_df = pd.DataFrame(rows, columns=df_cols)
out_df

Unnamed: 0,name,email,grade,age
0,John,john@mail.com,A,16
1,Alice,alice@mail.com,B,17
2,Bob,bob@mail.com,C,16
3,Hannah,hannah@mail.com,A,17


# String to XML object
 lxml parses strings with the fromstring() function, which is the string counterpart of parse(), the function used to parse files, as shown in the example below.

In [11]:
# The same document as a single string, split across adjacent literals so
# each element sits on its own source line.
xml = (
    '<root xmlns:a="xmlns1" xmlns:b="xmlns2">'
    '<tag xmlns:c="xmlns3" />'
    '<tag xmlns:a="xmlns1" />'
    '<tag />'
    '</root>'
)

# fromstring() parses the text and hands back the root element.
parsed = etree.fromstring(xml)
print(parsed)

<Element root at 0x1f241c09c88>


# Parsing Strings
 To work with smaller bits of XML text, especially string literals as might be embedded in the source of a program, use XML() and the string containing the XML to be parsed as the only argument.

In [12]:
from xml.etree.ElementTree import XML

# XML() parses a string and returns the root Element of the document.
parsed = XML('''
<root>
  <group>
    <child id="a">This is child "a".</child>
    <child id="b">This is child "b".</child>
  </group>
  <group>
    <child id="c">This is child "c".</child>
  </group>
</root>
''')

print('parsed =', parsed)

# Iterating an Element yields its direct children (here the <group> elements).
for elem in parsed:
    print(elem.tag)
    # Only show text/tail when they contain something other than whitespace.
    if elem.text is not None and elem.text.strip():
        print('  text: "%s"' % elem.text)
    if elem.tail is not None and elem.tail.strip():
        print('  tail: "%s"' % elem.tail)
    for name, value in sorted(elem.attrib.items()):
        print('  %-4s = "%s"' % (name, value))
    # Blank separator line. A bare `print` (Python 2 style) is just a name
    # lookup in Python 3 and prints nothing, so call the function.
    print()

parsed = <Element 'root' at 0x000001F249BD5778>
group
group


In [14]:
# Parsing XML with namespaces.
# Sample document used by the namespace lookup example. It declares a default
# namespace (http://people.example.com), which applies to the unprefixed
# <actors>, <actor> and <name> elements, and a "fictional" prefix
# (http://characters.example.com) used by the <fictional:character> elements.
xml_text = """<?xml version="1.0"?>
<actors xmlns:fictional="http://characters.example.com"
        xmlns="http://people.example.com">
    <actor>
        <name>John Cleese</name>
        <fictional:character>Lancelot</fictional:character>
        <fictional:character>Archie Leach</fictional:character>
    </actor>
    <actor>
        <name>Eric Idle</name>
        <fictional:character>Sir Robin</fictional:character>
        <fictional:character>Gunther</fictional:character>
        <fictional:character>Commander Clement</fictional:character>
    </actor>
</actors>"""

In [20]:
import xml.etree.ElementTree as ET
# Parse the namespaced sample document held in xml_text.
root = ET.fromstring(xml_text)

# Tags are matched by their fully qualified "{namespace-uri}localname" form.
# iterfind() walks the matches one at a time instead of building a list first.
for actor in root.iterfind('{http://people.example.com}actor'):
    name = actor.find('{http://people.example.com}name')
    print(name.text)
    # List every character played by this actor underneath the actor's name.
    for char in actor.iterfind('{http://characters.example.com}character'):
        print('   |->', char.text)

John Cleese
   |-> Lancelot
   |-> Archie Leach
Eric Idle
   |-> Sir Robin
   |-> Gunther
   |-> Commander Clement


In [16]:
#Writing XML
import xml.etree.ElementTree as gfg 
  
  
def GenerateXML(fileName, mobiles=None):
    """Write a <Catalog> XML document of mobile phones to ``fileName``.

    Parameters
    ----------
    fileName : str or path-like
        Destination file. It is opened in binary write mode, so an existing
        file at that location is overwritten.
    mobiles : iterable of (brand, price) pairs, optional
        Entries to write, in order. Each value is converted with ``str()``
        before being stored as element text. When omitted, the three sample
        phones listed below are written.

    Returns
    -------
    None
    """
    if mobiles is None:
        mobiles = (
            ("Redmi", "6999"),
            ("Samsung", "9999"),
            ("RealMe", "11999"),
        )

    root = gfg.Element("Catalog")

    # One <mobile> element per entry, each holding a <brand> and a <price>.
    for brand, price in mobiles:
        mobile = gfg.SubElement(root, "mobile")
        gfg.SubElement(mobile, "brand").text = str(brand)
        gfg.SubElement(mobile, "price").text = str(price)

    tree = gfg.ElementTree(root)

    with open(fileName, "wb") as files:
        tree.write(files)
  
# Entry point: write the sample catalog to its output file.
if __name__ == "__main__":
    GenerateXML(fileName="C:\\Users\\hcluser1\\DataXML\\Catalog.xml")

In [17]:
#Convert XML to JSON
import xml.etree.ElementTree as ET
import xmltodict
import json

# Load the document and take its root element.
tree = ET.parse('C:\\Users\\hcluser1\\DataXML\\Students.xml')
xml_data = tree.getroot()

# Serialise the root back to XML bytes, then let xmltodict turn that into a
# plain dictionary.
xmlstr = ET.tostring(xml_data, method='xml', encoding='utf8')
data_dict = dict(xmltodict.parse(xmlstr))

print(data_dict)

# Render the JSON text (indented, keys sorted) and write it out in one call.
with open('C:\\Users\\hcluser1\\DataXML\\Student_new_data.json', 'w+') as json_file:
    json_file.write(json.dumps(data_dict, indent=4, sort_keys=True))

{'data': OrderedDict([('student', [OrderedDict([('@name', 'John'), ('email', 'john@mail.com'), ('grade', 'A'), ('age', '16')]), OrderedDict([('@name', 'Alice'), ('email', 'alice@mail.com'), ('grade', 'B'), ('age', '17')]), OrderedDict([('@name', 'Bob'), ('email', 'bob@mail.com'), ('grade', 'C'), ('age', '16')]), OrderedDict([('@name', 'Hannah'), ('email', 'hannah@mail.com'), ('grade', 'A'), ('age', '17')])])])}


# Convert XML to CSV using Pandas

In [18]:
import pandas as pd 
import xml.etree.ElementTree as et 

# Parsing of "Students.xml" starts at the root of the tree, which contains
# the entire data structure.
xtree = et.parse("C:\\Users\\hcluser1\\DataXML\\Students.xml")
xroot = xtree.getroot()

df_cols = ["name", "email", "grade", "age"]
rows = []

# Each child of the root becomes one DataFrame row. The student's name is an
# attribute of the element, read through .attrib like a dictionary; email,
# grade and age are sub-elements whose text is read after find().
for node in xroot:
    record = {"name": node.attrib.get("name")}
    for field in ("email", "grade", "age"):
        # find() returns None when the sub-element is absent; guard on that
        # result so a missing field becomes None instead of raising.
        child = node.find(field)
        record[field] = child.text if child is not None else None
    rows.append(record)

out_df = pd.DataFrame(rows, columns=df_cols)

# Called without a path, to_csv() returns the CSV text as a string; as the
# last expression of the cell it is what the notebook displays.
out_df.to_csv()





Unnamed: 0,name,email,grade,age
0,John,john@mail.com,A,16
1,Alice,alice@mail.com,B,17
2,Bob,bob@mail.com,C,16
3,Hannah,hannah@mail.com,A,17


# Reading XML with another library: BeautifulSoup

In [19]:
from bs4 import BeautifulSoup
# Open and read the XML file. The with-block closes the file handle as soon
# as the contents have been read, instead of leaving it open.
with open("C:\\Users\\hcluser1\\DataXML\\plants.xml", "r") as file:
    contents = file.read()

# Create the BeautifulSoup object using the 'lxml' parser.
soup = BeautifulSoup(contents, 'lxml')

# Extract the contents of the common, botanical and price tags.
plant_name = soup.find_all('common')  # names of the plants
scientific_name = soup.find_all('botanical')  # scientific names of the plants
price = soup.find_all('price')  # prices of the plants

# enumerate() supplies the position used to pick the matching entries from
# the other two lists.
for n, title in enumerate(plant_name):
    print("Plant Name:", title.text)
    print("Botanical Name: ", scientific_name[n].text)
    print("Price: ", price[n].text)
    print()

Plant Name: 
     Bloodroot
    
Botanical Name:  
     Sanguinaria canadensis
    
Price:  
     $2.44
    

Plant Name: 
     Marsh Marigold
    
Botanical Name:  
     Caltha palustris
    
Price:  
     $6.81
    

Plant Name: 
     Cowslip
    
Botanical Name:  
     Caltha palustris
    
Price:  
     $9.90
    

