# Exercise - Data Exchange Formats with Python

In the following three exercises, you are asked to write Python code for reading
data from XML, JSON and RDF files and for querying the data using the XPath
and SPARQL query languages. Each subsection is dedicated to one of the three
data exchange formats. The tasks are rather basic and the goal is to refresh
your knowledge in Python in general and in particular in parsing those formats.

## 1 XML

This subsection is dedicated to the XML format. In particular, you are asked
to perform XPath queries on the Mondial dataset. This dataset includes world
geographic information integrated from the CIA World Factbook, the International
Atlas and the TERRA database, to name just the pre-dominant sources.

Please inspect the documents in '/input' manually (using a text editor) in order to explore
the structure.

You can also have a look at the [w3school XPath tutorial 2](https://www.w3schools.com/xml/xpath_intro.asp) to solve the following tasks.

In [12]:
"""
XML Tree Visualization Examples
Demonstrates various ways to visualize XML as a tree structure in Python
"""

import xml.etree.ElementTree as ET
from typing import Optional, List, Dict, Any
import pandas as pd
from pathlib import Path


def print_ascii_tree(element: ET.Element, prefix: str = "", is_last: bool = True):
    """Print XML tree as ASCII art"""
    # Print current element
    connector = "└── " if is_last else "├── "
    print(f"{prefix}{connector}{element.tag}")

    # Prepare prefix for children
    child_prefix = prefix + ("    " if is_last else "│   ")

    # Get children
    children = list(element)

    # Print attributes if any
    if element.attrib:
        for i, (key, value) in enumerate(element.attrib.items()):
            attr_connector = "└── " if i == len(
                element.attrib) - 1 and not children else "├── "
            print(f"{child_prefix}{attr_connector}@{key}: {value}")

    # Print text content if any
    if element.text and element.text.strip():
        text_connector = "└── " if not children else "├── "
        print(f"{child_prefix}{text_connector}text: {repr(element.text.strip())}")

    # Print children recursively
    for i, child in enumerate(children):
        is_last_child = (i == len(children) - 1)
        print_ascii_tree(child, child_prefix, is_last_child)


def get_tree_stats(element: ET.Element) -> Dict[str, Any]:
    """Get statistics about the XML tree"""
    def traverse(elem: ET.Element, depth: int = 0) -> Dict[str, Any]:
        stats = {
            'max_depth': depth,
            'total_elements': 1,
            'element_types': {elem.tag: 1},
            'attributes_count': len(elem.attrib),
            'text_nodes': 1 if elem.text and elem.text.strip() else 0
        }

        for child in elem:
            child_stats = traverse(child, depth + 1)
            stats['max_depth'] = max(
                stats['max_depth'], child_stats['max_depth'])
            stats['total_elements'] += child_stats['total_elements']
            stats['attributes_count'] += child_stats['attributes_count']
            stats['text_nodes'] += child_stats['text_nodes']

            # Merge element type counts
            for tag, count in child_stats['element_types'].items():
                stats['element_types'][tag] = stats['element_types'].get(
                    tag, 0) + count

        return stats

    return traverse(element)


def visualize_with_networkx(element: ET.Element, filename: str = "xml_tree.png"):
    """Create a graphical tree visualization using NetworkX and matplotlib"""
    try:
        import networkx as nx
        import matplotlib.pyplot as plt
    except ImportError:
        print("NetworkX and matplotlib required for graphical visualization")
        print("Install with: pip install networkx matplotlib")
        return

    def build_graph(elem: ET.Element, graph, parent_id=None, node_id=0):
        current_id = node_id
        label = elem.tag

        # Add attributes to label if present
        if elem.attrib:
            attrs = ", ".join(f"{k}={v}" for k, v in elem.attrib.items())
            label += f"\n({attrs})"

        # Add text content if present
        if elem.text and elem.text.strip():
            label += f"\n\"{elem.text.strip()}\""

        graph.add_node(current_id, label=label)

        if parent_id is not None:
            graph.add_edge(parent_id, current_id)

        next_id = current_id + 1
        for child in elem:
            next_id = build_graph(child, graph, current_id, next_id)

        return next_id

    # Create graph
    G = nx.DiGraph()
    build_graph(element, G)

    # Draw the tree
    plt.figure(figsize=(12, 8))
    pos = nx.spring_layout(G, k=2, iterations=50)

    # Draw nodes
    nx.draw(G, pos, with_labels=True, labels=nx.get_node_attributes(G, 'label'),
            node_color='lightblue', node_size=3000, font_size=8,
            font_weight='bold', edge_color='gray', arrows=True)

    plt.title("XML Tree Structure")
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(filename, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Tree visualization saved as {filename}")


def visualize_with_graphviz(element: ET.Element, filename: str = "xml_tree"):
    """Create a tree visualization using GraphViz"""
    try:
        from graphviz import Digraph
    except ImportError:
        print("GraphViz required for visualization")
        print("Install with: pip install graphviz")
        print("Also install GraphViz system package: https://graphviz.org/download/")
        return

    def build_dot_graph(elem: ET.Element, dot, parent_id=None, node_id=0):
        current_id = f"node_{node_id}"
        label = elem.tag

        # Add attributes to label if present
        if elem.attrib:
            attrs = "\\n".join(f"{k}={v}" for k, v in elem.attrib.items())
            label += f"\\n{attrs}"

        # Add text content if present
        if elem.text and elem.text.strip():
            label += f"\\n{elem.text.strip()}"

        dot.node(current_id, label=label)

        if parent_id is not None:
            dot.edge(parent_id, current_id)

        next_id = node_id + 1
        for child in elem:
            next_id = build_dot_graph(child, dot, current_id, next_id)

        return next_id

    # Create graph
    dot = Digraph(comment='XML Tree Structure')
    dot.attr(rankdir='TB')  # Top to bottom layout

    build_dot_graph(element, dot)

    # Save and render
    dot.render(filename, view=True, format='png')
    print(f"Tree visualization saved as {filename}.png")


def create_tree_dataframe(element: ET.Element) -> pd.DataFrame:
    """Convert XML tree to a pandas DataFrame for analysis"""
    def traverse_tree(elem: ET.Element, path: str = "", depth: int = 0) -> List[Dict]:
        rows = []

        # Current element
        row = {
            'path': path + elem.tag,
            'tag': elem.tag,
            'depth': depth,
            'has_children': len(list(elem)) > 0,
            'attributes': str(elem.attrib) if elem.attrib else None,
            'text_content': elem.text.strip() if elem.text and elem.text.strip() else None,
            'parent': path.rstrip('/') if path else None
        }
        rows.append(row)

        # Children
        for child in elem:
            child_path = f"{path}{elem.tag}/"
            rows.extend(traverse_tree(child, child_path, depth + 1))

        return rows

    rows = traverse_tree(element)
    return pd.DataFrame(rows)



"""Demonstrate XML tree visualization with the Mondial dataset"""
    # Create a sample XML for demonstration
sample_xml = """<?xml version="1.0" encoding="UTF-8"?>
<bookstore>
<book category="fiction">
    <title>Harry Potter</title>
    <author>J.K. Rowling</author>
</book>
<book category="non-fiction">
    <title>XML Guide</title>
    <author>John Doe</author>
</book>
</bookstore>"""
root = ET.fromstring(sample_xml)
print("Using sample XML for demonstration:")
print(sample_xml)

print("\n" + "="*60)
print("1. ASCII TREE VISUALIZATION")
print("="*60)
print_ascii_tree(root)

print("\n" + "="*60)
print("2. TREE STATISTICS")
print("="*60)
stats = get_tree_stats(root)
for key, value in stats.items():
    print(f"{key}: {value}")

print("\n" + "="*60)
print("3. TREE AS DATAFRAME")
print("="*60)
df = create_tree_dataframe(root)
print(df.head(10))


Using sample XML for demonstration:
<?xml version="1.0" encoding="UTF-8"?>
<bookstore>
<book category="fiction">
    <title>Harry Potter</title>
    <author>J.K. Rowling</author>
</book>
<book category="non-fiction">
    <title>XML Guide</title>
    <author>John Doe</author>
</book>
</bookstore>

1. ASCII TREE VISUALIZATION
└── bookstore
    ├── book
    │   ├── @category: fiction
    │   ├── title
    │   │   └── text: 'Harry Potter'
    │   └── author
    │       └── text: 'J.K. Rowling'
    └── book
        ├── @category: non-fiction
        ├── title
        │   └── text: 'XML Guide'
        └── author
            └── text: 'John Doe'

2. TREE STATISTICS
max_depth: 2
total_elements: 7
element_types: {'bookstore': 1, 'book': 2, 'title': 2, 'author': 2}
attributes_count: 2
text_nodes: 4

3. TREE AS DATAFRAME
                    path        tag  depth  has_children  \
0              bookstore  bookstore      0          True   
1         bookstore/book       book      1          True  

### 1.1 Load the dataset and inspect the schema.

We use the [pandas](https://pandas.pydata.org/) library to load and process XML files in Python.

Pandas offers the function [read_xml](https://pandas.pydata.org/docs/reference/api/pandas.read_xml.html) to read XML documents into a pandas DataFrame object.

In this first task, load the dataset from 'input/mondial-3.0.xml' and print the names of the nodes below the root node.

In [2]:
import pandas as pd

# Load the file and return the columns. The columns of the dataframe represent nodes of the input XML.
df_nodes = pd.read_xml("../Task/input/mondial-3.0.xml", xpath = "/*")
df_nodes.columns

Index(['continent', 'country', 'organization', 'mountain', 'desert', 'island',
       'river', 'sea', 'lake'],
      dtype='object')

### 1.1a Basic Element Selection

Let's start with the basics of XPath. First, select all continent elements from the XML document and print how many continents there are.

Hint: The `//` selector finds all elements of a specific type anywhere in the document, regardless of their position in the hierarchy.

In [3]:
# Select all continent elements and count them
df_continents = pd.read_xml("../Task/input/mondial-3.0.xml", xpath="//continent")
print(f"Number of continents: {len(df_continents)}")
df_continents.head()

Number of continents: 5


Unnamed: 0,id,name
0,f0_119,Europe
1,f0_123,Asia
2,f0_126,America
3,f0_129,Australia/Oceania
4,f0_132,Africa


### 1.1b Attribute Selection

Now, let's learn how to select attributes. Get the names of all continents using attribute selection.

Hint: The `@` symbol is used to select attributes in XPath. You can access attributes directly or select elements and then access their attributes using pandas column syntax.

In [14]:
from lxml import etree
import pandas as pd

tree = etree.parse('../Task/input/mondial-3.0.xml')

# Get the names of all continents
print(tree.xpath('//continent/@name'))

['Europe', 'Asia', 'America', 'Australia/Oceania', 'Africa']


### 1.1c Filtering by Attribute Value

Let's practice filtering elements by their attribute values. Find the continent element that has the name 'Europe'.

Hint: Square brackets `[]` are used for filtering in XPath. Inside the brackets, you can specify conditions using attributes and comparison operators.

In [None]:
# Find the continent with name 'Europe'
df_europe_continent = pd.read_xml("../Task/input/mondial-3.0.xml", xpath="//continent[@name='Europe']")
print("Europe continent details:")
df_europe_continent

Europe continent details:


Unnamed: 0,id,name
0,f0_119,Europe


### 1.1d Positional Selection

XPath allows you to select elements by their position. Get the name of the first country in the dataset.

Hint: Position indexing in XPath uses square brackets with numbers. XPath positions start counting from 1, not 0.

In [None]:
# Get the name of the first country
df_first_country = pd.read_xml("../Task/input/mondial-3.0.xml", xpath="//country[1]")
print(f"First country name: {df_first_country['name'].iloc[0]}")
df_first_country[['name', 'population', 'capital']]

First country name: 
       Albania
     


Unnamed: 0,name,population,capital
0,\n Albania\n,3249136,f0_1461


### 1.1e Countries in Europe (Complex XPath)

Select all `country` elements encompassed by the continent named 'Europe' and print their names.


In [None]:
# Countries that belong to Europe
df_europe_preview = pd.read_xml("../Task/input/mondial-3.0.xml",
                                xpath="/mondial/country[encompassed/@continent=/mondial/continent[@name='Europe']/@id]")
df_europe_preview['name']


### 1.1f Conditional Selection (and/or)

Select all countries that (are in Europe and have a population greater than 20,000,000) or have a total area greater than 1,000,000, and print their names.


In [None]:
# Countries in Europe with population > 20,000,000 OR total_area > 1,000,000
df_conditional = pd.read_xml("../Task/input/mondial-3.0.xml",
                             xpath="/mondial/country[(encompassed/@continent=/mondial/continent[@name='Europe']/@id and @population > 20000000) or @total_area > 1000000]")
df_conditional['name']


Now that you've practiced the basics of XPath, let's move on to more complex queries that combine multiple concepts:

## 2 JSON

### 2.1 Load the dataset

We use the [pandas](https://pandas.pydata.org/) library to load and process JSON files in Python.

Pandas offers the function [read_json](https://pandas.pydata.org/docs/reference/api/pandas.read_json.html) to read JSON documents into a pandas DataFrame object.

Load the dataset from 'input/mondial-3.0-europe-countries.json' and calculate the total number of inhabitants of all countries in the file.

Hint:
- Double-check the format of the JSON document before loading the file.

In [None]:
# Load and inspect the dataset.
df_countries = pd.read_json('input/mondial-3.0-europe-countries.json', lines=True)
df_countries.head()

Unnamed: 0,id,total_area,infant_mortality,datacode,name,indep_date,gdp_total,population_growth,inflation,government,gdp_agri,car_code,capital,population,gdp_serv,gdp_ind
0,f0_136,28750.0,49.2,AL,Albania,28 11 1912,4100.0,1.34,16.0,emerging democracy,55.0,AL,f0_1461,3249136,,
1,f0_144,450.0,2.2,AN,Andorra,,1000.0,2.96,,parliamentary democracy that retains as i...,,AND,f0_1464,72766,,
2,f0_149,83850.0,6.2,AU,Austria,12 11 1918,152000.0,0.41,2.3,federal republic,2.0,A,f0_1467,8023244,64.0,34.0
3,f0_157,207600.0,13.4,BO,Belarus,25 08 1991,49200.0,0.2,244.0,republic,21.0,BY,f0_1474,10415973,30.0,49.0
4,f0_162,30510.0,6.4,BE,Belgium,04 10 1830,197000.0,0.33,1.6,constitutional monarchy,2.0,B,f0_1477,10170241,70.0,28.0


In [None]:
# Calculate the total population.
df_countries['population'].sum()

792002189