In [1]:
import os
import wget
import time
import requests
import traceback
import pandas as pd
import zipfile as zp
import logging as lg
import xml.etree.ElementTree as et

In [2]:
def get_data_file(url=None, timeout=60):
    
    """
    Method to retrieve the parent XML data file and create a parse tree.

    The response body is saved verbatim to "data.xml" in the current
    working directory and then parsed from that file.

    params:
        url: optional <'str'>; address of the XML index to download.
             Defaults to the ESMA FIRDS query used by this notebook.
        timeout: seconds to wait for the server before giving up.

    returns:
        the parsed <'xml.etree.ElementTree.ElementTree'>, or None when the
        download or the parsing failed (the failure is logged and printed).
    """
    
    if url is None:
        url = "https://registers.esma.europa.eu/solr/esma_registers_firds_files/select?q=*&fq=publication_date:%5B2021-01-17T00:00:00Z+TO+2021-01-19T23:59:59Z%5D&wt=xml"
    try:
        lg.info('Connecting to urllib3...')
        # A timeout keeps a stalled connection from hanging the notebook forever.
        r = requests.get(url, timeout=timeout)
        # Fail here on 4xx/5xx instead of trying to parse an HTML error page.
        r.raise_for_status()
        lg.info('Writing data to "data.xml"')
        # Write the raw bytes: re-encoding r.text with the platform default
        # encoding could contradict the encoding declared in the XML prolog.
        with open('data.xml', 'wb') as f:
            f.write(r.content)
        lg.info('Parsing into XML tree...')
        data = et.parse('data.xml')
        return data
    except Exception:
        # "Exception" (not a bare except) so Ctrl-C / kernel interrupt still works.
        message = "Failed to parse xml from response (%s)" % traceback.format_exc()
        lg.error(message)
        print(message)
        return None

In [3]:
def get_xml_file():
    
    """
    Method to extract the XML file in the specified zip folder.

    Downloads the zip archive referenced by the index returned from
    get_data_file(), extracts all of its members into "data/" and returns
    the path of the first member.

    returns:
        "data/<first member name>" as a <'str'>, or None when any step
        failed (the error is logged).
    """
    
    try:
        doc = get_data_file()
        if doc is None:
            lg.error('No index document available; cannot locate the zip file')
            return None
        root = doc.getroot()
        result = root.find("result")
        # Positional lookup: first entry under <result>, second field of it.
        # NOTE(review): presumably that field is the download link - confirm
        # against the index schema.
        elem = list(result)[0]
        url = list(elem)[1].text
        lg.info('Downloading the file...')
        archive_name = wget.download(url)
        lg.info('File downloaded: ' + archive_name)
        # The context manager guarantees the archive handle is closed even
        # when extraction fails.
        with zp.ZipFile(archive_name) as archive:
            filename = archive.namelist()[0]
            lg.info('Found file: ' + filename)
            archive.extractall('data/')
        filepath = "data/"+filename
        lg.info('Exctracted contents into: ' + filepath)
        return filepath
    except Exception as e:
        lg.error(e)
        return None

In [4]:
def get_file_size_in_mb(file_path):
    """
    Method to report the size of a file as a formatted string in MB.
    params: 
        file_path: takes the path of the XML file which is a <'str'>
    returns:
        a <'str'> such as '1.50 MB'; '0.00 MB' when the path is not an
        existing file; None when an unexpected error was logged.
    """
    try:
        lg.info('Obtaining file size...')
        if not os.path.isfile(file_path):
            lg.error('file not found with the path ' + file_path)
            size_mb = 0
        else:
            lg.info('file found with the path ' + file_path)
            size_in_bytes = os.path.getsize(file_path)
            # bytes -> MB, rounded to two decimals
            size_mb = round(size_in_bytes / (1024 * 1024.0), 2)
        return f'{size_mb:,.2f} MB'
    except Exception as err:
        lg.error(err)

In [5]:
def get_xml_info(file_path):
    
    """
    Method to parse the XML file and log summary information about it:
    the file size, the total number of tags and the number of
    distinct tags.
    
    params: 
        file_path: takes the path of the XML file which is a <'str'>
    returns:
        the parsed tree, or None when an error was logged.
    """
    
    try:
        lg.info("Getting XML file Info...")
        tree = et.parse(file_path)

        # One entry per element in document order, then de-duplicated.
        all_tags = [node.tag for node in tree.iter()]
        unique_tags = set(all_tags)

        lg.info("Total Size of the file in MB: " + get_file_size_in_mb(file_path))
        lg.info("Total no.of tags found in the file: " + str(len(all_tags)))
        lg.info("Total no.of unique tags found in the file: " + str(len(unique_tags)))
        lg.info("Finished getting info")
        return tree
    except Exception as err:
        lg.error(err)

In [6]:
def clean_xml_file(doc):
    
    """
    Method to clean the file and remove noisy data
    and present a clean simple tag structure.

    Strips the "{namespace}" prefix from every element tag (the tree is
    modified in place), writes the result to "output.xml" and returns a
    fresh parse of that file. Tags without a namespace are left as they are.
    
    params: 
        doc: takes the eTree object of the XML file of the type <'xml.etree.ElementTree.ElementTree'>
    returns:
        the re-parsed, namespace-free tree, or None when an error was logged.
    """
    
    try:
        lg.info('Cleaning the file...')
        for elem in doc.iter():
            tag = elem.tag
            # Only rewrite "{uri}local" style tags; un-namespaced tags (and
            # non-string tags such as comments) previously raised and
            # aborted the whole clean-up.
            if isinstance(tag, str) and '}' in tag:
                elem.tag = tag.rsplit('}', 1)[-1]
        lg.info('Finished cleaning')
        lg.info('Writing changes into "output.xml"')
        doc.write("output.xml")
        lg.info('Cleaned the file and stored as "output.xml"')
        doc = et.parse("output.xml")
        return doc
    except Exception as e:
        lg.error(e)
        return None

In [7]:
def extract_data(root):
    
    """
    Method to extract the relevant data from the XML file
    and it creates a CSV file out of the extracted data.

    Navigation is positional: first child of <Pyld>, then its first child;
    every child of that element except the first one is treated as an
    instrument, and the first child of each instrument as its record.
    For every record one row is produced with the columns
    Id, FullNm, ClssfctnTp, CmmdtyDerivInd, NtnlCcy (children of
    <FinInstrmGnlAttrbts>) and Issr. A field that is absent yields an
    empty cell instead of aborting the whole extraction.
    
    params: 
        root: takes the eTree element object of the XML file of the type <'xml.etree.ElementTree.Element'>
    returns:
        the <'pandas.DataFrame'> that was written to "dataset.csv",
        or None when an error was logged.
    """
    
    figa_fields = ('Id', 'FullNm', 'ClssfctnTp', 'CmmdtyDerivInd', 'NtnlCcy')
    columns = list(figa_fields) + ['Issr']

    try:
        lg.info('Extracting the data...')
        pyld = root.find('Pyld')
        document = list(pyld)[0]
        proot = list(document)[0]
        # Skip the first child - presumably the report header; verify
        # against the report schema.
        instruments = list(proot)[1:]
        rows = []
        for fin in instruments:
            children = list(fin)
            if not children:
                lg.warning('Skipping instrument element without a record child')
                continue
            record = children[0]
            figa = record.find('FinInstrmGnlAttrbts')
            # Each field is read exactly once per record; a missing element
            # gives None rather than leaking the previous record's value.
            row = {
                field: (figa.findtext(field) if figa is not None else None)
                for field in figa_fields
            }
            row['Issr'] = record.findtext('Issr')
            rows.append(row)
        lg.info("Creating Pandas Dataframe...")
        # Explicit columns keep the header stable even when no rows were found.
        df = pd.DataFrame(rows, columns=columns)
        df.to_csv('dataset.csv', index=False)
        lg.info('Created dataframe and stored into "dataset.csv"')
        lg.info('End of program')
        return df
        
    except Exception as e:
        lg.error(e)
        return None

In [8]:
if __name__ == "__main__":
    # Driver: download -> inspect -> strip namespaces -> export CSV, with
    # all log records appended to "logs.log".
    lg.basicConfig(filename='logs.log',
                    filemode='a',
                    format='%(asctime)s,%(name)s %(levelname)s %(message)s',
                    datefmt='%H:%M:%S',
                    level=lg.DEBUG)
    start_time = time.perf_counter()
    # Every stage logs its own error and returns None on failure, so stop at
    # the first None instead of feeding it to the next stage (which would
    # only add misleading follow-up errors and finally crash on .getroot()).
    filepath = get_xml_file()
    doc = get_xml_info(filepath) if filepath is not None else None
    clean_doc = clean_xml_file(doc) if doc is not None else None
    if clean_doc is not None:
        extract_data(clean_doc.getroot())
    else:
        lg.error('XML ops - aborted: an earlier stage failed, no CSV was produced')
        print('XML ops - aborted: an earlier stage failed, see "logs.log"')
    end_time = time.perf_counter()
    total_time = round(end_time - start_time, 2)
    lg.info(f'XML ops - Total time taken:[{total_time}s]')
    print("")
    print(f'XML ops - Total time taken:[{total_time}s]')

100% [..........................................................................] 4434514 / 4434514
XML ops - Total time taken:[29.66s]
