## All necessary libraries

- **os :**  for creating and removing a directory (folder), fetching its contents, changing and identifying the current directory, etc.
- **re :** Regular Expression for pattern matching, it helps during parsing and extracting desirable pattern.
- **boto :** Amazon Web Services (AWS) SDK for Python.
- **boto3 :** it makes easy to integrate your Python application, library, or script with AWS services including Amazon S3.
- **request :** allows you to send HTTP requests using Python.
- **pandas :** responsible for data frame also It provides ready to use high-performance data structures and data analysis tools.
- **io/StringIO :** provides Python's main facilities for dealing with various types of I/O./ the StringIO module is an in-memory file-like object.
- **xml.etree.ElementTree :** allows you to parse and navigate an XML document.
- **BeautifulSoup :** used for web scraping purposes to pull the data out of HTML and XML files.

In [5]:
import os
import re
import boto
import boto3
import requests
import pandas as pd
from io import StringIO
import boto.s3.connection
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup as bs

# Step - 1
## Download the xml from URL

In [2]:
target_url = 'https://registers.esma.europa.eu/solr/esma_registers_firds_files/select?q=*&fq=publication_date:%5B2021-01-17T00:00:00Z+TO+2021-01-19T23:59:59Z%5D&wt=xml&indent=true&start=0&rows=100'

In [4]:
xml_url = requests.get(target_url)

In [5]:
#xml.content

# Step -2
## From the xml, parsing through to the download link whose file_type is DLTINS and download the zip

In [6]:
# extracting links from UIRL
url_containt = bs(xml_url.content, 'lxml')
for line in url_containt:
    regex = r'(https?://\S+)'
    output = re.findall(regex, str(line))
    for url in output:
        url = url.split('<')[0]
        print(url)

http://firds.esma.europa.eu/firds/DLTINS_20210117_01of01.zip
http://firds.esma.europa.eu/firds/DLTINS_20210119_01of02.zip
http://firds.esma.europa.eu/firds/DLTINS_20210119_02of02.zip
http://firds.esma.europa.eu/firds/DLTINS_20210118_01of01.zip


# Step - 3
## Extract the xml from the zip.

In [3]:
import zipfile

In [4]:
with zipfile.ZipFile('C:/Users/Asus/Desktop/DLTINS_20210117_01of01.zip', 'r') as zip_ref:
    zip_ref.extractall('C:/Users/Asus/Downloads/')

In [5]:
zip_ref

<zipfile.ZipFile [closed]>

In [11]:
# with open('C:/Users/Asus/Desktop/DLTINS_20210117_01of01.xml', 'r', encoding='utf8') as f:
#     data = f.read()

In [13]:
# load and parse the file
xmlTree = ET.parse('C:/Users/Asus/Desktop/DLTINS_20210117_01of01.xml')

elemList = []

for elem in xmlTree.iter():
    elemList.append(elem.tag)

# now I remove duplicities - by convertion to set and back to list
elemList = list(set(elemList))

# Just printing out the result
print(elemList)

['{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}Val', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}Prcs', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}FinInstrmRptgRefDataDltaRpt', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}Pdg', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}IntrstRate', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}Pulp', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}Nrgy', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}FnlPricTp', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}Fltg', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}NewRcrd', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}FrstTradDt', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}Id', '{urn:iso:std:iso:20022:tech:xsd:head.003.001.01}Hdr', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}BasePdct', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}MntryVal', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02}OptnExrcStyle', '{urn:iso:std:iso:20022:tech:xsd:auth.036.001.02

In [9]:
with open("C:/Users/Asus/Desktop/DLTINS_20210117_01of01.xml", "r", encoding="utf-8") as file:
    # Read each line in the file, readlines() returns a list of lines
    content = file.read()
    # Combine the lines in the list into a string
    
    #content = "".join(content)
bs_content = bs(content, "xml")

In [11]:
result = bs_content.find_all("FinInstrm")

# Step - 4
## Convert the contents of the xml into a CSV with the following header:
- FinInstrmGnlAttrbts.Id
- FinInstrmGnlAttrbts.FullNm
- FinInstrmGnlAttrbts.ClssfctnTp
- FinInstrmGnlAttrbts.CmmdtyDerivInd
- FinInstrmGnlAttrbts.NtnlCcy
- Issr

In [10]:
cols = ['FinInstrmGnlAttrbts.Id', 'FinInstrmGnlAttrbts.FullNm',
        'FinInstrmGnlAttrbts.ClssfctnTp', 'FinInstrmGnlAttrbts.CmmdtyDerivInd', 
       'FinInstrmGnlAttrbts.NtnlCcy', 'Issr']
rows = []

In [12]:
for data in result:
    FinInstrmGnlAttrbts = data.find('FinInstrmGnlAttrbts')
    Id = FinInstrmGnlAttrbts.find('Id').text
    full_name = FinInstrmGnlAttrbts.find('FullNm').text
    ClssfctnTp = FinInstrmGnlAttrbts.find('ClssfctnTp').text
    CmmdtyDerivInd = FinInstrmGnlAttrbts.find('CmmdtyDerivInd').text
    NtnlCcy = FinInstrmGnlAttrbts.find('NtnlCcy').text
    Issr = data.find('Issr').text
    rows.append({'FinInstrmGnlAttrbts.Id': Id,
                'FinInstrmGnlAttrbts.FullNm': full_name,
                'FinInstrmGnlAttrbts.ClssfctnTp': ClssfctnTp,
                'FinInstrmGnlAttrbts.CmmdtyDerivInd': CmmdtyDerivInd,
                'FinInstrmGnlAttrbts.NtnlCcy': NtnlCcy,
                'Issr': Issr})

In [14]:
df = pd.DataFrame(rows, columns=cols)

In [16]:
df.head()

Unnamed: 0,FinInstrmGnlAttrbts.Id,FinInstrmGnlAttrbts.FullNm,FinInstrmGnlAttrbts.ClssfctnTp,FinInstrmGnlAttrbts.CmmdtyDerivInd,FinInstrmGnlAttrbts.NtnlCcy,Issr
0,DE000A1R07V3,Kreditanst.f.Wiederaufbau Anl.v.2014 (2021),DBFTFB,False,EUR,549300GDPG70E3MBBU98
1,DE000A1R07V3,KFW 1 5/8 01/15/21,DBFTFB,False,EUR,549300GDPG70E3MBBU98
2,DE000A1R07V3,Kreditanst.f.Wiederaufbau Anl.v.2014 (2021),DBFTFB,False,EUR,549300GDPG70E3MBBU98
3,DE000A1R07V3,Kreditanst.f.Wiederaufbau Anl.v.2014 (2021),DBFTFB,False,EUR,549300GDPG70E3MBBU98
4,DE000A1X3J56,IKB Deutsche Industriebank AG Stufenz.MTN-IHS ...,DTVUFB,False,EUR,PWEFG14QWWESISQ84C69


In [19]:
df.to_csv('xml_extract.csv')

In [18]:
os.getcwd()

'C:\\Users\\Asus\\Desktop\\B.tech\\upGrad\\python with AIML\\NLP'

# Step - 5
## Process of storing the csv from step 4) in an AWS S3 bucket

- Due to some technical issue with my aws account i was not able to acess so that here I am giving the path of process

In [2]:
access_key = ""
secret_key = ""

In [15]:
connection = boto.connect_s3(aws_access_key_id=" ",
                            aws_secret_access_key=" ")

{urn:iso:std:iso:20022:tech:xsd:head.003.001.01}Hdr {}
{urn:iso:std:iso:20022:tech:xsd:head.003.001.01}Pyld {}


In [None]:
# Get list of existing buckets
for bucket in connection.get_all_buckets():
    print("{name}\t{created}".format(
    name = bucket.anem,
    time = bucket.creation_date,
    ))

In [None]:
# Creating New bucket
bucket_name = connection.create_bucket("XYZ")

### upload csv file to s3 bucket

In [8]:
csv_file = pd.read_csv("xml_extract.csv")

In [9]:
s3 = boto3.client('s3',
                 aws_access_key_id = " ",
                 aws_secret_access_key = " ")

In [None]:
csv_buf = StringIO()

In [None]:
csv_file.to_csv(csv_buf, header=True, index=False)