## Import Libraries

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd
from collections import OrderedDict

## Read Config File

In [None]:
import configparser
# Load the input/output paths from config.ini in the working directory.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed; fail here with a clear message rather than with
# a KeyError on the option lookups below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini could not be read from the working directory")
# Both options are read from the DEFAULT section.
input_file = config['DEFAULT']['Input-File']
output_file = config['DEFAULT']['Output-File']

## Read XML

In [None]:
# Parse the XML document named in the configuration and keep a handle on its
# top-level element; all later lookups start from `root`.
tree = ET.parse(source=input_file)
root = tree.getroot()

## Get the Number of Records

In [None]:
# Number of <record> elements across every <records> container under the root.
# Counting them all at once keeps `l` in step with the rows the extraction loop
# visits, and leaves it defined (as 0) when the document has no <records>.
l = len(root.findall("records/record"))

## Initialize Lists

In [None]:
# One independent, pre-sized list per output column (17 in total). Every slot
# starts as None and is filled by record index further down.
(
    database_list,
    subsidiary_authors_list,
    title_list,
    keyword_list,
    publisher_list,
    access_date_list,
    related_url_list,
    pub_location_list,
    pub_dates_list,
    abstract_list,
    notes_list,
    doi_list,
    custom1_list,
    custom2_list,
    custom3_list,
    custom4_list,
    custom5_list,
) = ([None] * l for _ in range(17))

## Get Value From XML

In [None]:
def _last_style_text(record, path):
    """Return the <style> text of the last element matching *path* in *record*.

    Returns None when nothing matches *path* (the column slot keeps its empty
    placeholder) and '' when the matched element has no <style> child.
    """
    matches = record.findall(path)
    if not matches:
        return None
    return getattr(matches[-1].find("style"), 'text', '')


def _joined_style_texts(record, path, separator=", "):
    """Join the non-empty <style> texts of every element matching *path*.

    Returns None when nothing matches *path*.
    """
    matches = record.findall(path)
    if not matches:
        return None
    texts = (getattr(match.find("style"), 'text', '') for match in matches)
    # An empty <style/> has text None; skipping falsy values avoids the
    # TypeError that concatenating str and None would raise.
    return separator.join(text for text in texts if text)


# Columns that take the <style> text of one element, keyed by the element's
# path relative to <record>. When a path matches several elements (e.g. more
# than one author) the last one wins.
_SINGLE_VALUE_COLUMNS = (
    ("contributors/subsidiary-authors/author", subsidiary_authors_list),
    ("titles/title", title_list),
    ("publisher", publisher_list),
    ("access-date", access_date_list),
    ("pub-location", pub_location_list),
    ("dates/pub-dates/date", pub_dates_list),
    ("abstract", abstract_list),
    ("notes", notes_list),
    ("electronic-resource-num", doi_list),  # DOI
    ("custom1", custom1_list),  # benefit
    ("custom2", custom2_list),  # method
    ("custom3", custom3_list),  # source type
    ("custom4", custom4_list),  # coverage
    ("custom5", custom5_list),  # Yes Maybe No
)

# Fill row i of every column from the i-th <record> in document order.
for i, r in enumerate(root.findall("records/record")):
    # <database> carries its value directly as text, not inside a <style> child.
    database_list[i] = getattr(r.find("database"), 'text', '')

    for path, column in _SINGLE_VALUE_COLUMNS:
        column[i] = _last_style_text(r, path)

    # Multi-valued fields are flattened into one comma-separated string.
    keyword_list[i] = _joined_style_texts(r, "keywords/keyword")
    # NOTE(review): assumes the usual EndNote layout
    # <urls><related-urls><url><style>...</style></url></related-urls></urls>;
    # this column was previously never populated.
    related_url_list[i] = _joined_style_texts(r, "urls/related-urls/url")

## Supporting Functions

In [None]:
def replace_special_character(column_list, replace_string):
    """Replace carriage returns in every string of *column_list*, in place.

    Each entry that is not None has its '\\r' characters replaced by
    *replace_string*; None entries are left untouched. The same list object
    is returned.
    """
    for position, value in enumerate(column_list):
        if value is None:
            continue
        column_list[position] = value.replace('\r', replace_string)
    return column_list

## Replace Special Characters

In [None]:
# Free-text columns: each carriage return becomes a single space.
title_list, abstract_list, notes_list = (
    replace_special_character(column, ' ')
    for column in (title_list, abstract_list, notes_list)
)
# custom1-custom3: each carriage return becomes a comma-space separator.
custom1_list, custom2_list, custom3_list = (
    replace_special_character(column, ', ')
    for column in (custom1_list, custom2_list, custom3_list)
)
# custom4: carriage returns are removed outright.
custom4_list = replace_special_character(custom4_list, '')

## Write Into CSV

In [None]:
# Map each CSV column header to its list of per-record values; the entry order
# here is the column order of the output.
d = {
    'database': database_list,
    'title': title_list,
    'subsidiary_authors': subsidiary_authors_list,
    'keywords': keyword_list,
    'abstract': abstract_list,
    'notes': notes_list,
    'DOI': doi_list,
    'publisher': publisher_list,
    'access_date': access_date_list,
    'related_url': related_url_list,
    'pub_location': pub_location_list,
    'pub_dates': pub_dates_list,
    '1.benefit': custom1_list,
    '2.method': custom2_list,
    '3.source type': custom3_list,
    '4.coverage': custom4_list,
    '5.yes maybe no': custom5_list,
}

In [None]:
# Build the table from the column mapping; wrapping it in an OrderedDict keeps
# the columns in the order they were listed in `d`.
df = pd.DataFrame(data=OrderedDict(d))

In [None]:
# Write the table to the configured output path as UTF-8, without the
# DataFrame's row index column.
df.to_csv(path_or_buf=output_file, index=False, encoding='UTF-8')