In [1]:
import numpy as np
import pandas as pd
import re

from bs4 import BeautifulSoup

In [119]:
# Show long cell contents (the nested author/address dicts) without pandas
# truncating them. The option is spelled with its fully qualified name:
# abbreviated keys are resolved by pattern matching and stop working as soon
# as the abbreviation matches more than one option.
pd.set_option('display.max_colwidth', 2000)

In [2]:
# Load the XML export and parse it with BeautifulSoup.
# The context manager closes the file handle as soon as the text has been
# read; a bare open() would keep it open for the lifetime of the kernel.
with open('to_be_parsed.txt') as file_object:
    to_be_parsed = file_object.read()

# NOTE: 'lxml' selects lxml's HTML parser, which is why the parsed tree is
# wrapped in <html><body>. The lookups in later cells are written against the
# tree this parser produces, so the parser choice is kept as is.
Soup = BeautifulSoup(to_be_parsed, 'lxml')

In [7]:
# Preview only the beginning of the parsed document: printing the whole tree
# floods the cell output and bloats the saved notebook.
PREVIEW_CHARS = 2000
print(Soup.prettify()[:PREVIEW_CHARS])

<html>
 <body>
  <records xmlns="http://scientific.thomsonreuters.com/schema/wok5.4/public/FullRecord">
   <rec r_id_disclaimer="ResearcherID data provided by Clarivate Analytics">
    <uid>
     WOS:000165681500009
    </uid>
    <static_data>
     <summary>
      <ewuid>
       <wuid coll_id="WOS">
       </wuid>
       <edition value="WOS.SCI">
       </edition>
      </ewuid>
      <pub_info coverdate="DEC 2000" has_abstract="Y" issue="10" pubmonth="DEC" pubtype="Journal" pubyear="2000" sortdate="2000-12-01" vol="22">
       <page begin="925" end="933" page_count="9">
        925-933
       </page>
      </pub_info>
      <titles count="6">
       <title type="source">
        ENERGY SOURCES
       </title>
       <title type="source_abbrev">
        ENERG SOURCE
       </title>
       <title type="abbrev_iso">
        Energy Sources
       </title>
       <title type="abbrev_11">
        ENERG SOURC
       </title>
       <title type="abbrev_29">
        ENERG SOURCE
       </titl

In [117]:
def _text_or_none(tag):
    """Return the text of ``tag``, or ``None`` when the lookup found no tag."""
    return None if tag is None else tag.text


def _attr_or_none(tag, attribute):
    """Return the value of ``attribute`` on ``tag``; ``None`` if either is missing."""
    return None if tag is None else tag.get(attribute)


def _texts(tags):
    """Return the text of every tag in ``tags`` as a list (possibly empty)."""
    return [tag.text for tag in tags]


def _counted(paper, tag_name):
    """Return the first ``tag_name`` element in ``paper`` that has a ``count`` attribute."""
    return paper.find(lambda tag: tag.name == tag_name and tag.has_attr('count'))


def _parse_addresses(paper):
    """Map ``'addr_no_<n>'`` to the details of every ``address_spec`` of ``paper``.

    Returns an empty dict when the record has no counted ``addresses`` element.
    """
    container = _counted(paper, 'addresses')
    if container is None:
        return {}
    addresses = {}
    for spec in container.find_all('address_spec'):
        addresses['addr_no_' + spec.get('addr_no')] = {
            'full_address': _text_or_none(spec.find('full_address')),
            'organization': _text_or_none(spec.find('organization')),
            'city': _text_or_none(spec.find('city')),
            'country': _text_or_none(spec.find('country')),
        }
    return addresses


def _parse_names(paper, addresses):
    """Map each contributor's full name to their identifiers and address data.

    ``addresses`` is the dict built by ``_parse_addresses``. A name that points
    at exactly one address gets it under ``'address'``; a name with no
    ``addr_no`` receives every address of the paper under its ``'addr_no_<n>'``
    key. Entries are keyed by full name, so two contributors sharing a full
    name within one record overwrite each other.
    """
    container = _counted(paper, 'names')
    if container is None:
        return {}
    people = {}
    for name in container.find_all('name'):
        person = {
            'daisng_id': name.get('daisng_id'),
            'role': name.get('role'),
            'seq_no': name.get('seq_no'),
            'display_name': _text_or_none(name.find('display_name')),
        }
        addr_no = name.get('addr_no')
        if addr_no is None:
            # No explicit link between this name and an address: attach all.
            person.update(addresses)
        else:
            # NOTE(review): assumes addr_no may hold several whitespace-separated
            # numbers -- confirm against the export schema. References to
            # addresses that were not parsed are skipped instead of raising
            # KeyError and aborting the whole run.
            keys = ['addr_no_' + number for number in addr_no.split()]
            linked = {key: addresses[key] for key in keys if key in addresses}
            if len(keys) == 1 and linked:
                person['address'] = linked[keys[0]]
            else:
                person.update(linked)
        people[_text_or_none(name.find('full_name'))] = person
    return people


def _parse_paper(paper):
    """Flatten one ``rec`` element into a dict with one key per output column.

    Optional elements that are absent yield ``None`` (or an empty list for
    ``language``/``keywords``) instead of raising ``AttributeError``.
    """
    # Looked up once; four attributes are read from it.
    pub_info = paper.find('pub_info')
    languages = paper.find('languages')
    language = (_texts(languages.find_all('language', type='primary'))
                if languages is not None else [])

    # An empty category list is stored as None rather than [].
    category_info = {
        'headings': _texts(paper.find_all('heading')) or None,
        'subheadings': _texts(paper.find_all('subheading')) or None,
        'subjects': _texts(paper.find_all('subject')) or None,
    }

    addresses = _parse_addresses(paper)

    return {
        'title': _text_or_none(paper.find('title', type='item')),
        'uid': _text_or_none(paper.find('uid')),
        'publish_date': _attr_or_none(pub_info, 'sortdate'),
        'vol': _attr_or_none(pub_info, 'vol'),
        'pubtype': _attr_or_none(pub_info, 'pubtype'),
        'issue': _attr_or_none(pub_info, 'issue'),
        'language': language,
        'doctype': _text_or_none(paper.find('doctype')),
        'source': _text_or_none(paper.find('title', type='source')),
        'name_info': _parse_names(paper, addresses),
        'keywords': _texts(paper.find_all('keyword')),
        'category_info': category_info,
        # The first <p> element of the record is taken as the abstract.
        'abstract': _text_or_none(paper.find('p')),
    }


# One dict per <rec> element, assembled into the frame in a single step.
paper_list = Soup.find_all('rec')
data = [_parse_paper(paper) for paper in paper_list]

publications = pd.DataFrame(data)
publications.head(2)

Unnamed: 0,abstract,category_info,doctype,issue,keywords,language,name_info,publish_date,pubtype,source,title,uid,vol
0,The Ermenek Basin in the central Taurus Region...,"{'headings': ['Science & Technology'], 'subhea...",Article,10,"[coal, geostatistics, kriging, Turkey, variogram]",[English],"{'Demirel, IH': {'daisng_id': '1674276', 'role...",2000-12-01,Journal,ENERGY SOURCES,Geostatistical reserve estimation: A case stud...,WOS:000165681500009,22
1,This research is focused on the integrated pro...,"{'headings': ['Science & Technology'], 'subhea...",Article,6,"[ozone, pulp bleaching, pretreatment, algal tr...",[English],"{'Balcioglu, I. Akmehmet': {'daisng_id': '8977...",2006-12-01,Journal,OZONE-SCIENCE & ENGINEERING,Application of ozonation and biotreatment for ...,WOS:000242899300005,28


In [120]:
# Peek at the nested category dictionaries of the first two records.
publications.loc[:, ['category_info']].head(2)

Unnamed: 0,category_info
0,"{'headings': ['Science & Technology'], 'subheadings': ['Technology'], 'subjects': ['Energy & Fuels', 'Engineering, Chemical', 'Energy & Fuels', 'Engineering']}"
1,"{'headings': ['Science & Technology'], 'subheadings': ['Technology', 'Life Sciences & Biomedicine'], 'subjects': ['Engineering, Environmental', 'Environmental Sciences', 'Engineering', 'Environmental Sciences & Ecology']}"


In [105]:
# Show only the first rows: with the column width raised to 2000 characters,
# displaying the whole frame produces an unreadably large output.
publications[['name_info']].head()

Unnamed: 0,name_info
0,"{'Demirel, IH': {'daisng_id': '1674276', 'role': 'author', 'seq_no': '1', 'display_name': 'Demirel, IH', 'addr_no_1': {'full_address': 'Hacettepe Univ, Dept Geol Engn, TR-06532 Ankara, Turkey', 'organization': 'Hacettepe Univ', 'city': 'Ankara', 'country': 'Turkey'}}, 'Sarac, C': {'daisng_id': '4486498', 'role': 'author', 'seq_no': '2', 'display_name': 'Sarac, C', 'addr_no_1': {'full_address': 'Hacettepe Univ, Dept Geol Engn, TR-06532 Ankara, Turkey', 'organization': 'Hacettepe Univ', 'city': 'Ankara', 'country': 'Turkey'}}, 'Sen, O': {'daisng_id': '450530', 'role': 'author', 'seq_no': '3', 'display_name': 'Sen, O', 'addr_no_1': {'full_address': 'Hacettepe Univ, Dept Geol Engn, TR-06532 Ankara, Turkey', 'organization': 'Hacettepe Univ', 'city': 'Ankara', 'country': 'Turkey'}}}"
1,"{'Balcioglu, I. Akmehmet': {'daisng_id': '897760', 'role': 'author', 'seq_no': '1', 'display_name': 'Balcioglu, I. Akmehmet', 'addr_no_1': {'full_address': 'Bogazici Univ, Inst Environm Sci, TR-34342 Istanbul, Turkey', 'organization': 'Bogazici Univ', 'city': 'Istanbul', 'country': 'Turkey'}, 'addr_no_2': {'full_address': 'Middle E Tech Univ, Dept Environm Engn, TR-06531 Ankara, Turkey', 'organization': 'Middle E Tech Univ', 'city': 'Ankara', 'country': 'Turkey'}, 'addr_no_3': {'full_address': 'Selcuk Univ, Fac Engn & Architecture, Dept Environm Engn, Konya, Turkey', 'organization': 'Selcuk Univ', 'city': 'Konya', 'country': 'Turkey'}}, 'Sarac, C.': {'daisng_id': '8108135', 'role': 'author', 'seq_no': '2', 'display_name': 'Sarac, C.', 'addr_no_1': {'full_address': 'Bogazici Univ, Inst Environm Sci, TR-34342 Istanbul, Turkey', 'organization': 'Bogazici Univ', 'city': 'Istanbul', 'country': 'Turkey'}, 'addr_no_2': {'full_address': 'Middle E Tech Univ, Dept Environm Engn, TR-06531 Ankara, Turkey', 'organization': 'Middle E Tech Univ', 'city': 'Ankara', 'country': 'Turkey'}, 'addr_no_3': {'full_address': 'Selcuk Univ, Fac Engn & Architecture, Dept Environm Engn, Konya, Turkey', 'organization': 'Selcuk Univ', 'city': 'Konya', 'country': 'Turkey'}}, 'Kivilcimdan, C.': {'daisng_id': '11336889', 'role': 'author', 'seq_no': '3', 'display_name': 'Kivilcimdan, C.', 'addr_no_1': {'full_address': 'Bogazici Univ, Inst Environm Sci, TR-34342 Istanbul, Turkey', 'organization': 'Bogazici Univ', 'city': 'Istanbul', 'country': 'Turkey'}, 'addr_no_2': {'full_address': 'Middle E Tech Univ, Dept Environm Engn, TR-06531 Ankara, Turkey', 'organization': 'Middle E Tech Univ', 'city': 'Ankara', 'country': 'Turkey'}, 'addr_no_3': {'full_address': 'Selcuk Univ, Fac Engn & Architecture, Dept Environm Engn, Konya, Turkey', 'organization': 'Selcuk Univ', 'city': 'Konya', 'country': 'Turkey'}}, 'Tarlan, E.': {'daisng_id': '4522409', 'role': 'author', 'seq_no': '4', 'display_name': 'Tarlan, 
E.'..."
2,"{'DOWD, PA': {'daisng_id': '235744', 'role': 'author', 'seq_no': '1', 'display_name': 'DOWD, PA'}, 'SARAC, C': {'daisng_id': '9722157', 'role': 'author', 'seq_no': '2', 'display_name': 'SARAC, C'}}"
3,"{'Henin, Simon': {'daisng_id': '368363', 'role': 'author', 'seq_no': '1', 'display_name': 'Henin, Simon', 'address': {'full_address': 'NYU, Sch Med, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Shankar, Anita': {'daisng_id': '1010882', 'role': 'author', 'seq_no': '2', 'display_name': 'Shankar, Anita', 'address': {'full_address': 'NYU, Sch Med, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Hasulak, Nicholas': {'daisng_id': '26775283', 'role': 'author', 'seq_no': '3', 'display_name': 'Hasulak, Nicholas', 'address': {'full_address': 'Neuropace Inc, Mountain View, CA USA', 'organization': 'Neuropace Inc', 'city': 'Mountain View', 'country': 'USA'}}, 'Friedman, Daniel': {'daisng_id': '267336', 'role': 'author', 'seq_no': '4', 'display_name': 'Friedman, Daniel', 'address': {'full_address': 'NYU, Sch Med, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Dugan, Patricia': {'daisng_id': '1972116', 'role': 'author', 'seq_no': '5', 'display_name': 'Dugan, Patricia', 'address': {'full_address': 'NYU, Sch Med, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Melloni, Lucia': {'daisng_id': '1289774', 'role': 'author', 'seq_no': '6', 'display_name': 'Melloni, Lucia', 'address': {'full_address': 'Max Planck Inst Empir Aesthet, Frankfurt, Germany', 'organization': 'Max Planck Inst Empir Aesthet', 'city': 'Frankfurt', 'country': 'Germany'}}, 'Flinker, Adeen': {'daisng_id': '3229930', 'role': 'author', 'seq_no': '7', 'display_name': 'Flinker, Adeen', 'address': {'full_address': 'NYU, Sch Med, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Sarac, Cansu': {'daisng_id': '28055296', 'role': 'author', 'seq_no': '8', 'display_name': 'Sarac, Cansu', 'address': {'full_address': 'NYU, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Fang, May': {'daisng_id': '28054879', 'role': 
'a..."
4,"{'Henin, Simon': {'daisng_id': '368363', 'role': 'author', 'seq_no': '1', 'display_name': 'Henin, Simon', 'address': {'full_address': 'NYU, Sch Med, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Shankar, Anita': {'daisng_id': '1010882', 'role': 'author', 'seq_no': '2', 'display_name': 'Shankar, Anita', 'address': {'full_address': 'NYU, Sch Med, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Hasulak, Nicholas': {'daisng_id': '26775283', 'role': 'author', 'seq_no': '3', 'display_name': 'Hasulak, Nicholas', 'address': {'full_address': 'Neuropace Inc, Mountain View, CA USA', 'organization': 'Neuropace Inc', 'city': 'Mountain View', 'country': 'USA'}}, 'Friedman, Daniel': {'daisng_id': '267336', 'role': 'author', 'seq_no': '4', 'display_name': 'Friedman, Daniel', 'address': {'full_address': 'NYU, Sch Med, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Dugan, Patricia': {'daisng_id': '1972116', 'role': 'author', 'seq_no': '5', 'display_name': 'Dugan, Patricia', 'address': {'full_address': 'NYU, Sch Med, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Melloni, Lucia': {'daisng_id': '1289774', 'role': 'author', 'seq_no': '6', 'display_name': 'Melloni, Lucia', 'address': {'full_address': 'Max Planck Inst Empir Aesthet, Frankfurt, Germany', 'organization': 'Max Planck Inst Empir Aesthet', 'city': 'Frankfurt', 'country': 'Germany'}}, 'Flinker, Adeen': {'daisng_id': '3229930', 'role': 'author', 'seq_no': '7', 'display_name': 'Flinker, Adeen', 'address': {'full_address': 'NYU, Sch Med, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Sarac, Cansu': {'daisng_id': '28055296', 'role': 'author', 'seq_no': '8', 'display_name': 'Sarac, Cansu', 'address': {'full_address': 'NYU, New York, NY USA', 'organization': 'NYU', 'city': 'New York', 'country': 'USA'}}, 'Fang, May': {'daisng_id': '28054879', 'role': 
'a..."
