## parsing with lxml lib using parse() utility function

In [None]:
from lxml import etree
import pandas as pd

# Load the external DTD while parsing (load_dtd=True). Note that this only loads the DTD;
# it does not validate the document against it -- dtd_validation=True is the option for that.
parser = etree.XMLParser(load_dtd=True) # alternative option: dtd_validation=True
tree = etree.parse("data/dblp.xml", parser)
root = tree.getroot()
# Note: parse() returns an ElementTree object, not an Element object as the string parser
# functions do, hence the getroot() call above.

In [None]:
import re

def sanitize(text):
    """Strip literal italic tags (``<i>`` / ``</i>``) from a string.

    Example input: ``<i> NP </i> -Hard Problems in Hierarchical-Tree Clustering.``

    Falsy input (``None`` or the empty string) is handed back unchanged.
    """
    if not text:
        return text
    return re.sub('<(/)*i>', u'', text)

In [None]:
import re
# quick check of the pattern used by sanitize(): opening and closing <i> tags are removed
string = '<i>text here</i>'
re.sub('<(/)*i>', '', string)

In [None]:
from time import time

print("Parsing articles only...")
t0 = time()

# Column layout of the result: one row per (author, article) pair.
dfcols = ['author', 'title', 'journal']

# Tags of the top-level records that are extracted.
collaborations = ['article']

# Rows are collected in a plain list and converted to a DataFrame once at the end.
# Appending to a DataFrame inside the loop copies the whole frame for every row
# (quadratic run time), and DataFrame.append was removed in pandas 2.0.
records = []
author_list = []

for node in root:
    if node.tag not in collaborations:
        continue

    # Reset the per-record state so that an article without <title> or <journal>
    # does not inherit the value of the previous article (or hit an undefined name
    # on the very first one).
    author_list = []
    title = None
    journal = None

    for child in node:
        if 'author' in child.tag:
            author_list.append(child.text)
        elif 'title' in child.tag:
            title = sanitize(child.text)
        elif 'journal' in child.tag:
            journal = child.text

    records.extend([a, title, journal] for a in author_list)

df_xml = pd.DataFrame(records, columns=dfcols)

print("done in %0.3fs." % (time() - t0))

df_xml.to_csv("data/parsed_articles.csv", header=True, sep = ',')

I let it run for an hour and it still had not finished, so we try another method.

In [None]:
# (rows, columns) of the frame of parsed articles
df_xml.shape

## parsing with lxml lib using the iterparse() utility function

In [None]:
import re

def sanitize(text):
    '''
    Removes specific HTML Formatting Elements for italic text, i.e. <i>
    # example <title><i> NP </i> -Hard Problems in Hierarchical-Tree Clustering.</title>

    Falsy input (None or an empty string) is returned unchanged: element text is None
    for empty elements, and re.sub() raises TypeError when given None.
    '''
    if not text:
        return text
    return re.sub('<(/)*i>', u'', text)

In [None]:
# parsing only one child element called article  

from lxml import etree
import pandas as pd
from time import time

def fast_iter(context, func, *args, **kwargs):
    """Stream <article> records from an iterparse context, one CSV line per author.

    For every completed <article> element the authors, title and journal are read
    from its child elements and ``func("author,title,journal", *args, **kwargs)`` is
    called once per author. Articles without any author produce no output.

    Parameters
    ----------
    context : iterable of (event, element) pairs, e.g. ``etree.iterparse(path)``
        with its default 'end' events.
    func : callable receiving the CSV line followed by ``*args`` / ``**kwargs``.

    Notes
    -----
    * Fields are collected per record, when the record itself is complete. Collecting
      them from the flat event stream attributes authors of other record types to the
      next article and carries a stale title/journal over to records that lack one.
    * Child elements are left intact until their record has been processed, so the
      full text of a field (including text after inline markup such as <i>) is available.
    * Memory is released for direct children of the root only; deeper elements are
      released together with their top-level ancestor.
    """
    collaborations = ['article']

    def text_of(node):
        # node.text only holds the text before the first child element, so a title
        # that starts with <i>...</i> would come back as None; itertext() walks
        # through inline markup as well.
        return ''.join(node.itertext())

    def csv_field(value):
        # Quote fields containing the separator, a quote or a line break; otherwise
        # a comma inside a title shifts all following columns when the file is read.
        if any(ch in value for ch in ',"\r\n'):
            return '"' + value.replace('"', '""') + '"'
        return value

    for event, elem in context:
        if elem.tag in collaborations:
            author_list = []
            title = ''
            journal = ''
            for child in elem:
                tag = child.tag
                if not isinstance(tag, str):
                    # comments and processing instructions have no string tag
                    continue
                if 'author' in tag:
                    author_list.append(text_of(child))
                elif tag == 'title':
                    # exact match: a substring test would also hit <booktitle>
                    text = text_of(child)
                    if text:
                        title = sanitize(text)
                elif 'journal' in tag:
                    text = text_of(child)
                    if text:
                        journal = sanitize(text)

            for a in author_list:
                line = ",".join(csv_field(field) for field in (a, title, journal))
                func(line, *args, **kwargs)

        # Free memory once a top-level element (direct child of the root) is complete:
        # empty it and drop the already processed siblings in front of it.
        parent = elem.getparent()
        if parent is not None and parent.getparent() is None:
            elem.clear()
            while elem.getprevious() is not None:
                del parent[0]
    del context

    
def process_element(elem, fout):
    """Write one record to the output stream `fout`, followed by a newline."""
    line = str(elem)
    print(line, file=fout)
    
    
if __name__ == "__main__":
    print("Parsing articles only...")
    t0 = time()
    # The with-block closes (and thereby flushes) the output file even if parsing fails;
    # a file that is left open in a long-lived kernel may never be written out completely.
    # The encoding is set explicitly so the result does not depend on the platform default.
    with open('data/parsed_articles.txt', 'w', encoding='utf-8') as fout:
        context = etree.iterparse('data/dblp.xml', load_dtd=True)
        fast_iter(context, process_element, fout)
    print("done in %0.3fs." % (time() - t0))

    

In [None]:
# parsing all child elements, including www child element which is a person record

from lxml import etree
import pandas as pd
from time import time

def fast_iter(context, func, *args, **kwargs):
    """Stream all publication and person records from an iterparse context.

    For every completed record whose tag is listed in ``collaborations`` the authors,
    title and journal are read from its child elements and
    ``func("type,key,author,title,journal", *args, **kwargs)`` is called once per
    author. Records without any author produce no output.

    Parameters
    ----------
    context : iterable of (event, element) pairs, e.g. ``etree.iterparse(path)``
        with its default 'end' events.
    func : callable receiving the CSV line followed by ``*args`` / ``**kwargs``.

    Notes
    -----
    * Fields are collected per record, when the record itself is complete. Collecting
      them from the flat event stream lets the authors/title/journal of a record that
      was skipped (unknown tag or no authors) leak into the following record.
    * Child elements are left intact until their record has been processed, so the
      full text of a field (including text after inline markup such as <i>) is available.
    * Memory is released for direct children of the root only; deeper elements are
      released together with their top-level ancestor.
    """
    collaborations = [u'www', u'article', u'phdthesis', u'inproceedings', u'incollection', u'proceedings', u'book', u'mastersthesis']

    def text_of(node):
        # node.text only holds the text before the first child element, e.g. None for
        # <title><i> LALR </i> (1, 1) Parser Generation ...</title>; itertext() walks
        # through inline markup as well.
        return ''.join(node.itertext())

    def csv_field(value):
        # Quote fields containing the separator, a quote or a line break; otherwise
        # a comma inside a title shifts all following columns when the file is read.
        if any(ch in value for ch in ',"\r\n'):
            return '"' + value.replace('"', '""') + '"'
        return value

    for event, elem in context:
        if elem.tag in collaborations:
            type_publication = elem.tag
            # a missing key attribute becomes an empty field instead of a TypeError
            key_value = elem.get('key') or ''
            author_list = []
            title = ''
            journal = ''
            for child in elem:
                tag = child.tag
                if not isinstance(tag, str):
                    # comments and processing instructions have no string tag
                    continue
                if 'author' in tag:
                    author_list.append(text_of(child))
                elif tag == 'title':
                    # exact match: a substring test would also hit <booktitle>
                    text = text_of(child)
                    if text:
                        title = sanitize(text)
                elif 'journal' in tag:
                    text = text_of(child)
                    if text:
                        journal = sanitize(text)

            for a in author_list:
                fields = (type_publication, key_value, a, title, journal)
                func(",".join(csv_field(field) for field in fields), *args, **kwargs)

        # Free memory once a top-level element (direct child of the root) is complete:
        # empty it and drop the already processed siblings in front of it, so the root
        # does not keep millions of emptied records.
        parent = elem.getparent()
        if parent is not None and parent.getparent() is None:
            elem.clear()
            while elem.getprevious() is not None:
                del parent[0]
    del context

    
def process_element(elem, fout):
    """Emit a single output line: the string form of `elem`, printed to `fout`."""
    text = str(elem)
    print(text, file=fout)
    
    
if __name__ == "__main__":
    print("Parsing...")
    t0 = time()
    # NOTE(review): absolute, user-specific paths -- consider a configurable data directory.
    # The with-block closes (and thereby flushes) the output file even if parsing fails;
    # a file that is left open in a long-lived kernel may never be written out completely.
    # The encoding is set explicitly because the file is read back as UTF-8 later on.
    with open('/Users/aj186039/projects/PMI_UseCase/data/parsed_dblp.txt', 'w', encoding='utf-8') as fout:
        context = etree.iterparse('/Users/aj186039/projects/PMI_UseCase/data/dblp.xml', load_dtd = True)
        fast_iter(context, process_element, fout)
    print("done in %0.3fs." % (time() - t0))

    

In [None]:
# Substring tests are too loose for tag names: "collection" is contained in "incollection"
# (just as "proceedings" is contained in "inproceedings"), so exact comparison is safer.
string = "incollection"
"collection" in string

#### example person record

<www key="homepages/r/CJvanRijsbergen">
<author>C. J. van Rijsbergen</author>
<author>Cornelis Joost van Rijsbergen</author>
<author>Keith van Rijsbergen</author>
<title>Home Page</title>
<url>http://www.dcs.gla.ac.uk/~keith/</url>
</www>


## import parsed data and map author names to their unique keys

In [None]:
import pandas as pd
# Load the parser output; the file is read without a header row (header=None), so the
# column names are supplied explicitly.
data = pd.read_csv('/Users/aj186039/projects/PMI_UseCase/data/parsed_dblp.txt', sep = ',', header=None, encoding='utf-8', 
                   names = ["type_publication", "key_value" , "author", "title", "journal"], low_memory=False)
data.shape

In [None]:
# dtype of every column of the loaded frame
data.dtypes

In [None]:
# column names of the frame
data.columns

In [None]:
# distinct values of the type_publication column
data.type_publication.unique()

In [None]:
# person records: the rows whose type_publication is 'www'
personal_data = data.loc[data['type_publication'] == 'www']

In [None]:
# keep only the rows that are not person records ('www'); note that this rebinds `data`
data = data[data['type_publication'] != 'www']

In [None]:
# (rows, columns) of the frame
data.shape

In [None]:
# remove unnecessary columns
# drop() returns a new frame here: an in-place drop on a frame obtained by boolean
# selection triggers pandas' SettingWithCopyWarning.
personal_data = personal_data.drop(columns=['type_publication', 'journal', 'title'])

In [None]:
# show the first 5 rows
personal_data.head()

In [None]:
# look up all rows of one specific person by key
personal_data[personal_data['key_value'] == "homepages/r/CJvanRijsbergen"]

In [None]:
# group the author names by key value: all name variants that share a key are collected
# into one list, for example
# {C. J. van Rijsbergen, Cornelis Joost van Rijsbergen, Keith van Rijsbergen} --> homepages/r/CJvanRijsbergen
grouped_personal_data = personal_data.groupby('key_value')['author'].apply(list) 

In [None]:
# type of the grouped object
type(grouped_personal_data)

In [None]:
# create a dictionary (key -> list of author names) from the series
dictinary_names = grouped_personal_data.to_dict()

In [None]:
# look up the names recorded for one specific person key
dictinary_names["homepages/r/CJvanRijsbergen"]

In [None]:
# show the first 5 rows
data.head() 

In [None]:
# Replace every author name in the data with the unique key of the person it belongs to,
# using the key -> [names] dictionary; names without a person record are kept unchanged.
# For example, all name variants an author used in publications map to one and the same key:
# {C. J. van Rijsbergen, Cornelis Joost van Rijsbergen, Keith van Rijsbergen} --> homepages/r/CJvanRijsbergen

# Invert the dictionary once (name -> key) so that each row costs a single hash lookup
# instead of a scan over every person record. If a name is listed under several keys,
# the first key encountered wins.
name_to_key = {}
for key, names in dictinary_names.items():
    for name in names:
        name_to_key.setdefault(name, key)

# The lookup must use the row's own author; the previous loop searched for `name_to_look`,
# a variable unrelated to the row (and not defined at this point of the notebook).
data = data.assign(authorNEW=data['author'].map(name_to_key).fillna(data['author']))

I let it run for some time, but the performance was not satisfactory. The method needs to be optimised.

In [None]:
# simulate the required behavior on a toy dictionary:
# find every key whose list of names contains the name we are looking for
mydict = {'amber': ['Katerina', 'Almerima'] , 'george': ['Buba'], 'george2': ['Buba2']}
name_to_look = 'Buba2'
[k for k, v in mydict.items() if name_to_look in v]

In [None]:
# save the data to text file
# to_csv is a DataFrame method (pandas has no module-level pd.to_csv) and returns None when
# a path is given, so its result must not be assigned back to `data`.
data.to_csv('/Users/aj186039/projects/PMI_UseCase/data/final_parsed_dblp.txt', sep = ',', header=True, encoding='utf-8')
data.shape