In [6]:
#https://grobid.readthedocs.io/en/latest/training/Bibliographical-references/
from bs4 import BeautifulSoup
import os
import pandas as pd
import numpy as np
import lxml as lxml

In [7]:
def read_tei(tei_file):
    """Parse a TEI file and return every ``biblstruct`` element it contains.

    Args:
        tei_file: Path of the TEI XML file to read.

    Returns:
        The Beautiful Soup result set of ``biblstruct`` tags, in document
        order (empty when the file holds no such element).

    Raises:
        OSError: If the file cannot be opened.
    """
    # Open in binary mode so Beautiful Soup detects the document encoding
    # itself instead of decoding with the platform's locale default.
    with open(tei_file, 'rb') as tei:
        # 'lxml' selects lxml's HTML parser, which lower-cases tag names;
        # that is why the lookup below (and elsewhere) uses lower-case names.
        soup = BeautifulSoup(tei, 'lxml')
    return soup.find_all('biblstruct')


def elem_to_text(elem, default=''):
    """Return ``elem.getText()``, or ``default`` when ``elem`` is falsy.

    Args:
        elem: An object exposing ``getText()`` (e.g. a parsed tag), or a
            falsy value such as ``None`` when the lookup found nothing.
        default: Value returned in place of the text when ``elem`` is falsy.
    """
    return elem.getText() if elem else default
    
def attr_to_text(elem, attr, default=''):
    """Return the value of attribute ``attr`` on ``elem``, or ``default``.

    ``default`` is returned both when ``elem`` is falsy (the element was not
    found) and when the element exists but does not carry ``attr``; a missing
    attribute no longer raises ``KeyError``.

    Args:
        elem: An object exposing an ``attrs`` mapping (e.g. a parsed tag),
            or a falsy value such as ``None``.
        attr: Name of the attribute to read.
        default: Fallback value.
    """
    if not elem:
        return default
    return elem.attrs.get(attr, default)
    
from dataclasses import dataclass
@dataclass
class Person:
    """Name parts of one author of a reference; all three are plain strings."""

    # Text of the author's forename of type "first" ('' when absent).
    firstname: str
    # Text of the author's forename of type "middle" ('' when absent).
    middlename: str
    # Text of the author's surname element ('' when absent).
    surname: str
        
from os.path import basename, splitext

def basename_without_ext(path):
    """Return the file name of ``path`` with its extension removed.

    The last extension is always dropped. If what is left still ends in
    ``.tei`` (as with ``name.tei.xml``), that suffix is dropped as well.
    """
    tei_suffix = '.tei'
    name_only, _ = splitext(basename(path))
    if not name_only.endswith(tei_suffix):
        return name_only
    # Strip the trailing '.tei' left over from a double extension.
    return name_only[:-len(tei_suffix)]

In [8]:
#extract reference
class TEIFile:
    """Accessor for the fields of one parsed ``biblstruct`` element.

    Each property looks its element up inside the wrapped tag and returns an
    empty string (an empty list for ``authors``) when the element is missing,
    so an incomplete reference never raises while a row is being built.
    """

    def __init__(self, biblstruct):
        """Wrap ``biblstruct``, the parsed tag of a single reference."""
        self.soup = biblstruct
        self._text = None   # cache for the ``text`` property
        self._title = ''    # cache for the ``title`` property
        self._abstract = ''  # not read by any property; kept for compatibility

    @property
    def doi(self):
        """Identifier of the reference, preferring a DOI-typed ``idno``.

        A reference may carry several ``idno`` elements, and the first one is
        not necessarily the DOI. An ``idno`` whose ``type`` attribute equals
        ``doi`` (case-insensitive) wins; otherwise the first ``idno`` is
        returned, as before. Returns '' when there is no ``idno`` at all.
        """
        idno_elems = self.soup.find_all('idno')
        if not idno_elems:
            return ''
        for idno_elem in idno_elems:
            if (idno_elem.get('type') or '').lower() == 'doi':
                return idno_elem.getText()
        return idno_elems[0].getText()

    @property
    def date(self):
        """Value of the ``when`` attribute of the ``date`` typed "published".

        Returns '' when there is no such ``date`` element or when it has no
        ``when`` attribute (the latter used to raise ``KeyError``).
        """
        date_elem = self.soup.find('date', type='published')
        if date_elem is None:
            return ''
        return date_elem.get('when', '')

    @property
    def title(self):
        """Text of the first ``title`` element, or '' when there is none.

        The value is cached once a non-empty title has been found.
        """
        if not self._title:
            title_elem = self.soup.find('title')
            self._title = title_elem.getText() if title_elem is not None else ''
        return self._title

    @property
    def authors(self):
        """List of ``Person`` objects, one per ``author`` that has a ``persname``.

        Authors without a ``persname`` child are skipped; missing name parts
        become empty strings.
        """
        result = []
        for author in self.soup.find_all('author'):
            persname = author.find('persname')
            if persname is None:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.find('surname'))
            result.append(Person(firstname, middlename, surname))
        return result

    @property
    def publication(self):
        """Text of the ``title`` inside ``monogr``.

        Returns '' when either the ``monogr`` element or its ``title`` is
        missing (a missing ``monogr`` used to raise ``AttributeError``).
        """
        monogr_elem = self.soup.find('monogr')
        if monogr_elem is None:
            return ''
        monogr_title = monogr_elem.find('title')
        return monogr_title.getText() if monogr_title is not None else ''

    @property
    def publisher(self):
        """Text of the ``publisher`` element, or '' when there is none."""
        pub_elem = self.soup.find('publisher')
        return pub_elem.getText() if pub_elem is not None else ''

    @property
    def text(self):
        """Space-joined text of every untyped ``div`` found under ``body``.

        Returns '' when the wrapped element has no ``body`` descendant (this
        used to raise ``AttributeError``). NOTE(review): a single reference
        presumably never contains a ``body``; this looks like a leftover from
        whole-document parsing - confirm before relying on it.
        """
        if not self._text:
            body = self.soup.find('body')
            if body is None:
                self._text = ''
                return self._text
            divs_text = []
            for div in body.find_all("div"):
                # A div carrying a "type" attribute is skipped; only untyped
                # divs are treated as plain text.
                if not div.get("type"):
                    divs_text.append(div.get_text(separator=' ', strip=True))
            self._text = " ".join(divs_text)
        return self._text

In [9]:
def ref_to_csv_entry(tei, base_name):
    """Flatten one reference into a row tuple.

    Args:
        tei: Object exposing ``doi``, ``title``, ``date``, ``authors``,
            ``publication`` and ``publisher`` attributes.
        base_name: Value placed in the last position of the row.

    Returns:
        ``(doi, title, date, authors, publication, publisher, base_name)``.
    """
    fields = ('doi', 'title', 'date', 'authors', 'publication', 'publisher')
    return tuple(getattr(tei, field) for field in fields) + (base_name,)
# turn reference into csv row
def file_to_tei(fname):
    """Read one TEI file and return its references as a DataFrame.

    Args:
        fname: Path of the TEI file.

    Returns:
        A DataFrame with one row per element returned by ``read_tei`` and the
        columns DOI, Title, Date, Authors, Publication, Publisher and File,
        where File is ``basename_without_ext(fname)``.
    """
    references = read_tei(fname)
    source_name = basename_without_ext(fname)
    rows = [ref_to_csv_entry(TEIFile(node), source_name) for node in references]
    return pd.DataFrame(
        rows,
        columns=['DOI', 'Title','Date', 'Authors','Publication','Publisher','File'],
    )

In [17]:
import glob
from pathlib import Path

# Every *.tei.xml file directly inside the reference folder, as a sorted list of paths.
papers = list(Path("../../reference_xml").glob('*.tei.xml'))
papers.sort()

In [20]:
import multiprocessing
# Report how many CPU cores multiprocessing detects on this machine.
print(f"My machine has {multiprocessing.cpu_count()} cores.")


PosixPath('../../reference_xml/16766_27580_FSPLT1_020012.tei.xml')

In [14]:

from multiprocessing.pool import Pool
# Create the worker pool; no process count is passed, so multiprocessing picks its default.
pool = Pool()
#xmf_full = [floc + x for x in xmf]

My machine has 4 cores.


In [15]:
# Parse every paper in parallel: pool.map returns one DataFrame per file, in
# the same order as `papers`.
# Using the pool as a context manager terminates its worker processes once
# the results are in, so they do not linger for the life of the kernel.
# Re-running this cell therefore requires re-creating `pool` first.
with pool:
    csv_entries = pool.map(file_to_tei, papers)

In [16]:
# Stack the per-file frames into a single table. ignore_index is not passed,
# so each row keeps the index it had in its per-file frame - presumably a
# 0-based position within that file, which repeats across files; verify
# before using the index as a unique key.
ref_df = pd.concat(csv_entries)
# Write the combined table to a CSV in the current working directory; the
# index is written as well since index=False is not passed.
ref_df.to_csv('reference_set_df_10-14-19.csv')