# Author extraction

The original arXiv metadata authors were parsed using some combination of heuristics.
Sometimes a university and/or a location ends up as part of the name, usually as the last item in the list.
Some examples: '1 and 3', 'Iowa', 'Harvard University'.

These could perhaps be fixed manually with additional heuristics, but named-entity recognition (NER) looks like an easy fit for an ML approach here.
Proposal: use a BERT model fine-tuned on CoNLL-2003 to extract author names.

In [1]:
import os
import sys
if '..' not in sys.path:
    sys.path.append('..')
import logging
import _io

from arxiv_analysis import load

import itertools as it
import functools as ft
import collections as col
import typing as ty
import types

import json

import numpy as np
import scipy as sp
import pandas as pd

import torch
import spacy
import sklearn
import transformers as trf

from torch import nn
from torch import functional as F

import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from collections import defaultdict
from typing import Callable, Union, List, Iterable


# Root-logger setup: every record is shown (DEBUG and up), prefixed with its
# level and a minute-resolution timestamp.
logging.basicConfig(
    level=logging.DEBUG,
    datefmt='%Y-%m-%d %H:%M',
    format='[%(levelname)s] %(asctime)s %(message)s',
)
# Handle on the root logger, used as the notebook-wide logger.
L = logging.getLogger()


%matplotlib inline


base = Path('/kaggle/input')
files = {f.name:list(f.glob('*')) 
         for f in base.glob('*')
         if f.is_dir()}

!ls -l $base



total 4
drwxr-xr-x 2 nobody nogroup 4096 Nov  8 09:31 arxiv


In [2]:
def trf_line(d: dict) -> ty.Optional[dict]:
    '''Keep a metadata record only if its categories mention cs.AI.

    The check is a case-insensitive substring test for 'cs.ai' on
    d['categories'], which therefore has to be a string.

    :param d: one metadata record; must contain a 'categories' entry.
    :return: the same dict object when the record matches, otherwise None
        (presumably the loader drops None results; confirm in load.py).
    '''
    return d if 'cs.ai' in d['categories'].lower() else None

# Build the working dataset from the first entry of the 'arxiv' input
# directory, passing every record through trf_line and capping the load at 100
# (whether the cap counts raw or kept records depends on load.ArxivDataset;
# not visible from this notebook).
arxiv = load.ArxivDataset.load(limit=100, transform=trf_line, loc=files['arxiv'][0])

In [25]:
class AuthorClassifier:
    '''Tag the tokens of an author string with a token-classification model
    and merge neighbouring person tokens into author spans.

    Calling an instance on a SpaCy document returns ``(ents, authors)``:
    one single-token span per SpaCy token carrying the predicted label, and
    one span labelled ``'author'`` per detected name.
    '''

    # Index i is the label name for class id i predicted by the model.
    label_list = [
        "O",       # Outside of a named entity
        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
        "I-MISC",  # Miscellaneous entity
        "B-PER",   # Beginning of a person's name right after another person's name
        "I-PER",   # Person's name
        "B-ORG",   # Beginning of an organisation right after another organisation
        "I-ORG",   # Organisation
        "B-LOC",   # Beginning of a location right after another location
        "I-LOC"    # Location
    ]

    def __init__(self, pretrained, device='cpu', tokenizer_name="bert-base-cased"):
        '''Load the model and tokenizer.

        :param pretrained: name or path of the token-classification checkpoint.
        :param device: torch device the model is moved to.
        :param tokenizer_name: name or path of the tokenizer checkpoint; the
            default keeps the tokenizer this class has always used
            (presumably it shares the model's vocabulary; confirm when
            switching checkpoints).
        '''
        self.device = device
        self.model = trf.AutoModelForTokenClassification.from_pretrained(pretrained).to(device)
        self.tokenizer = trf.AutoTokenizer.from_pretrained(tokenizer_name)

    def __call__(self, doc: "spacy.tokens.Doc"):
        '''Extract entities from the SpaCy document.

        :param doc: document whose ``text`` is fed to the model.
        :return: tuple ``(ents, authors)`` of two lists of spans over ``doc``.
        '''
        inputs = self.tokenizer.encode(doc.text, return_tensors="pt")
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            # Index instead of unpacking: [0] selects the logits both from a
            # plain tuple and from a transformers ModelOutput, whereas
            # iterating a ModelOutput yields its keys.
            logits = self.model(inputs.to(self.device))[0]
        # Single input sequence, so keep row 0 of the (batch, sequence) ids.
        predictions = torch.argmax(logits, dim=2)[0].cpu()
        # Reuse the ids that were fed to the model instead of encoding twice.
        tokens = self.tokenizer.convert_ids_to_tokens(inputs[0].tolist())

        ents = list(self._align_ner2doc(doc, predictions, tokens))
        authors = list(self._extract_authors(doc, ents))

        return ents, authors

    def _tokenize(self, in_str):
        '''Return the tokenizer's token strings for ``in_str`` as produced by
        ``encode`` followed by ``convert_ids_to_tokens``.
        '''
        return self.tokenizer.convert_ids_to_tokens(self.tokenizer.encode(in_str))

    @classmethod
    def _align_ner2doc(cls, doc, predictions, tokens):
        '''Yield one single-token span per SpaCy token, labelled with the
        prediction of the first tokenizer token aligned to it.

        :param doc: the SpaCy document.
        :param predictions: class ids, one per entry of ``tokens``.
        :param tokens: tokenizer token strings for ``doc.text``.
        '''
        def _match(text, tok_ix): # match spacy's tokens with tokenizer tokens
            # Consume tokenizer tokens for as long as they spell out ``text``
            # left to right; return the index of the first unconsumed one.
            # NOTE(review): a tokenizer token that is not a literal piece of
            # ``text`` (e.g. an unknown-token placeholder) stops the scan
            # without advancing, so the next SpaCy token reuses the same
            # prediction; verify on inputs with out-of-vocabulary characters.
            text_ix = 0
            while tok_ix < len(tokens):
                # Drop the leading '#' marks of continuation pieces.
                tok = tokens[tok_ix].lstrip('#')
                if text[text_ix:text_ix+len(tok)] != tok:
                    break
                tok_ix += 1
                text_ix += len(tok)
            return tok_ix

        # Start at 1: index 0 is skipped (presumably the tokenizer's leading
        # special token).
        i = 1
        for j, spacy_tok in enumerate(doc):
            # 1st wordpiece determines entire token
            label = cls.label_list[int(predictions[i])]
            yield spacy.tokens.Span(doc, j, j+1, label=label)
            i = _match(spacy_tok.text, i)

    @staticmethod
    def _author_bounds(ents):
        '''Return ``(start, stop)`` index pairs (stop exclusive) into ``ents``,
        one per run of entries that forms a single author name.

        A run starts at an entry whose label ends with 'PER' and continues
        while the next entry starts exactly where the previous one ends and is
        either another *-PER entry or a literal '-'.
        '''
        bounds = []
        i = 0
        while i < len(ents):
            j = i + 1
            if ents[i].label_.endswith('PER'):
                while j < len(ents):
                    adjacent = ents[j-1].end == ents[j].start
                    # hack: special-case hyphenated names.
                    # Compare by value; identity of str objects is an
                    # implementation detail.
                    if not (adjacent and (ents[j].text == '-'
                                          or ents[j].label_.endswith('PER'))):
                        break
                    j += 1
                bounds.append((i, j))
            i = j
        return bounds

    @staticmethod
    def _extract_authors(doc, ents: "ty.List[spacy.tokens.Span]"):
        '''Yield one span labelled 'author' per contiguous run of person
        tokens.

        :param doc: the SpaCy document the spans are built on.
        :param ents: spans as produced by ``_align_ner2doc``; positions in
            this list are used directly as token offsets into ``doc``.
        '''
        for start, stop in AuthorClassifier._author_bounds(ents):
            yield spacy.tokens.Span(doc, start, stop, label='author')
    

In [26]:
# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Token classifier built from a BERT-large checkpoint fine-tuned on CoNLL-03.
clf = AuthorClassifier(
    device=device,
    pretrained="dbmdz/bert-large-cased-finetuned-conll03-english")

# Smoke test on the first record's authors entry.
clf(arxiv.df.authors.iloc[0])

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=998.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1334448817.0, style=ProgressStyle(descr…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




([T., Kosel, and, I., Grabec], [T. Kosel, I. Grabec])

In [27]:
%%time

# either attach authors to the DataFrame
# or directly as an extension to the document
spacy.tokens.Doc.set_extension('authors', default=None, force=True)
author_list = arxiv.df.authors.head(n=100)

for author_doc in author_list:
    ents, authors = clf(author_doc)
    author_doc.ents = ents
    author_doc._.authors = authors

CPU times: user 55.1 s, sys: 791 ms, total: 55.9 s
Wall time: 28 s


In [28]:
# One line per document: "<position>:<extracted author spans>".
print(*(f'{i}:{x._.authors}' for i, x in enumerate(author_list)), sep='\n')

0:[T. Kosel, I. Grabec]
1:[T. Kosel, I. Grabec]
2:[Carlos Gershenson]
3:[Mohd Abubakr, R.M.Vinay]
4:[Jianlin Cheng]
5:[Tarik Hadzic, Rune Moller Jensen, Henrik Reif Andersen]
6:[Yao HengShuai]
7:[Anon Plangprasopchok, Kristina Lerman]
8:[Kristina Lerman, Anon Plangprasopchok, Chio Wong]
9:[Stefano Bistarelli, Ugo Montanari, Francesca Rossi, Francesco Santini]
10:[Juliana S Bernardes, Alberto Davila, Vitor Santos Costa, Gerson
  Zaverucha]
11:[H. Satori, M. Harti, N. Chenfour]
12:[H. Satori, M. Harti, N. Chenfour]
13:[Giorgio Terracina, Nicola Leone, Vincenzino Lio, Claudio Panetta]
14:[Quoc Le, Alexander Smola]
15:[Marko A. Rodriguez]
16:[Tshilidzi Marwala, Bodie Crossingham]
17:[S. Mohamed, D. Rubin, T. Marwala]
18:[J. Uglov, V. Schetinin, C. Maple]
19:[Walid S. Saba]
20:[Christian Gagn\'e, Mich\`ele Sebag, Marc Schoenauer, Marco Tomassini]
21:[Andreas Martin Lisewski]
22:[Tshilidzi Marwala, Unathi Mahola, Snehashish Chakraverty]
23:[Erik Berglund, Joaquin Sitte]
24:[Edgar H. de Graaf

## Heuristic check

Let's try to compare which examples NER picks up against our simple heuristic.

In [29]:
# Identifiers of the records kept in `arxiv`; used to pick the same records
# out of the raw file.
idx = arxiv.indices
# JSON text is UTF-8; state it explicitly so decoding does not depend on the
# platform's default encoding.
with open(files['arxiv'][0], encoding='utf-8') as fp:
    # Keep only the heuristic 'authors_parsed' field of the selected records;
    # other records map to None.
    parsed_authors = pd.DataFrame(load.jsonl(
        fp, 
        transform=lambda dict_: {'authors_parsed': dict_['authors_parsed']} if dict_['id'] in idx else None,
        limit=100
    ))

In [30]:
# Bare last expression: the notebook renders the heuristic parse as a table.
parsed_authors

Unnamed: 0,authors_parsed
0,"[[Kosel, T., ], [Grabec, I., ]]"
1,"[[Kosel, T., ], [Grabec, I., ]]"
2,"[[Gershenson, Carlos, ]]"
3,"[[Abubakr, Mohd, ], [Vinay, R. M., ]]"
4,"[[Cheng, Jianlin, ]]"
...,...
95,"[[Montanari, Andrea, ], [Ricci-Tersenghi, Fede..."
96,"[[Fodor, Paul, ]]"
97,"[[Li, Xinde, , ICRL], [Huang, Xinhan, , ICRL],..."
98,"[[Khrennikov, Andrei, ]]"


In [32]:
# Report every document for which the heuristic parse and the NER extraction
# disagree on the number of authors.
for i, (a_ner, a_heuristic) in enumerate(zip(author_list, parsed_authors.authors_parsed)):
    ner_authors = a_ner._.authors
    if len(a_heuristic) == len(ner_authors):
        continue
    # Blank separator line, then the document position.
    print(f'\n{i}')
    print('Heuristic ::', a_heuristic)
    print(a_ner.text)
    # Per-token labels first, merged author spans second.
    print([f'{x.text} ({x.label_}) //' for x in a_ner.ents])
    print([f'{x} //' for x in ner_authors])

# Manually compare to heuristics

Based on a quick overview of a subset of the data, it seems that we keep 100% recall while improving precision (e.g. INRIA, LIPN, etc.).

In [34]:
# Print both extractions for every document, one blank line between documents.
# The position is not shown here, so the pairs are iterated directly rather
# than through enumerate.
for a_ner, a_heuristic in zip(author_list, parsed_authors.authors_parsed):
    print('Transform ::', a_ner._.authors)
    print('Heuristic ::', a_heuristic)
    print()

Transform :: [T. Kosel, I. Grabec]
Heuristic :: [['Kosel', 'T.', ''], ['Grabec', 'I.', '']]

Transform :: [T. Kosel, I. Grabec]
Heuristic :: [['Kosel', 'T.', ''], ['Grabec', 'I.', '']]

Transform :: [Carlos Gershenson]
Heuristic :: [['Gershenson', 'Carlos', '']]

Transform :: [Mohd Abubakr, R.M.Vinay]
Heuristic :: [['Abubakr', 'Mohd', ''], ['Vinay', 'R. M.', '']]

Transform :: [Jianlin Cheng]
Heuristic :: [['Cheng', 'Jianlin', '']]

Transform :: [Tarik Hadzic, Rune Moller Jensen, Henrik Reif Andersen]
Heuristic :: [['Hadzic', 'Tarik', ''], ['Jensen', 'Rune Moller', ''], ['Andersen', 'Henrik Reif', '']]

Transform :: [Yao HengShuai]
Heuristic :: [['HengShuai', 'Yao', '']]

Transform :: [Anon Plangprasopchok, Kristina Lerman]
Heuristic :: [['Plangprasopchok', 'Anon', ''], ['Lerman', 'Kristina', '']]

Transform :: [Kristina Lerman, Anon Plangprasopchok, Chio Wong]
Heuristic :: [['Lerman', 'Kristina', ''], ['Plangprasopchok', 'Anon', ''], ['Wong', 'Chio', '']]

Transform :: [Stefano Bistar