* simple Named Entity Reongition model with VAR and TYPE tags using spaCy
* training data: tex files from the Stacks Project annotated using a "Let ... be a ..." rule
* inspired by https://github.com/explosion/spaCy/blob/master/examples/training/train_ner.py

In [181]:
from __future__ import unicode_literals, print_function
import json
import pathlib
import random

import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse
from spacy.tagger import Tagger
from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, LOWER, TAG, ORTH

import os
import re
 
try:
    unicode
except:
    unicode = str

In [182]:
nlp = spacy.load('en')

In [183]:
#initialize the triggering rule 'let $' for the rule-based annotation
matcher = Matcher(nlp.vocab)
matcher.add_pattern("let", [{LOWER: "let"}, {ORTH: "$"}])

In [184]:
def tex2doc(tex_file):  #read the whole tex file in the spaCy doc object
    with open(tex_file, 'r') as tex:
        data=tex.read()
    doc = nlp(data)
    return doc

In [185]:
def rule_based_annotation(doc):
    annotation = []
    for m in matcher(doc):
        let_position = m[2]
        dollar_position = m[3]
        annotation += add_letDollarBe_entity(doc, let_position, dollar_position)
    return (doc.text, annotation)


def add_letDollarBe_entity(doc, let_position, dollar_position):
    new_annotation=[]
    sentence = doc[let_position : dollar_position].sent   #stay in the current sentence, so to respect sentence boundaries
    print('===== sent', sentence, '========\n')
    
    part = doc[let_position : sentence.end]  #forget the already-processed first part of the sentence
    
    for regex_match in re.finditer('^let (\$+[^\$]+\$+) be (an?|the) \S+', part.text, re.IGNORECASE): #there is at most one match in this for loop because of ^
        #compute left and right char offset for candidate VAR
        left_offset = part.start_char + regex_match.span(1)[0]
        right_offset = part.start_char + regex_match.span(1)[1]
        
        print('regex_match', regex_match.group(1))
        for nn in part.noun_chunks:
            print('nn candidate', nn)
            if nn.start_char >= right_offset + 3: #we only consider noun chunks after the '$ be '
                
                
                
                new_annotation += ((left_offset, right_offset, 'VAR'),)
                
                #left and right char offset for TYPE are already among the nn attributes
                new_annotation += ((nn.start_char, nn.end_char, 'TYPE'),)
                print('nn', nn.text)
                break  #we only consider the 1st noun chunk after the 'be' 
                
    
    print('new_annotation', new_annotation)
                    
    return new_annotation

In [193]:
annotated_data=[]

directory = os.fsencode('tex_files/')
list_of_texs = [os.fsdecode(file) for file in os.listdir(directory)[0:3]]
for filename in list_of_texs:
    print("file: ", filename)
    doc = tex2doc(os.path.join(os.fsdecode(directory), filename))
    annotated_data.append(rule_based_annotation(doc))    

file:  pione.tex
===== sent Before we state the result we introduce the category of $G$-sets for a
topological group $G$.

\begin{definition}
\label{definition-G-set-continuous}
Let $G$ be a topological group.

regex_match $G$
nn candidate a topological group
nn a topological group
new_annotation [(794, 797, 'VAR'), (801, 820, 'TYPE')]
===== sent \medskip\noindent
Recall that if $L/K$ is an infinite Galois extension then the
Galois group $G = \text{Gal}(L/K)$ comes endowed with a canonical
topology, see Fields, Section \ref{fields-section-infinite-galois}.

\begin{lemma}
\label{lemma-sheaves-point}

regex_match $K$
nn candidate a field
nn a field
new_annotation [(1863, 1866, 'VAR'), (1870, 1877, 'TYPE')]
===== sent Let $K^{sep}$ a separable closure of $K$.
Consider the profinite group $G = \text{Gal}(K^{sep}/K)$.

new_annotation []
===== sent In this section we discuss some of the material the reader can
find in \cite[Expos\'e V, Sections 4, 5, and 6]{SGA1}.

\medskip\noindent
Let $F :

===== sent Let $L^{sh} = L \otimes_K K_x^{sh}$ where $K_x^{sh}$ is the fraction field
of $\mathcal{O}_{X, x}^{sh}$. Then $L^{sh} = \prod_{i = 1, \ldots, n} L_i$

new_annotation []
===== sent \begin{lemma}
\label{lemma-structure-decomposition}
Let $A$ be a discrete valuation ring with fraction field $K$.

regex_match $A$
nn candidate a discrete valuation ring
nn a discrete valuation ring
new_annotation [(122395, 122398, 'VAR'), (122402, 122427, 'TYPE')]
===== sent Let $L/K$ be a (possibly infinite) Galois extension.

regex_match $L/K$
nn candidate a (possibly infinite) Galois extension
nn a (possibly infinite) Galois extension
new_annotation [(122457, 122462, 'VAR'), (122466, 122504, 'TYPE')]
===== sent Let $B$ be the integral closure of $A$ in $L$.
Let $\mathfrak m$ be a maximal ideal of $B$.
Let $G = \text{Gal}(L/K)$,
$D = \{\sigma \in G \mid \sigma(\mathfrak m) = \mathfrak m\}$, and
$I = \{\sigma \in D \mid \sigma \bmod \mathfrak m =

regex_match $B$
nn candidate the integral closure

===== sent \section{Cycles}
\label{section-cycles}

\noindent

regex_match $X$
nn candidate a variety
nn a variety
new_annotation [(5391, 5394, 'VAR'), (5398, 5407, 'TYPE')]
===== sent Let $Z_i$ be the irreducible components of $Z$ of
dimension $k$ and let $n_i$ be the {\it multiplicity of $Z_i$ in $Z$}
defined as
$$
n_i = \text{length}_{\mathcal{O}_{X, Z_i}} \mathcal{O}_{Z, Z_i}
$$
where $\mathcal{O}_{X, Z_i}$, resp.\ $\mathcal{O}_{Z, Z_i}$ is the
local ring of $X$, resp.\ $Z$ at the generic point of $Z_i$.
We define the $k$-cycle associated to $Z$ to be the $k$-cycle
$$
[Z]_k = \sum n_i [Z_i].

regex_match $Z_i$
nn candidate the irreducible components
nn the irreducible components
new_annotation [(6201, 6206, 'VAR'), (6210, 6236, 'TYPE')]
===== sent Let $Z_i$ be the irreducible components of $Z$ of
dimension $k$ and let $n_i$ be the {\it multiplicity of $Z_i$ in $Z$}
defined as
$$
n_i = \text{length}_{\mathcal{O}_{X, Z_i}} \mathcal{O}_{Z, Z_i}
$$
where $\mathcal{O}_{X, Z_i}$, resp.\ 

===== sent \end{proof}

\begin{lemma}
\label{lemma-determinant}

regex_match $k$
nn candidate a field
nn a field
new_annotation [(92109, 92112, 'VAR'), (92116, 92123, 'TYPE')]
===== sent Let $n \geq 1$ be an integer and let

regex_match $n \geq 1$
nn candidate $
nn candidate an integer
nn an integer
new_annotation [(92129, 92139, 'VAR'), (92143, 92153, 'TYPE')]
===== sent x_{n1} & \ldots & \ldots & x_{nn}
\end{matrix}
\right)
$$
is an irreducible element of the polynomial ring $k[x_{ij}]$.
\end{lemma}

\begin{proof}

regex_match $V$
nn candidate an $n$ dimensional vector space
nn an $n$ dimensional vector space
new_annotation [(92502, 92505, 'VAR'), (92509, 92540, 'TYPE')]
===== sent Let $W$ be a vector space of dimension $n - 1$.
By elementary linear algebra, the morphism
$$

regex_match $W$
nn candidate a vector space
nn a vector space
new_annotation [(92669, 92672, 'VAR'), (92676, 92690, 'TYPE')]
===== sent \end{proof}

\noindent

regex_match $V$
nn candidate a vector space
nn a vec

===== sent \medskip\noindent

regex_match $X$
nn candidate a simplicial space
nn a simplicial space
new_annotation [(1804, 1807, 'VAR'), (1811, 1829, 'TYPE')]
===== sent \begin{lemma}
\label{lemma-simplicial-site}

regex_match $X$
nn candidate a simplicial space
nn a simplicial space
new_annotation [(2935, 2938, 'VAR'), (2942, 2960, 'TYPE')]

regex_match $X$
nn candidate a simplicial space
nn a simplicial space
new_annotation [(3068, 3071, 'VAR'), (3075, 3093, 'TYPE')]
===== sent Let $\mathcal{F}$ be a sheaf on $X_{Zar}$.
It is clear from the definition of coverings, that the restriction
of $\mathcal{F}$ to the opens of $X_n$ defines a sheaf $\mathcal{F}_n$
on the topological space $X_n$. For every $\varphi : [m] \to [n]$ the
restriction maps of $\mathcal{F}$ for pairs $U \subset X_n$, $V \subset X_m$
with $X(\varphi)(U) \subset V$, define an $X(\varphi)$-map
$\mathcal{F}(\varphi) : \mathcal{F}_m \to \mathcal{F}_n$, see
Sheaves, Definition \ref{sheaves-definition-f-map}.
Moreover, give

nn candidate a site
nn a site
new_annotation [(166544, 166557, 'VAR'), (166561, 166567, 'TYPE')]

regex_match $K$
nn candidate a hypercovering
nn a hypercovering
new_annotation [(166608, 166611, 'VAR'), (166615, 166630, 'TYPE')]
===== sent First, let $\mathcal{I}$ be an injective abelian sheaf on $\mathcal{C}$.
Then the spectral sequence of
Lemma \ref{lemma-augmentation-spectral-sequence}
for the sheaf $a^{-1}\mathcal{I}$ degenerates as
$(a^{-1}\mathcal{I})_p = a_p^{-1}\mathcal{I}$
is injective by Lemma \ref{lemma-localize-injective}.

regex_match $\mathcal{I}$
nn candidate $\mathcal{I}$
nn candidate an injective abelian sheaf
nn an injective abelian sheaf
new_annotation [(166762, 166775, 'VAR'), (166779, 166805, 'TYPE')]
===== sent We conclude that
$R^pa_*a^{-1}\mathcal{I} = 0$ for $p > 0$.
On the other hand, we have $\mathcal{I} = a_*a^{-1}\mathcal{I}$
by Lemma \ref{lemma-hypercovering-descent-sheaves}.

\medskip\noindent
Next, let $E$ be as in the statement of the lemma.

new_annota

===== sent \begin{lemma}
\label{lemma-equivalence-relation}
Let $f : X \to S$ be a morphism of schemes.

regex_match $f : X \to S$
nn candidate X \to
nn candidate a morphism
nn a morphism
new_annotation [(245024, 245037, 'VAR'), (245041, 245051, 'TYPE')]
===== sent Let $\pi : Y \to (X/S)_\bullet$ be a cartesian morphism of simplicial
schemes, see Definitions \ref{definition-cartesian-morphism} and
\ref{definition-fibre-products-simplicial-scheme}.
Then the morphism
$$
j = (d^1_1, d^1_0) : Y_1 \to Y_0 \times_S Y_0
$$
defines an equivalence relation on $Y_0$ over $S$,
see Groupoids, Definition \ref{groupoids-definition-equivalence-relation}.
\end{lemma}

\begin{proof}

regex_match $\pi : Y \to (X/S)_\bullet$
nn candidate $\pi
nn candidate a cartesian morphism
nn a cartesian morphism
new_annotation [(245068, 245095, 'VAR'), (245099, 245119, 'TYPE')]
===== sent \medskip\noindent

regex_match $T/S$
nn candidate a scheme
nn a scheme
new_annotation [(247336, 247341, 'VAR'), (247345, 247353, '

In [201]:
annotated_data_with_filenames = list(zip(annotated_data, list_of_texs))
random.shuffle(annotated_data_with_filenames)
train_data = [ann_data[0] for ann_data in annotated_data_with_filenames[:-1]]
test_data = [ann_data[0] for ann_data in annotated_data_with_filenames[-1]] #we hold out one tex file for testing

In [207]:
def train_ner(nlp, train_data, entity_types):
    # Add new words to vocab.
    for raw_text, _ in train_data:
        doc = nlp.make_doc(raw_text)
        for word in doc:
            _ = nlp.vocab[word.orth]

    # Train NER.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for itn in range(5):
        random.shuffle(train_data)
        for raw_text, entity_offsets in train_data:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    return ner

In [208]:
ner = train_ner(nlp, train_data, ['VAR', 'TYPE'])

In [209]:
#first test on a simple sentence
doc = nlp.make_doc('Let $S$ be a scheme and let $bubu here$ be a great thing you know.')  
nlp.tagger(doc)
ner(doc)

for ent in doc.ents:
    print(ent)
    ent.merge()
    
for word in doc:
    if word.ent_type:
        print('(' + word.text + ':' + word.ent_type_ +')', end=word.whitespace_)
    else:
        print(word.text_with_ws, end='')
    


$S$
a scheme
a great thing
Let ($S$:VAR) be (a scheme:TYPE) and let $bubu here$ be (a great thing:TYPE) you know.

In [214]:
#then test on the hold out tex file and save in a new tex file
doc = nlp.make_doc(test_data[0][0])  
print(doc)
nlp.tagger(doc)
ner(doc)

for ent in doc.ents:
    ent.merge()

new_filename = 'a-' + annotated_data_with_filenames[-1][1] + '.tex'
with open( 'annotated_tex_files/'+ '.tex', 'w') as f:
    for word in doc:
        print(word)
        if word.ent_type:
            f.write('(' + word.text + ':' + word.ent_type_ +')' + word.whitespace_)
        else:
            f.write(word.text_with_ws)

\
\


In [221]:
test_data[0]

"\\input{preamble}\n\n% OK, start here.\n%\n\\begin{document}\n\n\\title{Intersection Theory}\n\n\n\\maketitle\n\n\\phantomsection\n\\label{section-phantom}\n\n\\tableofcontents\n\n\n\\section{Introduction}\n\\label{section-introduction}\n\n\\noindent\nIn this chapter we construct the intersection product on the Chow groups\nmodulo rational equivalence on a nonsingular projective variety over an\nalgebraically closed field. Our tools are Serre's Tor formula\n(see \\cite[Chapter V]{Serre_algebre_locale}), reduction to the diagonal,\nand the moving lemma.\n\n\\medskip\\noindent\nWe first recall cycles and how to construct proper pushforward and\nflat pullback of cycles. Next, we introduce rational equivalence of cycles\nwhich gives us the Chow groups $A_*(X)$. Proper pushforward and flat pullback\nfactor through rational equivalence to give operations on Chow groups.\nThis takes up Sections \n\\ref{section-cycles},\n\\ref{section-cycle-of-closed},\n\\ref{section-cycle-of-coherent-sheaf},