* simple Named Entity Recognition model with VAR and TYPE tags using spaCy
* training data: tex files from the Stacks Project annotated using a "Let ... be a ..." rule
* inspired by https://github.com/explosion/spaCy/blob/master/examples/training/train_ner.py

In [3]:
from __future__ import unicode_literals, print_function
import json
import pathlib
import random

import spacy
from spacy.pipeline import EntityRecognizer
from spacy.gold import GoldParse
from spacy.tagger import Tagger

import os
import re
 
# Python 2/3 compatibility: when the name `unicode` does not exist (Python 3),
# alias it to `str`. Only NameError is caught so that any other failure is
# not silently hidden.
try:
    unicode
except NameError:
    unicode = str

In [4]:
# Load spaCy's 'en' model once; the resulting `nlp` object is used by tex2doc,
# train_ner and the evaluation cells below.
nlp = spacy.load('en')
# Alternative load call that was left disabled:
#nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

In [5]:
def tex2doc(tex_file):
    """Read a whole TeX file and run it through the spaCy pipeline.

    Args:
        tex_file: Path of the TeX file to read.

    Returns:
        The object returned by ``nlp(text)`` for the file's full contents.
    """
    # Decode explicitly as UTF-8 so the text does not depend on the
    # platform's default locale encoding.
    with open(tex_file, 'r', encoding='utf-8') as tex:
        data = tex.read()
    return nlp(data)

In [6]:
# Matches 'Let $<var>$ be a/an <type>' (case-insensitive).
#   group 1: the math-mode content, one to four space-separated chunks
#   group 3: the single run of non-whitespace characters after the article
# Written as a raw string so that \$ and \S reach the regex engine verbatim
# instead of relying on Python passing unknown string escapes through.
_LET_BE_PATTERN = re.compile(
    r'let \$(\S+( \S+){0,3})\$ be an? (\S+)', re.IGNORECASE
)


def rule_based_annotation(doc):
    """Annotate ``doc.text`` with VAR/TYPE spans using the "Let ... be a ..." rule.

    Args:
        doc: Any object exposing the text to annotate as ``doc.text``.

    Returns:
        A ``(text, annotation)`` tuple where ``annotation`` is a list of
        ``(start_char, end_char, label)`` triples. Each match contributes a
        'VAR' span (the content between the dollar signs) followed by a
        'TYPE' span (the chunk after "be a"/"be an").

    Note:
        The TYPE group is ``\\S+``, so punctuation glued to the word (for
        example the full stop in "scheme.") is part of the TYPE span.
    """
    annotation = []
    for match in _LET_BE_PATTERN.finditer(doc.text):
        annotation.append((match.start(1), match.end(1), 'VAR'))
        annotation.append((match.start(3), match.end(3), 'TYPE'))
    return (doc.text, annotation)

In [23]:
# Build one (text, annotation) pair per file found in tex_files/.
annotated_data = []

directory = os.fsencode('tex_files/')
directory_name = os.fsdecode(directory)  # str form, reused for every join below
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    print("file: ", filename)
    tex_path = os.path.join(directory_name, filename)
    doc = tex2doc(tex_path)
    annotated_data.append(rule_based_annotation(doc))
    
    

file:  intersection.tex
file:  spaces-simplicial.tex
file:  stacks-sheaves.tex
file:  cotangent.tex
file:  stacks-more-morphisms.tex
file:  formal-defos.tex
file:  spaces-more-cohomology.tex
file:  divisors.tex
file:  more-morphisms.tex


In [29]:
# Sanity check: how many files were annotated, then the first
# (text, annotation) pair in full.
print(len(annotated_data))
print(annotated_data[0])

9


In [38]:
# Randomise the file order in place, then hold out the last file for testing;
# everything before it is the training set.
random.shuffle(annotated_data)
train_data, test_data = annotated_data[:-1], annotated_data[-1:]

In [35]:
def train_ner(nlp, train_data, entity_types, n_iter=5):
    """Train a fresh ``EntityRecognizer`` on character-offset annotations.

    Args:
        nlp: Loaded spaCy pipeline; its ``vocab`` and ``make_doc`` are used.
        train_data: Iterable of ``(raw_text, entity_offsets)`` pairs; each
            ``entity_offsets`` is passed as ``entities=`` to ``GoldParse``.
            The caller's sequence is not modified.
        entity_types: Entity labels handed to ``EntityRecognizer``.
        n_iter: Number of shuffled passes over the training data
            (defaults to 5).

    Returns:
        The ``EntityRecognizer`` after ``n_iter`` passes of ``update`` calls.
    """
    # Work on a private copy so the per-epoch shuffle below does not reorder
    # the list owned by the caller.
    examples = list(train_data)

    # Add new words to vocab (each lexeme is looked up once per occurrence).
    for raw_text, _offsets in examples:
        for word in nlp.make_doc(raw_text):
            _ = nlp.vocab[word.orth]

    # Train NER.
    # NOTE(review): documents are not run through nlp.tagger here, while the
    # evaluation cells call nlp.tagger(doc) before ner(doc) -- confirm whether
    # training and prediction are meant to see the same annotations.
    ner = EntityRecognizer(nlp.vocab, entity_types=entity_types)
    for _epoch in range(n_iter):
        random.shuffle(examples)
        for raw_text, entity_offsets in examples:
            doc = nlp.make_doc(raw_text)
            gold = GoldParse(doc, entities=entity_offsets)
            ner.update(doc, gold)
    return ner

In [36]:
# Train the recognizer on the training split with the two labels emitted by
# rule_based_annotation.
ner = train_ner(nlp, train_data, ['VAR', 'TYPE'])

In [37]:
# Smoke test: tag a single hand-written sentence and print each token next to
# its predicted entity type.
doc = nlp.make_doc('Let $S$ be a scheme.')
nlp.tagger(doc)
ner(doc)
for token in doc:
    print(token.text, token.ent_type_)

Let 
$ 
S$ 
be 
a 
scheme TYPE
. TYPE


In [47]:
# Now tag the held-out tex file. No 'VAR' tags are detected; this may be
# related to dollar-sign tokenization (in the printed output `$S$` comes out
# as the two tokens `$` and `S$`).
held_out_text = test_data[0][0]
doc = nlp.make_doc(held_out_text)
nlp.tagger(doc)
ner(doc)
for token in doc:
    print(token.text, "\t" + token.ent_type_)

\input{preamble 	
} 	


 	
% 	
OK 	
, 	
start 	
here 	
. 	

 	
% 	

 	
\begin{document 	
} 	


 	
\title{More 	
on 	
Cohomology 	
of 	
Spaces 	
} 	


 	
\maketitle 	


 	
\phantomsection 	

 	
\label{section 	
- 	
phantom 	
} 	


 	
\tableofcontents 	





 	
\section{Introduction 	
} 	

 	
\label{section 	
- 	
introduction 	
} 	


 	
\noindent 	

 	
In 	
this 	
chapter 	
continues 	
the 	
discussion 	
started 	
in 	

 	
Cohomology 	
of 	
Spaces 	
, 	
Section 	
\ref{spaces 	
- 	
cohomology 	
- 	
section 	
- 	
introduction}. 	

 	
One 	
can 	
also 	
view 	
this 	
chapter 	
as 	
the 	
analogue 	
for 	
algebraic 	
spaces 	

 	
of 	
the 	
chapter 	
on 	
\'etale 	
cohomology 	
for 	
schemes 	
, 	
see 	

 	
\'Etale 	
Cohomology 	
, 	
Section 	
\ref{etale 	
- 	
cohomology 	
- 	
section 	
- 	
introduction}. 	


 	
\medskip\noindent 	

 	
In 	
fact 	
, 	
we 	
intend 	
this 	
chapter 	
to 	
be 	
mainly 	
a 	
translation 	
of 	
the 	

 	
results 	
already 	
proved 	
for 	
schemes 	
into 	
the 	
l

- 	
star 	
} 	

 	
Let 	
$ 	
S$ 	
be 	
a 	
scheme 	TYPE
. 	TYPE
Let 	
$ 	
f 	
: 	
X 	
\to 	
Y$ 	
and 	
$ 	
g 	
: 	
Y 	
' 	
\to 	
Y$ 	

 	
be 	
a 	
morphisms 	TYPE
of 	
algebraic 	
spaces 	
over 	
$ 	
S$. 	
Assume 	
$ 	
f$ 	
is 	
proper 	
. 	

 	
Set 	
$ 	
X 	
' 	
= 	
Y 	
' 	
\times_Y 	
X$ 	
with 	
projections 	
$ 	
f 	
' 	
: 	
X 	
' 	
\to 	
Y'$ 	
and 	
$ 	
g 	
' 	
: 	
X 	
' 	
\to 	
X$. 	

 	
Let 	
$ 	
\mathcal{F}$ 	
be 	
any 	
sheaf 	
on 	
$ 	
X_\etale$. 	
Then 	

 	
$ 	
g^{-1}f_*\mathcal{F 	
} 	
= 	
f'_*(g')^{-1}\mathcal{F}$. 	

 	
\end{lemma 	
} 	


 	
\begin{proof 	
} 	

 	
The 	
question 	
is 	
\'etale 	
local 	
on 	
$ 	
Y'$. 	
Choose 	
a 	
scheme 	
$ 	
V$ 	
and 	
a 	
surjective 	

 	
\'etale 	
morphism 	
$ 	
V 	
\to 	
Y$. 	
Choose 	
a 	
scheme 	
$ 	
V'$ 	
and 	
a 	
surjective 	
\'etale 	

 	
morphism 	
$ 	
V 	
' 	
\to 	
V 	
\times_Y 	
Y'$. 	
Then 	
we 	
may 	
replace 	
$ 	
Y'$ 	
by 	
$ 	
V'$ 	
and 	

 	
$ 	
Y$ 	
by 	
$ 	
V$. 	
Hence 	
we 	
may 	
assume 	
$ 	
Y$ 	
and 	
$ 	
Y'$ 	
a

: 	
\Sh(X_\etale 	
) 	
\to 	
\Sh((\textit{Spaces}/X)_\etale)$ 	

 	
with 	
$ 	
\pi_X 	
\circ 	
i_X 	
= 	
\text{id}$ 	
as 	
morphisms 	
of 	
topoi 	
and 	

 	
$ 	
\pi_{X 	
, 	
* 	
} 	
= 	
i_X^{-1}$. 	

 	
More 	
generally 	
, 	
if 	
$ 	
f 	
: 	
Y 	
\to 	
X$ 	
is 	
an 	
object 	
of 	
$ 	
( 	
\textit{Spaces}/X)_\etale$ 	
, 	

 	
then 	
there 	
is 	
a 	
morphism 	

 	
$ 	
i_f 	
: 	
\Sh(Y_\etale 	
) 	
\to 	
\Sh((\textit{Spaces}/X)_\etale)$ 	

 	
such 	
that 	
$ 	
f_{small 	
} 	
= 	
\pi_X 	
\circ 	
i_f$ 	
, 	
see 	

 	
Topologies 	
on 	
Spaces 	
, 	
Lemmas 	
\ref{spaces 	
- 	
topologies 	
- 	
lemma 	
- 	
put 	
- 	
in 	
- 	
T 	
- 	
etale 	
} 	
and 	

 	
\ref{spaces 	
- 	
topologies 	
- 	
lemma 	
- 	
morphism 	
- 	
big 	
- 	
small 	
- 	
etale}. 	
In 	

 	
Topologies 	
on 	
Spaces 	
, 	
Remark 	

 	
\ref{spaces 	
- 	
topologies 	
- 	
remark 	
- 	
change 	
- 	
topologies 	
- 	
ringed 	
} 	

 	
we 	
have 	
extended 	
these 	
to 	
a 	
morphism 	
of 	
ringed 	
sites 	

 	
$ 	
$ 	

 	
\pi_X 	
: 	

 

) 	
] 	
For 	
$ 	
K$ 	
in 	
$ 	
D((\textit{Spaces}/Y)_\etale 	
, 	
\mathcal{O})$ 	
we 	
have 	

 	
$ 	
g_{big}^*(Rf_{big 	
, 	
* 	
} 	
K 	
) 	
= 	
Rf'_{small 	
, 	
* 	
} 	
( 	
( 	
g'_{big})^*K)$ 	

 	
in 	
$ 	
D(\textit{Mod}(X'_\etale 	
, 	
\mathcal{O}_{X'}))$. 	

 	
\end{enumerate 	
} 	

 	
\end{lemma 	
} 	


 	
\begin{proof 	
} 	

 	
Part 	
( 	
1 	
) 	
follows 	
from 	

 	
Lemma 	
\ref{lemma 	
- 	
compare 	
- 	
injectives 	
} 	

 	
and 	
( 	
\ref{equation 	
- 	
compare 	
- 	
big 	
- 	
small 	
} 	
) 	

 	
on 	
choosing 	
a 	
K 	
- 	
injective 	
complex 	
of 	
abelian 	
sheaves 	
representing 	
$ 	
K$. 	


 	
\medskip\noindent 	

 	
Part 	
( 	
3 	
) 	
follows 	
from 	
Lemma 	
\ref{lemma 	
- 	
compare 	
- 	
injectives 	
} 	

 	
and 	
Topologies 	
, 	
Lemma 	

 	
\ref{topologies 	
- 	
lemma 	
- 	
morphism 	
- 	
big 	
- 	
small 	
- 	
cartesian 	
- 	
diagram 	
- 	
etale 	
} 	

 	
on 	
choosing 	
a 	
K 	
- 	
injective 	
complex 	
of 	
abelian 	
sheaves 	
representing 	
$ 	
K$. 	


 	
\medsk

comparing 	
big 	
and 	
small 	
\'etale 	
sites 	
, 	
see 	
Section 	
\ref{section 	
- 	
compare}. 	

 	
The 	
composition 	
determines 	
a 	
morphism 	
of 	
sites 	

 	
$ 	
$ 	

 	
a_X 	
= 	
\pi_X 	
\circ 	
\epsilon_X 	
: 	

 	
( 	
\textit{Spaces}/X)_{fppf 	
} 	

 	
\longrightarrow 	

 	
X_{spaces 	
, 	
\etale 	
} 	

 	
$ 	
$ 	

 	
If 	
$ 	
\mathcal{H}$ 	
is 	
an 	
abelian 	
sheaf 	
on 	
$ 	
( 	
\textit{Spaces}/X)_{fppf}$ 	
, 	

 	
then 	
we 	
will 	
write 	
$ 	
H^n_{fppf}(U 	
, 	
\mathcal{H})$ 	
for 	
the 	
cohomology 	

 	
of 	
$ 	
\mathcal{H}$ 	
over 	
an 	
object 	
$ 	
U$ 	
of 	
$ 	
( 	
\textit{Spaces}/X)_{fppf}$. 	


 	
\begin{lemma 	
} 	

 	
\label{lemma 	
- 	
comparison 	
- 	
fppf 	
- 	
etale 	
} 	

 	
Let 	
$ 	
S$ 	
be 	
a 	
scheme 	TYPE
. 	TYPE
Let 	
$ 	
X$ 	
be 	
an 	
algebraic 	TYPE
space 	
over 	
$ 	
S$. 	

 	
\begin{enumerate 	
} 	

 	
\item 	
For 	
$ 	
\mathcal{F 	
} 	
\in 	
\Sh(X_\etale)$ 	
we 	
have 	

 	
$ 	
\epsilon_{X 	
, 	
* 	
} 	
a_X^{-1}\mathcal{F 	
} 	
= 	
\pi_X

 	
\ref{sites 	
- 	
cohomology 	
- 	
lemma 	
- 	
derived 	
- 	
pushforward 	
- 	
composition}. 	

 	
Then 	
second 	
equality 	
is 	
Lemma 	
\ref{lemma 	
- 	
cohomological 	
- 	
descent 	
- 	
etale 	
- 	
fppf}. 	

 	
The 	
third 	
is 	

 	
Lemma 	
\ref{lemma 	
- 	
compare 	
- 	
higher 	
- 	
direct 	
- 	
image 	
- 	
proper 	
} 	
part 	
( 	
2 	
) 	
. 	

 	
The 	
fourth 	
is 	
Lemma 	
\ref{lemma 	
- 	
cohomological 	
- 	
descent 	
- 	
etale 	
- 	
fppf 	
} 	
again 	
. 	

 	
Thus 	
the 	
base 	
change 	
map 	

 	
$ 	
a_Y^{-1}(Rf_{small 	
, 	
* 	
} 	
K 	
) 	
\to 	
Rf_{big 	
, 	
fppf 	
, 	
* 	
} 	
( 	
a_X^{-1}K)$ 	

 	
induces 	
an 	
isomorphism 	

 	
$ 	
$ 	

 	
R\epsilon_{Y 	
, 	
* 	
} 	
a_Y^{-1}Rf_{small 	
, 	
* 	
} 	
K 	
\to 	

 	
R\epsilon_{Y 	
, 	
* 	
} 	
Rf_{big 	
, 	
fppf 	
, 	
* 	
} 	
a_X^{-1}K 	

 	
$ 	
$ 	

 	
The 	
proof 	
is 	
finished 	
by 	
the 	
following 	
remark 	
: 	
a 	
map 	

 	
$ 	
\alpha 	
: 	
a_Y^{-1}L 	
\to 	
M$ 	
with 	
$ 	
L$ 	
in 	
$ 	
D^+(Y_\etale)$ 	

 	
and 	
$ 

\Gamma(U 	
, 	
g^*\mathcal{F})$. 	

 	
We 	
can 	
reinterpret 	
the 	
$ 	
s_i$ 	
as 	
a 	
family 	
of 	
maps 	

 	
$ 	
\varphi_i 	
: 	
f_i^*\mathcal{O}_U 	
= 	
\mathcal{O}_{U_i 	
} 	
\to 	
f_i^*g^*\mathcal{F}$ 	

 	
compatible 	
with 	
the 	
canonical 	
descent 	
data 	
associated 	
to 	
the 	

 	
quasi 	
- 	
coherent 	
sheaves 	
$ 	
\mathcal{O}_U$ 	
and 	
$ 	
g^*\mathcal{F}$ 	
on 	
$ 	
U$. 	

 	
Hence 	
by 	
Descent 	
on 	
Spaces 	
, 	
Proposition 	

 	
\ref{spaces 	
- 	
descent 	
- 	
proposition 	
- 	
fpqc 	
- 	
descent 	
- 	
quasi 	
- 	
coherent 	
} 	

 	
we 	
see 	
that 	
we 	
may 	
( 	
uniquely 	
) 	
descend 	

 	
these 	
to 	
a 	
map 	
$ 	
\mathcal{O}_U 	
\to 	
g^*\mathcal{F}$ 	
which 	
gives 	

 	
us 	
our 	
section 	
$ 	
s$. 	


 	
\medskip\noindent 	

 	
We 	
will 	
deduce 	
( 	
2 	
) 	
-- 	
( 	
7 	
) 	
from 	
the 	
corresponding 	
statement 	
for 	
schemes 	
. 	

 	
Choose 	
an 	
\'etale 	
covering 	
$ 	
\{X_i 	
\to 	
X\}_{i 	
\in 	
I}$ 	

 	
where 	
each 	
$ 	
X_i$ 	
is 	
a 

* 	
} 	
$ 	
, 	
resp.\ 	
$ 	
a_{X 	
, 	
* 	
} 	
$ 	
, 	
surely 	
the 	
proof 	
is 	
done 	

 	
by 	
Leray 	
's 	
acyclicity 	
lemma 	
? 	
Actually 	
... 	
, 	
no 	
because 	
Leray 	
's 	

 	
acyclicity 	
lemma 	
only 	
applies 	
to 	
bounded 	
below 	
complexes 	
. 	

 	
However 	
, 	
in 	
the 	
next 	
paragraph 	
we 	
will 	
show 	
the 	
result 	
does 	
follow 	

 	
from 	
the 	
bounded 	
below 	
case 	
because 	
our 	
complex 	
is 	
the 	
derived 	
limit 	

 	
of 	
bounded 	
below 	
complexes 	
of 	
quasi 	
- 	
coherent 	
modules 	
. 	


 	
\medskip\noindent 	

 	
The 	
cohomology 	
sheaves 	
of 	

 	
$ 	
\pi_X^*\mathcal{K}^\bullet$ 	
and 	
$ 	
a_X^*\mathcal{K}^\bullet$ 	

 	
have 	
vanishing 	
higher 	
cohomology 	

 	
groups 	
over 	
affine 	
objects 	
of 	
$ 	
( 	
\textit{Spaces}/X)_\etale$ 	
by 	

 	
Lemma 	
\ref{lemma 	
- 	
vanishing 	
- 	
adequate}. 	

 	
Therefore 	
we 	
have 	

 	
$ 	
$ 	

 	
L\pi_X^*K 	
= 	
R\lim 	
\tau_{\geq 	
-n}(L\pi_X^*K 	
) 	

 	
\quad\text{and}\quad 	


sheaves 	
the 	
maps 	

 	
$ 	
$ 	

 	
\pi_X^{-1}K 	
\longrightarrow 	
R\epsilon_{X 	
, 	
* 	
} 	
a_X^{-1}K 	

 	
\quad\text{and}\quad 	

 	
K 	
\longrightarrow 	
Ra_{X 	
, 	
* 	
} 	
a_X^{-1}K 	

 	
$ 	
$ 	

 	
are 	
isomorphisms 	
with 	

 	
$ 	
a_X 	
: 	
\Sh((\textit{Spaces}/X)_{ph 	
} 	
) 	
\to 	
\Sh(X_\etale)$ 	
as 	
above 	
. 	

 	
\end{lemma 	
} 	


 	
\begin{proof 	
} 	

 	
We 	
only 	
prove 	
the 	
second 	
statement 	
; 	
the 	
first 	
is 	
easier 	
and 	
proved 	
in 	
exactly 	

 	
the 	
same 	
manner 	
. 	
There 	
is 	
a 	
reduction 	
to 	
the 	
case 	
where 	

 	
$ 	
K$ 	
is 	
given 	
by 	
a 	
single 	
torsion 	
abelian 	
sheaf 	
. 	
Namely 	
, 	
represent 	
$ 	
K$ 	

 	
by 	
a 	
bounded 	
below 	
complex 	
$ 	
\mathcal{F}^\bullet$ 	
of 	
torsion 	

 	
abelian 	
sheaves 	
. 	
This 	
is 	
possible 	
by 	
Cohomology 	
on 	
Sites 	
, 	
Lemma 	

 	
\ref{sites 	
- 	
cohomology 	
- 	
lemma 	
- 	
torsion}. 	

 	
By 	
the 	
case 	
of 	
a 	
sheaf 	
we 	
see 	
that 	

 	
$ 	
\mathcal