install spacy \
https://spacy.io/usage

install model in command line \
https://spacy.io/models

In [None]:
import spacy

In [12]:
import en_core_web_sm

# note: other spaCy models exist for English; this is the small one.
# https://spacy.io/models/en#en_core_web_sm

use spacy POS tagger \
https://spacy.io/usage/linguistic-features/ 

the first tag is the simple UPOS part-of-speech tag https://universaldependencies.org/u/pos/ \
the second tag is the detailed part-of-speech tag

Notes: \
    - the first tag may not be detailed enough (for example, it does not distinguish superlative adjectives from other adjectives) \
    - what are the labels of the second tag? (depends on model) https://spacy.io/models/en#en_core_web_sm

In [13]:
# Tag one example sentence with the small English pipeline.
sentence = "This will soon be America's biggest economic problem"
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)

# One output line per token: text, coarse UPOS tag, fine-grained tag.
for token in doc:
    print(" ".join([token.text, token.pos_, token.tag_]))

This PRON DT
will AUX MD
soon ADV RB
be AUX VB
America PROPN NNP
's PART POS
biggest ADJ JJS
economic ADJ JJ
problem NOUN NN


In [14]:
# show the explanation of a single spaCy label
spacy.explain("JJS")

'adjective, superlative'

In [None]:
#### Extracting info on the model's fine-grained tag set (the second tag above) ####

In [17]:
# Fine-grained tag labels as one string, separated by ", ".
# Note that "," is itself one of the labels, hence the ",," run near the start.
tags_str = (
    "$, '', ,, -LRB-, -RRB-, ., :, ADD, AFX, CC, CD, DT, EX, FW, HYPH, IN, "
    "JJ, JJR, JJS, LS, MD, NFP, NN, NNP, NNPS, NNS, PDT, POS, PRP, PRP$, "
    "RB, RBR, RBS, RP, SYM, TO, UH, VB, VBD, VBG, VBN, VBP, VBZ, WDT, WP, "
    "WP$, WRB, XX, _SP, ``"
)

In [21]:
# Split the ", "-separated label string into individual tags.
tags = tags_str.split(", ")
# Independent copy of the split result; list() replaces the identity comprehension.
tag_list = list(tags)
print(len(tag_list))

['$', "''", ',', '-LRB-', '-RRB-', '.', ':', 'ADD', 'AFX', 'CC', 'CD', 'DT', 'EX', 'FW', 'HYPH', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NFP', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', 'XX', '_SP', '``']


In [27]:
# Display the fine-grained tags used for feature engineering and save them to
# pos_tag_list.txt.

# Map every tag to its description. spacy.explain returns None for a label that
# is missing from spaCy's glossary, so the values here may include None.
tag_dict = {tag: spacy.explain(tag) for tag in tag_list}

# Build the report once so the displayed text and the saved text cannot diverge.
report_lines = ["All fine-grained POS tags: \n"]
for tag, explanation in tag_dict.items():
    if explanation is None:
        # Avoid a TypeError from `str + None` for labels without a glossary entry.
        explanation = "no description available"
    report_lines.append(tag + " " + explanation + "\n")

print("".join(report_lines), end="")

# The context manager closes the file even if a write raises.
with open('pos_tag_list.txt', 'w', encoding='utf-8') as file:
    file.writelines(report_lines)

All fine-grained POS tags:

$ symbol, currency
'' closing quotation mark
, punctuation mark, comma
-LRB- left round bracket
-RRB- right round bracket
. punctuation mark, sentence closer
: punctuation mark, colon or ellipsis
ADD email
AFX affix
CC conjunction, coordinating
CD cardinal number
DT determiner
EX existential there
FW foreign word
HYPH punctuation mark, hyphen
IN conjunction, subordinating or preposition
JJ adjective (English), other noun-modifier (Chinese)
JJR adjective, comparative
JJS adjective, superlative
LS list item marker
MD verb, modal auxiliary
NFP superfluous punctuation
NN noun, singular or mass
NNP noun, proper singular
NNPS noun, proper plural
NNS noun, plural
PDT predeterminer
POS possessive ending
PRP pronoun, personal
PRP$ pronoun, possessive
RB adverb
RBR adverb, comparative
RBS adverb, superlative
RP adverb, particle
SYM symbol
TO infinitival "to"
UH interjection
VB verb, base form
VBD verb, past tense
VBG verb, gerund or present participle
VBN verb, past p