In [None]:
!pip install spacy tabulate
!python -m spacy download en_core_web_md

In [2]:
# Text mining
# -----------
# Text Data, Data, and Language
# Print conventions and code conventions
# Token != Word
# Tokenization procedures
# Preprocessing to reduce long tails
# + Token distribution and reducing long tails
# + "Compressing" our text: e.g. casing, lemmatizing, stemming, etc.
# + Removing stopwords, digits, punctuation
# What are we actually going to count?
# + Ngrams (n-length tokens => tokens)
# + Association scoring with pointwise mutual information
# Data structures for doing this work, including document-term matrix
# + Vectorization (Adrian Mackenzie, Machine Learners)
# + Vector spaces
# + Vector space semantics (you shall know a word by the company it keeps!)
# Similarity measures
# + Cosine similarity
# + tf-idf (specificity)
# Topic modeling

In [3]:
from collections import Counter, defaultdict
from pathlib import Path

import spacy
from spacy import displacy
from tabulate import tabulate

In [4]:
# Directory holding the text files read by this notebook (relative to the
# working directory the notebook is run from).
indir = Path("data") / "section_two" / "s1"

In [5]:
# Load the medium English spaCy pipeline; every document below is produced by
# calling this object on a string.
nlp = spacy.load("en_core_web_md")

In [6]:
# Read the poem as a single string. The encoding is stated explicitly so the
# result does not depend on the platform's default locale encoding.
poem = indir.joinpath("stein_carafe.txt").read_text(encoding="utf-8")

poem

'A kind in glass and a cousin, a spectacle and nothing strange a single hurt color and an arrangement in a system to pointing. All this and not ordinary, not unordered in not resembling. The difference is spreading.'

In [7]:
# Run the pipeline over the raw string. The resulting document displays as the
# original text, but carries the annotations explored in the cells below.
carafe = nlp(poem)
carafe

A kind in glass and a cousin, a spectacle and nothing strange a single hurt color and an arrangement in a system to pointing. All this and not ordinary, not unordered in not resembling. The difference is spreading.

In [8]:
# List every attribute and method available on the processed document.
dir(carafe)

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '_bulk_merge',
 '_context',
 '_get_array_attrs',
 '_realloc',
 '_vector',
 '_vector_norm',
 'cats',
 'char_span',
 'copy',
 'count_by',
 'doc',
 'ents',
 'extend_tensor',
 'from_array',
 'from_bytes',
 'from_dict',
 'from_disk',
 'from_docs',
 'from_json',
 'get_extension',
 'get_lca_matrix',
 'has_annotation',
 'has_extension',
 'has_unknown_spaces',
 'has_vector',
 'is_nered',
 'is_parsed',
 'is_sentenced',
 'is_tagged',
 'lang',
 'lang_',
 'mem',
 'noun_chunks',
 'noun_chunks_iterator',
 'remove_extension',
 'retokenize',
 'sentiment'

In [9]:
# Ask whether the "TAG" annotation has been set on this document.
carafe.has_annotation("TAG")

True

In [11]:
# Materialize the sentences into a list so they can all be displayed at once.
list(carafe.sents)

[A kind in glass and a cousin, a spectacle and nothing strange a single hurt color and an arrangement in a system to pointing.,
 All this and not ordinary, not unordered in not resembling.,
 The difference is spreading.]

In [12]:
# The sentences can also be iterated directly; printing one shows its text.
# Note: `sentence` stays bound to the last sentence after the loop finishes.
for sentence in carafe.sents:
    print(sentence)

A kind in glass and a cousin, a spectacle and nothing strange a single hurt color and an arrangement in a system to pointing.
All this and not ordinary, not unordered in not resembling.
The difference is spreading.


In [14]:
# carafe.sents is a generator, so it cannot be indexed. The failure is the point
# of this cell, so catch it and print it instead of letting "Run All" stop here.
# To get a sentence by position, materialize first: list(carafe.sents)[0].
try:
    carafe.sents[0]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: 'generator' object is not subscriptable

In [15]:
# noun_chunks is lazy as well: displaying it shows a generator object, not the chunks.
carafe.noun_chunks

<generator at 0x12ae4bb50>

In [16]:
# Iterate over the generator to see each noun chunk of the poem.
# Note: `chunk` stays bound to the last chunk after the loop finishes.
for chunk in carafe.noun_chunks:
    print(chunk)

A kind
glass
a cousin
a spectacle
nothing
a single hurt color
an arrangement
a system
All this
The difference


In [17]:
# Read Book 11 of the Odyssey as one string, with an explicit encoding so the
# result does not depend on the platform's default locale encoding.
book11 = indir.joinpath("odyssey_book_11.txt").read_text(encoding="utf-8")

# Process the whole book once; later cells reuse this document.
odyssey = nlp(book11)

In [18]:
# Slice the document by position to preview its first 100 tokens.
odyssey[:100]

Then, when we had got down to the sea shore we drew our ship into the water and got her mast and sails into her; we also put the sheep on board and took our places, weeping and in great distress of mind. Circe, that great and cunning goddess, sent us a fair wind that blew dead aft and stayed steadily with us keeping our sails all the time well filled; so we did whatever wanted doing to the ship's gear and let her go as the wind and helmsman headed her

In [20]:
# .text gives a span's plain string. Take the poem's last noun chunk explicitly
# rather than relying on the `chunk` loop variable leaking from another cell,
# which breaks as soon as `chunk` is rebound to something else.
list(carafe.noun_chunks)[-1].text

'The difference'

In [22]:
# Tally how many times each noun-chunk string occurs in the Odyssey document.
counts = Counter(
    span.text
    for span in odyssey.noun_chunks
)

In [25]:
# Keep noun chunks that are longer than two words and occur more than once.
# The loop variable is named chunk_text (not chunk) because the Counter keys are
# plain strings; reusing `chunk` would overwrite the span object that name
# refers to elsewhere in the notebook.
repeats = []
for chunk_text, count in counts.items():
    length = len(chunk_text.split())
    if length > 2 and count > 1:
        repeats.append([chunk_text, length])

# Render the [chunk, word count] rows as a plain-text table.
table = tabulate(repeats, ["Chunk", "Length"])
print(table)

Chunk                       Length
------------------------  --------
the sea shore                    3
a fair wind                      3
the poor feckless ghosts         4
the same time                    3
the other side                   3
his golden sceptre               3
your own house                   3
her own son                      3
the Achaean land                 3
her own husband                  3
my wicked wife                   3
all the Danaans                  3
the poor creature                3


In [26]:
# Named-entity spans found in the text (a long tuple, displayed in full).
odyssey.ents

(Circe,
 Oceanus,
 Cimmerians,
 one,
 Oceanus,
 Circe,
 Here Perimedes and Eurylochus,
 first,
 thirdly,
 Ithaca,
 Teiresias,
 two,
 every quarter,
 two,
 Hades,
 Proserpine,
 Teiresias,
 first,
 Elpenor,
 Circe,
 Circe,
 Hades,
 Telemachus,
 the Aeaean island,
 days,
 one,
 Autolycus,
 Troy,
 Teiresias,
 Theban Teiresias,
 Ulysses,
 Laertes,
 the light of day,
 Neptune,
 the Thrinacian island,
 Ithaca,
 Neptune,
 Teiresias,
 Hades,
 Oceanus,
 Troy,
 Ithaca,
 'Mother,
 Theban,
 Teiresias,
 Achaean,
 one,
 the very first day,
 Agamemnon,
 Ilius,
 Trojans,
 both night,
 Telemachus,
 the winter,
 summer,
 Hades,
 Proserpine,
 Proserpine,
 the light of day,
 first,
 Tyro,
 Salmoneus,
 Cretheus,
 Aeolus,
 Neptune,
 Tyro,
 twelve months,
 Pelias,
 Neleus,
 Jove,
 Pelias,
 Iolcus,
 Pylos,
 Cretheus,
 Aeson, Pheres,
 Amythaon,
 Antiope,
 Asopus,
 Jove,
 two,
 Thebes,
 seven,
 Thebes,
 Alcmena,
 Amphitryon,
 Jove,
 Hercules,
 Megara,
 Creon,
 Amphitryon,
 Epicaste,
 Thebes,
 Epicaste,
 Chloris,

In [27]:
# Texts of the entities whose label is "TIME".
time = [
    entity.text
    for entity in odyssey.ents
    if entity.label_ == "TIME"
]
print(time)

['both night', 'all night', 'morning']


In [31]:
# spacy.explain returns a short human-readable description of a label.
spacy.explain("GPE")

'Countries, cities, states'

In [29]:
# Every label the pipeline's "ner" component can assign.
nlp.get_pipe("ner").labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [28]:
# Distinct texts of the entities labelled "PERSON".
{
    entity.text
    for entity in odyssey.ents
    if entity.label_ == "PERSON"
}

{'Achilles',
 'Ajax',
 'Antiope',
 'Ariadne',
 'Cassandra',
 'Creon',
 'Dia',
 'Diana',
 'Echeneus',
 'Epeus',
 'Epicaste',
 'Eriphyle',
 'Eurypylus',
 'Gorgon',
 'Hebe',
 'Helen',
 'Hercules',
 'Iasus',
 'Jove',
 'King Alcinous',
 'Leda',
 'Leto',
 'Maera',
 'Minos',
 'Neleus',
 'Neoptolemus',
 'O Phaecians',
 'Orestes',
 'Ossa',
 'Panopeus',
 'Peleus',
 'Pelias',
 'Periclymenus',
 'Pero',
 'Phaedra',
 'Phylace',
 'Priam',
 'Proserpine',
 'Pylos',
 'Pytho',
 'Queen',
 'Salmoneus',
 'Scyros',
 'Sisyphus',
 'Teiresias',
 'Telamon',
 'Telemachus',
 'Theban Teiresias',
 'Thetis',
 'Troy',
 'Tyro',
 'Ulysses'}

In [33]:
# Entity labels that actually occur in this document. A set already ignores
# duplicates, so no membership check is needed before adding.
in_our_doc = {ent.label_ for ent in odyssey.ents}

in_our_doc

{'CARDINAL',
 'DATE',
 'GPE',
 'LOC',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

In [34]:
# Redisplay the poem document before looking at its individual tokens.
carafe

A kind in glass and a cousin, a spectacle and nothing strange a single hurt color and an arrangement in a system to pointing. All this and not ordinary, not unordered in not resembling. The difference is spreading.

In [35]:
# Indexing the document by position returns a single token.
carafe[3]

glass

In [36]:
# Tokens carry their own attributes; list them for the token at index 3.
dir(carafe[3])

['_',
 '__bytes__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 'ancestors',
 'check_flag',
 'children',
 'cluster',
 'conjuncts',
 'dep',
 'dep_',
 'doc',
 'ent_id',
 'ent_id_',
 'ent_iob',
 'ent_iob_',
 'ent_kb_id',
 'ent_kb_id_',
 'ent_type',
 'ent_type_',
 'get_extension',
 'has_dep',
 'has_extension',
 'has_head',
 'has_morph',
 'has_vector',
 'head',
 'i',
 'idx',
 'iob_strings',
 'is_alpha',
 'is_ancestor',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_sent_end',
 'is_sent_start',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang

In [37]:
# Boolean flag: is this token alphabetic?
carafe[3].is_alpha

True

In [38]:
# Boolean flag: is this token a stop word?
carafe[3].is_stop

False

In [39]:
# Language code attached to the token ('en' here).
carafe[3].lang_

'en'

In [41]:
# Lemma of the first token of a one-word document: "kept" -> "keep".
nlp("kept")[0].lemma_

'keep'

In [42]:
# A token links back to the sentence that contains it.
carafe[3].sent

A kind in glass and a cousin, a spectacle and nothing strange a single hurt color and an arrangement in a system to pointing.

In [44]:
# The document as a whole has a numeric vector representation.
carafe.vector

array([-2.5555735 ,  1.1846154 , -2.269247  ,  0.3677398 ,  5.24201   ,
       -0.1987919 ,  1.2433177 ,  4.4085402 ,  0.9550491 , -0.16181572,
        6.9850497 ,  1.5053661 , -3.7253914 ,  1.7839051 ,  1.4726005 ,
        1.9264268 ,  1.2448726 ,  0.445887  , -0.8267525 , -2.1162584 ,
        1.7902328 , -0.38020015, -1.9477575 ,  0.31356767, -0.7174688 ,
       -2.4472535 , -3.0243936 , -1.6282868 , -1.1875781 ,  1.2953837 ,
        1.1721401 , -1.2594589 , -0.9912044 , -1.059311  , -4.1143107 ,
        0.3692391 , -0.21478145, -0.3150489 ,  2.0574374 , -0.2255659 ,
        0.31841078, -0.4647687 ,  0.91950166, -0.4593155 , -0.9836534 ,
        2.4000485 ,  0.47912785, -2.4650972 , -1.4792219 ,  0.66991735,
       -1.2323452 ,  1.3389677 ,  0.3964691 , -5.1207867 , -1.3892405 ,
        0.3496113 ,  0.9154521 ,  1.0409203 ,  0.3290848 , -0.7906738 ,
       -0.6259186 , -1.9025444 ,  0.1057207 , -1.3458245 ,  1.4250729 ,
        2.036559  , -2.554052  , -5.4397173 ,  2.0374045 ,  2.01

In [47]:
# A single component (index 1) of the vector for the token at index 3.
carafe[3].vector[1]

-4.6198

In [46]:
# Number of dimensions in the token's vector (300 here).
len(carafe[3].vector)

300

In [49]:
# One row per token of the poem: its text plus two boolean flags.
attributes = [
    [token.text, token.is_punct, token.like_url]
    for token in carafe
]
print(tabulate(attributes, headers=["Text", "Punctuation", "Is URL"]))

Text         Punctuation    Is URL
-----------  -------------  --------
A            False          False
kind         False          False
in           False          False
glass        False          False
and          False          False
a            False          False
cousin       False          False
,            True           False
a            False          False
spectacle    False          False
and          False          False
nothing      False          False
strange      False          False
a            False          False
single       False          False
hurt         False          False
color        False          False
and          False          False
an           False          False
arrangement  False          False
in           False          False
a            False          False
system       False          False
to           False          False
pointing     False          False
.            True           False
All          False          False
this      

In [50]:
# Pair each token with its coarse part-of-speech label (token.pos_).
pos_tags = [
    [token.text, token.pos_]
    for token in carafe
]
print(tabulate(pos_tags, headers=["Token", "Tag"]))

Token        Tag
-----------  -----
A            DET
kind         NOUN
in           ADP
glass        NOUN
and          CCONJ
a            DET
cousin       NOUN
,            PUNCT
a            DET
spectacle    NOUN
and          CCONJ
nothing      PRON
strange      ADJ
a            DET
single       ADJ
hurt         NOUN
color        NOUN
and          CCONJ
an           DET
arrangement  NOUN
in           ADP
a            DET
system       NOUN
to           ADP
pointing     VERB
.            PUNCT
All          DET
this         PRON
and          CCONJ
not          PART
ordinary     ADJ
,            PUNCT
not          PART
unordered    ADJ
in           ADP
not          PART
resembling   VERB
.            PUNCT
The          DET
difference   NOUN
is           AUX
spreading    VERB
.            PUNCT


In [51]:
# Pair each token with its fine-grained tag (token.tag_); this overwrites the
# pos_tags list built from token.pos_.
pos_tags = [
    [token.text, token.tag_]
    for token in carafe
]
print(tabulate(pos_tags, headers=["Token", "Tag"]))

Token        Tag
-----------  -----
A            DT
kind         NN
in           IN
glass        NN
and          CC
a            DT
cousin       NN
,            ,
a            DT
spectacle    NN
and          CC
nothing      NN
strange      JJ
a            DT
single       JJ
hurt         NN
color        NN
and          CC
an           DT
arrangement  NN
in           IN
a            DT
system       NN
to           IN
pointing     VBG
.            .
All          PDT
this         DT
and          CC
not          RB
ordinary     JJ
,            ,
not          RB
unordered    JJ
in           IN
not          RB
resembling   VBG
.            .
The          DT
difference   NN
is           VBZ
spreading    VBG
.            .


In [53]:
# Draw the dependency parse of the poem's second sentence (index 1).
# displacy is imported by name rather than reached through `spacy.displacy`,
# which only resolves if something else has already imported that submodule.
to_render = list(carafe.sents)[1]
displacy.render(to_render, style="dep")

In [57]:
# For every token in every sentence, print the token next to the words of its
# subtree (the tokens yielded by token.subtree, joined with spaces).
for sentence in carafe.sents:
    for token in sentence:
        subtree = list(token.subtree)
        print("Token:", token.text, "\t\tTree:", " ".join(tok.text for tok in subtree)) 

Token: A 		Tree: A
Token: kind 		Tree: A kind in glass and a cousin , a spectacle and nothing
Token: in 		Tree: in glass
Token: glass 		Tree: glass
Token: and 		Tree: and
Token: a 		Tree: a
Token: cousin 		Tree: a cousin , a spectacle and nothing
Token: , 		Tree: ,
Token: a 		Tree: a
Token: spectacle 		Tree: a spectacle and nothing
Token: and 		Tree: and
Token: nothing 		Tree: nothing
Token: strange 		Tree: A kind in glass and a cousin , a spectacle and nothing strange a single hurt color and an arrangement in a system to pointing .
Token: a 		Tree: a
Token: single 		Tree: single
Token: hurt 		Tree: hurt
Token: color 		Tree: a single hurt color and an arrangement in a system to pointing
Token: and 		Tree: and
Token: an 		Tree: an
Token: arrangement 		Tree: an arrangement in a system to pointing
Token: in 		Tree: in a system
Token: a 		Tree: a
Token: system 		Tree: a system
Token: to 		Tree: to pointing
Token: pointing 		Tree: pointing
Token: . 		Tree: .
Token: All 		Tree: All
Token: th

In [59]:
# For every token in every sentence, print the token next to its head.
# In the recorded output some tokens (e.g. "strange") are their own head.
for sentence in carafe.sents:
    for token in sentence:
        print("Token:", token.text, "\t\tHead:", token.head.text) 

Token: A 		Head: kind
Token: kind 		Head: strange
Token: in 		Head: kind
Token: glass 		Head: in
Token: and 		Head: kind
Token: a 		Head: cousin
Token: cousin 		Head: kind
Token: , 		Head: cousin
Token: a 		Head: spectacle
Token: spectacle 		Head: cousin
Token: and 		Head: spectacle
Token: nothing 		Head: spectacle
Token: strange 		Head: strange
Token: a 		Head: color
Token: single 		Head: color
Token: hurt 		Head: color
Token: color 		Head: strange
Token: and 		Head: color
Token: an 		Head: arrangement
Token: arrangement 		Head: color
Token: in 		Head: arrangement
Token: a 		Head: system
Token: system 		Head: in
Token: to 		Head: arrangement
Token: pointing 		Head: to
Token: . 		Head: strange
Token: All 		Head: this
Token: this 		Head: unordered
Token: and 		Head: this
Token: not 		Head: ordinary
Token: ordinary 		Head: this
Token: , 		Head: unordered
Token: not 		Head: unordered
Token: unordered 		Head: unordered
Token: in 		Head: unordered
Token: not 		Head: resembling
Token: resemb

In [64]:
# Build one row per token for a hand-picked span of the Odyssey document:
# text, lemma, dependency label, the label's description, and the head's text.
# NOTE(review): 2246:2260 are hard-coded token offsets; they presumably select a
# single sentence and will drift if the text or tokenization changes - confirm.
sentence = odyssey[2246:2260]
dependencies = []
for tok in sentence:
    row = [tok.text, tok.lemma_, tok.dep_, spacy.explain(tok.dep_), tok.head.text]
    dependencies.append(row)

In [65]:
# Show the collected dependency rows as a plain-text table.
print(
    tabulate(
        dependencies,
        headers=["Text", "Lemma", "Dep", "Exp", "Head"],
    )
)

Text       Lemma    Dep     Exp                        Head
---------  -------  ------  -------------------------  ---------
Then       then     advmod  adverbial modifier         tried
I          I        nsubj   nominal subject            tried
tried      try      ROOT    root                       tried
to         to       aux     auxiliary                  find
find       find     xcomp   open clausal complement    tried
some       some     det     determiner                 way
way        way      dobj    direct object              find
of         of       prep    prepositional modifier     way
embracing  embrace  pcomp   complement of preposition  of
my         my       poss    possession modifier        mother
mother     mother   poss    possession modifier        ghost
's         's       case    case marking               mother
ghost      ghost    dobj    direct object              embracing
.          .        punct   punctuation                tried


In [74]:
# Collect every noun or proper noun that is a grammatical subject, together with
# the token that governs it and the words of its subtree.
subj = []
for token in odyssey:
    # Keep only subject dependencies ('nsubj', 'nsubjpass') whose token is
    # tagged NOUN or PROPN; tokens with any other part of speech are skipped.
    if token.dep_ in ('nsubj', 'nsubjpass') and token.pos_ in ('NOUN', 'PROPN'):
        # Space-joined text of all tokens in the subject's subtree.
        subtree = " ".join(t.text for t in token.subtree)
        subj.append([token.text, token.head.text, token.head.lemma_, subtree])

In [75]:
# Show each subject with its head, the head's lemma, and the subject's subtree.
print(
    tabulate(
        subj,
        headers=["Subject", "Head", "Head lemma", "Subtree"],
    )
)

Subject       Head          Head lemma    Subtree
------------  ------------  ------------  -----------------------------------------------------------------
Circe         sent          send          Circe , that great and cunning goddess ,
wind          headed        head          the wind and helmsman
sails         were          be            her sails
sun           went          go            the sun
darkness      was           be            darkness
rays          pierce        pierce        the rays of the sun
wretches      live          live          the poor wretches
Circe         told          tell          Circe
Eurylochus    held          hold          Eurylochus
Teiresias     have          have          Teiresias
blood         run           run           the blood
ghosts        came          come          the ghosts
men           worn          wear          old men
armour        smirched      smirch        their armour
ghosts        come          come          the poor feckle

In [76]:
# Group heads by subject: subject text -> list of head texts, in order of
# appearance. Each row of `subj` starts with (subject, head); the remaining
# columns are dropped by slicing rather than unpacking into `*_`, because
# binding `_` at the top level of a notebook shadows IPython's last-output
# variable.
subject_heads = defaultdict(list)
for item in subj:
    subject, head = item[:2]
    subject_heads[subject].append(head)

In [78]:
# One table row per subject: the subject and all of its heads joined into a
# single comma-separated string (repeats are kept).
associations = [
    [subject, ", ".join(heads)]
    for subject, heads in subject_heads.items()
]

In [79]:
# Show each subject alongside the heads it is attached to.
print(
    tabulate(
        associations,
        headers=["Subject", "Associated heads"],
    )
)

Subject       Associated heads
------------  --------------------------------------------------------
Circe         sent, told
wind          headed, tossed, sprang
sails         were
sun           went
darkness      was
rays          pierce
wretches      live
Eurylochus    held
Teiresias     have, answered
blood         run
ghosts        came, come, stood, gathered, screaming
men           worn
armour        smirched
ghost         was, saying, came, came, close, went, came, came, strode
Elpenor       said
man           left, cross, was, kill, do
heaven        make, vouchsafe, take
ship          reaches, went
hardship      reach
people        heard, bless, are
wayfarer      meet
death         come
life          ebb, left
taste         talk
prophecyings  spoken
mother        came, answered
wife          intends, remains, is, allow
one           got, invites, told, was
Telemachus    holds
lands         undisturbed
father        remains
weather       comes
Proserpine    want, sent, dismiss