In [1]:
# Required by the install commands below, which use `sys.executable`.
import sys

In [2]:
# Don't worry too much about how this line works: it fetches the NLTK package
# from the internet and installs it on our computer so that we can use its code.
# `sys.executable` is the Python interpreter that is running this notebook, so
# the package is installed for that same interpreter.

# This is only needed if you didn't install nltk on day 3.
!{sys.executable} -m pip install nltk



In [3]:
# We'll go over some of the file I/O and string functions, just to make sure everyone
# remembers.

# Storing text to save to a file.
essential_text = "We definitely want to remember the way in which communities need agency over their own histories!"

# Open a file for writing and write our essential text to it. The `with` block
# closes the file for us when it ends.
# We name the encoding explicitly so the file is written and read back the same
# way on every operating system (the default encoding depends on the platform).
with open('essential.txt', 'w', encoding='utf-8') as file:
    file.write(essential_text)

# Open the same file for reading, read the text from it, and store the text in a variable.
with open('essential.txt', 'r', encoding='utf-8') as file:
    essential_text_from_file = file.read()

In [4]:
# Let's look at what we just read back from the file; it should be the same
# text that we wrote.
print(essential_text_from_file)

We definitely want to remember the way in which communities need agency over their own histories!


In [5]:
# Can we annotate some parts of this? Yes! Using NLTK!

# Let's download the NLTK data packages we need for today.
# Each call prints its progress; the log below shows whether a package was
# already up to date.
import nltk.downloader
nltk.download('averaged_perceptron_tagger') # For POS tagger.
nltk.download('punkt') # For word tokenizer.
nltk.download('tagsets') # For UPenn tagset help.
nltk.download('wordnet') # For WordNet lemmatizer.
# The `True` displayed after the log is the value returned by the last call above.

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\clair\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\clair\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\clair\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\clair\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Import the NLTK tokenize function (for words).
from nltk.tokenize import word_tokenize

# Broadly, tokenization is the process of preparing input for a particular model.
# In an NLP subfield known as "lexical analysis", tokenization is the process
# of splitting a string of characters into sections ("tokens"), typically using
# spaces and punctuation to separate words and/or sentences. Let's try it!

In [7]:
# Let's tokenize the sentence we just read in from the file!
# (It is the same text as `essential_text`; using the copy we read back puts
# the file we wrote to work.)
tokenized_text = word_tokenize(essential_text_from_file)
print(tokenized_text)

# What information from the sentence is lost during this process?

['We', 'definitely', 'want', 'to', 'remember', 'the', 'way', 'in', 'which', 'communities', 'need', 'agency', 'over', 'their', 'own', 'histories', '!']


In [8]:
# Instead of a tokenizer, we could also use a
# "lemmatizer".

# A lemmatizer uses lemmatization to find the
# "lemma" of a term. I know, confusing, but
# here are some examples:

# rocks --> rock
# corpora --> corpus
# better --> good

# In linguistic terms, a lemma is
# a representative of a set of terms
# by which they can be indexed.
# For instance "break" is the lemma
# for "break", "breaks", "broke",
# "broken", and "breaking".

In [9]:
# Let's import the lemmatizer!
from nltk.stem import WordNetLemmatizer

# Create one lemmatizer object; the cells below reuse it.
lemmatizer = WordNetLemmatizer()

# Quick check on a single plural noun: show the word next to its lemma.
print("communities", lemmatizer.lemmatize("communities"), sep=' : ')

communities : community


In [10]:
# Nice!
# Now the same thing for the full sentence: one line per token,
# with the token on the left and its lemma on the right.
for token in tokenized_text:
    print(" : ".join((token, lemmatizer.lemmatize(token))))

# Try your own sentence!

We : We
definitely : definitely
want : want
to : to
remember : remember
the : the
way : way
in : in
which : which
communities : community
need : need
agency : agency
over : over
their : their
own : own
histories : history
! : !


In [11]:
# Yes! The relationship the words have to one another is more difficult to determine.
# For instance "their" and "own" act as modifiers of "histories", which gives "histories" a
# slightly different contextual meaning.

# Note other issues with this, like words that are spelled alike but mean
# different things: polish/Polish, clip/clip, foil/foil.

# Would determining part of speech (POS) help? Let's try it!

# Use the NLTK POS (Part Of Speech) tagger on the tokenized
# sentence (note that the sentence must be tokenized first).
# The result shown below is a list of (token, tag) pairs.
from nltk import pos_tag
pos_tag(tokenized_text)

[('We', 'PRP'),
 ('definitely', 'RB'),
 ('want', 'VBP'),
 ('to', 'TO'),
 ('remember', 'VB'),
 ('the', 'DT'),
 ('way', 'NN'),
 ('in', 'IN'),
 ('which', 'WDT'),
 ('communities', 'NNS'),
 ('need', 'VBP'),
 ('agency', 'NN'),
 ('over', 'IN'),
 ('their', 'PRP$'),
 ('own', 'JJ'),
 ('histories', 'NNS'),
 ('!', '.')]

In [12]:
# Okay... but what do these abbreviations all mean?
# NLTK can print a description and examples for each tag
# (this uses the 'tagsets' package we downloaded earlier).
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [13]:
# It would be nice if we could visualize the connections between the words though--
# we know which are adjectives, but what do they modify?

# Never fear, the spaCy package is here!

# First we have to install it. If we want to make sure we install a specific
# version we can use this:
!{sys.executable} -m pip install spaCy==2.3.2

# We can also try this (no version given):
# !{sys.executable} -m pip install spaCy

# Or this, which opts in to pip's 2020 dependency resolver:
# !{sys.executable} -m pip install spaCy --use-feature=2020-resolver



In [14]:
# Now, we do have to download a neural network English core model.
# This is essentially a "model" of how English works, with rules based on millions
# of "correct" sentences plugged into a neural network.

# Building this ourselves would be tons of work, so many thanks to the spaCy
# developers for providing it!
# The download is installed like a Python package, which is why the next cell
# can simply `import en_core_web_sm`.
!{sys.executable} -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[+] Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [15]:
# If this block doesn't run, run the download cell above again:
# `!{sys.executable} -m spacy download en_core_web_sm`
# and then re-run this cell!
import en_core_web_sm # Importing the neural network model we just downloaded.
import spacy as sp # Importing spaCy itself, under the short name `sp`.

# Load the English core model. `nlp` is the object we call on text below.
nlp = en_core_web_sm.load()

In [16]:
# Let's load our previous sentence as a spaCy object!
# Calling `nlp` on the text runs the English model we loaded above over it.
spacy_sen = nlp(essential_text)

In [17]:
# Print the sentence text, now that it's a spaCy object!
# `.text` gives us the sentence back as a plain string.
print(spacy_sen.text)

We definitely want to remember the way in which communities need agency over their own histories!


In [18]:
# Oh... it looks the same.

# But is part-of-speech information attached? Check the fifth token
# (index 4) and show its text beside its POS label.
print(spacy_sen[4].text, spacy_sen[4].pos_, sep=" : ")

remember : VERB


In [19]:
# The spaCy object can be looped over token by token, so we can list the
# POS label of every word like this:
for word in spacy_sen:
    print(" : ".join((word.text, word.pos_)))

We : PRON
definitely : ADV
want : VERB
to : PART
remember : VERB
the : DET
way : NOUN
in : ADP
which : DET
communities : NOUN
need : VERB
agency : NOUN
over : ADP
their : DET
own : ADJ
histories : NOUN
! : PUNCT


In [20]:
# We can even go more in-depth by handing the token's `tag_` to spaCy's
# 'explain' function, which returns a short description of it:
print(" : ".join([spacy_sen[4].text, sp.explain(spacy_sen[4].tag_)]))

remember : verb, base form


In [21]:
# But words also have explicit relationships to one another, as we discussed earlier.
# How do we see those? Well let's try to visualize them!
from spacy import displacy

# style='dep' asks for the dependency visualizer; jupyter=True draws the
# picture right here in the notebook.
displacy.render(spacy_sen, style='dep', jupyter=True, options={'distance': 85})

# If you aren't using Jupyter, you'll need to use:
#
# `displacy.serve(spacy_sen, style='dep', options={'distance': 120})`
#
# Then you'll see the following:
#
# `Serving on port 5000...
#  Using the 'dep' visualizer`
#
# To view, go to the following address in your web browser:
#
# `http://127.0.0.1:5000/`

In [22]:
# Very pretty! That helps us understand the "structure" of the sentence.

# Let's try visualizing your own sentence!

# (If we have time, we'll look into named entity recognition)

# Let's take our coffee break :)

In [23]:
# Next, we're going to look into "finding" a string a bit.

# Let's start with the sentence we had above.

# First we'll define a search term.
string_to_search_1 = "communities"

# Next we'll loop through all of the "tokens" in the tokenized text and look for a match!
# Each token is compared with the whole search term, exactly as written
# (so upper- and lowercase letters count as different).
for token in tokenized_text:
    if token == string_to_search_1: # Why do we have 2 equal signs here?
        print("Found it! :)")

Found it! :)


In [24]:
# Okay! But what if we want to know that a word doesn't exist? If we search "bigfoot",
# what happens?
string_to_search_2 = "bigfoot"

# The `else` belongs to the `if` inside the loop, so it runs once for every
# token that is not a match -- watch how many lines get printed.
for token in tokenized_text:
    if token == string_to_search_2:
        print("Found it! :)")
    else:
        print("Didn't find it :(")

Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(
Didn't find it :(


In [25]:
# Oh no it printed a bunch! That's a bit overwhelming though...
# Can we have it print just once?
string_to_search_2 = "bigfoot"

# Let's define a "found it" variable.
# We'll set it to False before the loop starts.
found_it = False

for token in tokenized_text:
    if token == string_to_search_2:
        found_it = True # Set to True when found.
        break # One match is all we need, so stop checking the remaining tokens.

# Report the result once, after the loop has finished.
if found_it:
    print("Found it! :)")
else:
    print("Didn't find it :(")

Didn't find it :(


In [26]:
# That's it! Much more succinct.

# But do we even need the loop? Is there a faster way to do this?

# Yes! Using "in" on a list asks whether any item in the list equals our search term:
if string_to_search_1 in tokenized_text:
    print("Found it! :)")
else:
    print("Didn't find it :(")

Found it! :)


In [27]:
# We actually don't even need the tokenized text for this!
# Used on a string, "in" asks whether the search term appears anywhere inside
# it -- including in the middle of a longer word (we'll come back to that).
if string_to_search_1 in essential_text:
    print("Found it! :)")
else:
    print("Didn't find it :(")

Found it! :)


In [28]:
# This seems a little too good to be true.
# How do you think this could go wrong?

# What happens if a user has caps lock on?
string_to_search_3 = "COMMUNITIES"

# "in" compares the characters exactly, so "C" and "c" are not the same.
if string_to_search_3 in essential_text:
    print("Found it! :)")
else:
    print("Not found! :(")

Not found! :(


In [29]:
# Oh no. That's not good.
# Well let's send the text to uppercase maybe?
# (`.upper()` gives us a new, uppercased string; `essential_text` itself is unchanged.)
uppercase_essential_text = essential_text.upper()

if string_to_search_3 in uppercase_essential_text:
    print("Found it! :)")
else:
    print("Not found! :(")

Found it! :)


In [30]:
# It worked! But what about the original search with "communities"?
uppercase_essential_text = essential_text.upper()

# Now the text is all uppercase but the search term is all lowercase.
if string_to_search_1 in uppercase_essential_text:
    print("Found it! :)")
else:
    print("Not found! :(")

Not found! :(


In [31]:
# ...that's disappointing.
# Wait! What if we send the search text AND the essential text to lowercase?
lowercase_essential_text = essential_text.lower()

# Both sides are lowercase now, so the case the user typed no longer matters.
if string_to_search_3.lower() in lowercase_essential_text:
    print("Found it! :)")
else:
    print("Not found! :(")

Found it! :)


In [32]:
# That's it!

# But what information is lost when we do this?

# A consistent theme here is that we can manipulate data in so many ways,
# but all manipulations will mean some information is lost.
# This is important to keep in mind!

In [33]:
# What if we want to know the position of the "find"? This is especially important if
# we have strings that are thousands of pages long.

# For that, we can use the "find" method of Python strings.
# Documentation: https://www.geeksforgeeks.org/python-string-find/
# Official reference: https://docs.python.org/3/library/stdtypes.html#str.find

# Let's try it! The number we get back is the position where the match
# starts, counting the first character of the text as position 0.
lowercase_essential_text.find(string_to_search_3.lower())

48

In [34]:
# So now we know where the find is!

# What happens if the string isn't there?
# (`string_to_search_2` is "bigfoot", which we already know is not in our sentence.)
lowercase_essential_text.find(string_to_search_2.lower())

-1

In [35]:
# We get a "-1"!

# But having the number isn't always useful... what if we could indicate where the "find" is?

# Let's insert [HERE'S THE FIND!] in the string.

# We search in the lowercased text, but insert into the original text: for our
# sentence both have the same length, so the position is the same in each.
position_to_insert = lowercase_essential_text.find(string_to_search_3.lower())

if position_to_insert == -1:
    # Not found. We must not slice with -1: in Python a negative position counts
    # from the END of the string, so the marker would silently land just before
    # the last character. Leave the text as it is instead.
    new_essential_text = essential_text
else:
    # For this we need to do string concatenation:
    # everything before the match + the marker + everything from the match onwards.
    new_essential_text = essential_text[:position_to_insert] + "[HERE'S THE FIND!]" + essential_text[position_to_insert:]

print(new_essential_text)

We definitely want to remember the way in which [HERE'S THE FIND!]communities need agency over their own histories!


In [36]:
# Nice!

# But what if we have multiple matches? Or we want multiple matches?

# If time, try to find a string in another sentence.

# For that, we need the regular expressions package, which we'll look into after the break:)

In [37]:
# What are regular expressions?
# A regular expression (often shortened as regex) is a sequence of characters that specifies
# a particular search pattern, usually for string-searching algorithms.

# To use regular expressions in Python, we need to import the "re" package:
import re

In [38]:
# Let's try to do our uppercase search problem using the re package!
# re.escape() makes sure our search term is treated as plain text, even if it
# contains characters that mean something special in a regular expression
# (such as "." or "+"). re.IGNORECASE tells the search to ignore case.
if re.search(re.escape(string_to_search_3), essential_text, re.IGNORECASE):
    print("Found it :)")
else:
    print("Not found :(")

Found it :)


In [39]:
# Yay! It worked!

# But what if we have a partial match in a sentence?
string_to_search_4 = 'cat'
less_essential_text = "I'm not looking for something sophisticated."

# re.escape() keeps the search term plain text (see above). The search still
# succeeds here because "cat" appears inside the word "sophistiCATed".
if re.search(re.escape(string_to_search_4), less_essential_text, re.IGNORECASE):
    print("Found it! :)")
else:
    print("Not found! :(")

Found it! :)


In [40]:
# Uh-oh.
# That's a problem.
# Okay, well, can we make our search system better?

# Regular expressions have all sorts of weird rules, and often
# feel like another programming language in and of themselves.
# Luckily, a number of sites provide "cheatsheets" and testing
# materials, so you can test before you run.

# I personally recommend: https://regexr.com/, but there
# are tons out there!

In [41]:
# For instance, let's try to make a regular expression that
# mimics "re.IGNORECASE" to start.

# Let's try to find "communities" or "COMMUNITIES" in the
# 'essential_text':
# "We definitely want to remember the way in which communities need agency over their own histories!"
#
# One way to do this is:
# (COMMUNITIES)|(communities)

# But that won't match "Communities" hmm...
#
# We could do this:
# [Cc][Oo][Mm][Mm][Uu][Nn][Ii][Tt][Ii][Ee][Ss]

# But that's a bit long...
#
# Let's do this instead. Click on the "flags" option
# at regexr.com. Select the 'case insensitive' option.
#
# Then we'll have this:
# /(communities)/i

In [42]:
# Well... we have our regex working on the web, but how do we put it in Python?
# We write each pattern as a raw string (r'...') and hand it to re.search().

# 1) Either the all-uppercase or the all-lowercase spelling.
if re.search(r'(COMMUNITIES)|(communities)', essential_text):
    print("Found it! :)")

# 2) Either case allowed for every single letter.
if re.search(r'[Cc][Oo][Mm][Mm][Uu][Nn][Ii][Tt][Ii][Ee][Ss]', essential_text):
    print("Found it! :)")

# 3) The case-insensitive flag: re.I is the short name for re.IGNORECASE.
if re.search(r'(communities)', essential_text, flags=re.I):
    print("Found it! :)")

Found it! :)
Found it! :)
Found it! :)


In [43]:
# Let's return to our 'cat' example.
#
# In an ordinary Python string, "\b" is the backspace character, which moves the
# cursor backwards one position (on the same 'row').
#
# Within a regular expression, however, \b functions as a 'word boundary'.
# This means that it matches the spot between a "word" character and a
# "non-word" character. That is why we write it as a raw string (r"\b"):
# the backslash then reaches the regular expression untouched.

# For instance:
if re.search(r"\b" + re.escape(string_to_search_4) + r"\b", less_essential_text, re.IGNORECASE):
    print("Found it! :P")
else:
    print("Not found! :O")

Not found! :O


In [44]:
# Nice!

# Let's finish today attempting to search for a 
# "concept" (so to speak).
#
# For instance, notice what happens when we try to search
# 'community' in the essential text, even with our best regular
# expressions (whole word, ignoring case). The sentence only
# contains the plural, "communities":
if re.search(r"\b" + re.escape('community') + r"\b", essential_text, re.IGNORECASE):
    print("Found it! :)")
else:
    print("Not found! :(")

Not found! :(


In [45]:
# But if someone searches that, they probably want to match; how can we help?

# Hmm... Did we talk about anything earlier that could be used for that?

# Of course, the lemmatizer!

# Run every token of the sentence through the lemmatizer:
lemmatized_essential_text = list(map(lemmatizer.lemmatize, tokenized_text))

# Then glue the lemmas back together into one string, with a single space
# between them, and take a look:
lemmatized_essential_text_str = " ".join(lemmatized_essential_text)
print(lemmatized_essential_text_str)

We definitely want to remember the way in which community need agency over their own history !


In [46]:
# We also need to lemmatize the search string.
# Remember to send it to lowercase first, or the lemmatizer won't work!
lemmatized_search_string = lemmatizer.lemmatize(string_to_search_3.lower())

# And now we'll search for it (as a whole word, ignoring case) in the
# lemmatized version of our sentence!
if re.search(r"\b" + re.escape(lemmatized_search_string) + r"\b", lemmatized_essential_text_str, re.IGNORECASE):
    print("Found it! :)")
else:
    print("Not found! :(")

# Let's try with your own sentence!

Found it! :)


In [47]:
# But wait! What if we want the positions where multiple string matches are?

# This time the search term is an acronym; prepare it the same way as before
# (lowercase, then lemmatize).
string_to_search_5 = "HIV"
lemmatized_search_string_2 = lemmatizer.lemmatize(string_to_search_5.lower())

# And here is a longer text block to search in. It is the first paragraph of
# this Wikipedia page:
# https://en.wikipedia.org/wiki/Signs_and_symptoms_of_HIV/AIDS
# (Note: this gives the name `new_essential_text` a new value.)
new_essential_text = "The stages of HIV infection are acute infection (also known as primary infection), latency and AIDS. Acute infection lasts for several weeks and may include symptoms such as fever, swollen lymph nodes, inflammation of the throat, rash, muscle pain, malaise, and mouth and esophageal sores. The latency stage involves few or no symptoms and can last anywhere from two weeks to twenty years or more, depending on the individual. AIDS, the final stage of HIV infection, is defined by low CD4+ T cell counts (fewer than 200 per μL), various opportunistic infections, cancers and other conditions."

# Same recipe as for the sentence: tokenize, lemmatize each token, re-join with spaces.
new_tokenized_text = word_tokenize(new_essential_text)
new_lemmatized_essential_text = list(map(lemmatizer.lemmatize, new_tokenized_text))
new_lemmatized_essential_text_str = " ".join(new_lemmatized_essential_text)
print(new_lemmatized_essential_text_str)

The stage of HIV infection are acute infection ( also known a primary infection ) , latency and AIDS . Acute infection last for several week and may include symptom such a fever , swollen lymph node , inflammation of the throat , rash , muscle pain , malaise , and mouth and esophageal sore . The latency stage involves few or no symptom and can last anywhere from two week to twenty year or more , depending on the individual . AIDS , the final stage of HIV infection , is defined by low CD4+ T cell count ( fewer than 200 per μL ) , various opportunistic infection , cancer and other condition .


In [48]:
# Nice!
# Next, "re.findall()" hands back every piece of text that matched, as a list.
# Here we build the pattern object first (whole word, ignoring case) and then
# ask it for all of its matches in the lemmatized paragraph.
matches = re.compile(
    r"\b" + re.escape(lemmatized_search_string_2) + r"\b",
    re.IGNORECASE,
).findall(new_lemmatized_essential_text_str)
for match in matches:
    print(match)

HIV
HIV


In [49]:
# Okay but we want to know *where* these matches are.
# "re.finditer()" gives us one match object per hit instead of just the text,
# and each match object can tell us where it starts.
matches = re.compile(
    r"\b" + re.escape(lemmatized_search_string_2) + r"\b",
    re.IGNORECASE,
).finditer(new_lemmatized_essential_text_str)

# Collect the start position of every hit.
results = [hit.start() for hit in matches]
print(results)

[13, 455]


In [50]:
# Let's insert [HERE'S THE FIND!] in front of every match, just like last time.
thing_to_insert = "[HERE'S THE FIND!]"

# Work on a copy of the name, so the original lemmatized text stays available.
very_new_lemmatized_essential_text_str = new_lemmatized_essential_text_str

# Every insertion makes the string longer, which moves everything after it
# over to the right. If we went from left to right we would have to keep
# track of how far the later matches have moved. Instead we go from the LAST
# match to the FIRST (`results` holds the positions from left to right, so we
# reverse it): inserting near the end never moves the positions that come before.
for match_start in reversed(results):
    very_new_lemmatized_essential_text_str = (
        very_new_lemmatized_essential_text_str[:match_start]
        + thing_to_insert
        + very_new_lemmatized_essential_text_str[match_start:]
    )

print(very_new_lemmatized_essential_text_str)

The stage of [HERE'S THE FIND!]HIV infection are acute infection ( also known a primary infection ) , latency and AIDS . Acute infection last for several week and may include symptom such a fever , swollen lymph node , inflammation of the throat , rash , muscle pain , malaise , and mouth and esophageal sore . The latency stage involves few or no symptom and can last anywhere from two week to twenty year or more , depending on the individual . AIDS , the final stage of [HERE'S THE FIND!]HIV infection , is defined by low CD4+ T cell count ( fewer than 200 per μL ) , various opportunistic infection , cancer and other condition .
