# Natural language processing in Python

This cheat sheet contains operations for natural language processing and string manipulation in Python.

Sources:
- https://github.com/moondra2017/Python-Regular-Expressions

In [1]:
from IPython.display import display
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

## Basic built-in text manipulations

### Number of characters

In [None]:
# len() counts every character, including spaces and punctuation
str1 = "Despite the CONSTANT negative press, covfefe"
char_count = len(str1)
print("Number of characters", char_count)

### Split individual words by different separation 

In [None]:
str1 = "Despite the CONSTANT negative press, covfefe"
# separator None -> split on any run of whitespace; ',' -> split on that exact character
for separator in (None, ','):
    print(str1.split(separator))

### Stripping whitespaces

In [None]:
str1 = ' hello  apple '

print(str1.strip())  # leading and trailing whitespace only
print(str1.replace(" ", ""))  # every space character (tabs and newlines are not touched)

# Collapsing runs of whitespace into a single space needs a regular expression.
# The pattern is a raw string so that \s reaches the regex engine unchanged
# (a plain '\s' is an invalid string escape and triggers a warning).
collapsed = re.sub(r'\s+', ' ', str1).strip()
print(collapsed)


### Force between lower and upper case

In [None]:
str1 = "Despite the CONSTANT negative press, covfefe"
tokens = str1.split()
# str.lower / str.upper return new strings, the tokens themselves stay as they are
print(list(map(str.lower, tokens)))
print(list(map(str.upper, tokens)))

### Find specific words using list comprehensions

In [None]:
str1 = "Despite the CONSTANT negative press, covfefe"
tokens = str1.split()

# One predicate per query; each one selects the tokens for which it is true
predicates = (
    lambda w: len(w) > 7,         # longer than 7 characters
    str.istitle,                  # Title-cased
    str.islower,                  # all lower case
    str.isupper,                  # all upper case
    lambda w: w.endswith('fe'),   # ends with 'fe'
    lambda w: w.startswith('c'),  # starts with 'c'
)
for keep in predicates:
    print(list(filter(keep, tokens)))

### Find unique words

In [None]:
str1 = "Despite the CONSTANT negative press, covfefe covfefe covfefe"
# A set keeps one copy of every distinct token (its ordering is arbitrary)
unique_words = set(str1.split())
print(unique_words)

### Extract numeric types from string


In [None]:
import itertools

mystring = "61-63sds.0600"

# Leading integer: take characters while they are digits, then convert
leading_digits = itertools.takewhile(str.isdigit, mystring)
print(int("".join(leading_digits)))

# All digit runs: group consecutive characters by "is digit" and keep the digit groups
mylist = [
    "".join(run)
    for is_digit, run in itertools.groupby(mystring, key=str.isdigit)
    if is_digit
]
print(mylist)

<h2>Regular expressions</h2>

### Quick guide to different characters

#### Meta characters

Quick guide to re <b>meta characters</b>

<b>Identifiers</b>
- \s is any whitespace character [ \t\n\r\f\v]
- \S is any non-whitespace character [^ \t\n\r\f\v]
- \b word boundary (zero-width position between a word character and a non-word character)
- \d <=> [0-9]
- \D <=> [^0-9]
- \w is any alphanumeric character <=> [a-zA-Z0-9_]
- \W is any non-alphanumeric character <=> [^a-zA-Z0-9_]
- . any character, except for a newline
- \\. a period

<b>Modifiers</b>
- \{n\} exactly n repetitions where $n \geq 0$
- \{n , \} at least $n$ repetitions
- \{ , n\} at most $n$ repetitions
- \{m , n\} at least $m$ and at most $n$ repetitions
- \+ match one or more occurrences
- \* match zero or more occurrences
- ? match zero or one occurrence
- \$ match the end of a string; match needs to be at the end of a string
- ^ match the beginning of a string; match needs to be at the beginning of a string
- | either or; a|b matches either a or b
- [] Used to indicate a set of characters


<b>White space characters</b>
- \n new line
- \s space
- \t tab
- \e escape (not supported by Python's re module; use \x1b instead)
- \f form feed (?)
- \r return

<b>Word boundary for non-alphanumeric characters</b>
- \b word boundary; matches where one side is a non-alphanumeric character (or the start/end of the string) and the other side is an alphanumeric character
- \B opposite of word boundary; matches where both sides are alphanumeric characters OR both sides are non-alphanumeric characters


<b>Groups</b>
- () Defines a group. Only grouped stuff gets pulled out but whole re will be matched
- (?:) A non-capturing version of regular parentheses. Matches whatever regular expression is inside the parentheses, but the substring matched by the group cannot be retrieved after performing a match or referenced later in the pattern.
- (?P) Naming groups

<b>Examples</b>

- [a-z] matches the set of lowercase characters a to z
- [A-Z] matches the set of uppercase characters A to Z
- [0-9] matches the digits 0 to 9
- [^b-n] matches any character except the lowercase letters b to n


#### Alphanumeric characters

 These are alphanumeric characters
- A-Z
- a-z
- 0-9

These are not:
- . (dot)
-  (space)
- @
- any newline character

### Flags

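- re.MULTILINE (re.M): Makes the meta characters '^' and '$' match at the beginning and end of every line instead of only at the beginning and end of the whole string. For example, with '^' the pattern can match at the start of each new line. Note that re.match() still only tries to match at the very start of the string, so use re.search(), re.findall() or re.finditer() to find matches on later lines.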
- re.IGNORECASE: Ignores cases
- re.DOTALL (re.S): Used in conjunction with meta character '.' (dot) to actually include newlines as well.
- re.ASCII
- re.DEBUG
- re.LOCALE

In [None]:
# Some flag examples
string = '''U.S. stock-index futures pointed
to a solidly higher open on
Monday, indicating that major
benchmarks were poised to rebound
from last week’s sharp decline,
which represented their biggest weekly drops in months.
That weakness was driven in part by
fears over North Korea, where tensions
with the U.S. have been escalating.
North Korea. That issue overshadowed the state of
the equity market, where earnings
have been strong at a time of high
employment and low inflation,
as well as valuations that
appear elevated by many metrics, north korea North Korea.'''

# re.MULTILINE: '^' also matches right after every newline, so the line that
# starts with "North Korea." is found; without the flag '^' only matches at
# the very start of the string. Raw strings keep the escaped dot (\.) intact.
pattern1a = re.compile(r'^North Korea\.?', flags=re.MULTILINE)
pattern1b = re.compile(r'^North Korea\.?')

# re.IGNORECASE: upper and lower case letters are treated as equal
pattern2a = re.compile('north korea', flags=re.IGNORECASE)
pattern2b = re.compile('north korea')

# re.S (re.DOTALL): '.' also matches newlines, so '.*' spans the whole text;
# without the flag it stops at the end of the first line
pattern3a = re.compile('.*', flags=re.S)
pattern3b = re.compile('.*')

print(pattern1a.search(string))
print(pattern1b.search(string))
print('------')
print(pattern2a.findall(string))
print(pattern2b.findall(string))
print('------')
print(pattern3a.match(string).group())
print('--')
print(pattern3b.match(string).group())


### Groups

When a pattern contains groups, re.findall() returns only the grouped parts! This is not the same with re.search() or re.match(), as they always return the entire match by default.

In [None]:
string = 'John has 6 cats but I think my friend Susan has 3 dogs and Mike has 8 fishes'

# The same regex with progressively more capturing groups.
# Raw strings keep \w and \d intact for the regex engine.
myre1 = re.compile(r'[A-Za-z]+ \w+ \d+ \w+')        # no group
myre2 = re.compile(r'([A-Za-z]+) \w+ \d+ \w+')      # name only
myre3 = re.compile(r'([A-Za-z]+) \w+ \d+ (\w+)')    # name and animal
myre4 = re.compile(r'(([A-Za-z]+) \w+ \d+ (\w+))')  # whole match, name and animal
all_res = (myre1, myre2, myre3, myre4)

# findall(): whole matches when there is no group, otherwise only the groups
for myre in all_res:
    print(myre.findall(string))

print('-----')

# search().group(): always the whole text of the first match
for myre in all_res:
    print(myre.search(string).group())


Although re.search() and re.match() output the entire match as default '0' group, they can (and will) distinguish (only) between individual groups that are captured.

In [None]:
string = 'John has 6 cats but I think my friend Susan has 3 dogs and Mike has 8 fishes'
# myre1 captures three groups; in myre2 the verb is a non-capturing group (?:...)
myre1 = re.compile(r'([A-Za-z]+) (\w+) \d+ (\w+)')
myre2 = re.compile(r'([A-Za-z]+) (?:\w+) \d+ (\w+)')

# Search once and query the match object instead of re-running the search per line
match1 = myre1.search(string)
print(match1.group())      # group 0: the entire match
print(match1.group(1))
print(match1.group(2))
print(match1.group(3))
print(match1.groups())     # all captured groups as a tuple
print(match1.group(2, 1))  # several groups at once, in the requested order

print('-----')

match2 = myre2.search(string)
print(match2.group())
print(match2.group(1))
print(match2.group(2))
# match2.group(3) would raise IndexError: there are only 2 captured groups!
print(match2.groups())
print(match2.group(2, 1))

The match objects returned by re.match() and re.search() have further methods besides .group() and .groups():
- .span()
- .start()
- .end()

re.findall() does not have any of these!

In [None]:
string = 'John has 6 cats but I think my friend Susan has 3 dogs and Mike has 8 fishes'
# Raw string so that \w and \d are passed to the regex engine unchanged
myre1 = re.compile(r'([A-Za-z]+) (\w+) \d+ (\w+)')

found = myre1.search(string)
print(found)
print(found.span())    # (start, end) of the whole match
print(found.span(2))   # (start, end) of group 2
print(found.start())   # start index of the whole match
print(found.start(3))  # start index of group 3
print(found.end())     # end index (exclusive) of the whole match
print(found.end(1))    # end index (exclusive) of group 1


#### Backreferencing

In [None]:
# Here we use the syntax \1 to refer to the 1st group within the regular expression,
# so essentially we want to find the text of the 1st group twice in a row.

string = 'Merry Merry Christmas'

myre1 = re.compile(r'(\w+) \1')
# Does not work without a raw string: in a normal string literal Python turns
# '\1' into the control character chr(1) before the regex engine ever sees it.
# (\\w is written with a doubled backslash only to avoid an invalid-escape
# warning; the resulting pattern text is the same as for '(\w+) \1'.)
myre2 = re.compile('(\\w+) \1')


print(re.search(myre1, string))
print(re.search(myre2, string))

# re.findall() only outputs the word once, because it returns the captured group!
print(re.findall(myre1, string))
print(re.findall(myre2, string))


#### Capturing vs. non-capturing groups

Non-capturing groups can be used when the grouping structure is needed but the group itself should not be returned as a separate group. For example, a group might be present (1 or more occurrences) but not necessarily (0 occurrences); with zero occurrences we don't want to output it, as it would just be an empty string.

In [None]:
string = '''
    Feb-25-2001; Feb 25, 2001; February 25, 2001; Feb. 25, 2001; Feb 25 2009;
'''
# Only non-capturing groups: findall returns the whole match ("group zero")
pattern1 = re.compile(r'(?:\d{2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2}, )?\d{4}')

# Capturing groups: findall returns groups 1-3 as a tuple. The 3rd group holds
# only the day, not the year, because the year is outside any group
pattern2 = re.compile(r'(\d{2} )?(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* (\d{1,2}, )?\d{4}')

# Notice that search().group(0) is the same for both patterns!
for position, date_re in enumerate((pattern1, pattern2)):
    if position:
        print('-'*15)
    print(date_re.search(string).group(0))
    print(date_re.findall(string))


Below example shows that non-captured groups get matched in the "group 0" but not as actual groups.

Source: https://stackoverflow.com/questions/3512471/what-is-a-non-capturing-group-what-does-do

In [None]:
string1 = 'https://stackoverflow.com/'

# Protocol is a capturing group: groups are protocol, host, path
exp1 = re.compile(r'(https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?')

# Protocol is a non-capturing group: groups are host, path
exp2 = re.compile(r'(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?')

# Group 0 (the whole match) followed by every numbered group
url_match1 = exp1.search(string1)
for group_no in range(4):
    print(url_match1.group(group_no))
print('-'*15)
url_match2 = exp2.search(string1)
for group_no in range(3):
    print(url_match2.group(group_no))


In [None]:
string1 = 'https://stackoverflow.com/'

# Capturing protocol group: findall returns (protocol, host, path) tuples
exp1 = re.compile(r'(https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?')

# Non-capturing protocol group: findall returns (host, path) tuples
exp2 = re.compile(r'(?:https?|ftp)://([^/\r\n]+)(/[^\r\n]*)?')

for url_re in (exp1, exp2):
    print(url_re.findall(string1))

### re.sub

Substitutes parts in a string. Works as re.findall() in that it searches all instances.

In [None]:
string ="""U.S. stock-index futures pointed
to a solidly higher open on Monday, indicating
that major benchmarks were poised to USA
rebound from last week’s sharp decline, which
represented to us their biggest weekly drops in months."""

# - The dots are escaped so that they match a literal '.' and not any character.
# - 'USA' is listed before 'US'; otherwise 'US' wins and leaves a stray 'A'.
# - \b keeps the abbreviations from matching inside longer words.
# - The replacement has no trailing space, so the original spacing is preserved.
# The match is case-sensitive: the lower-case word "us" is left alone.
us_pattern = re.compile(r'\bU\.S\.|\bUSA\b|\bUS\b')
result = us_pattern.sub('United States', string)
print(result)

In [None]:
string = 'Dan has 3 snails. Mike has 4 cats. Alisa has 9 monkeys.'


def square(x):
    """Return x squared."""
    return x**2


# Raw strings keep \d intact. The replacement can be a plain string or a
# function that receives the match object and returns the replacement text.
number = re.compile(r'(\d+)')

print(number.sub('1', string))                                            # every number -> literal '1'
print(number.sub(lambda x: str(x.group(0)), string))                      # identity: number unchanged
print(number.sub(lambda x: str(3 + int(x.group(0))), string))             # add 3 to every number
print(number.sub(lambda x: str(square(int(x.group(0)))), string))         # square every number


### Word boundaries

A word boundary checks both sides of a word for non-alphanumeric characters (or the start/end of the string).

In [None]:
string1 = "cat catherine catholic wildcat copycat uncatchable"
string2 = ".cat catherine catholic wildcat copycat uncatchable"

# 'cat' only as a stand-alone word
pattern1 = re.compile(r'\bcat\b')

# The leading dot in string2 is non-alphanumeric, so '.cat' still has a
# word boundary in front of 'cat'
for text in (string1, string2):
    print(pattern1.findall(text))


### Lookarounds

A few needed terms which are not complements to each other, more like synonyms!

<b>'Capturing'</b>: Used only in relation to groups; we have capturing groups and non-capturing groups. Captured groups get stored as actual groups. However, non-capturing groups are consumed (unlike lookarounds which are NOT consumed) and thus matched in the "group zero". They are not included as a separate group though. See section Capturing vs. non-capturing groups.<br>
<b>'Consuming'</b>: Moving through a string that everything that matches gets consumed, and the cursor moves to end of the match. Consumed parts will get outputted

- A group, either capturing or non-capturing, is consumed

Lookarounds allow us to confirm that some sort of subpattern is ahead of or behind the main pattern

- ?= Positive lookahead
- ?! Negative lookahead
- ?<= Positive lookback (usually called lookbehind)
- ?<! Negative lookback (usually called lookbehind)

When you use non-capturing group it is consuming the characters. Due to this it cannot deal with overlapping groups. Lookarounds do not consume! For example, in positive lookahead cursor moves and looks for match but comes back if match was found.

Lookarounds are always written inside parentheses, e.g. (?=...), but these parentheses do not create a capturing group.

#### Example positive lookahead

In [None]:
string ='''ABC1    1.1.1.1    20151118    active
           ABC2    2.2.2.2    20151118    inactive
           ABC3    x.x.x.x    xxxxxxxx    active'''

# All three regexes are raw strings so that \w, \s and \S reach the regex
# engine unchanged.

# Capturing group for the second column, followed by a positive lookahead for
# the word 'active'. The lookahead is a zero-width assertion: it is neither
# captured nor part of the match.
myre1 = r'ABC\w\s+(\S+)\s+\S+\s+(?=active)'
pattern1 =re.compile(myre1)

# Same with a non-capturing group. It consumes the characters, so 'active' is
# part of the match, BUT it is not a group and is not output by findall.
myre2 = r'ABC\w\s+(\S+)\s+\S+\s+(?:active)'
pattern2 =re.compile(myre2)

# In contrast to myre2, here the last group is capturing and it is output by findall.
myre3 = r'ABC\w\s+(\S+)\s+\S+\s+(active)'
pattern3 =re.compile(myre3)


# For each variant: all groups (findall), the first match object, and its text
for position, row_re in enumerate((pattern1, pattern2, pattern3)):
    if position:
        print('----')
    print(row_re.findall(string))
    first_match = row_re.search(string)
    print(first_match)
    print(first_match.group())


#### Example negative lookahead

In [None]:
string = '''
Remaining party applicants:

Occupation: Party Planner
Occupation: Baking
Occupation: Cook
Occupation: Economist
Occupation: Publicist
Occupation: Baker
Occupation: baker
Occupation: pierrot'''

# Keep the occupations that have nothing to do with cooking or baking.
# The negative lookahead only rejects positions; the trailing .+ is what
# actually matches (and returns) the occupation name.
pattern = re.compile('Occupation: (?!Baker|Baking|Cook).+', flags = re.IGNORECASE)

other_occupations = pattern.findall(string)
print(other_occupations)

#### Example negative lookback

In [None]:
# We want to extract names from persons with baker not being their occupation
string = '''
Remaining party applicants:

Planner: Joe Doe
Guest: Maria Jackson
Cook: Sarah Jones
Economist: Josefina Vilar
Publicist: Mark Garm
Baker: Santa Claus
Party Planner: Misty Mountains
baker: Seema Patel
pierrot: Bill Smith'''

# This is the working version
pattern1 = re.compile(r'(?<!Baker: )\b\w+\s\w+$', flags = re.IGNORECASE|re.M)

#Problem version: needs to be a raw string otherwise no match
pattern2 = re.compile('(?<!Baker: )\b\w+\s\w+$', flags = re.IGNORECASE|re.M)

#Problem version: word boundary must be included to make sure there is a space to the left
# (?<!Baker: ) says our match should not br preceded by Baker: and space. The first letter
# after such a part IS preceded by that, so it will not be captured. With word boundary lookback
# will check whether preceding is done; if yes, cursor comes back, and \b makes sure that we will
# force there to be a word boundary and thus also capture the first letter
pattern3 = re.compile(r'(?<!Baker: )\w+\s\w+$', flags = re.IGNORECASE|re.M)

# Needs 'end-of' meta character $, othrewise does not work correctly
# as we are not looking at END OF A LINE (notice that we have flag re.M) 
pattern4 = re.compile(r'(?<!Baker: )\b\w+\s\w+', flags = re.IGNORECASE|re.M)


print(re.findall(pattern1, string))
print('------')
print(re.findall(pattern2, string))
print(re.findall(pattern3, string))
print(re.findall(pattern4, string))



#### Lookarounds do not match consecutively!

Lookarounds do not automatically work 'back-to-back', as the next example demonstrates.

In [None]:
string = '''cherry 100 red 
            apple  150 green
            grapes 200 
            '''

# Lookarounds are zero-width assertions: after the 1st lookahead has verified
# the middle-column number, the cursor is back where it started. The following
# lookaheads are therefore tested at that same position, where the number (and
# not the spaces/letters of the 3rd column) is found again, so nothing matches.
pattern1 = re.compile(r'[a-z]+\s*(?= \d+)(?=\s*)(?=[a-z]+)')

# Corrected pattern1: everything that must follow goes into a single lookahead
pattern2 = re.compile(r'[a-z]+\s*(?= \d+\s*[a-z]+)')

for fruit_re in (pattern1, pattern2):
    print(fruit_re.findall(string))


## Cook book examples for regular expressions

#### Match if and only if (AND) certain conditions take place

In [None]:
# Match the whole string only if ALL of several conditions hold.
# Each condition is a lookahead, so they are all checked from the same position.
string1 = 'AZN#3232!abbb32..'
string2 = 'AZN#3232abbb3232'

# Match a run of non-whitespace characters (\S+) if, somewhere after zero or
# more characters (any character except newline), there is
#  - a lower-case letter
#    AND
#  - an upper-case letter
#    AND
#  - a digit
#    AND
#  - a special character (! OR ? OR .)
# Raw string so that \S reaches the regex engine unchanged.
pattern = re.compile(r'(?=.*[a-z])(?=.*[A-Z])(?=.*[0-9])(?=.*[!?.])\S+')

print(pattern.search(string1))
print(pattern.search(string2))


In [None]:
str1 = "Despite the CONSTANT negative #press covfefe, @WH. Also, negative."
straslist = str1.split()

# Three views on the occurrences of a word in a sentence
keyword = 'negative'
print(list(filter(lambda token: re.search(keyword, token), straslist)))         # tokens containing it
print(re.findall(keyword, str1))                                                # the raw matches
print([pos for pos, token in enumerate(straslist) if re.search(keyword, token)])  # token positions

In [None]:
# Split a word into its English vowels and the remaining characters
str1 = 'ouagadougou'
# [...] matches one of the listed characters, [^...] matches anything else
for char_class in (r'[aeiouy]', r'[^aeiouy]'):
    print(re.findall(char_class, str1))

#### Find different dates

In [None]:
str1 = '''
This is some sample text with different dates.
02/25/2001; 02/25/01; 2/20/01; 2/3/01;
2001-02-25; 2001/25/02; 2001-2-25; 2001-12-4;
Feb-25-2001; Feb 25, 2001; February 25, 2001; Feb. 25, 2001; Feb 25 2009; July 3 2009;
Feb-3-2001; Feb 3, 2001; February 3, 2001; Feb. 3, 2001; Feb 3 2009;
25 Feb 2001; 25 February 2001; 25 Feb. 2001; 25 February, 2009;
Feb 25th, 2009; March 21st, 2009; Aug 2nd, 2009; Aug 23rd, 2009;
Feb 2001; September 2005; Oct 2010; 6/2001; 11/2001
'''
# The dates are separated by ';', so the number of pieces is the number of dates
date_count = len(str1.split(';'))
print(f"There are {date_count} dates")

# Numeric day/month/year dates:
#   12-11-2002, 12/11/2002, 11/12/2002, 12/11/02
# The leading word boundary prevents picking up '2001-02-25' as '01-02-25'.
pattern1a = re.compile(r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}')

# Three-letter month first, numeric day and year:
#   Nov-12-2002
pattern1b = re.compile(r'[A-Za-z]{3}[/-]\d{1,2}[/-]\d{2,4}')

# Month written as a (possibly abbreviated) name:
#   12 Nov 2002, 12 November 2002, Nov 12, 2002, November 23, 2002,
#   Aug 2nd, 2009, Feb. 3, 2001, 25 Feb. 2001, Feb 25 2009, July 3 2009,
#   25 February, 2009
pattern2 = re.compile(r'(?:\d{2} )?(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z\.]*,? (?:\d{1,2}[a-z]{0,2},? )?\d{4}') # 20 matches

# Month and year only:
#   6/2002 and 11/2002, 6-2002 and 11-2002
# An earlier version, r'(?:[^\w/-])(\d{1,2}[/-]\d{4})\b', did not match a date
# at the very start of the string; the negative lookbehind does.
pattern3 = re.compile(r'(?<![\w/-])(\d{1,2}[/-]\d{4})\b')

# Year first:
#   2001-02-25, 2001/02/25, 2001-2-25
pattern4 = re.compile(r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}')

# Concatenate the hits of all patterns, pattern by pattern
total_list = []
for date_re in (pattern1a, pattern1b, pattern2, pattern3, pattern4):
    total_list = total_list + date_re.findall(str1)

print(f"We found {len(total_list)}")
print(total_list)


---

<i>re</i> module has three categories: pattern matching, substitution, and splitting

In [None]:
str1 = '0    Montevallo (University of Montevallo)[2]'

# Find out whether the string contains a bracket of any kind: ( ) [ ] { }
# (the square brackets have to be escaped inside the character class)
pattern = re.compile(r'[()\[\]{}]')
ff = pattern.findall(str1)  # every bracket found, in order
gg = pattern.search(str1)   # first bracket as a match object, or None

print('True' if gg else 'False')

In [None]:
# Take the part of a string that precedes the first '-'.
# Without any '-' the whole string is returned.
str1 = 'a@b-c-d-e'
str2 = 'a@bcde'

# Group 1 collects everything from the start that is not a '-'
regex1 = re.compile(r"^([^-]*).*")

for candidate in (str1, str2):
    print(regex1.search(candidate).group(1))



In [None]:
# Get everything in the string after the first occurrence of a certain character (here '-')
# If the character is not found, both approaches throw an error
str1 = 'a@b-c-d-e'
str2 = 'a@bcde'

# First approach: the lookbehind requires a '-' right before the match without
# making it part of the result, so the output agrees with the second approach
regex1 = re.compile(r"(?<=-).*")
print(re.search(regex1, str1).group(0))
# re.search(regex1, str2) is None, so calling .group(0) on it raises AttributeError

# Second approach: split once at the first '-' and take the remainder
print(str1.split('-', 1)[1])
# str2.split('-', 1)[1] raises IndexError because there is nothing to split



In [None]:
# Remove stuff between some tags, including the tags (does not work with nested tags!)
string  = 'This is a <stupid> sentence'
# '<' and '>' have no special meaning in a regex and need no escaping;
# the lazy .*? stops at the first closing '>'
match = re.compile(r'<.*?>')
string = match.sub('',string)
# Removing the tag leaves a double space: collapse whitespace runs to one space
cleaned = re.sub(r'\s+', ' ', string).strip()
print(cleaned)

In [None]:
# Drop every double quote from the string
string = 'Some string with "number that are immportant: 22, 44, "66'
# Cutting the string at each quote and gluing the pieces back together
# gives the same result as replacing the quote with nothing
"".join(string.split('"'))


In [None]:
# Remove text up to a number
# NOTE(review): the original heading said "Remove everything after last number",
# but both patterns remove text BEFORE/INCLUDING a number - confirm the intent.

# regex1: lazily matches anything up to and including one digit. re.sub applies
# it repeatedly, so everything up to and including the LAST digit is removed.
regex1 = re.compile(r"(.*?)(\d)")
# regex2: anchored at the start; removes the leading non-digit text together
# with the FIRST number only.
regex2 = re.compile(r"^([^\d]*)\d*")

string = 'I need to 566 get everything 068 after digit'
string2 = '1302 Pysäköinti 5 m matkalle ennen suojatietä'

# -> ' after digit'
print(re.sub(regex1,'', string))

# -> ' get everything 068 after digit'
print(re.sub(regex2,'', string))
# -> ' Pysäköinti 5 m matkalle ennen suojatietä'
print(re.sub(regex2,'', string2))




In [None]:
str1 = '9999999 Unknown reason'
# Iterating over a string yields single characters; stripping each one turns
# the spaces into empty strings and leaves every other character unchanged
str1 = [character.strip() for character in str1]
str1

## nltk

Sources:
- https://pythonprogramming.net/tokenizing-words-sentences-nltk-tutorial/
- https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

### Stop words

Useless words as far as any data analysis goes.

In [None]:
# A set gives constant-time membership tests
stop_words = set(stopwords.words('english'))

example_sent = "This is a sample sentence, showing off the stop words filtration."
words = word_tokenize(example_sent)

# The stop-word list is all lower case, so compare the lower-cased token;
# otherwise capitalised stop words such as "This" slip through the filter
filtered_sent = [w for w in words if w.lower() not in stop_words]

print(filtered_sent)

### Tokenization

Given a character sequence (some text) and a defined unit (sentence or word), <b>tokenization</b> is the task of chopping the sequence up into pieces (defined units), called <i>tokens</i>, perhaps at the same time throwing away certain characters, such as punctuation. This is why it is better at isolating words than e.g. str.split(' ').

In [None]:
text = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

# Sentence-level tokens first, then word-level tokens
sentences = sent_tokenize(text)
print(sentences)
print('-'*15)
word_tokens = word_tokenize(text)
print(word_tokens)


### Stemming

Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. 

In [None]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
# Print the stem of every example word, one per line
print(*map(ps.stem, example_words), sep='\n')


### Lemmatization

Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.

In [8]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# The lemmatizer takes a part-of-speech parameter pos.
# If it is not supplied it defaults to "noun", which may not produce what we want
# (pos="a" stands for adjective).
print(lemmatizer.lemmatize("better"))
for adjective in ("better", "best"):
    print(lemmatizer.lemmatize(adjective, pos="a"))


better
good
best


### Tagging part of speech

The heading is pretty self-explanatory.

In [2]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

# One speech to train the tokenizer, another one to apply it to
train_text = state_union.raw("2005-GWBush.txt")
test_text = state_union.raw("2006-GWBush.txt")

# Train a custom sentence tokenizer
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# Split the test sample into sentences
tokenized = custom_sent_tokenizer.tokenize(test_text)

# Pick a single sentence (the fifth one) to work with
tok_sent = tokenized[4]
print(tok_sent)

separator = '-'*30

# Split the sentence into word tokens
words = nltk.word_tokenize(tok_sent)
print(separator)
print(words)

# Attach a part-of-speech tag to every word
tagged = nltk.pos_tag(words)
print(separator)
print(tagged)


President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.
------------------------------
['President', 'George', 'W.', 'Bush', 'reacts', 'to', 'applause', 'during', 'his', 'State', 'of', 'the', 'Union', 'Address', 'at', 'the', 'Capitol', ',', 'Tuesday', ',', 'Jan', '.']
------------------------------
[('President', 'NNP'), ('George', 'NNP'), ('W.', 'NNP'), ('Bush', 'NNP'), ('reacts', 'VBZ'), ('to', 'TO'), ('applause', 'VB'), ('during', 'IN'), ('his', 'PRP$'), ('State', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('Union', 'NNP'), ('Address', 'NNP'), ('at', 'IN'), ('the', 'DT'), ('Capitol', 'NNP'), (',', ','), ('Tuesday', 'NNP'), (',', ','), ('Jan', 'NNP'), ('.', '.')]


### Chunking

Drawing chunked trees is hard in the Jupyter notebook environment...

In [4]:
# Relies on the variable "tagged" created in the section "Tagging part of speech"

# Chunk grammar written as a regular expression over part-of-speech tags
chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
chunkParser = nltk.RegexpParser(chunkGram)
chunked = chunkParser.parse(tagged)

# Tagged input first, then the resulting chunk tree
for item in (tagged, '-'*30, chunked):
    print(item)


[('President', 'NNP'), ('George', 'NNP'), ('W.', 'NNP'), ('Bush', 'NNP'), ('reacts', 'VBZ'), ('to', 'TO'), ('applause', 'VB'), ('during', 'IN'), ('his', 'PRP$'), ('State', 'NNP'), ('of', 'IN'), ('the', 'DT'), ('Union', 'NNP'), ('Address', 'NNP'), ('at', 'IN'), ('the', 'DT'), ('Capitol', 'NNP'), (',', ','), ('Tuesday', 'NNP'), (',', ','), ('Jan', 'NNP'), ('.', '.')]
------------------------------
(S
  (Chunk President/NNP George/NNP W./NNP Bush/NNP)
  reacts/VBZ
  to/TO
  applause/VB
  during/IN
  his/PRP$
  (Chunk State/NNP)
  of/IN
  the/DT
  (Chunk Union/NNP Address/NNP)
  at/IN
  the/DT
  (Chunk Capitol/NNP)
  ,/,
  (Chunk Tuesday/NNP)
  ,/,
  (Chunk Jan/NNP)
  ./.)


### Chinking

Similar to chunking but actually the opposite... fill

### Named entity recognition

Can spot stuff like

ORGANIZATION - Georgia-Pacific Corp., WHO
PERSON - Eddy Bonte, President Obama
LOCATION - Murray River, Mount Everest
DATE - June, 2008-06-29
TIME - two fifty a m, 1:30 p.m.
MONEY - 175 million Canadian Dollars, GBP 10.40
PERCENT - twenty pct, 18.75 %
FACILITY - Washington Monument, Stonehenge
GPE - South East Asia, Midlothian

### Word similarity metrics

#### Jaccard distance

In [14]:
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from nltk.corpus import words
import pandas as pd

correct_spellings = words.words()

gram_n = 2
entry = 'meanignful'

# Either correct_spellings or correct_spellings_redu can be used as the list
# of candidates. In the latter the choices are limited to words that start
# with the same letter as the entry word.
correct_spellings_redu = pd.Series(correct_spellings)
correct_spellings_redu = correct_spellings_redu[correct_spellings_redu.str.startswith(entry[0])]

# The n-grams of the entry do not depend on the candidate word, so they are
# built once instead of once per candidate
entry_ngrams = set(ngrams(entry, gram_n))

# (distance, word) pairs; min() compares the distance first, so it returns the
# closest candidate
distances = [
    (jaccard_distance(set(ngrams(word, gram_n)), entry_ngrams), word)
    for word in correct_spellings
]

min_dist = min(distances)
min_dist

(0.5, 'meaningful')

#### Edit distance (Levenshtein distance; Damerau–Levenshtein when `transpositions=True`)

In [13]:
from nltk.metrics.distance import edit_distance
from nltk.corpus import words
import pandas as pd

correct_spellings = words.words()

entry = 'meanignful'

# Either correct_spellings or correct_spellings_redu can be used as the list
# of candidates. In the latter the choices are limited to words that start
# with the same letter as the entry word.
correct_spellings_redu = pd.Series(correct_spellings)
same_initial = correct_spellings_redu.str.startswith(entry[0])
correct_spellings_redu = correct_spellings_redu[same_initial]

# (distance, word) pairs for every candidate; min() compares the distance first
distances = [(edit_distance(word, entry), word) for word in correct_spellings]

min_dist = min(distances)
min_dist

(2, 'meaningful')