In [None]:
# Chapter 3 Processing Raw Text

In [1]:
import nltk,re,pprint

In [2]:
from nltk import word_tokenize

In [None]:
# 3.1 Accessing Text from the Web and from Disk

In [3]:
from urllib import request

In [None]:
# Text number 1125 is “All's Well That Ends Well” by William Shakespeare, and we can access it as follows.

In [11]:
url = "http://www.gutenberg.org/files/1125/1125.txt"

In [12]:
response = request.urlopen(url)

In [13]:
raw = response.read().decode('utf8')

In [14]:
type(raw)

str

In [15]:
len(raw)

160953

In [16]:
raw[:204]

"\r\n**********************************************************************\r\nTHIS EBOOK WAS ONE OF PROJECT GUTENBERG'S EARLY FILES PRODUCED AT A\r\nTIME WHEN PROOFING METHODS AND TOOLS WERE NOT WELL DEVELOPED."

In [None]:
# The variable raw contains a string with 160,953 characters. This is the raw content of the book, including many details we are not interested in, such as whitespace, line breaks and blank lines.

In [None]:
# For our language processing, we want to break up the string into words and punctuation. This step is called tokenization, and it produces our familiar structure, a list of words and punctuation.

In [17]:
tokens = word_tokenize(raw)

In [18]:
type(tokens)

list

In [19]:
# The number of words in this text file

In [20]:
len(tokens)

32857

In [21]:
# The first twenty tokens (words and punctuation)

In [22]:
tokens[:20]

['**********************************************************************',
 'THIS',
 'EBOOK',
 'WAS',
 'ONE',
 'OF',
 'PROJECT',
 'GUTENBERG',
 "'S",
 'EARLY',
 'FILES',
 'PRODUCED',
 'AT',
 'A',
 'TIME',
 'WHEN',
 'PROOFING',
 'METHODS',
 'AND',
 'TOOLS']

In [None]:
# Dealing with HTML

In [None]:
# Much of the text on the web is in the form of HTML documents. 

In [None]:
# You can use a web browser to save a page as text to a local file, then access this as described in the section on files below

In [None]:
# The first step is the same as before, using urlopen. 

In [None]:
# For fun we'll pick a BBC News story called Blondes to die out in 200 years, an urban legend passed along by the BBC as established scientific fact:

In [23]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"

In [24]:
# Fetch the page inside a `with` block so the HTTP response is closed as soon
# as the body has been read, instead of being left open.
with request.urlopen(url) as page_response:
    # Decode with the charset announced in the Content-Type header when the
    # server sends one; otherwise fall back to UTF-8, as this cell did before.
    charset = page_response.headers.get_content_charset() or 'utf8'
    html = page_response.read().decode(charset)

In [25]:
html[:60]

'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

In [None]:
# You can type print(html) to see the HTML content in all its glory, including meta tags, an image map, JavaScript, forms, and tables.

In [26]:
print(html)

<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<title>BBC NEWS | Health | Blondes 'to die out in 200 years'</title>
<meta name="keywords" content="BBC, News, BBC News, news online, world, uk, international, foreign, british, online, service">
<meta name="OriginalPublicationDate" content="2002/09/27 11:51:55">
<meta name="UKFS_URL" content="/1/hi/health/2284783.stm">
<meta name="IFS_URL" content="/2/hi/health/2284783.stm">
<meta name="HTTP-EQUIV" content="text/html;charset=iso-8859-1">
<meta name="Headline" content="Blondes 'to die out in 200 years'">
<meta name="Section" content="Health">
<meta name="Description" content="Natural blondes are an endangered species and will die out by 2202, a study suggests.">
<!-- GENMaps-->
<map name="banner">
<area alt="BBC NEWS" coords="7,9,167,32" href="http://news.bbc.co.uk/1/hi.html" shape="RECT">
</map>

<script src="/nol/shared/js/livestats_v1_1.js" langua

In [None]:
# To get text out of HTML we will use a Python library called BeautifulSoup, available from http://www.crummy.com/software/BeautifulSoup/:

In [27]:
from bs4 import BeautifulSoup

In [28]:
raw = BeautifulSoup(html, 'html.parser').get_text()

In [29]:
tokens = word_tokenize(raw)

In [30]:
tokens

['BBC',
 'NEWS',
 '|',
 'Health',
 '|',
 'Blondes',
 "'to",
 'die',
 'out',
 'in',
 '200',
 "years'",
 'NEWS',
 'SPORT',
 'WEATHER',
 'WORLD',
 'SERVICE',
 'A-Z',
 'INDEX',
 'SEARCH',
 'You',
 'are',
 'in',
 ':',
 'Health',
 'News',
 'Front',
 'Page',
 'Africa',
 'Americas',
 'Asia-Pacific',
 'Europe',
 'Middle',
 'East',
 'South',
 'Asia',
 'UK',
 'Business',
 'Entertainment',
 'Science/Nature',
 'Technology',
 'Health',
 'Medical',
 'notes',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Talking',
 'Point',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Country',
 'Profiles',
 'In',
 'Depth',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Programmes',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'SERVICES',
 'Daily',
 'E-mail',
 'News',
 'Ticker',
 'Mobile/PDAs',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Text',
 'Only',
 'Feedback',
 'Help',
 'EDITIONS',
 'Change',
 'to',
 'UK',
 'Friday',
 ',',
 '27',
 'September',
 ',',
 '2002',
 ',',
 '11:51',
 'GMT',
 '12:51'

In [31]:
# This still contains unwanted material concerning site navigation and related stories. With some trial and error you can find the start and end indexes of the content and select the tokens of interest, and initialize a text as before.

In [32]:
# Keep only the article body. 110 and 390 are token offsets into the full
# token list, found by trial and error.
# NOTE(review): this rebinds `tokens` to a slice of itself, so re-running this
# cell without first re-running the tokenizing cell slices the already-trimmed
# list and yields different tokens.
tokens=tokens[110:390]

In [33]:
print(tokens)

['UK', 'Blondes', "'to", 'die', 'out', 'in', '200', "years'", 'Scientists', 'believe', 'the', 'last', 'blondes', 'will', 'be', 'in', 'Finland', 'The', 'last', 'natural', 'blondes', 'will', 'die', 'out', 'within', '200', 'years', ',', 'scientists', 'believe', '.', 'A', 'study', 'by', 'experts', 'in', 'Germany', 'suggests', 'people', 'with', 'blonde', 'hair', 'are', 'an', 'endangered', 'species', 'and', 'will', 'become', 'extinct', 'by', '2202', '.', 'Researchers', 'predict', 'the', 'last', 'truly', 'natural', 'blonde', 'will', 'be', 'born', 'in', 'Finland', '-', 'the', 'country', 'with', 'the', 'highest', 'proportion', 'of', 'blondes', '.', 'The', 'frequency', 'of', 'blondes', 'may', 'drop', 'but', 'they', 'wo', "n't", 'disappear', 'Prof', 'Jonathan', 'Rees', ',', 'University', 'of', 'Edinburgh', 'But', 'they', 'say', 'too', 'few', 'people', 'now', 'carry', 'the', 'gene', 'for', 'blondes', 'to', 'last', 'beyond', 'the', 'next', 'two', 'centuries', '.', 'The', 'problem', 'is', 'that', 'b

In [None]:
# If we now take the further step of creating an NLTK text from this list, we can carry out all of the other linguistic processing we saw in Chapter 1, along with the regular list operations like slicing.

In [34]:
text = nltk.Text(tokens)

In [35]:
text.concordance('gene')

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin


In [None]:
# Exercise 1: Try to retrieve some text from any web page in the form of HTML documents by using the above code.

In [63]:
url = "https://en.wikipedia.org/wiki/Wiki"

In [64]:
# Fetch the page inside a `with` block so the HTTP response is closed as soon
# as the body has been read, instead of being left open.
with request.urlopen(url) as page_response:
    # Decode with the charset announced in the Content-Type header when the
    # server sends one; otherwise fall back to UTF-8, as this cell did before.
    charset = page_response.headers.get_content_charset() or 'utf8'
    html = page_response.read().decode(charset)

In [65]:
raw = BeautifulSoup(html, 'html.parser').get_text()

In [66]:
tokens = word_tokenize(raw)

In [67]:
tokens

['Wiki',
 '-',
 'Wikipedia',
 'document.documentElement.className=',
 "''",
 'client-js',
 "''",
 ';',
 'RLCONF=',
 '{',
 '``',
 'wgBreakFrames',
 "''",
 ':',
 '!',
 '1',
 ',',
 "''",
 'wgSeparatorTransformTable',
 "''",
 ':',
 '[',
 '``',
 "''",
 ',',
 "''",
 "''",
 ']',
 ',',
 "''",
 'wgDigitTransformTable',
 "''",
 ':',
 '[',
 '``',
 "''",
 ',',
 "''",
 "''",
 ']',
 ',',
 "''",
 'wgDefaultDateFormat',
 "''",
 ':',
 "''",
 'dmy',
 "''",
 ',',
 "''",
 'wgMonthNames',
 "''",
 ':',
 '[',
 '``',
 "''",
 ',',
 "''",
 'January',
 "''",
 ',',
 "''",
 'February',
 "''",
 ',',
 "''",
 'March',
 "''",
 ',',
 "''",
 'April',
 "''",
 ',',
 "''",
 'May',
 "''",
 ',',
 "''",
 'June',
 "''",
 ',',
 "''",
 'July',
 "''",
 ',',
 "''",
 'August',
 "''",
 ',',
 "''",
 'September',
 "''",
 ',',
 "''",
 'October',
 "''",
 ',',
 "''",
 'November',
 "''",
 ',',
 "''",
 'December',
 "''",
 ']',
 ',',
 "''",
 'wgRequestId',
 "''",
 ':',
 "''",
 '5cc695b1-a631-4797-9c55-8806dd93497a',
 "''",
 ',',
 "''",
 'wg

In [68]:
start = tokens.index("articles")

In [69]:
start

327

In [70]:
end = tokens.index("ready")

In [71]:
end

694

In [72]:
# Slice with the `start` and `end` positions computed in the cells above
# rather than re-typing them as literals: the page is fetched live, so the
# offsets change whenever its content does. `end + 1` keeps the "ready" token
# itself inside the slice.
tokens = tokens[start:end + 1]

In [73]:
tokens

['articles',
 'needing',
 'additional',
 'references',
 "''",
 ',',
 "''",
 'All',
 'articles',
 'needing',
 'examples',
 "''",
 ',',
 "''",
 'Articles',
 'needing',
 'examples',
 'from',
 'August',
 '2018',
 "''",
 ',',
 "''",
 'All',
 'articles',
 'with',
 'unsourced',
 'statements',
 "''",
 ',',
 "''",
 'Articles',
 'with',
 'unsourced',
 'statements',
 'from',
 'July',
 '2013',
 "''",
 ',',
 "''",
 'Articles',
 'with',
 'unsourced',
 'statements',
 'from',
 'April',
 '2020',
 "''",
 ',',
 "''",
 'All',
 'articles',
 'lacking',
 'reliable',
 'references',
 "''",
 ',',
 "''",
 'Articles',
 'lacking',
 'reliable',
 'references',
 'from',
 'July',
 '2013',
 "''",
 ',',
 "''",
 'Wikipedia',
 'articles',
 'in',
 'need',
 'of',
 'updating',
 'from',
 'July',
 '2013',
 "''",
 ',',
 "''",
 'All',
 'Wikipedia',
 'articles',
 'in',
 'need',
 'of',
 'updating',
 "''",
 ',',
 "''",
 'Spoken',
 'articles',
 "''",
 ',',
 "''",
 'Articles',
 'with',
 'Curlie',
 'links',
 "''",
 ',',
 "''",
 'Wikip

In [None]:
# Reading Local Files

In [None]:
# In order to read a local file, we need to use Python's built-in open() function, followed by the read() method. 

In [None]:
# Suppose you have a file document.txt, you can load its contents like this

In [74]:
f=open('document.txt')

In [None]:
# The read() method creates a string with the contents of the entire file:

In [75]:
f.read()

'Time flies like an arrow.\nFruit flies like a banana.\nWhat are you doing?\nWhy are you late?\nWhich is the best one?'

In [None]:
# Recall that the '\n' characters are newlines; this is equivalent to pressing Enter on a keyboard and starting a new line.

In [None]:
# We use the strip() method to remove the newline character at the end of the input line.

In [76]:
# Open the file in a `with` block so it is closed once the loop finishes,
# rather than leaving the handle open for the rest of the session.
with open('document.txt') as f:
    for line in f:
        # strip() removes surrounding whitespace, including the trailing
        # newline, so print() does not emit a blank line after each row.
        print(line.strip())

Time flies like an arrow.
Fruit flies like a banana.
What are you doing?
Why are you late?
Which is the best one?


In [None]:
# NLTK's corpus files can also be accessed using these methods. We simply have to use nltk.data.find() to get the filename for any corpus item. Then we can open and read it in the way we just demonstrated above:

In [77]:
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')

In [78]:
# Read the whole corpus file into one string; the `with` block closes the
# file handle as soon as the contents have been read.
with open(path) as corpus_file:
    raw = corpus_file.read()

In [79]:
len(raw)

1220066

In [80]:
tokens = word_tokenize(raw)

In [81]:
print(tokens[:30])

['[', 'Moby', 'Dick', 'by', 'Herman', 'Melville', '1851', ']', 'ETYMOLOGY', '.', '(', 'Supplied', 'by', 'a', 'Late', 'Consumptive', 'Usher', 'to', 'a', 'Grammar', 'School', ')', 'The', 'pale', 'Usher', '--', 'threadbare', 'in', 'coat', ',']


In [None]:
# Capturing User Input

In [None]:
# Sometimes we want to capture the text that a user inputs when she is interacting with our program. To prompt the user to type a line of input, call the Python function input(). After saving the input to a variable, we can manipulate it just as we have done for other strings.


In [83]:
s = input("Enter some text: ")

Enter some text: How are you doing?


In [84]:
print("You typed", len(word_tokenize(s)), "words.")

You typed 5 words.


In [85]:
# The NLP Pipeline (See Slides)

In [86]:
# When we tokenize a string we produce a list (of words), and this is Python's <list> type. Normalizing and sorting lists produces other lists:

In [87]:
# Read the whole file into one string; the `with` block closes the file
# handle as soon as the contents have been read.
with open('document.txt') as document_file:
    raw = document_file.read()

In [88]:
tokens=word_tokenize(raw)

In [89]:
tokens

['Time',
 'flies',
 'like',
 'an',
 'arrow',
 '.',
 'Fruit',
 'flies',
 'like',
 'a',
 'banana',
 '.',
 'What',
 'are',
 'you',
 'doing',
 '?',
 'Why',
 'are',
 'you',
 'late',
 '?',
 'Which',
 'is',
 'the',
 'best',
 'one',
 '?']

In [90]:
 words = [w.lower() for w in tokens]

In [91]:
words

['time',
 'flies',
 'like',
 'an',
 'arrow',
 '.',
 'fruit',
 'flies',
 'like',
 'a',
 'banana',
 '.',
 'what',
 'are',
 'you',
 'doing',
 '?',
 'why',
 'are',
 'you',
 'late',
 '?',
 'which',
 'is',
 'the',
 'best',
 'one',
 '?']

In [92]:
vocab=sorted(set(words))

In [93]:
vocab

['.',
 '?',
 'a',
 'an',
 'are',
 'arrow',
 'banana',
 'best',
 'doing',
 'flies',
 'fruit',
 'is',
 'late',
 'like',
 'one',
 'the',
 'time',
 'what',
 'which',
 'why',
 'you']

In [None]:
# The type of an object determines what operations you can perform on it. So, for example, we can append to a list but not to a string:

In [94]:
vocab.append('blog')

In [95]:
vocab

['.',
 '?',
 'a',
 'an',
 'are',
 'arrow',
 'banana',
 'best',
 'doing',
 'flies',
 'fruit',
 'is',
 'late',
 'like',
 'one',
 'the',
 'time',
 'what',
 'which',
 'why',
 'you',
 'blog']

In [None]:
# 3.2 Strings: Text Processing at the Lowest Level

In [None]:
# In earlier chapters we focused on a text as a list of words. 

In [None]:
# We didn't look too closely at words and how they are handled in the programming language. 

In [None]:
# The contents of a word, and of a file, are represented by programming languages as a fundamental data type known as a string. 

In [None]:
# In this section we explore strings in detail, and show the connection between strings, words, texts and files.

In [None]:
# Basic Operations with Strings

In [None]:
# Strings are specified using single quotes [1] or double quotes [2], as shown below. If a string contains a single quote, we must backslash-escape the quote [3] so Python knows a literal quote character is intended, or else put the string in double quotes [2]. Otherwise, the quote inside the string [4] will be interpreted as a close quote, and the Python interpreter will report a syntax error:

In [96]:
monty = 'Monty Python' #1

In [97]:
monty

'Monty Python'

In [98]:
circus = "Monty Python's Flying Circus" #2

In [99]:
circus

"Monty Python's Flying Circus"

In [100]:
circus='Monty Python\'s Flying Circus'# 3

In [101]:
circus

"Monty Python's Flying Circus"

In [107]:
# [4] Writing the apostrophe unescaped inside single quotes, i.e. form #3
# without its backslash, would end the string early and Python would report a
# syntax error. NOTE(review): the apostrophe appears to have been dropped here
# so the cell still runs; as a result circus now holds 'Monty Pythons Flying
# Circus', without the apostrophe.
circus='Monty Pythons Flying Circus'  #4

In [108]:
# Sometimes strings go over several lines. Python provides us with various ways of entering them. In the next example, a sequence of two strings is joined into a single string. We need to use backslash [1] or parentheses [2] so that the interpreter knows that the statement is not complete after the first line.

In [109]:
couplet = "Shall I compare thee to a Summer's day?"\
         "Thou are more lovely and more temperate:"

In [110]:
print(couplet)

Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:


In [111]:
couplet=("Rough winds do shake the darling buds of May,"
        "And Summer's lease hath all too short a date:")

In [112]:
print(couplet)

Rough winds do shake the darling buds of May,And Summer's lease hath all too short a date:


In [113]:
# Unfortunately the above methods do not give us a newline between the two lines of the sonnet. Instead, we can use a triple-quoted string as follows:

In [114]:
couplet ="""Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:"""

In [115]:
print(couplet)

Shall I compare thee to a Summer's day?
Thou are more lovely and more temperate:


In [116]:
couplet = '''Rough winds do shake the darling buds of May,
And Summer's lease hath all too short a date:'''

In [117]:
print(couplet)

Rough winds do shake the darling buds of May,
And Summer's lease hath all too short a date:


In [118]:
# Now that we can define strings, we can try some simple operations on them. First let's look at the + operation, known as concatenation [1]. It produces a new string that is a copy of the two original strings pasted together end-to-end. Notice that concatenation doesn't do anything clever like insert a space between the words. We can even multiply strings:

In [119]:
'very' + 'very' + 'very'

'veryveryvery'

In [120]:
'very' * 3

'veryveryvery'

In [121]:
# Exercise 2. Try running the following code, then try to use your understanding of the string + and * operations to figure out how it works. Be careful to distinguish between the string ' ', which is a single whitespace character, and '', which is the empty string.

In [122]:
a = [1, 2, 3, 4, 5, 6, 7, 6, 5, 4, 3, 2, 1]

In [123]:
b = [' ' * 2 * (7 - i) + 'very' * i for i in a]

In [124]:
for line in b:
    print(line)

            very
          veryvery
        veryveryvery
      veryveryveryvery
    veryveryveryveryvery
  veryveryveryveryveryvery
veryveryveryveryveryveryvery
  veryveryveryveryveryvery
    veryveryveryveryvery
      veryveryveryvery
        veryveryvery
          veryvery
            very


In [None]:
## This prints a diamond shape: for each i in a, the line consists of 2 * (7 - i) spaces of indentation followed by i copies of 'very'. Because a counts up from 1 to 7 and then back down to 1, the lines widen until the middle row and then narrow again.

In [125]:
# We've seen that the addition and multiplication operations apply to strings, not just numbers. However, note that we cannot use subtraction or division with strings:

In [126]:
'very'-'y'

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [127]:
'very'/2

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [None]:
## Strings do not support subtraction or division: both 'very' - 'y' and 'very' / 2 raise a TypeError.

In [128]:
# Printing Strings

In [None]:
# So far, when we have wanted to look at the contents of a variable or see the result of a calculation, we have just typed the variable name into the interpreter. We can also see the contents of a variable using the print statement:

In [129]:
print(monty)

Monty Python


In [130]:
# Notice that there are no quotation marks this time. When we inspect a variable by typing its name in the interpreter, the interpreter prints the Python representation of its value. Since it's a string, the result is quoted. However, when we tell the interpreter to print the contents of the variable, we don't see quotation characters since there are none inside the string.

In [131]:
# The print statement allows us to display more than one item on a line in various ways, as shown below:

In [132]:
grail = 'Holy Grail'

In [133]:
print(monty+grail)

Monty PythonHoly Grail


In [134]:
print(monty,grail)

Monty Python Holy Grail


In [135]:
print(monty,'and the',grail)

Monty Python and the Holy Grail


In [136]:
# Strings are indexed, starting from zero. When we index a string, we get one of its characters (or letters). A single character is nothing special — it's just a string of length 1.

In [137]:
# See examples in slides

In [138]:
monty[0]

'M'

In [139]:
monty[3]

't'

In [140]:
monty[5]

' '

In [141]:
# if we try to access an index that is outside of the string we get an error:

In [142]:
monty[20]

IndexError: string index out of range

In [143]:
# Again as with lists, we can use negative indexes for strings, where -1 is the index of the last character [1]. Positive and negative indexes give us two ways to refer to any position in a string. In this case, when the string had a length of 12, indexes 5 and -7 both refer to the same character (a space). (Notice that 5 = len(monty) - 7.)



In [144]:
monty[-1]

'n'

In [145]:
monty[5]

' '

In [146]:
monty[-7]

' '

In [147]:
# We can write for loops to iterate over the characters in strings. This print function includes the optional end=' ' parameter, which is how we tell Python to print a space instead of a newline at the end.

In [148]:
sent = 'colorless green ideas sleep furiously'

In [149]:
for char in sent:
    print(char, end=' ')

c o l o r l e s s   g r e e n   i d e a s   s l e e p   f u r i o u s l y 

In [150]:
# We can count individual characters as well. We should ignore the case distinction by normalizing everything to lowercase, and filter out non-alphabetic characters:

In [151]:
from nltk.corpus import gutenberg

In [152]:
raw = gutenberg.raw('melville-moby_dick.txt')

In [153]:
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())

In [154]:
fdist.most_common(5)

[('e', 117092), ('t', 87996), ('a', 77916), ('o', 69326), ('n', 65617)]

In [155]:
[char for (char, count) in fdist.most_common()]

['e',
 't',
 'a',
 'o',
 'n',
 'i',
 's',
 'h',
 'r',
 'l',
 'd',
 'u',
 'm',
 'c',
 'w',
 'f',
 'g',
 'p',
 'b',
 'y',
 'v',
 'k',
 'q',
 'j',
 'x',
 'z']

In [156]:
# Accessing Substrings (See Slides)

In [157]:
# For example, the following code accesses the substring starting at index 6, up to (but not including) index 10:

In [158]:
monty[6:10]

'Pyth'

In [159]:
# Here we see the characters are 'P', 'y', 't', and 'h' which correspond to monty[6] ... monty[9] but not monty[10]. This is because a slice starts at the first index but finishes one before the end index.
# We can also slice with negative indexes — the same basic rule of starting from the start index and stopping one before the end index applies; here we stop before the space character.

In [160]:
monty[-12:-7]

'Monty'

In [161]:
monty[:5]

'Monty'

In [162]:
monty[6:]

'Python'

In [163]:
# We test if a string contains a particular substring using the in operator, as follows:

In [164]:
phrase = 'And now for something completely different'

In [165]:
if 'thing' in phrase:
    print('found "thing"')

found "thing"


In [166]:
monty.find('Python')

6

In [None]:
# Exercise 3: Make up a sentence and assign it to a variable, e.g. sent = 'my sentence...'. Now write slice expressions to pull out individual words. (This is obviously not a convenient way to process the words of a text!)

In [167]:
sent = "What a loevely day"

In [168]:
sent[0:4]

'What'

In [169]:
# The word 'a' occupies index 5 only; stop at 6 so the space after it is
# not included in the slice.
sent[5:6]

'a'

In [170]:
# The third word (spelled 'loevely' in sent) spans indexes 7 through 13, so
# the slice runs from 7 up to, but not including, 14.
sent[7:14]

'loevely'

In [171]:
# The last word 'day' spans indexes 15 through 17; the end index 18 is
# len(sent).
sent[15:18]

'day'

In [172]:
# The same last word via an open-ended slice: omitting the end index runs to
# the end of the string, so no end index past len(sent) has to be guessed.
sent[15:]

'day'

In [173]:
# More operations on strings

In [174]:
#Method  	Functionality
#s.find(t)	index of first instance of string t inside s (-1 if not found)
#s.rfind(t)	index of last instance of string t inside s (-1 if not found)
#s.index(t)	like s.find(t) except it raises ValueError if not found
#s.rindex(t)	like s.rfind(t) except it raises ValueError if not found
#s.join(text)	combine the words of the text into a string using s as the glue
#s.split(t)	split s into a list wherever a t is found (whitespace by default)
#s.splitlines()	split s into a list of strings, one per line
#s.lower()	a lowercased version of the string s
#s.upper()	an uppercased version of the string s
#s.title()	a titlecased version of the string s
#s.strip()	a copy of s without leading or trailing whitespace
#s.replace(t, u)	replace instances of t with u inside s

In [175]:
# The Difference Between Lists and Strings

In [176]:
# Strings and lists are both kinds of sequence. We can pull them apart by indexing and slicing them, and we can join them together by concatenating them. However, we cannot join strings and lists:

In [177]:
query = 'Who knows?'

In [178]:
beatles=['John', 'Paul', 'George', 'Ringo']

In [179]:
query[2]

'o'

In [180]:
beatles[2]

'George'

In [181]:
query[:2]

'Wh'

In [182]:
beatles[:2]

['John', 'Paul']

In [183]:
# Strings and lists are both kinds of sequence. We can pull them apart by indexing and slicing them, and we can join them together by concatenating them. However, we cannot join strings and lists:

In [184]:
query + " I don't"

"Who knows? I don't"

In [185]:
beatles + 'Brian'

TypeError: can only concatenate list (not "str") to list

In [186]:
beatles + ['Brian']

['John', 'Paul', 'George', 'Ringo', 'Brian']

In [187]:
# Lists and strings do not have exactly the same functionality. Lists have the added power that you can change their elements:

In [188]:
beatles[0] = "John Lennon"

In [189]:
del beatles[-1]

In [190]:
beatles

['John Lennon', 'Paul', 'George']

In [191]:
query[0]='F'

TypeError: 'str' object does not support item assignment

In [192]:
# This is because strings are immutable — you can't change a string once you have created it. However, lists are mutable, and their contents can be modified at any time. As a result, lists support operations that modify the original value rather than producing a new value.

In [193]:
# 3.3   Regular Expressions for Detecting Word Patterns


In [194]:
# Many linguistic processing tasks involve pattern matching. For example, we can find words ending with ed using endswith('ed'). We saw a variety of such "word tests" in 4.2. Regular expressions give us a more powerful and flexible method for describing the character patterns we are interested in.



In [195]:
# To use regular expressions in Python we need to import the re library using: import re. We also need a list of words to search; we'll use the Words Corpus again (4). We will preprocess it to remove any proper names.

In [196]:
import re

In [197]:
wordlist=[w for w in nltk.corpus.words.words('en') if w.islower()]

In [198]:
# Let's find words ending with ed using the regular expression «ed$». We will use the re.search(p, s) function to check whether the pattern p can be found somewhere inside the string s. We need to specify the characters of interest, and use the dollar sign which has a special behavior in the context of regular expressions in that it matches the end of the word:

In [199]:
[w for w in wordlist if re.search('ed$', w)]

['abaissed',
 'abandoned',
 'abased',
 'abashed',
 'abatised',
 'abed',
 'aborted',
 'abridged',
 'abscessed',
 'absconded',
 'absorbed',
 'abstracted',
 'abstricted',
 'accelerated',
 'accepted',
 'accidented',
 'accoladed',
 'accolated',
 'accomplished',
 'accosted',
 'accredited',
 'accursed',
 'accused',
 'accustomed',
 'acetated',
 'acheweed',
 'aciculated',
 'aciliated',
 'acknowledged',
 'acorned',
 'acquainted',
 'acquired',
 'acquisited',
 'acred',
 'aculeated',
 'addebted',
 'added',
 'addicted',
 'addlebrained',
 'addleheaded',
 'addlepated',
 'addorsed',
 'adempted',
 'adfected',
 'adjoined',
 'admired',
 'admitted',
 'adnexed',
 'adopted',
 'adossed',
 'adreamed',
 'adscripted',
 'aduncated',
 'advanced',
 'advised',
 'aeried',
 'aethered',
 'afeared',
 'affected',
 'affectioned',
 'affined',
 'afflicted',
 'affricated',
 'affrighted',
 'affronted',
 'aforenamed',
 'afterfeed',
 'aftershafted',
 'afterthoughted',
 'afterwitted',
 'agazed',
 'aged',
 'agglomerated',
 'aggri

In [200]:
# The . wildcard symbol matches any single character. Suppose we have room in a crossword puzzle for an 8-letter word with j as its third letter and t as its sixth letter. In place of each blank cell we use a period:

In [201]:
[w for w in wordlist if re.search('^..j..t..$', w)]

['abjectly',
 'adjuster',
 'dejected',
 'dejectly',
 'injector',
 'majestic',
 'objectee',
 'objector',
 'rejecter',
 'rejector',
 'unjilted',
 'unjolted',
 'unjustly']

In [202]:
 # Exercise 4: The caret symbol ^ matches the start of a string, just like the $ matches the end. What results do we get with the above example if we leave out both of these, and search for «..j..t..»?

In [207]:
[w for w in wordlist if re.search('..j..t..', w)]

['abjectedness',
 'abjection',
 'abjective',
 'abjectly',
 'abjectness',
 'adjection',
 'adjectional',
 'adjectival',
 'adjectivally',
 'adjective',
 'adjectively',
 'adjectivism',
 'adjectivitis',
 'adjustable',
 'adjustably',
 'adjustage',
 'adjustation',
 'adjuster',
 'adjustive',
 'adjustment',
 'antejentacular',
 'antiprojectivity',
 'bijouterie',
 'coadjustment',
 'cojusticiar',
 'conjective',
 'conjecturable',
 'conjecturably',
 'conjectural',
 'conjecturalist',
 'conjecturality',
 'conjecturally',
 'conjecture',
 'conjecturer',
 'coprojector',
 'counterobjection',
 'dejected',
 'dejectedly',
 'dejectedness',
 'dejectile',
 'dejection',
 'dejectly',
 'dejectory',
 'dejecture',
 'disjection',
 'guanajuatite',
 'inadjustability',
 'inadjustable',
 'injectable',
 'injection',
 'injector',
 'injustice',
 'insubjection',
 'interjection',
 'interjectional',
 'interjectionalize',
 'interjectionally',
 'interjectionary',
 'interjectionize',
 'interjectiveness',
 'interjector',
 'interje

In [203]:
# Without the ^ and $ anchors the pattern can match anywhere inside a word: j no longer has to be the third letter, t no longer has to be the sixth letter, and the word can be any length of at least eight characters.

In [204]:
# Finally, the ? symbol specifies that the previous character is optional. Thus «^e-?mail$» will match both email and e-mail. We could count the total number of occurrences of this word (in either spelling) in a text using sum(1 for w in text if re.search('^e-?mail$', w)).

In [205]:
[w for w in wordlist if re.search('^e-?mail$', w)]

[]

In [206]:
sum(1 for w in wordlist if re.search('^e-?mail$', w))

0

In [None]:
#Basic Regular Expression Meta-Characters, Including Wildcards, Ranges and Closures

In [None]:
#Operator	Behavior
#.	        Wildcard, matches any character
#^abc	    Matches some pattern abc at the start of a string
#abc$	    Matches some pattern abc at the end of a string
#[abc]	    Matches one of a set of characters
#[A-Z0-9]	Matches one of a range of characters
#ed|ing|s	Matches one of the specified strings (disjunction)
#*	        Zero or more of previous item, e.g. a*, [a-z]* (also known as Kleene Closure)
#+	        One or more of previous item, e.g. a+, [a-z]+
#?	        Zero or one of the previous item (i.e. optional), e.g. a?, [a-z]?
#{n}	    Exactly n repeats where n is a non-negative integer
#{n,}	    At least n repeats
#{,n}	    No more than n repeats
#{m,n}	    At least m and no more than n repeats
#a(b|c)+	Parentheses that indicate the scope of the operators

In [None]:
# Ranges and Closures

In [None]:
# The T9 system is used for entering text on mobile phones (see 3.5). Two or more words that are entered with the same sequence of keystrokes are known as textonyms. For example, both hole and golf are entered by pressing the sequence 4653. What other words could be produced with the same sequence? Here we use the regular expression «^[ghi][mno][jlk][def]$»:

In [208]:
[w for w in wordlist if re.search('^[ghi][mno][jlk][def]$', w)]

['gold', 'golf', 'hold', 'hole']

In [None]:
# Exercise 5: Look for some "finger-twisters", by searching for words that only use part of the number-pad. For example «^[ghijklmno]+$», or more concisely, «^[g-o]+$», will match words that only use keys 4, 5, 6 in the center row, and «^[a-fj-o]+$» will match words that use keys 2, 3, 5, 6 in the top-right corner. What do - and + mean?

In [212]:
[w for w in wordlist if re.search('^[g-o]+$', w)]

['g',
 'ghoom',
 'gig',
 'giggling',
 'gigolo',
 'gilim',
 'gill',
 'gilling',
 'gilo',
 'gim',
 'gin',
 'ging',
 'gingili',
 'gink',
 'ginkgo',
 'ginning',
 'gio',
 'glink',
 'glom',
 'glonoin',
 'gloom',
 'glooming',
 'gnomon',
 'go',
 'gog',
 'gogo',
 'goi',
 'going',
 'gol',
 'goli',
 'gon',
 'gong',
 'gonion',
 'goo',
 'googol',
 'gook',
 'gool',
 'goon',
 'h',
 'hi',
 'high',
 'hill',
 'him',
 'hin',
 'hing',
 'hinoki',
 'ho',
 'hog',
 'hoggin',
 'hogling',
 'hoi',
 'hoin',
 'holing',
 'holl',
 'hollin',
 'hollo',
 'hollong',
 'holm',
 'homo',
 'homologon',
 'hong',
 'honk',
 'hook',
 'hoon',
 'i',
 'igloo',
 'ihi',
 'ilk',
 'ill',
 'imi',
 'imino',
 'immi',
 'in',
 'ing',
 'ingoing',
 'inion',
 'ink',
 'inkling',
 'inlook',
 'inn',
 'inning',
 'io',
 'ion',
 'j',
 'jhool',
 'jig',
 'jing',
 'jingling',
 'jingo',
 'jinjili',
 'jink',
 'jinn',
 'jinni',
 'jo',
 'jog',
 'johnin',
 'join',
 'joining',
 'joll',
 'joom',
 'k',
 'kiki',
 'kil',
 'kilhig',
 'kilim',
 'kill',
 'killing',

In [None]:
# Let's explore the + symbol a bit further. Notice that it can be applied to individual letters, or to bracketed sets of letters:

In [213]:
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))

In [214]:
[w for w in chat_words if re.search('^m+i+n+e+$', w)]

['miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
 'miiiiiinnnnnnnnnneeeeeeee',
 'mine',
 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee']

In [211]:
[w for w in chat_words if re.search('^[ha]+$', w)]

['a',
 'aaaaaaaaaaaaaaaaa',
 'aaahhhh',
 'ah',
 'ahah',
 'ahahah',
 'ahh',
 'ahhahahaha',
 'ahhh',
 'ahhhh',
 'ahhhhhh',
 'ahhhhhhhhhhhhhh',
 'h',
 'ha',
 'haaa',
 'hah',
 'haha',
 'hahaaa',
 'hahah',
 'hahaha',
 'hahahaa',
 'hahahah',
 'hahahaha',
 'hahahahaaa',
 'hahahahahaha',
 'hahahahahahaha',
 'hahahahahahahahahahahahahahahaha',
 'hahahhahah',
 'hahhahahaha']

In [None]:
# Kleene closures (*)

In [None]:
# It should be clear that + simply means "one or more instances of the preceding item", which could be an individual character like m, a set like [fed] or a range like [d-f]. 

In [None]:
# Now let's replace + with *, which means "zero or more instances of the preceding item". 

In [None]:
# The regular expression «^m*i*n*e*$» will match everything that we found using «^m+i+n+e+$», but also words where some of the letters don't appear at all, e.g. me, min, and mmmmm. 

In [None]:
# Note that the + and * symbols are sometimes referred to as Kleene closures, or simply closures.

In [215]:
[w for w in chat_words if re.search('^m*i*n*e*$', w)]

['',
 'e',
 'i',
 'in',
 'm',
 'me',
 'meeeeeeeeeeeee',
 'mi',
 'miiiiiiiiiiiiinnnnnnnnnnneeeeeeeeee',
 'miiiiiinnnnnnnnnneeeeeeee',
 'min',
 'mine',
 'mm',
 'mmm',
 'mmmm',
 'mmmmm',
 'mmmmmm',
 'mmmmmmmmiiiiiiiiinnnnnnnnneeeeeeee',
 'mmmmmmmmmm',
 'mmmmmmmmmmmmm',
 'mmmmmmmmmmmmmm',
 'n',
 'ne']

In [216]:
# The ^ operator has another function when it appears as the first character inside square brackets. For example «[^aeiouAEIOU]» matches any character other than a vowel. We can search the NPS Chat Corpus for words that are made up entirely of non-vowel characters using «^[^aeiouAEIOU]+$» to find items like these: :):):), grrr, cyb3r and zzzzzzzz. Notice this includes non-alphabetic characters.

In [217]:
[w for w in chat_words if re.search('^[^aeiouAEIOU]+$', w)]

['!',
 '!!',
 '!!!',
 '!!!!',
 '!!!!!',
 '!!!!!!',
 '!!!!!!!',
 '!!!!!!!!',
 '!!!!!!!!!',
 '!!!!!!!!!!',
 '!!!!!!!!!!!',
 '!!!!!!!!!!!!!',
 '!!!!!!!!!!!!!!!!',
 '!!!!!!!!!!!!!!!!!!!!!!',
 '!!!!!!!!!!!!!!!!!!!!!!!',
 '!!!!!!!!!!!!!!!!!!!!!!!!!!!',
 '!!!!!!!!!!!!!!!!!!!!!!!!!!!!',
 '!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!',
 '!!!!!!.',
 '!!!!!.',
 '!!!!....',
 '!!!.',
 '!!.',
 '!!...',
 '!.',
 '!...',
 '!=',
 '!?',
 '!??',
 '!???',
 '"',
 '"...',
 '"?',
 '"s',
 '#',
 '###',
 '####',
 '$',
 '$$',
 '$27',
 '&',
 '&^',
 "'",
 "''",
 "'.",
 "'d",
 "'ll",
 "'m",
 "'n'",
 "'s",
 '(',
 '(((',
 '((((',
 '(((((',
 '((((((',
 '(((((((',
 '((((((((',
 '(((((((((',
 '((((((((((',
 '(((((((((((',
 '((((((((((((',
 '(((((((((((((',
 '((((((((((((((',
 '(((((((((((((((',
 '(((((((((((((((((',
 '((((((((((((((((((',
 '((((((((((((((((((((',
 '(((((((((((((((((((((',
 '(((((((((((((((((((((((',
 '((((((((((((((((((((((((',
 '(((((((((((((((((((((((((',
 '((((((((((((((((((((((((((',
 '((((

In [None]:
# Here are some more examples of regular expressions being used to find tokens that match a particular pattern, illustrating the use of some new symbols: \, {}, (), and |:

In [None]:
# You probably worked out that a backslash means that the following character is deprived of its special powers and must literally match a specific character in the word. Thus, while . is special, \. only matches a period. 

In [None]:
# The braced expressions, like {3,5}, specify the number of repeats of the previous item. 

In [None]:
# The pipe character indicates a choice between the material on its left or its right. 

In [None]:
# Parentheses indicate the scope of an operator: they can be used together with the pipe (or disjunction) symbol like this: «w(i|e|ai|oo)t», matching wit, wet, wait, and woot. 

In [218]:
wsj = sorted(set(nltk.corpus.treebank.words()))

In [219]:
# Raw string: the backslash in \. must reach the re module untouched so that it
# matches a literal period; in a plain string literal "\." is an invalid escape.
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)]

['0.0085',
 '0.05',
 '0.1',
 '0.16',
 '0.2',
 '0.25',
 '0.28',
 '0.3',
 '0.4',
 '0.5',
 '0.50',
 '0.54',
 '0.56',
 '0.60',
 '0.7',
 '0.82',
 '0.84',
 '0.9',
 '0.95',
 '0.99',
 '1.01',
 '1.1',
 '1.125',
 '1.14',
 '1.1650',
 '1.17',
 '1.18',
 '1.19',
 '1.2',
 '1.20',
 '1.24',
 '1.25',
 '1.26',
 '1.28',
 '1.35',
 '1.39',
 '1.4',
 '1.457',
 '1.46',
 '1.49',
 '1.5',
 '1.50',
 '1.55',
 '1.56',
 '1.5755',
 '1.5805',
 '1.6',
 '1.61',
 '1.637',
 '1.64',
 '1.65',
 '1.7',
 '1.75',
 '1.76',
 '1.8',
 '1.82',
 '1.8415',
 '1.85',
 '1.8500',
 '1.9',
 '1.916',
 '1.92',
 '10.19',
 '10.2',
 '10.5',
 '107.03',
 '107.9',
 '109.73',
 '11.10',
 '11.5',
 '11.57',
 '11.6',
 '11.72',
 '11.95',
 '112.9',
 '113.2',
 '116.3',
 '116.4',
 '116.7',
 '116.9',
 '118.6',
 '12.09',
 '12.5',
 '12.52',
 '12.68',
 '12.7',
 '12.82',
 '12.97',
 '120.7',
 '1206.26',
 '121.6',
 '126.1',
 '126.15',
 '127.03',
 '129.91',
 '13.1',
 '13.15',
 '13.5',
 '13.50',
 '13.625',
 '13.65',
 '13.73',
 '13.8',
 '13.90',
 '130.6',
 '130.7',
 '

In [220]:
# Raw string: \$ is a literal dollar sign for the re module (the final $ is the
# end-of-string anchor); in a plain string literal "\$" is an invalid escape.
[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]

['C$', 'US$']

In [221]:
[w for w in wsj if re.search('^[0-9]{4}$', w)]

['1614',
 '1637',
 '1787',
 '1901',
 '1903',
 '1917',
 '1925',
 '1929',
 '1933',
 '1934',
 '1948',
 '1953',
 '1955',
 '1956',
 '1961',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1971',
 '1972',
 '1973',
 '1975',
 '1976',
 '1977',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2005',
 '2009',
 '2017',
 '2019',
 '2029',
 '3057',
 '8300']

In [222]:
[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)]

['10-day',
 '10-lap',
 '10-year',
 '100-share',
 '12-point',
 '12-year',
 '14-hour',
 '15-day',
 '150-point',
 '190-point',
 '20-point',
 '20-stock',
 '21-month',
 '237-seat',
 '240-page',
 '27-year',
 '30-day',
 '30-point',
 '30-share',
 '30-year',
 '300-day',
 '36-day',
 '36-store',
 '42-year',
 '50-state',
 '500-stock',
 '52-week',
 '69-point',
 '84-month',
 '87-store',
 '90-day']

In [223]:
[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]

['black-and-white',
 'bread-and-butter',
 'father-in-law',
 'machine-gun-toting',
 'savings-and-loan']

In [224]:
[w for w in wsj if re.search('(ed|ing)$', w)]

['62%-owned',
 'Absorbed',
 'According',
 'Adopting',
 'Advanced',
 'Advancing',
 'Alfred',
 'Allied',
 'Annualized',
 'Anything',
 'Arbitrage-related',
 'Arbitraging',
 'Asked',
 'Assuming',
 'Atlanta-based',
 'Baking',
 'Banking',
 'Beginning',
 'Beijing',
 'Being',
 'Bermuda-based',
 'Betting',
 'Boeing',
 'Broadcasting',
 'Bucking',
 'Buying',
 'Calif.-based',
 'Change-ringing',
 'Citing',
 'Concerned',
 'Confronted',
 'Conn.based',
 'Consolidated',
 'Continued',
 'Continuing',
 'Declining',
 'Defending',
 'Depending',
 'Designated',
 'Determining',
 'Developed',
 'Died',
 'During',
 'Encouraged',
 'Encouraging',
 'English-speaking',
 'Estimated',
 'Everything',
 'Excluding',
 'Exxon-owned',
 'Faulding',
 'Fed',
 'Feeding',
 'Filling',
 'Filmed',
 'Financing',
 'Following',
 'Founded',
 'Fracturing',
 'Francisco-based',
 'Fred',
 'Funded',
 'Funding',
 'Generalized',
 'Germany-based',
 'Getting',
 'Guaranteed',
 'Having',
 'Heating',
 'Heightened',
 'Holding',
 'Housing',
 'Illumin

In [None]:
# 3.4   Useful Applications of Regular Expressions

In [None]:
# Raw strings: the r prefix

In [None]:
# To the Python interpreter, a regular expression is just like any other string

In [None]:
# If the string contains a backslash followed by particular characters, it will interpret these specially.  For example \b would be interpreted as the backspace character. 

In [None]:
# In general, when using regular expressions containing backslash, we should instruct the interpreter not to look inside the string at all, but simply to pass it directly to the re library for processing. 

In [None]:
# We do this by prefixing the string with the letter r, to indicate that it is a raw string. 

In [None]:
# For example, the raw string r'\band\b' contains two \b symbols that are interpreted by the re library as matching word boundaries instead of backspace characters. 

In [None]:
# If you get into the habit of using r'...' for regular expressions — as we will do from now on — you will avoid having to think about these complications.

In [None]:
# The above examples all involved searching for words w that match some regular expression regexp using re.search(regexp, w).

In [None]:
# Apart from checking if a regular expression matches a word, we can use regular expressions to extract material from words, or to modify words in specific ways.

In [None]:
# Extracting Word Pieces

In [None]:
# The re.findall() ("find all") method finds all (non-overlapping) matches of the given regular expression. Let's find all the vowels in a word, then count them:

In [225]:
word = 'supercalifragilisticexpialidocious'

In [226]:
re.findall(r'[aeiou]', word)

['u',
 'e',
 'a',
 'i',
 'a',
 'i',
 'i',
 'i',
 'e',
 'i',
 'a',
 'i',
 'o',
 'i',
 'o',
 'u']

In [227]:
len(re.findall(r'[aeiou]', word))

16

In [228]:
# Let's look for all sequences of two or more vowels in some text, and determine their relative frequency:

In [229]:
wsj = sorted(set(nltk.corpus.treebank.words()))

In [230]:
fd = nltk.FreqDist(vs for word in wsj
                      for vs in re.findall(r'[aeiou]{2,}', word))

In [231]:
fd.most_common(12)

[('io', 549),
 ('ea', 476),
 ('ie', 331),
 ('ou', 329),
 ('ai', 261),
 ('ia', 253),
 ('ee', 217),
 ('oo', 174),
 ('ua', 109),
 ('au', 106),
 ('ue', 105),
 ('ui', 95)]

In [232]:
# Doing More with Word Pieces

In [None]:
# Once we can use re.findall() to extract material from words, there's interesting things to do with the pieces, like glue them back together or plot them.

In [None]:
# It is sometimes noted that English text is highly redundant, and it is still easy to read when word-internal vowels are left out. 

In [None]:
# For example, declaration becomes dclrtn, and inalienable becomes inlnble, retaining any initial or final vowel sequences

In [None]:
# The regular expression in our next example matches initial vowel sequences, final vowel sequences, and all consonants; everything else is ignored.

In [None]:
# This three-way disjunction is processed left-to-right; if one of the three parts matches the word, any later parts of the regular expression are ignored.

In [233]:
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'

In [234]:
# We use re.findall() to extract all the matching pieces, and ''.join() to join them together 

In [235]:
def compress(word):
    """Return ``word`` reduced to the pieces matched by the global ``regexp``.

    All non-overlapping matches of ``regexp`` in ``word`` are collected in
    order of appearance and concatenated with no separator; characters that
    no part of the pattern matches are dropped.
    """
    return ''.join(re.findall(regexp, word))

In [236]:
english_udhr = nltk.corpus.udhr.words('English-Latin1')

In [237]:
# nltk.tokenwrap(): pretty-prints a list of text tokens, breaking lines on whitespace

In [238]:
print(nltk.tokenwrap(w for w in english_udhr[:75]))

Universal Declaration of Human Rights Preamble Whereas recognition of
the inherent dignity and of the equal and inalienable rights of all
members of the human family is the foundation of freedom , justice and
peace in the world , Whereas disregard and contempt for human rights
have resulted in barbarous acts which have outraged the conscience of
mankind , and the advent of a world in which human beings shall enjoy
freedom of speech and


In [239]:
print(nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

Unvrsl Dclrtn of Hmn Rghts Prmble Whrs rcgntn of the inhrnt dgnty and
of the eql and inlnble rghts of all mmbrs of the hmn fmly is the fndtn
of frdm , jstce and pce in the wrld , Whrs dsrgrd and cntmpt fr hmn
rghts hve rsltd in brbrs acts whch hve outrgd the cnscnce of mnknd ,
and the advnt of a wrld in whch hmn bngs shll enjy frdm of spch and


In [240]:
# Next, let's combine regular expressions with conditional frequency distributions. Here we will extract all consonant-vowel sequences from the words of Rotokas, such as ka and si. Since each of these is a pair, it can be used to initialize a conditional frequency distribution. We then tabulate the frequency of each pair:

In [241]:
rotokas_words = nltk.corpus.toolbox.words('rotokas.dic')

In [242]:
cvs = [cv for w in rotokas_words for cv in re.findall(r'[ptksvr][aeiou]', w)]

In [243]:
cfd = nltk.ConditionalFreqDist(cvs)

In [244]:
cfd.tabulate()

    a   e   i   o   u 
k 418 148  94 420 173 
p  83  31 105  34  51 
r 187  63  84  89  79 
s   0   0 100   2   1 
t  47   8   0 148  37 
v  93  27 105  48  49 


In [245]:
# If we want to be able to inspect the words behind the numbers in the above table, it would be helpful to have an index, allowing us to quickly find the list of words that contains a given consonant-vowel pair, e.g. cv_index['su'] should give us all words containing su. Here's how we can do this:


In [246]:
cv_word_pairs = [(cv, w) for w in rotokas_words
                 for cv in re.findall(r'[ptksvr][aeiou]', w)]

In [247]:
cv_index = nltk.Index(cv_word_pairs)

In [248]:
cv_index['su']

['kasuari']

In [249]:
 cv_index['po']

['kaapo',
 'kaapopato',
 'kaipori',
 'kaiporipie',
 'kaiporivira',
 'kapo',
 'kapoa',
 'kapokao',
 'kapokapo',
 'kapokapo',
 'kapokapoa',
 'kapokapoa',
 'kapokapora',
 'kapokapora',
 'kapokaporo',
 'kapokaporo',
 'kapokari',
 'kapokarito',
 'kapokoa',
 'kapoo',
 'kapooto',
 'kapoovira',
 'kapopaa',
 'kaporo',
 'kaporo',
 'kaporopa',
 'kaporoto',
 'kapoto',
 'karokaropo',
 'karopo',
 'kepo',
 'kepoi',
 'keposi',
 'kepoto']

In [250]:
# Finding Word Stems


In [251]:
# There are various ways we can pull out the stem of a word. Here's a simple-minded approach which just strips off anything that looks like a suffix:



In [252]:
def stem(word):
    """Strip the first matching suffix from ``word``.

    The candidate suffixes are tried in list order, and only the first one
    that ``word`` ends with is removed. If no suffix matches, ``word`` is
    returned unchanged.
    """
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    # Reached only after every suffix has been tried; returning inside the
    # loop would give up after checking just 'ing'.
    return word

In [253]:
# Although we will ultimately use NLTK's built-in stemmers, it's interesting to see how we can use regular expressions for this task. Our first step is to build up a disjunction of all the suffixes. We need to enclose it in parentheses in order to limit the scope of the disjunction.

In [254]:
 re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

['ing']

In [255]:
# Here, re.findall() just gave us the suffix even though the regular expression matched the entire word. This is because the parentheses have a second function, to select substrings to be extracted. If we want to use the parentheses to specify the scope of the disjunction, but not to select the material to be output, we have to add ?:, which is just one of many arcane subtleties of regular expressions. Here's the revised version.

In [256]:
re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

['processing']

In [257]:
# However, we'd actually like to split the word into stem and suffix. So we should just parenthesize both parts of the regular expression:

In [258]:
 re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

[('process', 'ing')]

In [259]:
# This looks promising, but still has a problem. Let's look at a different word, processes:

In [260]:
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('processe', 's')]

In [261]:
# The regular expression incorrectly found an -s suffix instead of an -es suffix. This demonstrates another subtlety: the star operator is "greedy" and the .* part of the expression tries to consume as much of the input as possible. If we use the "non-greedy" version of the star operator, written *?, we get what we want:

In [262]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processes')

[('process', 'es')]

In [263]:
# This works even when we allow an empty suffix, by making the content of the second parentheses optional:


In [264]:
re.findall(r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$', 'language')

[('language', '')]

In [265]:
# This approach still has many problems (can you spot them?). Now we will move on to define a function to perform stemming, and apply it to a whole text:


In [266]:
def stem(word):
    """Return ``word`` with at most one trailing suffix removed, using a regex.

    The pattern captures a non-greedy prefix followed by an optional suffix
    from a fixed list, anchored at both ends of the string. ``re.findall``
    yields ``(prefix, suffix)`` tuples; the prefix of the first tuple is
    returned. Indexing the first match raises ``IndexError`` if the pattern
    finds nothing.
    """
    pattern = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    matches = re.findall(pattern, word)
    root, _ending = matches[0]
    return root

In [267]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
   is no basis for a system of government.  Supreme executive power derives from
 a mandate from the masses, not from some farcical aquatic ceremony."""

In [268]:
tokens = word_tokenize(raw)

In [269]:
[stem(t) for t in tokens]

['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'women',
 'ly',
 'in',
 'pond',
 'distribut',
 'sword',
 'i',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'Supreme',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']

In [270]:
# Notice that our regular expression removed the s from ponds but also from is and basis. It produced some non-words like distribut and deriv, but these are acceptable stems in some applications.

In [271]:
# Searching Tokenized Text

In [272]:
# You can use a special kind of regular expression for searching across multiple words in a text (where a text is a list of tokens). For example, "<a> <man>" finds all instances of a man in the text. The angle brackets are used to mark token boundaries, and any whitespace between the angle brackets is ignored (behaviors that are unique to NLTK's findall() method for texts). In the following example, we include <.*> [1] which will match any single token, and enclose it in parentheses so only the matched word (e.g. monied) and not the matched phrase (e.g. a monied man) is produced. The second example finds three-word phrases ending with the word bro [2]. The last example finds sequences of three or more words starting with the letter l [3].


In [273]:
from nltk.corpus import gutenberg, nps_chat

In [274]:
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))

In [275]:
moby.findall(r"<a> (<.*>) <man>")

monied; nervous; dangerous; white; white; white; pious; queer; good;
mature; white; Cape; great; wise; wise; butterless; white; fiendish;
pale; furious; better; certain; complete; dismasted; younger; brave;
brave; brave; brave


In [276]:
chat = nltk.Text(nps_chat.words())

In [277]:
chat.findall(r"<.*> <.*> <bro>")

you rule bro; telling you bro; u twizted bro


In [278]:
chat.findall(r"<l.*>{3,}")

lol lol lol; lmao lol lol; lol lol lol; la la la la la; la la la; la
la la; lovely lol lol love; lol lol lol.; la la la; la la la


In [279]:
# It is easy to build search patterns when the linguistic phenomenon we're studying is tied to particular words. In some cases, a little creativity will go a long way. For instance, searching a large text corpus for expressions of the form x and other ys allows us to discover hypernyms (cf 5):

In [280]:
from nltk.corpus import brown

In [281]:
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))

In [282]:
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

speed and other activities; water and other liquids; tomb and other
landmarks; Statues and other monuments; pearls and other jewels;
charts and other items; roads and other features; figures and other
objects; military and other areas; demands and other factors;
abstracts and other compilations; iron and other metals


In [283]:
# With enough text, this approach would give us a useful store of information about the taxonomy of objects, without the need for any manual labor. However, our search results will usually contain false positives, i.e. cases that we would want to exclude. For example, the result: demands and other factors suggests that demand is an instance of the type factor, but this sentence is actually about wage demands. Nevertheless, we could construct our own ontology of English concepts by manually correcting the output of such searches.


In [284]:
# 3.5  Normalizing Text

In [285]:
# In earlier program examples we have often converted text to lowercase before doing anything with its words, e.g. set(w.lower() for w in text). 


In [None]:
# By using lower(), we have normalized the text to lowercase so that the distinction between The and the is ignored. 

In [None]:
# Often we want to go further than this, and strip off any affixes, a task known as stemming. 

In [None]:
# A further step is to make sure that the resulting form is a known word in a dictionary, a task known as lemmatization. 

In [286]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords is no basis for a system of government.  Supreme executive power derives from
 a mandate from the masses, not from some farcical aquatic ceremony."""

In [287]:
tokens=word_tokenize(raw)

In [None]:
# Stemmers

In [None]:
# NLTK includes several off-the-shelf stemmers, and if you ever need a stemmer you should use one of these in preference to crafting your own using regular expressions, since these handle a wide range of irregular cases

In [None]:
# The Porter and Lancaster stemmers follow their own rules for stripping affixes. 

In [None]:
# Observe that the Porter stemmer correctly handles the word lying (mapping it to lie), while the Lancaster stemmer does not.

In [288]:
porter = nltk.PorterStemmer()

In [289]:
lancaster = nltk.LancasterStemmer()

In [290]:
[porter.stem(t) for t in tokens]

['denni',
 ':',
 'listen',
 ',',
 'strang',
 'women',
 'lie',
 'in',
 'pond',
 'distribut',
 'sword',
 'is',
 'no',
 'basi',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'suprem',
 'execut',
 'power',
 'deriv',
 'from',
 'a',
 'mandat',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcic',
 'aquat',
 'ceremoni',
 '.']

In [291]:
[lancaster.stem(t) for t in tokens]

['den',
 ':',
 'list',
 ',',
 'strange',
 'wom',
 'lying',
 'in',
 'pond',
 'distribut',
 'sword',
 'is',
 'no',
 'bas',
 'for',
 'a',
 'system',
 'of',
 'govern',
 '.',
 'suprem',
 'execut',
 'pow',
 'der',
 'from',
 'a',
 'mand',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'som',
 'farc',
 'aqu',
 'ceremony',
 '.']

In [292]:
# Lemmatization

In [293]:
# The WordNet lemmatizer only removes affixes if the resulting word is in its dictionary. This additional checking process makes the lemmatizer slower than the above stemmers. Notice that it doesn't handle lying, but it converts women to woman.

In [294]:
wnl = nltk.WordNetLemmatizer()

In [295]:
[wnl.lemmatize(t) for t in tokens]

['DENNIS',
 ':',
 'Listen',
 ',',
 'strange',
 'woman',
 'lying',
 'in',
 'pond',
 'distributing',
 'sword',
 'is',
 'no',
 'basis',
 'for',
 'a',
 'system',
 'of',
 'government',
 '.',
 'Supreme',
 'executive',
 'power',
 'derives',
 'from',
 'a',
 'mandate',
 'from',
 'the',
 'mass',
 ',',
 'not',
 'from',
 'some',
 'farcical',
 'aquatic',
 'ceremony',
 '.']

In [None]:
# 3.6 Regular Expressions for Tokenizing Text

In [None]:
# Tokenization is the task of cutting a string into identifiable linguistic units that constitute a piece of language data. 

In [None]:
# Although it is a fundamental task, we have been able to delay it until now because many corpora are already tokenized, and because NLTK includes some tokenizers.


In [None]:
#Now that you are familiar with regular expressions, you can learn how to use them to tokenize text, and to have much more control over the process.


In [None]:
# Simple Approaches to Tokenization

In [None]:
# The very simplest method for tokenizing text is to split on whitespace. Consider the following text from Alice's Adventures in Wonderland:

In [296]:
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
well without--Maybe it's always pepper that makes people hot-tempered,'..."""

In [297]:
# We could split this raw text on whitespace using raw.split(). To do the same using a regular expression, it is not enough to match individual space characters in the string [1], since this results in tokens that contain a \n newline character; instead we need to match any number of spaces, tabs, or newlines.

In [298]:
re.split(r' ', raw)

["'When",
 "I'M",
 'a',
 "Duchess,'",
 'she',
 'said',
 'to',
 'herself,',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone\nthough),',
 "'I",
 "won't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL.',
 'Soup',
 'does',
 'very\nwell',
 'without--Maybe',
 "it's",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 "hot-tempered,'..."]

In [299]:
 re.split(r'[ \t\n]+', raw)

["'When",
 "I'M",
 'a',
 "Duchess,'",
 'she',
 'said',
 'to',
 'herself,',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though),',
 "'I",
 "won't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL.',
 'Soup',
 'does',
 'very',
 'well',
 'without--Maybe',
 "it's",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 "hot-tempered,'..."]

In [300]:
# The regular expression «[ \t\n]+» matches one or more space, tab (\t) or newline (\n). Other whitespace characters, such as carriage-return and form-feed should really be included too. Instead, we will use a built-in re abbreviation, \s, which means any whitespace character. The above statement can be rewritten as re.split(r'\s+', raw)

In [301]:
re.split(r'\s+', raw)

["'When",
 "I'M",
 'a',
 "Duchess,'",
 'she',
 'said',
 'to',
 'herself,',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though),',
 "'I",
 "won't",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL.',
 'Soup',
 'does',
 'very',
 'well',
 'without--Maybe',
 "it's",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 "hot-tempered,'..."]

In [302]:
# Remember to prefix regular expressions with the letter r (meaning "raw"), which instructs the Python interpreter to treat the string literally, rather than processing any backslashed characters it contains.

In [303]:
# Splitting on whitespace gives us tokens like '(not' and 'herself,'. An alternative is to use the fact that Python provides us with a character class \w for word characters, equivalent to [a-zA-Z0-9_] for ASCII text (in Python 3, \w on str patterns also matches non-ASCII letters and digits unless the re.ASCII flag is used). It also defines the complement of this class \W, i.e. all characters other than letters, digits or underscore. We can use \W in a simple regular expression to split the input on anything other than a word character:

In [304]:
 re.split(r'\W+', raw)

['',
 'When',
 'I',
 'M',
 'a',
 'Duchess',
 'she',
 'said',
 'to',
 'herself',
 'not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 'I',
 'won',
 't',
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 'Maybe',
 'it',
 's',
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot',
 'tempered',
 '']

In [305]:
# Now that we're matching the words, we're in a position to extend the regular expression to cover a wider range of cases. The regular expression «\w+|\S\w*» will first try to match any sequence of word characters. If no match is found, it will try to match any non-whitespace character (\S is the complement of \s) followed by further word characters. This means that punctuation is grouped with any following letters (e.g. 's) but that sequences of two or more punctuation characters are separated.

In [306]:
re.findall(r'\w+|\S\w*', raw)

["'When",
 'I',
 "'M",
 'a',
 'Duchess',
 ',',
 "'",
 'she',
 'said',
 'to',
 'herself',
 ',',
 '(not',
 'in',
 'a',
 'very',
 'hopeful',
 'tone',
 'though',
 ')',
 ',',
 "'I",
 'won',
 "'t",
 'have',
 'any',
 'pepper',
 'in',
 'my',
 'kitchen',
 'AT',
 'ALL',
 '.',
 'Soup',
 'does',
 'very',
 'well',
 'without',
 '-',
 '-Maybe',
 'it',
 "'s",
 'always',
 'pepper',
 'that',
 'makes',
 'people',
 'hot',
 '-tempered',
 ',',
 "'",
 '.',
 '.',
 '.']

In [307]:
# Let's generalize the \w+ in the above expression to permit word-internal hyphens and apostrophes: «\w+([-']\w+)*». This expression means \w+ followed by zero or more instances of [-']\w+; it would match hot-tempered and it's. (We need to include ?: in this expression for reasons discussed earlier.) We'll also add a pattern to match quote characters so these are kept separate from the text they enclose.

In [308]:
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw))

["'", 'When', "I'M", 'a', 'Duchess', ',', "'", 'she', 'said', 'to', 'herself', ',', '(', 'not', 'in', 'a', 'very', 'hopeful', 'tone', 'though', ')', ',', "'", 'I', "won't", 'have', 'any', 'pepper', 'in', 'my', 'kitchen', 'AT', 'ALL', '.', 'Soup', 'does', 'very', 'well', 'without', '--', 'Maybe', "it's", 'always', 'pepper', 'that', 'makes', 'people', 'hot-tempered', ',', "'", '...']


In [None]:
# Regular Expression Symbols

In [None]:
#Symbol 	Function
#\b 	Word boundary (zero width)
#\d 	Any decimal digit (equivalent to [0-9])
#\D 	Any non-digit character (equivalent to [^0-9])
#\s 	Any whitespace character (equivalent to [ \t\n\r\f\v])
#\S 	Any non-whitespace character (equivalent to [^ \t\n\r\f\v])
#\w 	Any alphanumeric character (equivalent to [a-zA-Z0-9_])
#\W 	Any non-alphanumeric character (equivalent to [^a-zA-Z0-9_])
#\t 	The tab character
#\n 	The newline character

In [None]:
# 3.7  Segmentation

In [None]:
# Tokenization is an instance of a more general problem of segmentation.

In [None]:
# Sentence Segmentation

In [None]:
# Manipulating texts at the level of individual words often presupposes the ability to divide a text into individual sentences. As we have seen, some corpora already provide access at the sentence level. In the following example, we compute the average number of words per sentence in the Brown Corpus:

In [309]:
# Average sentence length: total word count divided by total sentence count.
len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())

20.250994070456922

In [310]:
# In other cases, the text is only available as a stream of characters. Before tokenizing the text into words, we need to segment it into sentences. NLTK facilitates this by including the Punkt sentence segmenter (Kiss & Strunk, 2006). Here is an example of its use in segmenting the text of a novel. 

In [311]:
# Load the novel as raw text, not yet divided into sentences or words.
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')

In [312]:
# Segment the raw text into a list of sentence strings.
sents = nltk.sent_tokenize(text)

In [313]:
# Pretty-print ten consecutive sentences (indices 79 through 88).
pprint.pprint(sents[79:89])

['"Nonsense!"',
 'said Gregory, who was very rational when anyone else\nattempted paradox.',
 '"Why do all the clerks and navvies in the\n'
 'railway trains look so sad and tired, so very sad and tired?',
 'I will\ntell you.',
 'It is because they know that the train is going right.',
 'It\n'
 'is because they know that whatever place they have taken a ticket\n'
 'for that place they will reach.',
 'It is because after they have\n'
 'passed Sloane Square they know that the next station must be\n'
 'Victoria, and nothing but Victoria.',
 'Oh, their wild rapture!',
 'oh,\n'
 'their eyes like stars and their souls again in Eden, if the next\n'
 'station were unaccountably Baker Street!"',
 '"It is you who are unpoetical," replied the poet Syme.']


In [314]:
# Sentence segmentation is difficult because the period is used to mark abbreviations, and some periods simultaneously mark an abbreviation and terminate a sentence, as often happens with acronyms like U.S.A. For another approach to sentence segmentation, see Chapter 6.

In [315]:
# 3.8   Formatting: From Lists to Strings

In [316]:
# From Lists to Strings

In [317]:
# The simplest kind of structured object we use for text processing is lists of words. When we want to output these to a display or a file, we must convert these lists into strings. To do this in Python we use the join() method, and specify the string to be used as the "glue".

In [318]:
# Sample token list (words plus the final full stop) used to demonstrate join().
silly = [
    'We',
    'called',
    'him',
    'Tortoise',
    'because',
    'he',
    'taught',
    'us',
    '.',
]

In [319]:
# Glue the tokens together with a single space between neighbours.
' '.join(silly)

'We called him Tortoise because he taught us .'

In [320]:
# Same tokens, with a semicolon as the glue string.
';'.join(silly)

'We;called;him;Tortoise;because;he;taught;us;.'

In [321]:
# Empty glue string: the tokens are concatenated with nothing between them.
''.join(silly)

'WecalledhimTortoisebecausehetaughtus.'

In [322]:
# Strings and Formats

In [323]:
# We have seen that there are two ways to display the contents of an object:

In [324]:
# A short single-line string for the display comparison below.
word = 'cat'

In [325]:
# Triple quotes let the literal span two lines, so the string contains a newline.
sentence = """hello
world"""

In [326]:
# print() shows the characters of the string without surrounding quotes.
print(word)

cat


In [327]:
# print() renders the embedded newline as an actual line break.
print(sentence)

hello
world


In [328]:
# A bare name as the cell's last expression is echoed as its repr, quotes included.
word

'cat'

In [329]:
# In the repr the newline appears as the escape sequence \n.
sentence

'hello\nworld'

In [330]:
# The print command yields Python's attempt to produce the most human-readable form of an object. The second method — naming the variable at a prompt — shows us a string that can be used to recreate this object. It is important to keep in mind that both of these are just strings, displayed for the benefit of you, the user. They do not give us any clue as to the actual internal representation of the object.

In [331]:
# There are many other useful ways to display an object as a string of characters. This may be for the benefit of a human reader, or because we want to export our data to a particular file format for use in an external program.


In [332]:
# Formatted output typically contains a combination of variables and pre-specified strings, e.g. given a frequency distribution fdist we could do:



In [333]:
# Count how many times each animal name occurs in the sample list.
fdist = nltk.FreqDist(['dog', 'cat', 'dog', 'cat', 'dog', 'snake', 'dog', 'cat'])

In [334]:
# Report each distinct word with its count, in alphabetical order.
# NOTE: the loop rebinds the name `word`, which an earlier cell set to 'cat'.
for word in sorted(fdist):
    # end='; ' replaces the newline, so every pair lands on the same line.
    print(word, '->', fdist[word], end='; ')

cat -> 3; dog -> 4; snake -> 1; 

In [335]:
# Print statements that contain alternating variables and constants can be difficult to read and maintain. Another solution is to use string formatting.

In [336]:
# Same report, built from a single format string per entry instead of
# several separate print() arguments.
for word in sorted(fdist):
    # The two {} fields are filled, in order, with the word and its count.
    print('{}->{};'.format(word, fdist[word]), end=' ')

cat->3; dog->4; snake->1; 

In [337]:
# To understand what is going on here, let's test out the format string on its own. (By now this will be your usual method of exploring new syntax.)



In [338]:
# The curly brackets '{}' mark the presence of a replacement field: this acts as a placeholder for the string values of objects that are passed to the str.format() method. We can embed occurrences of '{}' inside a string, then replace them with strings by calling format() with appropriate arguments. A string containing replacement fields is called a format string.

In [339]:
# Two replacement fields, filled left to right by the two arguments.
'{}->{};'.format('cat', 3)

'cat->3;'

In [340]:
# A non-string argument is converted to its string form before insertion.
'{}'.format(3)

'3'

In [341]:
# The replacement field can sit anywhere inside the surrounding text.
'I want a {} right now'.format('coffee')

'I want a coffee right now'

In [342]:
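# We can have any number of placeholders, but the str.format method must be called with at least as many arguments as there are placeholders: surplus positional arguments are silently ignored, whereas too few raise an IndexError.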

In [343]:
# Three replacement fields matched by three positional arguments.
'{} wants a {} {}'.format('Lee', 'sandwich', 'for lunch')

'Lee wants a sandwich for lunch'

In [344]:
# Deliberate failure: three replacement fields but only two arguments.
# The error is caught and displayed so that this demonstration does not
# abort a full "Restart & Run All" pass over the notebook.
try:
    '{} wants a {} {}'.format('sandwich', 'for lunch')
except IndexError as err:
    print('IndexError: {}'.format(err))

IndexError: tuple index out of range

In [345]:
# The field name in a format string can start with a number, which refers to a positional argument of format(). Something like 'from {} to {}' is equivalent to 'from {0} to {1}', but we can use the numbers to get non-default orders:

In [346]:
# {1} and {0} select arguments by position, so 'B' is inserted before 'A'.
'from {1} to {0}'.format('A', 'B')

'from B to A'

In [347]:
# We can also provide the values for the placeholders indirectly. Here's an example using a for loop:

In [348]:
# Format string with one replacement field, to be filled in once per snack.
template = 'Lee wants a {} right now'

In [349]:
# Values that will be substituted into the template one at a time.
menu = ['sandwich', 'spam fritter', 'pancake']

In [350]:
 for snack in menu:
    print(template.format(snack))

Lee wants a sandwich right now
Lee wants a spam fritter right now
Lee wants a pancake right now


In [351]:
# Lining Things Up

In [352]:
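# A replacement field can also specify a minimum width, e.g. '{:6}', in which case the value is padded with spaces. Numbers are right-justified by default, but we can precede the width specifier with a '<' alignment option to make numbers left-justified.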

In [353]:
# Width 6, no alignment option: the number is padded on the left.
'{:6}'.format(41)

'    41'

In [354]:
# '<' forces left alignment, so the padding goes on the right.
'{:<6}'.format(41)

'41    '

In [355]:
# Strings are left-justified by default, but can be right-justified with the '>' alignment option.

In [356]:
# Width 6, no alignment option: the string is padded on the right.
'{:6}'.format('dog')

'dog   '

In [357]:
# '>' forces right alignment, so the padding goes on the left.
'{:>6}'.format('dog')

'   dog'

In [358]:
# Other control characters can be used to specify the sign and precision of floating point numbers; for example {:.4f} indicates that four digits should be displayed after the decimal point for a floating point number.



In [359]:
 import math

In [360]:
# .4f formats the float with four digits after the decimal point.
'{:.4f}'.format(math.pi)

'3.1416'

In [361]:
# The string formatting is smart enough to know that if you include a '%' in your format specification, then you want to represent the value as a percentage; there's no need to multiply by 100.

In [362]:
# Sample figures for the percentage example: a part and the whole it belongs to.
count = 3205
total = 9375

In [363]:
# The % presentation type multiplies the ratio by 100 and appends a percent
# sign; .4 keeps four digits after the decimal point.
"accuracy for {} words: {:.4%}".format(total, count / total)

'accuracy for 9375 words: 34.1867%'

In [364]:
# Using a format string '{:{width}}' and binding a value to the width parameter in format() allows us to specify the width of a field using a variable.

In [365]:
# The nested {width} field is filled from the keyword argument, giving a
# field 15 characters wide for the left-justified string.
'{:{width}}'.format('Monty Python', width=15)

'Monty Python   '

In [366]:
# We could use this to automatically customize the column to be just wide enough to accommodate all the words, using width = max(len(w) for w in words).

In [367]:
# Writing Results to a File

In [368]:
# It is often useful to write output to files as well. The following code opens a file output.txt for writing, and saves the program output to the file.

In [369]:
# Open output.txt in text write mode; 'w' truncates the file if it already exists.
output_file = open('output.txt', 'w')

In [370]:
# set() removes duplicates, leaving the distinct tokens of the file.
words = set(nltk.corpus.genesis.words('english-kjv.txt'))

In [371]:
# Write the distinct words, one per line and in sorted order, to the file
# opened in the earlier cell.
try:
    for word in sorted(words):
        print(word, file=output_file)
finally:
    # Closing flushes any buffered text to disk and releases the file handle,
    # even if writing fails part-way through.
    output_file.close()

In [372]:
# Text Wrapping

In [373]:
# When the output of our program is text-like, instead of tabular, it will usually be necessary to wrap it so that it can be displayed conveniently. Consider the following output, which overflows its line, and which uses a complicated print statement:

In [374]:
# Token list (words and punctuation) used for the text-wrapping examples.
saying = [
    'After', 'all', 'is', 'said', 'and', 'done', ',',
    'more', 'is', 'said', 'than', 'done', '.',
]

In [375]:
# Print every token followed by its length in parentheses.
for word in saying:
    # end=' ' suppresses the newline, so all items run together on one long line.
    print(word, '(' + str(len(word)) + '),', end=' ')

After (5), all (3), is (2), said (4), and (3), done (4), , (1), more (4), is (2), said (4), than (4), done (4), . (1), 

In [376]:
# We can take care of line wrapping with the help of Python's textwrap module. For maximum clarity we will separate each step onto its own line:



In [377]:
from textwrap import fill

In [378]:
# Pair each token with its length, e.g. 'After 5'.
pieces = ["{} {}".format(word, len(word)) for word in saying]

In [379]:
# Join the pieces into one long space-separated string.
output = ' '.join(pieces)

In [380]:
# fill() re-breaks the string into lines no longer than textwrap's default
# width (70 columns) and returns them joined by newlines.
wrapped = fill(output)

In [381]:
# print() shows the wrapped text with its line breaks applied.
print(wrapped)

After 5 all 3 is 2 said 4 and 3 done 4 , 1 more 4 is 2 said 4 than 4
done 4 . 1
