In [1]:
import re
def get_text(file):
    """Read text from a file, normalizing whitespace and stripping HTML markup."""
    text = open(file).read()
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

In [3]:
contents = get_text("../Chapter 1/text.txt")

In [4]:
help(get_text)

Help on function get_text in module __main__:

get_text(file)
    Read text from a file, normalizing whitespace and stripping HTML markup.



## Function Inputs and Outputs

In [6]:
def repeat(msg, num):
    return ' '.join([msg] * num)
monty = 'Monty Python'
repeat(monty, 3)

'Monty Python Monty Python Monty Python'

In [7]:
def monty():
    return "Monty Python"
monty()

'Monty Python'

In [8]:
repeat(monty(), 3)

'Monty Python Monty Python Monty Python'

In [10]:
''' functions should modify the contents of a parameter (my_sort1()),
or return a value (my_sort2()),
not both (my_sort3()). '''
def my_sort1(mylist):      # good: modifies its argument, no return value
    mylist.sort()
def my_sort2(mylist):      # good: doesn't touch its argument, returns value
    return sorted(mylist)
def my_sort3(mylist):      # bad: modifies its argument and also returns it
    mylist.sort()
    return mylist

## Parameter Passing

In [None]:
''' Python's call-by-value parameter passing '''

In [16]:
def set_up(word, properties):
    word = 'lolcat'
    properties.append('noun')
    properties = 5
    print(properties)

In [17]:
w = ''
p = [] # p is a reference to an empty lists
set_up(w, p)
w

5


''

In [18]:
p

['noun']

In [20]:
''' similar '''
w = ''
word = w
word = 'lolcat'
w

''

In [19]:
''' similar '''
p = []
properties = p
properties.append('noun')
properties = 5
p

['noun']

## Variable Scope

In [21]:
''' LGB rule of name resolution:
local, then global, then built-in '''

' LGB rule of name resolution:\nlocal, then global, then built-in '

## Checking Parameter Types

In [22]:
def tag(word):
    if word in ['a', 'the', 'all']:
        return 'det'
    else:
        return 'noun'

In [23]:
tag('the')

'det'

In [24]:
tag('knight')

'noun'

In [25]:
tag(["'Tis", 'but', 'a', 'scratch'])

'noun'

In [28]:
def tag(word):
    assert isinstance(word, str), "argument to tag() must be a string"
    if word in ['a', 'the', 'all']:
        return 'det'
    else:
        return 'noun'

In [29]:
tag(["'Tis", 'but', 'a', 'scratch'])

AssertionError: argument to tag() must be a string

## Functional Decomposition

In [34]:
# data = load_corpus()
# results = analyze(data)
# present(results)

In [78]:
import nltk
from urllib import request
from bs4 import BeautifulSoup

import requests

from nltk import word_tokenize

def freq_words(url, freqdist, n):
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    raw = BeautifulSoup(response.text, "html5lib").get_text()
    for word in word_tokenize(raw):
        freqdist[word.lower()] += 1
    result = []
    for word, count in freqdist.most_common(n):
        result = result + [word]
    print(result)

In [79]:
constitution = "https://www.archives.gov/founding-docs/constitution-transcript"
fd = nltk.FreqDist()
freq_words(constitution, fd, 30)

[',', 'the', 'of', "''", 'and', 'shall', '.', ';', 'be', 'to', ':', 'in', 'states', 'or', ':1', 'a', '(', ')', 'united', 'state', 'by', 'for', '``', 'any', '{', '}', 'on', 'all', 'which', 'president']


In [87]:
def freq_words(url, n):
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
    text = BeautifulSoup(response.text, "html5lib").get_text()
    fd2 = nltk.FreqDist(word.lower() for word in word_tokenize(text))
    return [word for (word, _) in fd2.most_common(n)]

In [88]:
print(freq_words(constitution, 30))

[',', 'the', 'of', "''", 'and', 'shall', '.', ';', 'be', 'to', ':', 'in', 'states', 'or', ':1', 'a', '(', ')', 'united', 'state', 'by', 'for', '``', 'any', '{', '}', 'on', 'all', 'which', 'president']


## Documenting Functions

In [89]:
def accuracy(reference, test):
    """
    Calculate the fraction of test items that equal the corresponding reference items.

    Given a list of reference values and a corresponding list of test values,
    return the fraction of corresponding values that are equal.
    In particular, return the fraction of indexes
    {0<i<=len(test)} such that C{test[i] == reference[i]}.

        >>> accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])
        0.5

    :param reference: An ordered list of reference values
    :type reference: list
    :param test: A list of values to compare against the corresponding
        reference values
    :type test: list
    :return: the accuracy score
    :rtype: float
    :raises ValueError: If reference and length do not have the same length
    """

    if len(reference) != len(test):
        raise ValueError("Lists must have the same length.")
    num_correct = 0
    for x, y in zip(reference, test):
        if x == y:
            num_correct += 1
    return float(num_correct) / len(reference)

In [91]:
help(accuracy)

Help on function accuracy in module __main__:

accuracy(reference, test)
    Calculate the fraction of test items that equal the corresponding reference items.
    
    Given a list of reference values and a corresponding list of test values,
    return the fraction of corresponding values that are equal.
    In particular, return the fraction of indexes
    {0<i<=len(test)} such that C{test[i] == reference[i]}.
    
        >>> accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])
        0.5
    
    :param reference: An ordered list of reference values
    :type reference: list
    :param test: A list of values to compare against the corresponding
        reference values
    :type test: list
    :return: the accuracy score
    :rtype: float
    :raises ValueError: If reference and length do not have the same length



In [92]:
accuracy(['ADJ', 'N', 'V', 'N'], ['N', 'N', 'V', 'ADJ'])

0.5