# Classes and dictionaries

## Dictionaries

In [34]:
#Simple
x = {
    'alpha': 1,
    'beta': 2,
    'chi': 3
}
print(x['beta'])

2


In [35]:
x['delta'] = 4
print(x)

{'alpha': 1, 'beta': 2, 'chi': 3, 'delta': 4}


In [36]:
#Less simple
y = {
    'alpha': ['A', 'B', 'C'],
    'beta': [1, 2, 3],
    'chi': {
        'delta': 'one',
        'epsilon': 'two'
    }
}
print(y['beta'])

[1, 2, 3]


In [37]:
print(y['chi']['epsilon'])

two


In [38]:
y['chi']['epsilon'] = 'three'
print(y)

{'alpha': ['A', 'B', 'C'], 'beta': [1, 2, 3], 'chi': {'delta': 'one', 'epsilon': 'three'}}


In [39]:
# Unknown key raises KeyError
y['gamma']

KeyError: 'gamma'

In [40]:
from collections import defaultdict
z = defaultdict(int)
z['alpha']

0

## Classes

In [41]:
class Baseclass:
    """Minimal base class that stores a first name."""

    def __init__(self, first):
        """Store ``first`` on the instance as ``firstname``."""
        self.firstname = first

    def FirstName(self):
        """Return the stored first name."""
        return self.firstname


class Subclass(Baseclass):
    """Extend ``Baseclass`` with a last name; ``FirstName()`` is inherited."""

    def __init__(self, first, last):
        """Store ``first`` via the base initializer and ``last`` as ``lastname``."""
        # super() reaches Baseclass.__init__ without naming the parent
        # explicitly, so the call survives a rename of the base class.
        super().__init__(first)
        self.lastname = last

    def LastName(self):
        """Return the stored last name."""
        return self.lastname

    def FullName(self):
        """Return the first and last names joined by a single space."""
        return self.firstname + ' ' + self.lastname

In [42]:
# Build one instance of each class and exercise their accessors.
x = Baseclass("Matt")
y = Subclass("Matt", "Wilkens")
print(x.FirstName())
# No trailing commas after print(): in Python 3 they only build and discard
# a one-element tuple (a leftover of the Python 2 print statement).
print(y.LastName())
print(y.FirstName(), "\t#<- NB. FirstName() not defined in Subclass")
print(y.FullName())

Matt
Wilkens
Matt 	#<- NB. FirstName() not defined in Subclass
Matt Wilkens


## Single and double quotes

In [43]:
print("Cat's toy")

Cat's toy


In [45]:
print('Cat\'s toy')

Cat's toy


In [46]:
b = """
"This is some text's idea"
"""
print(b)


"This is some text's idea"



## Regular expressions

In [51]:
import re
s = 'This is a string.'
result = re.match('.', s)
print(result)

<re.Match object; span=(0, 1), match='T'>


In [52]:
print(re.match('.*', s))
print(re.match('.*', s)[0])

<re.Match object; span=(0, 17), match='This is a string.'>
This is a string.


In [53]:
print(re.findall('is a', s))

['is a']


In [54]:
print(re.findall('[is]+', s))

['is', 'is', 's', 'i']


In [55]:
# \w = 'word-like' characters
# Raw strings keep the backslash intact for the regex engine; a plain '\w'
# is an invalid string escape and triggers a warning on recent Pythons.
print(re.findall(r'\w+', s))
print(re.findall(r'\W+', s))  # Note capital version is negation of base case


['This', 'is', 'a', 'string']
[' ', ' ', ' ', '.']


## Generators

In [56]:
def decrement_steps(num):
    """Yield ``num``, ``num - 1``, ... for as long as the value stays above zero.

    Nothing is yielded when ``num`` is zero or negative.
    """
    remaining = num
    while remaining > 0:
        yield remaining
        remaining = remaining - 1
print(decrement_steps(5))

<generator object decrement_steps at 0x1a16e8fcf0>


In [57]:
for i in decrement_steps(3):
    print(i)

3
2
1


In [58]:
# Cannot index into generators
decrement_steps(3)[0]

TypeError: 'generator' object is not subscriptable

In [59]:
# Cast to list (not a great idea: every value is generated and held in memory at once)
print(list(decrement_steps(3)))

[3, 2, 1]


## CorpusReader code

In [60]:
# From textbook
import codecs
import os

import nltk
from nltk.corpus.reader.api import CorpusReader
from nltk.corpus.reader.api import CategorizedCorpusReader

CAT_PATTERN = r'([a-z_\s]+)/.*'
DOC_PATTERN = r'(?!\.)[a-z_\s]+/[a-f0-9]+\.json'
TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

class HTMLCorpusReader(CategorizedCorpusReader, CorpusReader):
    """
    A corpus reader for raw HTML documents to enable preprocessing.
    """

    def __init__(self, root, fileids=DOC_PATTERN, encoding='utf8',
                 tags=TAGS, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining
        arguments are passed to the ``CorpusReader`` constructor.

        ``tags`` is stored unchanged on the instance as ``self.tags``.
        """
        # Add the default category pattern if not passed into the class.
        if not any(key.startswith('cat_') for key in kwargs):
            kwargs['cat_pattern'] = CAT_PATTERN

        # Initialize the NLTK corpus reader objects.
        # NOTE: kwargs is deliberately handed over as one dict (not unpacked
        # with **); NLTK's CategorizedCorpusReader.__init__ takes it that way.
        CategorizedCorpusReader.__init__(self, kwargs)
        CorpusReader.__init__(self, root, fileids, encoding)

        # Save the tags that we specifically want to extract.
        self.tags = tags

    def resolve(self, fileids, categories):
        """
        Returns a list of fileids or categories depending on what is passed
        to each internal corpus reader function. Implemented similarly to
        the NLTK ``CategorizedPlaintextCorpusReader``.

        Raises ``ValueError`` when both ``fileids`` and ``categories`` are
        given.  When only ``categories`` is given, the fileids belonging to
        those categories are returned; otherwise ``fileids`` is returned
        unchanged (including ``None``).
        """
        if fileids is not None and categories is not None:
            raise ValueError("Specify fileids or categories, not both")

        if categories is not None:
            return self.fileids(categories)
        return fileids

    def docs(self, fileids=None, categories=None):
        """
        Generator that yields the complete text of each selected document,
        closing each file after it is read so that only one document is
        held in memory at a time.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, loading one document into memory at a time.
        for path, encoding in self.abspaths(fileids, include_encoding=True):
            with codecs.open(path, 'r', encoding=encoding) as f:
                yield f.read()

    def sizes(self, fileids=None, categories=None):
        """
        Generator that yields the size on disk of each selected file, one
        integer per file (only the size is yielded, not the fileid).
        This function is used to detect oddly large files in the corpus.
        """
        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Create a generator, getting every path and computing filesize
        for path in self.abspaths(fileids):
            yield os.path.getsize(path)

In [61]:
import os
text_dir = os.path.join('..', 'data', 'texts')
# Raw string so the regex escape \. is passed through untouched; a plain
# '.+\.txt' contains an invalid string escape and warns on recent Pythons.
c = HTMLCorpusReader(text_dir, r'.+\.txt')

In [62]:
c.fileids() # Note inheritance from CategorizedCorpusReader()

['A-Alcott-Little_Women-1868-F.txt',
 'A-Cather-Antonia-1918-F.txt',
 'A-Chesnutt-Marrow-1901-M.txt',
 'A-Chopin-Awakening-1899-F.txt',
 'A-Crane-Maggie-1893-M.txt',
 'A-Davis-Life_Iron_Mills-1861-F.txt',
 'A-Dreiser-Sister_Carrie-1900-M.txt',
 'A-Freeman-Pembroke-1894-F.txt',
 'A-Gilman-Herland-1915-F.txt',
 'A-Harper-Iola_Leroy-1892-F.txt',
 'A-Hawthorne-Scarlet_Letter-1850-M.txt',
 'A-Howells-Silas_Lapham-1885-M.txt',
 'A-James-Golden_Bowl-1904-M.txt',
 'A-Jewett-Pointed_Firs-1896-F.txt',
 'A-London-Call_Wild-1903-M.txt',
 'A-Melville-Moby_Dick-1851-M.txt',
 'A-Norris-Pit-1903-M.txt',
 'A-Stowe-Uncle_Tom-1852-F.txt',
 'A-Twain-Huck_Finn-1885-M.txt',
 'A-Wharton-Age_Innocence-1920-F.txt',
 'B-Austen-Pride_Prejudice-1813-F.txt',
 'B-Bronte_C-Jane_Eyre-1847-F.txt',
 'B-Bronte_E-Wuthering_Heights-1847-F.txt',
 'B-Burney-Evelina-1778-F.txt',
 'B-Conrad-Heart_Darkness-1902-M.txt',
 'B-Dickens-Bleak_House-1853-M.txt',
 'B-Disraeli-Sybil-1845-M.txt',
 'B-Eliot-Middlemarch-1869-F.txt',
 'B-F

In [63]:
isinstance(c, HTMLCorpusReader)

True

In [64]:
issubclass(HTMLCorpusReader, CorpusReader)

True

In [65]:
#Generators are a pain to print or index!
print(c.sizes())

<generator object HTMLCorpusReader.sizes at 0x1a16e8fde0>


In [66]:
#Cannot index generators
c.sizes()[1]

TypeError: 'generator' object is not subscriptable

In [67]:
#Instead, cast to list (ick!) ...
sizes = list(c.sizes())
sizes[1]

440138

In [68]:
# ... or loop over values
for i in c.sizes():
    print(i)

1015333
440138
501350
362678
131011
81705
877726
439228
303622
400630
482635
698671
1158749
226008
175324
1214754
696565
1038920
562852
580020
693745
1024274
649624
850699
210664
1934468
895945
1790058
376769
997858
1027060
837176
220769
1680642
423462
360653
1709359
1899937
178886
363009


In [69]:
# Stepwise iteration with next()
x = c.sizes()
for i in range(3):
    print(next(x))

1015333
440138
501350
