In [18]:
import re
import requests
import string
import numpy as np
import sympy as sp
from bs4 import BeautifulSoup

### Build the columns of our future document term matrix

In [15]:
# Scrape first link
response = requests.get("https://www.poetryfoundation.org/poems/51900/londons-summer-morning")
soup = BeautifulSoup(response.content, 'html.parser')
divs = soup.find_all('div', class_ = "poem-body overflow-x-auto")

# Clean divs
for div in divs:
    poem_text = div.text.lower()
    poem_text_no_punc = poem_text.translate(str.maketrans('', '', string.punctuation))
    poem = (poem_text_no_punc.split()) # Lower text and include no punctuation when we append

poem

['who',
 'has',
 'not',
 'waked',
 'to',
 'list',
 'the',
 'busy',
 'sounds',
 'of',
 'summer’s',
 'morning',
 'in',
 'the',
 'sultry',
 'smoke',
 'of',
 'noisy',
 'london',
 'on',
 'the',
 'pavement',
 'hot',
 'the',
 'sooty',
 'chimneyboy',
 'with',
 'dingy',
 'face',
 'and',
 'tattered',
 'covering',
 'shrilly',
 'bawls',
 'his',
 'trade',
 'rousing',
 'the',
 'sleepy',
 'housemaid',
 'at',
 'the',
 'door',
 'the',
 'milkpail',
 'rattles',
 'and',
 'the',
 'tinkling',
 'bell',
 'proclaims',
 'the',
 'dustman’s',
 'office',
 'while',
 'the',
 'street',
 'is',
 'lost',
 'in',
 'clouds',
 'impervious',
 'now',
 'begins',
 'the',
 'din',
 'of',
 'hackneycoaches',
 'waggons',
 'carts',
 'while',
 'tinmen’s',
 'shops',
 'and',
 'noisy',
 'trunkmakers',
 'knifegrinders',
 'coopers',
 'squeaking',
 'corkcutters',
 'fruitbarrows',
 'and',
 'the',
 'hungergiving',
 'cries',
 'of',
 'vegetablevendors',
 'fill',
 'the',
 'air',
 'now',
 'every',
 'shop',
 'displays',
 'its',
 'varied',
 'trade'

In [17]:
# Remove stopwords
stopword_list = ['the', 'and', 'is', 'in', 'to', 'of']
poem_no_stopword = [word for word in poem if word not in stopword_list]

218

### Build the rows of our future document term matrix

In this exercise I am going to define document by sentences of the poem which are split based on punctuation. Our first step in building the document term matrix is to define and pull our documents. 

In [78]:
response = requests.get("https://www.poetryfoundation.org/poems/51900/londons-summer-morning")
soup = BeautifulSoup(response.content, 'html.parser')
divs = soup.find_all('div', class_ = "poem-body overflow-x-auto")

# Clean divs
for div in divs:
    poem_text = div.text.lower()
    # Later note: Needed to remove parentheses since this messed up the sorted structure a few blocks down
    poem_text = re.split(r'[.!?()]', poem_text) # Use regex split so we can split by multiple punctuation types

poem_text # Now we have multiple topics "documents split by punctuation"

['who has not waked to list the busy sounds of summer’s morning, in the sultry smoke of noisy london',
 ' on the pavement hot the sooty chimney-boy, with dingy face and tattered covering, shrilly bawls his trade, rousing the sleepy housemaid',
 ' at the door the milk-pail rattles, and the tinkling bell proclaims the dustman’s office; while the street is lost in clouds impervious',
 ' now begins the din of hackney-coaches, waggons, carts; while tinmen’s shops, and noisy trunk-makers, knife-grinders, coopers, squeaking cork-cutters, fruit-barrows, and the hunger-giving cries of vegetable-vendors, fill the air',
 ' now every shop displays its varied trade, and the fresh-sprinkled pavement cools the feet of early walkers',
 ' at the private door the ruddy housemaid twirls the busy mop, annoying the smart ’prentice, or neat girl, tripping with band-box lightly',
 ' now the sun darts burning splendor on the glittering pane, save where the canvas awning throws a shade on the gay merchandise',

The next step is to build a vocabulary of unique words. We want to do this because we will use the unique words that appear in the dataset to construct our column index in the document-term matrix.

To do this we will use a "global vocabulary" and make a set of the unqiue words from the entire document.

In [66]:
unique_poem_set = set()

for sentence in poem_text:
    words = sentence.split() # Split the 13 sentences into words
    for word in words:
        unique_poem_set.update(words) # Create a set of words that and .update() the set for each iteration

In [79]:
unique_poem_sorted = sorted(unique_poem_set)
unique_poem_sorted

[',',
 'a',
 'abyss',
 'air',
 'all',
 'along',
 'and',
 'annoying',
 'area',
 'at',
 'awning',
 'bag',
 'band-box',
 'base',
 'bawls',
 'bears',
 'beauty',
 'begins',
 'bell',
 'burning',
 'busy',
 'canvas',
 'carts;',
 'catch',
 'charm',
 'chimney-boy,',
 'clouds',
 'cools',
 'coopers,',
 'cork-cutters,',
 'covering,',
 'cries',
 'dainties',
 'damsel;',
 'darts',
 'din',
 'dingy',
 'discordant',
 'displays',
 'domestic',
 'door',
 'dreams,',
 'dustman’s',
 'early',
 'enthrall',
 'every',
 'eye',
 'face',
 'feet',
 'fill',
 'for',
 'fresh-sprinkled',
 'from',
 'fruit-barrows,',
 'gay',
 'girl,',
 'glittering',
 'green',
 'hackney-coaches,',
 'half',
 'half-filled',
 'half-worn',
 'has',
 'his',
 'hot',
 'housemaid',
 'huge',
 'humming',
 'hunger-giving',
 'impervious',
 'in',
 'industry',
 'insects,',
 'is',
 'its',
 'knife-grinders,',
 'ladder,',
 'lamp-lighter',
 'lamps,',
 'lightly',
 'limy',
 'list',
 'load',
 'london',
 'lost',
 'merchandise',
 'milk-pail',
 'minute',
 'monotonou

### Build the document-term matrix

Observation: The document-term matrix should have dimensions number of documents x number of words so I will take len() of these before I initialize the zero matrix and begin updating entries. 

In [81]:
# Check lengths

len(unique_poem_sorted) # 188 words
len(poem_text) # 17 documents

188

In [83]:
# Initialize a 17 x 187 zero matrix

dtm = np.zeros((17, 188))
dtm

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [86]:
# Build term frequencies into the matrix



who has not waked to list the busy sounds of summer’s morning, in the sultry smoke of noisy london
 on the pavement hot the sooty chimney-boy, with dingy face and tattered covering, shrilly bawls his trade, rousing the sleepy housemaid
 at the door the milk-pail rattles, and the tinkling bell proclaims the dustman’s office; while the street is lost in clouds impervious
 now begins the din of hackney-coaches, waggons, carts; while tinmen’s shops, and noisy trunk-makers, knife-grinders, coopers, squeaking cork-cutters, fruit-barrows, and the hunger-giving cries of vegetable-vendors, fill the air
 now every shop displays its varied trade, and the fresh-sprinkled pavement cools the feet of early walkers
 at the private door the ruddy housemaid twirls the busy mop, annoying the smart ’prentice, or neat girl, tripping with band-box lightly
 now the sun darts burning splendor on the glittering pane, save where the canvas awning throws a shade on the gay merchandise
 now, spruce and trim, in s