In [None]:
# Stuff that will appear at the top of notebooks;
# You don't have to understand how this works or change it (for now!!).

#import warnings
#warnings.simplefilter(action="ignore", category=FutureWarning)

from urllib.request import urlopen 
import re

In [None]:
def read_url(url):
    """Download the text at ``url`` and normalise its whitespace.

    Parameters
    ----------
    url : str
        Location passed straight to ``urllib.request.urlopen``.

    Returns
    -------
    str
        The response body decoded with the default codec of
        ``bytes.decode()`` (UTF-8), with every run of whitespace
        (spaces, tabs, newlines) replaced by a single space.

    Raises
    ------
    urllib.error.URLError
        Propagated from ``urlopen`` when the resource cannot be opened.
    UnicodeDecodeError
        If the body is not valid UTF-8.
    """
    # The context manager closes the connection even when read()/decode()
    # raises; previously the response object was never closed.
    with urlopen(url) as response:
        raw_text = response.read().decode()
    return re.sub('\\s+', ' ', raw_text)

In [None]:
# Download two books and cut each one into a list of chapter texts.

# Where the plain-text copies live.
huck_finn_url = 'https://www.inferentialthinking.com/data/huck_finn.txt'
little_women_url = 'https://www.inferentialthinking.com/data/little_women.txt'

# Fetch both texts (read_url also collapses whitespace runs to single spaces).
huck_finn_text = read_url(huck_finn_url)
little_women_text = read_url(little_women_url)

# Split on the chapter marker and drop the leading pieces.
# NOTE(review): the first 44 pieces of Huckleberry Finn and the first piece of
# Little Women are presumably front matter / table-of-contents entries rather
# than chapter bodies -- confirm against the source texts.
huck_finn_chapters = huck_finn_text.split('CHAPTER ')[44:]
little_women_chapters = little_women_text.split('CHAPTER ')[1:]

# Introducing datascience Tables (from the Berkeley datascience package)

In [None]:
from datascience import *

In [None]:
# Show the installed datascience version; this notebook needs it to be <=0.17.5.
# NOTE(review): the reason for the upper bound is not recorded here -- confirm
# which call below fails on newer releases.
import datascience
datascience.__version__

In [None]:
# Display the Huckleberry Finn chapters as a one-column table labelled 'Chapters'.
Table().with_column('Chapters', huck_finn_chapters)

In [None]:
import numpy as np

In [None]:
# Occurrences of the substring 'Tom' in each Huckleberry Finn chapter
# (one count per element of huck_finn_chapters).
np.char.count(huck_finn_chapters, 'Tom')

In [None]:
# Occurrences of the substring 'Jim' in each Huckleberry Finn chapter
# (one count per element of huck_finn_chapters).
np.char.count(huck_finn_chapters, 'Jim')

In [None]:
# Draw matplotlib figures inline in the notebook output, expose pyplot under
# the name `plots`, and apply the 'fivethirtyeight' style sheet to later figures.
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')

In [None]:
# One column per name (Tom, Jim, Huck), one row per Huckleberry Finn chapter;
# each cell is the number of times that name's text occurs in the chapter.
# The list handed to with_columns alternates label, values, label, values, ...
counts = Table().with_columns([
    entry
    for character in ('Tom', 'Jim', 'Huck')
    for entry in (character, np.char.count(huck_finn_chapters, character))
])
counts

In [None]:
# Count how many times the names Jim, Tom, and Huck appear in each chapter.
# np.char.count counts substring matches, so a name is also counted when it
# occurs inside a longer word.

counts = Table().with_columns([
        'Jim', np.char.count(huck_finn_chapters, 'Jim'),
        'Tom', np.char.count(huck_finn_chapters, 'Tom'),
        'Huck', np.char.count(huck_finn_chapters, 'Huck')
    ])

# Plot the cumulative counts:
# how many times in Chapter 1, how many times in Chapters 1 and 2, and so on.
#
# The chapter numbers run from 1 to the number of chapters, derived from the
# chapter list instead of a hard-coded upper bound, and the x-axis column is
# picked by its label rather than its position, so this cell keeps working if
# the chapter split or the set of counted names changes.

cum_counts = counts.cumsum().with_column(
    'Chapter', np.arange(1, len(huck_finn_chapters) + 1)
)
cum_counts.plot(column_for_xticks='Chapter')
plots.title('Cumulative Number of Times Name Appears');

# Examining Little Women

In [None]:
# Display the Little Women chapters as a one-column table labelled 'Chapters'.
Table().with_column('Chapters', little_women_chapters)

In [None]:
# Per-chapter name counts for the main characters of Little Women.
# np.char.count counts substring matches, so a short name such as 'Jo' is also
# counted when it occurs inside a longer word.

people = ['Amy', 'Beth', 'Jo', 'Laurie', 'Meg']
people_counts = {
    person: np.char.count(little_women_chapters, person)
    for person in people
}

# One column per person, in the order given by `people`; the list handed to
# with_columns alternates label, values, label, values, ...
counts = Table().with_columns([
    entry
    for person in people
    for entry in (person, people_counts[person])
])

In [None]:
# Plot the cumulative counts: the running total of each name's occurrences
# after Chapter 1, after Chapters 1 and 2, and so on.
#
# The chapter numbers run from 1 to the number of chapters, derived from the
# chapter list instead of a hard-coded upper bound, and the x-axis column is
# picked by its label rather than its position, so this cell keeps working if
# the chapter split or the list of people changes.

cum_counts = counts.cumsum().with_column(
    'Chapter', np.arange(1, len(little_women_chapters) + 1)
)
cum_counts.plot(column_for_xticks='Chapter')
plots.title('Cumulative Number of Times Name Appears');

# Examine sentence length

In [None]:
# For every chapter record two numbers:
#   * its "length", i.e. the total number of characters it contains;
#   * the number of periods ('.') it contains.
# One table per book, one row per chapter.

chars_periods_hf = Table().with_columns([
    'HF Chapter Length', list(map(len, huck_finn_chapters)),
    'Number of Periods', np.char.count(huck_finn_chapters, '.'),
])
chars_periods_lw = Table().with_columns([
    'LW Chapter Length', list(map(len, little_women_chapters)),
    'Number of Periods', np.char.count(little_women_chapters, '.'),
])

In [None]:
# The counts for Huckleberry Finn:
# per-chapter character length and number of periods.

chars_periods_hf

In [None]:
# The counts for Little Women:
# per-chapter character length and number of periods.

chars_periods_lw

In [None]:
# Scatter plot comparing the two books: every dot is one chapter, placed by
# its number of periods (x) and its number of characters (y).
# Columns are addressed by label instead of position, and each series carries
# a legend entry so the figure can be read without the surrounding code.
plots.figure(figsize=(10, 10))
plots.scatter(
    chars_periods_hf['Number of Periods'],
    chars_periods_hf['HF Chapter Length'],
    color='darkblue',
    label='Huckleberry Finn',
)
plots.scatter(
    chars_periods_lw['Number of Periods'],
    chars_periods_lw['LW Chapter Length'],
    color='gold',
    label='Little Women',
)
plots.xlabel('Number of periods in chapter')
plots.ylabel('Number of characters in chapter')
plots.legend();

In [None]:
# Histogram of a per-chapter "Sentence Length" for Little Women, computed as
# the chapter's character count divided by its number of periods.
# NOTE(review): static_plots() presumably switches Table plotting to static
# (non-interactive) figures -- confirm against the datascience docs.
Table.static_plots()
chars_periods_lw.with_columns(
    "Sentence Length",
    chars_periods_lw.column('LW Chapter Length')
    / chars_periods_lw.column('Number of Periods'),
).hist("Sentence Length")

In [None]:
# Histogram of a per-chapter "Sentence Length" for Huckleberry Finn, computed
# as the chapter's character count divided by its number of periods.
# NOTE(review): static_plots() presumably switches Table plotting to static
# (non-interactive) figures -- confirm against the datascience docs.
Table.static_plots()
chars_periods_hf.with_columns(
    "Sentence Length",
    chars_periods_hf.column('HF Chapter Length')
    / chars_periods_hf.column('Number of Periods'),
).hist("Sentence Length")