In [24]:
from string import punctuation

import numpy as np
import pandas as pd

In [25]:
# Read the whole book and split it on whitespace into one token per row.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default. Plain "utf-8" keeps the leading byte order mark
# on the first token; a later cell strips it as "\ufeff".
with open("../data/alice-in-wonderland.txt", encoding="utf-8") as f:
    s = pd.Series(f.read().split())
s.head()

0         ﻿The
1      Project
2    Gutenberg
3        EBook
4           of
dtype: object

In [26]:
# Baseline: frequency table of the raw tokens, most frequent first
# (descending order is the value_counts default). Nothing is cleaned yet, so
# differently-cased or punctuated spellings are counted separately.
token_counts_raw = s.value_counts()
token_counts_raw.head()

the    732
and    362
a      321
to     311
of     300
Name: count, dtype: int64

In [27]:
# Same question, ignoring case: fold every token to lower case before
# counting so that e.g. "The" and "the" are pooled.
tokens_lowercase = s.str.lower()
tokens_lowercase.value_counts().head()

the    792
and    379
a      325
to     318
of     313
Name: count, dtype: int64

In [28]:
# Strip punctuation and try again: lower-case each token and remove any
# leading/trailing punctuation characters before counting, so that tokens such
# as "the," and "(the" are pooled with "the".
s.str.lower().str.strip(punctuation).value_counts(ascending=False).head()

the    807
and    404
a      328
to     327
of     318
Name: count, dtype: int64

In [29]:
# How many capitalised words does the book contain? (my version)
# After trimming punctuation from both ends, keep the tokens that start with
# A-Z and have at least one further character. str.match only anchors at the
# start, so the remainder of the token may contain anything.
starts_with_capital = s.str.strip(punctuation).str.match("[A-Z].+")
mycaps = s.loc[starts_with_capital]

In [None]:
# Book version: the fully anchored pattern requires the whole trimmed token to
# be one capital letter followed only by word characters (\w).
is_capitalised_word = s.str.strip(punctuation).str.contains(r"^[A-Z]\w+$")
theircaps = s.loc[is_capitalised_word]

In [None]:
# Xor of the two capitalised-word filters: the tokens that exactly one of them
# selects. My pattern accepts anything after the leading capital (apostrophes,
# hyphens, dots, ...), whereas the book's pattern only accepts word characters
# for the rest of the punctuation-stripped token.
trimmed = s.str.strip(punctuation)
mine = trimmed.str.match("[A-Z].+")
theirs = trimmed.str.contains(r"^[A-Z]\w+$", regex=True)
s.loc[mine ^ theirs]

110            ALICE'S
114              SAM'L
124              SAM'L
131            ALICE'S
136            I--DOWN
             ...      
12634     Gutenberg-tm
12657     Gutenberg-tm
12668     Gutenberg-tm
12687             U.S.
12730    Gutenberg-tm,
Length: 185, dtype: object

In [31]:
# How many capitalised words are there after stripping punctuation and quotes?
# (string.punctuation already contains both quote characters, so appending
# them is redundant but harmless.)
chars_to_trim = punctuation + "\"'"
capitalised = s.str.strip(chars_to_trim).str.match("[A-Z].+")
s.loc[capitalised].count()

1731

In [35]:
# average number of vowels per word (case-insensitive, vectorised regex count)
vowels_per_word = s.str.lower().str.count("[aeiou]")
vowels_per_word.mean()

1.66379377889211

In [None]:
# book version of vowel counting: a plain Python function applied per element
# instead of the vectorised .str.count - same result, more code, probably
# runs slower
def count_vowels(one_word):
    """Return how many characters of ``one_word`` are a, e, i, o or u, ignoring case."""
    return sum(1 for one_letter in one_word.lower() if one_letter in "aeiou")


# Run the per-word counter over every token and average the totals.
vowel_totals = s.apply(count_vowels)
vowel_totals.mean()

1.66379377889211

# Extension questions
1. What is the mean of all the integers in Alice?
2. What words in Alice don't appear in the dictionary? Which are the five most common such words?
3. What are the minimum and maximum words per paragraph?

In [40]:
# 1. mean of all integers in Alice
# Strip punctuation once, keep the tokens made up purely of decimal digits,
# convert them to int and average. isdecimal() is used rather than isdigit()
# because isdigit() also accepts characters such as superscript digits, which
# int() cannot parse and would make the conversion raise.
stripped_tokens = s.str.strip(punctuation)
nums = stripped_tokens.loc[stripped_tokens.str.isdecimal()].astype(int)
nums.mean()

8030.851851851852

In [None]:
# 2. what words don't appear in the dictionary?
# Load the word list: lower-case the whole file, trim surrounding whitespace
# and keep one entry per line.
with open("../data/words.txt") as f:
    dictionary_text = f.read()
words = pd.Series(dictionary_text.lower().strip().split("\n"))
words.head()

0        a
1        a
2       aa
3      aal
4    aalii
dtype: object

In [None]:
# Lower-case the book tokens, strip punctuation, spaces, tabs and the byte
# order mark ('\ufeff') from both ends, and keep each distinct token once.
# Then show the tokens that are not present in the word list.
junk_chars = punctuation + "\t \ufeff"
lower_words = s.str.lower().str.strip(junk_chars).drop_duplicates()
missing_from_dictionary = ~lower_words.isin(words)
lower_words.loc[missing_from_dictionary]

2                       gutenberg
3                           ebook
10                        carroll
27                   restrictions
37                         re-use
                   ...           
12646                      shared
12651                       years
12719                          pg
12722    http://www.gutenberg.org
12726                    includes
Length: 639, dtype: object

In [63]:
# 3. minimum and maximum number of words per paragraph
# premise: paragraphs just end in a blank line, i.e. two consecutive newlines
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open("../data/alice-in-wonderland.txt", encoding="utf-8") as f:
    paras = pd.Series(f.read().split("\n\n"))
paras.head()

0    ﻿The Project Gutenberg EBook of Alice in Wonde...
1    This eBook is for the use of anyone anywhere a...
2                         \nTitle: Alice in Wonderland
3                                Author: Lewis Carroll
4                         Illustrator: Gordon Robinson
dtype: object

In [64]:
# Put the paragraph series into a single-column dataframe; the column holding
# the number of words per paragraph is added in the next cell.
paragraph_columns = {"paragraphs": paras}
para_df = pd.DataFrame(paragraph_columns)
para_df.head()

Unnamed: 0,paragraphs
0,﻿The Project Gutenberg EBook of Alice in Wonde...
1,This eBook is for the use of anyone anywhere a...
2,\nTitle: Alice in Wonderland
3,Author: Lewis Carroll
4,Illustrator: Gordon Robinson


In [65]:
# Number of whitespace-separated words in each paragraph, computed with the
# vectorised string accessor (split on whitespace, then take the list length).
para_df["word_count"] = para_df["paragraphs"].str.split().str.len()
para_df.head()

Unnamed: 0,paragraphs,word_count
0,﻿The Project Gutenberg EBook of Alice in Wonde...,11
1,This eBook is for the use of anyone anywhere a...,44
2,\nTitle: Alice in Wonderland,4
3,Author: Lewis Carroll,3
4,Illustrator: Gordon Robinson,3


In [67]:
# Summary statistics of the per-paragraph word counts; the min and max rows
# answer question 3.
# NOTE(review): a minimum of 0 means some chunks produced by the "\n\n" split
# contain no words at all - presumably these should be filtered out before
# reporting a minimum; confirm.
word_counts = para_df["word_count"]
word_counts.describe()

count    393.000000
mean      32.475827
std       32.428415
min        0.000000
25%        7.000000
50%       22.000000
75%       48.000000
max      169.000000
Name: word_count, dtype: float64