## Solution

In [1]:
import re

# Build lowercase word lists for Frankenstein and Moby Dick.
# A "word" is a maximal run of ASCII letters; the letters are extracted first
# and lowercased afterwards.

with open("frankenstein.txt", "r") as f:
    frankenstein = f.read()
frankenstein_words = [token.lower() for token in re.findall("[A-Za-z]+", frankenstein)]

with open("moby_dick.txt", "r") as f:
    moby_dick = f.read()
moby_dick_words = [token.lower() for token in re.findall("[A-Za-z]+", moby_dick)]

In [2]:

def word_stats_slow(l1, l2):
    """Length statistics of the distinct words that occur in both l1 and l2.

    Deliberately naive baseline: the shared words are collected in a list, so
    each membership test is a scan of that list (and of l2 as passed in).

    Args:
        l1: sequence of words; shared words are kept in the order they first
            appear here.
        l2: container of words to test membership against.

    Returns:
        A tuple (count, mean, std): the number of distinct shared words, the
        mean of their lengths, and the population standard deviation of their
        lengths (squared deviations are divided by count, not count - 1).

    Raises:
        ZeroDivisionError: if l1 and l2 have no word in common.
    """
    # Distinct words of l1 that are also present in l2
    shared = []
    for candidate in l1:
        if candidate in shared or candidate not in l2:
            continue
        shared.append(candidate)

    n_shared = len(shared)

    # Mean word length
    mean = sum(len(item) for item in shared) / n_shared

    # Sum of squared deviations, accumulated one term at a time in list order
    squared_dev_total = 0
    for item in shared:
        squared_dev_total += (len(item) - mean) ** 2

    return n_shared, mean, (squared_dev_total / n_shared) ** 0.5


In [3]:
# This is REALLY slow: both arguments are plain lists, so every `in` test made
# by word_stats_slow is a linear scan.
word_stats_slow(frankenstein_words, moby_dick_words)

(5286, 6.943057132046916, 2.2687341813469475)

**Solution**: Use sets and list comprehensions to speed up the function.

In [4]:
def word_stats_fast(l1, l2):
    """Length statistics of the distinct words that occur in both l1 and l2.

    The shared vocabulary is obtained by intersecting two sets built from the
    inputs.

    Args:
        l1: iterable of hashable words.
        l2: iterable of hashable words.

    Returns:
        A tuple (count, mean, std): the number of distinct shared words, the
        mean of their lengths, and the population standard deviation of their
        lengths (squared deviations are divided by count, not count - 1).

    Raises:
        ZeroDivisionError: if l1 and l2 have no word in common.
    """
    shared = set(l1) & set(l2)
    lengths = [len(item) for item in shared]
    n_shared = len(lengths)

    mean = sum(lengths) / n_shared
    squared_devs = [(length - mean) ** 2 for length in lengths]
    std = (sum(squared_devs) / n_shared) ** 0.5

    return n_shared, mean, std

# Run on the same inputs as word_stats_slow so the two results can be compared.
word_stats_fast(frankenstein_words, moby_dick_words)

(5286, 6.943057132046916, 2.2687341813469537)

In [5]:
import numpy as np

def word_stats_fast_numpy(l1, l2):
    """Length statistics of the distinct words that occur in both l1 and l2.

    The shared vocabulary comes from a set intersection; the mean and standard
    deviation of the word lengths are computed by NumPy.

    Args:
        l1: iterable of hashable words.
        l2: iterable of hashable words.

    Returns:
        A tuple (count, mean, std): the number of distinct shared words, and
        the mean and standard deviation of their lengths as computed by NumPy
        with its default arguments.
    """
    shared = set(l1) & set(l2)
    lengths = np.array([len(item) for item in shared])
    return lengths.size, np.mean(lengths), np.std(lengths)

# Run on the same inputs as the other two versions so the results can be compared.
word_stats_fast_numpy(frankenstein_words, moby_dick_words)

(5286, 6.943057132046916, 2.268734181346992)

In [6]:
# Time the three implementations on the same inputs. `-r1` limits the slow
# version to a single repeat; the other two use %timeit's default repeats.
print("Slow:")
%timeit -r1 word_stats_slow(frankenstein_words, moby_dick_words)
print("Fast:")
%timeit word_stats_fast(frankenstein_words, moby_dick_words)
print("Numpy:") 
%timeit word_stats_fast_numpy(frankenstein_words, moby_dick_words)

Slow:
30.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
Fast:
24 ms ± 141 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Numpy:
23.7 ms ± 367 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
