In [1]:
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

from fasterstylometry.burrows_delta import BurrowsDelta, LazyBurrowsDelta
from fasterstylometry.corpus import LazyCorpus, Corpus

## Loading Books

We can load in some sample books from project gutenberg. Into the reference set we will load three works by Jane Austen and three works by Shakespeare but two of the words will be different editions of the same story. The test set will contain one work by Jane Austen, one by Shakespeare and one by Charles Dickens.

Depending on how Gutenbergs mirrors are faring when you try to download the books, you may have to select a [mirror from their list](https://www.gutenberg.org/MIRRORS.ALL).

In [2]:
mirror = 'http://gutenberg.pglaf.org/'

reference_ids = [1342, 161, 158, 1513, 27761, 1524]
reference_data = {
    'authors': ['Jane Austen', 'Jane Austen', 'Jane Austen', 'William Shakespeare', 'William Shakespeare', 'William Shakespeare'],
    'titles': ['Pride and Prejudice', 'Sense and Sensibility', 'Emma', 'Romeo and Juliet', 'Hamlet', 'Hamlet'],
    'texts': [strip_headers(load_etext(id, mirror=mirror)).strip() for id in reference_ids]
}

test_ids = [105, 23042, 1400]
test_data = {
    'authors': ['Jane Austen', 'William Shakespeare', 'Charles Dickens'],
    'titles': ['Persuasion', 'The Temptest', 'Great Expectations'],
    'texts': [strip_headers(load_etext(id, mirror=mirror)).strip() for id in test_ids]
}

In [3]:
reference_corpus = Corpus(**reference_data)
test_corpus = Corpus(**test_data)

lazy_reference_corpus = LazyCorpus(**reference_data)
lazy_test_corpus = LazyCorpus(**test_data)

In [4]:
delta = BurrowsDelta(reference_corpus, test_corpus)
delta.document_deltas

index,authors,titles,index_test,authors_test,titles_test,burrows_delta
str,str,str,str,str,str,f64
"""2cb421cb-40c1-4b38-8fd0-edfe9c…","""William Shakespeare""","""Romeo and Juliet""","""cf428c01-e2b2-40cf-b9a3-808b70…","""William Shakespeare""","""The Temptest""",8.156757
"""b7a42d1d-ee13-4027-8317-03aab0…","""William Shakespeare""","""Hamlet""","""cf428c01-e2b2-40cf-b9a3-808b70…","""William Shakespeare""","""The Temptest""",13.794595
"""e9b0b300-4384-439c-aac1-a4bce8…","""William Shakespeare""","""Hamlet""","""cf428c01-e2b2-40cf-b9a3-808b70…","""William Shakespeare""","""The Temptest""",18.167568
"""08d8dc84-7ae4-4abd-9b4a-8e5769…","""Jane Austen""","""Sense and Sensibility""","""df942dbe-dccc-4b4a-abb2-02c317…","""Jane Austen""","""Persuasion""",56.259459
"""eaae09dd-371a-4fb4-931a-ec3d67…","""Jane Austen""","""Pride and Prejudice""","""df942dbe-dccc-4b4a-abb2-02c317…","""Jane Austen""","""Persuasion""",68.216216
…,…,…,…,…,…,…
"""b7a42d1d-ee13-4027-8317-03aab0…","""William Shakespeare""","""Hamlet""","""df942dbe-dccc-4b4a-abb2-02c317…","""Jane Austen""","""Persuasion""",1.1608e7
"""08d8dc84-7ae4-4abd-9b4a-8e5769…","""Jane Austen""","""Sense and Sensibility""","""4caf9ad0-499b-4189-a726-a703fe…","""Charles Dickens""","""Great Expectations""",1.1608e7
"""e9b0b300-4384-439c-aac1-a4bce8…","""William Shakespeare""","""Hamlet""","""df942dbe-dccc-4b4a-abb2-02c317…","""Jane Austen""","""Persuasion""",1.1608e7
"""eaae09dd-371a-4fb4-931a-ec3d67…","""Jane Austen""","""Pride and Prejudice""","""4caf9ad0-499b-4189-a726-a703fe…","""Charles Dickens""","""Great Expectations""",1.1608e7


In [5]:
lazy_delta = LazyBurrowsDelta(lazy_reference_corpus, lazy_test_corpus)
lazy_delta.document_deltas

DuplicateError: the name 'tokens' is duplicate

It's possible that multiple expressions are returning the same default column name. If this is the case, try renaming the columns with `.alias("new_name")` to avoid duplicate column names.