In [4]:
# Each corpus name mapped to the numbers of its text files
# (range() excludes its upper bound, so range(1, 6) is files 1 through 5).
file_range_map = dict(
    humbert=range(1, 6),
    kinbote=range(1, 6),
    v=range(1, 6),
    canto=range(1, 5),
    foreword=range(1, 5),
)
# Corpus names, in the order they are declared in the mapping
authors = list(file_range_map)
# The corpus that every other corpus is compared against
disputed = 'foreword'

# Concatenate every numbered text file belonging to one author into one string
def read_files_into_string(filename):
    """Return the contents of all of ``filename``'s text files, newline-joined.

    ``filename`` must be a key of the module-level ``file_range_map``. For
    each number ``i`` in the mapped range, the file ``'<filename>_<i>.txt'``
    is opened by relative path and read in full; the pieces are joined with
    a single newline between them.
    """
    def _slurp(path):
        # Read one file completely, closing it as soon as we are done
        with open(path) as handle:
            return handle.read()

    return '\n'.join(
        _slurp('{}_{}.txt'.format(filename, number))
        for number in file_range_map[filename]
    )


# Map each author's name to the full text of their concatenated corpus
work_by_author = {name: read_files_into_string(name) for name in authors}

# Load nltk (used below for tokenizing and for frequency distributions)
import nltk
# Select matplotlib's TkAgg backend.
# NOTE(review): `matplotlib` is not imported anywhere in the visible code --
# presumably an earlier notebook cell imports it; confirm, otherwise this
# line raises NameError. Nothing else in the visible code uses matplotlib.
matplotlib.use('TkAgg')

# Candidate author names (this list is not referenced again in the visible code)
potential_authors = ['humbert', 'kinbote']

# Turn each author's corpus into a list of lowercased word tokens.
# `work_by_author_length_distributions` is only initialised here; the
# visible code never fills it.
work_by_author_tokens = {}
work_by_author_length_distributions = {}
for author in authors:
    tokens = nltk.word_tokenize(work_by_author[author])

    # Keep only tokens containing at least one alphabetic character (this
    # drops pure punctuation), and lowercase them so that the same word,
    # capitalized or not, counts as one word.
    work_by_author_tokens[author] = [
        word.lower()
        for word in tokens
        if any(map(str.isalpha, word))
    ]

# Calculate the chi-squared statistic between each corpus in `authors`
# and the disputed corpus.
# NOTE(review): the loop covers every entry of `authors`, which includes the
# disputed corpus itself (comparing it with itself always yields 0.0), and
# `potential_authors` is not used here -- confirm whether the loop was meant
# to be restricted to the candidate authors.

# The disputed corpus is the same in every comparison, so count its words once
disputed_tokens = work_by_author_tokens[disputed]
disputed_freq_dist = nltk.FreqDist(disputed_tokens)

for author in authors:
    author_tokens = work_by_author_tokens[author]

    # First, build a joint corpus and identify the 50 most frequent words in it
    joint_corpus = author_tokens + disputed_tokens
    joint_freq_dist = nltk.FreqDist(joint_corpus)
    most_common = list(joint_freq_dist.most_common(50))

    # What proportion of the joint corpus is made up
    # of the candidate author's tokens?
    author_share = len(author_tokens) / len(joint_corpus)

    # Count the author's words once, instead of rescanning the whole token
    # list with list.count() for every one of the common words.
    author_freq_dist = nltk.FreqDist(author_tokens)

    # Now, let's look at the 50 most common words in the joint corpus
    # and compare the number of times they can be observed in each corpus
    # to what would be expected if the author's papers
    # and the Disputed papers were both random samples from the same distribution.
    chisquared = 0
    for word, joint_count in most_common:

        # How often do we really see this common word?
        # (a frequency distribution reports 0 for a word it never saw)
        author_count = author_freq_dist[word]
        disputed_count = disputed_freq_dist[word]

        # How often should we see it?
        expected_author_count = joint_count * author_share
        expected_disputed_count = joint_count * (1 - author_share)

        # Add the word's contribution to the chi-squared statistic:
        # (observed - expected)^2 / expected, for each of the two corpora
        author_deviation = author_count - expected_author_count
        chisquared += (author_deviation * author_deviation
                       / expected_author_count)

        disputed_deviation = disputed_count - expected_disputed_count
        chisquared += (disputed_deviation * disputed_deviation
                       / expected_disputed_count)

    print("The Chi-squared statistic for candidate", author, "is", chisquared)

The Chi-squared statistic for candidate humbert is 103.10112417097942
The Chi-squared statistic for candidate kinbote is 71.26350902071468
The Chi-squared statistic for candidate v is 90.2092520619631
The Chi-squared statistic for candidate canto is 143.62251124185974
The Chi-squared statistic for candidate foreword is 0.0
