In [1]:
authors = ['humbert', 'kinbote', 'v', 'canto', 'foreword']
potential_authors = ['humbert', 'kinbote', 'v']
disputed = 'canto'
file_range_map = {
    'humbert': range(1,6),
    'kinbote': range(1,6),
    'v': range(1,6),
    'canto': range(1,5),
    'foreword': range(1,5)
}

# A function that compiles all of the text files associated with a single author into a single string
def read_files_into_string(filename):
    strings = []
    for i in file_range_map[filename]:
        with open('{}_{}.txt'.format(filename, i)) as f:
            strings.append(f.read())
    return '\n'.join(strings)


# Make a dictionary out of the authors' corpora
work_by_author = {}
for author in authors:
    work_by_author[author] = read_files_into_string(author)

# Load nltk
import nltk
matplotlib.use('TkAgg')

# Transform the authors' corpora into lists of word tokens
work_by_author_tokens = {}
work_by_author_length_distributions = {}
for author in authors:
    tokens = nltk.word_tokenize(work_by_author[author])
    work_by_author_tokens[author] = ([token for token in tokens if any(c.isalpha() for c in token)])


# Combine every paper except our test case into a single corpus
whole_corpus = []
for author in potential_authors:
    whole_corpus += work_by_author_tokens[author]
    
# Get a frequency distribution
whole_corpus_freq_dist = list(nltk.FreqDist(whole_corpus).most_common(50))

# The main data structure
features = [word for word,freq in whole_corpus_freq_dist]
feature_freqs = {}

for author in potential_authors:
    # A dictionary for each candidate's features
    feature_freqs[author] = {}
    
    # A helper value containing the number of tokens in the author's subcorpus
    overall = len(work_by_author_tokens[author])
    
    # Calculate each feature's presence in the subcorpus
    for feature in features:
        presence = work_by_author_tokens[author].count(feature)
        feature_freqs[author][feature] = presence / overall

import math

# The data structure into which we will be storing the "corpus standard" statistics
corpus_features = {}

# For each feature...
for feature in features:
    # Create a sub-dictionary that will contain the feature's mean 
    # and standard deviation
    corpus_features[feature] = {}
    
    # Calculate the mean of the frequencies expressed in the subcorpora
    feature_average = 0
    for author in potential_authors:
        feature_average += feature_freqs[author][feature]
    feature_average /= len(authors)
    corpus_features[feature]["Mean"] = feature_average
    
    # Calculate the standard deviation using the basic formula for a sample
    feature_stdev = 0
    for author in potential_authors:
        diff = feature_freqs[author][feature] - corpus_features[feature]["Mean"]
        feature_stdev += diff*diff
    feature_stdev /= (len(authors) - 1)
    feature_stdev = math.sqrt(feature_stdev)
    corpus_features[feature]["StdDev"] = feature_stdev

    
feature_zscores = {}
for author in potential_authors:
    feature_zscores[author] = {}
    for feature in features:
        
        # Z-score definition = (value - mean) / stddev
        # We use intermediate variables to make the code easier to read
        feature_val = feature_freqs[author][feature]
        feature_mean = corpus_features[feature]["Mean"]
        feature_stdev = corpus_features[feature]["StdDev"]
        feature_zscores[author][feature] = ((feature_val-feature_mean) / 
                                            feature_stdev)

# Tokenize the test case
testcase_tokens = work_by_author_tokens[disputed]
 
# Calculate the test case's features
overall = len(testcase_tokens)
testcase_freqs = {}
for feature in features:
    presence = testcase_tokens.count(feature)
    testcase_freqs[feature] = presence / overall
    
# Calculate the test case's feature z-scores
testcase_zscores = {}
for feature in features:
    feature_val = testcase_freqs[feature]
    feature_mean = corpus_features[feature]["Mean"]
    feature_stdev = corpus_features[feature]["StdDev"]
    testcase_zscores[feature] = (feature_val - feature_mean) / feature_stdev

for author in potential_authors:
    delta = 0
    for feature in features:
        delta += math.fabs((testcase_zscores[feature] - 
                            feature_zscores[author][feature]))
    delta /= len(features)
    print( "Delta score for candidate", author, "is", delta )


Delta score for candidate humbert is 1.1486340383396685
Delta score for candidate kinbote is 0.9518314112226345
Delta score for candidate v is 1.058999372507057
