# NLP Final Project: Authorship Attribution

## Yuval Timen

We will be exploring the task of Authorship Attribution - trying to assign to each piece of text the author who wrote it. We will be looking at the works of Aristotle, Friedrich Nietzsche, William Shakespeare, Henry Wadsworth Longfellow, Percy Bysshe Shelley, Winston Churchill (the American author, not the British Prime Minister), and Robert Browning. All data was collected from Project Gutenberg.

In [6]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

## Import the data

In [19]:
# Importing the raw data
# Each data file is read whole into a single string.
#
# The encoding is passed explicitly so the notebook decodes the corpora the
# same way on every platform instead of depending on the locale default.
# NOTE(review): assumes the files in ./data_files are UTF-8 - confirm.
with open('./data_files/aristotle.txt', 'r', encoding='utf-8') as f1:
    data_aristotle = f1.read()

with open('./data_files/browning.txt', 'r', encoding='utf-8') as f2:
    data_browning = f2.read()

with open('./data_files/churchill.txt', 'r', encoding='utf-8') as f3:
    data_churchill = f3.read()

with open('./data_files/longfellow.txt', 'r', encoding='utf-8') as f4:
    data_longfellow = f4.read()

with open('./data_files/nietzsche.txt', 'r', encoding='utf-8') as f5:
    data_nietzsche = f5.read()

with open('./data_files/shakespeare.txt', 'r', encoding='utf-8') as f6:
    data_shakespeare = f6.read()

with open('./data_files/shelley.txt', 'r', encoding='utf-8') as f7:
    data_shelley = f7.read()


# Show the distinct characters in one corpus to see what cleaning is needed.
# A string is already iterable, so no intermediate list is required.
print(set(data_shelley))

{'O', 'r', '\n', '4', '_', ' ', 'V', 'c', 'K', ')', 'z', 'M', ',', '3', 'Z', 'y', ';', '2', 'v', '0', 'h', 'Y', 'U', 'R', 's', '1', '.', '6', 'b', 'l', 'H', 'w', 'o', '*', '5', 'm', 'u', 'E', "'", 'd', '8', 'B', 'I', '?', 'f', 'J', 'A', '&', 'n', '-', 'D', 'N', 'X', 'T', 'p', '9', 'C', 'a', ']', 'P', 'F', '(', '!', '"', 'S', 'i', 'W', 'k', 'Q', 'q', '7', 'e', 'g', 'j', '=', 'L', ':', 'x', 't', '[', 'G'}


## Preprocessing and cleaning

In [None]:
# Clean our data and return it as a list of sentences

# This function takes in a String of data and
# returns a list of cleaned sentences. It removes
# all punctuation, symbols, and numbers, as well
# as quotation marks, folds accented letters to
# ASCII, and makes all upper-case into lower-case.
def clean_data(data):
    """Split ``data`` into sentences and normalise each one.

    The text is split wherever ``.``, ``!`` or ``?`` is followed by
    whitespace. This is a simple heuristic: abbreviations such as
    ``Mr. Smith`` are split too. Each sentence is then lower-cased,
    accented letters are folded to ASCII, punctuation and digits are
    replaced by spaces, single quotes that are not inside a word are
    dropped (so ``it's`` keeps its apostrophe), and runs of whitespace
    are collapsed.

    Args:
        data: The raw text of one corpus, as a single string.

    Returns:
        A list of cleaned sentence strings. Sentences that are empty
        after cleaning are omitted.
    """
    # Sentence boundaries must be found before punctuation is stripped,
    # because the terminators themselves are removed further down.
    regex_sentence_break = re.compile(r'(?<=[.!?])\s+')

    regex_numbers = re.compile(r'[0-9]')
    regex_punctuation = re.compile(
        r'([·£־§”$&*ᵒ%+=`.,;:_"\/\\\(\)\[\]!?<>]|--)')

    # A single quote is dropped unless it has a letter on both sides.
    # Lookarounds (rather than consuming the neighbours) also catch quotes
    # at the very start or end of a sentence and back-to-back quotes.
    # The text is already lower-case when this pattern is applied.
    regex_single_quote = re.compile(r"(?<![a-z])'|'(?![a-z])")

    # Some symbols can be replaced with ascii characters,
    # such as æ -> ae, or È -> e
    ascii_replacements = {
        'ae': 'Ææ',
        'c': 'Çç',
        'a': 'ÀÁÂÄàáâä',
        'e': 'ÉÊÈêéèēë',
        'oe': 'Œœ',
        'u': 'ÜÛüûùú',
        'i': 'ÏÎìïî',
        'o': 'ÔÖôöòó',
    }
    ascii_table = str.maketrans({
        accented: plain
        for plain, group in ascii_replacements.items()
        for accented in group
    })

    # Some words are entirely non-English, and must be transformed
    # to a special token.
    # TODO: not applied yet - the replacement token has not been chosen.
    # The character inventories are kept here for that step.
    regex_greek = 'ὔνΣῡὝύᾴαδθυοπὶἀὖἄἔἆωῥγτῃῆάζἰἤύόχέῶΘὸὴρόὃῑξφηΙμἐλΧςώὼιίάῦβήἈ᾽ἑὲὀεἴἱσκέ'
    regex_hebrew = 'ןזסשדטתאהלכרפךחמבצנףוםקעיץג'

    output = []

    for sent in regex_sentence_break.split(data):
        tmp = sent.lower().translate(ascii_table)
        tmp = regex_punctuation.sub(' ', tmp)
        tmp = regex_numbers.sub(' ', tmp)
        tmp = regex_single_quote.sub(' ', tmp)
        # Collapse the padding left behind by the substitutions above
        # (and any hard line breaks inside the sentence).
        tmp = ' '.join(tmp.split())
        if tmp:
            output.append(tmp)

    return output
    
    