In [1]:
import os, sys
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import grid_search
from sklearn.metrics import classification_report



In [14]:
def cleanDocument(file):
    lines = file.split("\n")
    n = len(lines)
    begin, end = n//3, 2*n//3
    lines = lines[begin:end]
    lines = (line for line in lines if len(line)>0)
    return "\n".join(lines)

In [5]:
authors_dict = {}
def loadData(path):
    documents, authors = [], []
    subfolders = [subfolder for subfolder in os.listdir(path) if os.path.isdir(os.path.join(path, subfolder))]
    for i, subfolder in enumerate(subfolders):
        subpath = os.path.join(path, subfolder)
        authors_dict[i] = subfolder
        for f_name in os.listdir(subpath):
            if not f_name.endswith('.txt'):
                continue
            with open(os.path.join(subpath, f_name), 'r', encoding='utf-8') as f:
                cleanedText = cleanDocument(f.read())
                documents.append(cleanedText)
                authors.append(i)
    return documents, np.array(authors, dtype='int')
documents_all, authors_all = loadData('datasets')
documents, documents_test, authors, authors_test = train_test_split(documents_all, authors_all, test_size=0.2, random_state=5)

In [7]:
function_words = ["a", "able", "aboard", "about", "above", "absent","according" , "accordingly", "across", "after", "against",
                  "ahead", "albeit", "all", "along", "alongside", "although", "am", "amid", "amidst", "among", "amongst", "amount", "an",
                  "and", "another", "anti", "any", "anybody", "anyone", "anything", "are", "around", "as", "aside", "astraddle",
                  "astride", "at", "away", "bar", "barring", "be", "because", "been", "before", "behind", "being", "below", "beneath",
                  "beside", "besides", "better", "between", "beyond", "bit", "both", "but", "by", "can", "certain", "circa", "close",
                  "concerning", "consequently", "considering", "could", "couple", "dare", "deal", "despite", "down", "due", "during",
                  "each", "eight", "eighth", "either", "enough", "every", "everybody", "everyone", "everything", "except", "excepting",
                  "excluding", "failing", "few", "fewer", "fifth", "first", "five", "following", "for", "four", "fourth", "from", "front",
                  "given", "good", "great", "had", "half", "have", "he", "heaps", "hence", "her", "hers", "herself", "him", "himself",
                  "his", "however", "i", "if", "in", "including", "inside", "instead", "into", "is", "it", "its", "itself", "keeping",
                  "lack", "less", "like", "little", "loads", "lots", "majority", "many", "masses", "may", "me", "might", "mine", "minority",
                  "minus", "more", "most", "much", "must", "my", "myself", "near", "need", "neither", "nevertheless", "next", "nine",
                  "ninth", "no", "nobody", "none", "nor", "nothing", "notwithstanding", "number", "numbers", "of", "off", "on",
                  "once", "one", "onto", "opposite", "or", "other", "ought", "our", "ours", "ourselves", "out", "outside", "over", "part",
                  "past", "pending", "per", "pertaining", "place", "plenty", "plethora", "plus", "quantities", "quantity", "quarter",
                  "regarding", "remainder", "respecting", "rest", "round", "save", "saving", "second", "seven", "seventh", "several",
                  "shall", "she", "should", "similar", "since", "six", "sixth","so", "some", "somebody", "someone", "something", "spite",
                  "such", "ten", "tenth", "than", "thanks", "that", "the", "their", "theirs", "them", "themselves", "then", "thence",
                  "therefore", "these", "they", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus",
                  "till", "time", "to", "tons", "top", "toward", "towards", "two", "under", "underneath", "unless", "unlike", "until",
                  "unto", "up", "upon", "us", "used", "various", "versus", "via", "view", "wanting", "was", "we", "were", "what",
                  "whatever", "when", "whenever", "where", "whereas", "wherever", "whether", "which", "whichever", "while",
                  "whilst", "who", "whoever", "whole", "whom", "whomever", "whose", "will", "with", "within", "without", "would", "yet",
                  "you", "your", "yours", "yourself", "yourselves"]

In [9]:
def svmWithExtractor(extractor):
    X = extractor.fit_transform(documents)
    vocs = extractor.vocabulary_
    X_train, X_val, y_train, y_val = train_test_split(X, authors, test_size=0.2, random_state=1)
    parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    svm = SVC()
    grid = grid_search.GridSearchCV(svm, parameters)
    grid.fit(X_train, y_train)
    print('Best parameters:', grid.best_params_)
    y_true, y_pred = y_val, grid.predict(X_val)
    print(classification_report(y_true, y_pred))
    return grid, vocs
print('Classification with function words:')
svmFW, _ = svmWithExtractor(CountVectorizer(vocabulary=function_words))
print('Classification with unigram and bigram:')
svmNG, NG_vocs = svmWithExtractor(CountVectorizer(analyzer='word', ngram_range=(1, 2)))
print('Classification with unigram tfidf:')

Classification with function words:
Best parameters: {'C': 1, 'kernel': 'linear'}
             precision    recall  f1-score   support

          0       1.00      1.00      1.00         1
          1       1.00      1.00      1.00         2
          2       1.00      0.80      0.89         5
          3       0.60      1.00      0.75         3
          4       1.00      0.75      0.86         4

avg / total       0.92      0.87      0.87        15

Classification with unigram and bigram:
Best parameters: {'C': 1, 'kernel': 'linear'}
             precision    recall  f1-score   support

          0       1.00      1.00      1.00         1
          1       0.67      1.00      0.80         2
          2       1.00      0.80      0.89         5
          3       0.50      0.67      0.57         3
          4       1.00      0.75      0.86         4

avg / total       0.86      0.80      0.81        15



In [10]:
def predictWithExtractor(svm, extractor):
    X = extractor.fit_transform(documents_test)
    y_preds = svm.predict(X)
    for i, y_pred in enumerate(y_preds):
        y_true = authors_test[i]
        print('True author is %s, predict author is %s' % (authors_dict[y_true], authors_dict[y_pred]))
        
print('Prediction with function words:')
predictWithExtractor(svmFW, CountVectorizer(vocabulary=function_words))
print('\nPrediction with unigram and bigram:')
predictWithExtractor(svmNG, CountVectorizer(analyzer='word', ngram_range=(1, 2), vocabulary = NG_vocs))

Prediction with function words:
True author is JulesVerne, predict author is JulesVerne
True author is JulesVerne, predict author is JulesVerne
True author is MarkTwain, predict author is MarkTwain
True author is MarkTwain, predict author is MarkTwain
True author is Shakespeare, predict author is Shakespeare
True author is CharlesDickens, predict author is MarkTwain
True author is BoothTarkington, predict author is BoothTarkington
True author is JulesVerne, predict author is JulesVerne
True author is JulesVerne, predict author is JulesVerne
True author is CharlesDickens, predict author is CharlesDickens
True author is JulesVerne, predict author is JulesVerne
True author is CharlesDickens, predict author is JulesVerne
True author is CharlesDickens, predict author is CharlesDickens
True author is CharlesDickens, predict author is CharlesDickens
True author is CharlesDickens, predict author is CharlesDickens
True author is CharlesDickens, predict author is BoothTarkington
True author is B

In [11]:
def helper(arr, n):
    ret = [0]*n
    for i in arr:
        if i >= n:
            ret[n-1] += 1
        elif i > 0:
            ret[i-1] += 1
    return ret
def sentenceLengths(tokenized_sentences):  
    t = map(len, tokenized_sentences)
    return helper(t, 26)
def wordLengths(words):
    t = map(len, words)
    return helper(t, 18)
def pronounPerSentence(tokenized_sentences):
    pronouns = set(['something', 'thou', 'everybody', 'all', 'anything', 'your', 'ours', 'her', 'myself', 'him', 
                    'everything', 'I', 'nobody', 'somebody', 'whomever', 'who', 'everyone', 'none', 'each', 'thee', 
                    'thy', 'anybody', 'nothing', 'this', 'one', 'our', 'his', 'we', 'yourself', 'they', 'another', 
                    'himself', 'me', 'several', 'hers', 'no one', 'ourselves', 'both', 'some', 'itself', 'my', 
                    'whose', 'these', 'other', 'either', 'someone', 'few', 'themselves', 'thine', 'whichever', 
                    'neither', 'as', 'he', 'she', 'theirs', 'which', 'such', 'mine', 'whom', 'that', 'yourselves', 
                    'whoever', 'what', 'those', 'others', 'whatever', 'it', 'them', 'us', 'anyone', 'their', 'most', 
                    'yours', 'any', 'many', 'herself', 'you'])
    def countPronouns(sentence):
        cnt = 0
        for word in sentence:
            if word in pronouns:
                cnt += 1
        return cnt
    pronounCnts = map(countPronouns, tokenized_sentences)
    return helper(pronounCnts, 10)
def conujnctionPerSentence(tokenized_sentences):
    conjunctions = set(['and','that','but','or','as','if','when','than','because','while','where',
                        'after','so','though','since','until','whether','before','although','nor','like',
                        'once','unless','now','except'])
    def countConjunctions(sentence):
        cnt = 0
        for word in sentence:
            if word in conjunctions:
                cnt += 1
        return cnt
    conjunctionCnts = map(countConjunctions, tokenized_sentences)
    return helper(conjunctionCnts, 7)
def customFeatureExtractor(document):
    features = []
    # number of unique words
    word_tokenizer = RegexpTokenizer(r'\w+')
    words = word_tokenizer.tokenize(document.lower())
    features.append(len(set(words)))
    # sentence lengths
    sentences = sent_tokenize(document.lower())
    tokenized_sentences = list(map(word_tokenizer.tokenize, sentences))
    sl = sentenceLengths(tokenized_sentences)
    features.extend(sl)
    # word lengths
    wl = wordLengths(words)
    features.extend(wl)
    # pronoun per sentences
    pps = pronounPerSentence(tokenized_sentences)
    features.extend(pps)
    # conjunction per sentences
    cps = conujnctionPerSentence(tokenized_sentences)
    features.extend(cps)
    return features

In [12]:
def svmCustom():
    X = np.asarray(list(map(customFeatureExtractor, documents)))
    X_train, X_val, y_train, y_val = train_test_split(X, authors, test_size=0.2, random_state=1)
    parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
    svm = SVC()
    grid = grid_search.GridSearchCV(svm, parameters)
    grid.fit(X_train, y_train)
    print('Best parameters:', grid.best_params_)
    y_true, y_pred = y_val, grid.predict(X_val)
    print(y_pred)
    print(classification_report(y_true, y_pred))
    return grid
svmCF = svmCustom()

Best parameters: {'C': 1, 'kernel': 'linear'}
[2 2 1 4 3 1 3 4 3 3 3 2 2 0 4]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00         1
          1       0.50      0.50      0.50         2
          2       1.00      0.80      0.89         5
          3       0.60      1.00      0.75         3
          4       1.00      0.75      0.86         4

avg / total       0.85      0.80      0.81        15



In [13]:
X = np.asarray(list(map(customFeatureExtractor, documents_test)))
y_preds = svmCF.predict(X)
for i, y_pred in enumerate(y_preds):
    y_true = authors_test[i]
    print('True author is %s, predict author is %s' % (authors_dict[y_true], authors_dict[y_pred]))

True author is JulesVerne, predict author is JulesVerne
True author is JulesVerne, predict author is MarkTwain
True author is MarkTwain, predict author is CharlesDickens
True author is MarkTwain, predict author is MarkTwain
True author is Shakespeare, predict author is Shakespeare
True author is CharlesDickens, predict author is BoothTarkington
True author is BoothTarkington, predict author is BoothTarkington
True author is JulesVerne, predict author is JulesVerne
True author is JulesVerne, predict author is MarkTwain
True author is CharlesDickens, predict author is CharlesDickens
True author is JulesVerne, predict author is JulesVerne
True author is CharlesDickens, predict author is CharlesDickens
True author is CharlesDickens, predict author is CharlesDickens
True author is CharlesDickens, predict author is JulesVerne
True author is CharlesDickens, predict author is CharlesDickens
True author is CharlesDickens, predict author is CharlesDickens
True author is BoothTarkington, predict 