In [146]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from io import BytesIO
import zipfile
# 
# https://programminghistorian.org/en/lessons/introduction-to-stylometry-with-python
url = 'https://github.com/programminghistorian/ph-submissions/raw/gh-pages/assets/introduction-to-stylometry-with-python/stylometry-federalist.zip'
zipped = requests.get(url).content
fp = BytesIO(zipped)
zfile = zipfile.ZipFile(fp, "r")

In [148]:
labels = {
    'Madison': [10, 14, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48],
    'Hamilton': [1, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 21, 22, 23, 24,
                 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 59, 60,
                 61, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
                 78, 79, 80, 81, 82, 83, 84, 85],
    'Jay': [2, 3, 4, 5],
    'Shared': [18, 19, 20],
    'Disputed': [49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 62, 63],
    'TestCase': [64]
}

dataset = []
for label, indices in labels.items():
    for index in indices:
        dataset.append({
            "label": label,
            "text": zfile.open(f"data/federalist_{index}.txt").read().decode('utf-8')
        })
dataset = pd.DataFrame(dataset)

## Ex 1: 
* create equal sized chunks of the text as individual samples
* split into train/test/exploration data set `X_train`, `X_test`, ...

### Chunking & Vectorization

In [149]:
# chunking (200 words per chunk)
    # Split the input text into chunks, where # each chunk contains N words 

def tokenizer(input_data):     # creates words from strings
    return input_data.split(' ')     

def chunker(ll):
    result = [[]]
    count = 0
    
    for item in ll:
        if len(result[-1]) >= 200:
            result.append([])
            
        result[-1].append(item)
        count += 1
    
    return result

In [159]:
# apply tokenizer & chunker

new_text = []

for _,row in dataset.iterrows():
    
    tokenized = tokenizer(row.text)
    chunks = chunker(tokenized)
    
    for chunk in chunks:
        new_text.append({
            'label' : row.label,
            'text' : chunk
        })
        
new_df = pd.DataFrame(new_text)

### Train-Test Split

In [151]:
# Train Test Split

test = new_df[new_df.label=='TestCase']
test.head()

trainevalu = new_df[new_df.label!='TestCase']
trainevalu.head()

Unnamed: 0,label,text
0,Madison,"[, 10\n\nThe, Same, Subject, Continued, (The, ..."
1,Madison,"[of, public, and, private, faith,, and, of, pu..."
2,Madison,"[actuated, by, some\ncommon, impulse, of, pass..."
3,Madison,"[a, reciprocal, influence, on, each, other;, a..."
4,Madison,"[fall, into, mutual\nanimosities,, that, where..."


In [164]:
# Vectorization

import sklearn.feature_extraction

def noop(X):
    return X

vectorizer = sklearn.feature_extraction.text.CountVectorizer(tokenizer=noop, preprocessor=noop)

new_df.text = vectorizer.fit_transform(new_df.text)

In [193]:
# TEST DATA 

test = new_df[new_df.label=='TestCase']
trainevalu = new_df[new_df.label!='TestCase']

X_test = test.text
y_test = test.label

# TRAIN & EVALUATION DATA
# test train split with sklearn (stratified = splits according to labels for equal representation)

from sklearn.model_selection import train_test_split

X_train, X_eval, y_train, y_eval = train_test_split(trainevalu.text, 
                                                    trainevalu.label,
                                                    stratify=trainevalu.label, 
                                                    test_size=0.25)

In [184]:
#plt.hist(y_eval)

In [185]:
#plt.hist(y_train)

## Ex 2:
* implement Burrows' Delta or other variants of stylometric methods

In [200]:
class BurrowsDelta:
    
    def __init__(self, num_words=500):
        self.num_words = num_words
    
    def fit(self, X, y):
        
        #import pdb;pdb.set_trace()
        
        self.chosen_words = np.ravel(X.sum(axis=0)).argsort()[::-1][:self.num_words]
        sX = X.T[self.chosen_words].toarray()
    
        ### YOUR CODE BELOW
        
        self.mea_word = sX.mean(axis=1)
        self.std_word = sX.std(axis=1)
        self.z_scores = ((sX.T - self.mea_word) / self.std_word).T
        
        ### YOUR CODE ABOVE
        self.y = np.array(y)
            
    def predict(self, X):
        ### YOUR CODE BELOW

        sX = X.T[self.chosen_words].toarray()
        new_z_scores = ((sX.T - self.mea_word) / self.std_word).T
        
        ### YOUR CODE ABOVE
        
        dists = 1-cdist(new_z_scores.T, self.z_scores.T)
        
        return self.y[dists.argmax(axis=1)]

In [204]:
clf = BurrowsDelta()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_eval)

print(classification_report(np.array(y_eval), y_pred))

AttributeError: 'list' object has no attribute 'sum'