# Discovery of Writing Differences - Huggingface Transformers

Capstone project by Tomo Umer

<img src="https://tomoumerdotcom.files.wordpress.com/2022/04/cropped-pho_logo_notext.png" style="width:400px;height:400px;"/>

## Imports

In [113]:
import pandas as pd
import numpy as np
import pickle

from datasets import Dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback

from sklearn.preprocessing import LabelBinarizer

from sklearn.model_selection import train_test_split
from scipy import special


import plotly.express as px
from sklearn.metrics import confusion_matrix

## Setting up the Data

The next cell is left over from the earlier experiment that used up to 50 books per author. Results were ... ok.

In [2]:
# 03 is the one with limit 50 books per author
# library_select = pd.read_pickle('../data/library_select03.pkl')

# since this Hugging Face model only accepts inputs of up to 512 tokens,
# it is better to take text from the middle of the book
# library_select['book_content_modified'] = library_select['book_content'].apply(lambda text: text[len(text) // 2:])

In [3]:
# only 5 books per author
# Later cells read the 'book_content', 'author' and 'authorcentury' columns of this frame.
# NOTE: read_pickle can execute arbitrary code while unpickling, so only load files you created yourself.
library_select = pd.read_pickle('../data/library_fixed_author_five.pkl')

In [4]:
# number of excerpts to take from every book
n_parts = 10 # previous was 5

# Excerpt k (k = 1..n_parts) starts k/(n_parts+1) of the way into the text, so no excerpt
# begins at the very start of the book (there could be some junk there). Each excerpt runs
# to the end of the text; the tokenizer later keeps only its first 512 tokens.
bookpart_list = [
    library_select['book_content'].apply(
        lambda text, k=k: text[k * len(text) // (n_parts + 1):]
    )
    for k in range(1, n_parts + 1)
]

# stack n_parts copies of the metadata so that row j lines up with excerpt j after concatenation
repeated_library = pd.concat([library_select] * n_parts, ignore_index=True)
library_select_multi = repeated_library.drop(columns='book_content')

# attach the excerpts as a new column
library_select_multi['book_part'] = pd.concat(bookpart_list, ignore_index=True)

In [105]:
library_select_multi.shape

(1050, 13)

In [5]:
# for testing
# library_select['book_length'] = library_select['book_content'].str.len()
#library_select_multi['book_part_length'] = library_select_multi['book_part'].str.len()

# and then to verify that it's the same book - 105 is of course with 21 authors, 5 books each
# library_select_multi.loc[0]
# library_select_multi.loc[105]
# library_select_multi.loc[210]

# library_select['author_num'].nunique()

In [6]:
checkpoint = 'bert-base-uncased'
model_path = '../models/bert_base_uncased'

In [7]:
# note: truncation_side and padding_side determine which side gets cut off / padded - the beginning (left) or the end (right)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, truncation_side='right', padding_side='right')

def tokenize_function(df):
    """Tokenize the 'text' entry of `df` to a fixed length of 512 tokens.

    Longer inputs are truncated and shorter ones are padded up to 512, using
    the module-level `tokenizer`.

    df: mapping with a 'text' entry. Despite the name this is presumably a
        dataset example handed in by `Dataset.map`, not a DataFrame -- TODO confirm.
    Returns whatever `tokenizer(...)` returns for that text.
    """
    encoding_options = {'truncation': True, 'padding': 'max_length', 'max_length': 512}
    return tokenizer(df['text'], **encoding_options)

# metric objects from the `evaluate` library, consumed by compute_metrics
acc, precision, recall, f1, mcc = (
    evaluate.load(metric_name)
    for metric_name in ('accuracy', 'precision', 'recall', 'f1', 'matthews_correlation')
)

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into a dictionary of evaluation scores.

    eval_pred: pair unpacked as ``logits, labels``; the predicted class of each
        row is the argmax of the logits over the last axis.
    Returns a dict with the keys 'accuracy', 'precision', 'recall', 'f1' and
    'mcc'. Precision, recall and F1 are macro-averaged over the classes
    (weighted averaging was used for the 50-books-per-author run).
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    # arguments every metric needs, and the macro-averaged variant of them
    plain = {'predictions': predicted, 'references': labels}
    macro = dict(plain, average='macro')

    return {
        'accuracy': acc.compute(**plain)['accuracy'],
        'precision': precision.compute(**macro)['precision'],
        'recall': recall.compute(**macro)['recall'],
        'f1': f1.compute(**macro)['f1'],
        'mcc': mcc.compute(**plain)['matthews_correlation'],
    }

In [8]:
# this part is the same if I use library_select ..
# Authors ordered by century; an author's position in this list is their numeric label.
# NOTE(review): sort_values defaults to a non-stable sort, so the order of authors within
# one century is not guaranteed by the code -- confirm that it reproduces the label
# numbering the saved model was trained with.
authors_by_century = library_select_multi.sort_values(by='authorcentury')['author']
select_authors = list(authors_by_century.unique())

# lookup tables in both directions: author name <-> numeric label
authors_to_num = {author: label for label, author in enumerate(select_authors)}
num_to_authors = dict(enumerate(select_authors))

library_select_multi['author_num'] = library_select_multi['author'].map(authors_to_num)

In [172]:
authors_to_num

{'Homer': 0,
 'Confucius': 1,
 'Plato': 2,
 'Cicero, Marcus Tullius': 3,
 'Seneca, Lucius Annaeus': 4,
 'Dante Alighieri': 5,
 'Boccaccio, Giovanni': 6,
 'Machiavelli, Niccolò': 7,
 'Shakespeare, William': 8,
 'Molière': 9,
 'Jefferson, Thomas': 10,
 'Defoe, Daniel': 11,
 'Austen, Jane': 12,
 'Twain, Mark': 13,
 'Doyle, Arthur Conan': 14,
 'Dickens, Charles': 15,
 'Dumas, Alexandre': 16,
 'Dick, Philip K.': 17,
 'Lovecraft, H. P. (Howard Phillips)': 18,
 'Huxley, Aldous': 19,
 'Churchill, Winston': 20}

In [9]:
# model input is the excerpt text; the target is the numeric author label
X = library_select_multi[['book_part']]
y = library_select_multi['author_num']

# hold out 20% of the excerpts as the test set, stratified by author
X_part, X_test, y_part, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# split the remaining data further: 15% of it becomes the validation set, stratified by author
X_train, X_val, y_train, y_val = train_test_split(
    X_part,
    y_part,
    test_size=0.15,
    stratify=y_part,
    random_state=42,
)

In [None]:
# wrap the train and validation splits as datasets with a 'text' and a 'labels' column
train_columns = {'text': X_train['book_part'], 'labels': y_train}
val_columns = {'text': X_val['book_part'], 'labels': y_val}

train_ds = Dataset.from_dict(train_columns)
val_ds = Dataset.from_dict(val_columns)

In [None]:
# tokenize every training example, then drop the raw text column
tokenized_train_ds = train_ds.map(tokenize_function).remove_columns(['text'])

In [None]:
# tokenize every validation example, then drop the raw text column
tokenized_val_ds = val_ds.map(tokenize_function).remove_columns(['text'])

In [None]:
# One output class per author. The count is derived from the author list so it stays
# correct when the set of authors changes (it used to be hard-coded as 21).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(select_authors))

In [None]:
# this is only needed if running the trainer locally
# Configuration of the fine-tuning run: evaluate and save once per epoch, and at the end
# reload the checkpoint that scored best on the metric named below.
training_args = TrainingArguments(
    output_dir=model_path,  # checkpoints are written under model_path
    evaluation_strategy='epoch',
    num_train_epochs=15, #10 with last model
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_total_limit=15, #10 with last model
    save_strategy='epoch',  # same schedule as evaluation_strategy
    load_best_model_at_end=True,
    log_level ='info',
    metric_for_best_model='eval_mcc',  # presumably the 'mcc' entry of compute_metrics with the Trainer's 'eval_' prefix
    optim = 'adamw_torch',
    learning_rate=1e-05,
    #fp16=True #this is to run on the gpu
)

In [None]:
# Trainer for the fine-tuning run; it is only built here, the call that starts training is separate.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,  # validation split, scored with compute_metrics at each evaluation
    compute_metrics=compute_metrics,
    # presumably stops training once the monitored metric fails to improve for 3 evaluations -- see the callback's docs
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
# only run this on Google Colab, it takes 4 hours on my laptop!
# trainer.train()

## Taking the pretrained model from Google Colab

In [23]:
# Rebind `model` to weights loaded from a local directory instead of the base checkpoint.
# (alternative kept for reference: load one specific training checkpoint)
#model = AutoModelForSequenceClassification.from_pretrained(model_path + '/checkpoint-58362')
model = AutoModelForSequenceClassification.from_pretrained('../models/bert_base_uncased/fivebooks_tenparts/')

In [24]:
# Arguments for a prediction-only Trainer: training disabled, prediction enabled,
# evaluation batches of 4 examples per device.
test_args = TrainingArguments(
    output_dir= '../models/bert_base_uncased/fivebooks_tenparts/',
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=4,
    # fp16=True
)

In [25]:
# Rebinds `trainer` to a Trainer built without train/eval datasets or callbacks; it is
# used below only through trainer.predict(...).
trainer = Trainer(
    model=model,
    args=test_args,
    compute_metrics=compute_metrics
)

In [26]:
# wrap the held-out test split as a dataset with a 'text' and a 'labels' column
test_columns = {'text': X_test['book_part'], 'labels': y_test}
test_ds = Dataset.from_dict(test_columns)

# tokenize every test example, then drop the raw text column
tokenized_test_ds = test_ds.map(tokenize_function).remove_columns(['text'])

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

In [27]:
results = trainer.predict(tokenized_test_ds) 

  0%|          | 0/53 [00:00<?, ?it/s]

In [18]:
results.metrics

{'test_loss': 0.747704803943634,
 'test_accuracy': 0.8523809523809524,
 'test_precision': 0.8709528566671423,
 'test_recall': 0.8523809523809524,
 'test_f1': 0.8499490192027161,
 'test_mcc': 0.8464927558247427,
 'test_runtime': 135.7142,
 'test_samples_per_second': 1.547,
 'test_steps_per_second': 0.391}

In [19]:
results.predictions

array([[-0.2970259 , -0.9948605 , -0.57020885, ..., -0.6852046 ,
         3.7632463 , -0.81379783],
       [-0.6107386 ,  4.5881853 ,  0.04168366, ...,  0.14784263,
        -0.73159015, -0.45718566],
       [-0.67458504, -0.2100727 , -0.83155763, ...,  1.8761533 ,
        -0.75377494,  1.2979147 ],
       ...,
       [-0.8716213 ,  0.0632986 , -0.87801504, ...,  3.6599355 ,
         0.17955239,  0.913813  ],
       [-0.34778407, -0.07626636, -0.7778298 , ...,  3.6431763 ,
        -0.50725883,  1.8426869 ],
       [-0.8960496 , -0.77703226, -0.78770393, ..., -0.23276825,
         2.7416377 , -0.9668989 ]], dtype=float32)

In [106]:
# this is to get which labels are being predicted
results.predictions.argmax(axis=1)

# this simply stores the correct predictions, so equivalent to y_test:
# results.label_ids

array([19,  1, 10, 13, 12, 10,  9,  8,  7, 11,  0,  0, 18, 13, 16,  4,  0,
        9,  7,  8, 17,  3,  5,  1,  7, 19, 19,  3,  1, 16, 10,  6, 11,  0,
        9, 10, 11,  8,  5, 14,  1, 20,  9,  7,  9, 20,  3,  2, 12,  2,  9,
       12, 17,  6,  7,  6,  1, 19,  7,  2, 15,  8, 10, 14,  4,  6,  3, 12,
       14,  8, 13, 17, 13,  8,  5, 20, 14,  3, 11,  3, 17, 19, 18, 18, 12,
       15, 19,  4,  8,  5, 10, 18, 12,  5,  3, 18,  6, 19,  3, 17,  2, 17,
       10, 16,  7, 16, 16, 16, 11, 20, 13, 20, 20, 20,  4,  4, 11,  2, 15,
       13, 10, 14, 16, 10, 12, 15, 11, 17,  6,  5, 13, 17, 20, 13,  0, 19,
        9,  7, 14,  9,  7, 11, 11,  0, 16,  3,  3,  4,  0, 18,  2,  9, 12,
       14,  6, 12,  8, 16, 11, 18,  5,  7,  4,  6, 20, 11,  0,  7,  1,  4,
        8,  9,  0,  2,  9, 18,  2,  3,  1, 13,  1, 12, 11, 16,  3,  0, 19,
       16,  2,  2,  9, 11, 16, 10,  9, 17, 17,  3, 15, 18,  1, 12, 10,  2,
        2, 14,  2, 18, 18, 15])

In [22]:
# Confusion matrix of the test predictions: rows are true authors, columns are predicted authors.
# The label list is passed explicitly so the matrix always has one row/column per author,
# in the same order as the select_authors axis labels, even if some author is absent from
# both y_test and the predictions.
author_labels = list(range(len(select_authors)))
predicted_labels = results.predictions.argmax(axis=1)

fig = px.imshow(
    confusion_matrix(y_test, predicted_labels, labels=author_labels),
    width=1000,
    height=800,
    text_auto=True,
    labels=dict(x='Predicted Label', y='True Label'),
    x=select_authors,
    y=select_authors,
    color_continuous_scale='Teal',
)

# hide the colour bar; the counts are already printed inside the cells
fig.update(layout_coloraxis_showscale=False)

fig.show()

## Introducing New Text

This part will ideally be in an app where any text can be uploaded

In [77]:
# Read the new books from disk into a frame with the same column names the library frame
# uses downstream ('title', 'author', 'authorcentury', 'book_content', plus an 'id').
# Rows are collected first and the frame is built once, rather than growing it with
# pd.concat on every iteration starting from an empty frame.
book_rows = []

for book_num, book_name in enumerate(['Lambda', 'Deathway']):
    filepath = f'../data/{book_name} by Tomo Umer.txt'

    with open(filepath, encoding='utf-8') as fi:
        book = fi.read()

    book_rows.append({
        'id': f'TU{book_num:03d}',  # TU000, TU001, ...
        'title': book_name,
        'author': 'Umer, Tomo',
        'authorcentury': 21,
        'book_content': book,
    })

newtext = pd.DataFrame(book_rows)

In [78]:
def _excerpt_from(part_number):
    """Return a function that cuts a text from part_number/(n_parts+1) of its length to its end."""
    def cut(text):
        return text[part_number * len(text) // (n_parts + 1):]
    return cut


# Excerpts start at 1/(n_parts+1), 2/(n_parts+1), ... of each text, so none begins at the
# very start of the book (there could be some junk there).
bookpart_list = [newtext['book_content'].apply(_excerpt_from(i + 1)) for i in range(n_parts)]

# stack n_parts copies of the metadata so that row j lines up with excerpt j after concatenation
repeated_newtext = pd.concat([newtext] * n_parts, ignore_index=True)
newtext_multi = repeated_newtext.drop(columns='book_content')

# attach the excerpts as a new column
newtext_multi['book_part'] = pd.concat(bookpart_list, ignore_index=True)

In [79]:
# only a 'text' column here: no 'labels' are supplied for the new books
newtext_columns = {'text': newtext_multi['book_part']}
newtext_ds = Dataset.from_dict(newtext_columns)

# tokenize every excerpt, then drop the raw text column
tokenized_newtext_ds = newtext_ds.map(tokenize_function).remove_columns(['text'])

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [80]:
new_results = trainer.predict(tokenized_newtext_ds) 

  0%|          | 0/5 [00:00<?, ?it/s]

In [81]:
# first book Lambda, second Deathway
# newtext_multi['title'].unique()

# to see what got predicted
# new_results.predictions.argmax(axis=1)

# book1_authors = set(new_results.predictions.argmax(axis=1)[::2])
# book2_authors = set(new_results.predictions.argmax(axis=1)[1::2])

# for author in book1_authors:
#     print('Lambda is similar in writing to: ', num_to_authors[author])

# for author in book2_authors:
#     print('Deathway is similar in writing to: ', num_to_authors[author])

In [160]:
# this is needed for the Streamlit app
authors_df = pd.read_pickle('../data/select_authors.pkl')

In [155]:
def compute_winners(model_predictions_max):
    """Count how often each author was predicted.

    model_predictions_max: array-like of predicted author numbers, i.e. the
        predictions after argmax has already been applied.
    Returns a DataFrame with one row per distinct predicted author and the
    columns 'most likely author' (the name looked up in num_to_authors) and
    'number of times' (how many entries predicted that author). Rows follow
    ascending author number, the order np.unique returns.
    """
    label_ids, tallies = np.unique(model_predictions_max, return_counts=True)
    author_names = [num_to_authors[label_id] for label_id in label_ids]

    winners = {'most likely author': author_names, 'number of times': tallies}
    return pd.DataFrame(winners)

# Note: the function above takes already-argmaxed labels because I put two books in at once
# and slice them apart first. For one book at a time, I would define it to take the raw predictions instead, like so:
# def compute_winners(model_predictions):
#     unique_num, counts = np.unique(model_predictions.argmax(axis=1), return_counts=True)

#     unique_authors = [num_to_authors[unique] for unique in unique_num]

#     return pd.DataFrame({'most likely author': unique_authors, 'number of times':counts})

In [156]:
# Rows of the new-text predictions alternate between the two books (first book, second
# book, first book, ...), so even positions belong to book 1 and odd positions to book 2.
predicted_authors = new_results.predictions.argmax(axis=1)
book1_winners = compute_winners(predicted_authors[0::2])
book2_winners = compute_winners(predicted_authors[1::2])

In [158]:
book2_winners

Unnamed: 0,most likely author,number of times
0,"Dick, Philip K.",7
1,"Lovecraft, H. P. (Howard Phillips)",2
2,"Churchill, Winston",1


In [86]:
new_results.predictions.shape

(20, 21)

Note: the predictions are logits; softmax is needed to convert them to probabilities.

In [124]:
#import torch.nn.functional as F
# new_probabilities = F.softmax(new_results, dim=-1)

In [128]:
new_probabilities = special.softmax(new_results.predictions, axis=1)

In [192]:
new_probabilities_df = pd.DataFrame(new_probabilities, columns=select_authors).T

In [206]:
new_probabilities_df[0].head().map('{:.2%}'.format)

Homer                     1.35%
Confucius                 2.65%
Plato                     1.12%
Cicero, Marcus Tullius    0.83%
Seneca, Lucius Annaeus    1.12%
Name: 0, dtype: object

In [208]:
# for every excerpt (one column each), print the five most probable authors as percentages
for column in new_probabilities_df:
    top_five = new_probabilities_df[column].sort_values(ascending=False).head()
    print(top_five.map('{:.2%}'.format))

Dick, Philip K.                       62.99%
Lovecraft, H. P. (Howard Phillips)     6.13%
Churchill, Winston                     3.09%
Jefferson, Thomas                      3.06%
Confucius                              2.65%
Name: 0, dtype: object
Dick, Philip K.                       37.36%
Lovecraft, H. P. (Howard Phillips)     9.17%
Jefferson, Thomas                      6.90%
Churchill, Winston                     6.14%
Doyle, Arthur Conan                    4.25%
Name: 1, dtype: object
Dick, Philip K.                       43.20%
Jefferson, Thomas                      8.21%
Churchill, Winston                     7.07%
Lovecraft, H. P. (Howard Phillips)     4.71%
Seneca, Lucius Annaeus                 4.27%
Name: 2, dtype: object
Dick, Philip K.                       56.11%
Lovecraft, H. P. (Howard Phillips)     6.49%
Churchill, Winston                     4.71%
Twain, Mark                            3.84%
Huxley, Aldous                         3.54%
Name: 3, dtype: object
Dick, Ph

In [129]:
similar_books = pd.concat([newtext_multi ,pd.DataFrame(new_probabilities, columns=select_authors)], axis=1)

In [177]:
#similar_books.loc[similar_books['title'] == 'Lambda'].head()
similar_books.loc[similar_books['title'] == 'Deathway'].head()

Unnamed: 0,id,title,author,authorcentury,book_part,Homer,Confucius,Plato,"Cicero, Marcus Tullius","Seneca, Lucius Annaeus",...,"Defoe, Daniel","Austen, Jane","Twain, Mark","Doyle, Arthur Conan","Dickens, Charles","Dumas, Alexandre","Dick, Philip K.","Lovecraft, H. P. (Howard Phillips)","Huxley, Aldous","Churchill, Winston"
1,TU001,Deathway,"Umer, Tomo",21,in the world left in her at that precise momen...,0.014533,0.021038,0.022319,0.021039,0.034507,...,0.014419,0.041128,0.029623,0.04246,0.017873,0.015319,0.373612,0.09168,0.034011,0.06143
3,TU001,Deathway,"Umer, Tomo",21,"rily lie down. “Just a quick break,” she told ...",0.014873,0.017995,0.008333,0.008065,0.024003,...,0.006739,0.020263,0.038441,0.017389,0.011713,0.012322,0.56109,0.064924,0.035425,0.047127
5,TU001,Deathway,"Umer, Tomo",21,"ing up with her laundry.\nAt the very least, h...",0.012432,0.015743,0.01293,0.013248,0.041439,...,0.009528,0.036448,0.046341,0.03393,0.01557,0.023936,0.307046,0.115609,0.068246,0.096436
7,TU001,Deathway,"Umer, Tomo",21,t to miss this party of yours.”\nThat last com...,0.018059,0.020753,0.017539,0.018849,0.031634,...,0.014582,0.028135,0.023656,0.034743,0.015124,0.017934,0.400278,0.086474,0.033554,0.0583
9,TU001,Deathway,"Umer, Tomo",21,ing mind games with her on this particular Thu...,0.010882,0.020914,0.018298,0.011226,0.0151,...,0.009344,0.031091,0.039341,0.047925,0.051747,0.03069,0.07375,0.250739,0.169421,0.06742


In [84]:
# Average the per-excerpt author scores for each title, then flip the table so that
# authors are rows and titles are columns (the lookups by title further down rely on that).
# NOTE(review): the printed output below contains values above 1, which averaged
# probabilities cannot reach -- presumably it comes from an earlier run on raw logits;
# re-run after the softmax cell to confirm.
similar_books = (
    similar_books
        .drop(columns=['id', 'authorcentury', 'book_part', 'author'])  # keep only 'title' and the per-author columns
        .groupby('title')
        .mean()
        .pivot_table(columns='title')  # presumably acts as a transpose here -- verify row order before simplifying
)

In [104]:
print(similar_books.sort_values(by='Deathway', ascending=False)['Deathway'].head())
print('--------')
print(similar_books.sort_values(by='Lambda', ascending=False)['Lambda'].head())

Dick, Philip K.                       2.169794
Lovecraft, H. P. (Howard Phillips)    1.304179
Churchill, Winston                    0.888144
Huxley, Aldous                        0.590142
Jefferson, Thomas                     0.393259
Name: Deathway, dtype: float32
--------
Dick, Philip K.                       2.726228
Lovecraft, H. P. (Howard Phillips)    1.243493
Churchill, Winston                    0.499763
Doyle, Arthur Conan                   0.401514
Shakespeare, William                  0.356518
Name: Lambda, dtype: float32
