# Discovery of Writing Differences - Huggingface Transformers

Capstone project by Tomo Umer

<img src="https://tomoumerdotcom.files.wordpress.com/2022/04/cropped-pho_logo_notext.png" alt="PRAISE DOG" style="width:400px;height:400px;"/>

## Imports

In [1]:
import pandas as pd
import numpy as np
import pickle

from datasets import Dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
import torch

from sklearn.preprocessing import LabelBinarizer

from sklearn.model_selection import train_test_split

import plotly.express as px
from sklearn.metrics import confusion_matrix

## Setting up the Data

The next cell (now commented out) was used for the earlier run with 50 books per author. The results were only OK.

In [2]:
# 03 is the one with limit 50 books per author
# library_select = pd.read_pickle('../data/library_select03.pkl')

# since this Hugging Face model only accepts up to 512 tokens (not characters),
# it is better to take the text from the middle of the book
# library_select['book_content_modified'] = library_select['book_content'].apply(lambda text: text[len(text) // 2:])

In [3]:
# only 5 books per author
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source
library_path = '../data/library_select_5perauthor.pkl'
library_select = pd.read_pickle(library_path)

In [4]:
# number of excerpts to cut from every book (an earlier run used 5)
n_parts = 10

# Excerpt k starts at k/(n_parts+1) of the book length for k = 1..n_parts, so no excerpt
# starts at the exact beginning or end of a book (there could be some junk there).
# Every excerpt runs from its start offset to the end of the book.
bookpart_list = [
    library_select['book_content'].apply(lambda text: text[k * len(text) // (n_parts + 1):])
    for k in range(1, n_parts + 1)
]

# repeat the library n_parts times (without the full text) so there is one row per excerpt
repeated_library = pd.concat([library_select for _ in range(n_parts)], ignore_index=True)
library_select_multi = repeated_library.drop(columns='book_content')

# both concatenations stack the copies in the same order, so the excerpts line up row by row
library_select_multi['book_part'] = pd.concat(bookpart_list, ignore_index=True)

In [5]:
# for testing
# library_select['book_length'] = library_select['book_content'].str.len()
#library_select_multi['book_part_length'] = library_select_multi['book_part'].str.len()

# and then to verify that it's the same book - 105 is of course with 21 authors, 5 books each
# library_select_multi.loc[0]
# library_select_multi.loc[105]
# library_select_multi.loc[210]

# library_select['author_num'].nunique()

In [6]:
# pretrained checkpoint used for both the tokenizer and the base model
checkpoint = 'bert-base-uncased'
# passed to TrainingArguments as output_dir
model_path = '../data/bert_base_uncased_fivepart'

In [7]:
# truncation_side and padding_side choose which end of a sequence is cut off or padded:
# the beginning ('left') or the end ('right')
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    padding_side='right',
    truncation_side='right',
)

def tokenize_function(df):
    """Tokenize the 'text' field of ``df``, truncated and padded to exactly 512 tokens.

    Used as the callable for ``Dataset.map``, so ``df`` is whatever ``map`` passes in
    (presumably one dataset example, not a pandas DataFrame - confirm).
    """
    encoded = tokenizer(
        df['text'],
        max_length=512,
        padding='max_length',
        truncation=True,
    )
    return encoded

# metric implementations from the `evaluate` library, loaded once and reused on every evaluation
acc, precision, recall, f1, mcc = (
    evaluate.load(metric_name)
    for metric_name in ('accuracy', 'precision', 'recall', 'f1', 'matthews_correlation')
)

def compute_metrics(eval_pred):
    """Convert a (logits, labels) pair into the metric dict passed to the Trainer.

    The predicted class is the argmax over the last axis of the logits. Precision,
    recall and F1 are macro-averaged (weighted averaging was used for the 50-book run).

    Returns a dict with the keys 'accuracy', 'precision', 'recall', 'f1' and 'mcc'.
    """
    logits, labels = eval_pred
    common = {'predictions': np.argmax(logits, axis=-1), 'references': labels}
    macro = {**common, 'average': 'macro'}

    return {
        'accuracy': acc.compute(**common)['accuracy'],
        'precision': precision.compute(**macro)['precision'],
        'recall': recall.compute(**macro)['recall'],
        'f1': f1.compute(**macro)['f1'],
        'mcc': mcc.compute(**common)['matthews_correlation'],
    }

In [44]:
# this part is the same if I use library_select ..
# class ids follow the order in which authors first appear after sorting by century
authors_by_century = library_select_multi.sort_values(by='authorcentury')['author']
select_authors = authors_by_century.unique().tolist()

# author name -> class id, and the reverse lookup
authors_to_num = {author: idx for idx, author in enumerate(select_authors)}
num_to_authors = dict(enumerate(select_authors))

library_select_multi['author_num'] = library_select_multi['author'].map(authors_to_num)

In [9]:
# this part differs between the two setups; the plain library_select version was:
#X = library_select[['book_part']]
#y = library_select['author_num']
X = library_select_multi[['book_part']]
y = library_select_multi['author_num']

# hold out 20% of the rows as the test set, stratified by author
# NOTE(review): rows are book parts and the split stratifies on author only, so parts of
# the same book can end up in both the training and the test data - confirm this is intended
X_part, X_test, y_part, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# carve a validation set (15% of the non-test rows) out of the remaining data, again stratified by author
X_train, X_val, y_train, y_val = train_test_split(
    X_part,
    y_part,
    test_size=0.15,
    stratify=y_part,
    random_state=42,
)

In [11]:
# split train further into train & 15% for validation (replace test here with validation)
#train_ds = Dataset.from_dict({'text': X_train['book_content'], 'labels': LabelBinarizer().fit_transform(y_train['author_num'])})
#val_ds = Dataset.from_dict({'text': X_test['book_content'], 'labels': LabelBinarizer().fit_transform(y_test['author_num'])})

In [12]:
# wrap the training and validation splits as datasets with a 'text' and a 'labels' column
train_ds = Dataset.from_dict(dict(text=X_train['book_part'], labels=y_train))
val_ds = Dataset.from_dict(dict(text=X_val['book_part'], labels=y_val))

In [13]:
# tokenize the training texts, then drop the raw 'text' column
tokenized_train_ds = train_ds.map(tokenize_function).remove_columns(['text'])

Map:   0%|          | 0/714 [00:00<?, ? examples/s]

In [14]:
# tokenize the validation texts, then drop the raw 'text' column
tokenized_val_ds = val_ds.map(tokenize_function).remove_columns(['text'])

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [15]:
# one output label per author, derived from the author list so it no longer has to be
# edited by hand when the number of authors changes
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=len(select_authors))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
training_args = TrainingArguments(
    # where checkpoints go and how many are kept
    output_dir=model_path,
    save_strategy='epoch',
    save_total_limit=15, #10 with last model

    # schedule and optimisation
    num_train_epochs=15, #10 with last model
    learning_rate=1e-05,
    optim='adamw_torch',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,

    # evaluate every epoch and reload the checkpoint with the best eval_mcc at the end
    evaluation_strategy='epoch',
    metric_for_best_model='eval_mcc',
    load_best_model_at_end=True,

    log_level='info',
    #fp16=True #this is to run on the gpu
)

In [None]:
# uncomment this cell to build the trainer for a training run
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train_ds,
#     eval_dataset=tokenized_val_ds,
#     compute_metrics=compute_metrics,
#     callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
# )

In [None]:
# only run this on Google Colab, it takes 4 hours on my laptop!
# trainer.train()

In [17]:
# Trainer around the already fine-tuned checkpoint; in the cells below it is only used for predict().
# num_labels is derived from the author list instead of a hard-coded 21, so it stays in sync
# with the label mapping used for the plots and the reports.
trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained(
        '../models/books5_parts10_epochs20/',
        num_labels=len(select_authors),
    ),
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [None]:
#LabelBinarizer().fit_transform(y_train)

In [18]:
# same preparation as for the train/validation data, applied to the held-out test split
test_ds = Dataset.from_dict(dict(text=X_test['book_part'], labels=y_test))
tokenized_test_ds = test_ds.map(tokenize_function).remove_columns(['text'])

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

In [19]:
# run the loaded model on the held-out test set
results = trainer.predict(tokenized_test_ds)

***** Running Prediction *****
  Num examples = 210
  Batch size = 16


  0%|          | 0/14 [00:00<?, ?it/s]

In [20]:
# loss, the compute_metrics scores (prefixed with 'test_') and timing for the test set
results.metrics

{'test_loss': 0.747704803943634,
 'test_accuracy': 0.8523809523809524,
 'test_precision': 0.8709528566671423,
 'test_recall': 0.8523809523809524,
 'test_f1': 0.8499490192027161,
 'test_mcc': 0.8464927558247427,
 'test_runtime': 139.4928,
 'test_samples_per_second': 1.505,
 'test_steps_per_second': 0.1}

In [21]:
# raw logits: one row per test example, one column per author id
results.predictions

array([[-0.2970257 , -0.99485993, -0.57020867, ..., -0.6852048 ,
         3.7632456 , -0.81379783],
       [-0.6107387 ,  4.588185  ,  0.04168364, ...,  0.14784265,
        -0.7315902 , -0.45718578],
       [-0.67458475, -0.21007276, -0.8315575 , ...,  1.8761536 ,
        -0.7537751 ,  1.2979146 ],
       ...,
       [-0.87162113,  0.06329876, -0.8780149 , ...,  3.6599357 ,
         0.17955273,  0.91381323],
       [-0.34778407, -0.07626636, -0.7778298 , ...,  3.6431763 ,
        -0.50725883,  1.8426869 ],
       [-0.8960496 , -0.77703226, -0.78770393, ..., -0.23276825,
         2.7416377 , -0.9668989 ]], dtype=float32)

In [22]:
# true author ids of the test examples
results.label_ids

array([19,  1, 13, 20, 12, 10,  9,  8,  7, 11,  0,  0, 18, 13, 16,  2,  5,
        9,  7,  8, 17,  3,  5,  1, 10, 19, 19,  3,  1, 16,  4,  6, 11,  0,
       19, 10, 11,  8, 19, 14,  1, 14,  1,  7,  9, 20,  4,  2, 12,  2,  9,
       12, 17,  6,  7,  6,  1, 19,  7,  2, 15,  8, 10, 14,  5,  6,  3, 12,
       14,  8, 13, 17, 13,  8,  5, 20, 14,  3, 11,  2, 17, 19, 18, 18, 15,
       15, 19,  4,  8,  5, 10, 18, 12,  5,  3, 18,  5, 15,  3, 17,  2, 17,
       10, 16,  7, 16, 16, 13,  6, 20, 20, 20, 20, 20,  5,  4, 11,  4, 15,
       13, 10, 14, 16, 10, 12, 15, 15, 17,  6,  5, 13, 17, 20, 13,  0, 19,
        9,  7, 14,  9,  7, 15, 11,  0, 13,  3,  4,  4,  0, 18,  2,  8, 12,
       14,  6, 12,  8, 16, 11, 18,  5,  6,  4,  6, 20, 11,  0,  7,  1,  4,
        8,  9,  0,  2,  9, 14,  2,  3,  1, 13,  1, 12, 11, 16,  3,  0, 19,
       16,  7,  6,  9, 11, 16, 10,  9, 17, 17,  4, 15, 18,  1, 12, 10,  0,
        3, 14,  2, 18, 18, 15])

In [23]:
# argmax over the class axis gives the predicted author id for each test example
results.predictions.argmax(axis=1).shape

# the true labels are stored here, so this is equivalent to y_test:
# results.label_ids

(210,)

In [24]:
# Pin the label set so the matrix always has one row and one column per author, in the same
# order as the select_authors tick labels. Without `labels`, an author id that appears in
# neither y_test nor the predictions would be dropped and the matrix would no longer match the axes.
author_ids = list(range(len(select_authors)))
test_confusion = confusion_matrix(y_test, results.predictions.argmax(axis=1), labels=author_ids)

fig = px.imshow(test_confusion,
                width=1000,
                height=800,
                text_auto=True,
                labels=dict(x='Predicted Label',
                            y='True Label'),
                x=select_authors,
                y=select_authors,
                color_continuous_scale='Teal')

# the cell annotations already carry the counts, so hide the colour bar
fig.update(layout_coloraxis_showscale=False)

fig.show()

## Introducing New Text

Ideally, this part will live in an app where any text can be uploaded.

In [25]:
# Read each new book and collect one record per book. The frame is built once at the end
# instead of being grown with pd.concat inside the loop.
newtext_records = []

for book_num, book_name in enumerate(['Lambda', 'Deathway']):
    filepath = f'../data/{book_name} by Tomo Umer.txt'

    with open(filepath, encoding='utf-8') as fi:
        book = fi.read()

    # same columns as the library frame needs downstream: id, title, author, century, full text
    newtext_records.append({'id': f'TU{str(book_num).zfill(3)}',
                            'title': book_name,
                            'author': 'Umer, Tomo',
                            'authorcentury': 21,
                            'book_content': book})

newtext = pd.DataFrame(newtext_records)

In [26]:
# Excerpt k starts at k/(n_parts+1) of the book length for k = 1..n_parts, so no excerpt
# starts at the exact beginning or end of a book (there could be some junk there).
# Every excerpt runs from its start offset to the end of the book.
bookpart_list = [
    newtext['book_content'].apply(lambda text: text[k * len(text) // (n_parts + 1):])
    for k in range(1, n_parts + 1)
]

# repeat the new books n_parts times (without the full text) so there is one row per excerpt
repeated_newtext = pd.concat([newtext for _ in range(n_parts)], ignore_index=True)
newtext_multi = repeated_newtext.drop(columns='book_content')

# both concatenations stack the copies in the same order, so the excerpts line up row by row
newtext_multi['book_part'] = pd.concat(bookpart_list, ignore_index=True)

In [30]:
# the new books are not passed with labels, so the dataset only has a 'text' column
newtext_ds = Dataset.from_dict(dict(text=newtext_multi['book_part']))
tokenized_newtext_ds = newtext_ds.map(tokenize_function).remove_columns(['text'])

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [31]:
# run the model on the excerpts of the new books (this dataset has no labels)
new_results = trainer.predict(tokenized_newtext_ds)

***** Running Prediction *****
  Num examples = 20
  Batch size = 16


  0%|          | 0/2 [00:00<?, ?it/s]

In [60]:
# first book Lambda, second Deathway
# newtext_multi['title'].unique()

# predicted author id for every excerpt of the new books
# (rows are in the same order as newtext_multi, which the dataset was built from)
predicted_author_ids = new_results.predictions.argmax(axis=1)

# Select each book's excerpts by title rather than by row parity: the previous [::2] / [1::2]
# slices were only correct for exactly two books stacked in alternating order.
excerpt_titles = newtext_multi['title'].to_numpy()
book1_authors = set(predicted_author_ids[excerpt_titles == 'Lambda'])
book2_authors = set(predicted_author_ids[excerpt_titles == 'Deathway'])

In [63]:
# report, per book, every author that was predicted for at least one of its excerpts
book_reports = [
    ('Lambda is similar in writing to: ', book1_authors),
    ('Deathway is similar in writing to: ', book2_authors),
]

for message, author_ids in book_reports:
    for author_id in author_ids:
        print(message, num_to_authors[author_id])

Lambda is similar in writing to:  Dick, Philip K.
Lambda is similar in writing to:  Huxley, Aldous
Deathway is similar in writing to:  Dick, Philip K.
Deathway is similar in writing to:  Lovecraft, H. P. (Howard Phillips)
Deathway is similar in writing to:  Huxley, Aldous


In [65]:
# author names in class-id order (position i is the author with id i)
select_authors

['Homer',
 'Confucius',
 'Plato',
 'Cicero, Marcus Tullius',
 'Seneca, Lucius Annaeus',
 'Dante Alighieri',
 'Boccaccio, Giovanni',
 'Machiavelli, Niccolò',
 'Shakespeare, William',
 'Molière',
 'Jefferson, Thomas',
 'Defoe, Daniel',
 'Austen, Jane',
 'Twain, Mark',
 'Doyle, Arthur Conan',
 'Dickens, Charles',
 'Dumas, Alexandre',
 'Dick, Philip K.',
 'Lovecraft, H. P. (Howard Phillips)',
 'Huxley, Aldous',
 'Churchill, Winston']

In [70]:
# logits as a frame with one column per author, placed next to the excerpt metadata
author_logits = pd.DataFrame(new_results.predictions, columns=select_authors)
similar_books = pd.concat([newtext_multi, author_logits], axis=1)

In [87]:
# average the logits over all excerpts of a title, then flip the table so that
# authors are the rows and titles are the columns
# NOTE: this overwrites similar_books, so re-running this cell requires re-running the cell that builds it
metadata_columns = ['id', 'authorcentury', 'book_part', 'author']
mean_logits_per_title = similar_books.drop(columns=metadata_columns).groupby('title').mean()
similar_books = mean_logits_per_title.pivot_table(columns='title')

In [93]:
# the five authors with the highest mean logit for each book
print(similar_books['Deathway'].sort_values(ascending=False).head())
print('--------')
print(similar_books['Lambda'].sort_values(ascending=False).head())

Lovecraft, H. P. (Howard Phillips)    2.386379
Dick, Philip K.                       2.149984
Austen, Jane                          1.299474
Huxley, Aldous                        1.048226
Dickens, Charles                      0.654352
Name: Deathway, dtype: float32
--------
Dick, Philip K.                       2.738583
Huxley, Aldous                        1.723177
Lovecraft, H. P. (Howard Phillips)    1.312014
Austen, Jane                          0.757156
Dickens, Charles                      0.749532
Name: Lambda, dtype: float32
