# Discovery of Writing Differences - Huggingface Transformers

Capstone project by Tomo Umer

<img src="https://tomoumerdotcom.files.wordpress.com/2022/04/cropped-pho_logo_notext.png" style="width:400px;height:400px;"/>

## Imports

In [1]:
import pickle

import numpy as np
import pandas as pd
import plotly.express as px
import torch

from datasets import Dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback

from scipy import special
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

## Setting up the Data

The next cell (now commented out) loaded the dataset limited to 50 books per author. Results were ... ok.

In [2]:
# 03 is the one with limit 50 books per author
# library_select = pd.read_pickle('../data/library_select03.pkl')

# since this Hugging Face model only accepts up to 512 tokens (not characters) per input,
# better to take the text starting from the middle of the book
# library_select['book_content_modified'] = library_select['book_content'].apply(lambda text: text[len(text) // 2:])

In [3]:
# only 5 books per author
# NOTE: unpickling can execute arbitrary code - only load pickle files produced by this project
library_select = pd.read_pickle('../data/library_fixed_author_five.pkl')

In [4]:
# Author names ordered by their numeric class label (`author_num`), so that
# position i in `select_authors` is the author behind class i (assuming the
# labels are the contiguous integers 0..n-1 - TODO confirm). Plain .unique()
# follows row order in the pickle, which is not guaranteed to match the label
# order expected wherever these names label axes or probability columns.
author_lookup = (
    library_select[['author_num', 'author']]
        .drop_duplicates()
        .sort_values('author_num')
)
select_authors = author_lookup['author'].tolist()

# label <-> author mappings, derived from the data so that later cells
# (e.g. compute_winners) do not depend on leftover kernel state
num_to_authors = dict(zip(author_lookup['author_num'], author_lookup['author']))
authors_to_num = {author: num for num, author in num_to_authors.items()}

In [5]:
# how many parts of a book to take
n_parts = 10 # previous was 5

# Part k starts (k+1)/(n_parts+1) of the way into the book, so the very
# beginning (where there could be some junk) is never a starting point.
# Each slice runs to the end of the text; the tokenizer's truncation later
# keeps only the first max_length tokens of it.
bookpart_list = [
    library_select['book_content'].apply(
        lambda text, step=part_idx + 1: text[step * len(text) // (n_parts + 1):]
    )
    for part_idx in range(n_parts)
]

# repeat the metadata rows once per part (dropping the full text), in the same
# order in which the parts are stacked below
library_select_multi = (
    pd.concat([library_select] * n_parts, ignore_index=True)
        .drop(columns='book_content')
)

# attach the parts as a new column; both sides carry a fresh 0..N-1 index, so rows line up
library_select_multi['book_part'] = pd.concat(bookpart_list, ignore_index=True)

In [6]:
library_select_multi.shape

(1050, 14)

In [7]:
# for testing
# library_select['book_length'] = library_select['book_content'].str.len()
#library_select_multi['book_part_length'] = library_select_multi['book_part'].str.len()

# and then to verify that it's the same book - 105 is of course with 21 authors, 5 books each
# library_select_multi.loc[0]
# library_select_multi.loc[105]
# library_select_multi.loc[210]

# library_select['author_num'].nunique()

In [8]:
# Hugging Face checkpoint name; used below for both the tokenizer and the base model
checkpoint = 'bert-base-uncased'
# directory passed to the training run as its output_dir
model_path = '../models/bert_base_uncased'

In [9]:
# note: truncation_side and padding_side determine which end of a sequence is cut off / padded -
# the beginning ('left') or the end ('right')
tokenizer = AutoTokenizer.from_pretrained(checkpoint, truncation_side='right', padding_side='right')

def tokenize_function(df):
    """Tokenize the 'text' field of `df` with the notebook-level tokenizer.

    Every input is truncated and padded to exactly 512 tokens. Despite the
    parameter name, this is handed to `Dataset.map`, so `df` is presumably a
    single example (a mapping with a 'text' key) rather than a DataFrame.

    Returns whatever the tokenizer returns for that text.
    """
    encoding_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    return tokenizer(df['text'], **encoding_options)

# metric objects from the `evaluate` library; compute_metrics calls .compute() on each of them
acc = evaluate.load('accuracy') # called without an `average` argument
precision = evaluate.load('precision')
recall = evaluate.load('recall')
f1 = evaluate.load('f1')
mcc = evaluate.load('matthews_correlation')

def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into a dictionary of evaluation scores.

    The predicted class is the argmax over the last axis of the logits.
    Precision, recall and F1 are macro-averaged (weighted was used for the
    50-books-per-author run); accuracy and Matthews correlation take no
    averaging argument.

    Returns a dict with the keys 'accuracy', 'precision', 'recall', 'f1' and 'mcc'.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)

    plain = {'predictions': predicted, 'references': labels}
    macro = {'predictions': predicted, 'average': 'macro', 'references': labels}

    return {
        'accuracy': acc.compute(**plain)['accuracy'],
        'precision': precision.compute(**macro)['precision'],
        'recall': recall.compute(**macro)['recall'],
        'f1': f1.compute(**macro)['f1'],
        'mcc': mcc.compute(**plain)['matthews_correlation'],
    }

In [10]:
# # this part is the same if I use library_select ..
# select_authors = list(library_select_multi.sort_values(by='authorcentury')['author'].unique())

# authors_to_num = {select_authors[i]: i for i in range(len(select_authors))}
# num_to_authors = {v: k for k, v in authors_to_num.items()}

# library_select_multi['author_num'] = library_select_multi['author'].map(authors_to_num)
#authors_to_num

In [11]:
# features: the text part of each row; target: the numeric author label
X = library_select_multi[['book_part']]
y = library_select_multi['author_num']

# hold out 20% of the rows as the final test set, stratified by author
# NOTE(review): rows are book *parts*, so parts of the same book can end up on both
# sides of this split - test scores may be optimistic; consider splitting by book
X_part, X_test, y_part, y_test = train_test_split(X, y, test_size=0.2, random_state = 42, stratify = y)

In [12]:
# split the non-test rows further: 15% of them become the validation set
# (again stratified by author), the rest is the actual training set
X_train, X_val, y_train, y_val = train_test_split(X_part, y_part, test_size=0.15, random_state = 42, stratify = y_part)

In [13]:
# wrap the training and validation splits as Hugging Face Datasets:
# 'text' is the column read by tokenize_function, 'labels' holds the author label
train_ds = Dataset.from_dict({'text': X_train['book_part'], 'labels': y_train})
val_ds = Dataset.from_dict({'text': X_val['book_part'], 'labels': y_val})

In [14]:
# tokenize every training example, then drop the raw 'text' column
tokenized_train_ds = (
    train_ds
        .map(tokenize_function)
        .remove_columns(['text'])
)

Map:   0%|          | 0/714 [00:00<?, ? examples/s]

In [15]:
# tokenize every validation example, then drop the raw 'text' column
tokenized_val_ds = (
    val_ds
        .map(tokenize_function)
        .remove_columns(['text'])
)

Map:   0%|          | 0/126 [00:00<?, ? examples/s]

In [16]:
# Size the classification head from the labels themselves instead of a hard-coded 21,
# so nothing has to be edited by hand when the number of authors changes.
# The head needs one output per class id, i.e. (largest label + 1).
num_labels = int(y.max()) + 1
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [55]:
# this is only needed if running the trainer locally

# Request the Apple-silicon (MPS) backend only when this PyTorch build and machine
# actually provide it; a hard-coded True makes the cell unusable on hosts without
# MPS (e.g. Colab), where the training is meant to run.
mps_backend = getattr(torch.backends, 'mps', None)
mps_available = mps_backend is not None and mps_backend.is_available()

training_args = TrainingArguments(
    output_dir=model_path,
    # evaluate and save once per epoch, so the best checkpoint can be restored at the end
    evaluation_strategy='epoch',
    num_train_epochs=15, #10 with last model
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    use_mps_device=mps_available,
    save_total_limit=15, #10 with last model
    save_strategy='epoch',
    load_best_model_at_end=True,
    log_level ='info',
    # 'mcc' is the key returned by compute_metrics; presumably reported with an 'eval_' prefix
    metric_for_best_model='eval_mcc',
    optim = 'adamw_torch',
    learning_rate=1e-05,
    #fp16=True #this is to run on the gpu
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [56]:
# training Trainer: fits `model` on the tokenized training set and evaluates on the
# validation set; training stops early after 3 evaluations without improvement
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [57]:
# only run this on Google Colab - it takes 4 hours on my laptop!
trainer.train()

***** Running training *****
  Num examples = 714
  Num Epochs = 15
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 675
  Number of trainable parameters = 109,498,389


  0%|          | 0/675 [00:00<?, ?it/s]

## Loading the Fine-Tuned Model Trained on Google Colab

In [22]:
#model = AutoModelForSequenceClassification.from_pretrained(model_path + '/checkpoint-58362')
# load the saved classifier from disk; this rebinds `model`, replacing the freshly initialised one
model = AutoModelForSequenceClassification.from_pretrained('../models/bert_base_uncased/fivebooks_tenparts/')

loading configuration file ../models/bert_base_uncased/fivebooks_tenparts/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert_base_uncased/fivebooks_tenparts/",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    

In [23]:
# prediction-only arguments: no training, small evaluation batches
test_args = TrainingArguments(
    output_dir= '../models/bert_base_uncased/fivebooks_tenparts/',
    do_train=False,
    do_predict=True,
    per_device_eval_batch_size=4,
    # fp16=True
)

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [24]:
# rebinds `trainer`: from here on it wraps the loaded model and is only used for predict()
trainer = Trainer(
    model=model,
    args=test_args,
    compute_metrics=compute_metrics
)

In [25]:
# held-out test split as a Hugging Face Dataset ('text' + 'labels')
test_ds = Dataset.from_dict({'text': X_test['book_part'], 'labels': y_test})

# tokenize every test example, then drop the raw 'text' column
tokenized_test_ds = (
    test_ds
        .map(tokenize_function)
        .remove_columns(['text'])
)

Map:   0%|          | 0/210 [00:00<?, ? examples/s]

In [26]:
# run the loaded model on the test set; the result exposes .predictions, .label_ids and .metrics
results = trainer.predict(tokenized_test_ds) 

***** Running Prediction *****
  Num examples = 210
  Batch size = 4


In [27]:
results.metrics

In [28]:
results.predictions

In [29]:
# this is to get which labels are being predicted
results.predictions.argmax(axis=1)

# this simply stores the correct predictions, so equivalent to y_test:
# results.label_ids

In [None]:
# Author names in class-label order: these names label the confusion-matrix axes
# and the probability columns below, so position i must be the author whose
# `author_num` is i (assuming contiguous labels 0..n-1 - TODO confirm).
# Plain .unique() would follow row order in the data instead.
select_authors = (
    library_select
        .drop_duplicates(subset='author')
        .sort_values('author_num')['author']
        .tolist()
)

In [30]:
# predicted class per test row, tabulated against the true labels
predicted_classes = results.predictions.argmax(axis=1)
test_confusion = confusion_matrix(y_test, predicted_classes)

# heatmap with the cell counts printed in each cell; author names on both axes
fig = px.imshow(
    test_confusion,
    x=select_authors,
    y=select_authors,
    labels={'x': 'Predicted Label', 'y': 'True Label'},
    text_auto=True,
    color_continuous_scale='Teal',
    width=1000,
    height=800,
)

# the counts are already written in the cells, so hide the colour bar
fig.update(layout_coloraxis_showscale=False)

fig.show()

## Introducing New Text

Ideally, this part will live in an app where any text can be uploaded.

In [31]:
# Collect one record per new book and build the frame once, instead of growing
# a DataFrame with pd.concat inside the loop.
new_book_records = []

for book_num, book_name in enumerate(['Lambda', 'Deathway']):
    filepath = f'../data/{book_name} by Tomo Umer.txt'

    with open(filepath, encoding = 'utf-8') as fi:
        book = fi.read()

    # same columns as before: id, title, author, authorcentury, book_content
    new_book_records.append({'id': f'TU{str(book_num).zfill(3)}',
                             'title': book_name,
                             'author': 'Umer, Tomo',
                             'authorcentury': 21,
                             'book_content': book})

newtext = pd.DataFrame(new_book_records)

In [32]:
# Same slicing as for the library: part k starts (k+1)/(n_parts+1) of the way into
# the text, so the very beginning (where there could be some junk) is never a
# starting point. Each slice runs to the end of the text.
bookpart_list = [
    newtext['book_content'].apply(
        lambda text, step=part_idx + 1: text[step * len(text) // (n_parts + 1):]
    )
    for part_idx in range(n_parts)
]

# repeat the metadata rows once per part (dropping the full text), in the same
# order in which the parts are stacked below
newtext_multi = (
    pd.concat([newtext] * n_parts, ignore_index=True)
        .drop(columns='book_content')
)

# attach the parts as a new column; both sides carry a fresh 0..N-1 index, so rows line up
newtext_multi['book_part'] = pd.concat(bookpart_list, ignore_index=True)

In [33]:
# the new texts have no known label, so the dataset only carries 'text'
newtext_ds = Dataset.from_dict({'text': newtext_multi['book_part']}) #, 'labels': y_test})

# tokenize every part, then drop the raw 'text' column
tokenized_newtext_ds = (
    newtext_ds
        .map(tokenize_function)
        .remove_columns(['text'])
)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

In [34]:
# run the loaded model on the parts of the new texts (this dataset has no 'labels' column)
new_results = trainer.predict(tokenized_newtext_ds) 

***** Running Prediction *****
  Num examples = 20
  Batch size = 4


In [35]:
# first book Lambda, second Deathway
# newtext_multi['title'].unique()

# to see what got predicted
# new_results.predictions.argmax(axis=1)

# book1_authors = set(new_results.predictions.argmax(axis=1)[::2])
# book2_authors = set(new_results.predictions.argmax(axis=1)[1::2])

# for author in book1_authors:
#     print('Lambda is similar in writing to: ', num_to_authors[author])

# for author in book2_authors:
#     print('Deathway is similar in writing to: ', num_to_authors[author])

In [36]:
# this is needed for the Streamlit app
# NOTE: unpickling can execute arbitrary code - only load pickle files produced by this project
authors_df = pd.read_pickle('../data/select_authors.pkl')

In [37]:
def compute_winners(model_predictions_max, label_to_author=None):
    """Count how often each author was the top prediction.

    Parameters
    ----------
    model_predictions_max : array-like of int
        Predicted class index per text part, e.g. ``predictions.argmax(axis=1)``.
    label_to_author : mapping, optional
        Maps class index -> author name. When omitted, the notebook-level
        ``num_to_authors`` mapping is used, so existing calls keep working.

    Returns
    -------
    pd.DataFrame
        Columns 'most likely author' and 'number of times', one row per
        distinct predicted class, in ascending class-index order.

    Raises
    ------
    KeyError
        If a predicted class index is missing from the mapping.
    """
    if label_to_author is None:
        # resolved at call time, so the global only has to exist when the default is used
        label_to_author = num_to_authors

    unique_num, counts = np.unique(model_predictions_max, return_counts=True)

    # .tolist() yields plain Python ints, which match integer dictionary keys
    unique_authors = [label_to_author[label] for label in unique_num.tolist()]

    return pd.DataFrame({'most likely author': unique_authors, 'number of times':counts})

# Note that the above is because I put two books in at once. for one at a time, I would define this function
# to just give it predictions - like so
# def compute_winners(model_predictions):
#     unique_num, counts = np.unique(model_predictions.argmax(axis=1), return_counts=True)

#     unique_authors = [num_to_authors[unique] for unique in unique_num]

#     return pd.DataFrame({'most likely author': unique_authors, 'number of times':counts})

In [38]:
# Pick each book's rows by title rather than by position: the [::2] / [1::2]
# strides only work while exactly two books are interleaved row by row.
# Predictions come back in the same row order as newtext_multi.
predicted_labels = new_results.predictions.argmax(axis=1)
part_titles = newtext_multi['title'].to_numpy()

# first book Lambda, second Deathway
book1_winners = compute_winners(predicted_labels[part_titles == 'Lambda'])
book2_winners = compute_winners(predicted_labels[part_titles == 'Deathway'])

In [39]:
book2_winners

In [40]:
new_results.predictions.shape

Note: the predictions are raw logits; apply softmax to convert them to probabilities.

In [41]:
#import torch.nn.functional as F
# new_probabilities = F.softmax(new_results, dim=-1)

In [42]:
new_probabilities = special.softmax(new_results.predictions, axis=1)

In [43]:
# one column per author, then transposed: rows = authors, columns = text parts
# NOTE(review): the names assume select_authors is ordered by class index - confirm
new_probabilities_df = pd.DataFrame(new_probabilities, columns=select_authors).T

In [44]:
new_probabilities_df[0].head().map('{:.2%}'.format)

In [45]:
# for every text part: its five most probable authors, formatted as percentages
for _, part_probabilities in new_probabilities_df.items():
    top_five = part_probabilities.sort_values(ascending=False).head()
    print(top_five.map('{:.2%}'.format))

In [46]:
similar_books = pd.concat([newtext_multi ,pd.DataFrame(new_probabilities, columns=select_authors)], axis=1)

In [47]:
#similar_books.loc[similar_books['title'] == 'Lambda'].head()
similar_books.loc[similar_books['title'] == 'Deathway'].head()

In [48]:
# Rebuild from the inputs instead of transforming `similar_books` in place:
# the previous self-referential form dropped columns from its own result, so
# running the cell a second time failed on the already-reshaped frame.
similar_books = (
    pd.concat([newtext_multi, pd.DataFrame(new_probabilities, columns=select_authors)], axis=1)
        # keep only the title and the per-author probability columns
        .drop(columns=['id', 'authorcentury', 'book_part', 'author'])
        # average the probabilities over all parts of the same book
        .groupby('title')
        .mean()
        # reshape so that authors are rows and book titles are columns
        .pivot_table(columns='title')
)

In [49]:
# five most similar authors per book, with a separator line between the two listings
for book_title, needs_separator in (('Deathway', True), ('Lambda', False)):
    ranked = similar_books.sort_values(by=book_title, ascending=False)[book_title]
    print(ranked.head())
    if needs_separator:
        print('--------')