In [1]:
# %pip install transformers
# %pip install scikit-learn
# %pip install datasets
# %pip install accelerate -U
# %pip install jiwer

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel, Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator
from datasets import load_metric

In [3]:
df = pd.read_fwf('fine_tune_examples.txt', header = None, encoding = 'ISO-8859-1')
df['file_name'] = [df.iloc[i, 0].split(" ")[0] for i in range(len(df))]
df['text'] = [" ".join(df[0].iloc[i].split(" ")[1:]) for i in range(len(df))]
df = df.drop(0, axis = 1)
df.head()

Unnamed: 0,file_name,text
0,data/lines/3-70-0.jpeg,11/12/23 the night Pearl died. Notes from the ...
1,data/lines/2-37-16.jpeg,12:34am and eventually they'll come to you. Go...
2,data/lines/1-20-17.jpeg,lease for optionality + look of any better
3,data/lines/2-61-2.jpeg,"cont. you loved, and love sex is way better. I..."
4,data/lines/2-63-13.jpeg,"I was trying to get at in my note abt ""you are"


In [4]:
train_df, test_df = train_test_split(df, test_size=0.2)
# we reset the indices to start from zero
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [5]:
class IAMDataset(Dataset):
    def __init__(self, root_dir, df, processor, max_target_length=128):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # get file name + text 
        file_name = self.df['file_name'][idx]
        text = self.df['text'][idx]
        # prepare image (i.e. resize + normalize)
        image = Image.open(self.root_dir + file_name).convert("RGB")
        pixel_values = self.processor(image, return_tensors="pt").pixel_values
        # add labels (input_ids) by encoding the text
        labels = self.processor.tokenizer(text, 
                                          padding="max_length", 
                                          max_length=self.max_target_length).input_ids
        # important: make sure that PAD tokens are ignored by the loss function
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]

        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

In [6]:
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
train_dataset = IAMDataset(root_dir='',
                           df=train_df,
                           processor=processor)
eval_dataset = IAMDataset(root_dir='',
                           df=test_df,
                           processor=processor)



In [7]:
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-stage1")

  return self.fget.__get__(instance, owner)()
Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-stage1 and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [9]:
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=False, 
    output_dir="model/",
    logging_steps=1,
    save_steps=1000,
    eval_steps=200,
)

In [10]:
training_args



In [11]:
cer_metric = load_metric("cer")
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

  cer_metric = load_metric("cer")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [12]:
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)
trainer.train()



  0%|          | 0/72 [00:00<?, ?it/s]

{'loss': 9.2314, 'grad_norm': 62.57602310180664, 'learning_rate': 4.930555555555556e-05, 'epoch': 0.04}
{'loss': 8.4539, 'grad_norm': 38.18719482421875, 'learning_rate': 4.8611111111111115e-05, 'epoch': 0.08}
{'loss': 7.333, 'grad_norm': 34.19308853149414, 'learning_rate': 4.791666666666667e-05, 'epoch': 0.12}
{'loss': 7.2197, 'grad_norm': 24.368227005004883, 'learning_rate': 4.722222222222222e-05, 'epoch': 0.17}
{'loss': 6.5196, 'grad_norm': 17.323875427246094, 'learning_rate': 4.652777777777778e-05, 'epoch': 0.21}
{'loss': 6.5083, 'grad_norm': 21.561010360717773, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.25}
{'loss': 6.2791, 'grad_norm': 16.505128860473633, 'learning_rate': 4.5138888888888894e-05, 'epoch': 0.29}
{'loss': 6.0409, 'grad_norm': 22.271398544311523, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.33}
{'loss': 6.6672, 'grad_norm': 23.66922950744629, 'learning_rate': 4.375e-05, 'epoch': 0.38}
{'loss': 6.7896, 'grad_norm': 17.559350967407227, 'learning_rate': 4

TrainOutput(global_step=72, training_loss=3.1220834760202303, metrics={'train_runtime': 8544.5616, 'train_samples_per_second': 0.067, 'train_steps_per_second': 0.008, 'train_loss': 3.1220834760202303, 'epoch': 3.0})

In [17]:
import os
model_processor = []
for filename in os.listdir("data/lines/")[15:25]:
    image = Image.open(f"data/lines/{filename.lower()}").convert("RGB")

    processor = processor
    model = model
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    model_processor.append(f"{filename} {generated_text}")

In [18]:
model_processor

['1-100-22.jpeg something that shed think about it more, more',
 '1-100-23.jpeg her opinion, and disom me. In many ways this',
 '1-100-24.jpeg incident was one of my worst team.',
 '1-100-25.jpeg a creep w/ no control over the situation.',
 '1-100-26.jpeg The concert was okay. More tracking! And you',
 '1-100-27.jpeg anything backup new rep. rep. How rep w/ were less challenging',
 '1-100-28.jpeg orghtta balance new rep w/ meee less challenging.',
 '1-100-29.jpeg styles. There were some beautiful women in',
 '1-100-3.jpeg amudant, but for the 12-18 months it seems',
 '1-100-30.jpeg there. It made me sold where Also someone']

In [20]:
pt_proc = []
for filename in os.listdir("data/lines/")[0:25]:
    image = Image.open(f"data/lines/{filename.lower()}").convert("RGB")

    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-stage1")
    model = model
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    pt_proc.append(f"{filename} {generated_text}")



In [21]:
pt_proc

["1-100-0.jpeg 4/23/23 her take the lead on it and she hasn't record",
 '1-100-1.jpeg cont. of to me. Driving out to UNCG tin to see',
 '1-100-10.jpeg writing that! He and I sat and talked and',
 '1-100-11.jpeg In the living in the living room, about NC Publishers and',
 '1-100-12.jpeg jabs and grad school school and dating and operational,',
 '1-100-13.jpeg eventually by sam, until Riley tested me to get',
 '1-100-14.jpeg eventually by som, until Riley tested me to get',
 '1-100-15.jpeg dinner. I picked her up from Lanzas, where every',
 '1-100-16.jpeg person my age in Carrboro has been hiding all',
 '1-100-17.jpeg this time, apparently, and we got food at Weaver.',
 '1-100-18.jpeg I told her about last Saturday and she brushed',
 '1-100-19.jpeg It at like it was nothing, barely working',
 '1-100-2.jpeg Jada Patent says & conduct. I don\'t "',
 '1-100-20.jpeg about. So that made me feel a little better.',
 '1-100-21.jpeg Then I spent the whole drive to Greensboro',
 "1-100-22.jpeg som

In [23]:
from_pretrained_model = []
for filename in os.listdir("data/lines/")[0:10]:
    image = Image.open(f"data/lines/{filename.lower()}").convert("RGB")

    processor = TrOCRProcessor.from_pretrained(model)
    model = model
    pixel_values = processor(images=image, return_tensors="pt").pixel_values

    generated_ids = model.generate(pixel_values)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    pt_proc.append(f"{filename} {generated_text}")
from_pretrained_model

OSError: Incorrect path_or_model_id: 'VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=False)
              (key): Linear(in_features=768, out_features=768, bias=False)
              (value): Linear(in_features=768, out_features=768, bias=False)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): ViTOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (layernorm_before): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layernorm_after): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
      )
    )
    (layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (pooler): ViTPooler(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (activation): Tanh()
    )
  )
  (decoder): TrOCRForCausalLM(
    (model): TrOCRDecoderWrapper(
      (decoder): TrOCRDecoder(
        (embed_tokens): Embedding(50265, 1024, padding_idx=1)
        (embed_positions): TrOCRSinusoidalPositionalEmbedding()
        (layers): ModuleList(
          (0-11): 12 x TrOCRDecoderLayer(
            (self_attn): TrOCRAttention(
              (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (activation_fn): ReLU()
            (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (encoder_attn): TrOCRAttention(
              (k_proj): Linear(in_features=768, out_features=1024, bias=True)
              (v_proj): Linear(in_features=768, out_features=1024, bias=True)
              (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
            )
            (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (fc1): Linear(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear(in_features=4096, out_features=1024, bias=True)
            (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          )
        )
      )
    )
    (output_projection): Linear(in_features=1024, out_features=50265, bias=False)
  )
)'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
outputs

["1-100-0.jpeg 4/23/23 here the lead on it and she haven't read",
 '1-100-1.jpeg cont. out to me. Driving out to Uncle tn to see',
 '1-100-10.jpeg writing that! He and I sat and talked and',
 "1-100-11.jpeg longed in the living room, about NC Polb's and",
 '1-100-12.jpeg jobs and sad school and dating and opera, joined',
 '1-100-13.jpeg eventually by som, until Riley testified me to get',
 '1-100-14.jpeg eventually by som, until Riley testified me to get',
 '1-100-15.jpeg dinear. I picked her up from Lawyers, where every',
 '1-100-16.jpeg person my age in Carrboro has been hiding all',
 '1-100-17.jpeg this time, apparently, and we got food at Weaver.',
 '1-100-18.jpeg I told her about last Saturday and she brushed',
 '1-100-19.jpeg let like it was nothing, barely worth talking',
 '1-100-2.jpeg 7ada potent song & conduct. I don\'t "deal"',
 '1-100-20.jpeg about. So that. make we feel a little better.',
 '1-100-21.jpeg Then I spent the whole drive to Greensboro',
 "1-100-22.jpeg contikin

In [None]:
import os
os.getcwd()

'c:\\Users\\pithy\\Documents\\journalocr'