In [1]:
# Run the setup_04.py script to set up the necessary dependencies.
# NOTE(review): per the captured log output, the script invokes `sudo apt-get update`
# and `sudo apt-get install ...`, so it presumably needs sudo rights and network access.
!python setup_04.py

Running command: sudo apt-get update
Command output: Hit:1 https://deb.nodesource.com/node_16.x focal InRelease
Hit:2 http://ppa.launchpad.net/alex-p/tesseract-ocr/ubuntu focal InRelease
Hit:3 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:4 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:5 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2681 kB]
Hit:6 http://archive.ubuntu.com/ubuntu focal InRelease
Get:7 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Get:8 http://security.ubuntu.com/ubuntu focal-security/universe amd64 Packages [1047 kB]
Get:9 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Get:10 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 Packages [3169 kB]
Get:11 http://archive.ubuntu.com/ubuntu focal-updates/restricted amd64 Packages [2366 kB]
Fetched 9598 kB in 7s (1429 kB/s)
Reading package lists...

Running command: sudo apt-get install software-properties-com

In [2]:
# Importing necessary libraries and modules
import os
from datetime import datetime

import pandas as pd
import pytesseract
import wandb
from datasets import load_dataset, DatasetDict
from PIL import Image
from sklearn.metrics import confusion_matrix, classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [3]:
# Load environment variables from the .env file via python-dotenv.
# load_dotenv() is the last expression of the cell, so its return value is displayed.
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.

True

In [4]:
# Login to Weights & Biases for experiment tracking
# (the Trainer further down is configured with report_to="wandb").
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mpect[0m ([33mzhaw-sml-iwi-it_strategy_management[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Fetch the train and validation splits of the dataset from the Hugging Face Hub,
# using a local cache directory so repeated runs can reuse the download.
train_dataset = load_dataset(
    "vaclavpechtor/rvl_cdip-small-200",
    split="train",
    cache_dir="./dataset/rvl-cdip-small-200/hf_cache",
)
validation_dataset = load_dataset(
    "vaclavpechtor/rvl_cdip-small-200",
    split="validation",
    cache_dir="./dataset/rvl-cdip-small-200/hf_cache",
)
# Bundle both splits into one DatasetDict so later cells can process them together
ocr_dataset = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
})

Using custom data configuration vaclavpechtor--rvl_cdip-small-200-853f638e95c0bf72
Reusing dataset imagefolder (./dataset/rvl-cdip-small-200/hf_cache/vaclavpechtor___imagefolder/vaclavpechtor--rvl_cdip-small-200-853f638e95c0bf72/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597)
Using custom data configuration vaclavpechtor--rvl_cdip-small-200-853f638e95c0bf72
Reusing dataset imagefolder (./dataset/rvl-cdip-small-200/hf_cache/vaclavpechtor___imagefolder/vaclavpechtor--rvl_cdip-small-200-853f638e95c0bf72/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597)


In [6]:
# Build lookup tables between class names and integer IDs.
# The ID of a class is its position in the train split's ClassLabel names.
class_labels = ocr_dataset['train'].features['label'].names
label_to_id = dict(zip(class_labels, range(len(class_labels))))
# Inverse lookup, derived from label_to_id so both tables always agree
id_to_label = {idx: name for name, idx in label_to_id.items()}

In [7]:
# Importing pickle for serializing and deserializing Python object structures
import pickle
# Function to save a dataset to a pickle file
def save_dataset(dataset, filename):
    """Serialize ``dataset`` to ``filename`` with pickle, replacing the file atomically.

    The object is pickled into a sibling temporary file (``<filename>.tmp``) and
    only moved onto ``filename`` once pickling has succeeded. Writing straight to
    ``filename`` would truncate it before pickling starts, so a failure midway
    would leave a corrupt file behind that an ``os.path.exists`` cache check
    would later try to load.

    Args:
        dataset: Any picklable object.
        filename: Destination path (``str`` or ``os.PathLike``).

    Raises:
        Exception: Whatever ``pickle.dump`` or the file operations raise. The
            temporary file is removed before the exception is re-raised, and an
            existing ``filename`` is left untouched.
    """
    tmp_filename = f"{os.fspath(filename)}.tmp"
    try:
        with open(tmp_filename, 'wb') as f:
            pickle.dump(dataset, f)
        # os.replace swaps the finished file into place in a single step
        os.replace(tmp_filename, filename)
    except BaseException:
        # Do not leave a half-written temporary file lying around
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
        raise

In [8]:
# Function to load a dataset from a pickle file
def load_dataset(filename):
    """Read back an object that was previously written to ``filename`` with pickle.

    NOTE(review): this definition rebinds the name ``load_dataset`` and therefore
    shadows ``datasets.load_dataset`` imported at the top of the notebook.
    Re-running the Hub-loading cell after this cell would call this pickle
    loader instead of the Hugging Face one.

    NOTE(review): ``pickle.load`` can execute arbitrary code while unpickling;
    only open cache files produced by this notebook.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    with open(filename, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [9]:
# Function to perform OCR on an image and return the extracted text
def ocr_image(img):
    """Run Tesseract OCR on ``img`` and return the recognised text.

    Args:
        img: Image handed unchanged to ``pytesseract.image_to_string``
            (the notebook passes values of the dataset's ``image`` column).

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    return pytesseract.image_to_string(img)

In [10]:
import os
# Cache guard around the OCR step.
# When no pickled dataset is on disk yet: OCR every image, map string labels to IDs,
# and serialize the result. Otherwise just load the pickled dataset.
ocr_cache_path = './dataset/rvl-cdip-small-200/ocr_dataset.pkl'
if not os.path.exists(ocr_cache_path):
    print('Running OCR...')
    # Add a "text" column holding the OCR output for each image
    ocr_dataset = ocr_dataset.map(lambda row: {"text": ocr_image(row["image"])})
    # String labels become their integer ID; any other label value passes through unchanged
    ocr_dataset = ocr_dataset.map(
        lambda row: {'label': row['label'] if not isinstance(row['label'], str) else label_to_id[row['label']]}
    )
    save_dataset(ocr_dataset, ocr_cache_path)
else:
    print('Loading dataset from pkl...')
    ocr_dataset = load_dataset(ocr_cache_path)

Loading dataset from pkl...


In [11]:
# Viewing the dataset: the repr lists each split with its features and row count
ocr_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label', 'text'],
        num_rows: 2560
    })
    validation: Dataset({
        features: ['image', 'label', 'text'],
        num_rows: 640
    })
})

In [12]:
# Setting the dataset format to pandas for easier data manipulation.
# set_format modifies ocr_dataset itself (nothing is re-assigned here); a later cell
# calls set_format() with no arguments to revert it.
ocr_dataset.set_format(type='pandas')

In [13]:
# Extracting the training data as a pandas DataFrame
# ([:] selects every row; the result is a DataFrame because of the pandas format set earlier)
df = ocr_dataset['train'][:]

In [14]:
# Previewing the first few rows of the data (image, label and OCR text columns)
df.head()

Unnamed: 0,image,label,text
0,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,"\n\na\n\nCevetrom Phi""eMonis,\n\n” Saratoe\n\..."
1,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,"eae Arizona\nFebruary 20-21, 1984 oo\n\nSit 00..."
2,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,ae\nPRIVEE SITET\neae beaters\n\n \n
3,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,a\nholds backtar;\nf ut lets Ue Sul.\nmenthol ...
4,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,fl .\na\n\n \n\n \n\nyou should know that many...


In [15]:
# Adding a new column 'category_name' that maps the 'label' to the actual class name.
# The column is added to df itself; a label missing from id_to_label would map to NaN.
df['category_name'] = df['label'].map(id_to_label)

In [16]:
# Preview again to check the newly added 'category_name' column
df.head()

Unnamed: 0,image,label,text,category_name
0,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,"\n\na\n\nCevetrom Phi""eMonis,\n\n” Saratoe\n\...",advertisement
1,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,"eae Arizona\nFebruary 20-21, 1984 oo\n\nSit 00...",advertisement
2,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,ae\nPRIVEE SITET\neae beaters\n\n \n,advertisement
3,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,a\nholds backtar;\nf ut lets Ue Sul.\nmenthol ...,advertisement
4,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,fl .\na\n\n \n\n \n\nyou should know that many...,advertisement


In [17]:
# Reverting the dataset format to the default format
# (undoes the earlier set_format(type='pandas') so later cells no longer get DataFrames)
ocr_dataset.set_format()

In [18]:
# Checkpoint used for the tokenizer/model pair created in this cell.
# NOTE(review): the training cell further down re-assigns model_checkpoint, tokenizer
# and model to "bert-base-uncased", so the objects created here are replaced before training.
model_checkpoint = "distilbert-base-uncased"
# Load the matching tokenizer, then the model with a classification head sized to the label set
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_to_id),
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier

In [19]:
# Importing accuracy_score from sklearn.metrics to compute the accuracy of our model
from sklearn.metrics import accuracy_score
# Metric callback handed to the Trainer: reports plain accuracy
def compute_metrics(pred):
    """Compute accuracy for a prediction object.

    Args:
        pred: Object exposing ``predictions`` (scores whose last axis is
            arg-maxed to obtain the predicted class) and ``label_ids``
            (the reference labels).

    Returns:
        dict: ``{"accuracy": <value returned by accuracy_score>}``.
    """
    predicted_ids = pred.predictions.argmax(-1)
    return {"accuracy": accuracy_score(pred.label_ids, predicted_ids)}

In [20]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Specifying the model checkpoint
# NOTE(review): this re-assigns model_checkpoint, tokenizer and model created in the
# earlier DistilBERT cell, so the model that is actually fine-tuned here is BERT.
model_checkpoint = "bert-base-uncased"
# Initializing the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Size the classification head from the dataset's label mapping instead of a hard-coded
# constant, and store the label names in the model config so saved checkpoints carry them.
num_labels = len(label_to_id)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    id2label=id_to_label,
    label2id=label_to_id,
)


# Function to tokenize our text data
def tokenize_function(examples):
    """Tokenize a batch of OCR texts into fixed-length model inputs.

    Args:
        examples: Batch passed by ``Dataset.map(batched=True)``; only its
            ``"text"`` entry is read.

    Returns:
        The tokenizer's encoding for the batch, padded/truncated to 128 tokens.
        Plain Python lists are returned (no ``return_tensors``), which
        ``Dataset.map`` stores as new columns without a tensor round-trip.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)


# Tokenizing the dataset
tokenized_dataset = ocr_dataset.map(tokenize_function, batched=True)

# Naming the run for tracking in Weights & Biases
run_name = "doc_demo_" + datetime.now().strftime("%Y%m%d_%H%M%S")

# Defining the training arguments
training_args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,  # requires matching evaluation/save strategies (both "epoch")
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    report_to="wandb",  # enables reporting to W&B
    run_name=run_name,  # name of the W&B run
    # NOTE(review): logging_dir is the Trainer's TensorBoard log directory; it presumably
    # does not control where wandb stores its own run files - confirm.
    logging_dir='./wandb',
)

# Initializing the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
)

# Training the model (last expression, so the TrainOutput is displayed)
trainer.train()





Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: image, text. If image, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2560
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1600
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Currently logged in as: [33mpect[0m ([33mzhaw-sml-iwi-it_strategy_management[0m). Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.576834,0.564063
2,1.867200,1.280182,0.6125
3,1.867200,1.190104,0.640625
4,0.960500,1.125183,0.679688
5,0.564900,1.140371,0.675


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: image, text. If image, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 640
  Batch size = 8
Saving model checkpoint to output/checkpoint-320
Configuration saved in output/checkpoint-320/config.json
Model weights saved in output/checkpoint-320/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: image, text. If image, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 640
  Batch size = 8
Saving model checkpoint to output/checkpoint-640
Configuration saved in output/checkpoint-640/config.json
Model weights saved in output/checkpoint-64

TrainOutput(global_step=1600, training_loss=1.0885858297348023, metrics={'train_runtime': 361.547, 'train_samples_per_second': 35.403, 'train_steps_per_second': 4.425, 'total_flos': 842061211238400.0, 'train_loss': 1.0885858297348023, 'epoch': 5.0})

In [21]:
# Persist the fine-tuned model and its tokenizer.
# NOTE(review): the directory names say "distilbert-base-uncased", but the training cell
# re-assigned model/tokenizer to "bert-base-uncased", so BERT artifacts are written here.
# The paths are left unchanged in case other notebooks load from them.
model_dir = "./models/distilbert-base-uncased_model"
tokenizer_dir = "./models/distilbert-base-uncased_tokenizer"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir)

Configuration saved in ./models/distilbert-base-uncased_model/config.json
Model weights saved in ./models/distilbert-base-uncased_model/pytorch_model.bin
tokenizer config file saved in ./models/distilbert-base-uncased_tokenizer/tokenizer_config.json
Special tokens file saved in ./models/distilbert-base-uncased_tokenizer/special_tokens_map.json


('./models/distilbert-base-uncased_tokenizer/tokenizer_config.json',
 './models/distilbert-base-uncased_tokenizer/special_tokens_map.json',
 './models/distilbert-base-uncased_tokenizer/vocab.txt',
 './models/distilbert-base-uncased_tokenizer/added_tokens.json',
 './models/distilbert-base-uncased_tokenizer/tokenizer.json')