In [1]:
!python setup_04.py

Running command: sudo apt-get update
Command output: Get:1 https://deb.nodesource.com/node_16.x focal InRelease [4583 B]
Get:2 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:3 https://deb.nodesource.com/node_16.x focal/main amd64 Packages [776 B]
Get:4 http://security.ubuntu.com/ubuntu focal-security/universe amd64 Packages [1047 kB]
Get:5 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease [18.1 kB]
Get:6 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2675 kB]
Get:7 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal/main amd64 Packages [29.5 kB]
Get:8 http://security.ubuntu.com/ubuntu focal-security/restricted amd64 Packages [2203 kB]
Get:9 http://security.ubuntu.com/ubuntu focal-security/multiverse amd64 Packages [28.5 kB]
Get:10 http://archive.ubuntu.com/ubuntu focal InRelease [265 kB]
Get:11 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Get:12 http://archive.ubuntu.com/ubuntu focal-backports InRelease 

In [2]:
    import os
    import pytesseract
    from PIL import Image
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
    from sklearn.metrics import confusion_matrix, classification_report
    import pandas as pd
    from datasets import load_dataset, DatasetDict

In [3]:
# Load the train and validation splits of the same dataset, one after the
# other, using a notebook-local directory as the download/processing cache.
train_dataset, validation_dataset = (
    load_dataset(
        "vaclavpechtor/rvl_cdip-small-200",
        split=split_name,
        cache_dir="./dataset/rvl-cdip-small-200/hf_cache",
    )
    for split_name in ("train", "validation")
)

# Bundle both splits so later steps can transform them together.
ocr_dataset = DatasetDict(
    {
        "train": train_dataset,
        "validation": validation_dataset,
    }
)

Using custom data configuration vaclavpechtor--rvl_cdip-small-200-853f638e95c0bf72
Reusing dataset imagefolder (./dataset/rvl-cdip-small-200/hf_cache/vaclavpechtor___imagefolder/vaclavpechtor--rvl_cdip-small-200-853f638e95c0bf72/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597)
Using custom data configuration vaclavpechtor--rvl_cdip-small-200-853f638e95c0bf72
Reusing dataset imagefolder (./dataset/rvl-cdip-small-200/hf_cache/vaclavpechtor___imagefolder/vaclavpechtor--rvl_cdip-small-200-853f638e95c0bf72/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597)


In [4]:
# Class names declared by the train split's `label` feature; a name's position
# in this list is used as its integer class id below.
class_labels = ocr_dataset['train'].features['label'].names

# Forward (name -> id) and reverse (id -> name) lookup tables.
label_to_id = dict(zip(class_labels, range(len(class_labels))))
id_to_label = {class_id: name for name, class_id in label_to_id.items()}

In [5]:
import pickle

def save_dataset(dataset, filename):
    """Pickle ``dataset`` to ``filename`` without ever leaving a partial file there.

    The object is first written to ``<filename>.tmp`` and only moved onto
    ``filename`` once pickling has finished. Callers that treat the mere
    existence of ``filename`` as "cache is valid" therefore never see a
    truncated pickle after an interrupted or failed dump.

    Args:
        dataset: Any picklable object.
        filename: Destination path (``str`` or path-like). Missing parent
            directories are created.

    Raises:
        Whatever ``pickle.dump`` or the file system raises. On failure the
        temporary file is removed and an existing ``filename`` is left
        untouched.
    """
    import os

    destination = os.fspath(filename)
    parent_dir = os.path.dirname(destination)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    tmp_path = destination + ".tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(dataset, f)
        # Rename within the same directory so the final file appears in one step.
        os.replace(tmp_path, destination)
    except BaseException:
        # Also covers KeyboardInterrupt (kernel interrupt during a long dump).
        try:
            os.remove(tmp_path)
        except FileNotFoundError:
            pass
        raise

In [6]:
def load_dataset(filename, *args, **kwargs):
    """Load a pickled dataset from ``filename``.

    NOTE(review): this helper has the same name as ``datasets.load_dataset``
    imported at the top of the notebook and replaces it once this cell has
    run. So that earlier cells stay re-runnable in a live kernel, any call
    that passes extra positional or keyword arguments (for example ``split=``
    or ``cache_dir=``) is forwarded unchanged to ``datasets.load_dataset``.
    A call with only ``filename`` always reads a local pickle, as before.

    Args:
        filename: Path of the pickle file, or the dataset identifier when the
            call is forwarded.
        *args, **kwargs: When present, the call is forwarded to
            ``datasets.load_dataset(filename, *args, **kwargs)``.

    Returns:
        The unpickled object, or the result of ``datasets.load_dataset``.
    """
    if args or kwargs:
        from datasets import load_dataset as hub_load_dataset

        return hub_load_dataset(filename, *args, **kwargs)

    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only open pickles that this notebook (or another trusted source) wrote.
    with open(filename, 'rb') as f:
        return pickle.load(f)

In [7]:
def ocr_image(img):
    """Run pytesseract's ``image_to_string`` on ``img`` and return its result."""
    return pytesseract.image_to_string(img)

In [8]:
import os

# Skip the OCR pass when a pickled result from an earlier run is on disk;
# otherwise compute it once and store it for next time.
if not os.path.exists('./dataset/rvl-cdip-small-200/ocr_dataset.pkl'):
    print('Running OCR...')
    # Add a "text" column holding the OCR output for each image.
    ocr_dataset = ocr_dataset.map(lambda row: {"text": ocr_image(row["image"])})
    # Make labels integer ids: string labels are looked up, anything else is kept.
    ocr_dataset = ocr_dataset.map(
        lambda row: {
            'label': row['label'] if not isinstance(row['label'], str) else label_to_id[row['label']]
        }
    )
    save_dataset(ocr_dataset, './dataset/rvl-cdip-small-200/ocr_dataset.pkl')
else:
    print('Loading dataset from pkl...')
    ocr_dataset = load_dataset('./dataset/rvl-cdip-small-200/ocr_dataset.pkl')

Loading dataset from pkl...


In [9]:
ocr_dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'label', 'text'],
        num_rows: 2560
    })
    validation: Dataset({
        features: ['image', 'label', 'text'],
        num_rows: 640
    })
})

In [10]:
# Temporarily make indexing return pandas objects so the data can be inspected
# as a DataFrame; the format is reset again before tokenization.
ocr_dataset.set_format(type='pandas')

In [11]:
# Take every row of the train split for inspection (a DataFrame while the
# pandas output format is active).
df = ocr_dataset['train'][:]

In [12]:
df.head()

Unnamed: 0,image,label,text
0,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,"\n\na\n\nCevetrom Phi""eMonis,\n\n” Saratoe\n\..."
1,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,"eae Arizona\nFebruary 20-21, 1984 oo\n\nSit 00..."
2,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,ae\nPRIVEE SITET\neae beaters\n\n \n
3,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,a\nholds backtar;\nf ut lets Ue Sul.\nmenthol ...
4,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,fl .\na\n\n \n\n \n\nyou should know that many...


In [13]:
# Add the human-readable class name next to each integer label for inspection.
df['category_name'] = df['label'].map(id_to_label)

In [14]:
df.head()

Unnamed: 0,image,label,text,category_name
0,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,"\n\na\n\nCevetrom Phi""eMonis,\n\n” Saratoe\n\...",advertisement
1,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,"eae Arizona\nFebruary 20-21, 1984 oo\n\nSit 00...",advertisement
2,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,ae\nPRIVEE SITET\neae beaters\n\n \n,advertisement
3,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,a\nholds backtar;\nf ut lets Ue Sul.\nmenthol ...,advertisement
4,<PIL.TiffImagePlugin.TiffImageFile image mode=...,0,fl .\na\n\n \n\n \n\nyou should know that many...,advertisement


In [15]:
# Inspection is done: calling set_format() with no arguments drops the pandas
# output format chosen earlier, so later steps get the default output again.
ocr_dataset.set_format()

In [16]:
# Pretrained checkpoint used for both the tokenizer and the classifier body.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# The classification head is sized to the number of document classes. The
# class-name mappings are stored in the model config as well, so the saved
# model reports real category names instead of generic LABEL_<n> placeholders.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_to_id),
    id2label=id_to_label,
    label2id=label_to_id,
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

In [21]:
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute accuracy for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last
            axis is taken as the predicted class).

    Returns:
        ``{"accuracy": <value returned by accuracy_score>}``.
    """
    true_ids, class_scores = pred.label_ids, pred.predictions
    predicted_ids = class_scores.argmax(-1)
    return {"accuracy": accuracy_score(true_ids, predicted_ids)}

In [24]:
def tokenize_function(examples):
    """Tokenize a batch of OCR text for the classifier.

    Args:
        examples: Batch passed in by ``map(..., batched=True)``; only its
            ``"text"`` column is read.

    Returns:
        dict mapping each tokenizer output name to a NumPy array, with every
        sequence padded or truncated to 128 tokens.
    """
    # Ask the tokenizer for NumPy arrays directly instead of building PyTorch
    # tensors and immediately converting them back with .numpy().
    output = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="np",
    )
    return dict(output.items())


# Tokenize both splits, handing tokenize_function whole batches at a time.
tokenized_dataset = ocr_dataset.map(tokenize_function, batched=True)

# Evaluate and checkpoint once per epoch, then finish with the checkpoint that
# scored the best validation accuracy loaded back into the model.
training_args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="epoch",
    save_strategy="epoch",  # kept equal to evaluation_strategy, as load_best_model_at_end requires
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,  # supplies the "accuracy" value used by metric_for_best_model
)

# Fine-tune; the cell's displayed value is the TrainOutput summary.
trainer.train()



Loading cached processed dataset at ./dataset/rvl-cdip-small-200/hf_cache/vaclavpechtor___imagefolder/vaclavpechtor--rvl_cdip-small-200-853f638e95c0bf72/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597/cache-1c80317fa3b1799d.arrow
Loading cached processed dataset at ./dataset/rvl-cdip-small-200/hf_cache/vaclavpechtor___imagefolder/vaclavpechtor--rvl_cdip-small-200-853f638e95c0bf72/0.0.0/48efdc62d40223daee675ca093d163bcb6cb0b7d7f93eb25aebf5edca72dc597/cache-bdd640fb06671ad1.arrow
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, image. If text, image are not expected by 

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.367627,0.654687
2,0.326900,1.549862,0.65
3,0.326900,1.660206,0.653125
4,0.229600,1.646754,0.670312
5,0.136500,1.721966,0.665625
6,0.136500,1.891162,0.6625
7,0.109200,1.871702,0.664062
8,0.083500,1.886006,0.665625
9,0.083500,1.875578,0.673438
10,0.065700,1.874407,0.66875


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, image. If text, image are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 640
  Batch size = 8
Saving model checkpoint to output/checkpoint-320
Configuration saved in output/checkpoint-320/config.json
Model weights saved in output/checkpoint-320/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, image. If text, image are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 640
  Batch size = 8
Saving model checkpoint to output/checkpoint-640
Configuration saved in output/checkpoint-640/config.json
Model weights saved

TrainOutput(global_step=3200, training_loss=0.1545017033815384, metrics={'train_runtime': 131.2019, 'train_samples_per_second': 195.119, 'train_steps_per_second': 24.39, 'total_flos': 848003019571200.0, 'train_loss': 0.1545017033815384, 'epoch': 10.0})

In [25]:
# Persist the fine-tuned model (config + weights) and the tokenizer files.
# NOTE(review): they go to two separate directories, so each has to be loaded
# from its own path later — confirm this split is intentional.
model.save_pretrained("./models/distilbert-base-uncased_model")
tokenizer.save_pretrained("./models/distilbert-base-uncased_tokenizer")

Configuration saved in ./models/distilbert-base-uncased_model/config.json
Model weights saved in ./models/distilbert-base-uncased_model/pytorch_model.bin
tokenizer config file saved in ./models/distilbert-base-uncased_tokenizer/tokenizer_config.json
Special tokens file saved in ./models/distilbert-base-uncased_tokenizer/special_tokens_map.json


('./models/distilbert-base-uncased_tokenizer/tokenizer_config.json',
 './models/distilbert-base-uncased_tokenizer/special_tokens_map.json',
 './models/distilbert-base-uncased_tokenizer/vocab.txt',
 './models/distilbert-base-uncased_tokenizer/added_tokens.json',
 './models/distilbert-base-uncased_tokenizer/tokenizer.json')