In [1]:
!python -m pip install --upgrade pip
!pip install pandas
!pip install scikit-learn
!pip install tqdm
!pip install -U 'tensorboardX'
!pip install -U 'tensorboard'
!pip install timm==0.6.13
!pip install donut-python==1.0.9 timm==0.5.4 transformers==4.25.1


Collecting pip
  Downloading pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-24.2-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.1
    Uninstalling pip-23.3.1:
      Successfully uninstalled pip-23.3.1
Successfully installed pip-24.2
[0mCollecting pandas
  Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m289.2 MB/s[0m eta

In [2]:
import os
import urllib.request
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Load the CSV data
csv_file = 'train.csv'  # Update with your actual CSV file path
df = pd.read_csv(csv_file)

# Limit to at most 1000 images. DataFrame.sample refuses to draw more rows than the
# frame holds (without replacement), so the request is capped at the frame's length;
# for CSVs with 1000+ rows the sample is exactly the same as before.
df = df.sample(n=min(1000, len(df)), random_state=42).reset_index(drop=True)

# Define the dataset structure and paths
dataset_name = 'Amazon_v1'
splits = ['train', 'test', 'validation']

# Create dataset folder structure: one sub-folder per split under dataset_name
for split in splits:
    os.makedirs(os.path.join(dataset_name, split), exist_ok=True)

# Split the dataset 60/20/20: first hold out 20% for test, then take a quarter of
# the remaining 80% for validation.
train_val, test = train_test_split(df, test_size=0.2, random_state=42)
train, val = train_test_split(train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Create a map for easier access (split name -> DataFrame for that split)
split_map = {'train': train, 'test': test, 'validation': val}

# Function to generate ground truth parse
def generate_ground_truth(entity_name, entity_value):
    """Build the ground-truth parse for one labelled sample.

    ``entity_value`` is split on whitespace: the first token is taken as the
    value and all remaining tokens, re-joined with single spaces, form the unit
    (so multi-word units are kept together).

    Args:
        entity_name: Key used for the outer dictionary.
        entity_value: Text of the form ``"<value> <unit>"``.

    Returns:
        ``{entity_name: {unit: value}}``. ``unit`` is the empty string when
        ``entity_value`` contains a single token.

    Raises:
        ValueError: If ``entity_value`` is not a string (e.g. a NaN read from a
            CSV) or contains no non-whitespace characters.
    """
    if not isinstance(entity_value, str):
        raise ValueError(
            f"entity_value for {entity_name!r} must be a string, got {entity_value!r}"
        )

    parts = entity_value.split()
    if not parts:
        # Without this guard the value lookup below would fail with a bare IndexError.
        raise ValueError(f"entity_value for {entity_name!r} is empty")

    value, *unit_words = parts
    unit = " ".join(unit_words)  # Everything after the value (handles multi-word units)
    return {entity_name: {unit: value}}

# Function to process a single dataset split
def process_split(split, data):
    """Download one split's images and write its ``metadata.jsonl``.

    For each row of ``data`` the file at ``row['image_link']`` is saved into
    ``<dataset_name>/<split>/`` under the last path component of its URL, and a
    metadata record pairing that file name with the JSON-encoded ground truth is
    collected. Rows whose download fails are skipped and get no metadata record.
    The collected records are written, one JSON object per line, to
    ``<dataset_name>/<split>/metadata.jsonl``.

    Args:
        split: Split name; used as the sub-folder under ``dataset_name`` and in
            the progress-bar label.
        data: DataFrame providing the ``image_link``, ``entity_name`` and
            ``entity_value`` columns.
    """
    split_dir = os.path.join(dataset_name, split)
    metadata = []
    skipped = 0
    for _, row in tqdm(data.iterrows(), total=len(data), desc=f'Processing {split}'):
        # Download the image
        image_name = row['image_link'].split('/')[-1]
        image_path = os.path.join(split_dir, image_name)

        try:
            urllib.request.urlretrieve(row['image_link'], image_path)
        except Exception:
            # Best-effort download: skip rows that cannot be fetched. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit stop the loop.
            skipped += 1
            continue

        # Prepare metadata entry
        ground_truth_parse = generate_ground_truth(row['entity_name'], row['entity_value'])
        metadata_entry = {
            "file_name": image_name,
            "ground_truth": json.dumps({"gt_parse": ground_truth_parse})
        }
        metadata.append(metadata_entry)

    # Write metadata to jsonl file
    metadata_file = os.path.join(split_dir, 'metadata.jsonl')
    with open(metadata_file, 'w') as f:
        for entry in metadata:
            f.write(json.dumps(entry) + '\n')

    # Surface how many rows were dropped instead of losing them silently.
    if skipped:
        print(f"{split}: skipped {skipped} of {len(data)} rows (image download failed)")

# Build every split, in the order listed in `splits`
for split in splits:
    split_frame = split_map[split]
    process_split(split, split_frame)

print("Dataset creation complete!")


Processing train: 100%|██████████| 600/600 [00:23<00:00, 25.18it/s]
Processing test: 100%|██████████| 200/200 [00:06<00:00, 32.86it/s]
Processing validation: 100%|██████████| 200/200 [00:06<00:00, 32.49it/s]

Dataset creation complete!





In [11]:
# Launch Donut fine-tuning through the repository's train.py script:
#   --config                         YAML with the training settings (train_cord.yaml)
#   --pretrained_model_name_or_path  checkpoint to start from (naver-clova-ix/donut-base)
#   --dataset_name_or_paths          JSON list of dataset locations
#   --exp_version / --result_path    together with the config name these determine where
#                                    the run is saved (the saved output shows
#                                    results/train_cord/test_experiment/)
# NOTE(review): the dataset path is absolute; this presumably assumes the notebook runs
# from /workspace, where the 'Amazon_v1' folder was created - confirm before re-running.
!python donut/train.py --config donut/config/train_cord.yaml \
                --pretrained_model_name_or_path "naver-clova-ix/donut-base" \
                --dataset_name_or_paths '["/workspace/Amazon_v1"]' \
                --exp_version "test_experiment" \
                --result_path "results"

resume_from_checkpoint_path: None
[36mresult_path: results
[0m[36mpretrained_model_name_or_path: naver-clova-ix/donut-base
[0m[36mdataset_name_or_paths: 
[0m  - /workspace/Amazon_v1
sort_json_key: False
train_batch_sizes: 
  - 16
val_batch_sizes: 
  - 4
input_size: 
  - 1280
  - 960
max_length: 512
align_long_axis: False
num_nodes: 1
seed: 2022
lr: 3e-05
warmup_steps: 100
num_training_samples_per_epoch: 400
max_epochs: 10
max_steps: -1
num_workers: 8
val_check_interval: 0.5
check_val_every_n_epoch: 1
gradient_clip_val: 1.0
verbose: True
exp_name: train_cord
exp_version: test_experiment
Config is saved at results/train_cord/test_experiment/config.yaml
Seed set to 2022
  return self.fget.__get__(instance, owner)()
Some weights of DonutModel were not initialized from the model checkpoint at naver-clova-ix/donut-base and are newly initialized: ['encoder.model.layers.3.downsample.norm.bias', 'encoder.model.layers.3.downsample.norm.weight', 'encoder.model.layers.3.downsample.reduction.

In [3]:
import os
import urllib.request
import json
import pandas as pd
from tqdm import tqdm

# Read the test CSV into a DataFrame
csv_file = 'test.csv'  # Update with your actual test CSV file path
df = pd.read_csv(csv_file)

# Output folder for the test dataset and the name of its single split
dataset_name, test_split = 'Amazon_v1_test', 'test'

# Make sure <dataset_name>/<test_split>/ exists before any image is saved into it
test_dir = os.path.join(dataset_name, test_split)
os.makedirs(test_dir, exist_ok=True)

# Function to generate ground truth parse for test set (no entity value or unit)
def generate_test_ground_truth(entity_name):
    """Return a placeholder ground-truth parse for an unlabelled test sample.

    The result has the same ``{entity_name: {unit: value}}`` shape as a labelled
    parse, but both the unit and the value are the literal string ``"-"``.

    Args:
        entity_name: Key used for the outer dictionary.

    Returns:
        ``{entity_name: {"-": "-"}}``
    """
    placeholder = "-"  # stands in for both the unknown unit and the unknown value
    return {entity_name: {placeholder: placeholder}}
    
# Process the test dataset
def process_test_split(data):
    """Download the test images and write the test ``metadata.jsonl``.

    For each row of ``data`` the file at ``row['image_link']`` is saved into
    ``<dataset_name>/<test_split>/`` under the last path component of its URL, and
    a metadata record pairing that file name with a placeholder ground truth is
    collected. Rows whose download fails are skipped and get no metadata record.

    The records are written to ``<dataset_name>/<test_split>/metadata.jsonl``,
    i.e. in the same folder as the images. This mirrors the layout produced by
    ``process_split``; the ``file_name`` values are bare file names, so the
    metadata file has to sit beside the images it describes.

    Args:
        data: DataFrame providing the ``image_link`` and ``entity_name`` columns.
    """
    split_dir = os.path.join(dataset_name, test_split)
    metadata = []
    skipped = 0
    for _, row in tqdm(data.iterrows(), total=len(data), desc='Processing test'):
        # Download the image
        image_name = row['image_link'].split('/')[-1]
        image_path = os.path.join(split_dir, image_name)

        try:
            urllib.request.urlretrieve(row['image_link'], image_path)
        except Exception:
            # Best-effort download: skip rows that cannot be fetched. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit stop the loop.
            skipped += 1
            continue

        # Prepare metadata entry
        ground_truth_parse = generate_test_ground_truth(row['entity_name'])
        metadata_entry = {
            "file_name": image_name,
            "ground_truth": json.dumps({"gt_parse": ground_truth_parse})
        }
        metadata.append(metadata_entry)

    # Write metadata to jsonl file, next to the images (previously it was written one
    # level up, in <dataset_name>/, away from the files it references)
    metadata_file = os.path.join(split_dir, 'metadata.jsonl')
    with open(metadata_file, 'w') as f:
        for entry in metadata:
            f.write(json.dumps(entry) + '\n')

    # Surface how many rows were dropped instead of losing them silently.
    if skipped:
        print(f"test: skipped {skipped} of {len(data)} rows (image download failed)")

# Download every test image and write the test metadata file.
# NOTE: long-running - the saved output shows about 1h16m for 131,187 rows.
process_test_split(df)

print("Test dataset creation complete!")


Processing test: 100%|██████████| 131187/131187 [1:15:52<00:00, 28.81it/s]  


Test dataset creation complete!


In [4]:
# List the working directory to check that the generated dataset folders are present
!ls

Amazon_v1  Amazon_v1_test  data_prep.ipynb  donut  test.csv  train.csv
