## Process UCF-Small_Dataset

In [1]:
# import pandas as pd

# def process_large_csv(input_csv, output_csv=None, dataset_type="val"):
#     """
#     Reads a large CSV file, extracts required columns, modifies the image_path 
#     by removing dataset-specific prefixes and adding the correct directory,
#     and optionally saves the processed DataFrame.
    
#     Parameters:
#     - input_csv (str): Path to the input CSV file.
#     - output_csv (str, optional): Path to save the processed CSV file. If None, it won't save.
#     - dataset_type (str): Type of dataset ('train', 'test', 'val') to determine path adjustments.
    
#     Returns:
#     - pd.DataFrame: Processed DataFrame with updated image paths.
#     """
#     #use_cols = ["image_path", "caption"]  # Load only needed columns

#     # Read CSV with only required columns
#     df = pd.read_csv(input_csv)

#     # Define path mappings based on dataset type
#     path_mappings = {
#         "train": ("train_output_frames/", "/kaggle/input/ucf-small-dataset/train_ucf_output/"),
#         "test": ("test_output_frames/", "/kaggle/input/ucf-small-dataset/test_ucf_output/"),
#         "val": ("val_output_frames/", "/kaggle/input/ucf-small-dataset/val_ucf_output/")
#     }

#     # Get the correct replacement values
#     remove_prefix, new_prefix = path_mappings.get(dataset_type, ("", ""))

#     # Remove the dataset-specific prefix and add the correct directory
#     df["image_path"] = df["image_path"].str.replace(remove_prefix, "", regex=False)
#     df["image_path"] = new_prefix + df["image_path"]

#     # Save the processed DataFrame if an output path is provided
#     if output_csv:
#         df.to_csv(output_csv, index=False)

#     return df

# # Process train, test, and validation datasets with the correct path adjustments
# ds1 = process_large_csv("/kaggle/input/ucf-small-dataset/train_image_captions.csv", 
#                              "train_ucf_set.csv", dataset_type="train")
# ds2 = process_large_csv("/kaggle/input/ucf-small-dataset/test_image_captions.csv", 
#                             "test_ucf_set.csv", dataset_type="test")
# ds3 = process_large_csv("/kaggle/input/ucf-small-dataset/val_image_captions.csv", 
#                              "valid_ucf_set.csv", dataset_type="val")

# # train_ds  # Display first few rows
# ds1

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the full dataset
df = pd.read_csv("/kaggle/input/ucf-crime-extracted-frames/test_image_captions.csv")

# Initialize split containers
train_list, val_list, test_list = [], [], []

# Group by category
for category, group in df.groupby('category'):
    # Get all unique videos in this category
    video_keys = group['video_key'].unique()
    
    # Split into train (80%) and temp (20%)
    train_videos, temp_videos = train_test_split(
        video_keys, test_size=0.20, random_state=42
    )
    
    # Split temp into val (15%) and test (20%)
    val_videos, test_videos = train_test_split(
        temp_videos,
        test_size=(10 / 20),  # Adjusted for original total
        random_state=42
    )
    
    # Get dataframes for each split
    train_list.append(group[group['video_key'].isin(train_videos)])
    val_list.append(group[group['video_key'].isin(val_videos)])
    test_list.append(group[group['video_key'].isin(test_videos)])

# Concatenate all splits
train_df = pd.concat(train_list).reset_index(drop=True)
val_df = pd.concat(val_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

# Save splits
train_df.to_csv('train_image_captions.csv', index=False)
val_df.to_csv('val_image_captions.csv', index=False)
test_df.to_csv('test_image_captions.csv', index=False)

print(f"Train: {len(train_df)} samples")
print(f"Val: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")
train_df

Train: 24556 samples
Val: 2694 samples
Test: 2060 samples


Unnamed: 0,image_path,caption,video_key,category,frame_index
0,output_frames/Abuse001_x264_cap0_frame39.jpg,"A woman with short hair, slightly fat, wearing...",Abuse001_x264,Abuse,39
1,output_frames/Abuse001_x264_cap0_frame119.jpg,"A woman with short hair, slightly fat, wearing...",Abuse001_x264,Abuse,119
2,output_frames/Abuse001_x264_cap1_frame221.jpg,A man wearing a white shirt and black pants en...,Abuse001_x264,Abuse,221
3,output_frames/Abuse001_x264_cap1_frame243.jpg,A man wearing a white shirt and black pants en...,Abuse001_x264,Abuse,243
4,output_frames/Abuse001_x264_cap2_frame225.jpg,A man wearing a black shirt and black pants en...,Abuse001_x264,Abuse,225
...,...,...,...,...,...
24551,output_frames/Vandalism050_x264_cap0_frame180.jpg,There are three people next to a black car and...,Vandalism050_x264,Vandalism,180
24552,output_frames/Vandalism050_x264_cap1_frame318.jpg,A man in gray clothes with a hat poured someth...,Vandalism050_x264,Vandalism,318
24553,output_frames/Vandalism050_x264_cap1_frame476.jpg,A man in gray clothes with a hat poured someth...,Vandalism050_x264,Vandalism,476
24554,output_frames/Vandalism050_x264_cap2_frame641.jpg,The man in white pants next to the black car p...,Vandalism050_x264,Vandalism,641


In [3]:
# Define root directory for the images
root_path = '/kaggle/input/ucf-crime-extracted-frames/'

# Function to update paths
def update_paths(df):
    df['image_path'] = root_path + df['image_path']
    return df

# Apply to each split
train_df = update_paths(train_df)
val_df = update_paths(val_df)
test_df = update_paths(test_df)

# Save updated CSVs
train_df.to_csv('train_image_captions.csv', index=False)
val_df.to_csv('val_image_captions.csv', index=False)
test_df.to_csv('test_image_captions.csv', index=False)

print("Paths updated and CSVs saved.")
train_df

Paths updated and CSVs saved.


Unnamed: 0,image_path,caption,video_key,category,frame_index
0,/kaggle/input/ucf-crime-extracted-frames/outpu...,"A woman with short hair, slightly fat, wearing...",Abuse001_x264,Abuse,39
1,/kaggle/input/ucf-crime-extracted-frames/outpu...,"A woman with short hair, slightly fat, wearing...",Abuse001_x264,Abuse,119
2,/kaggle/input/ucf-crime-extracted-frames/outpu...,A man wearing a white shirt and black pants en...,Abuse001_x264,Abuse,221
3,/kaggle/input/ucf-crime-extracted-frames/outpu...,A man wearing a white shirt and black pants en...,Abuse001_x264,Abuse,243
4,/kaggle/input/ucf-crime-extracted-frames/outpu...,A man wearing a black shirt and black pants en...,Abuse001_x264,Abuse,225
...,...,...,...,...,...
24551,/kaggle/input/ucf-crime-extracted-frames/outpu...,There are three people next to a black car and...,Vandalism050_x264,Vandalism,180
24552,/kaggle/input/ucf-crime-extracted-frames/outpu...,A man in gray clothes with a hat poured someth...,Vandalism050_x264,Vandalism,318
24553,/kaggle/input/ucf-crime-extracted-frames/outpu...,A man in gray clothes with a hat poured someth...,Vandalism050_x264,Vandalism,476
24554,/kaggle/input/ucf-crime-extracted-frames/outpu...,The man in white pants next to the black car p...,Vandalism050_x264,Vandalism,641


## Process UCF-UCA-Dataset

In [4]:

# import pandas as pd
# def process_large_csv(input_csv, output_csv=None, dataset_type="val"):
#     """
#     Reads a large CSV file, extracts required columns, modifies the image_path 
#     by removing dataset-specific prefixes and adding the correct directory,
#     and optionally saves the processed DataFrame.
    
#     Parameters:
#     - input_csv (str): Path to the input CSV file.
#     - output_csv (str, optional): Path to save the processed CSV file. If None, it won't save.
#     - dataset_type (str): Type of dataset ('train', 'test', 'val') to determine path adjustments.
    
#     Returns:
#     - pd.DataFrame: Processed DataFrame with updated image paths.
#     """
#     #use_cols = ["image_path", "caption"]  # Load only needed columns

#     # Read CSV with only required columns
#     df = pd.read_csv(input_csv)

#     # Define path mappings based on dataset type
#     path_mappings = {
#         "train": ("output_frames/", "/kaggle/input/uca-ucf-dataset/train_ucf_output/"),
#         "test": ("output_frames/", "/kaggle/input/uca-ucf-dataset/test_ucf_output/"),
#         "val": ("val_output_frames/", "/kaggle/input/uca-ucf-dataset/Val_ucf_output/")
#     }

#     # Get the correct replacement values
#     remove_prefix, new_prefix = path_mappings.get(dataset_type, ("", ""))

#     # Remove the dataset-specific prefix and add the correct directory
#     df["image_path"] = df["image_path"].str.replace(remove_prefix, "", regex=False)
#     df["image_path"] = new_prefix + df["image_path"]

#     # Save the processed DataFrame if an output path is provided
#     if output_csv:
#         df.to_csv(output_csv, index=False)

#     return df

# # Process train, test, and validation datasets with the correct path adjustments
# train_df = process_large_csv("/kaggle/input/uca-ucf-dataset/train_image_captions (2).csv", 
#                              "train_ucf_set.csv", dataset_type="train")
# test_df = process_large_csv("/kaggle/input/uca-ucf-dataset/test_image_captions.csv", 
#                             "test_ucf_set.csv", dataset_type="test")
# valid_df = process_large_csv("/kaggle/input/uca-ucf-dataset/Val_image_captions.csv", 
#                              "valid_ucf_set.csv", dataset_type="val")

# train_df  # Display first few rows


# Splitting into Train, Test and Valid sets

In [5]:
import pandas as pd

# # Read the CSV file into a DataFrame
# ds1= pd.read_csv('/kaggle/input/ucf-small-dataset/train_image_captions.csv')
# ds2= pd.read_csv('/kaggle/input/ucf-small-dataset/val_image_captions.csv')
# ds3= pd.read_csv('/kaggle/input/ucf-small-dataset/test_image_captions.csv')

# combined_df = pd.concat([ds1, ds2, ds3], axis=0, ignore_index=True)
# print(combined_df)
# # Now df contains the data from the CSV file
# train_fl

In [6]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# import numpy as np

# # Example: load your dataset
# # Assuming your dataset is in a CSV file named 'dataset.csv'
# df = combined_df

# # Initialize empty DataFrames for each split
# train_df = pd.DataFrame()
# test_df = pd.DataFrame()
# valid_df = pd.DataFrame()

# # Define split percentages
# train_pct = 0.3
# # For the remaining 30%, you might split equally into test and valid (15% each)
# # Alternatively, adjust as needed:
# test_pct = 0.3
# valid_pct = 0.3

# # Process each category separately
# for cat in df['category'].unique():
#     cat_df = df[df['category'] == cat]
#     # Get unique video_keys within this category
#     video_keys = cat_df['video_key'].unique()
#     # Shuffle video_keys for random split (set random_state for reproducibility)
#     np.random.seed(42)
#     np.random.shuffle(video_keys)
    
#     n_total = len(video_keys)
#     n_train = int(train_pct * n_total)
#     n_test = int(test_pct * n_total)
#     # The remaining keys go to validation (or adjust if you want a different ratio)
    
#     train_keys = video_keys[:n_train]
#     test_keys = video_keys[n_train:n_train+n_test]
#     valid_keys = video_keys[n_train+n_test:]
    
#     # Assign all rows corresponding to these video_keys
#     train_df = pd.concat([train_df, cat_df[cat_df['video_key'].isin(train_keys)]])
#     test_df = pd.concat([test_df, cat_df[cat_df['video_key'].isin(test_keys)]])
#     valid_df = pd.concat([valid_df, cat_df[cat_df['video_key'].isin(valid_keys)]])
    
# # Optionally, shuffle the final DataFrames
# train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
# test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)
# valid_df = valid_df.sample(frac=1, random_state=42).reset_index(drop=True)

# # Now train_df, test_df, and valid_df are your splits where each video_key remains intact.
# print("Train set shape:", train_df.shape)
# print("Test set shape:", test_df.shape)
# print("Validation set shape:", valid_df.shape)

# # You can then save the splits to files if needed:
# train_df.to_csv('train_split.csv', index=False)
# test_df.to_csv('test_split.csv', index=False)
# valid_df.to_csv('valid_split.csv', index=False)


# Hugging Face login

In [7]:

import os
os.environ["HF_TOKEN"] = "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"

from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Install needed Packages

In [8]:
!pip install datasets
!pip install accelerate
!pip install transformers
!pip install pycocotools



# Mixing Dataframe

In [9]:
import pandas as pd

# Assuming your DataFrame is named df
train_df = train_df.sample(frac=1).reset_index(drop=True)
val_df = val_df.sample(frac=1).reset_index(drop=True)
test_df = test_df.sample(frac=1).reset_index(drop=True)
train_df

Unnamed: 0,image_path,caption,video_key,category,frame_index
0,/kaggle/input/ucf-crime-extracted-frames/outpu...,A group of people wearing orange half-sleeves ...,Normal_Videos308_x264,Normal,19851
1,/kaggle/input/ucf-crime-extracted-frames/outpu...,"A motorcycle turns right, and a woman holding ...",RoadAccidents139_x264,RoadAccidents,693
2,/kaggle/input/ucf-crime-extracted-frames/outpu...,"On the left side of the screen, two men in bl...",Normal_Videos308_x264,Normal,802115
3,/kaggle/input/ucf-crime-extracted-frames/outpu...,"There was a man chasing him behind,",Shooting031_x264,Shooting,1873
4,/kaggle/input/ucf-crime-extracted-frames/outpu...,The man in a dark blue shirt picked up the ite...,Normal_Videos012_x264,Normal,518
...,...,...,...,...,...
24551,/kaggle/input/ucf-crime-extracted-frames/outpu...,"On the side of the road, a bald man wearing a ...",Normal_Videos079_x264,Normal,1068
24552,/kaggle/input/ucf-crime-extracted-frames/outpu...,The performers at the back walked to the left ...,Normal_Videos308_x264,Normal,773379
24553,/kaggle/input/ucf-crime-extracted-frames/outpu...,A yellow taxi is turning left,Normal_Videos087_x264,Normal,28566
24554,/kaggle/input/ucf-crime-extracted-frames/outpu...,A white double-decker bus drives from right to...,Normal_Videos087_x264,Normal,80143


In [10]:
val_df


Unnamed: 0,image_path,caption,video_key,category,frame_index
0,/kaggle/input/ucf-crime-extracted-frames/outpu...,A car stopped in front of the gate of a house,Arson028_x264,Arson,125
1,/kaggle/input/ucf-crime-extracted-frames/outpu...,"The lights in the room are on, there are many ...",Normal_Videos031_x264,Normal,3303
2,/kaggle/input/ucf-crime-extracted-frames/outpu...,The man rummaged in the drawers of other desks.,Burglary032_x264,Burglary,3603
3,/kaggle/input/ucf-crime-extracted-frames/outpu...,A man in gray clothes counts money,Robbery020_x264,Robbery,1791
4,/kaggle/input/ucf-crime-extracted-frames/outpu...,"Two policemen stopped at the intersection, and...",Arrest020_x264,Arrest,815
...,...,...,...,...,...
2689,/kaggle/input/ucf-crime-extracted-frames/outpu...,The firefighter with a water pipe sprayed some...,Arson019_x264,Arson,33284
2690,/kaggle/input/ucf-crime-extracted-frames/outpu...,"The woman walked towards the man, but the woma...",Arrest048_x264,Arrest,963
2691,/kaggle/input/ucf-crime-extracted-frames/outpu...,A man took out a black box from the trunk of t...,Arson019_x264,Arson,86406
2692,/kaggle/input/ucf-crime-extracted-frames/outpu...,The old man in red outside the elevator presse...,Normal_Videos_783_x264,Normal,1074


In [11]:
import pandas as pd

# Assuming your DataFrame is named ds
train_df.to_csv('Train_ds.csv', index=False)
val_df.to_csv('Valid_ds.csv', index=False)
test_df.to_csv('Test_ds.csv', index=False)

In [12]:
# # Read the CSV file into a DataFrame
# train_df = pd.read_csv('/kaggle/input/uca-ucf-dataset/Train_ds (3).csv')
# valid_df= pd.read_csv('/kaggle/input/uca-ucf-dataset/Valid_ds (3).csv')
# test_df = pd.read_csv('/kaggle/input/uca-ucf-dataset/Test_ds (3).csv')

In [13]:
from datasets import Dataset, DatasetDict

# Convert each split into a Dataset object
train_datads = Dataset.from_pandas(train_df)
valid_datads = Dataset.from_pandas(val_df[:1000])
test_datads = Dataset.from_pandas(test_df[:500])

# Organize the splits into a DatasetDict
ds = DatasetDict({
    "train": train_datads,
    "validation": valid_datads,
    "test": test_datads
})
ds

DatasetDict({
    train: Dataset({
        features: ['image_path', 'caption', 'video_key', 'category', 'frame_index'],
        num_rows: 24556
    })
    validation: Dataset({
        features: ['image_path', 'caption', 'video_key', 'category', 'frame_index'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['image_path', 'caption', 'video_key', 'category', 'frame_index'],
        num_rows: 500
    })
})

# Initialize VisionEncoderDecoderModel

In [14]:
import torch
from torch.utils.data import Dataset
from PIL import Image
from transformers import AutoTokenizer, AutoFeatureExtractor
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor, ViTImageProcessor
from datasets import DatasetDict

# Initialize tokenizer and feature extractor (replace 'model_name' with your model's name)
#tokenizer = AutoTokenizer.from_pretrained('model_name')
#feature_extractor = AutoFeatureExtractor.from_pretrained('model_name')
#model_name="NourFakih/Vit-GPT2-COCO2017Flickr-85k-09"
model_name="nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(model_name)
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT2 only has bos/eos tokens but not decoder_start/pad tokens
tokenizer.pad_token = tokenizer.eos_token

# update the model config
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

output_dir = "vit-gpt2-model"
model.save_pretrained(output_dir)
feature_extractor.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.0"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

('vit-gpt2-model/tokenizer_config.json',
 'vit-gpt2-model/special_tokens_map.json',
 'vit-gpt2-model/vocab.json',
 'vit-gpt2-model/merges.txt',
 'vit-gpt2-model/added_tokens.json',
 'vit-gpt2-model/tokenizer.json')

# WANDB_DISABLED and nltk

In [15]:
import os
import datasets
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor,AutoTokenizer
os.environ["WANDB_DISABLED"] = "true"

import nltk
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    nltk.download("punkt", quiet=True)

# Define Metric

In [16]:
!pip install rouge_score
!pip install evaluate

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=dddf9506e62435de6e5fc2773114ec7f3c551982d79aba5281f771421bd17b70
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [17]:
import evaluate
metric = evaluate.load("rouge")

import numpy as np

ignore_pad_token_for_loss = True


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [18]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    # Ensure preds is a NumPy array
    if isinstance(preds, torch.Tensor):
        preds = preds.cpu().numpy()

    if isinstance(preds, tuple):
        preds = preds[0]

    # Debugging
    print("Preds before decoding:", preds)
    
    # Ensure valid token IDs
    preds = np.clip(preds, 0, tokenizer.vocab_size - 1)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    if ignore_pad_token_for_loss:
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return result



# Preparing Dataset for Training

In [19]:
import torch
from PIL import Image
# Fix attention mask issue by explicitly setting pad token

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token to prevent warnings


class ImageCapatioingDataset(torch.utils.data.Dataset):
    def __init__(self, ds, ds_type, max_target_length):
        self.ds = ds
        self.max_target_length = max_target_length
        self.ds_type = ds_type

    def __getitem__(self, idx):
        image_path = self.ds[self.ds_type]['image_path'][idx]
        caption = self.ds[self.ds_type]['caption'][idx]
        model_inputs = dict()
        model_inputs['labels'] = self.tokenization_fn(caption, self.max_target_length)
        model_inputs['pixel_values'] = self.feature_extraction_fn(image_path)
        return model_inputs

    def __len__(self):
        return len(self.ds[self.ds_type])
    
    # text preprocessing step
    def tokenization_fn(self, caption, max_target_length):
        """Run tokenization on caption."""
        labels = tokenizer(caption, 
                          padding="max_length", 
                          max_length=max_target_length).input_ids

        return labels
    
    # image preprocessing step
    def feature_extraction_fn(self, image_path):
        """
        Run feature extraction on images
        If `check_image` is `True`, the examples that fails during `Image.open()` will be caught and discarded.
        Otherwise, an exception will be thrown.
        """
        image = Image.open(image_path).convert("RGB")
        image = image.resize((224, 224))

        encoder_inputs = feature_extractor(images=image, return_tensors="np")

        return encoder_inputs.pixel_values[0]


train_ds = ImageCapatioingDataset(ds, 'train', 256)
eval_ds = ImageCapatioingDataset(ds, 'validation', 256)




# Training

In [20]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
output_dir="./Vit-GPT2-UCA-UCF-06"
hub_model_id="NourFakih/Vit-GPT2-UCA-UCF-06"
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    save_steps=1000,
    eval_steps=500,
    eval_strategy="steps",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    output_dir=output_dir,
    overwrite_output_dir=True,
    #evaluation_strategy="steps",
    save_total_limit=2,
    load_best_model_at_end=True,
    #tpu_num_cores
    #accelerator_config (str, dict, or AcceleratorConfig, optional),
    push_to_hub=True,
    hub_strategy="all_checkpoints",
    #resume_from_checkpoint="./Vit-GPT2-UCA-UCF-05/checkpoint-500",
    hub_always_push=True,
    hub_model_id=hub_model_id
)
from transformers import default_data_collator
# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=default_data_collator,
)
trainer.train()
trainer.push_to_hub()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
500,0.8073,0.184,31.5942,9.2754,27.0997,27.4879,17.309
1000,0.6562,0.180519,31.3758,9.5474,26.788,27.1031,16.271
1500,0.6123,0.179457,32.219,9.7783,27.4235,27.7537,16.455
2000,0.5502,0.182103,31.0914,9.2688,26.5321,26.8962,15.66
2500,0.5281,0.183209,31.0119,9.0876,26.4645,26.7925,16.042
3000,0.5085,0.184712,31.0869,9.0206,26.2838,26.6729,16.004
3500,0.4584,0.191913,29.6475,8.3551,25.1859,25.455,15.92
4000,0.4536,0.19222,30.3476,8.7192,25.8444,26.0811,15.981
4500,0.4477,0.193697,29.6433,8.3589,25.256,25.5825,15.63


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instea

Preds before decoding: [[50256   464   582 ... 50256 50256 50256]
 [50256   464   582 ... 18316    13 50256]
 [50256   464   582 ...  2330 10147    13]
 ...
 [50256   464   582 ...   290  3114   379]
 [50256   464   582 ...  3084    13 50256]
 [50256   464   582 ...   262  3084    13]]


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Preds before decoding: [[50256   464   582 ... 50256 50256 50256]
 [50256   464   582 ...  5118  1306   284]
 [50256   464   582 ... 50256 50256 50256]
 ...
 [50256   464   582 ... 50256 50256 50256]
 [50256   464   582 ...  1243   319   262]
 [50256   464   582 ...   287   262  6131]]


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Preds before decoding: [[50256   464   582 ... 50256 50256 50256]
 [50256    32   582 ... 50256 50256 50256]
 [50256   464   582 ...   290  6619   284]
 ...
 [50256   464   582 ...   319   262  3084]
 [50256   464   582 ...   683    13   383]
 [50256   464   582 ... 50256 50256 50256]]


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Preds before decoding: [[50256   464   582 ... 50256 50256 50256]
 [50256    32   582 ...   262  3159 50256]
 [50256   464   582 ... 50256 50256 50256]
 ...
 [50256   464   582 ... 50256 50256 50256]
 [50256   464   582 ...  2323    13  3244]
 [50256   464   582 ...   262  1364 50256]]


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Preds before decoding: [[50256   464   582 ... 50256 50256 50256]
 [50256    32   582 ...  1364 50256 50256]
 [50256   464   582 ... 50256 50256 50256]
 ...
 [50256   464   582 ...   340   319   262]
 [50256   464   582 ...   379   262  3709]
 [50256   464   582 ... 50256 50256 50256]]


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Preds before decoding: [[50256   464   582 ... 50256 50256 50256]
 [50256    32   582 ... 50256 50256 50256]
 [50256   464   582 ...    13 50256 50256]
 ...
 [50256   464   582 ...    13  3244   339]
 [50256   464   582 ...  6807   284   262]
 [50256   464   582 ...    13 50256 50256]]


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Preds before decoding: [[50256   464   582 ... 50256 50256 50256]
 [50256    32   582 ... 50256 50256 50256]
 [50256   464   582 ...  5055   510 50256]
 ...
 [50256   464   582 ...  3084   287  2166]
 [50256   464   582 ...   262  2323    13]
 [50256   464   582 ... 50256 50256 50256]]


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Preds before decoding: [[50256   464   582 ... 50256 50256 50256]
 [50256    32   582 ... 50256 50256 50256]
 [50256   464   582 ... 50256 50256 50256]
 ...
 [50256   464   582 ...   262  3504   286]
 [50256   464   582 ...  1223    13   383]
 [50256   464   582 ...    13 50256 50256]]


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

Preds before decoding: [[50256   464   582 ... 50256 50256 50256]
 [50256    32   582 ... 50256 50256 50256]
 [50256   464   582 ... 50256 50256 50256]
 ...
 [50256   464   582 ...  3084   287   262]
 [50256   464   582 ... 50256 50256 50256]
 [50256   464   582 ...    13 50256 50256]]


Could not locate the best model at ./Vit-GPT2-UCA-UCF-06/checkpoint-1500/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


CommitInfo(commit_url='https://huggingface.co/NourFakih/Vit-GPT2-UCA-UCF-06/commit/4058d6925d7f54f49f949bb936addd27d09cc92d', commit_message='End of training', commit_description='', oid='4058d6925d7f54f49f949bb936addd27d09cc92d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NourFakih/Vit-GPT2-UCA-UCF-06', endpoint='https://huggingface.co', repo_type='model', repo_id='NourFakih/Vit-GPT2-UCA-UCF-06'), pr_revision=None, pr_num=None)

# Push to HuggingFace

In [21]:
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

No files have been modified since last commit. Skipping to prevent empty commit.


('./Vit-GPT2-UCA-UCF-06/tokenizer_config.json',
 './Vit-GPT2-UCA-UCF-06/special_tokens_map.json',
 './Vit-GPT2-UCA-UCF-06/vocab.json',
 './Vit-GPT2-UCA-UCF-06/merges.txt',
 './Vit-GPT2-UCA-UCF-06/added_tokens.json',
 './Vit-GPT2-UCA-UCF-06/tokenizer.json')

In [22]:
tokenizer.save_pretrained(output_dir)

('./Vit-GPT2-UCA-UCF-06/tokenizer_config.json',
 './Vit-GPT2-UCA-UCF-06/special_tokens_map.json',
 './Vit-GPT2-UCA-UCF-06/vocab.json',
 './Vit-GPT2-UCA-UCF-06/merges.txt',
 './Vit-GPT2-UCA-UCF-06/added_tokens.json',
 './Vit-GPT2-UCA-UCF-06/tokenizer.json')

In [23]:
tokenizer.push_to_hub(hub_model_id)

README.md:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/NourFakih/Vit-GPT2-UCA-UCF-06/commit/4058d6925d7f54f49f949bb936addd27d09cc92d', commit_message='Upload tokenizer', commit_description='', oid='4058d6925d7f54f49f949bb936addd27d09cc92d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NourFakih/Vit-GPT2-UCA-UCF-06', endpoint='https://huggingface.co', repo_type='model', repo_id='NourFakih/Vit-GPT2-UCA-UCF-06'), pr_revision=None, pr_num=None)

In [24]:
feature_extractor.push_to_hub(hub_model_id)

CommitInfo(commit_url='https://huggingface.co/NourFakih/Vit-GPT2-UCA-UCF-06/commit/6d553d3316222512b8543117e95a5c909184024d', commit_message='Upload processor', commit_description='', oid='6d553d3316222512b8543117e95a5c909184024d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NourFakih/Vit-GPT2-UCA-UCF-06', endpoint='https://huggingface.co', repo_type='model', repo_id='NourFakih/Vit-GPT2-UCA-UCF-06'), pr_revision=None, pr_num=None)

In [25]:
feature_extractor.save_pretrained(output_dir)

['./Vit-GPT2-UCA-UCF-06/preprocessor_config.json']

In [26]:
model.save_pretrained(output_dir)

In [27]:
model.push_to_hub(hub_model_id)

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/NourFakih/Vit-GPT2-UCA-UCF-06/commit/6d553d3316222512b8543117e95a5c909184024d', commit_message='Upload model', commit_description='', oid='6d553d3316222512b8543117e95a5c909184024d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NourFakih/Vit-GPT2-UCA-UCF-06', endpoint='https://huggingface.co', repo_type='model', repo_id='NourFakih/Vit-GPT2-UCA-UCF-06'), pr_revision=None, pr_num=None)