## Process UCF-Small_Dataset

In [1]:
import pandas as pd

def process_large_csv(input_csv, output_csv=None, dataset_type="val"):
    """
    Load a captions CSV and rewrite its ``image_path`` column so every entry
    points into the matching Kaggle input directory.

    All columns of the CSV are loaded and kept; only ``image_path`` is changed.
    Every literal occurrence of the split-specific frame directory
    (e.g. ``train_output_frames/``) is removed from the path and the Kaggle
    directory for that split is prepended.

    Parameters:
    - input_csv (str): Path to the input CSV file. Must contain an ``image_path`` column.
    - output_csv (str, optional): Path to save the processed CSV file. If None (or empty), nothing is saved.
    - dataset_type (str): Which split is being processed: 'train', 'test' or 'val'.

    Returns:
    - pd.DataFrame: The loaded DataFrame with updated image paths.

    Raises:
    - ValueError: If ``dataset_type`` is not 'train', 'test' or 'val'.
    """
    # (frame directory to strip, Kaggle directory to prepend) for each split
    path_mappings = {
        "train": ("train_output_frames/", "/kaggle/input/ucf-small-dataset/train_ucf_output/"),
        "test": ("test_output_frames/", "/kaggle/input/ucf-small-dataset/test_ucf_output/"),
        "val": ("val_output_frames/", "/kaggle/input/ucf-small-dataset/val_ucf_output/")
    }

    # Fail fast on a typo such as "valid": silently falling back to empty
    # prefixes would save a CSV whose relative paths only break much later,
    # when the images are opened.
    if dataset_type not in path_mappings:
        raise ValueError(
            f"Unknown dataset_type {dataset_type!r}; expected one of {sorted(path_mappings)}"
        )
    remove_prefix, new_prefix = path_mappings[dataset_type]

    df = pd.read_csv(input_csv)

    # Literal (non-regex) replacement, then prepend the absolute directory
    stripped_paths = df["image_path"].str.replace(remove_prefix, "", regex=False)
    df["image_path"] = new_prefix + stripped_paths

    # Save the processed DataFrame if an output path is provided
    if output_csv:
        df.to_csv(output_csv, index=False)

    return df

# Rewrite the image paths of each UCF-Small split and write the result to the
# working directory. Mind the numbering: ds1 = train, ds2 = test, ds3 = val.
ds1 = process_large_csv(
    input_csv="/kaggle/input/ucf-small-dataset/train_image_captions.csv",
    output_csv="train_ucf_set.csv",
    dataset_type="train",
)
ds2 = process_large_csv(
    input_csv="/kaggle/input/ucf-small-dataset/test_image_captions.csv",
    output_csv="test_ucf_set.csv",
    dataset_type="test",
)
ds3 = process_large_csv(
    input_csv="/kaggle/input/ucf-small-dataset/val_image_captions.csv",
    output_csv="valid_ucf_set.csv",
    dataset_type="val",
)

# Display the processed training split
ds1

Unnamed: 0,image_path,caption,video_key,category,frame_index
0,/kaggle/input/ucf-small-dataset/train_ucf_outp...,A police car drove into a grassy field,Abuse006_x264,Abuse,141
1,/kaggle/input/ucf-small-dataset/train_ucf_outp...,"The police car stopped, and two police officer...",Abuse006_x264,Abuse,283
2,/kaggle/input/ucf-small-dataset/train_ucf_outp...,"The police car stopped, and two police officer...",Abuse006_x264,Abuse,283
3,/kaggle/input/ucf-small-dataset/train_ucf_outp...,"The police car stopped, and two police officer...",Abuse006_x264,Abuse,296
4,/kaggle/input/ucf-small-dataset/train_ucf_outp...,"The police car stopped, and two police officer...",Abuse006_x264,Abuse,309
...,...,...,...,...,...
1038,/kaggle/input/ucf-small-dataset/train_ucf_outp...,"The lights in the room turned on, someone was ...",Normal_Videos031_x264,Normal,2611
1039,/kaggle/input/ucf-small-dataset/train_ucf_outp...,"The room is illuminated by light, and the shel...",Normal_Videos031_x264,Normal,2952
1040,/kaggle/input/ucf-small-dataset/train_ucf_outp...,"The lights in the room are off, and there are ...",Normal_Videos031_x264,Normal,3294
1041,/kaggle/input/ucf-small-dataset/train_ucf_outp...,"The lights in the room are off, and there are ...",Normal_Videos031_x264,Normal,3294


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the captions table that is going to be re-split
df = pd.read_csv('/kaggle/input/ucf-crime-extracted-frames/test_image_captions.csv')

# One list of per-category frames for each split; concatenated once at the end
train_list, val_list, test_list = [], [], []

# Split inside each category so every category is represented in every split.
# The split is done on video keys, not rows, so all frames of one video land
# in exactly one split (no leakage of a video across train/val/test).
# NOTE(review): a category with only one or two videos probably cannot be
# split three ways by train_test_split — confirm the data has enough videos.
for category, group in df.groupby('category'):
    # Get all unique videos in this category
    video_keys = group['video_key'].unique()

    # Split into train (65%) and temp (35%)
    train_videos, temp_videos = train_test_split(
        video_keys, test_size=0.35, random_state=42
    )

    # Split temp into val (15%) and test (20%) of the original total:
    # 20/35 of the remaining 35% goes to test.
    val_videos, test_videos = train_test_split(
        temp_videos,
        test_size=(20 / 35),
        random_state=42
    )

    # Collect the rows belonging to each group of videos
    train_list.append(group[group['video_key'].isin(train_videos)])
    val_list.append(group[group['video_key'].isin(val_videos)])
    test_list.append(group[group['video_key'].isin(test_videos)])

# Concatenate all splits
train_df = pd.concat(train_list).reset_index(drop=True)
val_df = pd.concat(val_list).reset_index(drop=True)
test_df = pd.concat(test_list).reset_index(drop=True)

# Save splits (relative paths: written to the current working directory)
train_df.to_csv('train_image_captions.csv', index=False)
val_df.to_csv('val_image_captions.csv', index=False)
test_df.to_csv('test_image_captions.csv', index=False)

print(f"Train: {len(train_df)} samples")
print(f"Val: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")


## Process UCF-UCA-Dataset

In [24]:

# import pandas as pd
# def process_large_csv(input_csv, output_csv=None, dataset_type="val"):
#     """
#     Reads a large CSV file, extracts required columns, modifies the image_path 
#     by removing dataset-specific prefixes and adding the correct directory,
#     and optionally saves the processed DataFrame.
    
#     Parameters:
#     - input_csv (str): Path to the input CSV file.
#     - output_csv (str, optional): Path to save the processed CSV file. If None, it won't save.
#     - dataset_type (str): Type of dataset ('train', 'test', 'val') to determine path adjustments.
    
#     Returns:
#     - pd.DataFrame: Processed DataFrame with updated image paths.
#     """
#     #use_cols = ["image_path", "caption"]  # Load only needed columns

#     # Read CSV with only required columns
#     df = pd.read_csv(input_csv)

#     # Define path mappings based on dataset type
#     path_mappings = {
#         "train": ("output_frames/", "/kaggle/input/uca-ucf-dataset/train_ucf_output/"),
#         "test": ("output_frames/", "/kaggle/input/uca-ucf-dataset/test_ucf_output/"),
#         "val": ("val_output_frames/", "/kaggle/input/uca-ucf-dataset/Val_ucf_output/")
#     }

#     # Get the correct replacement values
#     remove_prefix, new_prefix = path_mappings.get(dataset_type, ("", ""))

#     # Remove the dataset-specific prefix and add the correct directory
#     df["image_path"] = df["image_path"].str.replace(remove_prefix, "", regex=False)
#     df["image_path"] = new_prefix + df["image_path"]

#     # Save the processed DataFrame if an output path is provided
#     if output_csv:
#         df.to_csv(output_csv, index=False)

#     return df

# # Process train, test, and validation datasets with the correct path adjustments
# train_df = process_large_csv("/kaggle/input/uca-ucf-dataset/train_image_captions (2).csv", 
#                              "train_ucf_set.csv", dataset_type="train")
# test_df = process_large_csv("/kaggle/input/uca-ucf-dataset/test_image_captions.csv", 
#                             "test_ucf_set.csv", dataset_type="test")
# valid_df = process_large_csv("/kaggle/input/uca-ucf-dataset/Val_image_captions.csv", 
#                              "valid_ucf_set.csv", dataset_type="val")

# train_df  # Display first few rows


Unnamed: 0,image_path,caption,video_key,category,frame_index
0,/kaggle/input/uca-ucf-dataset/train_ucf_output...,"A woman with short hair, slightly fat, wearing...",Abuse001_x264,Abuse,0
1,/kaggle/input/uca-ucf-dataset/train_ucf_output...,"A woman with short hair, slightly fat, wearing...",Abuse001_x264,Abuse,20
2,/kaggle/input/uca-ucf-dataset/train_ucf_output...,"A woman with short hair, slightly fat, wearing...",Abuse001_x264,Abuse,40
3,/kaggle/input/uca-ucf-dataset/train_ucf_output...,"A woman with short hair, slightly fat, wearing...",Abuse001_x264,Abuse,60
4,/kaggle/input/uca-ucf-dataset/train_ucf_output...,"A woman with short hair, slightly fat, wearing...",Abuse001_x264,Abuse,80
...,...,...,...,...,...
90382,/kaggle/input/uca-ucf-dataset/train_ucf_output...,walked past a woman in white,Normal_Videos_897_x264,Normal,560
90383,/kaggle/input/uca-ucf-dataset/train_ucf_output...,walked past a woman in white,Normal_Videos_897_x264,Normal,580
90384,/kaggle/input/uca-ucf-dataset/train_ucf_output...,walked past a woman in white,Normal_Videos_897_x264,Normal,600
90385,/kaggle/input/uca-ucf-dataset/train_ucf_output...,walked past a woman in white,Normal_Videos_897_x264,Normal,620


# Splitting into Train, Test and Valid sets

In [23]:
import pandas as pd

# # Read the CSV file into a DataFrame
# ds1= pd.read_csv('/kaggle/input/ucf-small-dataset/train_image_captions.csv')
# ds2= pd.read_csv('/kaggle/input/ucf-small-dataset/val_image_captions.csv')
# ds3= pd.read_csv('/kaggle/input/ucf-small-dataset/test_image_captions.csv')

# Stack the three processed splits row-wise into one frame with a fresh 0..N-1 index
combined_df = pd.concat((ds1, ds2, ds3), ignore_index=True)
print(combined_df)
# # Now df contains the data from the CSV file
# train_fl

                                               image_path  \
0       /kaggle/input/uca-ucf-dataset/train_ucf_output...   
1       /kaggle/input/uca-ucf-dataset/train_ucf_output...   
2       /kaggle/input/uca-ucf-dataset/train_ucf_output...   
3       /kaggle/input/uca-ucf-dataset/train_ucf_output...   
4       /kaggle/input/uca-ucf-dataset/train_ucf_output...   
...                                                   ...   
244198  /kaggle/input/uca-ucf-dataset/Val_ucf_output/N...   
244199  /kaggle/input/uca-ucf-dataset/Val_ucf_output/N...   
244200  /kaggle/input/uca-ucf-dataset/Val_ucf_output/N...   
244201  /kaggle/input/uca-ucf-dataset/Val_ucf_output/N...   
244202  /kaggle/input/uca-ucf-dataset/Val_ucf_output/N...   

                                                  caption  \
0       A woman with short hair, slightly fat, wearing...   
1       A woman with short hair, slightly fat, wearing...   
2       A woman with short hair, slightly fat, wearing...   
3       A woman with sh

In [21]:
# import pandas as pd
# from sklearn.model_selection import train_test_split
# import numpy as np

# # Example: load your dataset
# # Assuming your dataset is in a CSV file named 'dataset.csv'
# df = combined_df

# # Initialize empty DataFrames for each split
# train_df = pd.DataFrame()
# test_df = pd.DataFrame()
# valid_df = pd.DataFrame()

# # Define split percentages
# train_pct = 0.3
# # For the remaining 30%, you might split equally into test and valid (15% each)
# # Alternatively, adjust as needed:
# test_pct = 0.3
# valid_pct = 0.3

# # Process each category separately
# for cat in df['category'].unique():
#     cat_df = df[df['category'] == cat]
#     # Get unique video_keys within this category
#     video_keys = cat_df['video_key'].unique()
#     # Shuffle video_keys for random split (set random_state for reproducibility)
#     np.random.seed(42)
#     np.random.shuffle(video_keys)
    
#     n_total = len(video_keys)
#     n_train = int(train_pct * n_total)
#     n_test = int(test_pct * n_total)
#     # The remaining keys go to validation (or adjust if you want a different ratio)
    
#     train_keys = video_keys[:n_train]
#     test_keys = video_keys[n_train:n_train+n_test]
#     valid_keys = video_keys[n_train+n_test:]
    
#     # Assign all rows corresponding to these video_keys
#     train_df = pd.concat([train_df, cat_df[cat_df['video_key'].isin(train_keys)]])
#     test_df = pd.concat([test_df, cat_df[cat_df['video_key'].isin(test_keys)]])
#     valid_df = pd.concat([valid_df, cat_df[cat_df['video_key'].isin(valid_keys)]])
    
# # Optionally, shuffle the final DataFrames
# train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
# test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)
# valid_df = valid_df.sample(frac=1, random_state=42).reset_index(drop=True)

# # Now train_df, test_df, and valid_df are your splits where each video_key remains intact.
# print("Train set shape:", train_df.shape)
# print("Test set shape:", test_df.shape)
# print("Validation set shape:", valid_df.shape)

# # You can then save the splits to files if needed:
# train_df.to_csv('train_split.csv', index=False)
# test_df.to_csv('test_split.csv', index=False)
# valid_df.to_csv('valid_split.csv', index=False)


Train set shape: (83705, 5)
Test set shape: (69554, 5)
Validation set shape: (90944, 5)


# Hugging Face login

In [5]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub via the interactive notebook widget.
# The training cell is configured with push_to_hub=True, so a login is needed first.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Install needed Packages

In [6]:
!pip install datasets
!pip install accelerate
!pip install transformers
!pip install pycocotools



# Shuffling the DataFrames

In [25]:
import pandas as pd

# Shuffle the rows of every split and renumber the index from 0.
# A fixed seed (the same 42 used for the video-level split) makes the row
# order reproducible across kernel restarts.
# NOTE(review): valid_df is only assigned in cells that are currently
# commented out, so this cell depends on one of them having been run — confirm
# which cell is meant to define train_df / valid_df / test_df.
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
valid_df = valid_df.sample(frac=1, random_state=42).reset_index(drop=True)
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)
train_df

Unnamed: 0,image_path,caption,video_key,category,frame_index
0,/kaggle/input/uca-ucf-dataset/train_ucf_output...,"Three policemen came to the door, one walked i...",Abuse008_x264,Abuse,7420
1,/kaggle/input/uca-ucf-dataset/train_ucf_output...,Many vehicles are driving on the road.,Arrest015_x264,Arrest,480
2,/kaggle/input/uca-ucf-dataset/train_ucf_output...,Multiple firefighters were holding water pipe...,Explosion046_x264,Explosion,30220
3,/kaggle/input/uca-ucf-dataset/train_ucf_output...,The big man kept slashing at his head and body.,Assault004_x264,Assault,840
4,/kaggle/input/uca-ucf-dataset/train_ucf_output...,Two men had a quarrel,Fighting010_x264,Fighting,1740
...,...,...,...,...,...
90382,/kaggle/input/uca-ucf-dataset/train_ucf_output...,The woman in gray pulled away the man in the j...,Fighting004_x264,Fighting,7600
90383,/kaggle/input/uca-ucf-dataset/train_ucf_output...,"The camera changes the perspective, and the ma...",Robbery021_x264,Robbery,620
90384,/kaggle/input/uca-ucf-dataset/train_ucf_output...,The woman in the black and white skirt picked ...,Shoplifting014_x264,Shoplifting,16100
90385,/kaggle/input/uca-ucf-dataset/train_ucf_output...,The man walked around the sofa and came to a d...,Burglary040_x264,Burglary,2400


In [32]:
# Display the validation DataFrame
valid_df

Unnamed: 0,image_path,caption,video_key,category,frame_index
0,/kaggle/input/uca-ucf-dataset/Val_ucf_output/F...,"Another man stood up to help pack things, and ...",Fighting050_x264,Fighting,22560
1,/kaggle/input/uca-ucf-dataset/Val_ucf_output/F...,Two people were arguing in the middle of the r...,Fighting048_x264,Fighting,3580
2,/kaggle/input/uca-ucf-dataset/Val_ucf_output/R...,The man reached for the money and then opened ...,Robbery143_x264,Robbery,700
3,/kaggle/input/uca-ucf-dataset/Val_ucf_output/N...,The bald man quickly walked away from the shel...,Normal_Videos689_x264,Normal,8320
4,/kaggle/input/uca-ucf-dataset/Val_ucf_output/N...,"A woman in a gray coat walked by, then turned ...",Normal_Videos676_x264,Normal,2660
...,...,...,...,...,...
53043,/kaggle/input/uca-ucf-dataset/Val_ucf_output/F...,A man set up his bicycle for the man to ride a...,Fighting048_x264,Fighting,5620
53044,/kaggle/input/uca-ucf-dataset/Val_ucf_output/S...,The silver car continued to move backwards,Stealing109_x264,Stealing,1080
53045,/kaggle/input/uca-ucf-dataset/Val_ucf_output/N...,The man in the red hat chats with the man in blue,Normal_Videos689_x264,Normal,11240
53046,/kaggle/input/uca-ucf-dataset/Val_ucf_output/R...,The two worked together to suppress and beat t...,Robbery127_x264,Robbery,500


In [27]:
import pandas as pd

# Write each split to its own CSV in the working directory, without the index column
train_df.to_csv(path_or_buf='Train_ds.csv', index=False)
valid_df.to_csv(path_or_buf='Valid_ds.csv', index=False)
test_df.to_csv(path_or_buf='Test_ds.csv', index=False)

In [9]:
# # Read the CSV file into a DataFrame
# train_df = pd.read_csv('/kaggle/input/uca-ucf-dataset/Train_ds (3).csv')
# valid_df= pd.read_csv('/kaggle/input/uca-ucf-dataset/Valid_ds (3).csv')
# test_df = pd.read_csv('/kaggle/input/uca-ucf-dataset/Test_ds (3).csv')

In [33]:
from datasets import Dataset, DatasetDict

# Wrap every pandas split in a Hugging Face Dataset.
# Only the first 10,000 validation rows (by position) are kept.
train_datads = Dataset.from_pandas(train_df)
valid_datads = Dataset.from_pandas(valid_df.iloc[:10000])
test_datads = Dataset.from_pandas(test_df)

# Bundle the three splits under their names
ds = DatasetDict(
    {"train": train_datads, "validation": valid_datads, "test": test_datads}
)
ds

DatasetDict({
    train: Dataset({
        features: ['image_path', 'caption', 'video_key', 'category', 'frame_index'],
        num_rows: 90387
    })
    validation: Dataset({
        features: ['image_path', 'caption', 'video_key', 'category', 'frame_index'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['image_path', 'caption', 'video_key', 'category', 'frame_index'],
        num_rows: 100768
    })
})

# Initialize VisionEncoderDecoderModel

In [11]:
import torch
from torch.utils.data import Dataset
from PIL import Image
from transformers import AutoTokenizer, AutoFeatureExtractor
from transformers import VisionEncoderDecoderModel, AutoTokenizer, AutoFeatureExtractor, ViTImageProcessor
from datasets import DatasetDict

# Load the pretrained ViT-encoder / GPT-2-decoder captioning checkpoint together
# with its matching image processor and tokenizer from the Hugging Face Hub.
model_name="NourFakih/Vit-GPT2-COCO2017Flickr-85k-09"
model = VisionEncoderDecoderModel.from_pretrained(model_name)
feature_extractor = ViTImageProcessor.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# GPT2 only has bos/eos tokens but not decoder_start/pad tokens,
# so the eos token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Mirror the tokenizer's special-token ids into the model config
model.config.eos_token_id = tokenizer.eos_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Save a local copy of the (re-configured) starting checkpoint.
# Note: the training cell later rebinds output_dir to a different directory.
output_dir = "vit-gpt2-model"
model.save_pretrained(output_dir)
feature_extractor.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

config.json:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/957M [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "transformers_version": "4.47.0"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_rang

generation_config.json:   0%|          | 0.00/149 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/667 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

('vit-gpt2-model/tokenizer_config.json',
 'vit-gpt2-model/special_tokens_map.json',
 'vit-gpt2-model/vocab.json',
 'vit-gpt2-model/merges.txt',
 'vit-gpt2-model/added_tokens.json',
 'vit-gpt2-model/tokenizer.json')

# Disable Weights & Biases logging and set up NLTK

In [12]:
import os
import datasets
from transformers import VisionEncoderDecoderModel, AutoFeatureExtractor,AutoTokenizer
# Disable the Weights & Biases integration.
# Note: the training cell's output warns that this environment variable is
# deprecated in favour of the `report_to` training argument.
os.environ["WANDB_DISABLED"] = "true"

import nltk
# Download the "punkt" tokenizer data only if it is not already available locally.
# NOTE(review): presumably needed by nltk.sent_tokenize in postprocess_text; newer
# NLTK releases may look for "punkt_tab" instead — confirm against the installed version.
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    nltk.download("punkt", quiet=True)

# Define Metric

In [13]:
!pip install rouge_score
!pip install evaluate

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=386ed069b17a4a2a57ea84ddb832051d824c30c1b633449b91daa6a9d34f44bf
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [14]:
import evaluate
# ROUGE scorer used by compute_metrics
metric = evaluate.load("rouge")

import numpy as np

# When True, compute_metrics replaces label id -100 with the tokenizer's
# pad_token_id before decoding the reference captions.
ignore_pad_token_for_loss = True


def postprocess_text(preds, labels):
    """
    Normalise decoded predictions and references for ROUGE scoring.

    Each string is stripped of surrounding whitespace and re-joined with one
    sentence per line, because rougeLSum expects a newline after each sentence.

    Returns a ``(preds, labels)`` tuple of lists of strings.
    """
    def _one_sentence_per_line(texts):
        trimmed = [text.strip() for text in texts]
        return ["\n".join(nltk.sent_tokenize(text)) for text in trimmed]

    return _one_sentence_per_line(preds), _one_sentence_per_line(labels)


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [15]:
def compute_metrics(eval_preds):
    """
    Compute ROUGE scores and the mean generated length for an evaluation pass.

    Parameters:
    - eval_preds: A ``(preds, labels)`` pair. ``preds`` holds generated token
      ids (array, tensor, or a tuple whose first element is one of those);
      ``labels`` holds the reference token ids as an array.

    Returns:
    - dict: The ROUGE scores as percentages rounded to 4 decimals, plus
      ``gen_len``, the mean number of non-padding tokens per prediction.
    """
    preds, labels = eval_preds

    # Unwrap first, so a tuple whose first element is a tensor is handled too
    if isinstance(preds, tuple):
        preds = preds[0]
    if isinstance(preds, torch.Tensor):
        preds = preds.cpu().numpy()
    preds = np.asarray(preds)

    pad_id = tokenizer.pad_token_id

    # Predictions can arrive padded with negative filler ids such as -100 (the
    # same ignore index handled for labels below). Map those to the pad token
    # instead of clipping them to id 0, which is a real vocabulary entry and
    # would be decoded as text and counted in gen_len.
    preds = np.where(preds < 0, pad_id, preds)
    # Keep the original upper bound so out-of-range ids cannot break decoding
    preds = np.minimum(preds, tokenizer.vocab_size - 1)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    if ignore_pad_token_for_loss:
        # -100 marks ignored label positions; it cannot be decoded
        labels = np.where(labels != -100, labels, pad_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}

    # Length of each prediction, not counting padding
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return result



# Preparing Dataset for Training

In [29]:
import torch
from PIL import Image
# Make sure the tokenizer has a padding token before any caption is padded.

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Reuse eos as pad_token to prevent warnings


class ImageCapatioingDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that turns one split of ``ds`` into model inputs.

    Each item is a dict with:
    - ``labels``: token ids of the caption, padded/truncated to ``max_target_length``.
    - ``pixel_values``: the preprocessed image as returned by ``feature_extractor``.

    Relies on the module-level ``tokenizer`` and ``feature_extractor``.

    Parameters:
    - ds: Mapping from split name to a split that supports ``len()`` and
      integer row indexing returning a dict with 'image_path' and 'caption'.
    - ds_type (str): Name of the split to read from ``ds``.
    - max_target_length (int): Fixed length of the tokenized caption.
    """

    def __init__(self, ds, ds_type, max_target_length):
        self.ds = ds
        self.max_target_length = max_target_length
        self.ds_type = ds_type

    def __getitem__(self, idx):
        # Fetch just this row. Indexing the column first
        # (split['image_path'][idx]) can materialise the entire column on
        # every call, which makes each item cost proportional to the split size.
        row = self.ds[self.ds_type][idx]
        model_inputs = dict()
        model_inputs['labels'] = self.tokenization_fn(row['caption'], self.max_target_length)
        model_inputs['pixel_values'] = self.feature_extraction_fn(row['image_path'])
        return model_inputs

    def __len__(self):
        return len(self.ds[self.ds_type])

    # text preprocessing step
    def tokenization_fn(self, caption, max_target_length):
        """Tokenize a caption into exactly ``max_target_length`` token ids."""
        # truncation=True keeps every label the same length; without it an
        # over-long caption yields a longer sequence that cannot be batched
        # with the padded ones.
        labels = tokenizer(caption,
                          padding="max_length",
                          truncation=True,
                          max_length=max_target_length).input_ids

        return labels

    # image preprocessing step
    def feature_extraction_fn(self, image_path):
        """
        Load an image from disk and run the feature extractor on it.

        The image is converted to RGB and resized to 224x224 before being
        passed to ``feature_extractor``. Any error raised while opening the
        file propagates to the caller.
        """
        # The context manager closes the underlying file as soon as the pixel
        # data has been copied by convert(), instead of leaving it to the GC.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        image = image.resize((224, 224))

        encoder_inputs = feature_extractor(images=image, return_tensors="np")

        return encoder_inputs.pixel_values[0]


# Training and evaluation views over the DatasetDict; captions are padded to 256 tokens
train_ds = ImageCapatioingDataset(ds=ds, ds_type='train', max_target_length=256)
eval_ds = ImageCapatioingDataset(ds=ds, ds_type='validation', max_target_length=256)




# Training

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# Local checkpoint directory and the Hub repository that receives the checkpoints
output_dir="./Vit-GPT2-UCA-UCF-06"
hub_model_id="NourFakih/Vit-GPT2-UCA-UCF-06"

training_args = Seq2SeqTrainingArguments(
    # Where checkpoints are written and how many are kept
    output_dir=output_dir,
    overwrite_output_dir=True,
    save_total_limit=2,
    # Batching: 4 samples per device, gradients accumulated over 4 steps
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    # Evaluate and checkpoint on the same 500-step schedule; reload the best model when done
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    load_best_model_at_end=True,
    # Evaluation generates captions so compute_metrics can score decoded text
    predict_with_generate=True,
    # Hub upload: push every checkpoint to hub_model_id
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="all_checkpoints",
    hub_always_push=True,
)
from transformers import default_data_collator

# Assemble the trainer from the model, datasets and metric function defined in earlier cells
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

# Fine-tune, then upload the final result
trainer.train()
trainer.push_to_hub()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss


# Push to HuggingFace

In [None]:
# Write the trainer's model and the tokenizer files to the local output directory
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:
# Save the tokenizer files to output_dir (repeats the save done in the previous cell)
tokenizer.save_pretrained(output_dir)

In [None]:
# Upload the tokenizer to the Hub repository named by hub_model_id
tokenizer.push_to_hub(hub_model_id)

In [None]:
# Upload the image processor configuration to the same Hub repository
feature_extractor.push_to_hub(hub_model_id)

In [None]:
# Save the image processor configuration next to the model in output_dir
feature_extractor.save_pretrained(output_dir)

In [None]:
# Save the model weights and config to output_dir
model.save_pretrained(output_dir)

In [None]:
# Upload the model to the Hub repository named by hub_model_id
model.push_to_hub(hub_model_id)