# **Notebook Summary**

This notebook demonstrates the evaluation and deployment of a fine-tuned image captioning model (Vit-GPT2-UCA-UCF-06) built on Vision Transformer (ViT) and GPT-2. The model is loaded using Hugging Face Transformers and wrapped in a pipeline for generating natural language captions from input images. A subset of sampled frames from the UCF-Crime dataset is used for inference, where each image is captioned by the model and saved in a new CSV file. To evaluate caption quality, the ROUGE metric is computed across the full dataset and per category, comparing model-generated captions to ground-truth descriptions. Additionally, a visualization function is provided to display test images with their generated captions, enabling quick qualitative inspection of model performance. This notebook supports both quantitative and visual assessment of image-to-text performance in a crime surveillance context.

In [None]:
import pandas as pd
import torch
from transformers import VisionEncoderDecoderModel, GPT2Tokenizer, ViTImageProcessor, pipeline

# Load the fine-tuned ViT + GPT-2 captioning checkpoint.
model_path = "NourFakih/Vit-GPT2-UCA-UCF-06"
# model_path = "NourFakih/Vit-GPT2-UCA-UCF-01"  # alternative checkpoint
model = VisionEncoderDecoderModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
# Load the image processor from the same checkpoint as the model and tokenizer,
# so switching `model_path` cannot leave it pointing at a different checkpoint.
feature_extractor = ViTImageProcessor.from_pretrained(model_path)

# 1) Check for GPU: use CUDA device index 0 when available, otherwise -1 (CPU).
device = 0 if torch.cuda.is_available() else -1

# 2) Create the image-to-text pipeline on the selected device.
image_captioner_1 = pipeline(
    task="image-to-text",
    model=model,              # the VisionEncoderDecoderModel loaded above
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    device=device,
)

In [None]:
!pip install rouge_score
!pip install evaluate

In [None]:
# Read the CSV file into a DataFrame
csv_path = '/kaggle/input/ucf-crime-extracted-frames/test_image_captions (1).csv'
df = pd.read_csv(csv_path)

# Disabled: prefix `image_path` with the dataset root (only needed if the CSV
# stores relative paths -- TODO confirm against the CSV contents).
# root_path = '/kaggle/input/ucf-crime-extracted-frames/'
# df['image_path'] = root_path + df['image_path']

# Shuffle all rows with a fixed seed so the slice below is a reproducible sample.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep 20 of the shuffled rows (positions 1000-1019). `.copy()` makes the
# subset an independent DataFrame, so adding a column to it afterwards does
# not trigger pandas' SettingWithCopyWarning.
df = df.iloc[1000:1020].copy()
# Function to generate caption for a single image
def generate_caption(image_path, caption_pipeline, *, max_new_tokens=70):
    """Return the caption that ``caption_pipeline`` generates for one image.

    Args:
        image_path: Image input, passed to the pipeline unchanged.
        caption_pipeline: Callable invoked as
            ``caption_pipeline(image_path, max_new_tokens=...)``. It is
            expected to return a list of dicts that carry a
            ``'generated_text'`` key.
        max_new_tokens: Value forwarded to the pipeline under the same name.
            Defaults to 70, the value that was previously hard-coded.

    Returns:
        The ``'generated_text'`` of the first result, or ``""`` when the
        pipeline returns an empty result or raises.
    """
    try:
        # The pipeline returns a list of dictionaries; extract the generated text
        result = caption_pipeline(image_path, max_new_tokens=max_new_tokens)
        caption = result[0]['generated_text'] if result else ""
    except Exception as e:
        # Deliberately best-effort: report the failure and fall back to an
        # empty caption so one bad image does not abort a whole batch.
        print(f"Error processing {image_path}: {e}")
        return ""
    return caption

# Caption every row's image and keep the results in a new 'gen-caption' column
image_paths = df['image_path']
df['gen-caption'] = image_paths.map(
    lambda path: generate_caption(path, image_captioner_1)
)

# Write the table, now including the generated captions, to a new CSV file
output_csv_path = 'gen-caption.csv'
df.to_csv(output_csv_path, index=False)

print("Image caption generation completed and saved to:", output_csv_path)

In [None]:
# Bare expression: as the last statement of a notebook cell, `df` is rendered
# as the cell's output.
df

### Compute ROUGE scores on the test set

In [None]:
import pandas as pd
from tqdm.auto import tqdm
from evaluate import load
from nltk import sent_tokenize

# 1) Read the test CSV
#    columns: image_path | caption | video_key | category | frame_index
test_csv_path = "/kaggle/input/ucf-crime-extracted-frames/test_image_captions (1).csv"
df = pd.read_csv(test_csv_path)

# 2) Load the ROUGE metric through `evaluate.load`
metric = load("rouge")

# 3) Utility to postprocess text for rougeLSum
def postprocess_text(preds, labels):
    """Put each sentence of every prediction and label on its own line.

    Each string is stripped, split with ``sent_tokenize`` and re-joined with
    newlines. Returns the pair ``(preds, labels)`` as two new lists.
    """
    def _one_sentence_per_line(text):
        return "\n".join(sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [_one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels

# 4) Generate captions for each image
preds = []
refs  = []

# Walk only the two columns that are needed rather than building a Series for
# every row with iterrows().
for img_path, gold in tqdm(zip(df["image_path"], df["caption"]), total=len(df)):
    # Use the same 70-token generation budget as the other captioning calls in
    # this notebook, so the scored captions are produced with the same settings.
    out = image_captioner_1(img_path, max_new_tokens=70)[0]["generated_text"]
    preds.append(out)
    refs.append(gold)

# 5) Compute metrics on the *full* test set
p_proc, r_proc = postprocess_text(preds, refs)
full_scores = metric.compute(predictions=p_proc, references=r_proc, use_stemmer=True)
# Scale to percentages and round to 4 decimals
full_scores = {k: round(v*100, 4) for k,v in full_scores.items()}

print("▶ Full Test-Set ROUGE")
print(full_scores)


# 6) Compute per-category
scores_by_cat = {}
# `preds` and `refs` are plain lists filled in row order, so they have to be
# addressed by row position. Resetting the index makes every group's index
# labels equal to those positions, which keeps the lookup correct even if `df`
# was shuffled, filtered or sliced before the captions were generated.
rows_by_position = df.reset_index(drop=True)
for cat, subdf in rows_by_position.groupby("category"):
    positions = subdf.index
    p_sub = [preds[i] for i in positions]
    r_sub = [refs[i] for i in positions]
    p_proc, r_proc = postprocess_text(p_sub, r_sub)
    cat_scores = metric.compute(predictions=p_proc, references=r_proc, use_stemmer=True)
    # Scale to percentages and round to 4 decimals
    scores_by_cat[cat] = {k: round(v*100, 4) for k,v in cat_scores.items()}

print("\n▶ ROUGE by Category")
for cat, scores in scores_by_cat.items():
    print(f"{cat}: {scores}")


In [None]:
import os
import matplotlib.pyplot as plt
from PIL import Image
from transformers import VisionEncoderDecoderModel, GPT2Tokenizer, ViTFeatureExtractor, pipeline

# Directory holding the test images to caption and display
image_dir = "/kaggle/input/test-ucf-uca"

# Disabled alternative: load the model components from a local training
# checkpoint instead of reusing the ones created earlier in the notebook.
# model_path = "/kaggle/working/Vit-GPT2-UCA-UCF-04/checkpoint-7686"
# model = VisionEncoderDecoderModel.from_pretrained(model_path)
# tokenizer = GPT2Tokenizer.from_pretrained(model_path)
# feature_extractor = ViTFeatureExtractor.from_pretrained(model_path)

# Disabled alternative: rebuild the image-to-text pipeline from those components.
# image_captioner_1 = pipeline(
#     task="image-to-text",
#     model=model,
#     tokenizer=tokenizer,
#     feature_extractor=feature_extractor
# )

# Function to caption and display images
def image_caption_and_display_multiple(image_dir, captioner, max_new_tokens=70):
    """Caption every image file in ``image_dir`` and plot it with its caption.

    Args:
        image_dir: Directory to scan (not recursively) for files ending in
            ``.png``, ``.jpg`` or ``.jpeg``, case-insensitively.
        captioner: Callable invoked as
            ``captioner(image, max_new_tokens=...)`` with an RGB PIL image.
            It must return a list whose first item has a
            ``'generated_text'`` key.
        max_new_tokens: Value forwarded to ``captioner``; defaults to 70.

    Returns:
        None. One matplotlib figure is shown per image, in sorted file-name
        order.
    """
    # Match real extensions, dot included, so a name that merely ends in
    # "png" or "jpg" is not mistaken for an image.
    image_exts = (".png", ".jpg", ".jpeg")
    image_files = sorted(
        f for f in os.listdir(image_dir) if f.lower().endswith(image_exts)
    )

    for image_file in image_files:
        image_path = os.path.join(image_dir, image_file)
        # The context manager closes the underlying file; convert() returns a
        # separate in-memory copy that remains usable afterwards.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")

        # Generate caption with the pipeline that was passed in, not a global
        caption = captioner(image, max_new_tokens=max_new_tokens)[0]['generated_text']

        # Display image with caption
        plt.figure(figsize=(6, 6))
        plt.imshow(image)
        plt.axis("off")
        plt.title(caption, fontsize=12, wrap=True)
        plt.show()

# Caption and display every image found in `image_dir`
image_caption_and_display_multiple(
    image_dir=image_dir,
    captioner=image_captioner_1,
)