# Task 4: Image Captioning

- Load dữ liệu ảnh và caption
- Tiền xử lý
- Trích xuất đặc trưng ảnh
- Train ImageCaptioner
- Đánh giá BLEU
- Visualize kết quả


In [ ]:
import pandas as pd
import numpy as np
import os
from src.data_processor import Flickr8kProcessor, TextPreprocessor, ImageProcessor
from src.models.captioning_models import ImageCaptioner
from src.utils.metrics import MetricsCalculator
from sklearn.model_selection import train_test_split


## 1. Load và tiền xử lý dữ liệu

In [ ]:
# Load Flickr8k captions, turn them into fixed-length index sequences, extract
# one CNN feature vector per image, and build an image-level train/val split.

# --- Paths and raw captions --------------------------------------------------
DATA_PATH = '../data'
IMG_DIR = os.path.join(DATA_PATH, 'flickr8k', 'Images')
processor = Flickr8kProcessor(DATA_PATH)
captions_df = processor.load_captions()

# --- Caption text -> padded index sequences ----------------------------------
text_prep = TextPreprocessor(language='en')
tokenized = text_prep.tokenize(captions_df['caption'].tolist())
vocab = text_prep.build_vocabulary(tokenized, vocab_size=5000)
sequences = text_prep.texts_to_sequences(captions_df['caption'].tolist(), vocab)
padded = text_prep.pad_sequences(sequences, maxlen=20)

# --- Image features: run the CNN once per distinct image ---------------------
# Flickr8k pairs every image with several captions, so captions_df is expected
# to repeat image_id values (TODO confirm against load_captions()). Extracting
# per caption row would push the same file through the network repeatedly.
# image_codes[i] is the position of row i's image inside unique_image_ids.
image_codes, unique_image_ids = pd.factorize(captions_df['image_id'])
unique_image_ids = np.asarray(unique_image_ids)
unique_image_paths = [os.path.join(IMG_DIR, img_id) for img_id in unique_image_ids]
img_proc = ImageProcessor(model_name='InceptionV3')
# Assumes extract_features returns one feature vector per input path, in input
# order (the row-aligned split below relies on this) -- TODO confirm.
unique_features = np.asarray(img_proc.extract_features(unique_image_paths))

# Row-aligned views, kept under their original names for later cells:
# one path / one feature vector per caption row.
image_paths = [unique_image_paths[code] for code in image_codes]
features = unique_features[image_codes]

# --- Train/validation split by image, not by caption row ---------------------
# Splitting caption rows directly lets the same image (identical features) land
# in both splits, which leaks validation images into training and inflates the
# validation score. Splitting the unique ids keeps each image in one split only.
train_ids, val_ids = train_test_split(unique_image_ids, test_size=0.2, random_state=42)
val_mask = captions_df['image_id'].isin(val_ids).to_numpy()

# Shuffle row order inside each split (seeded), as a row-level
# train_test_split would have done.
row_rng = np.random.RandomState(42)
train_rows = row_rng.permutation(np.flatnonzero(~val_mask))
val_rows = row_rng.permutation(np.flatnonzero(val_mask))

# NOTE(review): the splits are returned as NumPy arrays; if pad_sequences
# yields plain lists, downstream code now receives arrays instead -- confirm
# that captioner.train accepts them.
padded_rows = np.asarray(padded)
X_img_train, X_img_val = features[train_rows], features[val_rows]
X_cap_train, X_cap_val = padded_rows[train_rows], padded_rows[val_rows]


## 2. Build, train và đánh giá mô hình

In [ ]:
# Build the captioning model sized to the vocabulary and fit it on the
# training split for 5 epochs; `history` is kept for the plotting cell.
captioner = ImageCaptioner(vocab_size=len(vocab))
captioner.build_model()
history = captioner.train(X_img_train, X_cap_train, epochs=5)
metrics = MetricsCalculator()

# Generate one caption per validation feature vector and serialise it as a
# space-separated string of the ids returned by generate_caption.
# Assumes X_img_val is a list/ndarray, so iterating it yields the same items
# as integer indexing.
preds = [
    ' '.join(map(str, captioner.generate_caption(img_feat)))
    for img_feat in X_img_val
]

# References are serialised the same way from the validation caption rows.
# NOTE(review): every id in each row is kept, so if these rows are padded the
# padding ids are scored too, and BLEU is computed over index strings rather
# than words -- confirm this is what calculate_bleu expects.
refs = [' '.join(map(str, cap_ids)) for cap_ids in X_cap_val]

bleu = metrics.calculate_bleu(refs, preds)
print('BLEU score:', bleu)


## 3. Visualize kết quả

In [ ]:
from matplotlib import pyplot as plt

# Draw the loss values stored under history.history['loss'] (produced by the
# training cell) as a single labelled line and display the figure.
train_loss = history.history['loss']
plt.plot(train_loss, label='Train Loss')
plt.title('Training Loss')
plt.legend()
plt.show()
