# Download Flickr8k Dataset
This notebook downloads and organizes the Flickr8k dataset, including images and captions.

In [1]:
# Import required libraries
import os
from itertools import islice
from zipfile import ZipFile

import requests



## Define URLs and Paths
Set up the URLs for downloading the dataset and the paths for saving the files.

In [None]:
# Local layout: every downloaded or extracted file lives under data_dir,
# which is resolved relative to the notebook's working directory.
data_dir = '../data/'
images_dir = os.path.join(data_dir, 'images/')
captions_file = os.path.join(data_dir, 'captions.txt')

# Remote archives (GitHub release assets) holding the images and the text files.
images_url = 'https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip'
captions_url = 'https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip'

# makedirs also creates missing parent directories, so this one call
# guarantees that both data_dir and images_dir exist; it is a no-op on re-run.
os.makedirs(images_dir, exist_ok=True)

## Download and Extract Dataset
Download the images and captions, then extract them into the appropriate folders.

In [3]:
def download_and_extract(url, extract_to, timeout=60, chunk_size=1 << 20):
    """Download a zip archive from ``url`` (unless already cached) and extract it.

    The archive is stored in ``data_dir`` under the last path component of
    ``url``. If a file with that name already exists, the download is skipped
    and only the extraction is performed.

    Args:
        url: HTTP(S) address of the zip archive.
        extract_to: Directory into which the archive contents are extracted.
        timeout: Seconds to wait for the server to connect / send data before
            ``requests`` gives up.
        chunk_size: Number of bytes requested per streamed chunk.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
        zipfile.BadZipFile: If the file at the cache path is not a valid zip.
    """
    zip_path = os.path.join(data_dir, url.split('/')[-1])
    if not os.path.exists(zip_path):
        print(f'Downloading {url}...')
        # Download to a temporary name and rename only on success, so an
        # interrupted or failed download is never mistaken for a complete
        # archive by the os.path.exists() check on the next run.
        partial_path = zip_path + '.part'
        try:
            # stream=True avoids holding the whole archive in memory.
            with requests.get(url, stream=True, timeout=timeout) as response:
                # Fail loudly instead of saving an HTML error page as the zip.
                response.raise_for_status()
                with open(partial_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=chunk_size):
                        f.write(chunk)
            os.replace(partial_path, zip_path)
        finally:
            # After a successful rename the partial file no longer exists;
            # on any failure this removes the incomplete download.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        print('Download complete.')
    else:
        print(f'{zip_path} already exists.')

    print(f'Extracting {zip_path}...')
    with ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print('Extraction complete.')

# Fetch and unpack each archive in turn: the images archive goes into
# images_dir, the captions/text archive goes directly into data_dir.
for archive_url, target_dir in (
    (images_url, images_dir),
    (captions_url, data_dir),
):
    download_and_extract(archive_url, target_dir)

Downloading https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip...
Download complete.
Extracting ../data/Flickr8k_Dataset.zip...
Extraction complete.
Downloading https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip...
Download complete.
Extracting ../data/Flickr8k_text.zip...
Extraction complete.


## Verify Dataset
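Check the number of images, then preview the first few lines of the captions file and of the other text files (lemmatized captions and train/dev/test splits).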

In [5]:
def _read_head(path, n=5):
    """Return up to the first ``n`` lines of ``path``, whitespace-stripped.

    Uses islice so that files shorter than ``n`` lines simply yield fewer
    entries instead of raising StopIteration.
    """
    with open(path, 'r') as f:
        return [line.strip() for line in islice(f, n)]


# Verify images
image_files = os.listdir(images_dir)
print(f'Total images: {len(image_files)}')
print(f'Sample images: {image_files[:5]}')

# Verify captions
if os.path.exists(captions_file):
    print(f'Captions file found: {captions_file}')
    captions_sample = _read_head(captions_file)
    print('Sample captions:')
    for caption in captions_sample:
        print(caption)
else:
    print('Captions file not found.')

# Verify other text files
text_files = ['Flickr8k.lemma.token.txt', 'Flickr_8k.trainImages.txt', 'Flickr_8k.devImages.txt', 'Flickr_8k.testImages.txt']
for text_file in text_files:
    file_path = os.path.join(data_dir, text_file)
    if os.path.exists(file_path):
        print(f'{text_file} found.')
        sample_lines = _read_head(file_path)
        print(f'Sample lines from {text_file}:')
        for line in sample_lines:
            print(line)
    else:
        print(f'{text_file} not found.')

Total images: 8091
Sample images: ['2387197355_237f6f41ee.jpg', '2609847254_0ec40c1cce.jpg', '2046222127_a6f300e202.jpg', '2853743795_e90ebc669d.jpg', '2696951725_e0ae54f6da.jpg']
Captions file not found.
Flickr8k.lemma.token.txt found.
Sample lines from Flickr8k.lemma.token.txt:
1305564994_00513f9a5b.jpg#0	A man in street racer armor be examine the tire of another racer 's motorbike .
1305564994_00513f9a5b.jpg#1	Two racer drive a white bike down a road .
1305564994_00513f9a5b.jpg#2	Two motorist be ride along on their vehicle that be oddly design and color .
1305564994_00513f9a5b.jpg#3	Two person be in a small race car drive by a green hill .
1305564994_00513f9a5b.jpg#4	Two person in race uniform in a street car .
Flickr_8k.trainImages.txt found.
Sample lines from Flickr_8k.trainImages.txt:
2513260012_03d33305cf.jpg
2903617548_d3e38d7f88.jpg
3338291921_fe7ae0c8f8.jpg
488416045_1c6d903fe0.jpg
2644326817_8f45080b87.jpg
Flickr_8k.devImages.txt found.
Sample lines from Flickr_8k.devImages.