# Download Flickr8k Dataset
This notebook downloads and organizes the Flickr8k dataset, including images and captions.

In [1]:
# Import required libraries
import os
import shutil
from itertools import islice
from zipfile import ZipFile

import requests

## Define URLs and Paths
Set up the URLs for downloading the dataset and the paths for saving the files.

In [2]:
# URLs for Flickr8k dataset
images_url = 'https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip'
captions_url = 'https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip'

# Paths for saving the dataset
data_dir = '../data/'
images_dir = os.path.join(data_dir, 'images/')
captions_file = os.path.join(data_dir, 'captions.txt')

# Create directories if they don't exist
os.makedirs(images_dir, exist_ok=True)

## Download and Extract Dataset
Download the images and captions, then extract them into the appropriate folders.

In [3]:
def download_and_extract(url, extract_to):
    """Download and extract a zip file from a URL."""
    zip_path = os.path.join(data_dir, url.split('/')[-1])
    if not os.path.exists(zip_path):
        print(f'Downloading {url}...')
        response = requests.get(url)
        with open(zip_path, 'wb') as f:
            f.write(response.content)
        print('Download complete.')
    else:
        print(f'{zip_path} already exists.')

    print(f'Extracting {zip_path}...')
    with ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print('Extraction complete.')

# Download and extract images
download_and_extract(images_url, images_dir)

# Download and extract captions
download_and_extract(captions_url, data_dir)

../data/Flickr8k_Dataset.zip already exists.
Extracting ../data/Flickr8k_Dataset.zip...
Extraction complete.
../data/Flickr8k_text.zip already exists.
Extracting ../data/Flickr8k_text.zip...
Extraction complete.


## Verify Dataset
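Count the entries in the images directory and preview the first lines of the captions file and the other text files. If this runs before the clean-up step below, the images are still inside the nested `Flicker8k_Dataset` folder, so only the top-level entries of the images directory are listed.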

In [5]:
# Verify images
image_files = os.listdir(images_dir)
print(f'Total images: {len(image_files)}')
print(f'Sample images: {image_files[:5]}')

# Verify captions
if os.path.exists(captions_file):
    print(f'Captions file found: {captions_file}')
    with open(captions_file, 'r') as f:
        captions_sample = [next(f).strip() for _ in range(5)]
    print('Sample captions:')
    for caption in captions_sample:
        print(caption)
else:
    print('Captions file not found.')

# Verify other text files
text_files = ['Flickr8k.lemma.token.txt', 'Flickr_8k.trainImages.txt', 'Flickr_8k.devImages.txt', 'Flickr_8k.testImages.txt']
for text_file in text_files:
    file_path = os.path.join(data_dir, text_file)
    if os.path.exists(file_path):
        print(f'{text_file} found.')
        with open(file_path, 'r') as f:
            sample_lines = [next(f).strip() for _ in range(5)]
        print(f'Sample lines from {text_file}:')
        for line in sample_lines:
            print(line)
    else:
        print(f'{text_file} not found.')

Total images: 2
Sample images: ['Flicker8k_Dataset', '__MACOSX']
Captions file not found.
Flickr8k.lemma.token.txt found.
Sample lines from Flickr8k.lemma.token.txt:
1305564994_00513f9a5b.jpg#0	A man in street racer armor be examine the tire of another racer 's motorbike .
1305564994_00513f9a5b.jpg#1	Two racer drive a white bike down a road .
1305564994_00513f9a5b.jpg#2	Two motorist be ride along on their vehicle that be oddly design and color .
1305564994_00513f9a5b.jpg#3	Two person be in a small race car drive by a green hill .
1305564994_00513f9a5b.jpg#4	Two person in race uniform in a street car .
Flickr_8k.trainImages.txt found.
Sample lines from Flickr_8k.trainImages.txt:
2513260012_03d33305cf.jpg
2903617548_d3e38d7f88.jpg
3338291921_fe7ae0c8f8.jpg
488416045_1c6d903fe0.jpg
2644326817_8f45080b87.jpg
Flickr_8k.devImages.txt found.
Sample lines from Flickr_8k.devImages.txt:
2090545563_a4e66ec76b.jpg
3393035454_2d2370ffd4.jpg
3695064885_a6922f06b2.jpg
1679557684_50a206e4a9.jpg
358268

## Clean Up and Reorganize Data Directory
Remove unnecessary files and organize the directory structure.

In [4]:
import shutil

# Remove unnecessary zip files after extraction
for zip_file in ['Flickr8k_Dataset.zip', 'Flickr8k_text.zip']:
    zip_path = os.path.join(data_dir, zip_file)
    if os.path.exists(zip_path):
        os.remove(zip_path)
        print(f'Removed {zip_file}')

# Remove MACOSX directories
macosx_dirs = [
    os.path.join(data_dir, '__MACOSX'),
    os.path.join(data_dir, 'images/__MACOSX'),
]
for dir_path in macosx_dirs:
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
        print(f'Removed {dir_path}')

# Move images from nested Flicker8k_Dataset folder to images directory
nested_images_dir = os.path.join(data_dir, 'images/Flicker8k_Dataset')
if os.path.exists(nested_images_dir):
    for img in os.listdir(nested_images_dir):
        src = os.path.join(nested_images_dir, img)
        dst = os.path.join(images_dir, img)
        if os.path.exists(src):
            shutil.move(src, dst)
    shutil.rmtree(nested_images_dir)
    print('Moved images to the correct directory')

# Create processed directory for later use
processed_dir = os.path.join(data_dir, 'processed')
os.makedirs(processed_dir, exist_ok=True)

print('\nFinal directory structure:')
for root, dirs, files in os.walk(data_dir):
    level = root.replace(data_dir, '').count(os.sep)
    indent = ' ' * 4 * level
    print(f'{indent}{os.path.basename(root)}/')
    subindent = ' ' * 4 * (level + 1)
    for f in sorted(files)[:5]:  # Show only first 5 files per directory
        print(f'{subindent}{f}')
    if len(files) > 5:
        print(f'{subindent}...')

Removed Flickr8k_Dataset.zip
Removed Flickr8k_text.zip
Removed ../data/__MACOSX
Removed ../data/images/__MACOSX
Moved images to the correct directory

Final directory structure:
/
    .gitignore
    CrowdFlowerAnnotations.txt
    ExpertAnnotations.txt
    Flickr8k.lemma.token.txt
    Flickr8k.token.txt
    ...
images/
    1000268201_693b08cb0e.jpg
    1001773457_577c3a7d70.jpg
    1002674143_1b742ab4b8.jpg
    1003163366_44323f5815.jpg
    1007129816_e794419615.jpg
    ...
processed/


## Release Resources
Run Python's garbage collector to free memory held by objects that are no longer referenced.

In [4]:
import gc

# Release resources and call garbage collector
gc.collect()
print("Resources released and garbage collector invoked.")

Resources released and garbage collector invoked.
