In [2]:
# Install necessary libraries (uncomment if you need to install them)
# !pip install tensorflow pandas matplotlib

# Importing necessary libraries
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import os

# Sanity check: report which TensorFlow build the kernel picked up
tf_version = tf.__version__
print("TensorFlow version:", tf_version)


TensorFlow version: 2.15.0


In [None]:
!wget "https://github.com/awsaf49/flickr-dataset/releases/download/v1.0/flickr8k.zip"
!unzip -q flickr8k.zip -d ./flickr8k


--2024-04-12 12:47:14--  https://github.com/awsaf49/flickr-dataset/releases/download/v1.0/flickr8k.zip
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/753516996/d7c62b13-1e50-40ea-8fae-f34a44b1695f?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAVCODYLSA53PQK4ZA%2F20240412%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240412T124715Z&X-Amz-Expires=300&X-Amz-Signature=f0b450a735b7ff734cae42883120d2bed1c747a1feaa949f4c81ac7d8d0a1eb6&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=753516996&response-content-disposition=attachment%3B%20filename%3Dflickr8k.zip&response-content-type=application%2Foctet-stream [following]
--2024-04-12 12:47:15--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/753516996/d7c62b13-1e50-40ea-8fae-f34a44b1695f?X-Amz-Algo

In [None]:
import pandas as pd
import os

# Load the caption table; later cells read its 'image' and 'caption' columns
captions_df = pd.read_csv('./flickr8k/captions.txt')

# Display the first few rows to understand the structure
print(captions_df.head())

# List the image paths.
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# image_paths (and everything sampled from it later) stable from run to run.
image_dir = './flickr8k/Images/'
image_paths = [
    os.path.join(image_dir, filename)
    for filename in sorted(os.listdir(image_dir))
    if filename.endswith('.jpg')
]

# Display the first few image paths to ensure they're loaded correctly
print(image_paths[:5])


In [None]:
import random

# Build one negative (non-matching) image/caption pair per image.
# A dedicated, seeded generator keeps the sampled captions reproducible.
NO_MATCH_SEED = 42
no_match_rng = random.Random(NO_MATCH_SEED)

# Initialize a list to hold the no-match pairs
no_match_pairs = []

# Iterate over each image
for image_path in image_paths:
    # captions_df['image'] holds bare file names, so compare against the basename.
    image_name = os.path.basename(image_path)

    # Exact comparison instead of str.contains(): contains() treats the file name as
    # a regular expression ('.' matches any character) and as a substring match.
    potential_no_matches = captions_df[captions_df['image'] != image_name]
    if potential_no_matches.empty:
        # Every caption belongs to this image; no negative can be formed.
        continue

    # Randomly select a caption that is not associated with the current image
    pick = no_match_rng.randrange(len(potential_no_matches))
    random_caption = potential_no_matches.iloc[pick]['caption']

    # Store the bare file name (not the full path) so the 'image' column has the same
    # format as captions_df once both frames are concatenated.
    no_match_pairs.append({'image': image_name, 'caption': random_caption, 'match': 0})

# Convert the no-match pairs to a DataFrame (explicit columns keep the schema even if empty)
no_match_df = pd.DataFrame(no_match_pairs, columns=['image', 'caption', 'match'])

# Display the first few rows of the no-match pairs DataFrame
print(no_match_df.head())


In [None]:
# Every row of captions_df is a genuine image/caption pair, so label it as a match
captions_df['match'] = 1

# Stack the matching and non-matching pairs, then shuffle the rows with a fixed
# seed and renumber the index so both kinds of pair are interleaved
combined_df = (
    pd.concat([captions_df, no_match_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the shuffled result
print(combined_df.head())


In [None]:
# Confirm that no-match pairs made it into the combined frame by showing a few of them
is_no_match = combined_df['match'] == 0
print(combined_df[is_no_match].head())


In [None]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 15% of all pairs for testing; the remaining 85% is train+validation
holdout_fraction = 0.15
train_val_df, test_df = train_test_split(
    combined_df, test_size=holdout_fraction, random_state=42
)

# Step 2: take 17.65% of that remainder for validation. 0.1765 is approximately 15/85,
# so the overall proportions come out at roughly 70% train / 15% validation / 15% test
validation_fraction = 0.1765
train_df, val_df = train_test_split(
    train_val_df, test_size=validation_fraction, random_state=42
)

# Report how many pairs ended up in each split
print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Testing set size: {len(test_df)}")


In [None]:
!ls ./flickr8k/Images/


In [None]:
# Look at the first few entries of the training split's image column
image_preview = train_df['image'].head()
print(image_preview)


In [None]:
# Point every row of the training split at its file inside image_dir.
# Work on an explicit copy so the assignment does not write into a slice produced by
# train_test_split (which can raise pandas' SettingWithCopyWarning).
train_df = train_df.copy()

# Reduce each entry to its bare file name before joining it with image_dir. This keeps
# the cell safe to re-run and avoids a doubled prefix on rows that already hold a full
# path; for bare file names the result equals image_dir + name.
train_df['image'] = train_df['image'].apply(
    lambda path: os.path.join(image_dir, os.path.basename(path))
)


In [None]:
def _without_image_dir(path):
    """Return path with every occurrence of image_dir removed, if path starts with it."""
    if not path.startswith(image_dir):
        return path
    return path.replace(image_dir, '')


# Normalise the training image paths: strip any image_dir prefix that is already
# there, then add it back exactly once
train_df['image'] = image_dir + train_df['image'].apply(_without_image_dir)

# Show the first few paths to confirm the result
print(train_df['image'].head())


In [None]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np

def preprocess_image(image_path, target_size=(224, 224)):
    """Load an image file, resize it, and return it as a scaled pixel array.

    Args:
        image_path: Path of the image file to load.
        target_size: Size passed to load_img as ``target_size``; the image is
            resized to it while loading. Defaults to (224, 224), the value this
            notebook previously hard-coded.

    Returns:
        The array produced by img_to_array, divided by 255.0. For images with
        8-bit channels (assumed here) the values fall in the [0, 1] range.
    """
    img = load_img(image_path, target_size=target_size)  # Load and resize
    img_array = img_to_array(img)  # Convert the loaded image to an array
    return img_array / 255.0  # Scale pixel values down by 255

# Holding every preprocessed training image in memory at once is too expensive,
# so draw a reproducible 10% sample of the training rows
small_train_df = train_df.sample(frac=0.1, random_state=42)

# Preprocess each sampled image and collect the results into a single array
small_train_images = np.array(list(map(preprocess_image, small_train_df['image'])))

