In [11]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import clip
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import os

In [12]:
import json

# Function to load JSON data
def load_json(filepath):
    """Read the JSON document stored at ``filepath`` and return the parsed object.

    The file is decoded as UTF-8 explicitly instead of with the platform's
    locale default, so captions containing non-ASCII characters load the same
    way on every machine.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

# Function to save JSON data
def save_json(data, filepath):
    """Write ``data`` to ``filepath`` as JSON indented by four spaces.

    An existing file at ``filepath`` is replaced.
    """
    with open(filepath, 'w') as sink:
        json.dump(data, sink, indent=4)

# Function to match captions and update data
def match_captions(data, caption_data):
    """Copy each record's updated caption from ``caption_data`` into ``data``.

    Records are joined on their ``'image_path'`` value. When several caption
    records share a path, the first one in ``caption_data`` is used. Records
    in ``data`` with no matching path are left exactly as they were.

    Args:
        data: List of dicts, each with an ``'image_path'`` key. Mutated in place.
        caption_data: List of dicts with ``'image_path'`` and ``'updated_caption'``.

    Returns:
        The same ``data`` list, for convenience.
    """
    # One pass to index caption records by path; setdefault keeps the FIRST
    # record for a duplicated path, which is what a front-to-back scan finds.
    first_by_path = {}
    for caption_item in caption_data:
        first_by_path.setdefault(caption_item['image_path'], caption_item)

    for item in data:
        match = first_by_path.get(item['image_path'])
        if match is not None:
            item['caption'] = match['updated_caption']
    return data

# Load datasets
# train/val records are read from the same files that are overwritten at the
# end of this cell; caption_data supplies the 'updated_caption' values.
train_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/train_data.json')
val_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/val_data.json')
caption_data = load_json('/workspaces/finetune/AFINAL/clip/clip_gpt_ootb/clip_new_caption_embedding.json')

# Update captions in the datasets
# match_captions mutates the records in place and returns the same list.
train_data = match_captions(train_data, caption_data)
val_data = match_captions(val_data, caption_data)

# Save updated datasets
# NOTE: this rewrites the input files in place, so the original captions are
# not recoverable from these files after the cell has run.
save_json(train_data, '/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/train_data.json')
save_json(val_data, '/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/val_data.json')


In [13]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import clip

# Load JSON data
def load_json(filepath):
    """Parse the JSON file at ``filepath`` and return its contents.

    Decoding is pinned to UTF-8 so the result does not depend on the
    platform's locale default encoding.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

# Train/validation records plus the caption records they are joined with
# (on 'image_path') further down in this cell.
train_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/train_data.json')
val_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/val_data.json')
caption_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/clip_caption_embedding.json')

# Match image paths with captions
def match_captions(data, caption_data):
    """Copy each record's caption from ``caption_data`` into ``data``.

    Records are joined on ``'image_path'``. If a path occurs more than once in
    ``caption_data`` the first occurrence wins. Records in ``data`` with no
    matching path are not modified.

    Args:
        data: List of dicts, each with an ``'image_path'`` key. Mutated in place.
        caption_data: List of dicts with ``'image_path'`` and ``'caption'``.

    Returns:
        The same ``data`` list.
    """
    # Build the path -> record index once instead of rescanning caption_data
    # for every record; setdefault preserves first-match semantics.
    first_by_path = {}
    for caption_item in caption_data:
        first_by_path.setdefault(caption_item['image_path'], caption_item)

    for item in data:
        match = first_by_path.get(item['image_path'])
        if match is not None:
            item['caption'] = match['caption']
    return data

# Attach captions in place; records whose image_path has no caption record
# are left untouched by match_captions.
train_data = match_captions(train_data, caption_data)
val_data = match_captions(val_data, caption_data)


In [14]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import clip

# Load JSON data
def load_json(filepath):
    """Return the parsed contents of the JSON file at ``filepath``.

    The file is opened as UTF-8 rather than with the locale default encoding,
    so non-ASCII text decodes identically on every platform.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

# Re-read the train/validation records and the caption records that are
# joined onto them by image_path below.
train_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/train_data.json')
val_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/val_data.json')
caption_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/clip_caption_embedding.json')

# Match image paths with captions
def match_captions(data, caption_data):
    """Fill in ``item['caption']`` for every record in ``data`` that has a caption record.

    The join key is ``'image_path'``; the first caption record for a path is
    the one used. Records without a match are left unchanged.

    Args:
        data: List of dicts, each with an ``'image_path'`` key. Mutated in place.
        caption_data: List of dicts with ``'image_path'`` and ``'caption'``.

    Returns:
        The same ``data`` list.
    """
    # Single pass to index caption records (first occurrence wins), replacing
    # the per-record linear scan over caption_data.
    first_by_path = {}
    for caption_item in caption_data:
        first_by_path.setdefault(caption_item['image_path'], caption_item)

    for item in data:
        match = first_by_path.get(item['image_path'])
        if match is not None:
            item['caption'] = match['caption']
    return data

# Join captions onto both splits (in place). A record with no caption match
# keeps whatever 'caption' value it already had, if any.
train_data = match_captions(train_data, caption_data)
val_data = match_captions(val_data, caption_data)

class CustomClipDataset(Dataset):
    """Dataset yielding ``(preprocessed image, caption, class)`` for each record.

    Each record is a dict read with the keys ``'image_path'``, ``'caption'``
    and ``'class'``; the class value is returned as stored.
    """

    def __init__(self, data, preprocess):
        # data: sequence of record dicts; preprocess: callable applied to the RGB PIL image.
        self.preprocess = preprocess
        self.data = data

    def __len__(self):
        """Number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load and preprocess the image for record ``idx`` and return it with its caption and class."""
        record = self.data[idx]
        rgb_image = Image.open(record['image_path']).convert('RGB')
        tensor = self.preprocess(rgb_image)
        return tensor, record['caption'], record['class']

# Initialize the dataset and dataloaders
# Only the second value returned by clip.load (the preprocessing callable) is
# kept here; the model returned alongside it is discarded.
preprocess = clip.load("ViT-B/32")[1]

train_dataset = CustomClipDataset(train_data, preprocess)
val_dataset = CustomClipDataset(val_data, preprocess)

# Training batches are reshuffled; validation keeps the record order.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)


In [15]:
import torch
import torch.nn as nn
import clip

class CustomClipModel(nn.Module):
    """Frozen CLIP encoders followed by a trainable linear head.

    The head maps the concatenation of the image and text features to a
    512-dimensional embedding. Every parameter of the wrapped CLIP model has
    ``requires_grad`` set to False, so only ``self.fc`` can be trained.

    Args:
        clip_model: A CLIP model exposing ``visual.output_dim``,
            ``text_projection``, ``encode_image`` and ``encode_text``.
    """

    def __init__(self, clip_model):
        super().__init__()
        self.clip_model = clip_model

        # Freeze all parameters in the CLIP model
        for param in self.clip_model.parameters():
            param.requires_grad = False

        # Define the new head for combining image and text embeddings
        self.image_embedding_dim = self.clip_model.visual.output_dim
        self.text_embedding_dim = self.clip_model.text_projection.shape[1]
        self.combined_dim = self.image_embedding_dim + self.text_embedding_dim

        self.fc = nn.Linear(self.combined_dim, 512)

    def forward(self, image, text):
        """Return the head's embedding for a batch of images and tokenized texts.

        Args:
            image: Batch passed to ``clip_model.encode_image``.
            text: Batch of token ids passed to ``clip_model.encode_text``.

        Returns:
            Tensor with one 512-dimensional row per batch element.
        """
        image_features = self.clip_model.encode_image(image)
        text_features = self.clip_model.encode_text(text)
        combined_features = torch.cat((image_features, text_features), dim=1)
        # The encoders can emit half-precision features (CLIP keeps fp16
        # weights when loaded on a GPU) while the head is created in the
        # default dtype; align them so the matmul does not fail on a dtype
        # mismatch. This is a no-op when the dtypes already agree.
        combined_features = combined_features.to(self.fc.weight.dtype)
        return self.fc(combined_features)

# Load the CLIP model
# Prefer the GPU when one is visible to torch, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE: this rebinds the global `preprocess` to the callable returned here.
clip_model, preprocess = clip.load("ViT-B/32", device=device)
model = CustomClipModel(clip_model).to(device)


In [16]:
import json
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import clip
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
import os
import matplotlib.pyplot as plt

# Load JSON data
def load_json(filepath):
    """Load and return the JSON content of ``filepath``.

    UTF-8 is requested explicitly so decoding does not fall back to the
    platform's locale default encoding.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

# Train/validation records and the caption records joined onto them below.
train_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/train_data.json')
val_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/val_data.json')
caption_data = load_json('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/clip_caption_embedding.json')

# Match image paths with captions
def match_captions(data, caption_data):
    """Set ``item['caption']`` on each record of ``data`` from the matching caption record.

    Matching is by equal ``'image_path'``; for a duplicated path the first
    caption record is used. Unmatched records are left as they are.

    Args:
        data: List of dicts, each with an ``'image_path'`` key. Mutated in place.
        caption_data: List of dicts with ``'image_path'`` and ``'caption'``.

    Returns:
        The same ``data`` list.
    """
    # Index once (first occurrence wins) instead of scanning caption_data for
    # every record in data.
    first_by_path = {}
    for caption_item in caption_data:
        first_by_path.setdefault(caption_item['image_path'], caption_item)

    for item in data:
        match = first_by_path.get(item['image_path'])
        if match is not None:
            item['caption'] = match['caption']
    return data

# In-place caption join for both splits; match_captions returns the list it
# was given.
train_data = match_captions(train_data, caption_data)
val_data = match_captions(val_data, caption_data)

# Define a dataset class for handling images and captions
class CustomClipDataset(Dataset):
    """Dataset of ``(preprocessed image, caption, label index)`` triples.

    On construction it derives three lookup tables from the records:
    ``label_to_index`` (class name -> position in the sorted class names),
    ``index_to_label`` (its inverse) and ``class_to_images`` (class name ->
    list of image paths, in record order).
    """

    def __init__(self, data, preprocess):
        # data: sequence of dicts with 'image_path', 'caption' and 'class';
        # preprocess: callable applied to each RGB PIL image.
        self.data = data
        self.preprocess = preprocess
        self.label_to_index = self.create_label_to_index()
        self.index_to_label = {index: name for name, index in self.label_to_index.items()}
        self.class_to_images = self.create_class_to_images()

    def create_label_to_index(self):
        """Map every distinct class name to its rank in sorted order."""
        distinct_classes = {record['class'] for record in self.data}
        mapping = {}
        for position, name in enumerate(sorted(distinct_classes)):
            mapping[name] = position
        return mapping

    def create_class_to_images(self):
        """Group image paths by class, keeping first-seen class order."""
        grouped = {}
        for record in self.data:
            grouped.setdefault(record['class'], []).append(record['image_path'])
        return grouped

    def __len__(self):
        """Number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the preprocessed image, caption and integer label of record ``idx``."""
        record = self.data[idx]
        rgb_image = Image.open(record['image_path']).convert('RGB')
        tensor = self.preprocess(rgb_image)
        return tensor, record['caption'], self.label_to_index[record['class']]

# Initialize the dataset and dataloaders
# Only the preprocessing callable (second return value of clip.load) is kept;
# the model returned with it is discarded.
preprocess = clip.load("ViT-B/32")[1]

# Each dataset builds its own label_to_index from its own records.
train_dataset = CustomClipDataset(train_data, preprocess)
val_dataset = CustomClipDataset(val_data, preprocess)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Define the CustomClipModel
class CustomClipModel(nn.Module):
    """CLIP backbone with frozen weights plus a trainable 512-d linear head.

    ``forward`` concatenates the CLIP image and text features along the
    feature axis and projects them with ``self.fc``. All CLIP parameters have
    ``requires_grad`` disabled in ``__init__``.

    Args:
        clip_model: A CLIP model exposing ``visual.output_dim``,
            ``text_projection``, ``encode_image`` and ``encode_text``.
    """

    def __init__(self, clip_model):
        super().__init__()
        self.clip_model = clip_model

        # Freeze all parameters in the CLIP model
        for param in self.clip_model.parameters():
            param.requires_grad = False

        # Define the new head for combining image and text embeddings
        self.image_embedding_dim = self.clip_model.visual.output_dim
        self.text_embedding_dim = self.clip_model.text_projection.shape[1]
        self.combined_dim = self.image_embedding_dim + self.text_embedding_dim

        self.fc = nn.Linear(self.combined_dim, 512)

    def forward(self, image, text):
        """Embed a batch of images and tokenized texts.

        Args:
            image: Batch passed to ``clip_model.encode_image``.
            text: Batch of token ids passed to ``clip_model.encode_text``.

        Returns:
            Tensor with one 512-dimensional row per batch element.
        """
        image_features = self.clip_model.encode_image(image)
        text_features = self.clip_model.encode_text(text)
        combined_features = torch.cat((image_features, text_features), dim=1)
        # CLIP may produce half-precision features (its GPU weights are fp16)
        # whereas the head uses the default dtype; cast so the linear layer
        # does not raise on mismatched dtypes. No-op when they already match.
        combined_features = combined_features.to(self.fc.weight.dtype)
        return self.fc(combined_features)

# Load the CLIP model
# Use the GPU when torch can see one, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE: rebinds the global `preprocess`; the datasets above keep the callable
# they were constructed with.
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Initialize the custom model
model = CustomClipModel(clip_model).to(device)

In [17]:


# Function to generate triplets
def generate_triplets(embeddings, targets, model, dataset):
    """Sample one positive and one negative embedding for every anchor in a batch.

    For each anchor a random image of the same class (positive) and a random
    image of a randomly chosen different class (negative) are drawn from
    ``dataset``. All positives are then embedded with a single forward pass of
    ``model``, and likewise all negatives, instead of one forward pass per
    sample.

    Note: the positive is drawn from every image of the anchor's class, so it
    can be the anchor's own image when the anchor comes from ``dataset``.

    Args:
        embeddings: Tensor of anchor embeddings, one row per batch element.
        targets: Tensor of integer labels (keys of ``dataset.index_to_label``).
        model: Callable ``model(images, tokens)`` returning one embedding per row.
        dataset: Object providing ``data``, ``class_to_images``,
            ``index_to_label``, ``label_to_index`` and ``__getitem__`` returning
            ``(image_tensor, caption, label)``.

    Returns:
        Tuple ``(anchors, positives, negatives)`` of tensors with one row per
        batch element; ``anchors`` holds the rows of ``embeddings``.

    Raises:
        ValueError: If ``dataset`` has no class other than the anchor's, so no
            negative can be drawn.
    """
    device = embeddings.device
    batch_size = embeddings.size(0)

    # First index of every image path, built once per call. This replaces two
    # linear scans over dataset.data per anchor; setdefault keeps the first
    # occurrence, which is what the scans returned.
    first_index_by_path = {}
    for position, record in enumerate(dataset.data):
        first_index_by_path.setdefault(record['image_path'], position)

    all_classes = list(dataset.class_to_images.keys())

    anchors = []
    positive_images, positive_captions = [], []
    negative_images, negative_captions = [], []

    for i in range(batch_size):
        anchor_label = targets[i].item()
        anchor_class = dataset.index_to_label[anchor_label]

        # Positive example: any image of the anchor's class.
        positive_image_path = random.choice(dataset.class_to_images[anchor_class])
        positive_image, positive_caption, _ = dataset[first_index_by_path[positive_image_path]]

        # Negative example: any image of a different class.
        other_classes = [label for label in all_classes if label != anchor_class]
        if not other_classes:
            raise ValueError(
                f"Cannot sample a negative for class {anchor_class!r}: "
                "the dataset contains no other class."
            )
        negative_label = random.choice(other_classes)
        negative_image_path = random.choice(dataset.class_to_images[negative_label])
        negative_image, negative_caption, _ = dataset[first_index_by_path[negative_image_path]]

        assert anchor_label != dataset.label_to_index[negative_label], "Anchor and negative have the same label!"

        anchors.append(embeddings[i])
        positive_images.append(positive_image)
        positive_captions.append(positive_caption)
        negative_images.append(negative_image)
        negative_captions.append(negative_caption)

    # One batched forward pass per role.
    positives = model(
        torch.stack(positive_images).to(device),
        clip.tokenize(positive_captions).to(device),
    )
    negatives = model(
        torch.stack(negative_images).to(device),
        clip.tokenize(negative_captions).to(device),
    )

    return torch.stack(anchors), positives, negatives

# Define the triplet loss function
def triplet_loss(anchor, positive, negative, margin=1.0):
    """Mean hinge triplet loss over a batch.

    Computes ``relu(d(anchor, positive) - d(anchor, negative) + margin)`` per
    row, where ``d`` is ``F.pairwise_distance``, and averages over the batch.

    Args:
        anchor, positive, negative: Embedding tensors with matching shapes.
        margin: Required gap between the negative and positive distances.

    Returns:
        Scalar tensor holding the mean loss.
    """
    dist_to_positive = F.pairwise_distance(anchor, positive)
    dist_to_negative = F.pairwise_distance(anchor, negative)
    per_sample = F.relu(dist_to_positive - dist_to_negative + margin)
    return torch.mean(per_sample)
from tqdm import tqdm


In [None]:
# Training function with tqdm progress bars
def _triplet_batch_loss(model, batch, dataset, run_device):
    """Embed one ``(images, captions, labels)`` batch and return its triplet loss tensor."""
    images, captions, labels = batch
    images = images.to(run_device)
    texts = clip.tokenize(captions).to(run_device)
    # The loader already yields a tensor of labels; as_tensor converts without
    # the copy-construct warning that torch.tensor(tensor) emits.
    labels = torch.as_tensor(labels, dtype=torch.long).to(run_device)
    embeddings = model(images, texts)
    anchors, positives, negatives = generate_triplets(embeddings, labels, model, dataset)
    return triplet_loss(anchors, positives, negatives)


def train_model(model, train_loader, val_loader, optimizer, num_epochs=10, save_dir='model_weights'):
    """Train ``model`` with a triplet loss and validate after every epoch.

    Positives and negatives are sampled from the dataset behind the loader
    being iterated (``train_loader.dataset`` / ``val_loader.dataset``), and
    tensors are moved to the device the model's parameters live on, so the
    function no longer depends on notebook-level globals. Validation triplets
    are sampled randomly on each pass, so the validation loss is itself noisy.

    After each epoch ``model.state_dict()`` is written to
    ``<save_dir>/epoch_<n>.pth``.

    Args:
        model: Module called as ``model(images, tokens)``.
        train_loader: Loader yielding ``(images, captions, labels)`` batches.
        val_loader: Loader of the same form used for validation.
        optimizer: Optimizer stepping the trainable parameters.
        num_epochs: Number of epochs to run.
        save_dir: Directory for the per-epoch checkpoints (created if missing).

    Returns:
        Tuple ``(train_losses, val_losses)`` with one mean loss per epoch.
    """
    os.makedirs(save_dir, exist_ok=True)
    run_device = next(model.parameters()).device
    train_losses = []
    val_losses = []

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        train_loader_tqdm = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")

        for batch in train_loader_tqdm:
            optimizer.zero_grad()
            loss = _triplet_batch_loss(model, batch, train_loader.dataset, run_device)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            train_loader_tqdm.set_postfix(loss=loss.item())

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        avg_train_loss = train_loss / max(len(train_loader), 1)
        train_losses.append(avg_train_loss)

        val_loss = 0
        model.eval()
        val_loader_tqdm = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")

        with torch.no_grad():
            for batch in val_loader_tqdm:
                loss = _triplet_batch_loss(model, batch, val_loader.dataset, run_device)
                val_loss += loss.item()

                val_loader_tqdm.set_postfix(loss=loss.item())

        avg_val_loss = val_loss / max(len(val_loader), 1)
        val_losses.append(avg_val_loss)

        print(f"Epoch [{epoch+1}/{num_epochs}] completed. Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        # Save model weights
        torch.save(model.state_dict(), os.path.join(save_dir, f'epoch_{epoch+1}.pth'))

    return train_losses, val_losses

# Train the model
# The optimizer is given only the parameters of model.fc, so the CLIP
# backbone is never updated.
optimizer = optim.Adam(model.fc.parameters(), lr=1e-4)  # Only train the new head
train_losses, val_losses = train_model(model, train_loader, val_loader, optimizer, num_epochs=10)

# Plotting the losses
# One point per epoch for each curve (train_model returns per-epoch means).
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Triplet Loss')
plt.title('Training and Validation Loss Over Epochs')
plt.legend()
plt.show()


In [18]:
import json

# Load JSON data
def load_json(filepath):
    """Read ``filepath`` as JSON and return the decoded object.

    The file is opened with an explicit UTF-8 encoding so the result is
    independent of the platform's locale default.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

# Save JSON data
def save_json(data, filepath):
    """Dump ``data`` as 4-space-indented JSON to ``filepath``, overwriting it."""
    with open(filepath, 'w') as target:
        json.dump(data, target, indent=4)

# File paths
# The first file provides 'updated_caption' values; the second file receives
# them and is rewritten in place at the end of this cell.
file_with_updated_captions = '/workspaces/finetune/AFINAL/clip/clip_gpt_ootb/clip_new_caption_embedding.json'
file_without_updated_captions = '/workspaces/finetune/AFINAL/clip/clip_blip_finetune/clip_caption_embedding.json'

# Load datasets
data_with_updated_captions = load_json(file_with_updated_captions)
data_without_updated_captions = load_json(file_without_updated_captions)

# Create a dictionary for quick lookup of updated captions
# If an image_path occurs more than once, the last record's caption is kept.
updated_captions_dict = {item['image_path']: item['updated_caption'] for item in data_with_updated_captions}

# Update the records in the second dataset
# Records whose image_path is absent from the lookup get no 'updated_caption'.
for item in data_without_updated_captions:
    image_path = item['image_path']
    if image_path in updated_captions_dict:
        item['updated_caption'] = updated_captions_dict[image_path]

# Save the updated dataset
save_json(data_without_updated_captions, file_without_updated_captions)

print("Updated captions have been added to the second JSON file.")


Updated captions have been added to the second JSON file.


In [19]:
# Load the CLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model, preprocess = clip.load("ViT-B/32", device=device)

# Initialize the custom model
model = CustomClipModel(clip_model).to(device)

# Load the pre-trained weights into the custom model
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source; consider weights_only=True if the installed torch supports it.
# NOTE(review): this is an absolute path, whereas train_model writes to the
# relative 'model_weights' directory by default — confirm they are the same place.
checkpoint_path = '/workspaces/finetune/AFINAL/clip/clip_blip_finetune/model_weights/epoch_10.pth'
model.load_state_dict(torch.load(checkpoint_path, map_location=device))

# Define a dataset class specifically for handling text (captions)
class CaptionDataset(Dataset):
    """Dataset over the records of a JSON file, yielding ``(updated_caption, index)`` pairs."""

    def __init__(self, json_file):
        # The whole file is parsed once and kept in memory as self.data.
        with open(json_file, 'r') as handle:
            self.data = json.load(handle)

    def __len__(self):
        """Number of records in the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the record's ``'updated_caption'`` together with ``idx``."""
        record = self.data[idx]
        return record['updated_caption'], idx

# Function to compute text embeddings and update the JSON file
def compute_text_embeddings_and_update_json(json_file, model, device):
    """Embed every record's updated caption with CLIP and store it in ``json_file``.

    Each caption is tokenized, encoded with ``model.clip_model.encode_text``,
    L2-normalized and written back to its record under ``'text_embedding'``.
    The file is rewritten in place once all captions have been processed.

    Args:
        json_file: Path of the JSON file read by ``CaptionDataset`` and updated.
        model: Object exposing ``clip_model.encode_text``.
        device: Device the token tensors are moved to.
    """
    dataset = CaptionDataset(json_file)
    loader = DataLoader(dataset, batch_size=1, shuffle=False)
    embeddings = []

    with torch.no_grad():
        for captions, idxs in loader:
            tokens = clip.tokenize(captions).to(device)

            # Get text embeddings
            text_embeddings = model.clip_model.encode_text(tokens)

            # Normalize embeddings
            normalized_text_embeddings = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)

            # Convert once and reuse; the progress line reports only the index
            # so the cell output is not flooded with full embedding vectors.
            index = idxs.item()
            vector = normalized_text_embeddings.squeeze(0).cpu().numpy().tolist()
            embeddings.append((index, vector))

            print(f"Processed caption index: {index}")

    # Update the JSON file with text embeddings
    with open(json_file, 'r+') as f:
        data = json.load(f)
        for idx, emb in embeddings:
            data[idx]['text_embedding'] = emb
        # Rewind and truncate so the shorter/longer rewrite fully replaces the old content.
        f.seek(0)
        json.dump(data, f, indent=4)
        f.truncate()

    print(f"Updated {json_file} with text embeddings.")

# Specify the path to the JSON file
json_file = '/workspaces/finetune/AFINAL/clip/clip_blip_finetune/clip_caption_embedding.json'

# Compute text embeddings and update the JSON file
# The call below is disabled, so running this cell does not modify json_file.
# compute_text_embeddings_and_update_json(json_file, model, device)

In [20]:
class ImageDataset(Dataset):
    """Dataset over the records of a JSON file, yielding ``(preprocessed image, index)`` pairs.

    ``__getitem__`` relies on the notebook-level ``preprocess`` callable and
    ``device`` being defined.
    """

    def __init__(self, json_file):
        # Parse the record list once and keep it in memory.
        with open(json_file, 'r') as handle:
            self.data = json.load(handle)

    def __len__(self):
        """Number of records in the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Open the record's image as RGB, preprocess it, move it to ``device`` and return it with ``idx``."""
        path = self.data[idx]['image_path']
        rgb_image = Image.open(path).convert('RGB')
        tensor = preprocess(rgb_image).to(device)
        return tensor, idx

In [24]:
import numpy as np

# Load and index text embeddings
def load_text_embeddings(json_file):
    """Build a mapping from image path to text embedding from ``json_file``.

    Each record contributes its ``'text_embedding'`` for every path it names.
    Paths are taken from ``'image_paths'`` when that key is present; otherwise
    from the single ``'image_path'`` key that the other cells of this notebook
    read from caption files. A bare string is treated as one path rather than
    being iterated character by character.

    Args:
        json_file: Path of a JSON file holding a list of record dicts.

    Returns:
        Dict mapping each image path to the embedding of its record. When a
        path appears in several records, the last one wins.

    Raises:
        KeyError: If a record has neither path key or no ``'text_embedding'``.
    """
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    path_to_embedding = {}
    for item in data:
        paths = item['image_paths'] if 'image_paths' in item else item['image_path']
        if isinstance(paths, str):
            paths = [paths]
        for path in paths:
            path_to_embedding[path] = item['text_embedding']
    return path_to_embedding

# Compute embeddings and update the JSON file
def compute_embeddings_and_update_json(dataset_json, text_embeddings_json, output_json):
    """Compute the combined image+text embedding of every record and save the result.

    For each record of ``dataset_json`` whose ``'image_path'`` has a stored
    text embedding, the image is encoded with ``model.clip_model.encode_image``,
    concatenated with that text embedding and projected through ``model.fc``.
    The projection is stored under ``'combined_embedding'``. Records without a
    text embedding are left unchanged and counted in a final notice.

    Relies on the notebook-level ``model`` and ``device``.

    NOTE(review): the stored text embeddings appear to be L2-normalized when
    produced by ``compute_text_embeddings_and_update_json``, whereas
    ``CustomClipModel.forward`` feeds un-normalized text features to the head —
    confirm the head sees the same kind of input here as during training.

    Args:
        dataset_json: JSON file with the records to embed.
        text_embeddings_json: JSON file read by ``load_text_embeddings``.
        output_json: File the updated records are written to. It may be the
            same path as ``dataset_json``, in which case that file is
            overwritten.
    """
    text_embeddings = load_text_embeddings(text_embeddings_json)
    dataset = ImageDataset(dataset_json)
    loader = DataLoader(dataset, batch_size=1, shuffle=False)

    with open(dataset_json, 'r') as f:
        data = json.load(f)

    head_dtype = model.fc.weight.dtype
    skipped = 0

    with torch.no_grad():
        for images, idxs in loader:
            idx = idxs.item()
            image_path = data[idx]['image_path']

            # No text embedding for this image: nothing to combine with.
            if image_path not in text_embeddings:
                skipped += 1
                continue

            images = images.to(device)
            # Ensure the image tensor carries a batch dimension
            if images.dim() == 3:
                images = images.unsqueeze(0)

            text_embedding = torch.tensor(text_embeddings[image_path], dtype=torch.float32).to(device)

            # Get image embeddings
            image_features = model.clip_model.encode_image(images)

            # Concatenate the two 1D vectors in the head's dtype (the image
            # features may be half precision), then add the batch dimension.
            combined_embedding = torch.cat(
                (image_features.squeeze(0).to(head_dtype), text_embedding.to(head_dtype)),
                dim=0,
            ).unsqueeze(0)

            # Pass the combined embedding through the last trained layer
            final_embedding = model.fc(combined_embedding).squeeze(0).cpu().tolist()
            data[idx]['combined_embedding'] = final_embedding

    # Honour the output_json argument (previously the input file was always rewritten).
    with open(output_json, 'w') as f:
        json.dump(data, f, indent=4)

    if skipped:
        print(f"{dataset_json}: {skipped} record(s) had no text embedding and were left without 'combined_embedding'.")
        
# Paths to the JSON files
dataset_files = [
    '/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/test_data.json',
    '/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/train_data.json',
    '/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/val_data.json'
]
text_embeddings_json = '/workspaces/finetune/AFINAL/clip/clip_blip_finetune/clip_caption_embedding.json'

# Process each dataset file
# The same path is passed as both input and output, so each split file is
# updated in place.
for file in dataset_files:
    compute_embeddings_and_update_json(file, text_embeddings_json, file)

In [25]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Function to load data from JSON file
def load_data(filepath):
    """Return the parsed contents of the JSON file at ``filepath``.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale default encoding.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for that file.
    """
    with open(filepath, 'r', encoding='utf-8') as file:
        return json.load(file)

# Function to find embeddings for images based on image_path matching
def find_embeddings(data_set, embeddings_data):
    """Attach stored combined embeddings to the records of ``data_set``.

    Records are matched on ``'image_path'``. For a path that occurs several
    times in ``embeddings_data`` the first occurrence is used. Matched records
    get ``'combined_embedding'`` set in place and are collected in order;
    unmatched records are skipped.

    Args:
        data_set: List of dicts with an ``'image_path'`` key. Mutated in place.
        embeddings_data: List of dicts with ``'image_path'`` and
            ``'combined_embedding'``.

    Returns:
        List of the records of ``data_set`` that received an embedding.

    Raises:
        KeyError: If the matching embeddings record has no
            ``'combined_embedding'``.
    """
    # Index once by path (first occurrence wins) instead of scanning
    # embeddings_data for every record.
    first_by_path = {}
    for entry in embeddings_data:
        first_by_path.setdefault(entry['image_path'], entry)

    embeddings_list = []
    for item in data_set:
        match = first_by_path.get(item['image_path'])
        if match is not None:
            item['combined_embedding'] = match['combined_embedding']
            embeddings_list.append(item)
    return embeddings_list

# Function to compute top K accuracy
def compute_top_k_accuracy(query_images, gallery_data, k):
    """Fraction of queries whose class appears among their ``k`` most similar gallery items.

    Similarity is the cosine similarity between ``'combined_embedding'``
    vectors. All query-to-gallery similarities are computed in one call
    instead of once per query.

    Args:
        query_images: List of dicts with ``'combined_embedding'`` and ``'class'``.
        gallery_data: List of dicts with the same two keys.
        k: Number of nearest gallery items to inspect; must be at least 1.

    Returns:
        Hits divided by the number of queries; ``0`` when there are no queries
        or the gallery is empty.

    Raises:
        ValueError: If ``k`` is smaller than 1. A slice of ``[-0:]`` would
            otherwise select the whole gallery.
    """
    total_queries = len(query_images)
    if total_queries == 0 or len(gallery_data) == 0:
        return 0
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Gallery arrays are built once and reused for every query.
    gallery_embeddings = np.array([item['combined_embedding'] for item in gallery_data])
    gallery_classes = np.array([item['class'] for item in gallery_data])
    query_embeddings = np.array([query['combined_embedding'] for query in query_images])

    # Row i holds the similarities of query i to every gallery item.
    similarities = cosine_similarity(query_embeddings, gallery_embeddings)
    # argsort is ascending, so the last k columns are the k most similar items.
    top_k_indices = np.argsort(similarities, axis=1)[:, -k:]

    top_k_hits = 0
    for query, neighbour_indices in zip(query_images, top_k_indices):
        # Check if the query class is in the top k classes
        if query['class'] in gallery_classes[neighbour_indices]:
            top_k_hits += 1

    return top_k_hits / total_queries

# Load the datasets
# query_set / gallery_set define the evaluation split; the three
# embeddings_data_* files hold the stored 'combined_embedding' values.
query_set = load_data('/workspaces/finetune/AFINAL/clip/G&Q BASE/query_set.json')
gallery_set = load_data('/workspaces/finetune/AFINAL/clip/G&Q BASE/gallery_set.json')
embeddings_data_test = load_data('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/test_data.json')
embeddings_data_train = load_data('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/train_data.json')
embeddings_data_val = load_data('/workspaces/finetune/AFINAL/clip/clip_blip_finetune/output/val_data.json')


# Find embeddings for each image in the query and gallery sets
# Each call returns only the records found in that one embeddings file.
# NOTE(review): a record whose image_path occurs in more than one of the three
# files would be returned by each call and so appear more than once in the
# combined lists below — confirm the splits are disjoint.
query_images_with_embeddings_train = find_embeddings(query_set, embeddings_data_train)
query_images_with_embeddings_test = find_embeddings(query_set, embeddings_data_test)
query_images_with_embeddings_val = find_embeddings(query_set, embeddings_data_val)

gallery_images_with_embeddings_train = find_embeddings(gallery_set, embeddings_data_train)
gallery_images_with_embeddings_test = find_embeddings(gallery_set, embeddings_data_test)
gallery_images_with_embeddings_val = find_embeddings(gallery_set, embeddings_data_val)
# Combine the embeddings for query and gallery sets
query_images_with_embeddings = query_images_with_embeddings_train + query_images_with_embeddings_test + query_images_with_embeddings_val
gallery_images_with_embeddings = gallery_images_with_embeddings_train + gallery_images_with_embeddings_test + gallery_images_with_embeddings_val


# Compute the Top-K accuracy
# Reports accuracy for k = 1 .. 10.
for k in range(1, 11):
    top_k_accuracy = compute_top_k_accuracy(query_images_with_embeddings, gallery_images_with_embeddings, k)
    print(f"Average Top-{k} Accuracy: {top_k_accuracy:.4f}")


Average Top-1 Accuracy: 0.5280
Average Top-2 Accuracy: 0.6480
Average Top-3 Accuracy: 0.6720
Average Top-4 Accuracy: 0.7120
Average Top-5 Accuracy: 0.7520
Average Top-6 Accuracy: 0.7840
Average Top-7 Accuracy: 0.7840
Average Top-8 Accuracy: 0.7920
Average Top-9 Accuracy: 0.8000
Average Top-10 Accuracy: 0.8080
