In [6]:
import torch
import clip
from PIL import Image

In [5]:
import torch
import clip
from PIL import Image
import json
from torch.utils.data import DataLoader, Dataset

# Define a dataset class specifically for handling text (captions)
class CaptionDataset(Dataset):
    """Map-style dataset that serves captions read from a JSON file.

    The file is parsed once, at construction time. It must decode to a
    sequence of records that each carry an ``'updated_caption'`` entry.
    """

    def __init__(self, json_file):
        """Parse ``json_file`` and keep the decoded records in ``self.data``."""
        with open(json_file, 'r') as handle:
            records = json.load(handle)
        self.data = records

    def __len__(self):
        """Return the number of records that were loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(caption, idx)`` for the record at position ``idx``."""
        record = self.data[idx]
        return record['updated_caption'], idx

# Function to compute text embeddings and update the JSON file
def compute_text_embeddings_and_update_json(json_file, model, device, batch_size=1, truncate=False):
    """Encode every caption in ``json_file`` and store the result in the same file.

    Each record's ``'updated_caption'`` is tokenized with ``clip.tokenize``,
    passed through ``model.encode_text``, L2-normalised along the last
    dimension and saved as a list of floats under the record's
    ``'text_embedding'`` key. The JSON file is then rewritten with an indent
    of 4.

    Args:
        json_file: Path to a JSON file readable by ``CaptionDataset``.
        model: Object exposing ``encode_text`` (here: a CLIP model).
        device: Device the token tensor is moved to before encoding.
        batch_size: Number of captions encoded per forward pass. Defaults to
            1, which reproduces the one-caption-at-a-time behaviour.
        truncate: Forwarded to ``clip.tokenize``. With the default ``False``
            the tokenizer rejects captions longer than its context length;
            pass ``True`` to have it cut them instead.

    Returns:
        None. The file at ``json_file`` is overwritten.
    """
    dataset = CaptionDataset(json_file)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

    # The dataset already holds the parsed file, so the records are updated
    # directly instead of reading and parsing the JSON a second time.
    records = dataset.data

    with torch.no_grad():
        for captions, idxs in loader:
            tokens = clip.tokenize(captions, truncate=truncate).to(device)

            # Get text embeddings
            text_embeddings = model.encode_text(tokens)

            # Normalize embeddings (one row per caption in the batch)
            normalized = text_embeddings / text_embeddings.norm(dim=-1, keepdim=True)

            for record_idx, embedding in zip(idxs.tolist(), normalized.cpu().tolist()):
                records[record_idx]['text_embedding'] = embedding

    # Serialise completely before touching the file so that a serialisation
    # error cannot leave a half-written JSON file behind.
    payload = json.dumps(records, indent=4)
    with open(json_file, 'w') as f:
        f.write(payload)

# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 CLIP checkpoint onto that device and switch it to eval mode;
# the second value returned by clip.load is not needed in this cell
model, _ = clip.load("ViT-B/32", device=device)
model.eval()

# JSON file whose records receive a 'text_embedding' entry
json_file = '/workspaces/finetune/AFINAL/clip/clip_gpt_ootb/clip_new_caption_embedding.json'

# Encode every caption and write the embeddings back into the same file
compute_text_embeddings_and_update_json(json_file, model, device)


In [8]:
# Choose the device, then load the ViT-B/32 CLIP model together with its
# image preprocessing transform and put the model in eval mode
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)
model.eval()

class ImageDataset(Dataset):
    """Map-style dataset that serves preprocessed images listed in a JSON file.

    The file is parsed once at construction time and must decode to a
    sequence of records that each carry an ``'image_path'`` entry.

    ``__getitem__`` relies on the module-level ``preprocess`` transform and
    ``device`` defined alongside the CLIP model, so both must exist before
    items are requested.
    """

    def __init__(self, json_file):
        """Parse ``json_file`` and keep the decoded records in ``self.data``."""
        with open(json_file, 'r') as f:
            self.data = json.load(f)

    def __len__(self):
        """Return the number of records that were loaded."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image_tensor, idx)`` for the record at position ``idx``.

        The image is opened, converted to RGB, run through ``preprocess`` and
        moved to ``device``. The tensor describes a single image; any batch
        dimension is added later by the ``DataLoader``.
        """
        item = self.data[idx]
        image_path = item['image_path']
        # The context manager closes the underlying file as soon as the pixel
        # data has been converted, instead of leaving the handle open until
        # the Image object happens to be garbage collected.
        with Image.open(image_path) as raw_image:
            image = preprocess(raw_image.convert('RGB')).to(device)
        return image, idx

In [9]:
import numpy as np
# Load and index text embeddings
def load_text_embeddings(json_file):
    """Build an image-path -> text-embedding lookup from ``json_file``.

    Every record in the file lists its images under ``'image_paths'``; each
    of those paths is mapped to the record's ``'text_embedding'``. When the
    same path appears in several records, the last record wins.
    """
    with open(json_file, 'r') as handle:
        records = json.load(handle)
    return {
        image_path: record['text_embedding']
        for record in records
        for image_path in record['image_paths']
    }

# Compute a combined (image + text) embedding for every record of a dataset JSON file
def compute_embeddings_and_update_json(dataset_json, text_embeddings_json, output_json):
    """Attach a combined image+text embedding to each record and save the result.

    For every record in ``dataset_json`` whose ``'image_path'`` has a text
    embedding in ``text_embeddings_json``, the image is encoded with the
    module-level CLIP ``model``, concatenated with that text embedding
    (image part first) and stored under ``'combined_embedding'``. Records
    without a matching text embedding are left unchanged.

    Args:
        dataset_json: Path to the JSON file read through ``ImageDataset``.
        text_embeddings_json: Path to the JSON file read through
            ``load_text_embeddings``.
        output_json: Path the updated records are written to (indent 4).
            May be the same path as ``dataset_json`` to update it in place.

    Returns:
        None.
    """
    text_embeddings = load_text_embeddings(text_embeddings_json)
    dataset = ImageDataset(dataset_json)
    loader = DataLoader(dataset, batch_size=1, shuffle=False)

    # The dataset already holds the parsed file; update those records rather
    # than parsing the JSON again and keeping the file open during inference.
    records = dataset.data

    with torch.no_grad():
        for images, idxs in loader:
            idx = idxs.item()
            image_path = records[idx]['image_path']

            # No caption embedding for this image: nothing to combine, so
            # skip the (comparatively expensive) image encoder call.
            if image_path not in text_embeddings:
                continue

            # Bring the image embedding to float32 on the CPU so that it can
            # be concatenated with the text embedding regardless of the
            # device/precision the model runs with.
            # NOTE(review): the image embedding is used exactly as returned
            # by encode_image (not L2-normalised), while the stored text
            # embeddings were normalised when they were computed - confirm
            # that this imbalance is intended.
            image_embedding = model.encode_image(images).float().cpu()
            if image_embedding.dim() == 2:
                image_embedding = image_embedding.squeeze(0)  # [1, D] -> [D]

            text_embedding = torch.tensor(text_embeddings[image_path], dtype=torch.float32)
            if text_embedding.dim() == 2:
                text_embedding = text_embedding.squeeze(0)  # [1, D] -> [D]

            # Both tensors are now 1-D CPU float32 vectors
            combined_embedding = torch.cat((image_embedding, text_embedding), dim=0).tolist()
            records[idx]['combined_embedding'] = combined_embedding

    # Serialise completely before opening the target so that a serialisation
    # error cannot leave a half-written JSON file behind.
    payload = json.dumps(records, indent=4)
    with open(output_json, 'w') as f:
        f.write(payload)



# Caption file that provides the text embedding for each image path
text_embeddings_json = '/workspaces/finetune/AFINAL/clip/clip_gpt_ootb/clip_new_caption_embedding.json'

# Dataset splits that receive a 'combined_embedding' per record
dataset_files = [
    '/workspaces/finetune/AFINAL/clip/clip_gpt_ootb/output/test_data.json',
    '/workspaces/finetune/AFINAL/clip/clip_gpt_ootb/output/train_data.json',
    '/workspaces/finetune/AFINAL/clip/clip_gpt_ootb/output/val_data.json'
]

# Each split is both the input and the output of the update
for file in dataset_files:
    compute_embeddings_and_update_json(
        dataset_json=file,
        text_embeddings_json=text_embeddings_json,
        output_json=file,
    )

In [10]:
import json
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Function to load data from JSON file
def load_data(filepath):
    """Read the file at ``filepath`` as JSON and return the decoded object."""
    with open(filepath, 'r') as handle:
        contents = json.load(handle)
    return contents

# Function to find embeddings for images based on image_path matching
def find_embeddings(data_set, embeddings_data):
    """Attach combined embeddings to the items of ``data_set`` by image path.

    For each item, the first entry of ``embeddings_data`` with the same
    ``'image_path'`` supplies the ``'combined_embedding'``. The item is
    updated in place and appended to the result. Items with no matching
    entry, or whose matching entry carries no ``'combined_embedding'``, are
    left out of the result.

    Args:
        data_set: Iterable of dicts, each with an ``'image_path'`` key.
        embeddings_data: Iterable of dicts, each with an ``'image_path'`` key
            (assumed hashable, e.g. a string) and usually a
            ``'combined_embedding'`` key.

    Returns:
        List of the matched items (the same objects as in ``data_set``), in
        ``data_set`` order.
    """
    # Index the embedding entries once instead of rescanning the whole list
    # for every item; setdefault keeps the first entry seen for a path.
    first_entry_by_path = {}
    for entry in embeddings_data:
        first_entry_by_path.setdefault(entry['image_path'], entry)

    if not first_entry_by_path:
        return []

    embeddings_list = []
    for item in data_set:
        match = first_entry_by_path.get(item['image_path'])
        # An entry without an embedding cannot be used for retrieval, so it
        # is treated the same way as a missing entry.
        if match is None or 'combined_embedding' not in match:
            continue
        item['combined_embedding'] = match['combined_embedding']
        embeddings_list.append(item)
    return embeddings_list

# Function to compute top K accuracy
def compute_top_k_accuracy(query_images, gallery_data, k):
    """Return the fraction of queries whose class appears among the top-k gallery matches.

    Every query embedding is compared with all gallery embeddings by cosine
    similarity. A query counts as a hit when its ``'class'`` equals the
    ``'class'`` of at least one of the ``k`` most similar gallery items.

    Args:
        query_images: Sequence of dicts with ``'combined_embedding'`` and
            ``'class'`` keys.
        gallery_data: Sequence of dicts with the same two keys.
        k: Number of nearest gallery items to inspect; must be at least 1.
            Values larger than the gallery size inspect the whole gallery.

    Returns:
        Hits divided by the number of queries, or 0.0 when there are no
        queries.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    # With k == 0 the slice [-k:] below would select the *entire* gallery,
    # and a negative k would select an arbitrary tail, so reject both.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    total_queries = len(query_images)
    if total_queries == 0:
        return 0.0

    # Convert gallery embeddings and classes to numpy arrays once, up front
    gallery_embeddings = np.array([item['combined_embedding'] for item in gallery_data])
    gallery_classes = np.array([item['class'] for item in gallery_data])

    top_k_hits = 0
    for query in query_images:
        query_embedding = np.array([query['combined_embedding']])
        similarities = cosine_similarity(query_embedding, gallery_embeddings)[0]
        # argsort is ascending, so the k best matches are the last k indices
        top_k_indices = np.argsort(similarities)[-k:]

        # Check if the query class is in the top k classes
        if query['class'] in gallery_classes[top_k_indices]:
            top_k_hits += 1

    return top_k_hits / total_queries

# Query / gallery definitions, plus the three split files that carry the combined embeddings
query_set = load_data('/workspaces/finetune/AFINAL/clip/G&Q BASE/query_set.json')
gallery_set = load_data('/workspaces/finetune/AFINAL/clip/G&Q BASE/gallery_set.json')
embeddings_data_test = load_data('/workspaces/finetune/AFINAL/clip/clip_gpt_ootb/output/test_data.json')
embeddings_data_train = load_data('/workspaces/finetune/AFINAL/clip/clip_gpt_ootb/output/train_data.json')
embeddings_data_val = load_data('/workspaces/finetune/AFINAL/clip/clip_gpt_ootb/output/val_data.json')


# Look up embeddings split by split (train, test, val): first for the queries ...
query_images_with_embeddings_train = find_embeddings(query_set, embeddings_data_train)
query_images_with_embeddings_test = find_embeddings(query_set, embeddings_data_test)
query_images_with_embeddings_val = find_embeddings(query_set, embeddings_data_val)

# ... then for the gallery
gallery_images_with_embeddings_train = find_embeddings(gallery_set, embeddings_data_train)
gallery_images_with_embeddings_test = find_embeddings(gallery_set, embeddings_data_test)
gallery_images_with_embeddings_val = find_embeddings(gallery_set, embeddings_data_val)

# Merge the per-split results into one query list and one gallery list
query_images_with_embeddings = [
    *query_images_with_embeddings_train,
    *query_images_with_embeddings_test,
    *query_images_with_embeddings_val,
]
gallery_images_with_embeddings = [
    *gallery_images_with_embeddings_train,
    *gallery_images_with_embeddings_test,
    *gallery_images_with_embeddings_val,
]


# Report Top-K accuracy for K = 1 .. 10
for k in range(1, 11):
    top_k_accuracy = compute_top_k_accuracy(
        query_images=query_images_with_embeddings,
        gallery_data=gallery_images_with_embeddings,
        k=k,
    )
    print(f"Average Top-{k} Accuracy: {top_k_accuracy:.4f}")


Average Top-1 Accuracy: 0.4160
Average Top-2 Accuracy: 0.5440
Average Top-3 Accuracy: 0.5600
Average Top-4 Accuracy: 0.6000
Average Top-5 Accuracy: 0.6080
Average Top-6 Accuracy: 0.6320
Average Top-7 Accuracy: 0.6640
Average Top-8 Accuracy: 0.6640
Average Top-9 Accuracy: 0.6640
Average Top-10 Accuracy: 0.6640


In [4]:
# Input text: a sample product-description caption whose length is measured below
text = """Absolutely, here is a revised product description: This product showcases an individual holding a meticulously crafted piece of a car. The item is characterized by its well-defined edges that are cleanly cut, indicating a high level of precision during its creation. Each corner is sharp and distinct, pointing to a great level of detail and thought put into its design. The color of the car part is not just a simple monotone. Instead, it possesses a vibrant, metallic hue that catches the light and reflects it back in a myriad of glimmering specks. This car piece, held in the person's hand, not only signifies the quality of the product but also adds a touch of realism and relatability. The combination of its sharp edges, defined corners, and radiant color makes this product a noteworthy piece for any car enthusiasts or collectors."""

# Count the number of characters.
# len() on a str counts characters (code points), not tokens.
# NOTE(review): presumably used to judge caption length against the text encoder's limit - confirm.
num_characters = len(text)
print(f"Number of characters: {num_characters}")

Number of characters: 839
