In [1]:
!pip install clip

Collecting clip
  Downloading clip-0.2.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-0.2.0-py3-none-any.whl size=6988 sha256=212682e7ff5f2d97e11642574d709eee192f478c423e53403f4d216d30129cd5
  Stored in directory: /root/.cache/pip/wheels/7f/5c/e6/2c0fdb453a3569188864b17e9676bea8b3b7e160c037117869
Successfully built clip
Installing collected packages: clip
Successfully installed clip-0.2.0


In [3]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm

class ClipFineTuneDataset(Dataset):
    def __init__(self, dataset_dir, processor):
        self.dataset_dir = dataset_dir
        self.processor = processor
        self.data = []

        # Load all data
        for i in range(1, 24):
            link_dir = os.path.join(dataset_dir, f"link{i}")
            captions_folder = os.path.join(link_dir, "captions")
            images_folder = os.path.join(link_dir, "images")

            if os.path.exists(captions_folder) and os.path.exists(images_folder):
                for caption_file in os.listdir(captions_folder):
                    if caption_file.endswith(".txt"):
                        caption_path = os.path.join(captions_folder, caption_file)
                        image_file = caption_file.replace("gpt_", "").replace(".txt", ".jpg")
                        image_path = os.path.join(images_folder, image_file)

                        if os.path.exists(image_path):
                            self.data.append((image_path, caption_path))
                        else:
                            print(f"Image file not found for caption: {caption_file}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_path, caption_path = self.data[idx]

        # Load image
        image = Image.open(image_path).convert("RGB")

        # Load caption
        with open(caption_path, "r") as f:
            caption = f.read().strip()

        return image, caption


# Custom collate function to preprocess images and captions in a batch
def collate_fn(batch):
    images, captions = zip(*batch)

    # Preprocess images and captions using CLIPProcessor with truncation
    inputs = processor(
        text=list(captions),
        images=list(images),
        return_tensors="pt",
        padding=True,  # Pad captions to the same length
        truncation=True  # Truncate long captions to fit the model's max sequence length
    )

    # Debug tokenized text lengths
    input_lengths = [len(seq) for seq in inputs['input_ids']]
    print(f"Tokenized text lengths: {input_lengths}")

    return inputs


# Initialize device, model, and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Load dataset and dataloader
dataset_dir = "/content/drive/MyDrive/Dataset/"
dataset = ClipFineTuneDataset(dataset_dir, processor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Process batches with progress bar
for batch in tqdm(dataloader, desc="Processing Batches"):
    # Move batch to the appropriate device
    batch = {k: v.to(device) for k, v in batch.items()}

    # Forward pass through the model
    outputs = model(**batch)

    # Extract image and text embeddings
    image_embeds = outputs.image_embeds
    text_embeds = outputs.text_embeds

    print("Batch processed!")
    break


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Image file not found for caption: gpt_image_3.txt
Image file not found for caption: gpt_image_1.txt
Image file not found for caption: gpt_image_2.txt


Processing Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Processing Batches:   0%|          | 0/7 [00:26<?, ?it/s]

Batch processed!





In [4]:
import torch
import torch.nn as nn
from transformers import CLIPProcessor, CLIPModel
from torch.utils.data import DataLoader
from tqdm import tqdm

# Define Contrastive Loss
class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=0.07):
        super(ContrastiveLoss, self).__init__()
        self.temperature = temperature
        self.cross_entropy = nn.CrossEntropyLoss()

    def forward(self, image_embeds, text_embeds):
        # Normalize embeddings
        image_embeds = image_embeds / image_embeds.norm(dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

        # Similarity matrix (dot product of normalized embeddings)
        logits = image_embeds @ text_embeds.T / self.temperature

        # Labels are diagonal indices
        labels = torch.arange(len(logits)).to(logits.device)

        # Loss is the average of image-to-text and text-to-image
        loss = (self.cross_entropy(logits, labels) + self.cross_entropy(logits.T, labels)) / 2
        return loss

# Initialize Model, Processor, and Dataset
device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Custom Dataset and Dataloader (already defined previously)
dataset_dir = "/content/drive/MyDrive/Dataset/"
dataset = ClipFineTuneDataset(dataset_dir, processor)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

# Optimizer and Scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 4

# Fine-tuning Loop
contrastive_loss = ContrastiveLoss(temperature=0.07)
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass
        outputs = model(**batch)
        image_embeds = outputs.image_embeds
        text_embeds = outputs.text_embeds

        # Compute contrastive loss
        loss = contrastive_loss(image_embeds, text_embeds)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(dataloader):.4f}")

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_clip")
processor.save_pretrained("./fine_tuned_clip")


Image file not found for caption: gpt_image_3.txt
Image file not found for caption: gpt_image_1.txt
Image file not found for caption: gpt_image_2.txt


Epoch 1/4:   0%|          | 0/7 [00:00<?, ?it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 1/4:  14%|█▍        | 1/7 [00:22<02:15, 22.64s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 1/4:  29%|██▊       | 2/7 [00:47<01:59, 23.95s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 1/4:  43%|████▎     | 3/7 [01:11<01:35, 23.83s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 1/4:  57%|█████▋    | 4/7 [01:49<01:28, 29.65s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 1/4:  71%|███████▏  | 5/7 [02:12<00:54, 27.01s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 1/4:  86%|████████▌ | 6/7 [02:32<00:24, 24.69s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 1/4: 100%|██████████| 7/7 [02:54<00:00, 24.99s/it]


Epoch 1/4, Loss: 1.9751


Epoch 2/4:   0%|          | 0/7 [00:00<?, ?it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 2/4:  14%|█▍        | 1/7 [00:01<00:06,  1.00s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 2/4:  29%|██▊       | 2/7 [00:02<00:05,  1.00s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 2/4:  43%|████▎     | 3/7 [00:03<00:04,  1.02s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 2/4:  57%|█████▋    | 4/7 [00:04<00:03,  1.00s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 2/4:  71%|███████▏  | 5/7 [00:04<00:01,  1.01it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 2/4:  86%|████████▌ | 6/7 [00:05<00:00,  1.02it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 2/4: 100%|██████████| 7/7 [00:06<00:00,  1.03it/s]


Epoch 2/4, Loss: 0.6248


Epoch 3/4:   0%|          | 0/7 [00:00<?, ?it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 3/4:  14%|█▍        | 1/7 [00:00<00:05,  1.11it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 3/4:  29%|██▊       | 2/7 [00:01<00:04,  1.01it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 3/4:  43%|████▎     | 3/7 [00:03<00:04,  1.02s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 3/4:  57%|█████▋    | 4/7 [00:03<00:02,  1.02it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 3/4:  71%|███████▏  | 5/7 [00:05<00:02,  1.02s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 3/4:  86%|████████▌ | 6/7 [00:06<00:01,  1.01s/it]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 3/4: 100%|██████████| 7/7 [00:06<00:00,  1.03it/s]


Epoch 3/4, Loss: 0.3021


Epoch 4/4:   0%|          | 0/7 [00:00<?, ?it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 4/4:  14%|█▍        | 1/7 [00:00<00:05,  1.06it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 4/4:  29%|██▊       | 2/7 [00:01<00:04,  1.05it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 4/4:  43%|████▎     | 3/7 [00:02<00:03,  1.05it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 4/4:  57%|█████▋    | 4/7 [00:03<00:02,  1.06it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 4/4:  71%|███████▏  | 5/7 [00:04<00:01,  1.04it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 4/4:  86%|████████▌ | 6/7 [00:05<00:00,  1.02it/s]

Tokenized text lengths: [77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77]


Epoch 4/4: 100%|██████████| 7/7 [00:06<00:00,  1.04it/s]


Epoch 4/4, Loss: 0.1880


[]

In [10]:
image_data_path = "/content/drive/MyDrive/Dataset/GPTFramesUnique.json"
text_data_path = "/content/drive/MyDrive/Dataset/headings.json"

In [14]:
import json

input_file = image_data_path

with open(input_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

filtered_data = {key: value for key, value in data.items() if value == "Code NA"}

print(len(filtered_data))


481


In [16]:
import json
import os
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import torch
from tqdm import tqdm

class FilteredImageDataset(Dataset):
    def __init__(self, filtered_image_paths):
        """
        Initialize the dataset with the filtered image paths.

        :param filtered_image_paths: List of full paths to valid images.
        """
        self.image_paths = filtered_image_paths

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = Image.open(image_path).convert("RGB")
        return image, image_path

def image_collate_fn(batch):
    """
    Custom collate function to preprocess images for CLIPProcessor.
    """
    images, image_paths = zip(*batch)
    inputs = processor(images=list(images), return_tensors="pt", padding=True)
    return inputs["pixel_values"], image_paths



dataset_dir = '/content/drive/MyDrive/Dataset/GPTFramesUnique/'
filtered_image_paths = [os.path.join(dataset_dir, key) for key in filtered_data.keys()]


image_dataset = FilteredImageDataset(filtered_image_paths)
image_dataloader = DataLoader(
    image_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=image_collate_fn
)

image_embeddings = []
image_paths = []

model.eval()
with torch.no_grad():
    for pixel_values, paths in tqdm(image_dataloader, desc="Encoding Images"):
        pixel_values = pixel_values.to(device)
        outputs = model.get_image_features(pixel_values=pixel_values)
        image_embeddings.append(outputs.cpu())
        image_paths.extend(paths)

image_embeddings = torch.cat(image_embeddings)

print(f"Processed {len(image_paths)} images.")


Encoding Images: 100%|██████████| 16/16 [04:00<00:00, 15.04s/it]

Processed 481 images.





In [28]:
import json
import torch
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

text_data_path = "/content/drive/MyDrive/Dataset/headings.json"

with open(text_data_path, "r", encoding="utf-8") as f:
    text_data = json.load(f)

headings = text_data

text_embeddings = []

model.eval()
with torch.no_grad():
    for i in tqdm(range(0, len(headings), 32), desc="Encoding Text Headings"):
        batch_headings = headings[i:i + 32]
        inputs = processor(text=batch_headings, return_tensors="pt", padding=True, truncation=True).to(device)
        outputs = model.get_text_features(**inputs)
        text_embeddings.append(outputs.cpu())

text_embeddings = torch.cat(text_embeddings)

threshold = 0.3
results = {}

for i, heading_embedding in enumerate(text_embeddings):
    similarities = cosine_similarity(heading_embedding.unsqueeze(0), image_embeddings)
    max_similarity_index = similarities.argmax()
    max_similarity_score = similarities[0, max_similarity_index]

    if max_similarity_score >= threshold:
        results[headings[i]] = image_paths[max_similarity_index]
    else:
        results[headings[i]] = None

for heading, image_path in results.items():
    print(f"Heading: {heading} -> Image Path: {image_path}")

Encoding Text Headings: 100%|██████████| 3/3 [00:00<00:00, 101.34it/s]


Heading: Exploring the Foundations of ChatGPT: A Journey Through Language Models -> Image Path: None
Heading: Understanding ChatGPT's Variability -> Image Path: None
Heading: The Power of the Transformer Architecture -> Image Path: /content/drive/MyDrive/Dataset/GPTFramesUnique/frame_5203.jpg
Heading: Simplifying the Complex: A DIY Approach to Transformers -> Image Path: None
Heading: Setting Up the Dataset -> Image Path: None
Heading: Uncovering the Vocabulary -> Image Path: None
Heading: Tokenizing the Text -> Image Path: None
Heading: Building a Character-Level Language Model with the Tiny Shakespeare Dataset -> Image Path: None
Heading: Data Acquisition and Preliminary Setup -> Image Path: None
Heading: Understanding the Vocabulary -> Image Path: None
Heading: Tokenization Strategy -> Image Path: None
Heading: Data Preparation for Model Training -> Image Path: None
Heading: Feeding Data into the Transformer Model -> Image Path: None
Heading: Building a Character-Level Transformer M

In [None]:
text_data_path = "/content/drive/MyDrive/Dataset/headings.json"
