In [23]:
import torch
import torch.nn as nn
temb_dim = 4
# time_steps has to have dimension of (batch_size, )
time_steps = torch.tensor([1, 2, 3, 4, 5], dtype = torch.float32)

factor = 10000 ** ((torch.arange(
        start = 0 , end = temb_dim // 2, dtype = torch.float32) / (temb_dim // 2) )
        )

t_emb = time_steps[:, None].repeat(1, temb_dim // 2) / factor
t_emb = torch.cat([torch.sin(t_emb), torch.cos(t_emb)], dim=-1)

In [22]:
t_emb

tensor([[ 0.8415,  0.0100,  0.5403,  0.9999],
        [ 0.9093,  0.0200, -0.4161,  0.9998],
        [ 0.1411,  0.0300, -0.9900,  0.9996],
        [-0.7568,  0.0400, -0.6536,  0.9992],
        [-0.9589,  0.0500,  0.2837,  0.9988]])

In [19]:
t_emb = time_steps[:, None].repeat(1, temb_dim // 2) / factor

In [20]:
t_emb

tensor([[1.0000, 0.0100],
        [2.0000, 0.0200],
        [3.0000, 0.0300],
        [4.0000, 0.0400],
        [5.0000, 0.0500]])

In [13]:
torch.arange(start = 0 , end = temb_dim // 2, dtype = torch.float32) / (temb_dim // 2) 

tensor([0.0000, 0.5000])

In [24]:
conv_layer = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)

# Create a random input tensor with the shape [batch_size, in_channels, height, width]
input_image = torch.randn(1, 3, 32, 32)

# Apply the convolutional layer
output = conv_layer(input_image)

## the formulae for the output layer is given by 

# output_size = ((input_size - kernel_size + 2*padding) / stride) + 1

# Print the shape of the output
print(output.shape)  

torch.Size([1, 16, 32, 32])


In [2]:
from turtle import pos
import torch
import torch.nn as nn
import clip

# Define the feature extractor (Fenc) using CLIP
class FeatureExtractor(nn.Module):
    def __init__(self, model_name="ViT-B/32"):
        super(FeatureExtractor, self).__init__()
        self.model, _ = clip.load(model_name, device='cpu')  # Load the CLIP model
        self.model = self.model.visual  # Use the visual part of the model

    def forward(self, x):
        return self.model(x)

# Define the position embedding (Emb)
class PositionEmbedding(nn.Module):
    def __init__(self, embed_size):
        super(PositionEmbedding, self).__init__()
        self.embed_size = embed_size
        self.embedding = nn.Parameter(torch.randn(1, embed_size))

    def forward(self, x):
        batch_size, _ = x.size()
        embeddings = self.embedding.repeat(batch_size, 1)
        return embeddings

# Define the image input encoder
class ImageInputEncoder(nn.Module):
    def __init__(self, embed_size, model_name="ViT-B/32"):
        super(ImageInputEncoder, self).__init__()
        self.feature_extractor = FeatureExtractor(model_name)
        self.position_embedding = PositionEmbedding(embed_size)
        self.embed_size = embed_size

    def forward(self, x):
        features = self.feature_extractor(x)  # Extract features
        B, C = features.size()
        
        pos_embeddings = self.position_embedding(features)  # Get position embeddings
        print(pos_embeddings.shape)
        features += pos_embeddings
        
        print(features.shape)
        # Aggregate to get the image-level representation
        image_condition = features.mean(dim=1)
        
        return image_condition

# Example usage:
# Create a random image tensor with shape [batch_size, channels, height, width]
input_image = torch.randn(8, 3, 224, 224)  # Example input

# Define the model and forward pass
embed_size = 512  # Example embedding size for CLIP ViT-B/32
model = ImageInputEncoder(embed_size, model_name="ViT-B/32")
output = model(input_image)

print(output.shape)  # Should print: torch.Size([8, 512])


torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8])


In [12]:
import torch
import torch.nn as nn
import clip

# Define the feature extractor (Fenc) using CLIP
class FeatureExtractor(nn.Module):
    def __init__(self, model_name="ViT-B/32"):
        super(FeatureExtractor, self).__init__()
        self.model, _ = clip.load(model_name, device='cpu')  # Load the CLIP model
        self.model = self.model.visual  # Use the visual part of the model

    def forward(self, x):
        return self.model(x)

# Define the position embedding (Emb)
class PositionEmbedding(nn.Module):
    def __init__(self, embed_size):
        super(PositionEmbedding, self).__init__()
        self.embed_size = embed_size
        self.embedding = nn.Parameter(torch.randn(1, 1, embed_size))

    def forward(self, x):
        batch_size, num_patches, _ = x.size()
        embeddings = self.embedding.expand(batch_size, num_patches, -1)
        return embeddings

# Define the image input encoder
class ImageInputEncoder(nn.Module):
    def __init__(self, embed_size, model_name="ViT-B/32"):
        super(ImageInputEncoder, self).__init__()
        self.feature_extractor = FeatureExtractor(model_name)
        self.position_embedding = PositionEmbedding(embed_size)
        self.embed_size = embed_size

    def forward(self, x):
        features = self.feature_extractor(x)  # Extract features
        print(features.shape)
        B, C = features.size()
        
        # Reshape features to match positional embeddings dimensions
        # features = features.view(B, 1, C)
        # # print(features.shape)
        
        # pos_embeddings = self.position_embedding(features)  # Get position embeddings
        # features += pos_embeddings
        
        # print(pos_embeddings.shape)
        # # Aggregate to get the image-level representation
        # image_condition = features.mean(dim=1)
        
        return features

# Example usage:
# Create a random image tensor with shape [batch_size, channels, height, width]
input_image = torch.randn(8, 3, 224, 224)  # Example input

# Define the model and forward pass
embed_size = 512  # Example embedding size for CLIP ViT-B/32
model = ImageInputEncoder(embed_size, model_name="ViT-B/32")
output = model(input_image)

print(output.shape)  # Should print: torch.Size([8, 512])


torch.Size([8, 512])
torch.Size([8, 512])


In [7]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer, CLIPTokenizer, CLIPTextModel

# Define the function to get the tokenizer and model based on the model type
def get_tokenizer_and_model(model_type, device, eval_mode=True):
    if model_type == 'bert':
        text_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        text_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
    elif model_type == 'clip':
        text_tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch16')
        text_model = CLIPTextModel.from_pretrained('openai/clip-vit-base-patch16').to(device)
    else:
        raise ValueError(f"Unsupported model_type: {model_type}")
    
    if eval_mode:
        text_model.eval()
    
    return text_tokenizer, text_model

# Define the function to get text representation
def get_text_representation(text, text_tokenizer, text_model, device,
                            truncation=True,
                            padding='max_length',
                            max_length=77):
    token_output = text_tokenizer(text,
                                  truncation=truncation,
                                  padding=padding,
                                  return_attention_mask=True,
                                  max_length=max_length,
                                  return_tensors='pt')  # Return PyTorch tensors
    tokens_tensor = token_output['input_ids'].to(device)
    mask_tensor = token_output['attention_mask'].to(device)
    
    with torch.no_grad():  # Disable gradient calculation
        text_embed = text_model(input_ids=tokens_tensor, attention_mask=mask_tensor).last_hidden_state
    
    return text_embed

# Example usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
text = ["A photo of a cat", "A photo of a dog"]
model_type = 'clip'
text_tokenizer, text_model = get_tokenizer_and_model(model_type, device)
text_embed = get_text_representation(text, text_tokenizer, text_model, device)
print(text_embed.shape)  # Should print: torch.Size([1, 77, 512])


torch.Size([2, 77, 512])


In [14]:
from PIL import Image
import requests
from transformers import AutoProcessor, CLIPModel

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")

image_features = model.get_image_features(**inputs)
image_features.shape



torch.Size([1, 512])

In [None]:
#TODO we need to add the dataset class for IAM dataset 
#TODO we need to prepare the dataset get method and also load images


import glob
import os
import random
import torch
import torchvision
import numpy as np
from PIL import Image
from Utils.diffusion_utils import load_latents
from tqdm import tqdm
from torch.utils.data.dataset import Dataset


# IAM Dataset Experimenting here Need to complete this

class IAM(Dataset):
    r"""
    Celeb dataset will by default centre crop and resize the images.
    This can be replaced by any other dataset. As long as all the images
    are under one directory.
    """
    
    def __init__(self, split, im_path, im_size=256, im_channels=3, im_ext='jpg',
                 use_latents=False, latent_path=None, condition_config=None):
        self.split = split
        self.im_size = im_size
        self.im_channels = im_channels
        self.im_ext = im_ext
        self.im_path = im_path
        self.latent_maps = None
        self.use_latents = False
        
        self.condition_types = [] if condition_config is None else condition_config['condition_types']
        
        self.idx_to_cls_map = {}
        self.cls_to_idx_map ={}
        
        if 'image' in self.condition_types:
            self.mask_channels = condition_config['image_condition_config']['image_condition_input_channels']
            self.mask_h = condition_config['image_condition_config']['image_condition_h']
            self.mask_w = condition_config['image_condition_config']['image_condition_w']
            
        self.images, self.texts, self.masks = self.load_images(im_path)
        
        # Whether to load images or to load latents
        if use_latents and latent_path is not None:
            latent_maps = load_latents(latent_path)
            if len(latent_maps) == len(self.images):
                self.use_latents = True
                self.latent_maps = latent_maps
                print('Found {} latents'.format(len(self.latent_maps)))
            else:
                print('Latents not found')
    
    def load_images(self, im_path):
        r"""
        Gets all images from the path specified
        and stacks them all up
        """
        assert os.path.exists(im_path), "images path {} does not exist".format(im_path)
        ims = []
        fnames = glob.glob(os.path.join(im_path, 'CelebA-HQ-img/*.{}'.format('png')))
        fnames += glob.glob(os.path.join(im_path, 'CelebA-HQ-img/*.{}'.format('jpg')))
        fnames += glob.glob(os.path.join(im_path, 'CelebA-HQ-img/*.{}'.format('jpeg')))
        texts = []
        masks = []
        
        if 'image' in self.condition_types:
            label_list = ['skin', 'nose', 'eye_g', 'l_eye', 'r_eye', 'l_brow', 'r_brow', 'l_ear', 'r_ear', 'mouth',
                          'u_lip', 'l_lip', 'hair', 'hat', 'ear_r', 'neck_l', 'neck', 'cloth']
            self.idx_to_cls_map = {idx: label_list[idx] for idx in range(len(label_list))}
            self.cls_to_idx_map = {label_list[idx]: idx for idx in range(len(label_list))}
        
        for fname in tqdm(fnames):
            ims.append(fname)
            
            if 'text' in self.condition_types:
                im_name = os.path.split(fname)[1].split('.')[0]
                captions_im = []
                with open(os.path.join(im_path, 'celeba-caption/{}.txt'.format(im_name))) as f:
                    for line in f.readlines():
                        captions_im.append(line.strip())
                texts.append(captions_im)
                
            if 'image' in self.condition_types:
                im_name = int(os.path.split(fname)[1].split('.')[0])
                masks.append(os.path.join(im_path, 'CelebAMask-HQ-mask', '{}.png'.format(im_name)))
        if 'text' in self.condition_types:
            assert len(texts) == len(ims), "Condition Type Text but could not find captions for all images"
        if 'image' in self.condition_types:
            assert len(masks) == len(ims), "Condition Type Image but could not find masks for all images"
        print('Found {} images'.format(len(ims)))
        print('Found {} masks'.format(len(masks)))
        print('Found {} captions'.format(len(texts)))
        return ims, texts, masks
    
    def get_mask(self, index):
        r"""
        Method to get the mask of WxH
        for given index and convert it into
        Classes x W x H mask image
        :param index:
        :return:
        """
        mask_im = Image.open(self.masks[index])
        mask_im = np.array(mask_im)
        im_base = np.zeros((self.mask_h, self.mask_w, self.mask_channels))
        for orig_idx in range(len(self.idx_to_cls_map)):
            im_base[mask_im == (orig_idx+1), orig_idx] = 1
        mask = torch.from_numpy(im_base).permute(2, 0, 1).float()
        return mask
    
    def __len__(self):
        return len(self.images)
    
    def __getitem__(self, index):
        ######## Set Conditioning Info ########
        cond_inputs = {}
        if 'text' in self.condition_types:
            cond_inputs['text'] = random.sample(self.texts[index], k=1)[0]
        if 'image' in self.condition_types:
            mask = self.get_mask(index)
            cond_inputs['image'] = mask
        #######################################
        
        if self.use_latents:
            latent = self.latent_maps[self.images[index]]
            if len(self.condition_types) == 0:
                return latent
            else:
                return latent, cond_inputs
        else:
            im = Image.open(self.images[index])
            im_tensor = torchvision.transforms.Compose([
                torchvision.transforms.Resize(self.im_size),
                torchvision.transforms.CenterCrop(self.im_size),
                torchvision.transforms.ToTensor(),
            ])(im)
            im.close()
        
            # Convert input to -1 to 1 range.
            im_tensor = (2 * im_tensor) - 1
            if len(self.condition_types) == 0:
                return im_tensor
            else:
                return im_tensor, cond_inputs


In [21]:
example_path = 'iam-handwriting-words-da/iam_words/words/a01/a01-000u/a01-000u-00-00.png'
text_string =  "a01-000u-00-00 ok 154 408 768 27 51 AT A"

split_string = text_string.split(' ')
split_string

['a01-000u-00-00', 'ok', '154', '408', '768', '27', '51', 'AT', 'A']

In [27]:
word = split_string[-1]
image_name = split_string[0]
image_name_folder_1 = image_name.split('-')[0]
image_name_folder_2 = image_name.split('-')[1]

In [41]:
image_name = split_string[0]
image_name_folder_1 = image_name.split('-')[0]
image_name_folder_2 = image_name_folder_1 + '-' + image_name.split('-')[1] 
image_final_folder = image_name_folder_1 + '/' + image_name_folder_2 + '/' + image_name

im_path = '/iam-handwriting-words-da'

In [42]:
import os
image_path = os.path.join(im_path, 'iam_words/words/{}.{}'.format(image_final_folder, 'png'))

In [43]:
image_path 

'/iam-handwriting-words-da/iam_words/words/a01/a01-000u/a01-000u-00-00.png'

In [19]:

!python3 /Users/tejanagubandi/Desktop/projects/CTIDiffusion/Training/train.py

{'dataset_params': {'im_path': '/iam_words', 'im_channels': 3, 'im_size': 256, 'name': 'IAMHandwriting'}, 'diffusion_params': {'num_timesteps': 1000, 'beta_start': 0.0015, 'beta_end': 0.0195}, 'ldm_params': {'down_channels': [128, 256, 256, 256], 'mid_channels': [256, 256], 'down_sample': [False, False, False], 'attn_down': [True, True, True], 'time_emb_dim': 256, 'norm_channels': 32, 'num_heads': 16, 'conv_out_channels': 128, 'num_down_layers': 2, 'num_mid_layers': 2, 'num_up_layers': 2, 'condition_config': {'condition_types': ['text', 'image', 'style'], 'text_condition_config': {'text_embed_model': 'clip', 'train_text_embed_model': False, 'text_embed_dim': 512, 'cond_drop_prob': 0.1}, 'image_condition_config': {'image_embed_model': 'clip', 'train_image_embed_model': False, 'image_embed_dim': 512, 'cond_drop_prob': 0.1}}}, 'autoencoder_params': {'z_channels': 3, 'codebook_size': 20, 'down_channels': [32, 64, 128], 'mid_channels': [128, 128], 'down_sample': [True, True], 'attn_down': [