In [23]:
import torch
import torch.nn as nn
temb_dim = 4
# time_steps has to have dimension of (batch_size, )
time_steps = torch.tensor([1, 2, 3, 4, 5], dtype = torch.float32)

factor = 10000 ** ((torch.arange(
        start = 0 , end = temb_dim // 2, dtype = torch.float32) / (temb_dim // 2) )
        )

t_emb = time_steps[:, None].repeat(1, temb_dim // 2) / factor
t_emb = torch.cat([torch.sin(t_emb), torch.cos(t_emb)], dim=-1)

In [22]:
t_emb

tensor([[ 0.8415,  0.0100,  0.5403,  0.9999],
        [ 0.9093,  0.0200, -0.4161,  0.9998],
        [ 0.1411,  0.0300, -0.9900,  0.9996],
        [-0.7568,  0.0400, -0.6536,  0.9992],
        [-0.9589,  0.0500,  0.2837,  0.9988]])

In [19]:
t_emb = time_steps[:, None].repeat(1, temb_dim // 2) / factor

In [20]:
t_emb

tensor([[1.0000, 0.0100],
        [2.0000, 0.0200],
        [3.0000, 0.0300],
        [4.0000, 0.0400],
        [5.0000, 0.0500]])

In [13]:
torch.arange(start = 0 , end = temb_dim // 2, dtype = torch.float32) / (temb_dim // 2) 

tensor([0.0000, 0.5000])

In [24]:
conv_layer = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)

# Create a random input tensor with the shape [batch_size, in_channels, height, width]
input_image = torch.randn(1, 3, 32, 32)

# Apply the convolutional layer
output = conv_layer(input_image)

## the formulae for the output layer is given by 

# output_size = ((input_size - kernel_size + 2*padding) / stride) + 1

# Print the shape of the output
print(output.shape)  

torch.Size([1, 16, 32, 32])


In [2]:
from turtle import pos
import torch
import torch.nn as nn
import clip

# Define the feature extractor (Fenc) using CLIP
class FeatureExtractor(nn.Module):
    def __init__(self, model_name="ViT-B/32"):
        super(FeatureExtractor, self).__init__()
        self.model, _ = clip.load(model_name, device='cpu')  # Load the CLIP model
        self.model = self.model.visual  # Use the visual part of the model

    def forward(self, x):
        return self.model(x)

# Define the position embedding (Emb)
class PositionEmbedding(nn.Module):
    def __init__(self, embed_size):
        super(PositionEmbedding, self).__init__()
        self.embed_size = embed_size
        self.embedding = nn.Parameter(torch.randn(1, embed_size))

    def forward(self, x):
        batch_size, _ = x.size()
        embeddings = self.embedding.repeat(batch_size, 1)
        return embeddings

# Define the image input encoder
class ImageInputEncoder(nn.Module):
    def __init__(self, embed_size, model_name="ViT-B/32"):
        super(ImageInputEncoder, self).__init__()
        self.feature_extractor = FeatureExtractor(model_name)
        self.position_embedding = PositionEmbedding(embed_size)
        self.embed_size = embed_size

    def forward(self, x):
        features = self.feature_extractor(x)  # Extract features
        B, C = features.size()
        
        pos_embeddings = self.position_embedding(features)  # Get position embeddings
        print(pos_embeddings.shape)
        features += pos_embeddings
        
        print(features.shape)
        # Aggregate to get the image-level representation
        image_condition = features.mean(dim=1)
        
        return image_condition

# Example usage:
# Create a random image tensor with shape [batch_size, channels, height, width]
input_image = torch.randn(8, 3, 224, 224)  # Example input

# Define the model and forward pass
embed_size = 512  # Example embedding size for CLIP ViT-B/32
model = ImageInputEncoder(embed_size, model_name="ViT-B/32")
output = model(input_image)

print(output.shape)  # Should print: torch.Size([8, 512])


torch.Size([8, 512])
torch.Size([8, 512])
torch.Size([8])


In [4]:
import torch
import torch.nn as nn
import clip

# Define the feature extractor (Fenc) using CLIP
class FeatureExtractor(nn.Module):
    def __init__(self, model_name="ViT-B/32"):
        super(FeatureExtractor, self).__init__()
        self.model, _ = clip.load(model_name, device='cpu')  # Load the CLIP model
        self.model = self.model.visual  # Use the visual part of the model

    def forward(self, x):
        return self.model(x)

# Define the position embedding (Emb)
class PositionEmbedding(nn.Module):
    def __init__(self, embed_size):
        super(PositionEmbedding, self).__init__()
        self.embed_size = embed_size
        self.embedding = nn.Parameter(torch.randn(1, 1, embed_size))

    def forward(self, x):
        batch_size, num_patches, _ = x.size()
        embeddings = self.embedding.expand(batch_size, num_patches, -1)
        return embeddings

# Define the image input encoder
class ImageInputEncoder(nn.Module):
    def __init__(self, embed_size, model_name="ViT-B/32"):
        super(ImageInputEncoder, self).__init__()
        self.feature_extractor = FeatureExtractor(model_name)
        self.position_embedding = PositionEmbedding(embed_size)
        self.embed_size = embed_size

    def forward(self, x):
        features = self.feature_extractor(x)  # Extract features
        B, C = features.size()
        
        # Reshape features to match positional embeddings dimensions
        features = features.view(B, 1, C)
        print(features.shape)
        
        pos_embeddings = self.position_embedding(features)  # Get position embeddings
        features += pos_embeddings
        
        print(pos_embeddings.shape)
        # Aggregate to get the image-level representation
        image_condition = features.mean(dim=1)
        
        return image_condition

# Example usage:
# Create a random image tensor with shape [batch_size, channels, height, width]
input_image = torch.randn(8, 3, 224, 224)  # Example input

# Define the model and forward pass
embed_size = 512  # Example embedding size for CLIP ViT-B/32
model = ImageInputEncoder(embed_size, model_name="ViT-B/32")
output = model(input_image)

print(output.shape)  # Should print: torch.Size([8, 512])


torch.Size([8, 1, 512])
torch.Size([8, 1, 512])
torch.Size([8, 512])


In [6]:
import torch
from transformers import DistilBertModel, DistilBertTokenizer, CLIPTokenizer, CLIPTextModel

# Define the function to get the tokenizer and model based on the model type
def get_tokenizer_and_model(model_type, device, eval_mode=True):
    if model_type == 'bert':
        text_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        text_model = DistilBertModel.from_pretrained('distilbert-base-uncased').to(device)
    elif model_type == 'clip':
        text_tokenizer = CLIPTokenizer.from_pretrained('openai/clip-vit-base-patch16')
        text_model = CLIPTextModel.from_pretrained('openai/clip-vit-base-patch16').to(device)
    else:
        raise ValueError(f"Unsupported model_type: {model_type}")
    
    if eval_mode:
        text_model.eval()
    
    return text_tokenizer, text_model

# Define the function to get text representation
def get_text_representation(text, text_tokenizer, text_model, device,
                            truncation=True,
                            padding='max_length',
                            max_length=77):
    token_output = text_tokenizer(text,
                                  truncation=truncation,
                                  padding=padding,
                                  return_attention_mask=True,
                                  max_length=max_length,
                                  return_tensors='pt')  # Return PyTorch tensors
    tokens_tensor = token_output['input_ids'].to(device)
    mask_tensor = token_output['attention_mask'].to(device)
    
    with torch.no_grad():  # Disable gradient calculation
        text_embed = text_model(input_ids=tokens_tensor, attention_mask=mask_tensor).last_hidden_state
    
    return text_embed

# Example usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
text = "A photo of a cat"
model_type = 'clip'
text_tokenizer, text_model = get_tokenizer_and_model(model_type, device)
text_embed = get_text_representation(text, text_tokenizer, text_model, device)
print(text_embed.shape)  # Should print: torch.Size([1, 77, 512])


torch.Size([1, 77, 512])
