<a href="https://colab.research.google.com/github/tirtthshah/text-to-image-pipeline/blob/main/Task_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers accelerate torch
!pip install --upgrade --force-reinstall transformers accelerate torch

In [None]:
# Install the packages used by the cells below (torch, diffusers, transformers)
# together with torchvision, accelerate and safetensors.
# NOTE(review): versions are unpinned, so a re-run may pull different releases.
!pip install torch torchvision diffusers transformers accelerate safetensors

In [None]:
import os

from huggingface_hub import login

# Read the Hugging Face access token from the environment (None when the
# variable is not set) and hand it to the hub client.
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

In [None]:
from transformers import CLIPTokenizer, CLIPTextModel
import torch

# The tokenizer and the text encoder are loaded from the same CLIP checkpoint.
clip_checkpoint = "openai/clip-vit-base-patch32"
tokenizer = CLIPTokenizer.from_pretrained(clip_checkpoint)
text_encoder = CLIPTextModel.from_pretrained(clip_checkpoint)

In [None]:
# Demo prompt, passed as a one-element list.
text_prompt = ["A futuristic city at sunset"]

# Tokenize into PyTorch tensors ("pt") with padding enabled.
inputs = tokenizer(text_prompt, padding=True, return_tensors="pt")

# Inference only: run the encoder without tracking gradients.
with torch.no_grad():
    encoder_output = text_encoder(**inputs)
text_embeddings = encoder_output.last_hidden_state
print("Text Embeddings Shape:", text_embeddings.shape)

In [None]:
import os

from huggingface_hub import login

# Same login step as the earlier authentication cell: fetch the token from the
# HF_TOKEN environment variable (None if unset) and pass it to the hub client.
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token)

In [None]:
from diffusers import StableDiffusionPipeline
import torch

model_id = "prompthero/openjourney"

# Use the GPU with half precision when one is available; otherwise fall back
# to CPU with float32 instead of crashing on `.to("cuda")`. On a GPU runtime
# this is exactly the original configuration (float16 on "cuda").
pipe_device = "cuda" if torch.cuda.is_available() else "cpu"
pipe_dtype = torch.float16 if pipe_device == "cuda" else torch.float32

# Download (or load from cache) the Stable Diffusion weights and move the
# whole pipeline to the selected device.
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=pipe_dtype).to(pipe_device)

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the BERT tokenizer and model.
# NOTE: this rebinds the global `tokenizer` (bound to the CLIP tokenizer by an
# earlier cell), and the `inputs` assignment below likewise replaces the CLIP
# inputs.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

# NOTE(review): `clean_prompt` is only assigned in the prompt-input cell at the
# end of the notebook. On a fresh kernel (Restart & Run All) this line raises
# NameError unless that cell has been executed first; consider moving the
# prompt cell above this one.
inputs = tokenizer(clean_prompt, return_tensors="pt", truncation=True, padding=True)
with torch.no_grad():
    # Average `last_hidden_state` over dim 1 to collapse it into a single
    # vector per input (presumably the token axis — confirm against the
    # transformers model output docs).
    embeddings = model(**inputs).last_hidden_state.mean(dim=1)

In [None]:
import torch.nn as nn

class Generator(nn.Module):
    """Text-conditioned, DCGAN-style image generator.

    A text embedding and a noise vector are concatenated, projected by a
    fully connected layer to a 4x4 feature map with ``feature_maps * 8``
    channels, and then upsampled by four stride-2 transposed convolutions
    (4 -> 8 -> 16 -> 32 -> 64) to an image whose values lie in [-1, 1] (Tanh).

    Args:
        embedding_dim: Size of the text embedding vector.
        noise_dim: Size of the random noise vector.
        img_channels: Number of channels in the generated image.
        feature_maps: Base channel width; the 4x4 seed map has
            ``feature_maps * 8`` channels.
    """

    def __init__(self, embedding_dim=768, noise_dim=100, img_channels=3, feature_maps=64):
        super().__init__()
        # Remember the seed channel count so forward() reshapes correctly for
        # any `feature_maps`, not only the default 64 (which gives 512).
        self.seed_channels = feature_maps * 8
        seed_features = self.seed_channels * 4 * 4

        self.fc = nn.Sequential(
            nn.Linear(embedding_dim + noise_dim, seed_features),
            nn.BatchNorm1d(seed_features),
            nn.ReLU(True)
        )
        self.deconv = nn.Sequential(
            # Each ConvTranspose2d(kernel=4, stride=2, padding=1) doubles H and W.
            nn.ConvTranspose2d(feature_maps * 8, feature_maps * 4, 4, 2, 1),
            nn.BatchNorm2d(feature_maps * 4),
            nn.ReLU(True),
            nn.ConvTranspose2d(feature_maps * 4, feature_maps * 2, 4, 2, 1),
            nn.BatchNorm2d(feature_maps * 2),
            nn.ReLU(True),
            nn.ConvTranspose2d(feature_maps * 2, feature_maps, 4, 2, 1),
            nn.BatchNorm2d(feature_maps),
            nn.ReLU(True),
            nn.ConvTranspose2d(feature_maps, img_channels, 4, 2, 1),
            nn.Tanh()
        )

    def forward(self, embedding, noise):
        """Generate images from text embeddings and noise.

        Args:
            embedding: Tensor of shape (batch, embedding_dim).
            noise: Tensor of shape (batch, noise_dim).

        Returns:
            Tensor of shape (batch, img_channels, 64, 64) with values in [-1, 1].
        """
        x = torch.cat((embedding, noise), dim=1)
        # Reshape the flat projection into the 4x4 seed feature map.
        x = self.fc(x).view(-1, self.seed_channels, 4, 4)
        return self.deconv(x)

In [None]:
# One noise vector per embedding row; 100 matches Generator's default
# `noise_dim`. For the single-prompt case this is the original (1, 100) shape.
noise = torch.randn(embeddings.size(0), 100)

# Freshly initialised (untrained) generator. eval() switches the BatchNorm
# layers to their running statistics instead of per-batch statistics.
gen = Generator()
gen.eval()

# Inference only: skip autograd bookkeeping, matching the other embedding cells.
with torch.no_grad():
    generated_image = gen(embeddings, noise)

In [None]:
import re

def preprocess_text(text):
    """Normalise a prompt string.

    Lower-cases the text, drops every character that is neither a word
    character nor whitespace, collapses whitespace runs into single spaces
    and trims both ends.

    Args:
        text: The raw prompt string.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

# Ask the user for the idea to draw and the visual style to apply.
base_prompt = input("Enter your thought (prompt): ")
style = input("Enter desired style (e.g., portrait, sketch): ")

# Combine both answers into the final prompt and keep a normalised copy.
prompt = f"{style} style {base_prompt}"
clean_prompt = preprocess_text(prompt)

# The diffusion pipeline receives the un-normalised `prompt`; take the first
# image from its output.
pipeline_output = pipe(prompt)
image = pipeline_output.images[0]

import matplotlib.pyplot as plt
# Display the generated image on an axes with ticks and frame hidden.
fig, ax = plt.subplots()
ax.imshow(image)
ax.axis("off")
ax.set_title("Generated Image")
plt.show()