In [None]:
!pip install diffusers transformers accelerate torch safetensors peft timm opencv-python

In [None]:
from diffusers import StableDiffusionXLPipeline, StableDiffusionXLControlNetPipeline, StableDiffusionXLImg2ImgPipeline
import torch
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import cv2

In [None]:
# Load txt2img pipe
model_id = "stabilityai/stable-diffusion-xl-base-1.0"
txt2img_pipe = StableDiffusionXLPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    use_safetensors=True,
    )

# Load the LoRA weights
lora_weights_path = "/content/pytorch_lora_weights.safetensors"
txt2img_pipe.load_lora_weights(lora_weights_path)

txt2img_pipe.to("cuda")

In [4]:
seed = 123
generator = torch.manual_seed(seed)

In [66]:
# Define the text prompt
txt2img_prompt = (
    "photo of sks character wearing only t-shirt, sks face, wearing no jacket, jeans, no watch, sitting in restaurant, "
    "zoomed out, happy, sitting down at table, detailed background, "
    "modern cartoon style, cartoon style, full-body, "
    "detailed character design, high quality background, good anatomy,"

)

txt2img_negative_prompt = (
    "low-quality, blurry, deformed, disfigured, extra limbs, extra fingers,"
    "cropped, watermark, text, signature, nsfw, bad anatomy, unnatural proportions, "
    "distorted face, closed eyes, sad expression, outdated style, monochrome, "
    "overexposed, underexposed, grainy, pixelated, poor lighting, artifacts, "
    "duplicate, ugly, tiling, mutated hands, long neck, disconnected limbs, multiple characters, "
    "bad eyes, realistic, glasses, plain background, monochrome"
    )


# Pass both the text and the IP adapter image to the pipeline
with torch.no_grad():
  initial_img = txt2img_pipe(prompt=txt2img_prompt,
                              negative_prompt=txt2img_negative_prompt,
                              generator=generator,
                              num_inference_steps=75,
                              num_images_per_prompt=3,
                              height=1024,
                              width=1024,
                              guidance_scale=10.5).images[0]

# Display Generated Image
plt.imshow(initial_img)
plt.axis("off")
plt.show()

  0%|          | 0/75 [00:00<?, ?it/s]

In [None]:
# Optionally save initial image
# initial_img.save("initial_image.png")

In [67]:
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

model_id = "IDEA-Research/grounding-dino-tiny"
device = "cuda"

detector_processor = AutoProcessor.from_pretrained(model_id)
detector = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

In [77]:
text = "clothing."

inputs = detector_processor(images=initial_img, text=text, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = detector(**inputs)

results = detector_processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    target_sizes=[initial_img.size[::-1]]
)
print(results)

[{'scores': tensor([0.4365], device='cuda:0'), 'labels': ['clothing'], 'boxes': tensor([[225.4590, 106.6570, 783.9850, 867.3058]], device='cuda:0')}]


In [78]:
from transformers import AutoModelForMaskGeneration

# Define the SAM model and processor
segmenter_id = "facebook/sam-vit-huge"
segmentator = AutoModelForMaskGeneration.from_pretrained(segmenter_id).to("cuda")
segmentator_processor = AutoProcessor.from_pretrained(segmenter_id)

# Extract bounding boxes and normalize them for SAM
boxes = []
image_width, image_height = initial_img.size
for r in results:
    for box in r['boxes']:
        normalized_box = [
            box[0] ,  # xmin
            box[1] ,  # ymin
            box[2] ,  # xmax
            box[3]    # ymax
        ]
        boxes.append(normalized_box)

boxes = torch.Tensor(boxes).cpu().numpy()
boxes = [[list(map(float, box)) for box in boxes]]
print(boxes)
# Use SAM with the extracted bounding boxes
inputs_sam = segmentator_processor(images=initial_img, input_boxes=[boxes], return_tensors="pt").to("cuda")

with torch.no_grad():
    outputs_sam = segmentator(**inputs_sam)

# Display the segmentation mask
mask = outputs_sam.pred_masks.squeeze().cpu().numpy()
print(mask.shape)

[[[225.45895385742188, 106.65701293945312, 783.9849853515625, 867.3057861328125]]]
(3, 256, 256)


In [79]:
mask_binary = cv2.resize(mask.transpose(1,2,0), (1024, 1024), interpolation=cv2.INTER_NEAREST)
mask_binary = np.max(mask_binary, axis=2).astype(np.uint8)

print(mask_binary.shape)
mask_binary = 255 - mask_binary
# Convert the mask to a PIL image (single channel, 'L' mode for luminance)
clothing_mask_image = Image.fromarray(mask_binary, mode='L')

(1024, 1024)


In [72]:
# Optionally save mask
# clothing_mask_image.save("clothing_mask.png")

In [195]:
# Delete from memory if necessary
# del segmentator_processor, segmentator, detector_processor, detector

In [80]:
import gc
torch.cuda.empty_cache()
gc.collect()

106

In [129]:
from diffusers import StableDiffusionXLControlNetInpaintPipeline, ControlNetModel, StableDiffusionXLInpaintPipeline
import torch

outfit_pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.bfloat16,
    use_safetensors=True
).to("cuda")

# Load your LoRA weights
lora_weights = "/content/pytorch_lora_weights.safetensors"
outfit_pipe.load_lora_weights(lora_weights)

# Load IP adapter
outfit_pipe.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
# scale = 0.3
scale = {
    "down": {"block_2": [0.0, 1.0]},
    "up": {"block_0": [0.0, 1.0, 0.0]},
}
outfit_pipe.set_ip_adapter_scale(scale)

# Now load the IP Adapter image
ip_adapter_image = Image.open("/content/outfit_4.png")  # Outfit control image

inpainting_prompt = (
    "photo of sks character wearing only black shirt and no hat, "
    "modern cartoon style, happy,"
    "wearing only t-shirt, wearing only plain t-shirt, wearing only simple t-shirt, connected body parts "
    "jeans, white shoes, good anatomy, show waist and up"
    "good face, fit borders, smooth edges, detailed character design"
)

inpainting_negative_prompt = (
    "low-quality, blurry, deformed, disfigured, extra limbs, extra fingers,"
    "cropped, watermark, text, signature, nsfw, bad anatomy, unnatural proportions, "
    "distorted face, closed eyes, outdated style, monochrome, "
    "overexposed, underexposed, grainy, pixelated, poor lighting, artifacts, "
    "duplicate, ugly, tiling, mutated hands, long neck, disconnected limbs, multiple characters, bad eyes"
    "jacket, coat, sweater, blazer, outerwear, wearing hat"
    )

# Perform inpainting
with torch.no_grad():
  output_image = outfit_pipe(
      prompt=txt2img_prompt + "wearing black t-shirt",
      num_inference_steps=100,
      generator=generator,
      image=initial_img,
      mask_image=clothing_mask_image,
      ip_adapter_image=ip_adapter_image,
      strength = 0.7,
      height=1024,
      width=1024,
      guidance_scale=10.5,
      negative_prompt=inpainting_negative_prompt,
  ).images[0]

# Save or display the result
output_image.save("output_character_in_outfit.png")

plt.imshow(output_image)
plt.axis("off")
plt.show()


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/70 [00:00<?, ?it/s]