In [None]:
!pip install kaggle
!pip install transformers
!pip install torch
!pip install diffusers accelerate peft datasets wandb ftfy tensorboard datasets

In [None]:
!git clone https://github.com/huggingface/diffusers
%cd /content/diffusers/examples/text_to_image

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login.
# Presumably required by the later `push_to_hub` call and the `--push_to_hub`
# training flag — confirm the token has write access.
notebook_login()

In [None]:
!mkdir /content/finetuningoutput

In [None]:
# Copy kaggle.json into ~/.kaggle and restrict it to owner read/write (chmod 600).
# NOTE(review): `kaggle.json` is a relative path, so it is looked up in the current
# working directory, which an earlier cell changes with `%cd` — confirm the file
# is uploaded there (or use an absolute path).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the markminerov/88500-car-images dataset with the Kaggle CLI.
# No output path is given, so the archive presumably lands in the current working
# directory as 88500-car-images.zip (the name the unzip cell expects) — verify.
!kaggle datasets download -d markminerov/88500-car-images


In [None]:
!unzip 88500-car-images.zip

In [None]:
import os
from PIL import Image
# Folder that holds the extracted car images.
dataset_dir = '/content/out'
print(os.listdir(dataset_dir)[:10])

# Load and display an image.
# `Image.show()` tries to launch an external viewer, which renders nothing inside
# a notebook; IPython's built-in `display` shows the picture inline instead.
image_path = os.path.join(dataset_dir, '1.jpg')
image = Image.open(image_path)
display(image)

In [None]:
import matplotlib.pyplot as plt
# `os.listdir` returns entries in arbitrary order. Sorting makes `files[i]` and the
# seeded `sample(..., random_state=42)` drawn from this list reproducible across runs.
files = sorted(os.listdir(dataset_dir))

# Display the first two images as a quick sanity check
for file in files[:2]:
    image_path = os.path.join(dataset_dir, file)
    image = Image.open(image_path)
    plt.imshow(image)
    plt.axis('off')  # Hide the axis
    plt.show()

In [None]:
!pip install accelerate diffusers

In [None]:
from transformers import AutoProcessor, LlavaForConditionalGeneration

In [None]:
# Load the LLaVA 1.5 (7B) processor and model from the same Hub checkpoint.
# NOTE(review): no dtype is passed, so the weights load in the library default
# precision — confirm the runtime has enough memory for a 7B model.
llava_checkpoint = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(llava_checkpoint)
model = LlavaForConditionalGeneration.from_pretrained(llava_checkpoint)

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU,
# then move the captioning model onto that device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
import torch
# Ask PyTorch to release cached GPU memory that it is not currently using.
# NOTE(review): this does not free memory held by live tensors such as the model
# weights — confirm it is still needed at this point.
torch.cuda.empty_cache()

In [None]:
# Smoke test: caption one image (index 15 of `files`) before running the batch loop.
image_path = os.path.join(dataset_dir, files[15])
image = Image.open(image_path)

# Chat-style prompt; `<image>` marks where the picture is inserted.
# This same `prompt` is reused by the batch captioning cells.
prompt = "USER: <image>\nDescribe what car is this including the brand, color and type. ASSISTANT:"

# Tokenise the text, preprocess the picture, and move every tensor to the model's device
encoded = processor(text=prompt, images=image, return_tensors="pt")
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

# Generate at most 50 new tokens and decode the first (only) sequence
output_ids = model.generate(**inputs, max_new_tokens=50)
decoded = processor.batch_decode(
    output_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
description = decoded[0]

print(description)

In [None]:
import pandas as pd
# One row per image file name, then a fixed-seed random subset of 200 rows
# re-indexed from 0 so positions line up with the captions generated later.
df = pd.DataFrame({'file_name': files})
sampled_df = (
    df
    .sample(n=200, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Folder containing the extracted images (same value as set near the top of the notebook).
dataset_dir = "/content/out"

In [None]:
import base64
import json
from io import BytesIO
def extract_assistant_message(text):
    """Return the text after the last "ASSISTANT:" marker, stripped of surrounding whitespace.

    When the marker is absent, the whole input is returned stripped.
    """
    _, _, reply = text.rpartition("ASSISTANT:")
    return reply.strip()

results = []

# Caption every sampled image and keep it next to a base64-encoded JPEG copy of the picture.
for file_name in sampled_df['file_name']:
    image_path = os.path.join(dataset_dir, file_name)
    image = Image.open(image_path)

    # Re-encode the picture as JPEG in memory and store it as base64 text
    jpeg_buffer = BytesIO()
    image.save(jpeg_buffer, format="JPEG")
    encoded_image = base64.b64encode(jpeg_buffer.getvalue()).decode('utf-8')

    # Generate caption for the image
    encoded_inputs = processor(text=prompt, images=image, return_tensors="pt")
    model_inputs = {name: tensor.to(device) for name, tensor in encoded_inputs.items()}
    output_ids = model.generate(**model_inputs, max_new_tokens=50)
    decoded = processor.batch_decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # Keep only the assistant's reply, dropping the echoed prompt
    results.append({
        'image': encoded_image,
        'text': extract_assistant_message(decoded[0]),
    })


In [None]:
# Preview a few captions only: echoing `results` in full would write every
# base64-encoded image into the cell output and bloat the notebook.
[entry['text'] for entry in results[:5]]

In [None]:
# These names were used without ever being imported.
from datasets import Dataset, Features, Value

# One row per captioned image: base64 JPEG text plus the generated description.
results_df = pd.DataFrame(results)

features = Features({
    'image': Value('string'),
    'text': Value('string')
})

# Build the dataset from `results_df`. The original passed `df`, which only has a
# `file_name` column and therefore does not match the declared features.
# NOTE(review): `image` is stored as a base64 string. Text-to-image training scripts
# typically expect a decoded image column — confirm the consumer can handle this schema.
dataset_new = Dataset.from_pandas(results_df, features=features)
print(dataset_new)

In [None]:
# Upload `dataset_new` to the Hugging Face Hub under this repo id; the training
# command later in the notebook reads the dataset back by the same id.
# NOTE(review): the repo name says "150" while the sample above draws n=200 — confirm.
dataset_new.push_to_hub("Vibhav99/150-sampled-car-images")

In [None]:
# Initialize lists to store results: file names and the raw decoded model output
# (prompt included) for each sampled image.
file_names = []
descriptions = []

for index, row in sampled_df.iterrows():
    image_path = os.path.join(dataset_dir, row['file_name'])
    image = Image.open(image_path)

    inputs = processor(text=prompt, images=image, return_tensors="pt")
    # The model was moved to `device`, so the inputs must live there too. Without
    # this, generation fails with a device mismatch when running on GPU.
    inputs = {key: value.to(device) for key, value in inputs.items()}

    generate_ids = model.generate(**inputs, max_new_tokens=50)
    description = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    file_names.append(row['file_name'])
    descriptions.append(description)


In [None]:
# Work on a copy: the original `ft_df = results_df` only aliased the frame, so the
# assignment below silently rewrote `results_df` as well.
ft_df = results_df.copy()

# Keep only the assistant's reply in the 'text' column. This reuses the
# `extract_assistant_message` helper defined alongside the caption loop instead of
# redefining an identical function here.
ft_df['text'] = ft_df['text'].apply(extract_assistant_message)

In [None]:
# Helper that renders one image with its caption as the figure title
def display_image_with_description(image_path, description):
    """Open the image at `image_path` and show it with `description` as the title.

    The axes are hidden so only the picture and its title are drawn.
    """
    picture = Image.open(image_path)
    plt.imshow(picture)
    plt.axis('off')  # Hide the axis
    plt.title(description)
    plt.show()

# Display rows 4 and 5 of the captioned sample together with their descriptions.
for index, row in ft_df[4:6].iterrows():
    # `row['image']` holds base64-encoded JPEG text, not a file name, so it cannot
    # be joined onto `dataset_dir`. The captions were generated in `sampled_df`
    # order (both frames are indexed 0..n-1), so the same index gives the file name.
    image_path = os.path.join(dataset_dir, sampled_df.loc[index, 'file_name'])
    display_image_with_description(image_path, row['text'])

In [None]:
# Destination CSV inside Google Drive.
# NOTE(review): nothing in this notebook mounts Drive — confirm '/content/drive'
# is mounted before this cell runs.
save_path = '/content/drive/My Drive/PixArt_FT_cars.csv'

# Write the frame without the index column, then report where it went
ft_df.to_csv(path_or_buf=save_path, index=False)
print("Dataset with descriptions saved to '{}'".format(save_path))

Loading PixArt-alpha

In [None]:
import torch
from diffusers import PixArtAlphaPipeline
# Load the 512x512 PixArt-alpha pipeline in half precision and place it on the GPU in one step
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-512x512",
    torch_dtype=torch.float16,
).to("cuda")
# With torch < 2.0, memory-efficient attention can be switched on via:
# pipe.enable_xformers_memory_efficient_attention()

Fine-tuning job with Stable Diffusion

In [None]:
# Launch LoRA fine-tuning of runwayml/stable-diffusion-v1-5 on the pushed car dataset
# through `accelerate` with fp16 mixed precision. Checkpoints are written every 500
# steps to /content/finetuningoutput, metrics go to Weights & Biases, and the result
# is pushed to the Hub.
# The script path is relative, so this relies on the working directory set by the
# earlier `%cd` into diffusers/examples/text_to_image.
# NOTE(review): the validation prompt repeats the captioning instruction given to
# LLaVA; a text-to-image validation prompt presumably should read like a caption
# (e.g. a description of a car) — confirm.
# NOTE(review): the dataset's `image` column was built as base64 strings
# (Value('string')) — verify the training script can consume that column as-is.
# NOTE(review): `--report_to=wandb` presumably needs a wandb login, which this
# notebook never performs — confirm.
!accelerate launch --mixed_precision="fp16"  train_text_to_image_lora.py \
  --pretrained_model_name_or_path="runwayml/stable-diffusion-v1-5" \
  --dataset_name='Vibhav99/150-sampled-car-images' \
  --resolution=512 --center_crop --random_flip \
  --train_batch_size=1 \
  --gradient_accumulation_steps=4 \
  --max_train_steps=15000 \
  --learning_rate=1e-04 \
  --max_grad_norm=1 \
  --lr_scheduler="cosine" --lr_warmup_steps=0 \
  --output_dir="/content/finetuningoutput" \
  --push_to_hub \
  --report_to=wandb \
  --checkpointing_steps=500 \
  --validation_prompt="describe what car is this including the brand, color and type." \
  --seed=1337

In [None]:
# Keep a local copy of the fine-tuning data next to the training output.
# The original referenced `dataset`, which is never defined in this notebook;
# the Dataset object built earlier is `dataset_new`.
dataset_new.save_to_disk('/content/finetuningoutput/Fine-tuning-data')