In [None]:
# Install the libraries this notebook depends on (-q keeps pip's output quiet)
!pip install -q torch transformers accelerate Pillow

In [None]:
# # Hugging Face へのログイン（必要に応じて）
# from huggingface_hub import notebook_login
# notebook_login()

In [None]:
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch
from PIL import Image
import gc
import os
import zipfile

from google.colab import files, drive

In [None]:
class CaptionGenerator:
    """Caption images with an InstructBLIP model and save each caption as a text file.

    Attributes:
        device: "cuda" when a GPU is available, otherwise "cpu".
        processor: InstructBlipProcessor loaded from ``model_name``.
        model: InstructBlipForConditionalGeneration loaded in float16 and moved to ``device``.
        batch_size: Number of images sent through the model per generation call.
        prompt: Instruction text sent to the model together with every image.
    """

    def __init__(self, model_name="Salesforce/instructblip-vicuna-7b", batch_size=1):
        """Load the processor and model.

        Args:
            model_name: Hugging Face model identifier to load.
            batch_size: Number of images per generation call; must be >= 1.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        # Fail before the (slow) model download/load if the batch size is unusable.
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = InstructBlipProcessor.from_pretrained(model_name)
        self.model = InstructBlipForConditionalGeneration.from_pretrained(
            model_name, torch_dtype=torch.float16
        ).to(self.device)
        self.batch_size = batch_size
        self.prompt = """
            Describe the person in the image in detail.
            Focus on the following aspects:
            - Face: Describe their facial features, expression, and any visible emotions.
            - Clothing: Describe what they are wearing, including the type of clothing, color, and style.
            - Body: Describe their body shape, posture, and any notable physical characteristics.
            - Style tags: Provide relevant style tags that describe the overall aesthetic of the image (e.g., photorealistic, anime, painting, etc.).
            """

    def generate_caption(self, image_paths, output_dir="output_captions"):
        """Generate a caption for each image and save it as ``<image stem>.txt``.

        Images are opened lazily, one batch at a time, so only ``batch_size``
        decoded images are held in memory at once. Files that cannot be opened
        are reported and skipped.

        Args:
            image_paths: Iterable of image file paths.
            output_dir: Directory that receives the caption text files
                (created if missing).

        Returns:
            A list of ``(caption, filename)`` tuples in processing order, where
            ``caption`` is exactly the text written to disk. The list is empty
            when no image could be opened, so callers can always unpack pairs.
        """
        os.makedirs(output_dir, exist_ok=True)

        results = []
        pending_images = []
        pending_filenames = []

        for image_path in image_paths:
            try:
                # convert() returns an independent copy, so the file handle
                # can be closed as soon as the with-block ends.
                with Image.open(image_path) as opened:
                    rgb_image = opened.convert("RGB")
            except Exception as e:
                # Best effort: report the unreadable file and keep going.
                print(f"Error: Could not open image file: {image_path}")
                print(e)
                continue

            pending_images.append(rgb_image)
            pending_filenames.append(os.path.basename(image_path))

            if len(pending_images) >= self.batch_size:
                results.extend(
                    self._caption_batch(pending_images, pending_filenames, output_dir)
                )
                pending_images = []
                pending_filenames = []

        # Flush the final, possibly smaller, batch.
        if pending_images:
            results.extend(
                self._caption_batch(pending_images, pending_filenames, output_dir)
            )

        if not results:
            print("Error: Could not open any image files.")

        return results

    def _caption_batch(self, images, filenames, output_dir):
        """Run the model on one batch, write the caption files, return (caption, filename) pairs."""
        # The processor needs one prompt per image; a single string with several
        # images would produce mismatched text/image batch sizes.
        inputs = self.processor(
            images=images,
            text=[self.prompt] * len(images),
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(self.device, torch.float16)

        with torch.no_grad():
            outputs = self.model.generate(**inputs, max_new_tokens=500)

        decoded = self.processor.batch_decode(outputs, skip_special_tokens=True)

        pairs = []
        for filename, raw_caption in zip(filenames, decoded):
            # Drop the prompt if the decoded text echoes it back.
            caption = raw_caption.replace(self.prompt, "").strip()

            output_filename = os.path.splitext(filename)[0] + ".txt"
            output_path = os.path.join(output_dir, output_filename)
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(caption)
            print(f"Caption for {filename} saved to {output_path}")

            pairs.append((caption, filename))

        # Release per-batch tensors before the next batch is built.
        del inputs, outputs
        gc.collect()
        if self.device == "cuda":
            torch.cuda.empty_cache()

        return pairs

In [None]:
# Mount Google Drive unless it is already mounted.
# os.path.ismount is used instead of os.path.exists because the directory can
# exist (e.g. left over from an earlier session) without Drive being attached.
if not os.path.ismount("/content/drive"):
    drive.mount('/content/drive')

In [None]:
def _single_top_level_dir(member_names):
    """Return the one top-level directory that holds every archive member, else None."""
    roots = set()
    for member in member_names:
        head, sep, _ = member.partition("/")
        # macOS adds a __MACOSX metadata tree next to the real folder; ignore it.
        if head == "__MACOSX":
            continue
        # A member directly at the archive root means there is no wrapping folder.
        if not sep or head in ("", ".", ".."):
            return None
        roots.add(head)
    return roots.pop() if len(roots) == 1 else None


def select_local_folder(extract_dir="/content/uploaded_folder"):
    """Upload a zipped image folder from the local machine and extract it.

    Only the first uploaded file is used. The uploaded archive is deleted
    afterwards, whether or not extraction succeeded.

    Args:
        extract_dir: Directory the archive is extracted into.

    Returns:
        The directory that contains the extracted content. When the archive
        wraps everything in a single top-level folder (the usual result of
        zipping a folder), that inner folder is returned so its files are
        directly inside the returned path. Returns "" when nothing was
        uploaded or the upload is not a valid zip file.
    """
    uploaded = files.upload()
    if not uploaded:
        print("No files uploaded. Exiting.")
        return ""

    zip_file_name = next(iter(uploaded))
    try:
        if not zipfile.is_zipfile(zip_file_name):
            print(f"Error: {zip_file_name} is not a valid zip file.")
            return ""

        with zipfile.ZipFile(zip_file_name, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
            top_level = _single_top_level_dir(zip_ref.namelist())
    finally:
        # Remove the uploaded archive in every case so it does not linger.
        if os.path.exists(zip_file_name):
            os.remove(zip_file_name)

    if top_level:
        nested_dir = os.path.join(extract_dir, top_level)
        if os.path.isdir(nested_dir):
            return nested_dir

    return extract_dir

def select_drive_folder():
    """Ask the user for a Google Drive folder and return its absolute path.

    Surrounding whitespace is ignored. A path already located under the Drive
    mount point ("/content/drive") is returned unchanged, which also covers
    locations outside MyDrive. Any other input is treated as relative to
    "/content/drive/MyDrive".

    Returns:
        The absolute folder path, or "" when nothing was entered.
    """
    drive_root = "/content/drive"
    my_drive_root = "/content/drive/MyDrive"

    folder_path = input("Enter the path to the folder in your Google Drive: ").strip()
    if not folder_path:
        print("No path entered. Exiting.")
        return ""

    # Already an absolute path inside the mounted Drive: keep it as entered.
    if folder_path == drive_root or folder_path.startswith(drive_root + "/"):
        return folder_path

    # Otherwise anchor it under MyDrive; drop leading slashes so the result
    # does not contain "MyDrive//".
    return my_drive_root + "/" + folder_path.lstrip("/")

In [None]:
# Ask the user where the image folder comes from
choice = input("Choose the source for image folder:\n1. Local (upload zip file)\n2. Google Drive\nEnter choice (1 or 2): ").strip()

if choice == "1":
    folder_path = select_local_folder()
elif choice == "2":
    folder_path = select_drive_folder()
else:
    print("Invalid choice. Exiting.")
    # Raise SystemExit instead of calling exit(): exit() shuts the notebook
    # kernel down, while SystemExit only stops this cell.
    raise SystemExit

# The helpers return "" when nothing was selected; stop here instead of
# letting later cells fail on an empty path.
if not folder_path:
    print("No image folder selected. Exiting.")
    raise SystemExit

# Show the selected folder path
print(f"Selected folder path: {folder_path}")

In [None]:
# Collect the image files that sit directly inside the selected folder.
# The list is sorted because os.listdir returns entries in arbitrary order,
# which would make the processing order differ between runs.
IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
image_paths = sorted(
    os.path.join(folder_path, name)
    for name in os.listdir(folder_path)
    if name.lower().endswith(IMAGE_EXTENSIONS)
    and os.path.isfile(os.path.join(folder_path, name))
)
if not image_paths:
    print(f"Warning: no image files found in {folder_path}")

# Write captions to the output_captions folder of the project directory on Drive
output_dir = "/content/drive/MyDrive/Colab Notebooks/CaptionGen_GCE/output_captions"
os.makedirs(output_dir, exist_ok=True)  # create the directory if it does not exist

In [None]:
# Build the caption generator with the desired batch size
generator = CaptionGenerator(batch_size=1)  # adjust the batch size as needed

In [None]:
# Caption every collected image; each caption is also written as a text file in output_dir
captions_with_filenames = generator.generate_caption(image_paths, output_dir=output_dir)

In [None]:
# Print each generated caption next to the name of the image it belongs to
for caption, filename in captions_with_filenames:
    print(f"Caption for {filename}: {caption}")