In [1]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# ===============================
# 1. Load Phi-3-mini
# ===============================
model_name = "microsoft/phi-3-mini-4k-instruct"

print(" Loading Phi-3-mini model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# ===============================
# 2. Function to extract noun phrases
# ===============================
def get_entities(caption: str):
    if not caption or caption.strip() == "":
        return []

    messages = [
        {"role": "user", "content": (
            "Extract all the noun phrases in the given sentence. "
            "Return them separated by commas, without rephrasing or extra text. "
            "Only keep phrases that contain a noun. "
            f"\nSentence: {caption}\nEntities:"
        )}
    ]
    generation_args = {
        "max_new_tokens": 50,
        "return_full_text": False,
        "temperature": 0.0,
        "do_sample": False
    }
    try:
        output = pipe(messages, **generation_args)
        text = output[0]['generated_text'].strip()
        entities = [ent.strip() for ent in text.split(",") if ent.strip()]
        return entities
    except Exception as e:
        print("Error processing caption:", caption, e)
        return []

# ===============================
# 3. Function to process a single CSV
# ===============================
def process_csv(file_path: str, caption_column: str):
    df = pd.read_csv(file_path, sep=None, engine='python')  # Automatically detects separator
    print(f"🔹 Processing {file_path} ({len(df)} rows)")

    entities_list = []

    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = {executor.submit(get_entities, str(caption)): i for i, caption in enumerate(df[caption_column])}
        for future in tqdm(as_completed(futures), total=len(futures), desc="Extracting entities"):
            idx = futures[future]
            try:
                entities_list.append((idx, future.result()))
            except Exception as e:
                entities_list.append((idx, []))
                print("Error at row", idx, e)

    entities_list.sort(key=lambda x: x[0])
    df[f"{caption_column}_entities"] = [e for _, e in entities_list]

    out_csv = file_path.replace(".csv", "_entities.csv")
    df.to_csv(out_csv, index=False)
    print(f" Saved entities to {out_csv}")

# ===============================
# 4. Run for new files
# ===============================
base_path = "/content/drive/MyDrive/"  # change this if needed

files_to_process = {
    "mscoco_captions": {
        "file": base_path + "mscoco_captions.csv",
        "column": "mscoco_caption"
    },
    "sdxl": {
        "file": base_path + "sdxl_meta.csv",
        "column": "Meta Caption"
    },
    "sd2": {
        "file": base_path + "sd2_meta.csv",
        "column": "Meta Caption"
    },
    "fluxdev": {
        "file": base_path + "fluxdev_meta.csv",
        "column": "Meta Caption"
    },
}

for key, info in files_to_process.items():
    process_csv(info["file"], info["column"])


 Loading Phi-3-mini model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔹 Processing /content/drive/MyDrive/mscoco_captions.csv (200 rows)


Extracting entities:   1%|          | 2/200 [00:06<09:08,  2.77s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Extracting entities: 100%|██████████| 200/200 [02:15<00:00,  1.48it/s]


 Saved entities to /content/drive/MyDrive/mscoco_captions_entities.csv
🔹 Processing /content/drive/MyDrive/sdxl_meta.csv (200 rows)


Extracting entities: 100%|██████████| 200/200 [10:15<00:00,  3.08s/it]


 Saved entities to /content/drive/MyDrive/sdxl_meta_entities.csv
🔹 Processing /content/drive/MyDrive/sd2_meta.csv (196 rows)


Extracting entities: 100%|██████████| 196/196 [10:07<00:00,  3.10s/it]


 Saved entities to /content/drive/MyDrive/sd2_meta_entities.csv
🔹 Processing /content/drive/MyDrive/fluxdev_meta.csv (200 rows)


Extracting entities: 100%|██████████| 200/200 [10:02<00:00,  3.01s/it]

 Saved entities to /content/drive/MyDrive/fluxdev_meta_entities.csv



