# In [None]:
import os

import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Hugging Face access token, read from the HF_TOKEN environment variable so
# that no credential has to be written into the source. When the variable is
# unset this is None and the loaders below behave as if no token was passed.
hf_token = os.environ.get("HF_TOKEN")
# ===============================
# 1. Load Phi-3-mini-4k
# ===============================
model_name = "microsoft/phi-3-mini-4k-instruct"

# Causal language model used for generation.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",   # GPU if available
    torch_dtype="auto",  # let the library choose the dtype from the checkpoint
    token=hf_token,
)
# Tokenizer matching the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)

# Text-generation pipeline shared by the extraction function below.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# ===============================
# 2. Function: extract noun phrases
# ===============================
def get_entities(caption: str) -> list:
    """Extract the noun phrases of a caption with the chat pipeline.

    The caption is embedded in a single user message that asks for the noun
    phrases as a comma-separated list; the generated reply is then split on
    commas.

    Args:
        caption: Sentence to analyse. Values that are not strings (for
            example ``None`` or the float NaN pandas produces for an empty
            CSV cell) and blank strings are treated as "no caption".

    Returns:
        The stripped, non-empty comma-separated pieces of the model reply,
        in the order they were generated. An empty list when there is no
        caption text to analyse.
    """
    # Without this guard a missing value would be formatted into the prompt
    # as the literal text "nan"/"None" and sent to the model.
    if not isinstance(caption, str) or not caption.strip():
        return []

    messages = [
        {"role": "user", "content": (
            "Extract all the noun phrases in the given sentence. "
            "Return them separated by commas, without rephrasing or extra text. "
            "Only keep phrases that contain a noun. "
            f"\nSentence: {caption}\nEntities:"
        )}
    ]
    # Greedy decoding (do_sample=False). No temperature is passed: it only
    # applies when sampling is enabled, so supplying it alongside
    # do_sample=False is contradictory.
    generation_args = {
        "max_new_tokens": 50,
        "return_full_text": False,  # only the newly generated reply
        "do_sample": False,
    }
    output = pipe(messages, **generation_args)
    text = output[0]["generated_text"].strip()
    # Convert comma-separated string -> list, dropping empty pieces.
    return [piece.strip() for piece in text.split(",") if piece.strip()]

# ===============================
# 3. Load captions CSV
# ===============================
# Read the caption table from disk.
captions_path = "mscoco_captions.csv"
df = pd.read_csv(captions_path)

# Run get_entities on every value of the "mscoco_caption" column and keep the
# returned lists in a new "entities" column.
caption_series = df["mscoco_caption"]
df["entities"] = caption_series.apply(get_entities)

# ===============================
# 4. Save results
# ===============================
output_path = "mscoco_captions_entities.csv"
df.to_csv(output_path, index=False)  # index=False: do not write the row index
row_count = len(df)
print("Saved mscoco_captions_entities.csv with entities extracted for", row_count, "captions.")
