In [24]:
# https://huggingface.co/deepset/bert-large-uncased-whole-word-masking-squad2
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

# Question-answering pipelines already built in this kernel, keyed by model
# name. Building a pipeline loads the whole checkpoint, so it is done once per
# model instead of once per question. Re-running this cell resets the cache.
_QA_PIPELINE_CACHE = {}


def _load_qa_pipeline(model_name):
    """Return the question-answering pipeline for `model_name`, building it on first use."""
    if model_name not in _QA_PIPELINE_CACHE:
        _QA_PIPELINE_CACHE[model_name] = pipeline(
            'question-answering', model=model_name, tokenizer=model_name
        )
    return _QA_PIPELINE_CACHE[model_name]


def get_answer(question, context,
               model_name="deepset/bert-large-uncased-whole-word-masking-squad2"):
    """Answer `question` using the text in `context`.

    Args:
        question: The question to ask, as a string.
        context: The passage the pipeline searches for the answer.
        model_name: Hugging Face model id used for both the model and the
            tokenizer. Defaults to the checkpoint this notebook was written
            against.

    Returns:
        The 'answer' field of the pipeline's result.
    """
    nlp = _load_qa_pipeline(model_name)
    QA_input = {
        'question': question,
        'context': context
    }
    res = nlp(QA_input)
    return res['answer']

# This is where the CV model's output would go; a hard-coded label list stands in for it here.
objects = ['Dog', 'Ball', 'Person', 'House', 'Tree']
# Flatten the labels into one comma-separated string so they can be embedded in the context sentence.
objects_str = ', '.join(objects)

# Test the function: ask about the photograph, using a one-sentence context built from the labels.
question = 'What is in the photograph?'
context = f'The photograph contains {objects_str}'
# `answer` is kept as a notebook-level variable; the next cell reads it to build a sentence.
answer = get_answer(question, context)


In [25]:
# Make the answer more human-like
def humanize_answer(answer_text):
    """Turn a comma-separated list of objects into an English sentence.

    Each object is prefixed with "a ". How the objects are joined depends on
    how many there are:
      * none  -> "There is nothing in the photo."
      * one   -> "There is a X in the photo."
      * two   -> "There is a X and a Y in the photo."
      * more  -> "There is a X, a Y, and a Z in the photo."

    Args:
        answer_text: Object names separated by commas, e.g. "Dog, Ball, Tree".
            Whitespace around each name is ignored and empty entries are dropped.

    Returns:
        The formatted sentence as a string.
    """
    names = [part.strip() for part in answer_text.split(',')]
    phrases = [f'a {name}' for name in names if name]

    if not phrases:
        listing = 'nothing'
    elif len(phrases) == 1:
        # A lone object must not get the ", and a" joiner meant for the last of several.
        listing = phrases[0]
    elif len(phrases) == 2:
        listing = ' and '.join(phrases)
    else:
        listing = ', '.join(phrases[:-1]) + ', and ' + phrases[-1]
    return f'There is {listing} in the photo.'


human_like_answer = humanize_answer(answer)
print(human_like_answer)

There is a Dog, a Ball, a Person, a House, and a Tree in the photo.


GPT2

In [32]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# GPT-2 tokenizer/model pair already loaded in this kernel. Loading the
# checkpoint is slow, so it is done once rather than on every call.
# Re-running this cell resets the cache.
_GPT2_CACHE = {}


def _load_gpt2():
    """Return the (tokenizer, model) pair for 'gpt2', loading it on first use only."""
    if 'gpt2' not in _GPT2_CACHE:
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left')
        # GPT-2 ships without a pad token, so reuse the end-of-sequence token.
        # Assign directly instead of testing `tokenizer.pad_token` first:
        # reading the unset attribute is what emits the
        # "Using pad_token, but it is not set yet." message.
        tokenizer.pad_token = tokenizer.eos_token
        model = GPT2LMHeadModel.from_pretrained('gpt2')
        _GPT2_CACHE['gpt2'] = (tokenizer, model)
    return _GPT2_CACHE['gpt2']


def describe_image(labels, max_length=80, temperature=0.4, top_p=0.4):
    """Generate a short GPT-2 text describing a scene that contains `labels`.

    Args:
        labels: Iterable of object names (strings) to mention in the prompt.
        max_length: Forwarded to `model.generate` as `max_length`. Per the
            Transformers docs this bounds prompt plus generated tokens, so a
            long label list leaves less room for new text.
        temperature: Sampling temperature; raise it for more randomness.
        top_p: Nucleus-sampling threshold. E.g. 0.5 samples only from the
            smallest set of tokens whose cumulative probability exceeds 0.5.

    Returns:
        The decoded text (prompt followed by the continuation), with special
        tokens such as the end-of-sequence marker removed.

    Raises:
        ValueError: If `labels` is empty, since the prompt would be malformed.
    """
    labels = list(labels)
    # Validate before loading anything: an empty list would yield the prompt
    # "Describe a scene where a  are interacting with each other."
    if not labels:
        raise ValueError("labels must contain at least one label")

    tokenizer, model = _load_gpt2()

    # Join labels with commas and embed them in the prompt GPT-2 continues from.
    label_string = ", ".join(labels)
    prompt = f"Describe a scene where a {label_string} are interacting with each other."

    # Let the tokenizer produce the attention mask. Deriving it from
    # `input_ids != pad_token_id` is fragile because pad and EOS share an id.
    encoded = tokenizer(prompt, return_tensors='pt')

    outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        temperature=temperature,
        do_sample=True,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Decode the single generated sequence, dropping special tokens.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage: build a description from a hard-coded list of labels and print it.
labels = ["dog", "cat", "person", "girl", "boy", "house", "tree"]
print(describe_image(labels))


Using pad_token, but it is not set yet.


Describe a scene where a dog, cat, person, girl, boy, house, tree are interacting with each other.

The scene is a scene where a dog, cat, person, girl, boy, house, tree are interacting with each other.

The scene is a scene where a dog, cat, person, girl, boy, house, tree are interacting with each other
