In [1]:
import base64
import csv
import os
import requests
import cv2
import pandas as pd

def get_caption(frame, prompt, api_key):
    """Caption a video from sampled frames via the OpenAI chat-completions API.

    Args:
        frame: Sequence of base64-encoded JPEG strings in chronological order.
            Each element is sent as one ``image_url`` entry, so any number of
            frames (not only three) is supported.
        prompt: Instruction text placed ahead of the images.
        api_key: OpenAI API key, sent as a Bearer token.

    Returns:
        The content of the first returned choice, with newlines replaced by
        single spaces.

    Raises:
        ValueError: If ``frame`` is empty.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    if len(frame) == 0:
        raise ValueError("get_caption needs at least one base64-encoded frame")

    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}

    # One text part followed by one image part per frame.
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}}
        for encoded in frame
    )

    payload = {
        "model": "gpt-4-vision-preview",
        "messages": [{"role": "user", "content": content}],
        "max_tokens": 300,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=60,
    )
    # Surface API errors (bad key, rate limit, retired model) as an HTTPError
    # instead of an opaque KeyError on the missing "choices" field.
    response.raise_for_status()
    caption = response.json()["choices"][0]["message"]["content"]
    return caption.replace("\n", " ")

def get_video_length(cap):
    """Return the frame count reported by an opened capture object, as an int."""
    frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    return int(frame_count)

def extract_frames(video_path, points=(0.2, 0.5, 0.8), base_64=False):
    """Sample frames from a video at fractional positions of its length.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        points: Fractions of the total frame count at which to grab a frame.
        base_64: When True each frame is JPEG-encoded and returned as a
            base64 string; otherwise each frame is returned as an RGB
            ``PIL.Image``.

    Returns:
        ``(frames, length)`` where ``length`` is the reported frame count.
        ``frames`` is empty when the video reports fewer than three frames.

    Raises:
        RuntimeError: If a frame cannot be read or JPEG-encoded.
    """
    if not base_64:
        # Image is not imported in this cell; import locally so the PIL branch
        # works on a fresh kernel.
        from PIL import Image

    cap = cv2.VideoCapture(video_path)
    try:
        length = get_video_length(cap)
        frames = []
        if length < 3:
            return frames, length

        for fraction in points:
            # Clamp so a fraction of 1.0 (or rounding) cannot seek past the end.
            frame_index = min(max(int(length * fraction), 0), length - 1)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = cap.read()
            if not ret:
                raise RuntimeError(
                    f"Could not read frame {frame_index} of {length} from {video_path!r}"
                )
            if base_64:
                encoded_ok, buffer = cv2.imencode(".jpg", frame)
                if not encoded_ok:
                    raise RuntimeError(
                        f"Could not JPEG-encode frame {frame_index} from {video_path!r}"
                    )
                frame = base64.b64encode(buffer).decode("utf-8")
            else:
                # OpenCV decodes to BGR; PIL expects RGB.
                frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frames.append(frame)
        return frames, length
    finally:
        # Always free the decoder/file handle, including on the early return.
        cap.release()


def to_base64(image):
    """Serialize an image to JPEG in memory and return it as a base64 string.

    Args:
        image: Object exposing ``save(file_obj, format=...)`` (e.g. a PIL image).

    Returns:
        The UTF-8 decoded base64 encoding of the JPEG bytes.
    """
    # BytesIO is not imported anywhere else in the notebook; import it here so
    # the function does not raise NameError.
    from io import BytesIO

    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")



def gpt4v_inference(path_video, prompt, api_key=None):
    """Send frames sampled from a video plus a prompt to the OpenAI chat API.

    Args:
        path_video: Path of the video to sample frames from.
        prompt: Instruction text placed ahead of the images.
        api_key: OpenAI API key. When omitted, it is read from the
            ``OPENAI_API_KEY`` environment variable so that no credential is
            stored in the notebook.

    Returns:
        ``(predict, response)``: the first choice's message content with
        newlines replaced by spaces, and the raw ``requests`` response.

    Raises:
        RuntimeError: If no API key is available.
        ValueError: If no frames could be sampled from the video.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    if api_key is None:
        api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No OpenAI API key: set the OPENAI_API_KEY environment variable or pass api_key"
        )

    # Extract frames from video
    frames, length = extract_frames(path_video, base_64=True)
    if len(frames) == 0:
        # extract_frames returns an empty list for videos with fewer than three frames.
        raise ValueError(
            f"No frames could be sampled from {path_video!r} (reported frame count: {length})"
        )

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    # One text part followed by one image part per sampled frame.
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}}
        for encoded in frames
    )

    payload = {
        "model": "gpt-4-turbo",
        "messages": [{"role": "user", "content": content}],
        "max_tokens": 300,
    }

    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=60,  # do not let a stalled connection hang the notebook
    )
    # Fail with a clear HTTPError rather than a KeyError on a missing "choices".
    response.raise_for_status()
    predict = response.json()["choices"][0]["message"]["content"]
    predict = predict.replace("\n", " ")
    return predict, response


In [2]:
# Single hand-picked clip used to smoke-test the classification prompt.
# NOTE(review): absolute, machine-specific path — will not resolve elsewhere.
test_path_video = "/home/artmed/Documents/autism_dataset/Data_10s_clips/20th_BAM_Hospital_Playtime_New Toys_1.mp4"
# Annotation codes for that clip and, presumably, the same labels expressed as
# 1-based numbers from the candidate list in the prompt below. Neither list is
# used in this cell.
test_label = ['C+', 'O', 'Q+', 'S+', 'VI+']
test_num_label = [3, 8, 10, 11, 13]


# Multi-label classification prompt: the model is asked to reply only with the
# numbers of the matching behaviors, separated by commas.
prompt = '''
A video is given by providing three frames in chronological order. 
Please choose one or more appropriate interaction styles or behaviors in the video.
Please only reply with the numbers of the interaction styles or behaviors, separated by commas.
The candidates of the interaction styles or behaviors are as follows:
1. Appropriate verbal interactions
2. Parent affection
3. Positive contact
4. Complaint
5. Engaged activity of play
6. Multiple instruction
7. Non-compliance
8. Oppositional
9. Praise
10. Positive question
11. Positive social attention
12. Positive specific instruction
13. Positive vague instruction
'''

# gpt4v_inference returns the reply text (newlines replaced by spaces) and the
# raw HTTP response; the bare `predict` on the last line displays the reply.
predict, response = gpt4v_inference(test_path_video, prompt = prompt)
predict

'5, 11'

In [3]:
import re
from ast import literal_eval

# Number of validation clips to send to the API. Every clip is one API request,
# so the default keeps the original single-row smoke test; set to None to
# evaluate the whole validation set.
N_EVAL_VIDEOS = 1
# The prompt lists 13 candidate behaviors, numbered 1..13.
NUM_CLASSES = 13


def parse_label_numbers(reply):
    """Extract the integer label numbers from a model reply such as '5, 11'.

    Unlike ``list(literal_eval(reply))`` this also handles a single number
    ('5') and replies that wrap the numbers in extra text.
    """
    return [int(token) for token in re.findall(r"\d+", reply)]


def to_one_hot(numbers, num_classes=NUM_CLASSES):
    """Return a 0/1 list of length ``num_classes`` for 1-based label numbers."""
    return [1 if i in numbers else 0 for i in range(1, num_classes + 1)]


df_validation = pd.read_csv(r'validation.csv')
df_FOS_used_label = pd.read_csv(r'FOS_used_label.csv')
# Shift the label index by one so it lines up with the 1-based numbering in the prompt.
df_FOS_used_label['index'] += 1
dict_used_label = df_FOS_used_label.set_index('mid').to_dict()['index']

rows_to_eval = df_validation if N_EVAL_VIDEOS is None else df_validation.head(N_EVAL_VIDEOS)

# Collect rows first and build the frame once instead of growing it row by row.
result_columns = ['path_video', 'one_hot_predict', 'one_hot_labels', 'raw_predict', 'labels']
result_index = []
result_rows = []
for index, row in rows_to_eval.iterrows():

    # Get the path of the video at first
    test_path_video = row['path']

    # Get the labels of the video (stored in the CSV as the repr of a list)
    test_labels = literal_eval(row['used_label'])
    num_labels = [dict_used_label[label] for label in test_labels]
    one_hot_labels = to_one_hot(num_labels)

    # Get the prediction from GPT-4
    predict, response = gpt4v_inference(test_path_video, prompt = prompt)
    predict = parse_label_numbers(predict)
    one_hot_predict = to_one_hot(predict)

    result_index.append(index)
    result_rows.append(dict(zip(
        result_columns,
        [test_path_video, one_hot_predict, one_hot_labels, predict, test_labels],
    )))

df_gpt4_predict_label = pd.DataFrame(result_rows, index=result_index, columns=result_columns)
# test_path_video = "/home/artmed/Documents/autism_dataset/Data_10s_clips/20th_BAM_Hospital_Playtime_New Toys_1.mp4"
# test_label = ['C+', 'O', 'Q+', 'S+', 'VI+']
# test_num_label = [3, 8, 10, 11, 13]

In [4]:
# Display the collected predictions next to the ground-truth labels.
df_gpt4_predict_label

Unnamed: 0,path_video,one_hot_predict,one_hot_labels,raw_predict,labels
0,/home/artmed/Documents/autism_dataset/Data_10s...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0]","[0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1]","[5, 11]","[C+, O, Q+, S+, VI+]"


In [35]:
# Captioning prompt with three example descriptions.
# NOTE: this rebinds `prompt`, which the earlier cells use for the
# label-classification prompt; re-running those cells afterwards will send this text.
prompt = "A video is given by providing three frames in chronological order. Describe this video and its style to generate a description. Pay attention to all objects in the video. Do not describe each frame individually. Do not reply with words like 'first frame'. The description should be useful for AI to re-generate the video. The description should be less than six sentences. Here are some examples of good descriptions: 1. A stylish woman walks down a Tokyo street filled with warm glowing neon and animated city signage. She wears a black leather jacket, a long red dress, and black boots, and carries a black purse. She wears sunglasses and red lipstick. She walks confidently and casually. The street is damp and reflective, creating a mirror effect of the colorful lights. Many pedestrians walk about. 2. Several giant wooly mammoths approach treading through a snowy meadow, their long wooly fur lightly blows in the wind as they walk, snow covered trees and dramatic snow capped mountains in the distance, mid afternoon light with wispy clouds and a sun high in the distance creates a warm glow, the low camera view is stunning capturing the large furry mammal with beautiful photography, depth of field. 3. Drone view of waves crashing against the rugged cliffs along Big Sur's garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff's edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff’s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway."
# Read the key from the environment instead of committing a credential to the
# notebook. An unset variable yields an empty key, which the API will reject.
openAI_key = os.environ.get("OPENAI_API_KEY", "")
def get_caption(frame, prompt, api_key):
    """Request a caption for sampled video frames and return the raw JSON reply.

    NOTE: this redefines the ``get_caption`` from the first cell. Unlike that
    version it returns the decoded JSON body rather than the caption text, so
    the caller must pull out ``["choices"][0]["message"]["content"]`` itself.

    Args:
        frame: Sequence of base64-encoded JPEG strings in chronological order.
            Each element is sent as one ``image_url`` entry.
        prompt: Instruction text placed ahead of the images.
        api_key: OpenAI API key, sent as a Bearer token.

    Returns:
        The decoded JSON body of the chat-completions response.

    Raises:
        ValueError: If ``frame`` is empty.
        requests.HTTPError: If the API answers with a 4xx/5xx status.
    """
    if len(frame) == 0:
        raise ValueError("get_caption needs at least one base64-encoded frame")

    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"}

    # One text part followed by one image part per frame.
    content = [{"type": "text", "text": prompt}]
    content.extend(
        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}}
        for encoded in frame
    )

    payload = {
        "model": "gpt-4-turbo",
        "messages": [{"role": "user", "content": content}],
        "max_tokens": 300,
    }
    response = requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=payload,
        timeout=60,  # matches the timeout used by the first get_caption
    )
    # Raise on 4xx/5xx so callers do not index into an error payload.
    response.raise_for_status()
    return response.json()
# NOTE(review): `frame` is not assigned in any visible cell; presumably it holds
# the base64 frames from extract_frames(..., base_64=True) — confirm and define
# it here so the cell survives Restart & Run All.
caption = get_caption(frame, prompt, openAI_key)
caption = caption["choices"][0]["message"]["content"]
# str.replace returns a new string; rebind so the newline cleanup takes effect.
caption = caption.replace("\n", " ")
print(caption)

In this video, a child, dressed in a vibrant orange jacket and blue bottoms, is navigating a play area in an indoor setting while a woman, likely a caregiver, is present in the background. The child progresses through the room, which is filled with colorful children's furniture and various toys including a yellow ball, a blue play mat, and small cars. As the child moves, the focus subtly shifts between the bustling activities and the surrounding child-friendly environment. The atmosphere is casual and educational, subtly highlighting the dynamic interaction between the youthful exuberance of the child and the calm oversight by the adult. The video captures the essence of a day at a child-centric environment, emphasizing learning and play in a safe and nurturing space.


In [34]:
import base64
import csv
import cv2
from PIL import Image

def get_video_length(cap):
    """Report how many frames the given capture object says the video has."""
    reported = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    n_frames = int(reported)
    return n_frames

def extract_frames(video_path, points=(0.2, 0.5, 0.8), base_64=False):
    """Sample frames from a video at fractional positions of its length.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        points: Fractions of the total frame count at which to grab a frame.
        base_64: When True each frame is JPEG-encoded and returned as a
            base64 string; otherwise each frame is returned as an RGB
            ``PIL.Image``.

    Returns:
        ``(frames, length)`` where ``length`` is the reported frame count.
        ``frames`` is empty when the video reports fewer than three frames.

    Raises:
        RuntimeError: If a frame cannot be read or JPEG-encoded.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        length = get_video_length(cap)
        frames = []
        if length < 3:
            return frames, length

        for fraction in points:
            # Clamp so a fraction of 1.0 (or rounding) cannot seek past the end.
            frame_index = min(max(int(length * fraction), 0), length - 1)
            cap.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            ret, frame = cap.read()
            if not ret:
                raise RuntimeError(
                    f"Could not read frame {frame_index} of {length} from {video_path!r}"
                )
            if base_64:
                encoded_ok, buffer = cv2.imencode(".jpg", frame)
                if not encoded_ok:
                    raise RuntimeError(
                        f"Could not JPEG-encode frame {frame_index} from {video_path!r}"
                    )
                frame = base64.b64encode(buffer).decode("utf-8")
            else:
                # OpenCV decodes to BGR; PIL expects RGB.
                frame = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            frames.append(frame)
        return frames, length
    finally:
        # Always free the decoder/file handle, including on the early return.
        cap.release()

'A young child, dressed in an orange jacket and pink pants, is captured bustling around a vibrantly colored nursery room filled with toys and kid-sized furniture. Accompanied by an adult woman in a dark sweater and glasses, who seems engaged in a task, the video focuses on the child who explores various toys and activities casually scattered around the room. The setting is informal and cozy with soft ambient lighting, highlighting the sense of a safe and nurturing play environment. As the child moves energetically, the camera follows, blurring slightly due to the quick movement, creating a dynamic and lively atmosphere in the footage. The video captures the essence of childhood curiosity and the warmth of a caretaker’s supervision.'

In [None]:
import os

def get_filelist(file_path):
    """Return the joined path of every file found by walking ``file_path``.

    Paths are listed in the order ``os.walk`` yields them; directories
    themselves are not included.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(file_path)
        for name in names
    ]

# Append one CSV row (path, class, frame count) per file under the UCF101 root.
# NOTE(review): absolute, machine-specific dataset path.
paths_video = get_filelist('/home/artmed/PycharmProjects/dataset/UCF101')

# The with-block guarantees the rows are flushed and the handle closed;
# newline="" is what the csv module asks for on files given to csv.writer.
with open("samples_training", "a", newline="") as f:
    writer = csv.writer(f)
    for path_video in paths_video:
        cap = cv2.VideoCapture(path_video)
        try:
            length = get_video_length(cap)
        finally:
            # Release each capture so handles do not pile up across the dataset.
            cap.release()
        # The class name is the first directory component after ".../UCF101/".
        class_video = path_video.split('/UCF101/')[1].split('/')[0]
        writer.writerow([path_video, class_video, length])
