In [1]:
import os
import yaml
import json
from pprint import pprint
from copy import deepcopy
import cv2
import numpy as np

from langchain_community.document_loaders import JSONLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

In [2]:
# Read the API credentials from the YAML file that sits next to the notebook directory.
with open("../creds.yaml", "r") as creds_file:
    creds = yaml.safe_load(creds_file)

# LangSmith tracing: constant switches first, then the key taken from the credentials file.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)
os.environ["LANGCHAIN_API_KEY"] = creds["LANGCHAIN"]["LANGCHAIN_API_KEY"]

# OpenAI client settings: API key plus the base URL all requests are sent to.
os.environ["OPENAI_API_KEY"] = creds["OPENAI"]["OPENAI_API_KEY"]
os.environ["OPENAI_BASE_URL"] = "https://pro.aiskt.com/v1"

In [3]:
# Group the LangSmith traces produced by this notebook under one project name.
os.environ["LANGCHAIN_PROJECT"] = "VAST Captions"

video_dir = "../inputs/val_set"
file_path = "../inputs/annotations/val_annotations.json"
with open(file_path, "r") as f:
    ann = json.load(f)

# Take the first entry of every segment's "cap" list, walking the two nested
# dict levels under ann["annotations"].
first_caps = (
    value["cap"][0]
    for elem in ann["annotations"].values()
    for value in elem.values()
)
# De-duplicate while keeping first-seen order. A set would also remove
# duplicates, but its iteration order for strings changes between interpreter
# runs (hash randomisation), which makes the order of the downstream LLM calls
# and of the saved JSON keys non-reproducible.
cap_list = list(dict.fromkeys(first_caps))

In [4]:
# Prompt asking the model to rewrite one question-style caption as three
# declarative variants, one per line. `{caption}` is the only template variable.
template = """
Introduction:
The user is creating ground truth captions for some MEDICAL video segments.
The original captions are formulated as some consumers' first aid, medical emergency, and medical educational questions.
Your task is to formulate these captions as declarative sentences and generate THREE different versions from different perspectives.

Objective
The final objective is to improve the generalization capabilities of a video captioning model by increasing the DIVERSITY of the ground truth captions.

Instructions:
1. Understand the Medical Question: Ensure that the core concerns of the patient, including symptoms and body parts, are preserved in all generated versions.
2. Reformulate the Sentence to be Declarative: Convert the original caption from a question format to a declarative sentence format. Ensure the declarative sentence describes a medical situation suitable for the context being questioned.
3. Diversify Perspectives: Vary the medical situation by considering different angles such as possible medical history, user's living habits, potential treatments or interventions.

Please separate new capations separated by newlines.

Examples:
Input: 
How to stretch the quadricep muscles to prevent arthritis?
Output:
Regularly stretching the quadricep muscles helps maintain joint flexibility and may reduce the risk of developing arthritis.
Performing quadricep stretches can improve muscle strength and support knee joint health, potentially preventing arthritis.
Engaging in daily quadricep stretches enhances muscle elasticity and may lower the likelihood of arthritis.

The original caption: {caption}
"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

# Chain: fill the prompt -> chat model (temperature 0.2) -> plain string reply.
rewrite_llm = ChatOpenAI(temperature=0.2)
generate_queries = prompt_perspectives | rewrite_llm | StrOutputParser()

In [5]:
# One chain invocation per unique caption; the raw string reply is stored
# under the original caption text.
gen_output = {}
for cap in cap_list:
    gen_output[cap] = generate_queries.invoke({"caption": cap})

# Save the caption -> raw reply mapping as JSON.
with open("vast_ann_backup.json", "w") as backup_file:
    json.dump(gen_output, backup_file, indent=4)

In [6]:
def get_video_duration(video_path):
    """Return the frame count and duration of a video file.

    Args:
        video_path: Path passed to ``cv2.VideoCapture``.

    Returns:
        A ``(frame_count, duration)`` tuple where ``duration`` is
        ``frame_count / fps``, or ``None`` (after printing an error) when the
        video cannot be opened or reports a non-positive FPS.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error: Could not open video.")
            return None
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # An FPS of 0 would raise ZeroDivisionError below; `not fps > 0` also
        # rejects NaN. Report it the same way as an unreadable video.
        if not fps > 0:
            print("Error: Could not read a valid FPS from video.")
            return None
        return frame_count, frame_count / fps
    finally:
        # Release the capture on every path, including failures and exceptions.
        cap.release()

In [7]:
# Load the saved caption -> raw LLM reply mapping.
with open("../captions_ann/vast_ann_backup_val.json", "r") as f:
    gen_output = json.load(f)

vast_ann = []
for vid_name, labels in ann["annotations"].items():
    video_path = os.path.join(video_dir, vid_name) + ".mp4"
    video_info = get_video_duration(video_path)
    # get_video_duration returns None when it cannot read the video; fail with
    # the offending path instead of an opaque "cannot unpack NoneType" error.
    if video_info is None:
        raise RuntimeError(f"Could not read video metadata: {video_path}")
    frame_count, duration = video_info
    # A zero (or NaN) duration would turn the timestamps below into NaN/inf,
    # which then become meaningless integers after the cast.
    if not duration > 0:
        raise RuntimeError(f"Video reports a non-positive duration: {video_path}")

    for i, elem in enumerate(labels.values()):
        # One caption per line of the LLM reply; strip whitespace and drop
        # blank lines so empty strings never become ground-truth captions.
        captions = [
            line.strip()
            for line in gen_output[elem["cap"][0]].split("\n")
            if line.strip()
        ]

        ann_i = {
            "video_id": video_path + f"@{i}",
            "caption": captions,
            # Rescale elem["z"] by frame_count / duration and truncate to int
            # (presumably seconds -> frame indices; confirm against the
            # annotation format).
            "timestamp": (np.array(elem["z"]) / duration * frame_count).astype(int).tolist(),
            "event_id": i,
            "duration": duration,
            "frame_count": frame_count,
        }
        vast_ann.append(ann_i)

# validation
with open("vast_ann_val.json", "w") as f:
    json.dump(vast_ann, f, indent=4)
# validation
with open("vast_annfile.json", "w") as f:
    json.dump(
        {
            "type": "caption",
            "annotations": vast_ann
        },
        f,
        indent=4
    )
# training
# NOTE(review): vast_ann is built from the validation annotations and videos
# configured earlier in this notebook, so this overwrites the training file
# with validation data. Confirm this write is intended before relying on it.
with open("vast_ann_train.json", "w") as f:
    json.dump(vast_ann, f, indent=4)