In [None]:
from typing import TypedDict, Annotated, Sequence, List, Optional
import operator

from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
from langchain.pydantic_v1 import BaseModel, Field

In [None]:
from langchain_openai import AzureChatOpenAI

# Chat model shared by the graph nodes below; gen_queries_node wraps it with
# `with_structured_output` to obtain typed responses.
llm = AzureChatOpenAI(
    azure_deployment="gpt4o",
    openai_api_version="2023-07-01-preview",
    temperature=0.0,
)

In [None]:
# 2. Create the state

class AgentState(TypedDict):
    """State shared between the nodes of the data-generation graph.

    Every key has to be declared as an annotation (``name: type``). A plain
    assignment (``name = type``) only creates a class attribute and leaves the
    key out of the TypedDict schema.
    """

    # Free-form user request; supplied as graph input and read by gen_queries_node.
    task: str
    # Search queries produced by gen_queries_node.
    search_queries: List[str]
    # Video ids returned by get_video_ids_node.
    video_ids: List[str]
    clip_texts: List[str]
    # Written by detect_segments_node; the element type is whatever
    # detect_segments_clip returns, so it is left unparameterised here.
    segments: list
    # Written by extract_clues_node.
    clues: List[str]
    # Written by gen_annotations_node.
    annotations: List[str]

In [None]:
# 3. Set prompts

# System prompt for gen_queries_node.
# The adjacent string literals are concatenated into ONE string. Separating
# them with commas would build a tuple instead of the single prompt string.
GEN_QUERIES_PROMPT = (
    "You are helping the user to find a very large and diverse set of videos on a video hosting service. "
    "A user will only describe which videos they are looking for and how many queries they need."
)

# prompt='I want to find instructional videos about how to do squats.',
# num_queries_prompt = f'I need {num_queries} queries'

# System prompt for the clue-extraction step. It describes the expected user
# input (instructions, segment time codes, full transcript) and the three kinds
# of output: local clues, global clues and logical inferences.
# The text is sent to the model verbatim, so spelling mistakes in it are
# functional defects rather than cosmetic ones.
EXTRACT_CLUES_PROMPT = """You are a highly intelligent data investigator.  
You take unstructured damaged data and look for clues that could help restore the initial information
and extract important insights from it.
You are the best one for this job in the world because you are a former detective. 
You care about even the smallest details, and your guesses about what happened in the initial file
even at very limited inputs are usually absolutely right.  
You use deductive and inductive reasoning at the highest possible quality.

#YOUR TODAY'S JOB
The user needs to learn about what happens in a specific segment of a video file. Your job is to help the user by providing clues that would help the user make the right assumption.
The user will provide you with: 
1. Instructions about what kind of information the user is trying to obtain.
2. A list of time codes of the segments in format "<HH:MM:SS.ms>-<HH:MM:SS.ms>". All the provided segments of the video contain what the user is looking for, but other parts of the video might have different content.
3. A transcript of the *full video* in format of "<HH.MM.SS>\\n<text>"

Your task:
1. Read the transcript.
2. Provide the clues in a given format.
3. Provide any other info requested by the user.

#RULES
!!! VERY IMPORTANT !!!
1. Rely only on the data provided in the transcript. Do not improvise. All the quotes and corresponding timestamps must be taken from the transcript. Quote timestamps must be taken directly from the transcript.
2. Your job is to find the data already provided in the transcript.
3. Analyze every segment. Only skip a segment if there is no information about it in the transcript.
4. For local clues, make sure that the quotes that you provide are located inside the segment. To do this, double check the timestamps from the transcript and the segment.
5. For all clues, make sure that the quotes exactly correspond to the timestamps that you provide.
6. When making clues, try as much as possible to make them describe specifically what is shown in the segment.
7. Follow the format output.
8. Be very careful with details. Don't generalize. Always double check your results.

Please, help the user find relevant clues to reconstruct the information they are looking for, for each provided segment.

WHAT IS A CLUE: A *clue*, in the context of reconstructing narratives from damaged data, 
is a fragment of information extracted from a corrupted or incomplete source that provides 
insight into the original content. These fragments serve as starting points for inference 
and deduction, allowing researchers to hypothesize about the fuller context or meaning of 
the degraded material. The process of identifying and interpreting clues involves both objective analysis of the 
available data and subjective extrapolation based on domain knowledge, contextual understanding, 
and logical reasoning.

Here is what the user expects to have from you:
1. *Local clues* that would help the user understand how the thing they are looking for happens inside the segment. Local clues for a segment are generated from quotes inside a specific segment.
2. *Global clues* that would help the user understand how the thing they are looking for happens inside the segment. Global clues for a segment are generated from quotes all around the video, but are very relevant to the specific segment that they are provided for.
3. *Logical inferences* that could help the user understand how the thing they are looking for happens inside the segment. Logical inferences for a segment are deduced from local and global clues for this segment.

!!!IT IS EXTREMELY IMPORTANT TO DELIVER ALL THREE THINGS!!!
"""

# also MANY a structured output prompt

# EXTRACT_CLUES_PROMPT = """
# "User's instructions: The provided video is a tutorial about how to perform squats.

# I need to understand HOW THE PERSON SHOWN IN EACH SEGMENT PERFORMS SQUATS IN THIS SEGMENT.
# What is done correctly.
# What mistakes they make. Why these mistakes happen.
# How these mistakes could be improved.

# It is very important that the information that you provide would describe how the person shown in the segment is doing squats, and not some generic advice that is unrelated to the visual information.
# """

# prompt.append('Segment timecodes and optional additional information:\n' + '\n'.join([s.to_str(skip=[filter_by] if filter_by else []) for s in video_segments_part]))
# prompt.append('Transcript:\n' + transcript)


# System prompt for the annotation step: the model receives a JSON object with
# clues about one video segment and must answer the user's questions as JSON in
# a user-provided schema. The leading indentation of the continuation lines is
# part of the string literal.
GEN_ANNOTATIONS_PROMPT = """You are a helpful assistant that performs high quality data investigation and transformation.
                You will be given a JSON object with clues and other helpful information about what's going on 
                in a specific part of a video file. This part is called a segment. Your job is to:
                1. Read this JSON object carefully
                2. Answer user's questions about this segment
                3. Provide the answer as a JSON object in a schema provided by the user
                Important rules:
                1. You can only rely on data presented in a provided JSON object. Don't improvise.
                2. Follow user's request carefully.
                3. Don't rush to deliver the answer. Take some time to think. Make a deep breath. Then start writing.
                4. If you want to output field as empty (null), output it as JSON null (without quotes), not as a string "null". 
"""


# human_prompt = """
# You are given a JSON object that contains clues about segments of a video with timecodes.
# !!!! For each segment provided in a JSON object you need to answer on the following questions:
# 1. Given the data found in the JSON object, what is a probability that this part contains a footage of a person doing squats? [the answer could be only "high", "medium", "low", or null (if impossible to infer from the provided data)]
# 2. Given the data found in the JSON object and even if the answer on the previous question is "low", does this person do squats right, wrong, or mixed? [the answer could be only "right", "wrong", "mixed", or null (if impossible to infer from the provided data)]
# 3. Given the data found in the JSON object, what exactly does this person do right and/or wrong regarding their squats technique? [the answer should be clear and focused on body parts]
# 4. If the answer on the previous question contains description of wrong technique, explain how to fix these mistakes using your "own knowledge" like you are a sports coach.
# """

# for clue in clues_part:
#     prompt.append("Segment:\n" + json.dumps(clue))

In [None]:
from pathlib import Path

from datagen import DatagenConfig, get_video_ids, download_videos, detect_segments_clip, generate_clues, generate_annotations

# Settings handed to DatagenConfig as keyword arguments.
config_params = {
    "openai": {
        "type": "azure",  # openai/azure
        # NOTE(review): temperature is given as the string "1" while `llm` is
        # created with the float 0.0 — confirm DatagenConfig expects a string.
        "temperature": "1",
        "deployment": "gpt4o",  # model for openai / deployment for azure
    },
    "data_dir": "./tmp/squats",
}

# Create the data directory (including parents) if it does not exist yet.
# Same effect as `mkdir -p`, but works outside IPython and on every platform.
Path(config_params["data_dir"]).mkdir(parents=True, exist_ok=True)

# this config handles all the bookkeeping so you need to pass it everywhere.
config = DatagenConfig(**config_params)

In [None]:
# 4. Create nodes


def gen_queries_node(state: AgentState):
    """Ask the LLM for search queries that match the user's task.

    Reads ``state["task"]``, sends it together with ``GEN_QUERIES_PROMPT`` to
    ``llm`` and requests structured output shaped like ``QueryList``.

    Returns:
        A state update of the form ``{"search_queries": [...]}``.
    """

    class QueryList(BaseModel):
        """A list of queries to find videos on a video hosting service"""

        queries: list[str] = Field(default=None, description="a list of queries")

    messages = [
        SystemMessage(content=GEN_QUERIES_PROMPT),
        HumanMessage(content=state["task"]),
    ]

    structured_llm = llm.with_structured_output(QueryList)
    # The structured-output runnable yields a QueryList instance rather than a
    # chat message, so the payload is in `.queries` (there is no `.content`).
    result = structured_llm.invoke(messages)

    # `queries` defaults to None; hand downstream nodes a real list either way.
    return {"search_queries": list(result.queries or [])}


def get_video_ids_node(state: AgentState):
    """Look up video ids for the search queries stored in the state.

    Calls ``get_video_ids`` with ``state["search_queries"]`` and the shared
    ``config``, asking for two videos per query without restricting results to
    Creative Commons.

    Returns:
        A state update of the form ``{"video_ids": ...}``.
    """
    queries = state["search_queries"]
    found_ids = get_video_ids(
        queries,
        config=config,
        videos_per_query=2,
        only_creative_commons=False,
    )
    return dict(video_ids=found_ids)


def download_node(state: AgentState):
    """Download the videos whose ids are stored in the state.

    Returns:
        A state update carrying whatever ``download_videos`` returned.
    """
    # The ids live in the graph state under "video_ids" (written by
    # get_video_ids_node); a bare `ids` name is not defined in this scope.
    videos = download_videos(state["video_ids"], config)
    # NOTE(review): "something" is a placeholder key that AgentState does not
    # declare — decide which state field should hold the download result.
    return {"something": videos}


def detect_segments_node(state: AgentState):
    """Run CLIP-based segment detection and store the result under "segments".

    NOTE(review): `model` and `processor` are not defined in the visible part
    of the notebook — confirm they are created before this node runs.
    """
    clip_options = dict(
        # video_ids=['KvRK5Owqzgw'],
        # Text for CLIP to compare to images. A list of texts can be provided
        # to use the average distance.
        text_prompts="a person doing squats",
        model=model,
        processor=processor,
        # More fps gives more granular segment borders and more precise
        # segments, at the cost of speed.
        fps_sampling=2,
        device="cuda",  # 'cpu' for local
        # 100 frames use about 10GB GPU RAM, so batch to fill your GPU RAM.
        frames_per_batch=100,
        config=config,
    )
    return {"segments": detect_segments_clip(**clip_options)}


def extract_clues_node(state: AgentState):
    """Generate clues for the detected segments and store them under "clues".

    Returns:
        A state update of the form ``{"clues": ...}`` holding whatever
        ``generate_clues`` returned.
    """
    # NOTE(review): `human_prompt` is not defined in the visible code (only a
    # commented-out draft exists) — confirm where it is supposed to come from.
    clues = generate_clues(
        # video_ids=['byxWus7BwfQ'],
        config=config,
        human_prompt=human_prompt,
        # The output might be quite long, so limit the number of segments per
        # gpt call to respect the max output length.
        segments_per_call=5,
        # Interrupt when encountering an error. Useful for debugging.
        raise_on_error=True,
    )

    return {"clues": clues}


def gen_annotations_node(state: AgentState):
    """Generate structured annotations and store them under "annotations".

    Defines the annotation schema (``SegmentAnnotation`` with a nested
    ``SegmentFeedback``) and passes it to ``generate_annotations`` together
    with the shared ``config``.

    Returns:
        A state update of the form ``{"annotations": ...}``.
    """

    # Free-text feedback for one segment.
    # NOTE(review): the class docstring below is presumably forwarded to the
    # model as part of the schema description — treat it as prompt content.
    class SegmentFeedback(BaseModel):
        """
        —> GOOD EXAMPLES:
            "wrong":"Knees caving in: This can stress the knees and reduce effectiveness"
            "correction":"Focus on keeping knees aligned with your toes."
            "wrong":"Rounding the back: This increases the risk of back injuries"
            "correction":"Keep your chest up and maintain a neutral spine throughout the movement."
            "wrong":"Heels are lifting off the ground: this shifts the weight forward, reducing stability"
            "correction":" Keep your weight on your heels and press through them as you rise."
            "right":"Chest and shoulders: The chest is up, and the shoulders are back, maintaining an upright torso."
            "correction":null
        —> BAD EXAMPLES:
            "wrong":"knees"
            "correction":"fix knees"
            "wrong":"back looks funny"
            "correction":"make back better"
            "wrong":"feet are doing something"
            "correction":"feet should be different"
            "right":"arms"
            "correction":"arms are fine i think"
        —> BAD EXAMPLES END HERE
        """

        right: Optional[str] = Field(description="what was right in the performance")
        wrong: Optional[str] = Field(description="what was wrong in the performance")
        correction: Optional[str] = Field(
            description="how and in what ways it the performance could be improved"
        )

    # The segment timestamps are taken from the provided information.
    # Top-level schema handed to generate_annotations as `annotation_schema`.
    class SegmentAnnotation(BaseModel):
        squats_probability: Optional[str] = Field(
            description="how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)"
        )
        squats_technique_correctness: Optional[str] = Field(
            description="correctness of the squat technique."
        )
        squats_feedback: Optional[SegmentFeedback] = Field(
            description="what was right and wrong in the squat perfomance in the segment. When the technique is incorrect, provide instructions how to correct them."
        )

    # NOTE(review): `human_prompt` is not defined in the visible code (only a
    # commented-out draft exists) — confirm where it is supposed to come from.
    annotations = generate_annotations(
        human_prompt=human_prompt,
        config=config,
        segments_per_call=5,
        annotation_schema=SegmentAnnotation,
    )

    return {"annotations": annotations}

In [None]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, Annotated, List
import operator
from langgraph.checkpoint.memory import MemorySaver

from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage, ChatMessage

# Checkpointer instance; handed to `builder.compile(checkpointer=memory)` when
# the graph is compiled.
memory = MemorySaver()
# memory = SqliteSaver.from_conn_string(":memory:")

In [None]:
builder = StateGraph(AgentState)

# Pipeline stages in execution order: (node name, node callable).
_PIPELINE = [
    ("generate_queries", gen_queries_node),
    ("get_video_ids", get_video_ids_node),
    ("download", download_node),
    ("detect_segments", detect_segments_node),
    ("extract_clues", extract_clues_node),
    ("gen_annotations", gen_annotations_node),
]

for _node_name, _node_fn in _PIPELINE:
    builder.add_node(_node_name, _node_fn)

# The first stage is where every run starts.
builder.set_entry_point(_PIPELINE[0][0])

# builder.add_conditional_edges(
#     "generate", 
#     should_continue, 
#     {END: END, "reflect": "reflect"}
# )

# Strictly linear flow: each stage feeds the next one, the last one feeds END.
_successors = [name for name, _ in _PIPELINE[1:]] + [END]
for (_source, _), _target in zip(_PIPELINE, _successors):
    builder.add_edge(_source, _target)

graph = builder.compile(checkpointer=memory)

In [None]:
# Run configuration passed to graph.stream; "thread_id" names the thread this
# run belongs to.
thread = {"configurable": {"thread_id": "1"}}

# "task" is forwarded to the LLM as the user's request by gen_queries_node.
# GEN_QUERIES_PROMPT states that the user describes which videos they want and
# how many queries they need, so the task has to follow that shape.
# NOTE(review): the number of queries (5) is an assumption — adjust as needed.
for s in graph.stream(
    {
        "task": "I want to find instructional videos about how to do squats. I need 5 queries.",
    },
    thread,
):
    print(s)