In [1]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
# The return value is assigned to `_` only to keep it out of the cell output.
# NOTE(review): presumably this supplies the Azure OpenAI credentials used by `llm` — confirm.
_ = load_dotenv()

In [2]:
from typing import TypedDict, Annotated, Sequence, List, Optional
import operator

from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
from langchain.pydantic_v1 import BaseModel, Field

In [3]:
from langchain_openai import AzureChatOpenAI

# Chat model shared by the graph nodes: Azure deployment "gpt4o", sampling
# temperature 0.0, pinned API version.
llm = AzureChatOpenAI(
    azure_deployment="gpt4o",
    openai_api_version="2023-07-01-preview",
    temperature=0.0,
)

In [4]:
class VideoInfo(BaseModel):
    """Metadata and subtitle text for one downloaded video (built by download_node)."""

    # Video identifier; download_node interpolates it into the YouTube watch URL.
    video_id: str
    # Watch-page URL the video was downloaded from.
    url: str
    # POSIX-style path of the video file, relative to the download root directory.
    relative_video_path: str
    # Raw contents of the subtitle (.vtt) file.
    subs: str
    # Text produced from the subtitle file by vtt_to_txt.
    transcript: str


class SegmentInfo(BaseModel):  # , Generic[OutputSchema]):
    """A time span inside one video, as emitted by detect_segments_node.

    The commented-out members below are disabled helpers kept for reference;
    they depend on names (seconds_to_ts, OutputSchema, json, np) that are not
    defined in the visible notebook.
    """

    # Segment boundaries as strings; detect_segments_node formats them as "HH:MM:SS.mmm".
    start_timestamp: str
    end_timestamp: str
    # Frames-per-second value attached to the segment.
    fps: float
    # segment_info: Optional[OutputSchema]
    # Identifier of the video this segment belongs to (matches VideoInfo.video_id).
    video_id: str
    # _frames: Optional[
    # list[np.array]
    # ]  # List of raw frames that got into LLM. Added for debugging purposes.

    # @classmethod
    # def from_frames(cls, start_frame, end_frame, fps, **kwargs):
    #     return cls(
    #         start_timestamp=seconds_to_ts(start_frame / fps),
    #         end_timestamp=seconds_to_ts(end_frame / fps),
    #         fps=fps,
    #         **kwargs,
    #     )

    # @classmethod
    # def from_seconds(cls, start_seconds, end_seconds, **kwargs):
    #     return cls(
    #         start_timestamp=seconds_to_ts(start_seconds),
    #         end_timestamp=seconds_to_ts(end_seconds),
    #         **kwargs,
    #     )

    # def to_str(self, skip: list[str] = []):
    #     # skip -> fields from segment_info
    #     # dict() works both with a pydantic model and with an unparsed dict
    #     if self.segment_info:
    #         d = dict(self.segment_info)
    #         for s in skip:
    #             del d[s]
    #         d = ": " + json.dumps(d)
    #     else:
    #         d = ""
    #     return f"{self.start_timestamp}-{self.end_timestamp}{d}"

In [5]:
# 2. Create the state

class AgentState(TypedDict):
    """State dictionary shared by every node of the LangGraph pipeline.

    Each key must be declared with an annotation (``name: type``). A plain
    assignment (``name = type``) only creates a class attribute and does not
    register the key in the TypedDict, so it would be missing from the
    declared state schema.
    """

    # Free-text description of what the user wants; sent to the LLM by gen_queries_node.
    task: str
    # Search queries returned by gen_queries_node.
    search_queries: List[str]
    # Video ids returned by get_video_ids_node.
    video_ids: List[str]
    # Per-video metadata and subtitles returned by download_node.
    video_infos: List[VideoInfo]
    # Text prompts scored against video frames by detect_segments_node.
    clip_text_prompts: List[str]
    # Detected segments returned by detect_segments_node.
    segment_infos: List[SegmentInfo]
    # Reserved for the (currently commented-out) clue-extraction node.
    clues: List[str]
    # Reserved for the (currently commented-out) annotation node.
    annotations: List[str]

In [6]:
# 3. Set prompts

# System prompt for gen_queries_node.
# The adjacent literals are joined by implicit concatenation (no commas), so the
# constant is a single str and str(GEN_QUERIES_PROMPT) is the prompt text itself
# rather than the repr of a tuple.
GEN_QUERIES_PROMPT = (
    "You are helping the user to find a very large and diverse set of videos on a video hosting service. "
    "A user will only describe which videos they are looking for and how many queries they need."
)

# prompt='I want to find instructional videos about how to do squats.',
# num_queries_prompt = f'I need {num_queries} queries'

# System prompt for a clue-extraction step: asks the model to derive local clues,
# global clues and logical inferences for each segment from a full-video transcript.
# It is not referenced by any active node in the visible notebook.
# NOTE(review): the prompt text contains typos ("Provied", "trascript", "undestand").
# They are left untouched here because editing the literal changes what the model receives.
EXTRACT_CLUES_PROMPT = """You are a highly intelligent data investigator.  
You take unstructured damaged data and look for clues that could help restore the initial information
and extract important insights from it.
You are the best one for this job in the world because you are a former detective. 
You care about even the smallest details, and your guesses about what happened in the initial file
even at very limited inputs are usually absolutely right.  
You use deductive and inductive reasoning at the highest possible quality.

#YOUR TODAY'S JOB
The user needs to learn about what happens in a specific segment of a video file. Your job is to help the user by providing clues that would help the user make the right assumption.
The user will provide you with: 
1. Instructions about what kind of information the user is trying to obtain.
2. A list of time codes of the segments in format "<HH:MM:SS.ms>-<HH:MM:SS.ms>". All the provided segment of the video contain what the user is looking for, but other parts of the video might have different content.
3. A transcript of the *full video* in format of "<HH.MM.SS>\\n<text>"

Your task:
1. Read the transcript.
2. Provide the clues in a given format.
3. Provied any other info requested by the user.

#RULES
!!! VERY IMPORTANT !!!
1. Rely only on the data provided in the transcript. Do not improvise. All the quotes and corresponding timestamps must be taken from the transcript. Quote timestamps must be taken directly from the transcript.
2. Your job is to find the data already provided in the transcript.
3. Analyze every segment. Only skip a segment if there is no information about it in the trascript.
4. For local clues, make sure that the quotes that you provide are located inside the segment. To do this, double check the timestamps from the transcript and the segment.
5. For all clues, make sure that the quotes exactly correspond to the timestamps that you provide.
6. When making clues, try as much as possible to make them describe specifically what is shown in the segment.
7. Follow the format output.
8. Be very careful with details. Don't generalize. Always double check your results.

Please, help the user find relevant clues to reconstruct the information they are looking for, for each provided segment.

WHAT IS A CLUE: A *clue*, in the context of reconstructing narratives from damaged data, 
is a fragment of information extracted from a corrupted or incomplete source that provides 
insight into the original content. These fragments serve as starting points for inference 
and deduction, allowing researchers to hypothesize about the fuller context or meaning of 
the degraded material. The process of identifying and interpreting clues involves both objective analysis of the 
available data and subjective extrapolation based on domain knowledge, contextual understanding, 
and logical reasoning.

Here is what the user expects to have from you:
1. *Local clues* that would help the user undestand how the thing they are looking for happens inside the segment. Local clues for a segment are generated from quotes inside a specific segment.
2. *Global clues* that would help the user understand how the thing they are looking for happens inside the segment. Global clues for a segment are generated from quotes all around the video, but are very relevant to the specific that they are provided for.
3. *Logical inferences* that could help the user understand how the thing they are looking for happens inside the segment. Logical inferences for a segment are deducted from local and global clues for this segment.

!!!IT IS EXTREMELY IMPORTANT TO DELIVER ALL THREE THINGS!!!
"""

# also MANY a structured output prompt

# EXTRACT_CLUES_PROMPT = """
# "User's instructions: The provided video is a tutorial about how to perform squats.

# I need to understand HOW THE PERSON SHOWN IN EACH SEGMENT PERFORMS SQUATS IN THIS SEGMENT.
# What is done correctly.
# What mistakes they make. Why these mistakes happen.
# How these mistakes could be improved.

# It is very important that the information that you provide describes how the person shown in the segment is doing squats, and not some generic advice that is unrelated to the visual information.
# """

# prompt.append('Segment timecodes and optional additional information:\n' + '\n'.join([s.to_str(skip=[filter_by] if filter_by else []) for s in video_segments_part]))
# prompt.append('Transcript:\n' + transcript)


# System prompt for an annotation step: the model receives a JSON object of clues
# for one segment and must answer the user's questions as JSON in a user-supplied schema.
# It is not referenced by any active node in the visible notebook.
# The continuation lines are indented inside the literal, so that whitespace is part
# of the prompt text.
GEN_ANNOTATIONS_PROMPT = """You are a helpful assistant that performs high quality data investigation and transformation.
                You will be given a JSON object with clues and other helpful information about what's going on 
                in a specific part of a video file. This part is called a segment. Your job is to:
                1. Read this JSON object carefully
                2. Answer user's questions about this segment
                3. Provide the answer as a JSON object in a schema provided by the user
                Important rules:
                1. You can only rely on data presented in a provided JSON object. Don't improvise.
                2. Follow user's request carefully.
                3. Don't rush to deliver the answer. Take some time to think. Make a deep breath. Then start writing.
                4. If you want to output field as empty (null), output it as JSON null (without quotes), not as a string "null". 
"""


# human_prompt = """
# You are given a JSON object that contains clues about segments of a video with timecodes.
# !!!! For each segment provided in a JSON object you need to answer on the following questions:
# 1. Given the data found in the JSON object, what is a probability that this part contains a footage of a person doing squats? [the answer could be only "high", "medium", "low", or null (if impossible to infer from the provided data)]
# 2. Given the data found in the JSON object and even if the answer on the previous question is "low", does this person do squats right, wrong, or mixed? [the answer could be only "right", "wrong", "mixed", or null (if impossible to infer from the provided data)]
# 3. Given the data found in the JSON object, what exactly does thing person do right and/or wrong regarding their squats technique? [the answer should be clear and focused on body parts]
# 4. If the answer on the previous question contains description of wrong technique, explain how to fix these mistakes using your "own knowledge" like you are a sports coach.
# """

# for clue in clues_part:
#     prompt.append("Segment:\n" + json.dumps(clue))

In [7]:
# from datagen import DatagenConfig, get_video_ids, download_videos, detect_segments_clip, generate_clues, generate_annotations

# config_params = {
#     "openai": {
#         "type": "azure",  # openai/azure
#         "temperature": "1",
#         "deployment": "gpt4o",  # model for openai / deployment for azure
#     },
#     "data_dir": "./tmp/squats",
# }

# !mkdir -p {config_params["data_dir"]}

# # this config handles all the bookkeeping, so you need to pass it everywhere.
# config = DatagenConfig(**config_params)

In [8]:
import scrapetube
import yt_dlp
from datetime import datetime
from pathlib import Path
from collections import defaultdict
from datagen.core.sub_utils import vtt_to_txt
from datagen.detect_segments import get_segments
import torch
from transformers import AutoModel, AutoProcessor
import pandas as pd
from tsmoothie.smoother import LowessSmoother

In [9]:
import decord
import math
import numpy as np

# decord.bridge.set_bridge("torch")


class VideoInferenceDataset(torch.utils.data.IterableDataset):
    """Iterable dataset that samples roughly one frame per second from each video.

    Videos are read one after another with decord. The instance is its own
    iterator and wraps a single generator created in ``__init__``, so it can
    be consumed only once.

    Args:
        video_infos: Videos to read; only ``relative_video_path`` is used here.
        local_root: Directory that ``relative_video_path`` is relative to.
    """

    def __init__(self, video_infos: List[VideoInfo], local_root: Path):
        # Zero-argument super(): the one-argument form `super(Cls)` creates an
        # unbound super object, so the parent initialiser is never invoked.
        super().__init__()

        self.video_infos = video_infos
        self.local_root = local_root
        self.frame_generator = self.get_frame_generator(video_infos, local_root)

    @staticmethod
    def get_frame_generator(video_infos, local_root: Path):
        """Yield one dict per sampled frame, video by video.

        Each dict has the keys:
            ``frame``: the decoded frame converted with ``asnumpy()``.
            ``frame_idx``: index of the frame inside its video.
            ``video_id``: position of the video in ``video_infos`` (an integer
                index, not the ``VideoInfo.video_id`` string).
        """
        for video_idx, video_info in enumerate(video_infos):
            video_path = local_root.joinpath(video_info.relative_video_path)
            vr = decord.VideoReader(str(video_path))
            num_frames = len(vr)
            fps = vr.get_avg_fps()

            # One sample every `round(fps)` frames, i.e. about one per second.
            # Clamp to 1 so a reported fps below 0.5 cannot produce range(step=0).
            step = max(1, round(fps))

            for frame_idx in range(0, num_frames, step):
                frame = vr[frame_idx].asnumpy()
                yield {
                    "frame": frame,
                    "frame_idx": frame_idx,
                    "video_id": video_idx,
                }

    def __next__(self):
        """Return the next sampled frame dict; raises StopIteration when exhausted."""
        return next(self.frame_generator)

    def __iter__(self):
        """Return ``self``; iteration state lives in ``self.frame_generator``."""
        return self

In [10]:
import time
import math

# 4. Create nodes


def gen_queries_node(state: AgentState):
    """Ask the LLM for search queries matching ``state["task"]``.

    Args:
        state: Graph state; only ``task`` is read.

    Returns:
        ``{"search_queries": [...]}`` with at most two queries. The list is
        empty when the model returns no structured result or leaves the field
        unset.
    """

    class QueryList(BaseModel):
        """A list of queries to find videos on a video hosting service"""

        search_queries: list[str] = Field(default=None, description="a list of queries")

    # The prompt constant may be a tuple of sentence fragments. str() on a tuple
    # would send its repr (parentheses, quotes, commas) to the model, so join it.
    if isinstance(GEN_QUERIES_PROMPT, str):
        system_prompt = GEN_QUERIES_PROMPT
    else:
        system_prompt = " ".join(GEN_QUERIES_PROMPT)

    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=state["task"]),
    ]

    model = llm.with_structured_output(QueryList)
    response = model.invoke(messages)

    # `search_queries` defaults to None and the parser may yield no object at
    # all; treat both as "no queries" instead of failing on the slice.
    queries = (response.search_queries if response is not None else None) or []

    # Only the first two queries are kept, which keeps the downstream download small.
    return {"search_queries": queries[:2]}


def get_video_ids_node(state: AgentState):
    """Search the video service for every query and collect unique video ids.

    Args:
        state: Graph state; only ``search_queries`` is read.

    Returns:
        ``{"video_ids": [...]}``: de-duplicated ids in first-seen order. When
        ``only_creative_commons`` is enabled, only ids whose reported license
        contains "creative commons" are kept.
    """
    queries = state["search_queries"]
    videos_per_query = 1
    sleep = 0
    sort_by = "relevance"
    results_type = "video"
    only_creative_commons = False

    # A dict keeps insertion order, so the result is deterministic across runs
    # (a set of strings is iterated in hash order, which varies per process).
    seen_ids = {}
    for query in queries:
        for video in scrapetube.get_search(
            query=query,
            limit=videos_per_query,
            sleep=sleep,
            sort_by=sort_by,
            results_type=results_type,
        ):
            seen_ids.setdefault(video["videoId"], None)
    video_ids = list(seen_ids)

    if only_creative_commons:
        ydl_options = {
            "quiet": True,
            "simulate": True,
            "forceurl": True,
        }
        video_ids_cc = []
        # One extractor instance is enough for all metadata lookups.
        with yt_dlp.YoutubeDL(ydl_options) as ydl:
            for video_id in video_ids:
                info = ydl.extract_info(
                    f"https://www.youtube.com/watch?v={video_id}", download=False
                )
                # "license" may be absent or explicitly None.
                license_name = (info or {}).get("license") or ""
                if "creative commons" in license_name.lower():
                    video_ids_cc.append(video_id)
        video_ids = video_ids_cc

    return {"video_ids": video_ids}


def download_node(state: AgentState):
    """Download the videos in ``state["video_ids"]`` together with their English subtitles.

    Files are stored under ``./tmp/agent_squats`` (``videos/`` and ``subs/``).
    Ids whose ``.mp4`` already exists in ``videos/`` or in
    ``videos_without_subs/`` are not downloaded again. A failed download is
    logged and skipped.

    Args:
        state: Graph state; only ``video_ids`` is read.

    Returns:
        ``{"video_infos": [...]}``: one VideoInfo per video with a non-empty
        transcript. Videos without a transcript are moved to
        ``videos_without_subs/`` and left out.
    """
    LOCAL_ROOT = Path("./tmp/agent_squats").resolve()
    video_dir = LOCAL_ROOT / "videos"
    sub_dir = LOCAL_ROOT / "subs"

    discard_path = LOCAL_ROOT / "videos_without_subs"
    discard_path.mkdir(parents=True, exist_ok=True)

    video_ids = state["video_ids"]

    downloaded_video_ids = [video_path.stem for video_path in video_dir.glob("*.mp4")]
    downloaded_video_ids += [
        video_path.stem for video_path in discard_path.glob("*.mp4")
    ]

    print(f"Downloaded video ids: {downloaded_video_ids}")

    only_with_transcripts = True

    YDL_OPTIONS = {
        "writeautomaticsub": True,
        "subtitleslangs": ["en"],
        "subtitlesformat": "vtt",
        "overwrites": False,
        "format": "mp4",
        "outtmpl": {
            "default": video_dir.as_posix() + "/%(id)s.%(ext)s",
            "subtitle": sub_dir.as_posix() + "/%(id)s.%(ext)s",
        },
    }

    video_infos = []

    with yt_dlp.YoutubeDL(YDL_OPTIONS) as ydl:
        for video_id in video_ids:
            url = f"https://www.youtube.com/watch?v={video_id}"

            if video_id not in downloaded_video_ids:
                try:
                    # download() takes a list of URLs.
                    ydl.download([url])
                except Exception as e:
                    print(datetime.now(), f"Error at video {video_id}, skipping")
                    print(datetime.now(), e)
                    continue

            # Rebuild the paths yt-dlp writes to from the same output templates.
            video_path = Path(ydl.prepare_filename({"id": video_id, "ext": "mp4"}))
            sub_path = Path(
                ydl.prepare_filename(
                    {"id": video_id, "ext": "en.vtt"}, dir_type="subtitle"
                )
            )

            # A video may have no subtitles at all. Record empty text so the
            # transcript filter below can discard it instead of crashing on
            # a missing file.
            if sub_path.exists():
                subs = sub_path.read_text(encoding="utf-8")
                transcript = vtt_to_txt(sub_path)
            else:
                subs = ""
                transcript = ""

            video_infos.append(
                VideoInfo(
                    video_id=video_id,
                    url=url,
                    relative_video_path=video_path.relative_to(LOCAL_ROOT).as_posix(),
                    subs=subs,
                    transcript=transcript,
                )
            )

    if only_with_transcripts:
        kept_video_infos = []
        for video_info in video_infos:
            if video_info.transcript:
                kept_video_infos.append(video_info)
                continue
            # VideoInfo exposes `relative_video_path`; it has no `video_path` field.
            video_path = LOCAL_ROOT / video_info.relative_video_path
            # The file is absent when it was already moved to the discard
            # directory by an earlier run.
            if video_path.exists():
                video_path.rename(discard_path / video_path.name)
        video_infos = kept_video_infos

    return {"video_infos": video_infos}


# NOTE(review): module-level placeholder; nothing in the visible notebook reads or
# assigns it afterwards — confirm whether it is still needed.
DATAFRAME = None


def _seconds_to_timestamp(seconds) -> str:
    """Format a non-negative number of seconds as ``HH:MM:SS.mmm``."""
    # Round once on the total so a fraction such as 0.9996 carries into the
    # seconds field instead of producing a four-digit millisecond part.
    total_ms = int(round(float(seconds) * 1000))
    whole_seconds, millis = divmod(total_ms, 1000)
    return time.strftime(f"%H:%M:%S.{millis:03d}", time.gmtime(whole_seconds))


def detect_segments_node(state: AgentState):
    """Score sampled video frames against the text prompts and extract matching segments.

    Frames come from VideoInferenceDataset (about one per second of video),
    are scored with a SigLIP model, and the per-video probability series is
    smoothed and passed to ``get_segments``.

    Args:
        state: Graph state; ``clip_text_prompts`` and ``video_infos`` are read.

    Returns:
        ``{"segment_infos": [...]}``: one SegmentInfo per detected segment;
        empty when no frame was processed.
    """
    LOCAL_ROOT = Path("./tmp/agent_squats").resolve()

    clip_text_prompts = state["clip_text_prompts"]
    video_infos = state["video_infos"]

    CLIP_MODEL_ID = "google/siglip-so400m-patch14-384"

    # Use the GPU when present, but do not fail outright on CPU-only machines.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = AutoModel.from_pretrained(CLIP_MODEL_ID).to(device)
    processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)

    dataset = VideoInferenceDataset(video_infos, LOCAL_ROOT)

    # num_workers stays at 1: the dataset does no per-worker sharding, so every
    # additional worker would iterate over all videos again.
    dataloader = torch.utils.data.DataLoader(
        dataset,
        num_workers=1,
        batch_size=12,
        pin_memory=(device == "cuda"),
    )

    smoother = LowessSmoother(smooth_fraction=0.02, iterations=1)

    clip_results_dict = defaultdict(list)

    print("Init model complete")

    # Hard cap on the number of batches; frames beyond it are silently ignored.
    MAX_BATCHES = 50

    # zip() stops as soon as the range is exhausted, so no batch past the cap
    # is requested from the loader.
    for _, batch in zip(range(MAX_BATCHES), dataloader):
        inputs = processor(
            images=batch["frame"],
            text=clip_text_prompts,
            return_tensors="pt",
            # The SigLIP documentation requires padding to max_length because the
            # model was trained that way.
            padding="max_length",
            truncation=True,
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits_per_image
        probs = torch.sigmoid(logits).cpu().numpy()

        for video_idx, frame_idx, prob in zip(
            batch["video_id"], batch["frame_idx"], probs
        ):
            # batch["video_id"] holds positions in `video_infos`, not id strings.
            video_id = video_infos[video_idx.item()].video_id

            clip_results_dict["video_id"].append(video_id)
            clip_results_dict["frame_idx"].append(frame_idx.item())
            # NOTE(review): .item() only works when there is exactly one text
            # prompt (one score per frame); several prompts need an explicit
            # aggregation rule.
            clip_results_dict["probs"].append(prob.item())

    print("All frames processed")
    clip_results = pd.DataFrame(clip_results_dict)
    print("Dataframe created")
    print(clip_results)

    # Without any frame the frame has no "video_id" column to group by.
    if clip_results.empty:
        return {"segment_infos": []}

    max_gap_seconds = 1
    fps_sampling = 1
    min_prob = 0.1
    min_segment_seconds = 3
    # NOTE(review): fixed value stored in SegmentInfo.fps; it is not read from
    # the video file — confirm that downstream code expects this.
    fps = 25

    segment_infos = []
    for video_id, video_clip_results in clip_results.groupby("video_id"):
        probs = video_clip_results["probs"].values
        probs = smoother.smooth(probs).smooth_data[0]
        segments_start_end = get_segments(
            probs,
            max_gap=round(max_gap_seconds * fps_sampling),
            min_prob=min_prob,
            min_segment=round(min_segment_seconds * fps_sampling),
        )

        print(f"Segments for video {video_id}: {segments_start_end}")

        # start/end are passed to the formatter as seconds (one sample per second).
        for start, end in segments_start_end:
            segment_infos.append(
                SegmentInfo(
                    start_timestamp=_seconds_to_timestamp(start),
                    end_timestamp=_seconds_to_timestamp(end),
                    fps=fps,
                    video_id=video_id,
                )
            )

    return {"segment_infos": segment_infos}


# def extract_clues_node(state: AgentState):
#     clues = []

#     clues = generate_clues(
#         # video_ids=['byxWus7BwfQ'],
#         config=config,
#         human_prompt=human_prompt,
#         segments_per_call=5,  # the output might be quite long, so need to limit number of segments per gpt call to respect max output legnth
#         raise_on_error=True,  # interrupt when encountering an error. Useful for debugging.
#     )

#     return {"clues": clues}


# def gen_annotations_node(state: AgentState):

#     class SegmentFeedback(BaseModel):
#         """
#         —> GOOD EXAMPLES:
#             "wrong":"Knees caving in: This can stress the knees and reduce effectiveness"
#             "correction":"Focus on keeping knees aligned with your toes."
#             "wrong":"Rounding the back: This increases the risk of back injuries"
#             "correction":"Keep your chest up and maintain a neutral spine throughout the movement."
#             "wrong":"Heels are lifting off the ground: this shifts the weight forward, reducing stability"
#             "correction":" Keep your weight on your heels and press through them as you rise."
#             "right":"Chest and shoulders: The chest is up, and the shoulders are back, maintaining an upright torso."
#             "correction":null
#         —> BAD EXAMPLES:
#             "wrong":"knees"
#             "correction":"fix knees"
#             "wrong":"back looks funny"
#             "correction":"make back better"
#             "wrong":"feet are doing something"
#             "correction":"feet should be different"
#             "right":"arms"
#             "correction":"arms are fine i think"
#         —> BAD EXAMPLES END HERE
#         """

#         right: Optional[str] = Field(description="what was right in the performance")
#         wrong: Optional[str] = Field(description="what was wrong in the performance")
#         correction: Optional[str] = Field(
#             description="how and in what ways it the performance could be improved"
#         )

#     # The segment timestamps are taken from the provided information.
#     class SegmentAnnotation(BaseModel):
#         squats_probability: Optional[str] = Field(
#             description="how high is the probability that the person is doing squats in the segment: low, medium, high, unknown(null)"
#         )
#         squats_technique_correctness: Optional[str] = Field(
#             description="correctness of the squat technique."
#         )
#         squats_feedback: Optional[SegmentFeedback] = Field(
#             description="what was right and wrong in the squat perfomance in the segment. When the technique is incorrect, provide instructions how to correct them."
#         )

#     annotations = generate_annotations(
#         human_prompt=human_prompt,
#         config=config,
#         segments_per_call=5,
#         annotation_schema=SegmentAnnotation,
#     )

#     return {"annotations": annotations}

In [11]:
from langgraph.graph import StateGraph, END
from typing import TypedDict, Annotated, List
import operator
from langgraph.checkpoint.memory import MemorySaver

from langchain_core.messages import AnyMessage, SystemMessage, HumanMessage, AIMessage, ChatMessage

# In-memory checkpointer; handed to builder.compile(checkpointer=memory) when the graph is built.
memory = MemorySaver()
# memory = SqliteSaver.from_conn_string(":memory:")

In [12]:
# Assemble the pipeline as a strictly linear graph:
# generate_queries -> get_video_ids -> download -> detect_segments -> END
builder = StateGraph(AgentState)

pipeline_nodes = [
    ("generate_queries", gen_queries_node),
    ("get_video_ids", get_video_ids_node),
    ("download", download_node),
    ("detect_segments", detect_segments_node),
    # Disabled stages (their node functions are commented out above):
    # ("extract_clues", extract_clues_node),
    # ("gen_annotations", gen_annotations_node),
]

for node_name, node_fn in pipeline_nodes:
    builder.add_node(node_name, node_fn)

builder.set_entry_point("generate_queries")

# Connect each stage to the following one; the last stage leads to END.
stage_names = [node_name for node_name, _ in pipeline_nodes]
for source_name, target_name in zip(stage_names, stage_names[1:] + [END]):
    builder.add_edge(source_name, target_name)

graph = builder.compile(checkpointer=memory)

In [13]:
# Checkpointer key: every run that uses this thread id shares one saved state.
thread = {"configurable": {"thread_id": "1"}}

# Initial state: only the keys that the nodes cannot produce themselves.
initial_state = {
    "task": "i wanna teach people how to do squats",
    "clip_text_prompts": ["person doing squats"],
}

# Stream the run and print each node's state update as it arrives.
for s in graph.stream(initial_state, thread):
    print(s)

{'generate_queries': {'search_queries': ['how to do squats', 'squat exercise tutorial']}}
{'get_video_ids': {'video_ids': ['xqvCmoLULNY', 'IB_icWRzi4E']}}
Downloaded video ids: ['IB_icWRzi4E', 'xqvCmoLULNY']
{'download': {'video_infos': [VideoInfo(video_id='xqvCmoLULNY', url='https://www.youtube.com/watch?v=xqvCmoLULNY', relative_video_path='videos/xqvCmoLULNY.mp4', subs="WEBVTT\nKind: captions\nLanguage: en\n\n00:00:00.160 --> 00:00:01.829 align:start position:0%\n \nlet's<00:00:00.399><c> learn</c><00:00:00.560><c> how</c><00:00:00.719><c> to</c><00:00:00.880><c> properly</c><00:00:01.280><c> perform</c><00:00:01.760><c> a</c>\n\n00:00:01.829 --> 00:00:01.839 align:start position:0%\nlet's learn how to properly perform a\n \n\n00:00:01.839 --> 00:00:02.790 align:start position:0%\nlet's learn how to properly perform a\nsquat\n\n00:00:02.790 --> 00:00:02.800 align:start position:0%\nsquat\n \n\n00:00:02.800 --> 00:00:04.470 align:start position:0%\nsquat\nstart<00:00:03.120><c> with</

In [14]:
# Show the state values checkpointed for this thread. The output includes the full
# subtitle text of every video, so it is very long.
graph.get_state(thread).values

{'task': 'i wanna teach people how to do squats',
 'search_queries': ['how to do squats', 'squat exercise tutorial'],
 'video_ids': ['xqvCmoLULNY', 'IB_icWRzi4E'],
 'video_infos': [VideoInfo(video_id='xqvCmoLULNY', url='https://www.youtube.com/watch?v=xqvCmoLULNY', relative_video_path='videos/xqvCmoLULNY.mp4', subs="WEBVTT\nKind: captions\nLanguage: en\n\n00:00:00.160 --> 00:00:01.829 align:start position:0%\n \nlet's<00:00:00.399><c> learn</c><00:00:00.560><c> how</c><00:00:00.719><c> to</c><00:00:00.880><c> properly</c><00:00:01.280><c> perform</c><00:00:01.760><c> a</c>\n\n00:00:01.829 --> 00:00:01.839 align:start position:0%\nlet's learn how to properly perform a\n \n\n00:00:01.839 --> 00:00:02.790 align:start position:0%\nlet's learn how to properly perform a\nsquat\n\n00:00:02.790 --> 00:00:02.800 align:start position:0%\nsquat\n \n\n00:00:02.800 --> 00:00:04.470 align:start position:0%\nsquat\nstart<00:00:03.120><c> with</c><00:00:03.199><c> your</c><00:00:03.360><c> feet</c><00