## Install Dependencies

In [None]:
# Install the pinned dependencies for this notebook.
# `%pip` is used instead of `!pip` so the packages are installed into the
# environment of the running kernel rather than whichever `pip` is first on PATH.
%pip install torchvision==0.19.0
%pip install opencv-python==4.10.0.84
%pip install diffusers==0.31.0
%pip install transformers==4.47.0
%pip install tokenizers==0.21.0
%pip install accelerate==1.2.0
%pip install pandas==2.2.3
%pip install numpy==1.26.4
%pip install einops==0.8.0
%pip install tqdm==4.66.4
%pip install loguru==0.7.3
%pip install imageio==2.36.1
%pip install imageio-ffmpeg==0.5.1
%pip install safetensors==0.4.5
%pip install packaging==24.1
%pip install ninja==1.11.1.1
# flash-attn is installed after packaging/ninja (and the torch pulled in by
# torchvision) with build isolation disabled, as its install instructions
# recommend, so the build can see those already-installed packages.
%pip install flash-attn==2.5.9.post1 --no-build-isolation
# hf_transfer = for hugging-face high speed transfers
# The extras spec is quoted so the shell cannot treat [...] as a glob pattern.
%pip install "huggingface_hub[hf_transfer]"
%pip install hf_transfer


## Download Models & Text Encoders from HF

In [None]:
# Download the HunyuanVideo checkpoints and both text encoders into ./ckpts.
# Every path below is relative to the repository root, so this cell never
# changes the working directory and can be re-run after a partial failure.
from getpass import getpass
import os

# Prompt for the token without echoing it into the notebook.
HF_TOKEN = getpass("Enter or paste your Hugging Face access token: ").strip()

# Authenticate via the environment instead of passing the token as a command
# line argument (which exposes it in process listings and to shell expansion).
# The `!` commands below inherit these variables from the kernel process.
if HF_TOKEN:
    os.environ['HF_TOKEN'] = HF_TOKEN
os.environ['HF_HUB_ENABLE_HF_TRANSFER'] = '1'

# Download the pretrained models
!huggingface-cli download tencent/HunyuanVideo --local-dir ./ckpts

# Download Text Encoder
!huggingface-cli download xtuner/llava-llama-3-8b-v1_1-transformers --local-dir ./ckpts/llava-llama-3-8b-v1_1-transformers

# Separate the lang model parts
!python3 hyvideo/utils/preprocess_text_encoder_tokenizer_utils.py --input_dir ckpts/llava-llama-3-8b-v1_1-transformers --output_dir ckpts/text_encoder

# Download clip text encoder
!huggingface-cli download openai/clip-vit-large-patch14 --local-dir ./ckpts/text_encoder_2

## Run Inference

In [None]:
# Run sample_video.py with the prompt and options below; --save-path points at
# ./results. See sample_video.py for the exact meaning of each flag.
# The final line must not end with a backslash: a dangling continuation leaves
# the command incomplete or appends a stray "\" to the save path.
!python3 sample_video.py \
--video-size 720 1280 \
--video-length 129 \
--infer-steps 30 \
--prompt " a teddy bear dancing, realistic." \
--flow-reverse \
--seed 0 \
--use-cpu-offload \
--save-path ./results