In [1]:
from transformers import AutoModel, BitsAndBytesConfig
import math
import numpy as np

from moviepy.editor import VideoFileClip
import tempfile
import librosa
import soundfile as sf
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer
from decord import VideoReader, cpu  

In [2]:
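# Optional 4-bit (NF4) quantization config -- currently disabled.
# To use it, uncomment this block and the `quantization_config=quant_config`
# argument in the model-loading cell.
# quant_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=getattr(torch, "float16"),
#     bnb_4bit_use_double_quant=False,
# )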

In [3]:
# Load the 'openbmb/MiniCPM-o-2_6' checkpoint in bfloat16 onto cuda:0 with the
# vision, audio and TTS parts enabled.
# NOTE(review): the original note here read: download "openbmb/MiniCPM-o-2_6-int4",
# but the call below loads the non-int4 repo -- confirm which checkpoint is intended.
model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-o-2_6',
    trust_remote_code=True,  # the model class comes from code shipped in the Hub repo
    attn_implementation='sdpa', # sdpa or flash_attention_2
    torch_dtype=torch.bfloat16,
    init_vision=True,
    # init_vision=False,
    init_audio=True,
    init_tts=True,
    # quantization_config=quant_config,
    low_cpu_mem_usage=True,
    device_map='cuda:0',  # place the weights directly on the first CUDA device
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
# Print the module tree to inspect the loaded architecture.
print(model)

MiniCPMO(
  (llm): Qwen2ForCausalLM(
    (model): Qwen2Model(
      (embed_tokens): Embedding(151700, 3584)
      (layers): ModuleList(
        (0-27): 28 x Qwen2DecoderLayer(
          (self_attn): Qwen2SdpaAttention(
            (q_proj): Linear(in_features=3584, out_features=3584, bias=True)
            (k_proj): Linear(in_features=3584, out_features=512, bias=True)
            (v_proj): Linear(in_features=3584, out_features=512, bias=True)
            (o_proj): Linear(in_features=3584, out_features=3584, bias=False)
            (rotary_emb): Qwen2RotaryEmbedding()
          )
          (mlp): Qwen2MLP(
            (gate_proj): Linear(in_features=3584, out_features=18944, bias=False)
            (up_proj): Linear(in_features=3584, out_features=18944, bias=False)
            (down_proj): Linear(in_features=18944, out_features=3584, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): Qwen2RMSNorm((3584,), eps=1e-06)
          (post_attention_layernorm): Q

In [3]:
# The explicit eval()/cuda() call is left disabled; the model-loading cell
# already places the model on 'cuda:0' through `device_map`.
# model = model.eval().cuda()
# Tokenizer from the same Hub repository as the model.
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-o-2_6', trust_remote_code=True)

# Chat with image

In [18]:
def get_video_chunk_content(video_path, flatten=True):
    """Split a video into one-second units of (marker, frame, audio chunk).

    Args:
        video_path: Path to a video file (``str``), or an already-opened clip
            object exposing ``duration``, ``audio``, and ``get_frame``. A clip
            passed in by the caller is left open; a clip opened here from a
            path is closed before returning.
        flatten: If True, return one flat list
            ``["<unit>", image, audio, "<unit>", image, audio, ...]``.
            Otherwise return a list of ``["<unit>", image, audio]`` triples.

    Returns:
        list: ``math.ceil(duration)`` units. Each unit holds the literal
        string ``"<unit>"``, a PIL image of one frame, and the slice of the
        16 kHz mono audio array covering that second (the final slice may be
        shorter than one second).

    Raises:
        ValueError: If the video has no audio track.
    """
    import os  # local import: only needed to build the temporary file path

    opened_here = isinstance(video_path, str)
    video = VideoFileClip(video_path) if opened_here else video_path
    try:
        print('video_duration:', video.duration)

        if video.audio is None:
            raise ValueError("video has no audio track; cannot build audio chunks")

        # Write the audio into a temporary *directory* instead of an open
        # NamedTemporaryFile: a file that is still open cannot be reopened by
        # name on Windows, which is what the writer and librosa both need.
        with tempfile.TemporaryDirectory() as tmp_dir:
            temp_audio_file_path = os.path.join(tmp_dir, "audio.wav")
            video.audio.write_audiofile(temp_audio_file_path, codec="pcm_s16le", fps=16000)
            audio_np, sr = librosa.load(temp_audio_file_path, sr=16000, mono=True)

        num_units = math.ceil(video.duration)
        # Latest timestamp that is still inside the clip. Without this clamp
        # the final unit would request t == ceil(duration), i.e. a frame at or
        # past the end of the video.
        last_valid_t = max(video.duration - 1e-3, 0)

        # 1 frame + 1s audio chunk
        contents = []
        for i in range(num_units):
            frame = video.get_frame(min(i + 1, last_valid_t))
            image = Image.fromarray((frame).astype(np.uint8))
            audio = audio_np[sr * i:sr * (i + 1)]
            unit = ["<unit>", image, audio]
            if flatten:
                contents.extend(unit)
            else:
                contents.append(unit)
    finally:
        # Only release resources this function acquired itself.
        if opened_here:
            video.close()

    return contents

In [6]:
# Input image for the image-chat demo (relative path).
image_path = r'WIN_20250227_10_14_19_Pro.jpg'


In [7]:
# NOTE(review): the image is used in whatever mode PIL reports; consider
# `.convert('RGB')` if non-RGB inputs are expected -- confirm against the model's examples.
image = Image.open(image_path)

In [8]:
# Text prompt sent together with the image.
question = 'Describe the image'

In [9]:
# Single user turn whose content mixes the PIL image and the text prompt.
msgs = [{'role': 'user', 'content': [image, question]}]

In [10]:
# Run one chat turn. The image is carried inside `msgs` (built in an earlier
# cell), so the separate `image` argument is passed as None.
res = model.chat(
    image=None,
    msgs=msgs,
    tokenizer=tokenizer
)

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.


In [None]:
# Requires the chat cell above to have been run in the current kernel session;
# otherwise `res` is undefined and this cell raises NameError.
print(res)
# Example output from an earlier run:
# The image shows a person in an office setting. The individual is wearing glasses and appears to be looking directly at the camera with a neutral expression. In the background, there are desks with computers and other office equipment. Another person can be seen working on a computer further back. There are also some boxes stacked up against the wall, indicating that this might be a workspace where deliveries or storage take place. The overall atmosphere seems typical of a professional environment.

NameError: name 'res' is not defined

# Chat with video

In [None]:
  # pip install decord


# Upper bound on how many frames encode_video() returns for one video.
MAX_NUM_FRAMES=64 # if cuda OOM set a smaller number

def encode_video(video_path, max_num_frames=None):
    """Sample roughly one frame per second from a video as PIL images.

    Args:
        video_path: Path of the video, handed to ``decord.VideoReader``.
        max_num_frames: Cap on the number of frames returned. When omitted,
            the module-level ``MAX_NUM_FRAMES`` is used.

    Returns:
        list: PIL images, at most ``max_num_frames`` long. If one-per-second
        sampling yields more indices than the cap, they are re-sampled
        uniformly down to exactly the cap.

    Raises:
        ValueError: If ``max_num_frames`` is smaller than 1.
    """
    if max_num_frames is None:
        max_num_frames = MAX_NUM_FRAMES
    if max_num_frames < 1:
        raise ValueError("max_num_frames must be >= 1")

    def uniform_sample(l, n):
        # Take the midpoint element of each of n equal-width slices of l.
        gap = len(l) / n
        idxs = [int(i * gap + gap / 2) for i in range(n)]
        return [l[i] for i in idxs]

    vr = VideoReader(video_path, ctx=cpu(0))
    # Step between sampled frames = rounded average FPS, i.e. ~1 frame/second.
    # Clamp to 1: for an average FPS below 0.5 round() yields 0, and range()
    # raises ValueError on a zero step.
    sample_fps = max(1, round(vr.get_avg_fps()))
    frame_idx = list(range(0, len(vr), sample_fps))
    if len(frame_idx) > max_num_frames:
        frame_idx = uniform_sample(frame_idx, max_num_frames)
    frames = vr.get_batch(frame_idx).asnumpy()
    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
    print('num frames:', len(frames))
    return frames

video_path = "videos/WIN_20250227_10_27_42_Pro.mp4"
question = "Play rock-paper-scissors with me"
frames = encode_video(video_path)

# One user turn: every sampled frame, followed by the text prompt.
msgs = [{'role': 'user', 'content': [*frames, question]}]

# Decoding options used for video input.
params = {
    "use_image_id": False,
    "max_slice_nums": 2,  # use 1 if cuda OOM and video resolution > 448*448
}

answer = model.chat(msgs=msgs, tokenizer=tokenizer, **params)
print(answer)

num frames: 4


I'm sorry, but I can't play rock-paper-scissors with you as I don't have a physical presence. However, if you'd like to learn how to play or practice the game virtually, I can help Let me know what information you need about playing rock-paper-scissors.


# Chat with audio

In [5]:
# Set up the TTS module (presumably what populates `model.tts` -- confirm in
# the model's remote code), then cast it to float32 with .float().
model.init_tts()
model.tts.float()

ConditionalChatTTS(
  (projector): MultiModalProjector(
    (linear1): Linear(in_features=3584, out_features=768, bias=True)
    (relu): ReLU()
    (linear2): Linear(in_features=768, out_features=768, bias=True)
  )
  (emb_code): ModuleList(
    (0-3): 4 x Embedding(626, 768)
  )
  (emb_text): Embedding(21178, 768)
  (head_code): ModuleList(
    (0-3): 4 x ParametrizedLinear(
      in_features=768, out_features=626, bias=False
      (parametrizations): ModuleDict(
        (weight): ParametrizationList(
          (0): _WeightNorm()
        )
      )
    )
  )
  (dvae): DVAE(
    (downsample_conv): Sequential(
      (0): Conv1d(100, 512, kernel_size=(3,), stride=(1,), padding=(1,))
      (1): GELU(approximate='none')
      (2): Conv1d(512, 512, kernel_size=(4,), stride=(2,), padding=(1,))
      (3): GELU(approximate='none')
    )
    (encoder): DVAEDecoder(
      (conv_in): Sequential(
        (0): Conv1d(512, 128, kernel_size=(3,), stride=(1,), padding=(1,))
        (1): GELU(approximat

In [10]:
# Voice-mimic demo: send a spoken clip plus an instruction to repeat it, and
# save the generated speech to a wav file.
file_id = 320
mimick_prompt = "Please repeat each user's speech, including voice style and speech content."
audio_input, _ = librosa.load(f'audio/{file_id}.wav', sr=16000, mono=True) # load the audio to be mimicked (16 kHz, mono)

# Example inputs covering different aspects of speech-centric features:
# `./assets/input_examples/fast-pace.wav`
# `./assets/input_examples/chi-english-1.wav`
# `./assets/input_examples/exciting-emotion.wav`

msgs = [{'role': 'user', 'content': [mimick_prompt, audio_input]}]
res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    max_new_tokens=128,
    use_tts_template=True,
    temperature=0.3,
    generate_audio=True,
    output_audio_path=f'audio/output_mimick_{file_id}.wav', # save the tts result to output_audio_path
)

In [11]:
# Play back the wav file written by the mimic cell (same `file_id`).
from IPython.display import Audio
Audio(f'audio/output_mimick_{file_id}.wav')