In [1]:
import os
# os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# Point the http_proxy / https_proxy environment variables at a proxy listening on
# 127.0.0.1:7897. These assignments overwrite any value already set for this process.
# NOTE(review): the address is machine-specific, and the assignments are presumably placed
# before the Google client imports so those libraries pick them up -- confirm.
os.environ['http_proxy'] = 'http://127.0.0.1:7897'
os.environ['https_proxy'] = 'http://127.0.0.1:7897'

from vertexai.preview.generative_models import GenerativeModel, Part, Content, Image
from google.cloud import aiplatform
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
import vertexai
import base64
import logging
import colorlog



def init_logging():
    """Set up file and colored-console logging and return this module's logger.

    Records are written as plain text to ``./video_describe.txt`` through a handler
    on the root logger (configured at INFO level), and in color through a
    ``StreamHandler`` attached to the ``__name__`` logger, whose records also
    propagate to the root logger.

    The function is safe to call more than once (for example when the notebook
    cell is re-run): the log file is opened only while the root logger has no
    handlers, and the console handler is attached at most once.

    Returns:
        logging.Logger: the logger named ``__name__``.
    """
    # Only build the FileHandler when it will actually be installed. Constructing it
    # unconditionally would re-open the file with mode='w' (truncating it) on every
    # call, even though basicConfig ignores its arguments once the root logger
    # already has handlers.
    root_logger = logging.getLogger()
    if not root_logger.handlers:
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s %(message)s',
            handlers=[
                logging.FileHandler('./video_describe.txt', mode='w', encoding='utf-8'),
            ]
        )

    logger = logging.getLogger(__name__)

    # The console handler is tagged with a name so a repeated call can detect it
    # instead of stacking a second handler (which would print every record twice).
    console_handler_name = 'video_describe_console'
    if any(handler.get_name() == console_handler_name for handler in logger.handlers):
        return logger

    # Console handler formatted by colorlog, one color per level.
    console_handler = logging.StreamHandler()
    console_handler.set_name(console_handler_name)
    formatter = colorlog.ColoredFormatter(
        '%(log_color)s%(asctime)s %(message)s',
        log_colors={
            'DEBUG': 'cyan',
            'INFO': 'green',
            'WARNING': 'yellow',
            'ERROR': 'red',
            'CRITICAL': 'red,bg_white',
        }
    )
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    return logger


# Shared logger for the notebook; the helper functions below log through it.
logger = init_logging()
# Short alias -> full model identifier string. The notebook looks a model up by
# alias and passes the resulting identifier to GenerativeModel.
MODEL_DICT = {
    'gemini-2.0': 'gemini-2.0-flash-001',
    'gemini-2.0-lite': 'gemini-2.0-flash-lite-001',
    'gemini-embedding': 'gemini-embedding-001',
    'text_embedding': 'text-embedding-005',
    'text-multilingual-embedding': 'text-multilingual-embedding-002',
    'multimodalembedding': 'multimodalembedding@001',
}


def init_valid(vertex_client_json_path=r'E:\Data\datasets\Video_Datasets\Koala-36M\Code\APIServer\vertex_client.json',
               port=9002,
               project=None,
               token_json=None):
    """Authenticate the user and initialise the Vertex AI SDK with the credentials.

    Credentials are taken from ``token_json`` when that file exists; otherwise the
    interactive OAuth flow is run in a local browser using the client-secrets file.
    When ``token_json`` is given, newly obtained or refreshed credentials are
    written back to it so that later runs can skip the browser flow.

    Args:
        vertex_client_json_path: path to the OAuth client-secrets JSON file.
            NOTE(review): the default is an absolute, machine-specific path;
            callers should normally pass their own.
        port: local port the OAuth redirect server listens on.
        project: project passed to ``aiplatform.init``.
        token_json: optional path to a cached authorized-user JSON file. With the
            default ``None`` nothing is read from or written to disk.

    Returns:
        None. The credentials are installed through ``aiplatform.init``.
    """
    # Local import: only needed on the cached-token path.
    from google.auth.exceptions import RefreshError

    scopes = ['https://www.googleapis.com/auth/cloud-platform']
    creds = None
    needs_save = False

    if token_json is not None and os.path.exists(token_json):
        creds = Credentials.from_authorized_user_file(token_json, scopes=scopes)
        if not creds.valid:
            if creds.refresh_token:
                try:
                    creds.refresh(Request())
                    needs_save = True
                except RefreshError:
                    # Cached token can no longer be refreshed (e.g. revoked):
                    # fall back to the interactive flow below.
                    creds = None
            else:
                # Nothing to refresh with; a new interactive login is required.
                creds = None

    if creds is None:
        # Run the OAuth 2.0 installed-app flow with the client secrets.
        flow = InstalledAppFlow.from_client_secrets_file(
            vertex_client_json_path, scopes=scopes
        )
        creds = flow.run_local_server(port=port)
        needs_save = True

    if needs_save and token_json is not None:
        # The file holds a refresh token: keep it out of version control.
        with open(token_json, 'w', encoding='utf-8') as token_file:
            token_file.write(creds.to_json())

    aiplatform.init(credentials=creds, project=project)



def video2part(video_path, mime_type='video/mp4'):
    """Read a local video file and wrap it as a ``Part`` for model input.

    Args:
        video_path: path of the video file to read.
        mime_type: MIME type attached to the part; defaults to ``'video/mp4'``.

    Returns:
        The ``Part`` built from the file's contents.
    """
    with open(video_path, 'rb') as f:
        video_bytes = f.read()
    # Part.from_data is documented to take the raw bytes, so the file content is
    # passed as-is rather than being base64-encoded into a str first.
    return Part.from_data(data=video_bytes, mime_type=mime_type)


def load_local_data(content_lst):
    """Convert a list of file paths and/or text strings into model inputs.

    Each item that is an existing path is loaded according to its extension
    (matched case-insensitively): ``jpg``/``png``/``jpeg`` via
    ``Image.load_from_file`` and ``mp4`` via ``video2part``. Any item that is not
    an existing path is kept unchanged as text.

    Items that fail to load (including existing paths with an unsupported
    extension) are logged at ERROR level and skipped, so the returned list can be
    shorter than the input and positions may shift.

    Args:
        content_lst: a single string or an iterable of strings.

    Returns:
        list: the loaded images, video parts and text strings, in input order.
    """
    image_exts = ('jpg', 'png', 'jpeg')
    video_exts = ('mp4',)

    res_content_lst = []
    if isinstance(content_lst, str):
        content_lst = [content_lst]
    for content in content_lst:
        try:
            if os.path.exists(content):
                # Lower-case the extension so files such as "clip.MP4" or
                # "photo.JPG" are recognised instead of being rejected.
                ext = os.path.basename(content).split('.')[-1].lower()
                # Image
                if ext in image_exts:
                    image = Image.load_from_file(content)
                    res_content_lst.append(image)
                    logger.debug(f'图像:{content}')
                # Video
                elif ext in video_exts:
                    video = video2part(content)
                    res_content_lst.append(video)
                    logger.debug(f'视频:{content}')
                else:
                    raise ValueError(f'输入路径文件错误: {content}')
            # Text
            else:
                res_content_lst.append(content)
                logger.debug(f'文本: {content}')
        except Exception as e:
            # Best effort: report the failure and continue with the remaining items.
            logger.error(f'load local file error: {e}')
    return res_content_lst



# Full model identifier looked up by its short alias in MODEL_DICT.
model_name = MODEL_DICT['gemini-2.0-lite']

# Authenticate the user. No token_json is passed, so init_valid runs the
# browser-based OAuth flow with a local redirect server on port 9002.
init_valid(vertex_client_json_path='./vertex_client.json', port=9002, project='video-describe')





Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=127057636192-venn5l6jmo6mj0bc34lp75fpkn5f75o4.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A9002%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform&state=gOs4PK89WPsbgHcnGI6b5xXabFeOgg&access_type=offline


## 多轮对话

In [2]:
# Prompt fragments; they are joined into one system instruction below.

# How the answer should be worded.
answering_style_message = """**Answering Style**:
Answers should be comprehensive, conversational, and use complete sentences. The answer should be in English no matter what the user‘s input is. Provide context where necessary and maintain a certain tone.  Begin directly without introductory phrases like “The image/video showcases” “The photo/video captures” “In the first/second video” and more. For example, say “A woman is on a beach”, instead of “A woman is depicted in the image”. Cannot appear as vague expressions. Please note that there should be no expressions such as video1 or video2 when describing."""
# Order in which the content should be described.
note_message = """**Note**:
 When describing, first describe the character and scene, then describe the events that occurred in the video, as well as the actions of the characters."""
# The task itself.
user_input_message = """**User Input**:
Please detailed describe each video in order and express the same elements in different videos in the same way. When describing the characters, it is necessary to give actor1, actor2, etc. and describe who actor1 and actor2 are"""
# Expected layout: one placeholder per video. The placeholders are numbered
# first/second/third; all three previously read "the first video description".
output_format = """
output_format:
 [the first video description]
 [the second video description]
 [the third video description]
"""
# Final system instruction: style, note, output format, then the task.
system_message = f"{answering_style_message}\n{note_message}\n{output_format}\n{user_input_message}"


# Create the model object with the system instruction attached.
logger.info(f'加载模型中: {model_name}...')
model = GenerativeModel(model_name, system_instruction=system_message)
logger.info('加载模型成功!!!')
logger.info('***\t\t system_instruction \t\t***: \n' + system_message)

# Chat session object; the cell below sends each video through it in turn.
chat = model.start_chat()

[32m2025-06-17 13:06:13,892 加载模型中: gemini-2.0-flash-lite-001...[0m
[32m2025-06-17 13:06:13,894 加载模型成功!!![0m
[32m2025-06-17 13:06:13,895 ***		 system_instruction 		***: 
**Answering Style**:
Answers should be comprehensive, conversational, and use complete sentences. The answer should be in English no matter what the user‘s input is. Provide context where necessary and maintain a certain tone.  Begin directly without introductory phrases like “The image/video showcases” “The photo/video captures” “In the first/second video” and more. For example, say “A woman is on a beach”, instead of “A woman is depicted in the image”. Cannot appear as vague expressions. Please note that there should be no expressions such as video1 or video2 when describing.
**Note**:
 When describing, first describe the character and scene, then describe the events that occurred in the video, as well as the actions of the characters.

output_format:
 [the first video description]
 [the first video description]


In [5]:
# Two scene clips of the same source video, given as absolute local paths.
# NOTE(review): these paths are machine-specific.
content_lst = [
               r"E:\Data\datasets\Video_Datasets\Koala-36M\videos\0-ggn3z52oU_76\split\The Silent Pocket 20 Liter Faraday Pack A Quick Shabazz Review-Scene-002.mp4",
                r"E:\Data\datasets\Video_Datasets\Koala-36M\videos\0-ggn3z52oU_76\split\The Silent Pocket 20 Liter Faraday Pack A Quick Shabazz Review-Scene-004.mp4"
               ]

# Load the inputs from local disk.
logger.info('加载数据中...')
model_input_content_lst = load_local_data(content_lst)
logger.info('加载完成')

# Run the model: one chat turn per video, in order.
# load_local_data logs and skips items that fail to load, so the indices below
# only line up with content_lst when both files were loaded successfully.
logger.info('模型预测中...')
response1 = chat.send_message(content=['请描述视频', model_input_content_lst[0]])
logger.info(f'\n\n{response1.text}')
response2 = chat.send_message(content=['请描述视频', model_input_content_lst[1]])
logger.info(f'\n\n{response2.text}')



[32m2025-06-17 13:12:32,248 加载数据中...[0m
[32m2025-06-17 13:12:32,280 加载完成[0m
[32m2025-06-17 13:12:32,287 模型预测中...[0m
[32m2025-06-17 13:13:09,165 

Sure, here is a detailed description of the video you requested.

The video begins with a shot of a black bag laying on a tan carpet. The bag is a silent pocket 20 liter pack. Actor 1, who is off camera, starts to unzip the bag and opens it up to reveal the inside. As Actor 1 continues to open the bag it becomes more apparent that it is a waterproof bag. Finally, Actor 1 zooms in and begins to show the details of the outside of the bag.[0m
[32m2025-06-17 13:13:48,535 

Here is a detailed description of the video you requested.

The video starts with a black bag laying flat on the floor, with the side facing the camera. The bag has a zipper pocket in the middle of the bag. The bag is a silent pocket 20 liter pack. Actor 1, who is off camera, pulls a zipper up the pocket.[0m
