<a href="https://colab.research.google.com/github/thd2020/text-2-clip/blob/main/%E2%80%9Ctext_to_clip_ipynb%E2%80%9D%E7%9A%84%E5%89%AF%E6%9C%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch collect_env.py from the PyTorch repository, then run it with the notebook's Python.
!wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py
# For security purposes, please check the contents of collect_env.py before running it.
!python collect_env.py

In [None]:
# Mount Google Drive at /content/drive (only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!sudo apt-get update && apt-get install ffmpeg libsm6 libxext6  -y
!pip install --upgrade torchvision torchaudio
!pip install pytorch-pretrained-biggan
!pip install huggingface_hub
!pip install transformers
!pip install sentencepiece
!pip install sacremoses
!pip install tacotron2
!pip install modelscope==1.8.4
!pip install xformers==0.0.20
!pip install open_clip_torch>=2.0.2
!pip install opencv-python-headless
!pip install opencv-python
!pip install einops>=0.4
!pip install rotary-embedding-torch
!pip install fairscale
!pip install scipy
!pip install imageio
!pip install pytorch-lightning
!pip install torchsde

In [None]:
# Pin torchvision to the 0.16.1+cu121 build, resolved through the PyTorch wheel index given with -f.
!pip install --upgrade torchvision==0.16.1+cu121 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Pin torchaudio to the 2.1.1+cu121 build, resolved through the PyTorch wheel index given with -f.
!pip install --upgrade torchaudio==2.1.1+cu121 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
!pip uninstall torch
!pip uninstall torch
!pip install torch==2.1.1+cu121 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# Import the required libraries
import torch
import torch.hub
import torchvision
import torchaudio
import numpy as np
import cv2
import moviepy.editor as mpy
import os
import requests, json
import nltk.stem.wordnet
import open_clip
from pytorch_pretrained_biggan import (BigGAN, one_hot_from_names, truncated_noise_sample,
                                       save_as_images, display_in_terminal, convert_to_images)
from google.colab.patches import cv2_imshow
from tacotron2.text import text_to_sequence
from modelscope.pipelines import pipeline
from modelscope.outputs import OutputKeys

In [None]:
# Download the WordNet corpus with NLTK's downloader, then unpack the archive next to it.
!python3 -m nltk.downloader wordnet
# NOTE(review): unzip may ask before overwriting existing files when this cell is re-run — consider "-o"; confirm.
!unzip /root/nltk_data/corpora/wordnet.zip -d /root/nltk_data/corpora/

In [None]:
# Extract the ImageNet class names mentioned in a text, using the BERT tokenizer to split it
def extract_keywords_and_category_with_nlp(text):
  """Return the tokens of ``text`` that are listed in the module-level ``imagenet`` list.

  The text is encoded with the pretrained ``bert-base-cased`` tokenizer loaded through
  ``torch.hub``; every token id is decoded back to a string and kept when that string is
  an element of ``imagenet``. No part-of-speech or stop-word filtering is performed.

  Args:
    text: The input string to scan.

  Returns:
    A list of matching token strings, in the order they occur (duplicates included).

  Note:
    ``imagenet`` must already be defined when this function is called.
    Words that the tokenizer splits into several pieces presumably never match — verify.
  """
  # Only the tokenizer is loaded: the BERT model weights were never used to compute the
  # result, so downloading and instantiating them was pure overhead.
  tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased')
  # Encode with the special tokens included, as a 1-D tensor of token ids
  token_ids = torch.LongTensor(tokenizer.encode(text, add_special_tokens=True))
  keywords = []
  for token_id in token_ids:
    # Decode one id at a time so each candidate is a single token
    word = tokenizer.decode(token_id)
    if word in imagenet:
      keywords.append(word)
  return keywords

In [None]:
# Generate images with BigGAN
def generate_image_with_biggan(text):
  """Generate one BigGAN image per ImageNet keyword found in ``text``.

  Keywords come from ``extract_keywords_and_category_with_nlp``. The generated batch is
  also written to disk through ``save_as_images``.

  Args:
    text: The input string to search for ImageNet class names.

  Returns:
    A list of images as NumPy arrays converted from RGB to BGR channel order.
    The list is empty when no keyword is found or no class vector can be built.
  """
  # Extract the keywords first so the (large) generator is only loaded when there is work to do
  keywords = extract_keywords_and_category_with_nlp(text)
  print("keywords = ", keywords)
  if not keywords:
    # Nothing to generate, e.g. for the empty piece left after a trailing "."
    return []

  # Prepare the inputs
  truncation = 0.4
  class_vector = one_hot_from_names(keywords, batch_size=len(keywords))
  if class_vector is None:
    # NOTE(review): one_hot_from_names appears to return None when a name has no WordNet
    # noun synset — confirm against the installed pytorch_pretrained_biggan version.
    print("No ImageNet class found for keywords:", keywords)
    return []
  noise_vector = truncated_noise_sample(truncation=truncation, batch_size=len(keywords))

  # Use the GPU when one is available, otherwise stay on the CPU instead of failing
  device = 'cuda' if torch.cuda.is_available() else 'cpu'

  # Load the pretrained BigGAN model and switch it to evaluation mode
  model = BigGAN.from_pretrained('biggan-deep-512')
  model.eval()
  model.to(device)

  # Everything as tensors on the chosen device
  noise_vector = torch.from_numpy(noise_vector).to(device)
  class_vector = torch.from_numpy(class_vector).to(device)

  # Generate the images without tracking gradients
  with torch.no_grad():
      output = model(noise_vector, class_vector, truncation)

  # Back to the CPU for saving and conversion
  output = output.to('cpu')
  # Writes the batch to disk (presumably output_<i>.png — a later cell reads output_0.png)
  save_as_images(output)

  # Convert each generated image from RGB to the BGR layout OpenCV expects
  return [cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
          for image in convert_to_images(output)]

In [None]:
# Generate audio with Tacotron 2 and WaveGlow
def generate_audio_with_tacotron2(text, force_reload=False):
  """Synthesize speech for ``text`` with NVIDIA's Tacotron 2 and WaveGlow hub models.

  Args:
    text: The sentence to speak; it is converted with ``text_to_sequence`` using the
      ``english_cleaners`` cleaner.
    force_reload: Passed to ``torch.hub.load``. Defaults to ``False`` so the cached hub
      repository is reused instead of being downloaded again on every call; pass
      ``True`` to force a fresh download.

  Returns:
    The first waveform of the WaveGlow output as a NumPy array.

  Note:
    Both models and the input are moved to ``'cuda'``, so a CUDA device is required.
  """
  hub_repo = 'nvidia/DeepLearningExamples:torchhub'

  # Load the pretrained Tacotron 2 model, switch it to evaluation mode and move it to CUDA
  model = torch.hub.load(hub_repo, 'nvidia_tacotron2', pretrained=True, force_reload=force_reload)
  model.eval()
  model = model.to('cuda')

  # Load the pretrained WaveGlow vocoder the same way
  waveglow = torch.hub.load(hub_repo, 'nvidia_waveglow', pretrained=True, force_reload=force_reload)
  waveglow.eval()
  waveglow = waveglow.to('cuda')

  # Convert the text into a batch of one id sequence on the GPU
  sequence = torch.LongTensor(text_to_sequence(text, ['english_cleaners'])).unsqueeze(0).to('cuda')

  with torch.no_grad():
    # Tacotron 2 produces the mel spectrogram ...
    # NOTE(review): some revisions of the hub model's infer() also expect input lengths — confirm.
    mel, _, _ = model.infer(sequence)
    # ... and WaveGlow turns it into a waveform
    audio = waveglow.infer(mel)

  # Return the first waveform of the batch as a NumPy array
  return audio[0].detach().cpu().numpy()

In [None]:
# Generate a video from an image with the ModelScope image-to-video pipeline
def generate_video_with_videogpt(image, output_video='./output_0.mp4'):
  """Animate ``image`` with the ``damo/Image-to-Video`` ModelScope pipeline.

  Args:
    image: The pipeline input — an image path (URL or local file).
    output_video: Where the pipeline should write the video. Defaults to
      ``'./output_0.mp4'``, so repeated calls with the default overwrite the same file.

  Returns:
    The value the pipeline reports under ``OutputKeys.OUTPUT_VIDEO`` (the output video path).
  """
  pipe = pipeline(task='image-to-video', model='damo/Image-to-Video', model_revision='v1.1.0')
  output_video_path = pipe(image, output_video=output_video)[OutputKeys.OUTPUT_VIDEO]
  print(output_video_path)
  # Return the path of the generated video (the previous `return video` referenced an undefined name)
  return output_video_path

In [None]:
# Convert a text into a video, sentence by sentence
def text_to_video(text):
  """Build one video from ``text`` by animating a generated image for every keyword.

  The text is split on ".". For each non-empty sentence, images are generated with
  ``generate_image_with_biggan``, each image is written to a PNG file and animated with
  ``generate_video_with_videogpt``, and all resulting clips are concatenated.

  Args:
    text: The input text; sentences are separated by ".".

  Returns:
    The concatenated moviepy clip.

  Raises:
    ValueError: If no clip could be generated for any sentence.

  Note:
    ``generate_video_with_videogpt`` is expected to return the path of the file it wrote.
  """
  video_list = []
  # Split into sentences; stripping lets the empty piece after a trailing "." be skipped
  sentences = [part.strip() for part in text.split(".")]
  for sentence_index, sentence in enumerate(sentences):
    if not sentence:
      continue
    # Generate the images with BigGAN (a list of BGR arrays)
    images = generate_image_with_biggan(sentence)
    if not images:
      continue
    # Generate the narration with Tacotron 2
    # TODO(review): the audio is not attached to the clips yet — generate_video_with_videogpt
    # only accepts an image, and the waveform's sample rate is not known from this notebook.
    audio = generate_audio_with_tacotron2(sentence)
    for image_index, image in enumerate(images):
      stem = f"text_to_video_{sentence_index}_{image_index}"
      # The pipeline takes an image path, so write the frame to disk first
      frame_path = f"{stem}.png"
      cv2.imwrite(frame_path, image)
      produced_path = generate_video_with_videogpt(frame_path)
      # Every call writes to the same default output file; move it aside before the next
      # call overwrites it
      clip_path = f"{stem}.mp4"
      os.replace(produced_path, clip_path)
      video_list.append(mpy.VideoFileClip(clip_path))
  if not video_list:
    raise ValueError("no video clips were generated from the given text")
  # Join the clips into the final video
  return mpy.concatenate_videoclips(video_list)

In [None]:
# Path of the text file holding the ImageNet class names
file_path = "drive/MyDrive/imagenet_classes.txt"

# Read the whole file in text mode
with open(file_path, "r") as file:
    content = file.read()

# Flatten the file into one list of names: every line is split on commas after all of
# its spaces have been removed (empty pieces are kept as empty strings).
imagenet = [
    name
    for line in content.split("\n")
    for name in line.replace(" ", "").split(",")
]

In [None]:
# Try the image generator on a two-word prompt and display every returned image inline.
erer = "tiger lion"


images = generate_image_with_biggan(erer)

for image in images:
  cv2_imshow(image)

In [None]:
# Animate output_0.png (presumably the first image saved by the BigGAN step — confirm).
generate_video_with_videogpt("output_0.png")

In [None]:
# Define a sample text
text = "tiger mushroom bee."

# Call the text-to-video function
video = text_to_video(text)

# Save the video to a local file
video.write_videofile("text_to_video.mp4")
