<a href="https://colab.research.google.com/github/zuzanakf/text-to-video-artist/blob/main/artist_to_video.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install diffusers transformers accelerate torch




**Setting up text-to-video diffusion model**

In [2]:
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from diffusers.utils import export_to_video

In [3]:
# Load the zeroscope text-to-video checkpoint with float16 weights.
MODEL_ID = "cerspense/zeroscope_v2_576w"
pipe = DiffusionPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.float16)

# Replace the default scheduler with a DPM-Solver multistep scheduler built
# from the configuration of the scheduler the pipeline shipped with.
default_scheduler_config = pipe.scheduler.config
pipe.scheduler = DPMSolverMultistepScheduler.from_config(default_scheduler_config)

# Turn on the pipeline's model CPU offload mode.
pipe.enable_model_cpu_offload()

text_encoder/model.safetensors not found


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

**Setting up the OpenAI call**

In [4]:
!pip install openai



In [5]:
import json

# Read the OpenAI credential from /content/OpenAI.json.
# The name starts out as an empty string and is only overwritten once the
# file has been parsed and the "OPEN_API_KEY" entry found.
OPEN_API_KEY = ''
with open('/content/OpenAI.json', 'r') as key_file:
    OPEN_API_KEY = json.load(key_file)["OPEN_API_KEY"]

In [6]:
import os
import openai

# Hand the key read from the JSON file to the openai module for the API calls below.
openai.api_key = OPEN_API_KEY

In [7]:
# Free-text inputs about the artist and the single. Each value is interpolated
# verbatim into the art-director prompt that is sent to the chat model.
artist_description = """
Big Picture is a music producer hailing from Birmingham (UK), whose passion lies in exploring the intricacies of sound and emotion.
This alternative electronic artist prefers to maintain a low profile, keeping their identity concealed in order to avoid the typical stresses of the modern music industry.
Big Picture focuses on the music itself, experimenting with AI and other new techniques to create a distinct and captivating visual identity.
Their music is a journey of discovery, evoking a sense of mystery and depth that draws listeners in without the need for persuasion.
"""

single_name = "One Fine Day"

# The single has no lyrics; this sentence is what the model is told about them.
lyrics_summary = "No lyrics in this single"

# Genre, mood and visual direction for the single.
analysis = """
Genre: Electronic Musical Style: House, Trip Hop, Ambient
Central Themes: Evokes feelings of melancholy with an uplifting sense of movement
Visual Aesthetics: Moments of reflection transitioning from a sunset through dusk to stargazing at night

"""

In [13]:
# Art-director prompt for the chat model. It interpolates the single's name,
# the lyrics summary, the mood analysis and the artist description, then asks
# for video briefs as a JSON array whose per-shot keys follow the pattern
# shot<N>_action / shot<N>_lighting / shot<N>_colour.
# Doubled braces ({{ and }}) are literal braces inside the f-string.
# NOTE(review): the example object contains a `// ...` line; JSON has no
# comment syntax, so a reply that echoes it back would not parse with json.loads.
prompt = f"""
You are a creative art director for a musician. The musician has asked you to come up with three briefs for TikTok videos promoting an upcoming single.

The name of the single is:{single_name}

This is a summary of the lyrics for each song:
{lyrics_summary}

This is an analysis of the themes and mood for the song:
{analysis}

This is a description of the artist:
{artist_description}

The following are categories which must be included in each video brief.

Title of the video
Description of the video, such as cityscape or nature avoiding people or characters as the main subject.
For each shot, provide a brief description of the main visual element, a few word description of the lighting, and a creative color description. Minimum 5 shots.

For each title, the shots MUST all have the exact same lighting and colour description, eg shot1_lighting = shot2_lighting

Return the examples as a JSON Blob, in the following format:

[
    {{
        "title": "",
        "description": "",
        "shot1_action": "Description.",
        "shot1_lighting": "Lighting description.",
        "shot1_colour": "Color description",
        "shot2_action": "...",
        "shot2_lighting": "...",
        "shot2_colour": "..."
        // ... you can continue adding other fields in the same manner
    }}
]



"""
# Conversation sent to the chat endpoint: a generic system instruction
# followed by the art-director prompt as the user turn.
system_turn = {"role": "system", "content": "You are a helpful assistant."}
user_turn = {"role": "user", "content": prompt}
message = [system_turn, user_turn]

In [14]:
# Send the conversation to the gpt-4 chat model with a temperature of 0.6.
chat_request = {
    "model": "gpt-4",
    "messages": message,
    "temperature": 0.6,
}
response = openai.ChatCompletion.create(**chat_request)

In [15]:
# Show the text of the first choice returned by the model.
first_choice = response['choices'][0]
print(first_choice['message']['content'])

[
    {
        "title": "Dusk to Night",
        "description": "A time-lapse of a cityscape transitioning from sunset to starry night.",
        "shot1_action": "The sun setting over the cityscape.",
        "shot1_lighting": "Soft orange glow of the sunset.",
        "shot1_colour": "Warm hues of sunset orange and pink.",
        "shot2_action": "City lights starting to twinkle as dusk sets in.",
        "shot2_lighting": "Soft orange glow transitioning to cool blue dusk.",
        "shot2_colour": "Mix of warm orange and cool blue hues.",
        "shot3_action": "The sky darkening, stars becoming visible.",
        "shot3_lighting": "Fading dusk light, stars starting to twinkle.",
        "shot3_colour": "Deepening blue of the night sky.",
        "shot4_action": "Full night sky with twinkling stars.",
        "shot4_lighting": "Dark with bright points of starlight.",
        "shot4_colour": "Deep midnight blue with bright white stars.",
        "shot5_action": "Cityscape under the 

In [16]:
import json

# Text of the first choice; the prompt asked for a JSON array of video briefs.
data_string = response['choices'][0]['message']['content']

# Chat models sometimes wrap the JSON in a Markdown code fence or surround it
# with prose. Keep only the outermost [...] span so json.loads sees just the
# array; for a reply that is already a bare array this is the whole string.
array_start = data_string.find('[')
array_end = data_string.rfind(']')
if array_start == -1 or array_end < array_start:
    raise ValueError("No JSON array found in the model response")
data_list = json.loads(data_string[array_start:array_end + 1])


In [17]:
# Brief to render. The preferred title comes from one particular model reply,
# so a fresh request may not contain it; fall back to the first brief in that
# case instead of failing with a bare StopIteration.
PREFERRED_TITLE = "Dusk to Night"
objects_data = next(
    (item for item in data_list if item.get("title") == PREFERRED_TITLE),
    data_list[0] if data_list else None,
)
if objects_data is None:
    raise ValueError("The model response contained no video briefs")

In [18]:


# Render one clip per shot (shot1 .. shot5) of the selected brief and collect
# the path of every exported file in `paths`.
paths = []
for shot_number in range(1, 6):
    action_key, lighting_key, colour_key = (
        f"shot{shot_number}_{field}" for field in ("action", "lighting", "colour")
    )

    # Text prompt for the video model: the shot's action, followed by the
    # lighting and colour entries labelled with their own key names.
    prompt = (
        f"Artist {objects_data[action_key]}. "
        f"'{lighting_key}': '{objects_data[lighting_key]}'. "
        f"'{colour_key}': '{objects_data[colour_key]}'"
    )

    # 24 frames at 320x576 (width x height) with 40 inference steps.
    # NOTE(review): some diffusers releases return `.frames` with a leading
    # batch dimension (needing `.frames[0]`) - confirm against the installed version.
    generation = pipe(prompt, num_inference_steps=40, height=576, width=320, num_frames=24)
    video_frames = generation.frames
    video_path = export_to_video(video_frames)

    # Report where the clip was written and keep the path for later use.
    print(f"Video for {action_key} saved at {video_path}")
    paths.append(video_path)

  0%|          | 0/40 [00:00<?, ?it/s]

Video for shot1_action saved at /tmp/tmpqnwrcu61.mp4


  0%|          | 0/40 [00:00<?, ?it/s]

Video for shot2_action saved at /tmp/tmpkdbdtffy.mp4


  0%|          | 0/40 [00:00<?, ?it/s]

Video for shot3_action saved at /tmp/tmp4gwjlkev.mp4


  0%|          | 0/40 [00:00<?, ?it/s]

Video for shot4_action saved at /tmp/tmpnfwbmp7j.mp4


  0%|          | 0/40 [00:00<?, ?it/s]

Video for shot5_action saved at /tmp/tmpbkc3nxhl.mp4


In [19]:
!pip install moviepy



In [20]:
from google.colab import files

# Prompt for a file upload, then report the name and size of each uploaded file.
uploaded = files.upload()

for name, payload in uploaded.items():
    print(f'User uploaded file "{name}" with length {len(payload)} bytes')


Saving One Fine Day.mp3 to One Fine Day (1).mp3
User uploaded file "One Fine Day (1).mp3" with length 9528923 bytes


In [21]:
from moviepy.editor import VideoFileClip, AudioFileClip, concatenate_videoclips

# 1. Load every generated clip, in the order the shots were rendered
source_clips = [VideoFileClip(clip_path) for clip_path in paths]
if not source_clips:
    raise ValueError("No generated clips to concatenate; run the rendering loop first")

# 2. Load the audio file
audio = AudioFileClip("One Fine Day.mp3")

try:
    # Join the clips end to end
    final_video = concatenate_videoclips(source_clips)

    # 3. Crop the audio to the video's duration; if the track is the shorter
    #    of the two, keep all of it rather than asking for audio past its end
    audio = audio.subclip(0, min(audio.duration, final_video.duration))

    # 4. Set the cropped audio on the concatenated video
    final_video = final_video.set_audio(audio)

    # 5. Export the final video with the new audio
    final_video.write_videofile("video_with_new_audio.mp4")
finally:
    # Release the file readers held by the clips, even if the export fails.
    for clip in source_clips:
        clip.close()
    audio.close()


Moviepy - Building video video_with_new_audio.mp4.
MoviePy - Writing audio in video_with_new_audioTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video video_with_new_audio.mp4





Moviepy - Done !
Moviepy - video ready video_with_new_audio.mp4


In [22]:
from google.colab import files
# Offer the exported video as a browser download.
output_name = 'video_with_new_audio.mp4'
files.download(output_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Unexecuted variant of the per-shot rendering loop: first three shots only,
# rendered at 576x320 (width x height). The exported paths are printed but not collected.
if objects_data:
    for shot_number in range(1, 4):  # shot1 .. shot3
        action_key, lighting_key, colour_key = (
            f"shot{shot_number}_{field}" for field in ("action", "lighting", "colour")
        )

        # Text prompt: the shot's action, then the lighting and colour entries
        # labelled with their own key names.
        prompt = (
            f"Artist {objects_data[action_key]}. "
            f"'{lighting_key}': '{objects_data[lighting_key]}'. "
            f"'{colour_key}': '{objects_data[colour_key]}'"
        )

        # Generate the frames and write them to a video file
        generation = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24)
        video_frames = generation.frames
        video_path = export_to_video(video_frames)

        print(f"Video for {action_key} saved at {video_path}")
else:
    print("Couldn't find 'Objects' in response!")

In [None]:
# Unexecuted one-off render of a hand-written prompt at 576x320 (width x height).
prompt = "Artist reaching a serene lake, releasing a paper boat symbolizing letting go.'shot5_lighting': 'Twilight, semi-darkness.'shot5_colour': 'Cool blue with streaks of silver'"
render_settings = {
    "num_inference_steps": 40,
    "height": 320,
    "width": 576,
    "num_frames": 24,
}
video_frames = pipe(prompt, **render_settings).frames
video_path = export_to_video(video_frames)