In [1]:
from moviepy.editor import VideoFileClip, AudioFileClip
import torch
from transformers import pipeline
import cv2
import tempfile
import os
import shutil

# Source video to caption. It is read twice below: once to extract the audio
# track and once frame-by-frame to draw the captions.
# NOTE(review): hard-coded absolute local path - adjust before running elsewhere.
video_input_path = r"D:\backend\instagram_vedio_caption_generator\test.mp4"


def extract_audio_from_video(input_video_path, output_audio_path=None):
    """Extract the audio track of a video into an MP3 file.

    Args:
        input_video_path: Path of the video file to read.
        output_audio_path: Destination of the audio file. When ``None`` the
            audio is written to ``temp_audio.mp3`` inside a freshly created
            temporary directory.

    Returns:
        The path of the written audio file, or ``None`` when extraction
        failed. Failures are printed, not raised, so callers must check the
        return value.
    """
    video_clip = None
    try:
        # Load the video clip
        video_clip = VideoFileClip(input_video_path)

        # A clip without an audio track exposes ``audio`` as None; report that
        # clearly instead of failing later with an AttributeError.
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {input_video_path}")

        # Only fall back to a temporary location when the caller gave no path.
        if output_audio_path is None:
            output_audio_path = os.path.join(tempfile.mkdtemp(), "temp_audio.mp3")

        audio_clip.write_audiofile(output_audio_path, codec='mp3')

        print(f"Audio extracted successfully: {output_audio_path}")

        return output_audio_path

    except Exception as e:
        print(f"Error: {e}")
        return None

    finally:
        # Release the reader held by the clip, whether or not extraction worked.
        if video_clip is not None:
            video_clip.close()


audio_temp_path = extract_audio_from_video(video_input_path, None)

# extract_audio_from_video reports failure by returning None. Stop here rather
# than loading the speech model and handing None to the pipeline.
if audio_temp_path is None:
    raise RuntimeError(f"Could not extract audio from {video_input_path}")

# Set up automatic speech recognition pipeline (GPU when available, else CPU)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
asr_pipeline = pipeline(
  "automatic-speech-recognition",
  model='openai/whisper-large-v3',
  chunk_length_s=30,
  device=device,
)

# Perform speech recognition on the audio. return_timestamps=True is needed
# because the captioning step below reads speech_prediction['chunks'].
speech_prediction = asr_pipeline(audio_temp_path, batch_size=8, return_timestamps=True)

# Function to get caption based on timestamp
def get_caption(timestamp, speech_chunks):
    """Return the transcript text that covers ``timestamp``.

    Args:
        timestamp: Position in the video, in seconds.
        speech_chunks: Iterable of dicts, each holding a ``'timestamp'`` entry
            with a ``(start, end)`` pair and a ``'text'`` entry.

    Returns:
        The text of the first chunk with ``start <= timestamp < end``, or an
        empty string when no chunk covers the timestamp.
    """
    for chunk in speech_chunks:
        start, end = chunk['timestamp']
        # A bound may be None when the recogniser could not determine it
        # (typically the end of the last chunk). Treat it as open-ended rather
        # than letting the comparison raise TypeError.
        if start is None:
            start = float('-inf')
        if end is None:
            end = float('inf')
        if start <= timestamp < end:
            return chunk['text']
    return ''  # No caption found for the given timestamp

cap = cv2.VideoCapture(video_input_path)

# An unopened capture would report zero fps/size and produce an empty output
# file, so fail loudly instead of only printing a message.
if not cap.isOpened():
    cap.release()
    raise IOError(f"Error opening video file: {video_input_path}")

# The captioned (still silent) video goes into its own temporary directory.
output_video_path = os.path.join(tempfile.mkdtemp(), "temp_output.mp4")

video_writer = None
try:
    # Get video properties
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the codec and create VideoWriter object
    video_codec = cv2.VideoWriter_fourcc(*'mp4v')  # You can change the codec as needed
    video_writer = cv2.VideoWriter(output_video_path, video_codec, fps, (width, height))
    if not video_writer.isOpened():
        raise IOError(f"Could not open video writer for {output_video_path}")

    # Look the chunk list up once instead of on every frame.
    speech_chunks = speech_prediction['chunks']

    while cap.isOpened():
        ret, frame = cap.read()

        if not ret:
            print("End of video")
            break

        # CAP_PROP_POS_MSEC is in milliseconds; captions are indexed in seconds.
        current_timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
        caption = get_caption(current_timestamp, speech_chunks)

        # Add caption to frame (white text, top-left corner)
        cv2.putText(frame, caption, (50, 50), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        # Save the processed frame to the output video file
        video_writer.write(frame)
finally:
    # Release capture and writer even if a frame fails part-way through, so the
    # output file is finalised and the input file handle is freed.
    # No windows are created in this cell, so there is nothing to destroy.
    cap.release()
    if video_writer is not None:
        video_writer.release()

# Merge audio and video files
def merge_audio_video(audio_path, video_path, output_path):
    """Attach an audio file to a video and write the result to a temp file.

    Args:
        audio_path: Path of the audio file to use as the soundtrack.
        video_path: Path of the video file that supplies the frames.
        output_path: Currently unused; kept so existing callers keep working.
            The merged video is always written to a temporary location.

    Returns:
        Path of ``temp_merged_output.mp4`` inside a freshly created temporary
        directory. The caller is responsible for moving or deleting it.
    """
    # Load the video clip
    video_clip = VideoFileClip(video_path)
    try:
        # Load the audio clip
        audio_clip = AudioFileClip(audio_path)
        try:
            # set_audio returns a new clip, so keep it under its own name
            merged_clip = video_clip.set_audio(audio_clip)

            # Write the result to a temporary file
            temp_output_path = os.path.join(tempfile.mkdtemp(), "temp_merged_output.mp4")
            merged_clip.write_videofile(temp_output_path, codec="libx264", audio_codec="aac")
        finally:
            # Close the readers so the input files can be deleted afterwards.
            audio_clip.close()
    finally:
        video_clip.close()

    return temp_output_path

# Merge the captioned frames with the original audio
final_output_path = r"D:\backend\instagram_vedio_caption_generator\output1.mp4"
merged_video_temp_path = merge_audio_video(audio_temp_path, output_video_path, final_output_path)

# Move the result into place first, so a cleanup problem cannot lose it.
saved_output_path = shutil.move(merged_video_temp_path, final_output_path)

# Clean up temporary files. Each one was written into its own
# tempfile.mkdtemp() directory, so remove the directory, not just the file.
for temp_file_path in (audio_temp_path, output_video_path, merged_video_temp_path):
    temp_dir = os.path.dirname(temp_file_path)
    try:
        shutil.rmtree(temp_dir)
    except OSError as cleanup_error:
        print(f"Warning: could not remove temporary directory {temp_dir}: {cleanup_error}")

# Keep the destination path as the cell's displayed result
saved_output_path



MoviePy - Writing audio in C:\Users\umang\AppData\Local\Temp\tmpw4b2frnq\temp_audio.mp3


                                                                   

MoviePy - Done.
Audio extracted successfully: C:\Users\umang\AppData\Local\Temp\tmpw4b2frnq\temp_audio.mp3


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


End of video
Moviepy - Building video C:\Users\umang\AppData\Local\Temp\tmpq9pisk5s\temp_merged_output.mp4.
MoviePy - Writing audio in temp_merged_outputTEMP_MPY_wvf_snd.mp4


                                                                    

MoviePy - Done.
Moviepy - Writing video C:\Users\umang\AppData\Local\Temp\tmpq9pisk5s\temp_merged_output.mp4



                                                              

Moviepy - Done !
Moviepy - video ready C:\Users\umang\AppData\Local\Temp\tmpq9pisk5s\temp_merged_output.mp4


'D:\\backend\\instagram_vedio_caption_generator\\output1.mp4'