# YouTube Data Collector V2

**Released in May 2023**

## Functionalities 

1. Download YouTube Videos in .mp4 format from URL
2. Converts the video to HD frames (images in JPG format)
3. Speech recognition from the video (speech to text).
4. Summarise the speech in bullet points using ChatGPT (V3 Upcoming Feature)

### Pre-Condition for local speech converter model:
Download the model file from: 
Place the downloaded speech-to-text model in the folder: 

In [None]:
# importing the module

from pytube import YouTube  #fordownloading video from a YouTube url
import cv2  #Function to convert video to frames


import speech_recognition as sr #For Speach recogntion from the video

from moviepy.editor import * #Helper supprt linraries for extracting audio from video
import vosk #For Voice to Speech SDK

In [None]:
# This is a reference function to create a clean filename from the YouTube video title
# Code Source from GitHub Url: https://gist.github.com/wassname/1393c4a57cfcbf03641dbc31886123b8

import unicodedata
import string

valid_filename_chars = "-_.() " + string.ascii_letters + string.digits
char_limit = 255

# Helper function to handle file names from the YouTube URL.
def clean_filename(filename, whitelist=valid_filename_chars, replace=' '):
    """Return ``filename`` reduced to whitelisted ASCII characters.

    Args:
        filename: Raw name to clean (for example a video title).
        whitelist: Container of characters allowed in the result.
        replace: Characters that are each turned into a single space
            before filtering.

    Returns:
        The cleaned name, cut to at most ``char_limit`` characters. A
        warning is printed when the name had to be cut.
    """
    # Turn every character listed in ``replace`` into a plain space.
    for unwanted in replace:
        filename = filename.replace(unwanted, ' ')

    # Decompose accented characters, then drop whatever is not ASCII.
    decomposed = unicodedata.normalize('NFKD', filename)
    ascii_text = decomposed.encode('ASCII', 'ignore').decode()

    # Filter out everything that is not on the whitelist.
    kept = [ch for ch in ascii_text if ch in whitelist]
    result = ''.join(kept)

    if len(result) > char_limit:
        print("Warning, filename truncated because it was over {}. Filenames may no longer be unique".format(char_limit))
    return result[:char_limit]

# Helper function to save text to a text file from python
def save_text_to_file(text, filename):
    """Write ``text`` to ``filename``, replacing any existing content.

    The file is written as UTF-8 rather than the platform default
    encoding, so text containing non-ASCII characters does not fail on
    systems whose default code page cannot represent them.

    Args:
        text: String to write.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)

## YouTube Video to Frames Function

In [None]:
def convert_yt_video_to_frames(video_url, fps=10):
    """Download a YouTube video as .mp4 and save sampled frames as JPEGs.

    The video is stored in ``./<cleaned title>/<cleaned title>.mp4`` and the
    frames are written next to it as ``frame<N>.jpg``, where ``N`` is the
    index of the frame in the video.

    Args:
        video_url: URL of the YouTube video.
        fps: Sampling stride, not a frame rate: every ``fps``-th frame that
            is read gets written to disk. Must be greater than zero.

    Returns:
        None. Progress and errors are printed; any exception raised while
        processing is caught and reported instead of being propagated.
    """
    try:
        # Reject an unusable stride before downloading anything; a stride of
        # zero would otherwise only fail once the video is already on disk.
        if fps <= 0:
            raise ValueError(f"fps must be greater than zero, got {fps!r}")

        # PyTube object is instantiated
        yt = YouTube(video_url)
        print(yt.title)

        # The cleaned title names both the output folder and the video file.
        folder_name = clean_filename(yt.title)
        video_file_name = folder_name + '.mp4'
        print('Folder Name: ' + folder_name)
        print('Video File Name: ' + video_file_name)

        video_destination = './' + folder_name

        # Prefer a 1080p mp4 stream; not every video offers one, so fall
        # back to the best progressive stream instead of failing on [0].
        stream = yt.streams.filter(file_extension='mp4', resolution='1080p').first()
        if stream is None:
            print('No 1080p mp4 stream found; using the highest resolution available instead.')
            stream = yt.streams.get_highest_resolution()
        if stream is None:
            raise ValueError('No downloadable mp4 stream found for this video')

        # download() returns the path of the file it wrote.
        video_path = stream.download(output_path=video_destination, filename=video_file_name)

        vidcap = cv2.VideoCapture(video_path)
        try:
            if not vidcap.isOpened():
                raise IOError(f"Could not open the downloaded video: {video_path}")

            # NOTE(review): kept from the original code; frame selection is
            # done by the modulo test below — confirm whether this property
            # has any effect when reading from a file.
            vidcap.set(cv2.CAP_PROP_FPS, fps)

            # Walk through the video frame by frame.
            success, image = vidcap.read()
            count = 0
            while success:
                if count % fps == 0:
                    # save frame as JPEG file
                    cv2.imwrite(f"{video_destination}/frame{count}.jpg", image)

                success, image = vidcap.read()
                print('Read a new frame: ', success)
                count += 1
        finally:
            # Always free the capture handle, even when reading fails.
            vidcap.release()

        print('Freames converted Sucessfully..!')
        return None

    except Exception as e:
        print(f"Error occurred while converting video to frames: {str(e)}")


### Example Usage

In [None]:
# Example usage
video_url= 'https://www.youtube.com/watch?v=cZKcCG26VRI' # Replace this with the YouTube URL you want to convert
fps = 50 # Optional input: every 50th frame is saved (the function's default is 10).
convert_yt_video_to_frames(video_url, fps=fps)

## YouTube Video Speech to Text function

In [None]:
# Function to convert YouTube video to text
def convert_video_to_text(video_url, model_folder):
    """Transcribe the speech of a YouTube video with a local Vosk model.

    The video is downloaded with pytube's default location and file name,
    its audio track is exported to ``./<cleaned title>/<cleaned title>.wav``
    and the recognition result is saved next to it as
    ``<cleaned title>.txt``.

    Args:
        video_url: URL of the YouTube video.
        model_folder: Path of the folder containing the Vosk model.

    Returns:
        The recognizer's final result string on success (Vosk reports
        results as JSON text), or None when an error occurred. Errors are
        printed instead of being propagated.
    """
    import os  # local import: only needed here to create the output folder

    try:
        # PyTube object is instantiated
        yt = YouTube(video_url)
        print(yt.title)

        # The cleaned title names the output folder and the files inside it.
        folder_name = clean_filename(yt.title)
        video_file_name = folder_name + '.mp4'

        print('Folder Name: ' + folder_name)
        print('Video File Name: ' + video_file_name)

        # The .wav and .txt files are written into this folder, so it has to
        # exist even when no frames were extracted for this video before.
        output_dir = './' + folder_name
        os.makedirs(output_dir, exist_ok=True)

        # Download the video for processing.
        stream = yt.streams.get_highest_resolution()
        if stream is None:
            raise ValueError('No downloadable stream found for this video')
        video = stream.download()

        # Convert the video file to an audio file in WAV format
        audio_file = output_dir + '/' + folder_name + ".wav"
        video_clip = VideoFileClip(video)
        try:
            audio_clip = video_clip.audio
            if audio_clip is None:
                raise ValueError('The downloaded video has no audio track')
            audio_clip.write_audiofile(audio_file)
        finally:
            # Release the reader held on the video file.
            video_clip.close()

        # Read the entire audio file into memory
        r = sr.Recognizer()
        with sr.AudioFile(audio_file) as source:
            audio = r.record(source)

        # Convert speech to text using Vosk. The recognizer is fed raw
        # 16-bit samples; the WAV container header is not audio data.
        model = vosk.Model(model_folder)
        recognizer = vosk.KaldiRecognizer(model, audio.sample_rate)
        recognizer.AcceptWaveform(audio.get_raw_data(convert_width=2))

        text = recognizer.FinalResult()

        # Save the text to the disk (same folder as the audio file)
        txt_file_name = output_dir + '/' + folder_name + '.txt'
        save_text_to_file(text, txt_file_name)

        print('Video to Text Convetsion is sucessfull..!')
        # Hand the transcript back so callers can use the result directly.
        return text

    except Exception as e:
        print(f"Error occurred while converting video to text: {str(e)}")



### Example Usage

In [None]:
# Example usage (reuses the video_url defined in the frames example cell)

model_folder = "C:/Users/User_Name/Desktop/vosk-model-small-en-us-0.15/vosk-model-small-en-us-0.15" 
# Change this to the location of the Vosk model folder on your computer
conversation_text = convert_video_to_text(video_url, model_folder) # The transcript is saved as a .txt file in the folder named after the video

# Show the value returned by the function
print(conversation_text)

*[ --- End of the Code ---]*