<a href="https://colab.research.google.com/github/whygit-dot/machine-learning/blob/main/YoutubeSummarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
# Install necessary libraries
!pip install spacy gradio youtube-transcript-api faiss-cpu

# Importing necessary libraries
import spacy
import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi
import faiss
import numpy as np

# Load spaCy's pre-trained small English pipeline once at module level.
# The shared `nlp` object is reused by the functions below both to split the
# transcript into sentences (`doc.sents`) and to obtain `.vector` embeddings
# for sentences and queries.
nlp = spacy.load('en_core_web_sm')

# Function to extract transcript from YouTube video
def _extract_video_id(video_url):
    """Return the YouTube video ID contained in *video_url*, or "" if blank.

    Handles ``watch?v=<id>`` links (with any extra query parameters such as
    ``&t=`` or ``&list=``), ``youtu.be/<id>`` short links, and
    ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>`` paths.
    Anything else (e.g. a bare video ID) falls back to the text after the
    last ``v=`` with trailing ``&...`` / ``#...`` removed.
    """
    from urllib.parse import parse_qs, urlparse

    candidate = (video_url or "").strip()
    if not candidate:
        return ""

    # urlparse only recognises the host part when a scheme is present.
    parsed = urlparse(candidate if "://" in candidate else "https://" + candidate)

    # Standard watch URL: the ID is the value of the "v" query parameter.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return query_ids[0].strip()

    host = (parsed.hostname or "").lower()
    segments = [part for part in parsed.path.split("/") if part]

    # Short link: https://youtu.be/<id>
    if (host == "youtu.be" or host.endswith(".youtu.be")) and segments:
        return segments[0]

    # Path-style links: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>
    if len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
        return segments[1]

    # Fallback keeps the previous behaviour (text after the last "v="),
    # minus any trailing parameters or fragment.
    return candidate.split("v=")[-1].split("&")[0].split("#")[0].strip()


def get_video_transcript(video_url):
    """Fetch the transcript of a YouTube video as one space-joined string.

    Args:
        video_url: A YouTube URL (watch, youtu.be, embed, shorts, ...) or a
            bare video ID.

    Returns:
        The text of every transcript entry joined with single spaces.

    Raises:
        ValueError: If no video ID can be extracted from *video_url*.
        Exception: Whatever ``youtube_transcript_api`` raises when the
            transcript cannot be retrieved is propagated unchanged.
    """
    video_id = _extract_video_id(video_url)
    if not video_id:
        raise ValueError("Could not extract a YouTube video ID from the given URL.")

    if hasattr(YouTubeTranscriptApi, "get_transcript"):
        # Static API: returns a list of dicts with a 'text' key.
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
        return " ".join(item['text'] for item in transcript)

    # NOTE(review): newer youtube-transcript-api releases drop the static
    # `get_transcript` in favour of an instance `fetch()` whose snippets
    # expose `.text` -- confirm against the installed version.
    fetched = YouTubeTranscriptApi().fetch(video_id)
    return " ".join(snippet.text for snippet in fetched)

# Function to build a simple retrieval system using FAISS
def build_faiss_index(text):
    """Split *text* into sentences and index their spaCy vectors with FAISS.

    Args:
        text: The full transcript text.

    Returns:
        A ``(index, sentences)`` tuple: a ``faiss.IndexFlatL2`` holding one
        vector per sentence, and the list of sentence strings in the same
        order, so row ``i`` of the index corresponds to ``sentences[i]``.

    Raises:
        ValueError: If *text* is empty/blank or yields no usable sentences
            (previously this surfaced as an ``IndexError`` on ``shape[1]``).
    """
    if not text or not text.strip():
        raise ValueError("Cannot build an index from an empty transcript.")

    # Process text into sentences, ignoring whitespace-only spans.
    doc = nlp(text)
    sentences = [sent.text for sent in doc.sents if sent.text.strip()]
    if not sentences:
        raise ValueError("No sentences could be extracted from the transcript.")

    # Each sentence is embedded as its own document. FAISS expects a
    # C-contiguous float32 matrix of shape (n_sentences, dim).
    sentence_vectors = np.ascontiguousarray(
        [nlp(sentence).vector for sentence in sentences], dtype=np.float32
    )

    # Exact (brute-force) L2 index sized to the embedding dimension.
    index = faiss.IndexFlatL2(sentence_vectors.shape[1])
    index.add(sentence_vectors)

    return index, sentences

# Function to retrieve relevant sentences using FAISS
def retrieve_relevant_sentences(query, index, sentences, k=5):
    """Return the sentences whose vectors are closest to the query vector.

    Args:
        query: Free-text query; embedded with ``nlp(query).vector``.
        index: FAISS index whose rows correspond to *sentences*.
        sentences: Sentence strings in the same order as the index rows.
        k: Maximum number of sentences to return (default 5).

    Returns:
        Up to *k* sentences, in the order FAISS returns them. Fewer are
        returned when the index holds fewer than *k* vectors; an empty list
        when there is nothing to search.

    Raises:
        ValueError: If *query* is empty or blank.
    """
    if not sentences or k <= 0:
        return []
    if not query or not query.strip():
        raise ValueError("Query must not be empty.")

    # Never ask for more neighbours than exist: FAISS pads missing results
    # with label -1, which would otherwise index `sentences[-1]`.
    top_k = min(k, index.ntotal, len(sentences))
    if top_k <= 0:
        return []

    # FAISS expects a float32 matrix of shape (n_queries, dim).
    query_vector = np.asarray(nlp(query).vector, dtype=np.float32).reshape(1, -1)

    _, indices = index.search(query_vector, k=top_k)

    # Defensive filter in case any padding label (-1) still appears.
    return [sentences[i] for i in indices[0] if 0 <= i < len(sentences)]

# Function to generate summary using simple text concatenation (since no model is used here)
def generate_summary(relevant_sentences):
    """Join the retrieved sentences into a single space-separated string.

    No summarization model is involved; the "summary" is simply the given
    sentences concatenated in the order received.
    """
    separator = " "
    summary = separator.join(relevant_sentences)
    return summary

# Summarizer function for Gradio interface
def summarize_video(url, query):
    """Produce a query-focused extract of a YouTube video's transcript.

    Pipeline: fetch the transcript for *url*, index its sentences with
    FAISS, look up the sentences closest to *query*, and join them.

    Args:
        url: YouTube video URL.
        query: Text describing what the user is interested in.

    Returns:
        The retrieved sentences joined into one string.
    """
    # Transcript text -> (FAISS index, sentence list)
    faiss_index, transcript_sentences = build_faiss_index(get_video_transcript(url))

    # Nearest sentences to the query, then concatenated as the "summary".
    matches = retrieve_relevant_sentences(query, faiss_index, transcript_sentences)
    return generate_summary(matches)

# Gradio interface
# Two text inputs map positionally onto summarize_video(url, query);
# the returned string is shown in a single text output.
interface = gr.Interface(
    fn=summarize_video,
    inputs=["text", "text"],
    outputs="text",
    title="YouTube Video Summarizer",
    description="Enter a YouTube video URL and a query to get the summarized content of the video.",
)

# Start the web app (runs as a side effect of executing this cell).
interface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1a04e66f368091cec7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


