<a href="https://colab.research.google.com/github/vitchierath/NLPtasks/blob/main/ytvdosummarizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.51 (from langchain-community)
  Downloading langchain_core-0.3.51-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain<1.0.0,>=0.3.23 (from langchain-community)
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain<1.0.0,>=0.3.23->langchain-community)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Downloading langchain_community-0.3.21-py3-none-any.whl (2.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m28.3 MB/s[0m eta [36m0:

In [None]:
# 🛠 Install Required Libraries
!pip install -q transformers gradio

# ✅ Imports
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
import gradio as gr

# ✅ Load Free Vision Model
# Hugging Face model id shared by the model, image processor and tokenizer below.
vision_model_id = "nlpconnect/vit-gpt2-image-captioning"
# Encoder-decoder captioning model (the load log reports a ViT encoder and a GPT-2 decoder).
vision_model = VisionEncoderDecoderModel.from_pretrained(vision_model_id)
# Converts PIL images into the tensor inputs that describe_image() passes to generate().
vision_processor = ViTImageProcessor.from_pretrained(vision_model_id)
# Decodes the generated token ids back into caption text.
vision_tokenizer = AutoTokenizer.from_pretrained(vision_model_id)

# ✅ Image Captioning Function
def describe_image(image: Image.Image) -> str:
    """Generate a short caption for ``image`` with the ViT-GPT2 model.

    Args:
        image: PIL image supplied by the ``gr.Image(type="pil")`` input.
            ``None`` is tolerated so that submitting the form without an
            upload yields a readable message instead of a traceback.

    Returns:
        The decoded caption with surrounding whitespace removed, or a short
        prompt asking for an image when ``image`` is ``None``.
    """
    if image is None:
        return "⚠️ Please upload an image first."
    # The processor normalizes three colour channels; images with an alpha
    # channel, a palette, or a single grayscale channel are converted first.
    if image.mode != "RGB":
        image = image.convert("RGB")
    inputs = vision_processor(images=image, return_tensors="pt")
    # Beam search with 4 beams, caption capped at 64 tokens.
    output_ids = vision_model.generate(**inputs, max_length=64, num_beams=4)
    return vision_tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

# ✅ Gradio UI
# Build the captioning demo from named pieces, then start it with a public share link.
image_input = gr.Image(type="pil", label="🖼️ Upload an Image")
caption_demo = gr.Interface(
    fn=describe_image,
    inputs=image_input,
    outputs="text",
    title="🖼️ Free Image Captioning Bot",
    description="Upload an image and get an AI-generated caption using ViT-GPT2 (no login required)",
)
caption_demo.launch(share=True)


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_to

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://36d20d549f321a2f63.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [12]:
# 🛠️ Install dependencies
!pip install -q gradio youtube-transcript-api transformers

# ✅ Imports
import gradio as gr
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from transformers import MarianMTModel, MarianTokenizer, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import re

# ✅ Summarization model (Flan-T5)
# Load the tokenizer and seq2seq model for the same "google/flan-t5-base" checkpoint.
summ_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
summ_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
# Wrap both in a text2text-generation pipeline; `summarizer` is the module-level
# callable that summarize_text() invokes.
summarizer = pipeline("text2text-generation", model=summ_model, tokenizer=summ_tokenizer)

# ✅ Detect YouTube video ID (robust)
def extract_video_id(url):
    """Return the 11-character YouTube video ID contained in ``url``.

    Recognized inputs:
      * ``https://www.youtube.com/watch?v=<id>`` (``v`` anywhere in the query)
      * ``https://youtu.be/<id>``
      * ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``, ``/v/<id>``, ``/e/<id>`` paths
      * a bare 11-character ID
    The scheme may be omitted (``youtube.com/watch?v=<id>``).

    Args:
        url: URL or video ID typed by the user. May be ``None`` or empty.

    Returns:
        The video ID string, or ``None`` when no ID can be identified
        (non-YouTube host, channel/playlist page, malformed input).
    """
    # Function-scope import keeps this helper self-contained; `re` is imported by the cell.
    from urllib.parse import parse_qs, urlparse

    if not isinstance(url, str):
        return None
    candidate = url.strip()
    if not candidate:
        return None

    id_pattern = r"[0-9A-Za-z_-]{11}"
    if re.fullmatch(id_pattern, candidate):
        return candidate  # the user pasted just the ID

    # urlparse only populates the host when a scheme is present.
    if "://" not in candidate:
        candidate = "https://" + candidate.lstrip("/")
    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Raised for malformed authority sections (e.g. unbalanced IPv6 brackets).
        return None

    def _valid(value):
        # Accept only exact 11-character IDs; anything else is a different kind of path.
        return value if value and re.fullmatch(id_pattern, value) else None

    segments = [part for part in parsed.path.split("/") if part]

    # Short links carry the ID as the first path segment.
    if host == "youtu.be" or host.endswith(".youtu.be"):
        return _valid(segments[0]) if segments else None

    youtube_hosts = ("youtube.com", "youtube-nocookie.com")
    if not any(host == known or host.endswith("." + known) for known in youtube_hosts):
        return None

    # Standard watch URLs: the ID is the `v` query parameter, in any position.
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        return _valid(query_ids[0])

    # Path-style URLs: /embed/<id>, /shorts/<id>, /live/<id>, /v/<id>, /e/<id>.
    if len(segments) >= 2 and segments[0] in {"embed", "shorts", "live", "v", "e"}:
        return _valid(segments[1])
    return None

# ✅ Step 1: Fetch Transcript
def fetch_transcript(video_url):
    """Download the transcript of a YouTube video as one plain-text string.

    Args:
        video_url: URL (or anything ``extract_video_id`` accepts) of the video.

    Returns:
        The transcript segments joined with single spaces, or ``None`` when
        the video ID cannot be parsed, transcripts are disabled, or the
        request fails for any other reason. The reason is printed so the
        failure is visible in the notebook output instead of being lost.
    """
    video_id = extract_video_id(video_url)
    if not video_id:
        # Avoid calling the API with an ID that could not be parsed.
        print(f"fetch_transcript: could not find a video ID in {video_url!r}")
        return None

    try:
        if hasattr(YouTubeTranscriptApi, "fetch"):
            # Instance-based API of newer youtube-transcript-api releases:
            # iterating the result yields snippets exposing `.text`.
            segments = [snippet.text for snippet in YouTubeTranscriptApi().fetch(video_id)]
        else:
            # Legacy static helper returning a list of dicts with a 'text' key.
            segments = [item['text'] for item in YouTubeTranscriptApi.get_transcript(video_id)]
    except TranscriptsDisabled:
        print(f"fetch_transcript: transcripts are disabled for video {video_id}")
        return None
    except Exception as exc:
        # Best-effort by design (callers treat None as "unavailable"), but
        # report what went wrong rather than discarding it.
        print(f"fetch_transcript: failed to fetch transcript for {video_id}: {exc!r}")
        return None

    return " ".join(segments)

# ✅ Step 2: Translate (optional)
# Marian (tokenizer, model) pairs that were already loaded, keyed by language
# code, so repeated requests for one language do not reload the checkpoint.
_TRANSLATION_MODELS = {}


def translate_text(text, target_lang):
    """Translate English ``text`` into ``target_lang`` with a Helsinki-NLP Marian model.

    Args:
        text: English text to translate.
        target_lang: Target language code such as ``"fr"`` or ``"de"``.
            Surrounding whitespace and letter case are ignored; a blank or
            missing code is treated like ``"en"``.

    Returns:
        ``text`` unchanged when it is empty or the target is English;
        otherwise the translated text, or a warning message when the model
        for the requested language cannot be loaded or run. Input longer
        than 512 tokens is truncated before translation.
    """
    lang = str(target_lang or "").strip().lower()
    if not text or lang in ("", "en"):
        return text  # No need to translate
    try:
        if lang not in _TRANSLATION_MODELS:
            model_name = f"Helsinki-NLP/opus-mt-en-{lang}"
            _TRANSLATION_MODELS[lang] = (
                MarianTokenizer.from_pretrained(model_name),
                MarianMTModel.from_pretrained(model_name),
            )
        tokenizer, model = _TRANSLATION_MODELS[lang]
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        translated = model.generate(**inputs)
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as exc:
        # `except Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; the cause is printed for debugging.
        print(f"translate_text: translation to {lang!r} failed: {exc!r}")
        return "⚠️ Translation failed. Language may not be supported."

# ✅ Step 3: Summarize
def summarize_text(text, chunk_words=350):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Text longer than ``chunk_words`` words is split into word-based chunks,
    each chunk is summarized on its own, and the partial summaries are then
    summarized once more into the final result. Sending a whole transcript
    in a single pass would exceed the model's input limit.

    Args:
        text: Text to summarize.
        chunk_words: Maximum number of whitespace-separated words per chunk.
            The default is a conservative guess meant to keep a chunk within
            the model's input limit — TODO confirm against the tokenizer.

    Returns:
        The generated summary, or an empty string when ``text`` contains no
        words.
    """
    words = (text or "").split()
    if not words:
        return ""
    chunk_words = max(1, int(chunk_words))

    def _summarize(passage):
        # truncation=True is a safety net for a passage that still tokenizes
        # past the model's maximum input length.
        result = summarizer("summarize: " + passage, max_new_tokens=200, truncation=True)
        return result[0]['generated_text']

    if len(words) <= chunk_words:
        return _summarize(text)

    chunks = [
        " ".join(words[start:start + chunk_words])
        for start in range(0, len(words), chunk_words)
    ]
    partial_summaries = [_summarize(chunk) for chunk in chunks]
    # Reduce step: condense the per-chunk summaries into one final summary.
    return _summarize(" ".join(partial_summaries))

# ✅ Gradio Handler
def process_youtube_summary(video_url, target_lang):
    """Gradio handler: fetch a video's transcript, summarize it, optionally translate.

    Args:
        video_url: YouTube URL entered by the user.
        target_lang: Language code for the final summary. Whitespace and
            letter case are ignored; a blank value means English, so an
            empty field returns the summary rather than a translation error.

    Returns:
        The summary in the requested language, or an error message when the
        transcript cannot be fetched.
    """
    lang = str(target_lang or "").strip().lower() or "en"

    transcript = fetch_transcript(str(video_url or "").strip())
    if not transcript:
        return "❌ Failed to fetch transcript. It may be disabled or unavailable."

    summary_en = summarize_text(transcript)
    if lang == "en":
        return summary_en
    return translate_text(summary_en, lang)

# ✅ Gradio UI
gr.Interface(
    fn=process_youtube_summary,
    inputs=[
        gr.Textbox(label="📺 YouTube Video URL"),
        # Pre-fill "en" so submitting without touching this field returns the
        # English summary instead of attempting a translation with a blank code.
        gr.Textbox(label="🌐 Target Language Code (e.g. en, fr, es, hi, de)", value="en")
    ],
    outputs="text",
    title="🎬 YouTube Video Summarizer & Translator",
    description="Paste a YouTube URL and get a summary in your preferred language using FLAN-T5 and Helsinki translation!"
).launch(share=True)


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m1.7/2.2 MB[0m [31m24.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.2/2.2 MB[0m [31m26.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h

Device set to use cpu


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a2c0d3df92ba7abb45.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


