In [None]:
from langchain_community.document_loaders import YoutubeLoader

# Fetch a YouTube transcript through LangChain's YoutubeLoader and print it.
# NOTE(review): `video_id` is never read below, and the URL handed to the
# loader carries a different video ID — confirm which video is intended.
video_id = "5MgBikgcWnY"  # TEDx video
docs = []
try:
    loader = YoutubeLoader.from_youtube_url(
        "https://www.youtube.com/watch?v=QsYGlZkevEg",
        add_video_info=False,
    )
    docs = loader.load()
    print(docs)
except Exception as load_error:
    # Report the failure instead of aborting the cell; `docs` stays empty.
    print("❌ Failed to load transcript.")
    print(f"{type(load_error).__name__}:", load_error)


In [None]:
from huggingface_hub import InferenceClient
import os
from dotenv import load_dotenv
load_dotenv()

# Hugging Face access token, taken from the HF_TOKEN environment variable
# (None when the variable is not set).
token = os.environ.get("HF_TOKEN")

# Model queried through the chat-completion endpoint.
model_id = "HuggingFaceH4/zephyr-7b-beta"  # Example

client = InferenceClient(token=token, model=model_id)

# OpenAI-style conversation: a system prompt followed by the user request.
messages = [
    {"role": "system", "content": "You are a helpful coding assistant."},
    {"role": "user", "content": "Write a Python function to reverse a list."},
]

# Single chat-completion request with a short, low-temperature answer.
response = client.chat_completion(messages=messages, temperature=0.3, max_tokens=200)

print("✅ Chat Response:\n", response.choices[0].message["content"])


In [None]:
# ───────────────────────────────────────────────
# 1. Imports and Environment Setup
# ───────────────────────────────────────────────
import os
from dotenv import load_dotenv
load_dotenv()

from crewai import Agent, Task, Crew, Process, LLM
from crewai_tools import YoutubeChannelSearchTool

# Prevent any OpenAI fallback errors.
# NOTE(review): this unconditionally overwrites OPENAI_API_KEY for the current
# process, including a real key if one was loaded from .env — confirm intended.
os.environ["OPENAI_API_KEY"] = "DUMMY"  # Safety fallback

# ───────────────────────────────────────────────
# 2. Configure Groq LLM
# ───────────────────────────────────────────────
# The API key is read from the GROQ_API_KEY environment variable.
llm = LLM(
    model="llama3-8b-8192",  # Fast and supported
    base_url="https://api.groq.com/openai/v1",
    api_key=os.getenv("GROQ_API_KEY")
)

# ───────────────────────────────────────────────
# 3. Set up YouTube Tool (force using Groq LLM)
# ───────────────────────────────────────────────
# NOTE(review): confirm that YoutubeChannelSearchTool accepts an `llm` keyword
# directly in the installed crewai_tools version.
yt_tool = YoutubeChannelSearchTool(
    youtube_channel_handle="@IBMTechnology",
    llm=llm  # 👈 inject your Groq LLM to avoid OpenAI fallback
)

# ───────────────────────────────────────────────
# 4. Define Agents
# ───────────────────────────────────────────────
# The researcher is the only agent given the YouTube search tool.
researcher = Agent(
    role="Video Summarizer",
    goal="Summarize insights from a YouTube video",
    backstory="Expert at analyzing YouTube educational content and summarizing key takeaways.",
    tools=[yt_tool],
    verbose=True,
    llm=llm
)

# The writer has no tools; it is assigned the blog-writing task below.
writer = Agent(
    role="Blog Writer",
    goal="Write a blog post from a video summary",
    backstory="An AI blogger who transforms summaries into engaging articles.",
    verbose=True,
    llm=llm
)

# ───────────────────────────────────────────────
# 5. Define Tasks
# ───────────────────────────────────────────────
# '{topic}' is a placeholder; a "topic" value is supplied via kickoff(inputs=...) below.
research_task = Task(
    description=(
        "Use the YouTube tool to find a video about '{topic}'. "
        "Extract and summarize the video content."
    ),
    expected_output="A clear, concise summary of the selected video.",
    tools=[yt_tool],
    agent=researcher
)

# NOTE(review): the stated audience ("defense and geopolitics") does not match
# the LangChain topic passed to kickoff() below — possibly a leftover; confirm.
write_task = Task(
    description=(
        "Write a 500-word blog post from the summary. Use a clear, journalistic tone with strong structure "
        "and simple explanations, aimed at general readers interested in defense and geopolitics."
    ),
    expected_output="A titled blog post with intro, body, and conclusion.",
    agent=writer
)

# ───────────────────────────────────────────────
# 6. Create and Run Crew
# ───────────────────────────────────────────────
# Tasks are listed research first, then writing, under Process.sequential.
# NOTE(review): confirm that Crew accepts `llm` and an embedder provider of
# "none" in the installed crewai version.
crew = Crew(
    agents=[researcher, writer],
    tasks=[research_task, write_task],
    process=Process.sequential,
    memory=False,
    cache=False,
    embedder={"provider": "none"},
    llm=llm,
    verbose=True
)

# Start the crew, passing the value for the '{topic}' placeholder.
result = crew.kickoff(inputs={"topic": "What is LangChain?"})
print("\n📄 Final Blog Output:\n")
print(result)


In [None]:
!pip install yt-dlp youtube-transcript-api crewai crewai_tools langchain_groq python-dotenv tiktoken requests tqdm pytube

In [None]:
# ───────────────────────────────────────────────
# 0. Setup – environment keys & imports
# ───────────────────────────────────────────────
import os, time, subprocess, tempfile
from pathlib import Path
from urllib.parse import urlparse, parse_qs
from dotenv import load_dotenv
from crewai import Agent, Task, Crew, Process, LLM
import requests
import re

load_dotenv()

# ───────────────────────────────────────────────
# 1. Helpers: video ID & shell
# ───────────────────────────────────────────────
def extract_video_id(url):
    """Return the YouTube video ID contained in ``url``, or ``""`` if none is found.

    Recognised URL shapes:
      * ``...?v=<id>`` on any host (the ``v`` query parameter always wins,
        which is the only form the previous implementation understood)
      * ``https://youtu.be/<id>``
      * ``https://<youtube host>/{shorts,embed,live,v}/<id>``

    Args:
        url: Video URL as a string.

    Returns:
        The video ID string, or an empty string when the URL carries none.
    """
    parsed = urlparse(url)

    # Classic watch URL: the ID is the ``v`` query parameter.
    query_id = parse_qs(parsed.query).get("v", [""])[0]
    if query_id:
        return query_id

    host = (parsed.hostname or "").lower()
    segments = [part for part in parsed.path.split("/") if part]

    # Short links put the ID directly in the first path segment.
    if host == "youtu.be":
        return segments[0] if segments else ""

    # Path-based forms are only trusted on YouTube's own hosts.
    youtube_hosts = ("youtube.com", "youtube-nocookie.com")
    on_youtube = any(host == h or host.endswith("." + h) for h in youtube_hosts)
    if on_youtube and len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
        return segments[1]

    return ""

def run_cmd(cmd):
    """Run ``cmd`` as a subprocess and return its stripped standard output.

    Args:
        cmd: Command and arguments, passed straight to ``subprocess.run``.

    Returns:
        Captured stdout with surrounding whitespace removed.

    Raises:
        RuntimeError: If the process exits with a non-zero status. The message
            is the stripped stderr, or "Command failed" when stderr is empty.
    """
    completed = subprocess.run(cmd, capture_output=True, text=True)
    if completed.returncode == 0:
        return completed.stdout.strip()

    error_text = completed.stderr.strip()
    raise RuntimeError(error_text if error_text else "Command failed")

# ───────────────────────────────────────────────
# 2. Download captions or fallback to audio
# ───────────────────────────────────────────────
def fetch_captions_or_audio(video_url, work_dir):
    """Download English captions for a video, falling back to its audio track.

    yt-dlp is first asked for the auto-generated English subtitles only. If
    that fails or produces no ``.en.vtt`` file, the best audio stream is
    downloaded and converted to mp3 instead.

    Args:
        video_url: URL of the video to fetch.
        work_dir: Directory (``Path`` or str) that receives the downloaded file.

    Returns:
        ``Path`` to the ``.en.vtt`` caption file, or to the ``.mp3`` audio file.

    Raises:
        RuntimeError: If the audio download fails or yields no mp3 file.
    """
    work_dir = Path(work_dir)
    # yt-dlp names files after the video ID. If the ID cannot be parsed from
    # the URL, match any file with the expected suffix instead of nothing.
    stem = extract_video_id(video_url) or "*"
    output_template = f"{work_dir}/%(id)s.%(ext)s"

    # Try downloading captions.
    # NOTE(review): only auto-generated captions are requested; videos that
    # have only uploader-provided subtitles presumably fall through to audio —
    # consider also passing --write-subs.
    vtt = None
    try:
        print("🔎 Checking for captions …")
        sub_cmd = [
            "yt-dlp", "--skip-download",
            "--write-auto-sub", "--sub-lang", "en",
            "-o", output_template,
            video_url
        ]
        run_cmd(sub_cmd)
        vtt = next(work_dir.glob(f"{stem}.en.vtt"), None)
    except Exception as exc:
        # Deliberate best-effort: any caption failure falls back to audio.
        # `Exception` (not a bare except) so Ctrl-C still interrupts the cell.
        print(f"⚠️ Caption lookup failed: {exc}")

    if vtt is not None:
        print("✅ Captions downloaded.")
        return vtt

    print("⚠️ No captions found, downloading audio …")

    # Download audio. The mp3 conversion is done by yt-dlp itself
    # (NOTE(review): this typically requires ffmpeg to be installed).
    audio_cmd = [
        "yt-dlp", "-f", "bestaudio",
        "--extract-audio", "--audio-format", "mp3",
        "-o", output_template,
        video_url
    ]
    run_cmd(audio_cmd)
    mp3 = next(work_dir.glob(f"{stem}.mp3"), None)
    if mp3 is None:
        # Previously this surfaced as a bare StopIteration with no message.
        raise RuntimeError(f"yt-dlp finished but produced no .mp3 file in {work_dir}")
    print("✅ Audio downloaded.")
    return mp3

# ───────────────────────────────────────────────
# 3. Transcript Tools
# ───────────────────────────────────────────────
def clean_vtt_transcript(vtt_path):
    """Read a WebVTT caption file, preview it, and return its raw text.

    Despite the name, no cleaning happens here: the returned text still
    contains headers, timestamps and cue tags, and is meant to be passed to
    ``extract_transcript``. The previous version never returned anything, so
    callers received ``None``.

    Args:
        vtt_path: Path (``Path`` or str) of the ``.vtt`` file.

    Returns:
        The full file content as a string.

    Raises:
        ValueError: If the file is empty or contains only whitespace.
    """
    raw_text = Path(vtt_path).read_text(encoding="utf8")

    # Short preview so an unexpected caption format is visible in the output.
    print("\n📄 RAW VTT FILE CONTENT (First 500 chars):\n")
    print(raw_text[:500])

    if not raw_text.strip():
        raise ValueError("🛑 VTT file is completely empty. No captions were downloaded.")

    return raw_text


def extract_transcript(vtt_text):
    """Turn raw WebVTT caption text into a single line of plain transcript text.

    Removes, in order:
      * header lines (``WEBVTT``, ``Kind: ...``, ``Language: ...``) — each on
        its own line, as they appear in real files; the old single regex
        required ``Kind:`` and ``Language:`` on the same line and so never
        matched them
      * cue timings such as ``00:00:00.160 --> 00:00:01.990``
      * the ``align:start position:N%`` cue settings
      * inline cue tags such as ``<00:00:00.480>`` and ``<c>…</c>``
      * HTML entities (``&amp;`` -> ``&``)
      * consecutive duplicate lines, which auto-generated captions produce
        because each cue repeats the previously shown line

    Args:
        vtt_text: Raw content of a ``.vtt`` file.

    Returns:
        The transcript with all whitespace collapsed to single spaces.
    """
    import html  # local import: only this function needs it

    # Header lines, matched one line at a time.
    text = re.sub(r"^(?:WEBVTT|Kind:|Language:).*$", "", vtt_text, flags=re.MULTILINE)
    # Remove timestamps like "00:00:00.160 --> 00:00:01.990"
    text = re.sub(r"\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}", "", text)
    # Remove captions metadata (e.g., "align:start position:0%")
    text = re.sub(r"align:start position:\d+%", "", text)
    # Strip inline tags first, then decode entities, so an escaped "&lt;b&gt;"
    # in the spoken text is not mistaken for markup.
    text = re.sub(r"<[^<>\n]+>", "", text)
    text = html.unescape(text)

    # Collapse whitespace per line and drop blank / immediately repeated lines.
    pieces = []
    for raw_line in text.splitlines():
        line = re.sub(r"\s+", " ", raw_line).strip()
        if line and (not pieces or pieces[-1] != line):
            pieces.append(line)

    text = " ".join(pieces)
    print('extract_transcript \n' + text)
    return text

def assemblyai_transcribe(audio_path):
    """Transcribe a local audio file with the AssemblyAI REST API.

    Uploads the file, creates a transcript job for the uploaded URL, then
    polls the job every 5 seconds until it completes or fails.

    Args:
        audio_path: ``Path`` of the audio file to upload.

    Returns:
        The ``text`` field of the completed transcript.

    Raises:
        RuntimeError: If ASSEMBLY_API_KEY is not set, or the job ends with
            status "error".
        requests.HTTPError: If any API call returns an HTTP error status
            (previously this surfaced as a KeyError on the response body).
    """
    api_key = os.getenv("ASSEMBLY_API_KEY")
    if not api_key:
        raise RuntimeError("ASSEMBLY_API_KEY is not set; cannot call AssemblyAI.")
    headers = {"authorization": api_key}
    base_url = "https://api.assemblyai.com/v2"
    # (connect, read) timeouts in seconds so a stalled connection cannot hang the cell.
    request_timeout = (10, 120)

    print("⬆️ Uploading to AssemblyAI …")
    with audio_path.open("rb") as f:
        res = requests.post(f"{base_url}/upload", headers=headers, data=f, timeout=request_timeout)
    res.raise_for_status()
    upload_url = res.json()["upload_url"]

    res = requests.post(
        f"{base_url}/transcript",
        headers=headers, json={"audio_url": upload_url},
        timeout=request_timeout
    )
    res.raise_for_status()
    transcript_id = res.json()["id"]

    print("⏳ Transcribing …")
    while True:
        poll_res = requests.get(
            f"{base_url}/transcript/{transcript_id}",
            headers=headers, timeout=request_timeout
        )
        poll_res.raise_for_status()
        poll = poll_res.json()
        if poll["status"] == "completed":
            return poll["text"]
        elif poll["status"] == "error":
            raise RuntimeError(poll["error"])
        time.sleep(5)

# ───────────────────────────────────────────────
# 4. Groq LLM Setup
# ───────────────────────────────────────────────
# LLM handle used by the pipeline's agents and crews. It targets Groq's
# `/openai/v1` endpoint; the key is read from the GROQ_API_KEY variable.
groq_llm = LLM(
    api_key=os.environ.get("GROQ_API_KEY"),
    base_url="https://api.groq.com/openai/v1",
    model="llama3-8b-8192",
)

# ───────────────────────────────────────────────
# 5. Main Pipeline
# ───────────────────────────────────────────────
def _transcript_for_video(video_url):
    """Download captions (or audio) for ``video_url`` and return the transcript text."""
    # Downloads live in a throw-away directory that is removed as soon as the
    # transcript text has been extracted, before any LLM call is made.
    with tempfile.TemporaryDirectory() as tmp:
        file_path = fetch_captions_or_audio(video_url, Path(tmp))

        if file_path.suffix == ".vtt":
            transcript = clean_vtt_transcript(file_path)
            transcript_text = extract_transcript(transcript)
            print("Transcript Text \n" + transcript_text)
        else:
            transcript_text = assemblyai_transcribe(file_path)
    return transcript_text


def _run_single_task_crew(agent, task):
    """Run one agent on one task in a sequential Crew and return the kickoff result."""
    crew = Crew(
        agents=[agent],
        tasks=[task],
        process=Process.sequential,
        memory=False,
        cache=False,
        embedder={"provider": "none"},
        llm=groq_llm,
        verbose=True
    )
    return crew.kickoff()


def youtube_blog_pipeline(video_url, max_chars=14000):
    """Turn a YouTube video into a Markdown blog post.

    Steps: obtain a transcript (captions, else audio transcription), truncate
    it, summarize it with one agent, then have a second agent write a blog
    post from that summary.

    Args:
        video_url: URL of the video to process.
        max_chars: Maximum number of transcript characters sent to the
            summarizer; longer transcripts are cut off. ``None`` disables the
            limit. The default keeps the previous hard-coded 14,000
            (presumably chosen to fit the model's context window — confirm).

    Returns:
        The result object returned by the blog crew's ``kickoff()``.

    Raises:
        ValueError: If no transcript text could be obtained.
    """
    transcript_text = _transcript_for_video(video_url)

    # Fail early instead of asking the LLM to summarize nothing.
    if not transcript_text or not transcript_text.strip():
        raise ValueError("Transcript is empty; nothing to summarize.")

    # ✅ Limit input size
    if max_chars is not None and len(transcript_text) > max_chars:
        print(f"⚠️ Transcript truncated to {max_chars:,} chars from {len(transcript_text)}")
        transcript_text = transcript_text[:max_chars]

    # ── Agents
    summarizer = Agent(
        role="Video Summarizer",
        goal="Summarize the transcript clearly and concisely.",
        backstory="Expert in extracting concise information from long-form content.",
        verbose=True,
        llm=groq_llm
    )

    blogger = Agent(
        role="Blog Writer",
        goal="Create a compelling blog post from the summary.",
        backstory="Skilled blogger who turns complex info into readable articles.",
        verbose=True,
        llm=groq_llm
    )

    # ── Summarize first; the transcript is embedded directly in the prompt.
    summary_task = Task(
        description=f"""You are a professional video summarizer. Summarize the following transcript into 10–12 clear bullet points, covering all key topics and insights from the video. Do **not** include any explanation or meta-commentary, just return the bullet points as your final output.

        Transcript:
        {transcript_text}
        """,
        expected_output="10–12 bullet points summarizing the video.",
        agent=summarizer
    )
    summary_output = _run_single_task_crew(summarizer, summary_task)

    print("\n📌 Video Summary:\n", summary_output.tasks_output)

    # ── Write the blog from the summary (interpolated via str(summary_output)).
    blog_task = Task(
        description=f"""Write a 500-word blog post based on this summary:
            
            {summary_output}
            
        Use a catchy title, intro, subheadings, and conclusion. Format the output in Markdown.""",
        expected_output="Markdown-formatted blog article.",
        agent=blogger
    )
    return _run_single_task_crew(blogger, blog_task)

# ───────────────────────────────────────────────
# 6. Run
# ───────────────────────────────────────────────
# Example run: build a blog post from the video at the URL below and print it.
video = "https://www.youtube.com/watch?v=rUCOwCJDh8o&ab_channel=Fireship"
blog_output = youtube_blog_pipeline(video)

# Heading and post go out in one call; sep="\n" keeps the blank line between them.
print("\n📝 Final Blog Output:\n", blog_output, sep="\n")
