In [12]:
from api import openai_api_key
from langchain_openai import OpenAIEmbeddings, OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.metrics.pairwise import cosine_similarity
from typing import List
import numpy as np
from urllib.parse import urlparse, parse_qs
import os
import ast
import time
# os.environ["OPENAI_API_KEY"] = openai_api_key

In [13]:
from langchain_community.tools import YouTubeSearchTool
from langchain_community.document_loaders import YoutubeLoader

# Notebook-level YouTube search tool for ad-hoc queries.
# fact_checker() creates its own instance, so it does not depend on this one.
tool = YouTubeSearchTool()

In [14]:
def get_youtube_id(url: str) -> "str | None":
    """Extract the video ID from a YouTube URL.

    Supported shapes:
      * ``https://www.youtube.com/watch?v=<id>`` (also ``youtube.com``,
        ``m.youtube.com`` and ``music.youtube.com``)
      * ``https://www.youtube.com/shorts/<id>``, ``/embed/<id>``,
        ``/live/<id>`` and ``/v/<id>``
      * ``https://youtu.be/<id>``

    Args:
        url: The URL to inspect.

    Returns:
        The video ID, or ``None`` when ``url`` is not a string, is empty,
        cannot be parsed, or is not a recognised YouTube URL.
    """
    if not isinstance(url, str) or not url.strip():
        return None

    try:
        parsed_url = urlparse(url.strip())
    except ValueError:
        # urlparse raises ValueError on malformed netlocs (e.g. a broken IPv6 literal).
        return None

    host = parsed_url.hostname or ""
    # Non-empty path segments, so leading/trailing slashes never end up in the ID.
    segments = [part for part in parsed_url.path.split("/") if part]

    if host == "youtu.be":
        return segments[0] if segments else None

    youtube_hosts = ("youtube.com", "www.youtube.com", "m.youtube.com", "music.youtube.com")
    if host in youtube_hosts:
        # parse_qs drops blank values, so a bare "?v=" yields no entry here.
        ids = parse_qs(parsed_url.query).get("v")
        if ids and ids[0]:
            return ids[0]
        # Path-style URLs carry the ID as the second path segment.
        if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
            return segments[1]

    return None
    


In [15]:

# urls_string = tool.run("lex friedman, 5")
# # Parse the returned string into a list of URLs
# urls = ast.literal_eval(urls_string)

In [16]:
import requests
from bs4 import BeautifulSoup
import re

# Fetch the title of a YouTube video by scraping its page.
# This is much faster than going through "yt-dlp".
def get_youtube_title(url, timeout=10):
    """Return the title of the YouTube video at ``url``, or ``None`` on failure.

    The page is downloaded and the ``og:title`` meta tag is preferred; when it
    is missing or has no usable ``content``, the ``<title>`` tag is used with
    its trailing "- YouTube" suffix removed.

    Args:
        url: Video page URL.
        timeout: Seconds to wait for the HTTP request. Without a timeout a
            stalled connection would block the notebook indefinitely.

    Returns:
        The title string, or ``None`` if the page could not be fetched or no
        title could be found. Errors are printed, never raised (best effort).
    """
    try:
        # Get the page content
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Check for HTTP errors

        # Parse with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Preferred source: the Open Graph title meta tag
        og_title = soup.find('meta', property='og:title')
        if og_title:
            # .get() instead of ['content'] so a tag without the attribute
            # falls through to the <title> fallback instead of raising.
            og_content = og_title.get('content')
            if og_content:
                return og_content

        # Alternative method: the <title> tag
        title_tag = soup.find('title')
        if title_tag:
            # get_text() never returns None (unlike .string on a tag with
            # several children), so re.sub always receives a str.
            cleaned = re.sub(r'\s*-\s*YouTube$', '', title_tag.get_text())
            return cleaned or None

        return None

    except Exception as e:
        print(f"Error getting title from {url}: {e}")
        return None


# Quick smoke test of get_youtube_title against a single short-link URL (performs a live HTTP request).
title = get_youtube_title(url="https://youtu.be/cY-0TRj-teI?si=mEpB7q-tXZtroHnr")


In [17]:
def get_video_content(url: str) -> tuple:
    """Load a video's page content and metadata without ever raising.

    Args:
        url: YouTube video URL handed to ``YoutubeLoader``.

    Returns:
        ``(page_content, metadata)`` of the first loaded document, or
        ``(None, None)`` when nothing was loaded or the loader failed
        (the failure is printed).
    """
    try:
        # English and Korean are both requested from the loader.
        documents = YoutubeLoader.from_youtube_url(
            url,
            add_video_info=False,
            language=['en', 'ko'],
        ).load()

        # Nothing came back for this video.
        if not documents:
            return None, None

        first_document = documents[0]
        return first_document.page_content, first_document.metadata

    except Exception as e:
        print(f"Error loading content for {url}: {str(e)}")
        return None, None

In [18]:
# Build a dictionary keyed by video title; each value holds "content", "metadata" and "url".
def get_dict(urls, delay=1.0):
    """Collect title, content and metadata for each URL in ``urls``.

    Args:
        urls: Iterable of YouTube video URLs.
        delay: Seconds to sleep between consecutive videos. No sleep happens
            before the first one.

    Returns:
        ``{key: {"content": ..., "metadata": ..., "url": ...}}`` where ``key``
        is the video title. When the title cannot be fetched the URL is used
        as the key, and when two different videos share a title the later one
        is stored under ``"<title> (<url>)"`` so no entry is silently
        overwritten. Videos whose content cannot be loaded, and repeated URLs,
        are skipped.
    """
    video_dict = {}
    seen_urls = set()

    for url in urls:
        # The same URL twice would only repeat the work and clobber its own entry.
        if url in seen_urls:
            continue

        # Pause between videos only, not before the very first one.
        if seen_urls and delay > 0:
            time.sleep(delay)
        seen_urls.add(url)

        # Load the content first: if it fails, the title request is not needed.
        content, metadata = get_video_content(url)
        if content is None:
            print(f"Skipping {url} due to content loading error")
            continue

        title = get_youtube_title(url)

        # get_youtube_title returns None on failure; never use None as a key.
        key = title if title else url
        if key in video_dict:
            # Different video, same title: URLs are unique here, so this key is too.
            key = f"{key} ({url})"

        video_dict[key] = {"content": content,
                           "metadata": metadata,
                           "url": url}
    return video_dict



In [19]:
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Note: this definition shadows the ``cosine_similarity`` imported from
    ``sklearn.metrics.pairwise`` at the top of the notebook.

    Args:
        a: Vector as a sequence of numbers or a NumPy array.
        b: Vector of the same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of dividing by
        zero (which would yield NaN and a RuntimeWarning).
    """
    vec_a = np.asarray(a, dtype=float)
    vec_b = np.asarray(b, dtype=float)

    denominator = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denominator == 0:
        return 0.0

    return np.dot(vec_a, vec_b) / denominator

In [20]:
def check_similarity(title, content):
    """Return the cosine similarity between the embeddings of a title and its content.

    Both strings are embedded with ``OpenAIEmbeddings`` and compared with
    ``cosine_similarity``.

    Args:
        title: Video title text.
        content: Video content text.

    Returns:
        The raw cosine similarity, not a percentage. Callers that want a
        percentage scale it themselves (``fact_checker`` multiplies by 100).
    """
    # Only the embedding model is needed; no completion LLM is involved.
    openai_embed = OpenAIEmbeddings()

    title_embed = openai_embed.embed_query(title)
    content_embed = openai_embed.embed_query(content)

    return cosine_similarity(title_embed, content_embed)

In [41]:

def fact_checker(query, num_videos=5):
    """Search YouTube for ``query`` and score how well each title matches its content.

    Args:
        query: Search text passed to ``YouTubeSearchTool``.
        num_videos: Number of results to request from the search tool.

    Returns:
        A 3-tuple ``(video_dict, similarity_list, similarity_key_dict)``:

        * ``video_dict``: the mapping built by ``get_dict``, with a
          ``"similarity"`` entry (percentage as a string) added to each video.
        * ``similarity_list``: percentages (plain floats rounded to 2 decimals)
          in the same order as ``video_dict``.
        * ``similarity_key_dict``: ``str(percentage)`` mapped to
          ``{"title", "content", "url"}``.

        NOTE(review): ``similarity_key_dict`` is keyed by the rounded score, so
        two videos with the same score collide and only the last one is kept.
        Use ``video_dict`` when every video must be preserved.
    """
    search_tool = YouTubeSearchTool()
    # The tool is called with "<query>, <count>" and its string result is parsed
    # as a Python list literal of URLs.
    raw_urls = search_tool.run(f"{query}, {num_videos}")
    urls = ast.literal_eval(raw_urls)

    video_dict = get_dict(urls)
    similarity_list = []
    similarity_key_dict = {}

    for title, info in video_dict.items():
        content = info["content"]
        url = info["url"]

        # float() turns the NumPy scalar into a plain Python float so the list
        # prints cleanly and str() keys do not depend on NumPy's formatting.
        raw_similarity = float(check_similarity(title=title, content=content))
        similarity = round(raw_similarity * 100, 2)

        similarity_list.append(similarity)
        info["similarity"] = str(similarity)

        similarity_key_dict[str(similarity)] = {"title": title,
                                                "content": content,
                                                "url": url}

    return video_dict, similarity_list, similarity_key_dict

In [42]:
# Run the whole pipeline for one query: search YouTube, load each video, and score title vs. content.
# This performs live network and embedding calls, so it is slow to re-run.
similarity_dict, similarity_list, similarity_key_dict = fact_checker("한국 대통령 탄핵 ", num_videos=10)




In [43]:
# Scores are percentages (cosine similarity * 100), in the order the videos were processed.
print(similarity_list)

[88.31, 86.31, 85.12, 83.15, 85.08, 89.37, 87.8, 87.36, 84.56, 87.02]


In [44]:
# Rank the scores from highest to lowest, leaving similarity_list itself untouched.
new_list = list(similarity_list)
new_list.sort(reverse=True)
print(new_list)

[89.37, 88.31, 87.8, 87.36, 87.02, 86.31, 85.12, 85.08, 84.56, 83.15]


In [45]:
# Print the videos from most to least similar.
# Iterate over similarity_dict (one entry per video) and do not look scores up
# in similarity_key_dict: that dict is keyed by the rounded score, so two videos
# with the same score would print the same title twice and drop the other video.
ranked_videos = sorted(
    similarity_dict.items(),
    key=lambda item: float(item[1]["similarity"]),
    reverse=True,
)
for title, info in ranked_videos:
    num = float(info["similarity"])
    url = info["url"]
    print(f"Similarity: {num} | Title: {title} | url: {url}")

Similarity: 89.37 | Title: [에디터픽] 윤 대통령 탄핵안 가결..미국 정부가 밝힌 공식 입장 / YTN | url: https://www.youtube.com/watch?v=4TkqFjgeAho&pp=ygUY7ZWc6rWtIOuMgO2GteuguSDtg4TtlbUg
Similarity: 88.31 | Title: [LIVE] '윤석열 대통령 탄핵소추안' 가결...광화문 일대 '탄핵 반대' 집회 현장 상황/2024년 12월 14일(토)/KBS | url: https://www.youtube.com/watch?v=vNj4lCzL4KQ&pp=ygUY7ZWc6rWtIOuMgO2GteuguSDtg4TtlbUg
Similarity: 87.8 | Title: 미 언론, 탄핵 가결 일제히 속보 타전...미 정부도 '촉각' / YTN | url: https://www.youtube.com/watch?v=FAxPAVmkOz0&pp=ygUY7ZWc6rWtIOuMgO2GteuguSDtg4TtlbUg
Similarity: 87.36 | Title: 8년 전처럼…트럼프 취임 때마다 '한국 대통령은 탄핵 중' / JTBC News | url: https://www.youtube.com/watch?v=cR94yJG7dUw&pp=ygUY7ZWc6rWtIOuMgO2GteuguSDtg4TtlbUg
Similarity: 87.02 | Title: 윤석열 대통령 '내란' 탄핵안 가결‥직무정지 - [LIVE] MBC 특집 뉴스데스크 2024년 12월 14일 | url: https://www.youtube.com/watch?v=tdr5t21hbj8&pp=ygUY7ZWc6rWtIOuMgO2GteuguSDtg4TtlbUg
Similarity: 86.31 | Title: 외신들, '윤 탄핵' 긴급타전…중국 포털 검색어 1위에 (자막뉴스) / SBS | url: https://www.youtube.com/watch?v=L2SO--lHJu0&pp=ygUY7ZWc6rWtIOuMgO2Gteu

In [None]:
from transformers import pipeline

In [5]:
from transformers import pipeline

# NOTE: transformers needs a backend (PyTorch or TensorFlow) installed; without
# one this cell fails with the RuntimeError recorded in its output.
# The earlier `pipeline("device=")` call was removed: "device=" is not a valid
# task name, so that line could only raise.
nli_pipeline = pipeline("text-classification", model="roberta-large-mnli")
headline = "Wildfire Spreads Rapidly"
text = "Due to strong winds, the wildfire has spread faster than expected."

# Pass the two sentences as a real pair so the tokenizer inserts the model's own
# separator tokens; a literal "[SEP]" is not RoBERTa's separator and would be
# treated as ordinary text. The article text is the premise and the headline is
# the hypothesis, i.e. "does the text support the headline?".
result = nli_pipeline({"text": text, "text_pair": headline})
print(result)  # e.g. {'label': 'ENTAILMENT', 'score': 0.85}

RuntimeError: At least one of TensorFlow 2.0 or PyTorch should be installed. To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ To install PyTorch, read the instructions at https://pytorch.org/.