In [None]:
!pip3 install python-dotenv
!pip3 install langchain
!pip3 install google-generativeai
!pip3 install duckduckgo-search
!pip3 install requests
!pip3 install beautifulsoup4
!pip3 install lxml
!pip3 install youtube-transcript-api
!pip3 install pytube
!pip3 install youtube-search-python
!pip3 install pytube
!pip3 install  youtube-transcript-api

In [None]:
from langchain.llms import GooglePalm
from dotenv import load_dotenv
load_dotenv()

In [None]:
import requests
from bs4 import BeautifulSoup
def scrape_text(url: str) -> str:
    """
    Retrieve text content from a webpage.

    Parameters:
    - url (str): The URL of the webpage to scrape.

    Returns:
    - str: The text content of the webpage.

    If the retrieval is successful (status code 200), the function parses the HTML content
    using BeautifulSoup and returns the text content, stripping leading and trailing whitespaces.

    If the retrieval fails, an error message is returned, including the HTTP status code.
    """
    try:
        response = requests.get(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "lxml")
            page_text = soup.get_text(separator=" ", strip=True)
            return page_text
        else:
            return f"Failed to retrieve the webpage: Status code {response.status_code}"

    except Exception as e:
        print(e)
        return f"Failed to retrieve the webpage: {e}"


In [None]:
#url ="https://blog.langchain.dev/announcing-langsmith/"

In [None]:
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
SUMMARY_TEMPLATE = """{text}

-----------------
Using the above text, answer in short the following questions:

> {question}

------------------
if the question cannot be answered using the text,imply summarize the text. Include all factual information,numbers,stats etc
"""

SUMMARY_PROMPT = ChatPromptTemplate.from_template(SUMMARY_TEMPLATE)
model = GooglePalm( temperature = 0)

In [None]:
page_content = scrape_text(url)[:10000]
print(page_content)

In [256]:
from langchain.schema.runnable import RunnablePassthrough

scrape_and_summarize_chain = RunnablePassthrough.assign(summary = RunnablePassthrough.assign(text=lambda x: scrape_text(x["url"])[:min(10000, len(x["url"]))]
)|SUMMARY_PROMPT | model | StrOutputParser()) | (lambda x:f"url : {x['url']}\n\nsummary : {x['summary']}")





In [257]:
from langchain.utilities import DuckDuckGoSearchAPIWrapper
RESULTS_PER_QUESTION = 3
ddg_search = DuckDuckGoSearchAPIWrapper()

def web_search(query: str, num_results: int = RESULTS_PER_QUESTION):
    results = ddg_search.results(query,num_results)
    return [r['link'] for r in results]

In [258]:
web_search("India and its allies?")

['https://time.com/6288459/india-ally-us-modi-biden-visit/',
 'https://www.ft.com/content/95a2074f-8491-41d4-9d8b-68b3124723a9',
 'https://www.theatlantic.com/international/archive/2023/03/india-relations-us-china-modi/673237/']

In [259]:
link_extraction_chain = RunnablePassthrough.assign(
    urls = lambda x: web_search(x["question"])
)| (lambda x : [{"question":x["question"],"url": u} for u in x["urls"]])

print(link_extraction_chain.invoke({
    "question":"India and its allies?"
}))

[{'question': 'India and its allies?', 'url': 'https://time.com/6288459/india-ally-us-modi-biden-visit/'}, {'question': 'India and its allies?', 'url': 'https://www.ft.com/content/95a2074f-8491-41d4-9d8b-68b3124723a9'}, {'question': 'India and its allies?', 'url': 'https://www.theatlantic.com/international/archive/2023/03/india-relations-us-china-modi/673237/'}]


In [260]:

print(scrape_and_summarize_chain.invoke(
{'question': 'India and its allies?', 'url': 'https://time.com/6288459/india-ally-us-modi-biden-visit/'}
))


url : https://time.com/6288459/india-ally-us-modi-biden-visit/

summary : India's closest allies are Russia, Israel, and Iran.


In [None]:
# chain = RunnablePassthrough.assign(
#     urls = lambda x: web_search(x["question"])
# )| (lambda x : [{"question":x["question"],"url": u} for u in x["urls"]]) | scrape_and_summarize_chain.map()


text_single_question_multilink_web_search_chain = link_extraction_chain | scrape_and_summarize_chain.map()

In [None]:
print(text_single_question_multilink_web_search_chain.invoke({
    "question":"India and its allies?"
}))

In [None]:
# SEARCH_PROMPT = ChatPromptTemplate.from_messages(
#     [
#         ("user",
#          "Write 3 google search queries to search online to form an "
#          "objective opinion of from the following: {question}\n"
#          "You must respond with a list of strings in the following format:"
#          '["query 1","query 2","query 3"]',
#         )
#     ]
# )


from langchain.prompts import HumanMessagePromptTemplate

SEARCH_TEMPLATE ="""
Write 3 google search queries to search online to form an objective opinion of from the following: 
{question} 
-----------------

You must respond with a list of strings in the following format:
-----------------
["query 1","query 2","query 3"]
"""

SEARCH_PROMPT = ChatPromptTemplate.from_messages(
    [
        HumanMessagePromptTemplate.from_template(SEARCH_TEMPLATE)
    ]
)


In [None]:
import json
create_question_chain = SEARCH_PROMPT | model | StrOutputParser() | json.loads

In [None]:
create_question_chain.invoke(
    {
        "question":"What is the difference between Machine Learning and Deep Learning"
    }
)

In [None]:
full_text_web_search_chain =create_question_chain |(lambda x : [ {"question": q} for q in x]) |text_single_question_multilink_web_search_chain.map()



In [None]:
full_text_web_search_chain.invoke(
       {
        "question":"ISRO vs NASA",
    }
)

In [None]:
from langchain.schema.runnable import ConfigurableField

WRITER_SYSTEM_PROMPT = "You are an AI critical thinker research assistant. Your sole purpose is to write well written, critically acclaimed, objective and structured reports on given text."


RESEARCH_REPORT_TEMPLATE = """Information: 
--------
{research_summary}
--------

Using the above information, answer the following question or topic: "{question}" in a detailed report -- \
The report should focus on the answer to the question, should be well structured, informative, \
in depth, with facts and numbers if available and a minimum of 1,200 words.

You should strive to write the report as long as you can using all relevant and necessary information provided.
You must write the report with markdown syntax.
You MUST determine your own concrete and valid opinion based on the given information. Do NOT deter to general and meaningless conclusions.
Write all used source urls at the end of the report, and make sure to not add duplicated sources, but only one reference for each.
You must write the report in apa format.
Please do your best, this is very important to my career.""" 


RESOURCE_REPORT_TEMPLATE = """Information: 
--------
{research_summary}
--------

Based on the above information, generate a bibliography recommendation report for the following question or topic: "{question}". \
The report should provide a detailed analysis of each recommended resource, explaining how each source can contribute to finding answers to the research question. \
Focus on the relevance, reliability, and significance of each source. \
Ensure that the report is well-structured, informative, in-depth, and follows Markdown syntax. \
Include relevant facts, figures, and numbers whenever available. \
The report should have a minimum length of 1,200 words.

Please do your best, this is very important to my career."""  

OUTLINE_REPORT_TEMPLATE = """Information: 
--------
{research_summary}
--------

Using the above information, generate an outline for a research report in Markdown syntax for the following question or topic: "{question}". \
The outline should provide a well-structured framework for the research report, including the main sections, subsections, and key points to be covered. \
The research report should be detailed, informative, in-depth, and a minimum of 1,200 words. \
Use appropriate Markdown syntax to format the outline and ensure readability.

Please do your best, this is very important to my career."""  


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", WRITER_SYSTEM_PROMPT),
        ("user", RESEARCH_REPORT_TEMPLATE),
    ]
).configurable_alternatives(
    ConfigurableField("report_type"),
    default_key="research_report",
    resource_report=ChatPromptTemplate.from_messages(
        [
            ("system", WRITER_SYSTEM_PROMPT),
            ("user", RESOURCE_REPORT_TEMPLATE),
        ]
    ),
    outline_report=ChatPromptTemplate.from_messages(
        [
            ("system", WRITER_SYSTEM_PROMPT),
            ("user", OUTLINE_REPORT_TEMPLATE),
        ]
    ),
)




In [None]:
def collapse_list_of_lists(list_of_list):
    content =[]
    for l in list_of_lists:
        content.append("\n\n".join(l))
    return "\n\n".join(content)



# chain = RunnablePassthrough.assign(
#     research_summary= full_text_web_search_chain | (lambda x: collapse_list_of_lists(x))
# ) |prompt |model | StrOutputParser()


full_text_chain = RunnablePassthrough.assign(
    research_summary= full_text_web_search_chain | (lambda x: "\n\n".join(["\n\n".join(l) for l in x]))

) | prompt | model | StrOutputParser()

In [None]:
full_text_chain.invoke(
       {
        "question":"dollar vs INR",
    }
)

In [239]:
MAX_CHARACTERS_EXTRACTED_FROM_PAGE = 10000
model = GooglePalm( temperature = 0)

In [238]:
# from langchain.document_loaders import YoutubeLoader

# loader = YoutubeLoader.from_youtube_url(
#     "https://www.youtube.com/watch?v=cQUUkZnyoD0", add_video_info=True,language=["en", "hi"],translation="en",
# )

# loader.load()

In [240]:
def get_youTubeTranscript_text(url):
    try:
        loader = YoutubeLoader.from_youtube_url(
    url, add_video_info=True,language=["en","hi"],translation="en",
)

        data = loader.load()
        data= (data[0].page_content[:10000]).strip()
        clean_text = re.sub(r'[^A-Za-z0-9\s]', '', data)
        clean_text = re.sub(' +', ' ', clean_text)  # Remove extra spaces
        return clean_text

    except Exception as e:
        print(e)
        return f"Failed to retrieve the transcript or the language is not available:"



# import re
# from youtube_transcript_api import YouTubeTranscriptApi


# def clean_text(text):
#     # Remove special characters and unnecessary spaces
#     clean_text = re.sub(r'[^A-Za-z0-9\s]', '', text)
#     clean_text = re.sub(' +', ' ', clean_text)  # Remove extra spaces
#     return clean_text

# def get_youtube_transcript_text(url):
#     try:
#         # Use YoutubeTranscriptApi to get the transcript
#         transcript = YoutubeTranscriptApi.get_transcript(url, languages=["en", "hi"])

#         # Extract the text from the transcript
#         text = " ".join(entry["text"] for entry in transcript)

#         # Clean the text
#         clean_text = clean_text(text)

#         return clean_text

#     except TranscriptNotFoundError:
#         return f"Transcript not found for the provided URL."

#     except Exception as e:
#         print(e)
#         return f"Failed to retrieve the transcript or the language is not available."


In [None]:
print(get_youTubeTranscript_text("https://www.youtube.com/watch?v=wiK6LNTJ0rc"))

In [266]:
YOUTUBE_SUMMARY_TEMPLATE = """{text}

-----------------
Using the above youtube transcript text, answer in short the following questions:

> {question}

------------------
if the question cannot be answered using the text,imply summarize the text. Include all factual information,numbers,stats etc
"""
YOUTUBE_SUMMARY_PROMPT = ChatPromptTemplate.from_template(YOUTUBE_SUMMARY_TEMPLATE)
model = GooglePalm( temperature = 0)

get_youTubeTranscript_and_summarize_chain = RunnablePassthrough.assign(summary = RunnablePassthrough.assign(text=lambda x: get_youTubeTranscript_text(x["url"])
)|YOUTUBE_SUMMARY_PROMPT | model | StrOutputParser()) | (lambda x:f"url : {x['url']}\n\nsummary : {x['summary']}")





In [None]:
# get_youTubeTranscript_and_summarize_chain.invoke({
#     "question":"What does the youtuber teach about",
#     "url":"https://youtu.be/DjuXACWYkkU?si=gdiDJf0P4gjikzgZ"
# })

# [{'question': 'Who will win 2024 India General Election',
#   'url': 'https://www.youtube.com/watch?v=5gCxjsh-sHU'},
#  {'question': 'Who will win 2024 India General Election',
#   'url': 'https://www.youtube.com/watch?v=ZBo-fPzWoZw'},
#  {'question': 'Who will win 2024 India General Election',
#   'url': 'https://www.youtube.com/watch?v=Yx-e9bxbhgM'}]
get_youTubeTranscript_and_summarize_chain.invoke({'question': 'Who will win 2024 India General Election',
  'url': 'https://www.youtube.com/watch?v=5gCxjsh-sHU'})


In [None]:
!


In [243]:
# from youtubesearchpython import VideosSearch
# from pytube import YouTube
# def get_youtube_links(search_query, RESULTS_PER_QUESTION = 3,MAX_DURATION=1800):
#     videos_search = VideosSearch(search_query, limit=30)
#     results = videos_search.result()

#     # video_links = []
#     # for video in results["result"]:
#     #     video_links.append(video["link"])

    
#     video_links = []
#     for video in results["result"]:
#         video_url = video["link"]
#         try:
#             yt = YouTube(video_url)
#             duration_seconds = yt.length
#             if duration_seconds <= MAX_DURATION:
#                 video_links.append(video_url)
#                 if(len(video_links)>=RESULTS_PER_QUESTION):
#                     break
#         except Exception as e:
#             print(f"Error processing video {video_url}: {e}")

#     return video_links




def has_english_captions(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
        return len(transcript) > 0
    except Exception as e:
        return False

def get_youtube_links(search_query, RESULTS_PER_QUESTION=3, MAX_DURATION=1800):
    videos_search = VideosSearch(search_query, limit=30)
    results = videos_search.result()

    video_links = []
    for video in results["result"]:
        video_url = video["link"]
        try:
            yt = YouTube(video_url)
            duration_seconds = yt.length
            title = yt.title
            video_id = yt.video_id

            # Check if the video has English title and descriptions
            if duration_seconds <= MAX_DURATION  and has_english_captions(video_id):
                video_links.append(video_url)
                if len(video_links) >= RESULTS_PER_QUESTION:
                    break
        except Exception as e:
            print(f"Error processing video {video_url}: {e}")

    return video_links




In [244]:
get_youtube_links("Langchain tutorial")

['https://www.youtube.com/watch?v=aywZrzNaKjs',
 'https://www.youtube.com/watch?v=MlK6SIjcjE8',
 'https://www.youtube.com/watch?v=I4mFqyqFkxg']

In [None]:
search_query = "Langchain tutorial"
max_results = 5

youtube_links = get_youtube_links(search_query, max_results)
print(youtube_links)

In [245]:
# get list of dictionary of question and youTube link from a question
youTubelink_extraction_chain = RunnablePassthrough.assign(
    urls = lambda x: get_youtube_links(x["question"])
)| (lambda x : [{"question":x["question"],"url": u} for u in x["urls"]])

In [263]:
youTubelink_extraction_chain.invoke({
    "question":"Deep Learning",
})

[{'question': 'Deep Learning',
  'url': 'https://www.youtube.com/watch?v=6M5VXKLf4D4'},
 {'question': 'Deep Learning',
  'url': 'https://www.youtube.com/watch?v=aircAruvnKk'},
 {'question': 'Deep Learning',
  'url': 'https://www.youtube.com/watch?v=q6kJ71tEYqM'}]

In [264]:
youTube_single_question_multilink_web_search_chain = youTubelink_extraction_chain | get_youTubeTranscript_and_summarize_chain.map()


In [265]:
youTube_single_question_multilink_web_search_chain.invoke({
    "question":"Deep learning",
})

['url : https://www.youtube.com/watch?v=6M5VXKLf4D4\n\nsummary : Deep learning is a subset of machine learning which in turn is a subset of artificial intelligence.\nDeep learning is inspired by the structure of the human brain and is a type of machine learning that allows computers to learn without being explicitly programmed.\nDeep learning requires a massive volume of data to train, and is used in applications such as customer support, medical care, and self-driving cars.\nDeep learning has some limitations, including the need for a lot of data, computational power, and training time.\nSome popular deep learning frameworks include TensorFlow, PyTorch, Keras, Deep Learning 4j, Cafe, and Microsoft Cognitive Toolkit.',
 'url : https://www.youtube.com/watch?v=aircAruvnKk\n\nsummary : Deep learning is a type of machine learning that uses artificial neural networks to learn from data. Neural networks are inspired by the human brain, and they can be used to solve a variety of problems, inc

In [249]:
YOUTUBE_SEARCH_TEMPLATE ="""
Write 3 youtube search queries to search youtube to form an objective opinion of from the following: 
{question} 
-----------------

You must respond with a list of strings in the following format:
-----------------
["query 1","query 2","query 3"]
"""

YOUTUBE_SEARCH_PROMPT = ChatPromptTemplate.from_messages(
    [
        HumanMessagePromptTemplate.from_template(SEARCH_TEMPLATE)
    ]
)

In [250]:
import json
create_youtube_question_chain = YOUTUBE_SEARCH_PROMPT | model | StrOutputParser() | json.loads

In [251]:
create_youtube_question_chain.invoke({
    "question":"Deep learning"
})

['deep learning pros and cons',
 'deep learning applications',
 'deep learning limitations']

In [252]:
#given a question it creates question then for each question finds multiple urls then for each such url data is fetched,creates a list of list
full_youTube_web_search_chain =create_youtube_question_chain |(lambda x : [ {"question": q} for q in x]) |youTube_single_question_multilink_web_search_chain.map()


In [None]:
YOUTUBE_RESEARCH_REPORT_TEMPLATE = """Information: 
--------
{research_summary}
--------

Using the above information, answer the following question or topic: "{question}" in a detailed report -- \
The report should focus on the answer to the question, should be well structured, informative, \
in depth, with facts and numbers if available and a minimum of 1,200 words.

You should strive to write the report as long as you can using all relevant and necessary information provided.
You must write the report with markdown syntax.
You MUST determine your own concrete and valid opinion based on the given information. Do NOT deter to general and meaningless conclusions.
Write all used source urls at the end of the report, and make sure to not add duplicated sources, but only one reference for each.
You must write the report in apa format.
The data above comes from youtube transcripts so make the reference accordingly
Please do your best, this is very important to my career.""" 

In [253]:
full_youTube_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", WRITER_SYSTEM_PROMPT),
        ("user", YOUTUBE_RESEARCH_REPORT_TEMPLATE),
    ]
)

In [254]:
full_youTube_chain = RunnablePassthrough.assign(
    research_summary= full_youTube_web_search_chain | (lambda x: "\n\n".join(["\n\n".join(l) for l in x]))

) | full_youTube_prompt | model | StrOutputParser()


In [262]:
full_youTube_chain.invoke(
       {
        "question":"Deep Learning",
    }
)

'## Deep Learning\n\nDeep learning is a type of machine learning that uses artificial neural networks to learn from data. Neural networks are inspired by the human brain, and they can be used to solve a wide variety of problems, including image recognition, natural language processing, and speech recognition.\n\n### What are the pros and cons of deep learning?\n\n**Pros:**\n\n* Deep learning models can achieve state-of-the-art results on a variety of tasks.\n* They are able to learn from large amounts of data, even if the data is noisy or incomplete.\n* Deep learning models can be used to solve problems that are difficult or impossible for traditional machine learning algorithms to solve.\n\n**Cons:**\n\n* Deep learning models can be complex and difficult to understand.\n* They can be computationally expensive to train.\n* Deep learning models can be biased, and it is important to be aware of this when using them.\n\n### What are some applications of deep learning?\n\nDeep learning is 