## Webpage Loaders
- Load webpages and extract their data using `WebBaseLoader` and `BeautifulSoup`.
- Use an LLM to extract meaningful data from the webpages.

### Project 1: Share Market Data Analysis Based on Global Cues
- We will extract data from stock market news websites and analyze it to understand the impact of global cues on the Indian share market.

In [None]:
from dotenv import load_dotenv

# Read environment variables from the .env file one directory above the
# working directory (presumably settings for the LLM helper used later — confirm).
load_dotenv('./../.env')

In [7]:
from langchain_community.document_loaders import WebBaseLoader


# Pages to scrape for market-related news.
urls = ["https://www.moneycontrol.com/",
        "https://economictimes.indiatimes.com/markets/stocks/news", 
        "https://www.livemint.com/latest-news"]

# One loader handles every URL; the yielded documents are collected in `docs`.
loader = WebBaseLoader(web_paths=urls)
docs = []
# Top-level `async for` relies on the notebook's support for top-level await;
# it is not valid in a plain Python module.
# NOTE: `doc` stays bound to the last yielded document after the loop and is
# inspected in a later cell.
async for doc in loader.alazy_load():
    docs.append(doc)

def format_docs(docs):
    """Concatenate the ``page_content`` of each document.

    Consecutive documents are separated by one blank line; an empty iterable
    yields an empty string.
    """
    separator = "\n\n"
    return separator.join(document.page_content for document in docs)


# Merge the text of every loaded page into a single string.
context = format_docs(docs)

In [8]:
import re

def text_clean(text):
    """Collapse excess blank lines and tab runs, then trim both ends.

    Runs of two or more newlines become exactly two newlines, runs of tabs
    become a single tab, and leading/trailing whitespace is stripped from the
    result.
    """
    # Applied in order: newline runs first, then tab runs.
    substitutions = (
        (r"\n\n+", "\n\n"),
        (r"\t+", "\t"),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
# Normalize the whitespace of the scraped text, then show its size in
# characters (the bare expression is displayed as the cell output).
context = text_clean(context)
len(context)

In [10]:
### QnA with LLM
from scripts import llm

In [None]:
# Display the last document yielded by the loader loop (`doc` is the loop
# variable left bound after iteration).
doc

In [None]:
# Prompt asking the model to list only stock-market news, as points.
question = """Extract stock market related news if present in the text. 
                                        Do not write preamble or explaination. Extract all news in points."""

# Trial run on the first 10,000 characters of the scraped text only.
# NOTE(review): `llm.ask_llama` is a project helper not shown here; it is
# presumably given the text as context plus the question — confirm.
response = llm.ask_llama(context[:10_000], question)
print(response)

In [14]:
# Get the answer from chunks of 10_000 characters with 100 characters overlap
def chunk_text(text, chunk_size=10_000, overlap=100):
    """Split ``text`` into consecutive chunks that overlap by ``overlap`` characters.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one and is at most ``chunk_size`` characters long. Splitting stops as soon
    as a chunk reaches the end of ``text``, so no trailing chunk made up only
    of already-covered characters is produced.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            non-negative and smaller than ``chunk_size``.

    Returns:
        A list of chunks in order; an empty list when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # A zero or negative step would crash range() or silently return nothing;
    # a negative overlap would silently skip characters between chunks.
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must be >= 0 and smaller than chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        chunks.append(text[start:start + chunk_size])
        # This chunk already reaches the end of the text; another one would
        # only repeat characters from its tail.
        if start + chunk_size >= text_length:
            break
    return chunks

# Split the page text using the default chunk size and overlap.
chunks = chunk_text(context)



In [15]:
# Ask the same extraction question of every chunk and keep the answers in
# chunk order.
chunk_summary = []
for chunk in chunks:
    response = llm.ask_llama(chunk, question)
    chunk_summary.append(response)

In [None]:
# Preview only the first chunk's answer; nothing is printed when there are none.
if chunk_summary:
    summary = chunk_summary[0]
    print(summary)

In [28]:
# Combine the per-chunk answers (blank-line separated) and ask the model for a
# single consolidated report based on them.
summary = "\n\n".join(chunk_summary)
response = llm.ask_llama(context = summary, 
                             question = """Write a detailed market news report in markdown format. Think carefully then write the report.""")

In [29]:
import os

# print(response)

# Persist the generated report under ./data, creating the directory if needed.
os.makedirs("data", exist_ok=True)
# Write as UTF-8 explicitly: the platform default encoding may not be able to
# represent non-ASCII characters in the model output (e.g. currency symbols).
with open("data/market_report.md", "w", encoding="utf-8") as f:
    f.write(response)

In [None]:
# Show the combined per-chunk summaries that were passed to the model as context.
print(summary)