# Building Autonomous Agents to Create Analysis Reports

In [None]:
# Install required packages
!pip install -q deeplake==4.2.11 langchain==0.3.26 langchain-openai==0.3.23 langchain-core==0.3.66 langchain-deeplake==0.1.0 langchain-community==0.3.26 \
                openai==1.92.0 jedi==0.19.2 lxml_html_clean==0.4.2 newspaper4k==0.9.3 langgraph==0.5.0 langchain-text-splitters==0.3.8

In [None]:
import os

# Credentials are never written into the notebook itself.
# - In Google Colab they are read from the notebook's secrets via `userdata`.
# - Elsewhere, an already-exported environment variable is reused and only a
#   missing one is prompted for (input is not echoed).
_REQUIRED_SECRETS = ("OPENAI_API_KEY", "ACTIVELOOP_TOKEN")

try:
    from google.colab import userdata
except ImportError:
    # Not running inside Colab: fall back to the environment / an interactive prompt.
    from getpass import getpass

    for _secret_name in _REQUIRED_SECRETS:
        if not os.environ.get(_secret_name):
            os.environ[_secret_name] = getpass(f"Enter {_secret_name}: ")
else:
    for _secret_name in _REQUIRED_SECRETS:
        os.environ[_secret_name] = userdata.get(_secret_name)

In [None]:
# Scrape articles
import requests
from newspaper import Article
import time

# Browser-like User-Agent sent with every page request.
# NOTE(review): presumably the site rejects the default python-requests agent - confirm.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}

# News articles that make up the analysis corpus
article_urls = [
    "https://www.artificialintelligence-news.com/2023/05/23/meta-open-source-speech-ai-models-support-over-1100-languages/",
    "https://www.artificialintelligence-news.com/2023/05/18/beijing-launches-campaign-against-ai-generated-misinformation/",
    "https://www.artificialintelligence-news.com/2023/05/16/openai-ceo-ai-regulation-is-essential/",
    "https://www.artificialintelligence-news.com/2023/05/15/jay-migliaccio-ibm-watson-on-leveraging-ai-to-improve-productivity/",
    "https://www.artificialintelligence-news.com/2023/05/15/iurii-milovanov-softserve-how-ai-ml-is-helping-boost-innovation-and-personalisation/",
    "https://www.artificialintelligence-news.com/2023/05/11/ai-and-big-data-expo-north-america-begins-in-less-than-one-week/",
    "https://www.artificialintelligence-news.com/2023/05/11/eu-committees-green-light-ai-act/",
    "https://www.artificialintelligence-news.com/2023/05/09/wozniak-warns-ai-will-power-next-gen-scams/",
    "https://www.artificialintelligence-news.com/2023/05/09/infocepts-ceo-shashank-garg-on-the-da-market-shifts-and-impact-of-ai-on-data-analytics/",
    "https://www.artificialintelligence-news.com/2023/05/02/ai-godfather-warns-dangers-and-quits-google/",
    "https://www.artificialintelligence-news.com/2023/04/28/palantir-demos-how-ai-can-used-military/",
    "https://www.artificialintelligence-news.com/2023/04/26/ftc-chairwoman-no-ai-exemption-to-existing-laws/",
    "https://www.artificialintelligence-news.com/2023/04/24/bill-gates-ai-teaching-kids-literacy-within-18-months/",
    "https://www.artificialintelligence-news.com/2023/04/21/google-creates-new-ai-division-to-challenge-openai/"
]

# One {"url": ..., "text": ...} record per article that was fetched and parsed
pages_content = []

# The context manager closes the session's pooled connections once scraping is done.
with requests.Session() as session:
    for url in article_urls:
        try:
            time.sleep(2)  # pause before each request to avoid hammering the site
            response = session.get(url, headers=headers, timeout=10)

            if response.status_code != 200:
                print(f"Failed to fetch article at {url}")
                continue

            # requests assumes ISO-8859-1 for text responses that declare no charset;
            # in that case detect the encoding from the body so the HTML is decoded
            # correctly before it is handed to the parser.
            if not response.encoding or response.encoding.upper() == "ISO-8859-1":
                response.encoding = response.apparent_encoding

            # Parse the HTML that was just fetched instead of letting newspaper
            # download the same page a second time without these headers/timeout.
            article = Article(url)
            article.download(input_html=response.text)
            article.parse()
            pages_content.append({"url": url, "text": article.text})
        except Exception as e:
            # Best effort: report the failure and continue with the remaining URLs.
            print(f"Error occurred while fetching article at {url}: {e}")

print(f"Successfully scraped {len(pages_content)} articles")

In [None]:
# Set up embeddings and DeepLake vector store
from langchain_openai import OpenAIEmbeddings
from langchain_deeplake.vectorstores import DeeplakeVectorStore

# OpenAI embedding model handed to the vector store
embedding_model_name = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=embedding_model_name)

# Configure DeepLake dataset path

# Option 1: Local storage (faster for development)
dataset_path_local = "./my_deeplake_analysis/"

# Option 2: Activeloop cloud storage
# NOTE(review): Deep Lake 4.x documentation shows the `al://` scheme for
# Activeloop-hosted datasets - confirm `hub://` is still accepted by the pinned version.
activeloop_org_id = ""  # TODO: Update this with your org id
dataset_path_cloud = f"hub://{activeloop_org_id}/langchain_ai_analysis"

# Choose your preferred storage option
USE_CLOUD_STORAGE = True  # True: Activeloop cloud storage, False: local storage

if USE_CLOUD_STORAGE and not activeloop_org_id.strip():
    # Without an org id the cloud path would be the malformed
    # "hub:///langchain_ai_analysis", so use the local dataset instead.
    print("activeloop_org_id is not set; cloud storage is unavailable.")
    dataset_path = dataset_path_local
    print(f"Using local storage: {dataset_path}")
elif USE_CLOUD_STORAGE:
    dataset_path = dataset_path_cloud
    print(f"Using Activeloop cloud storage: {dataset_path}")
else:
    dataset_path = dataset_path_local
    print(f"Using local storage: {dataset_path}")

print(f"DeepLake dataset will be stored at: {dataset_path}")

In [None]:
# Split documents and create DeepLake vector store
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# Splitter configured with chunk_size=1000 and chunk_overlap=100
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

# Expand every scraped article into chunk-level Document objects; each chunk
# records the URL it came from and the position of its article in pages_content.
documents = [
    Document(
        page_content=piece,
        metadata={
            "source": page["url"],
            "article_id": article_index,
            "chunk_type": "article_content",
        },
    )
    for article_index, page in enumerate(pages_content)
    for piece in text_splitter.split_text(page["text"])
]

print(f"Created {len(documents)} document chunks")

In [None]:
# Initialize DeepLake vector store and add documents
def _create_vectorstore(path):
    """Build a DeepLake vector store from ``documents`` at ``path``, overwriting existing data."""
    return DeeplakeVectorStore.from_documents(
        documents=documents,
        embedding=embeddings,
        dataset_path=path,
        overwrite=True,  # Set to False if you want to append to existing data
    )


try:
    # Only the creation itself is guarded, so an error in the informational
    # printing below can no longer trigger a rebuild of a store that succeeded.
    vectorstore = _create_vectorstore(dataset_path)
except Exception as e:
    print(f"Error creating DeepLake vector store: {e}")

    if dataset_path == dataset_path_local:
        # Local storage was already the target; retrying the same path would
        # only repeat the failure, so surface the original error.
        raise

    # Fallback to local storage
    print("\nFalling back to local storage...")
    dataset_path = dataset_path_local
    vectorstore = _create_vectorstore(dataset_path)
    print("Successfully created local DeepLake vector store!")
else:
    print(f"Successfully created DeepLake vector store with {len(documents)} documents!")
    print(f"Dataset path: {dataset_path}")

    # Display some basic information about the dataset (best effort only)
    if hasattr(vectorstore, 'dataset'):
        try:
            print(f"Dataset info: {vectorstore.dataset.info}")
        except AttributeError as info_error:
            print(f"Dataset info unavailable: {info_error}")

In [None]:
# Create retrieval tool with DeepLake
from langchain.tools import Tool

# Retriever over the vector store: similarity search with k=5
retriever_options = {"search_type": "similarity", "search_kwargs": {"k": 5}}
retriever = vectorstore.as_retriever(**retriever_options)

def retrieve_ai_news(query: str) -> str:
    """Search the stored AI news documents for content relevant to ``query``.

    Args:
        query: Free-text search string passed to the module-level ``retriever``.

    Returns:
        One formatted section per retrieved document (position, article id,
        source and content) joined by newlines; a fixed notice when nothing is
        retrieved; or an error description if retrieval raises.
    """

    def _render(position, doc):
        # Format a single retrieved document as a labelled text section.
        body = doc.page_content
        origin = doc.metadata.get('source', 'Unknown source')
        article = doc.metadata.get('article_id', 'Unknown')
        return f"--- Document {position} (Article {article}) ---\nSource: {origin}\nContent: {body}\n"

    try:
        matches = retriever.invoke(query)
        if not matches:
            return "No relevant documents found."

        sections = [_render(position, doc) for position, doc in enumerate(matches, 1)]
        return "\n".join(sections)
    except Exception as exc:
        # Errors are returned as text rather than raised to the caller.
        return f"Error retrieving documents: {str(exc)}"

# Wrap the retrieval function as a named tool
search_tool_description = "Search for relevant AI news and information to answer questions about artificial intelligence developments, regulations, and industry trends."
search_tool = Tool(
    name="Search_AI_News",
    func=retrieve_ai_news,
    description=search_tool_description,
)

print("Retrieval tool created successfully!")

In [None]:
# Create agent using LangGraph
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import MemorySaver

# In-memory checkpointer passed to the agent for conversation history
memory = MemorySaver()

# Chat model that drives the agent (temperature 0)
llm = ChatOpenAI(model="gpt-4.1-mini", temperature=0)

# Prompt given to the agent: its role, how to use the search tool, and report guidelines
analyst_prompt = """You are an AI research analyst specializing in artificial intelligence news and trends.
    You have access to a comprehensive DeepLake vector database containing recent AI news articles.

    Use the Search_AI_News tool to find relevant information before answering questions.

    When creating reports or analyses:
    1. Search for relevant information using the available tool
    2. Analyze the information critically
    3. Provide well-structured, comprehensive responses
    4. Include specific examples and data when available
    5. Cite sources when mentioning specific information
    6. Consider multiple perspectives when discussing controversial topics

    Always search for information before providing answers, even if you think you know the answer.

    The DeepLake vector store contains articles about AI developments, regulations, industry trends,
    and expert opinions from 2023. Use this rich dataset to provide informed, data-driven responses."""

# Assemble the agent with LangGraph's create_react_agent from the model, tool, checkpointer and prompt
agent = create_react_agent(
    model=llm,
    tools=[search_tool],
    checkpointer=memory,
    prompt=analyst_prompt,
)

print("Agent created successfully!")

In [None]:
# Run the agent once; config carries the thread_id for this session
config = {"configurable": {"thread_id": "analysis_session"}}

user_request = {
    "role": "user",
    "content": "Write an overview of Artificial Intelligence regulations by governments by country",
}
response = agent.invoke({"messages": [user_request]}, config)

# The last message in the returned list is the one printed as the report
final_message = response['messages'][-1]
print(final_message.content)