In [1]:
import os
from dotenv import load_dotenv
import cohere

load_dotenv()

# Read the Cohere key from the process environment (load_dotenv() above has
# already merged any .env file into it).
api_key = os.getenv("COHERE_API_KEY")
if not api_key:
    # Name the exact variable that was looked up so the message is actionable.
    raise ValueError("No API key found. Please set the COHERE_API_KEY environment variable.")

# NOTE(review): this only sets an attribute on the `cohere` module; confirm the
# installed SDK version actually reads it. ChatCohere below is not passed the key
# explicitly, so it presumably resolves it from the environment itself.
cohere.api_key = api_key

from langchain_cohere import ChatCohere

# LangChain chat model bound to Cohere's "command-r-plus".
llm = ChatCohere(model="command-r-plus")

A Document is an object with some page_content (str) and metadata (dict).

WebBaseLoader: uses urllib to load HTML from web URLs.

BeautifulSoup: parses the HTML into text.

In [10]:
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Parse only the elements carrying the post's title, header and content classes;
# everything else in the page HTML is discarded.
bs4_strainer = bs4.SoupStrainer(
    class_=("post-title", "post-header", "post-content"),
)
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(parse_only=bs4_strainer),
)
docs = loader.load()

# Character count of the first loaded Document.
len(docs[0].page_content)

43131

DocumentLoader: an object that loads data from a source as a list of Documents.

#### Next step: chunking

We set add_start_index=True so that the character index at which each split Document starts within the initial Document is preserved as metadata attribute “start_index”.

In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter settings gathered in one place: 1000-character chunks with a
# 200-character overlap, recording each chunk's start offset in its metadata.
splitter_options = dict(chunk_size=1000, chunk_overlap=200, add_start_index=True)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
all_splits = text_splitter.split_documents(docs)

# Number of chunks produced.
len(all_splits)

66

In [13]:
# Length, in characters, of the first chunk's text.
len(all_splits[0].page_content)

969

In [14]:
# Metadata attached to the chunk at index 10.
all_splits[10].metadata

{'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/',
 'start_index': 7056}

Haystack as a replacement for Chroma

In [30]:
from haystack import Pipeline
from haystack.components.builders import DynamicChatPromptBuilder
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.cohere import CohereChatGenerator
from haystack.utils import Secret
import os
from dotenv import load_dotenv

load_dotenv()

# Fail early with a clear message when the key is missing, instead of handing
# None to Secret.from_token and getting an opaque error from inside Haystack.
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
if not COHERE_API_KEY:
    raise ValueError("No API key found. Please set the COHERE_API_KEY environment variable.")

# Two-step chat pipeline: the prompt builder renders the templated messages,
# and its output is fed to the Cohere chat generator.
pipe = Pipeline()
pipe.add_component("prompt_builder", DynamicChatPromptBuilder())
pipe.add_component("llm", CohereChatGenerator(api_key=Secret.from_token(COHERE_API_KEY)))
pipe.connect("prompt_builder", "llm")

location = "Berlin"
system_message = ChatMessage.from_system("You are an assistant giving out valuable information to language learners.")
# {{location}} is a template placeholder filled from template_variables at run time.
messages = [system_message, ChatMessage.from_user("Tell me about {{location}}")]

res = pipe.run(data={"prompt_builder": {"template_variables": {"location": location}, "prompt_source": messages}})
print(res)

{'llm': {'replies': [ChatMessage(content="Berlin is the capital of Germany and, with a population of over 3.5 million people, is the country's largest city. It's located in northeastern Germany, along the River Spree. Berlin is well known for its rich history, vibrant nightlife, and cultural diversity. \n\nThe city's past can be traced back to the 13th century, but its modern history was largely shaped by the divisions of the Cold War. Berlin was divided into East and West from 1949 to 1990, with the Berlin Wall acting as a physical barrier between the two sides for over 28 years. The fall of the Wall in 1989 was a significant moment in world history, and its legacy can still be felt throughout the city. \n\nBerlin has numerous famous landmarks, many of which are iconic reminders of its divided past. The Berlin Wall's East Side Gallery, where a portion of the Wall is adorned with paintings, and the Brandenburg Gate, which stood between East and West Berlin, are both popular tourist att

In [31]:
# Reuse the system message and pipeline from the previous cell with a new user
# prompt that has two template placeholders.
forecast_prompt = ChatMessage.from_user(
    "What's the weather forecast for {{location}} in the next {{day_count}} days?"
)
messages = [system_message, forecast_prompt]

builder_inputs = {
    "template_variables": {"location": location, "day_count": "5"},
    "prompt_source": messages,
}
res = pipe.run(data={"prompt_builder": builder_inputs})

print(res)

{'llm': {'replies': [ChatMessage(content="As an AI chatbot, I have no access to real-time weather information. However, I can provide you with the general forecast for the next five days in Berlin. Please note that the actual weather might vary:\n\nDay 1: Sunny intervals with a high of 23°C and a low of 14°C. There's a possibility of scattered thunderstorms.\n\nDay 2: Partly cloudy with a high of 24°C and a low of 15°C. Chance of rain is 60%. \n\nDay 3: Mostly cloudy with a high near 25°C and a low of 16°C. There might be a few showers. \n\nDay 4: Cloudy with a high of 26°C and a low of 17°C. Rain is likely.\n\nDay 5: More clouds than sun with a high of 25°C and a low of 16°C. Rain is possible.\n\nPlease check with your local weather apps or channels for more accurate and up-to-date information. The weather can be very unpredictable these days!", role=<ChatRole.ASSISTANT: 'assistant'>, name=None, meta={'model': 'command-r', 'usage': 269, 'index': 0, 'finish_reason': 'COMPLETE', 'docume

Trafilatura

In [35]:
import trafilatura
# Download the page, then show the text trafilatura extracts from it.
article_url = 'https://github.blog/2019-03-29-leader-spotlight-erin-spiceland/'
downloaded = trafilatura.fetch_url(article_url)
trafilatura.extract(downloaded)

'Leader spotlight: Erin Spiceland\nWe’re spending Women’s History Month with women leaders who are making history every day in the tech community. Read more about Erin Spiceland: Software Engineer at SpaceX.\nEvery March we recognize the women who have shaped history—and now, we’re taking a look forward. From driving software development in large companies to maintaining thriving open source communities, we’re spending Women’s History Month with women leaders who are making history every day in the tech community. Erin Spiceland is a Software Engineer for SpaceX. Born and raised in rural south Georgia, she is a Choctaw and Chickasaw mother of two now living in downtown Los Angeles. Erin didn’t finish college—she’s a predominantly self-taught software engineer. In her spare time, she makes handmade Native American beadwork and regalia and attends powwows.\nHow would you summarize your career (so far) in a single sentence?\nMy career has been a winding road through periods of stimulation

In [36]:
from haystack import Document
from haystack import Pipeline
from haystack.components.builders import DynamicChatPromptBuilder
from haystack.components.generators.utils import print_streaming_chunk
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.dataclasses import ChatMessage
from haystack.utils import Secret

from haystack_integrations.components.generators.cohere import CohereChatGenerator

# Components of the RAG pipeline, wired together further down.
fetcher = LinkContentFetcher()
converter = HTMLToDocument()
# "documents" is declared as a runtime variable so another component can supply it.
prompt_builder = DynamicChatPromptBuilder(runtime_variables=["documents"])
# NOTE(review): this rebinds `llm`, which the first cell bound to a ChatCohere instance.
llm = CohereChatGenerator(Secret.from_token(COHERE_API_KEY))

message_template = """Answer the following question based on the contents of the article: {{query}}\n
               Article: {{documents[0].content}} \n 
           """
messages = [ChatMessage.from_user(message_template)]

rag_pipeline = Pipeline()
for component_name, component in (
    ("fetcher", fetcher),
    ("converter", converter),
    ("prompt_builder", prompt_builder),
    ("llm", llm),
):
    rag_pipeline.add_component(name=component_name, instance=component)

# Data flow: fetcher -> converter -> prompt_builder -> llm.
for sender, receiver in (
    ("fetcher.streams", "converter.sources"),
    ("converter.documents", "prompt_builder.documents"),
    ("prompt_builder.prompt", "llm.messages"),
):
    rag_pipeline.connect(sender, receiver)

question = "What are the capabilities of Cohere?"

# Per-component inputs, keyed by the component names registered above.
run_inputs = {
    "fetcher": {"urls": ["https://docs.cohere.com/reference/about"]},
    "prompt_builder": {"template_variables": {"query": question}, "prompt_source": messages},
    "llm": {"generation_kwargs": {"max_tokens": 165}},
}
result = rag_pipeline.run(run_inputs)
print(result)

ImportError: Failed to import 'trafilatura'. Run 'pip install trafilatura'. Original error: No module named 'trafilatura'