# Mendable Firecrawl -> Weaviate -> DSPy!

In [133]:
!pip install firecrawl-py==0.0.14 > /dev/null

In [66]:
import os

from firecrawl import FirecrawlApp

# Read the key from the FIRECRAWL_API_KEY environment variable so a real secret never has
# to be saved in the notebook; the original placeholder is kept as the fallback value.
app = FirecrawlApp(api_key=os.environ.get("FIRECRAWL_API_KEY", "YOUR-FIRECRAWL-KEY"))


In [69]:
# Scrape a single blog post with the Firecrawl client and keep the raw response for inspection.
scraped_data = app.scrape_url("https://www.databricks.com/blog/accelerating-innovation-jetblue-using-databricks")

In [70]:
# Display the raw scrape response to see which fields it contains.
scraped_data

 'metadata': {'title': 'Accelerating Innovation at JetBlue Using Databricks | Databricks Blog',
  'robots': 'index, follow',
  'ogTitle': 'Accelerating Innovation at JetBlue Using Databricks',
  'ogUrl': 'https://www.databricks.com/blog/accelerating-innovation-jetblue-using-databricks',
  'ogImage': 'https://www.databricks.com/sites/default/files/2023-06/Databricks-OG-jetBlue.jpg?v=1687537398',
  'ogLocale': 'en_US',
  'ogLocaleAlternate': [],
  'ogSiteName': 'Databricks',
  'modifiedTime': 'Fri, 05/17/2024 - 22:37',
  'publishedTime': 'Thu, 06/22/2023 - 18:38',
  'sourceURL': 'https://www.databricks.com/blog/accelerating-innovation-jetblue-using-databricks'}}

In [82]:
from typing import List

def get_markdown_from_Firecrawl(website_urls: List[str], client=None) -> List[dict]:
    """Scrape each URL with Firecrawl and pair the scraped text with its source link.

    Args:
        website_urls: URLs to scrape, processed in the order given.
        client: Object exposing ``scrape_url(url)``. Defaults to the notebook-level
            ``app`` so existing calls keep working; pass a stub to run offline.

    Returns:
        One dict per URL, in input order, with the keys ``"content"`` (the
        ``"content"`` field of the scrape response) and ``"weblink"`` (the URL
        that was scraped). An empty input yields an empty list.

    Raises:
        KeyError: If a scrape response has no ``"content"`` field.
    """
    # Resolve the global lazily so that callers passing their own client never touch `app`.
    scraper = app if client is None else client
    return [
        {"content": scraper.scrape_url(url)["content"], "weblink": url}
        for url in website_urls
    ]

In [107]:
# Scrape the pages to ingest; each entry of `results` is a dict with "content" and "weblink" keys.
results = get_markdown_from_Firecrawl(["https://www.databricks.com/blog/accelerating-innovation-jetblue-using-databricks"])

# Create Weaviate WebChunk Collection

`!pip install weaviate-client==4.6.4`

In [95]:
import weaviate
import weaviate.classes.config as wvcc

# Connect to the Weaviate instance running on this machine.
weaviate_client = weaviate.connect_to_local()

# `collections.create` fails when "WebChunk" already exists, so reuse the existing
# collection on re-runs instead of crashing.
# NOTE: re-running the ingestion cell against a reused collection inserts the chunks
# again; delete the collection first if a clean rebuild is wanted.
if weaviate_client.collections.exists("WebChunk"):
    web_chunks = weaviate_client.collections.get("WebChunk")
else:
    web_chunks = weaviate_client.collections.create(
        name="WebChunk",
        # Objects are embedded with Cohere's multilingual model.
        vectorizer_config=wvcc.Configure.Vectorizer.text2vec_cohere(
            model="embed-multilingual-v3.0"
        ),
        properties=[
            wvcc.Property(name="content", data_type=wvcc.DataType.TEXT),
            wvcc.Property(name="weblink", data_type=wvcc.DataType.TEXT),
        ],
    )

  web_chunks = weaviate_client.collections.create(


In [108]:
# Split every scraped page into fixed-size word windows and store each window as one
# WebChunk object. `results` is deliberately not rebound here, so the cell can be
# re-run without re-scraping and every scraped page (not only the first) is ingested.
chunk_size = 300  # words per chunk

for page in results:
    weblink = page["weblink"]
    words = page["content"].split()
    for start in range(0, len(words), chunk_size):
        web_chunks.data.insert(
            properties={
                "content": " ".join(words[start:start + chunk_size]),
                "weblink": weblink,
            }
        )

In [110]:
# Run a hybrid query against the WebChunk collection, limited to three results.
response = web_chunks.query.hybrid(query="How does JetBlue use Databricks?", limit=3)

# Print the stored properties of each returned object.
for hit in response.objects:
    print(hit.properties)

{'content': 'Catalog](https://www.databricks.com/product/unity-catalog) role-based access to documents in the vector database document store. Using this framework, any JetBlue user can access the same chatbot hidden behind Azure AD SSO protocols and Databricks Unity Catalog Access Control Lists (ACLs). Every product, including the BlueSky real-time digital twin, ships with embedded LLMs. ![JetBlue’s Chatbot based on Microsoft Azure OpenAI APIs and Databricks Dolly](https://www.databricks.com/sites/default/files/inline-images/image5.png?v=1687203897) JetBlue’s Chatbot based on Microsoft Azure OpenAI APIs and Databricks Dolly By deploying AI and ML enterprise products on Databricks using data in lakehouse, JetBlue has thus far unlocked a relatively high Return-on-Investment (ROI) multiple within two years. In addition, Databricks allows the Data Science and Analytics teams to rapidly prototype, iterate and launch data pipelines, jobs and ML models using the [lakehouse](https://www.databr

# Connect to DSPy RAG

`!pip install dspy-ai==2.4.9`

In [130]:
import os

import dspy # !pip install dspy-ai==2.4.9
from dspy.retrieve.weaviate_rm import WeaviateRM
import weaviate

# Retrieval model backed by the "WebChunk" collection, using the existing Weaviate connection.
retriever_model = WeaviateRM("WebChunk", weaviate_client=weaviate_client)

# Read the key from the COHERE_API_KEY environment variable so a real secret never has to
# be saved in the notebook; the original placeholder is kept as the fallback value.
command_r_plus = dspy.Cohere(model="command-r-plus",
                             api_key=os.environ.get("COHERE_API_KEY", "YOUR-COHERE-API-KEY"),
                             max_tokens=4000)

# Register the language model and the retrieval model as the DSPy defaults.
dspy.settings.configure(lm=command_r_plus, rm=retriever_model)

In [131]:
class RAG(dspy.Module):
    """Retrieve-then-generate module: fetch passages for a question, then answer from them.

    Args:
        num_passages: Number of passages requested from the retriever (``k``).
    """

    def __init__(self, num_passages=3):
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        self.generate_answer = dspy.ChainOfThought("question, contexts -> precise_answer")

    def forward(self, question):
        """Answer ``question`` using the retrieved passages as context.

        Returns:
            A ``dspy.Prediction`` whose ``answer`` field holds the generated
            ``precise_answer``.
        """
        passages = self.retrieve(question).passages
        # Separate passages with a blank line: joining on "" fused the last word of one
        # passage with the first word of the next.
        contexts = "\n\n".join(passages)
        prediction = self.generate_answer(question=question, contexts=contexts)
        return dspy.Prediction(answer=prediction.precise_answer)

In [132]:
# Build the pipeline with its default settings and print the answer to a single question.
rag = RAG()

question = "How does JetBlue use Databricks?"
print(rag(question).answer)

JetBlue uses Databricks in several ways, including providing role-based access to documents, deploying AI and ML enterprise products, enabling rapid prototyping and iteration for data pipelines and ML models, and using Delta Live Tables and their custom BlueML library for AI and ML model training and inference.
