In [22]:
from dotenv import load_dotenv
# Read the local .env file so that the keys fetched later with os.getenv(...)
# are available; the call's return value is shown as the cell output.
load_dotenv(dotenv_path='.env')

True

## Indexing approaches

- Product quantization
- Locality-sensitive hashing
- Navigable small world graphs
- Hierarchical Navigable Small World graphs
- Maximal Marginal Relevance

# RAG - Retrieval Augmented Generation

# Index data

In [2]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# PDF to index (path is relative to the notebook's working directory).
file_path = "./mixed_data/ESLII_print12_toc (1).pdf"

# Splitter configured with chunk_size=500 and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# Load the PDF and split it into chunked documents in a single call.
loader = PyPDFLoader(file_path)
data = loader.load_and_split(text_splitter=text_splitter)

In [3]:
from langchain.embeddings import OpenAIEmbeddings

# Embedding client created with default settings.
embeddings = OpenAIEmbeddings()

# Embed a short sample string and print the raw vector.
sample_text = "Hello, world!"
vector1 = embeddings.embed_query(sample_text)
print(vector1)

[0.0015633959929713234, 0.0033545716070579423, -0.012787511877914966, -0.03341191714403767, -0.009461612666816454, 0.0047339919223508475, -0.015380694898565025, 0.0017139217061836536, -0.0029770627863812964, -0.02497610695758453, 0.029792927917733907, 0.0071296591153324545, -0.016807901143473084, -0.018005734739963888, 0.01039184468493336, -0.002722204495100479, 0.02519273631890871, -0.015074865321557083, 0.011271105417323141, 0.010793246586832908, -0.00821917803116117, -0.0017983434220381646, 0.017139217294667185, 0.006017840145582797, -0.01431029044771463, -0.0072698315480659425, 0.0034915579851875116, -0.015915897123990223, 0.03720930307641858, -0.025778910761383625, 0.009843899172415084, -0.006664543106274001, -0.004797706495171052, -0.0138642884379732, 0.011787194109092614, -0.018999681330901, 0.005030264732530928, -0.011334819990143345, 0.018948710045173872, -0.011819050929841418, 0.004947435694732403, 0.005555909725466965, 0.0030280345377697196, -0.006173940988690688, -0.0266836

In [4]:
import numpy as np
from numpy.linalg import norm

def cosine_similarity(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: First vector (a sequence of numbers or a NumPy array).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the similarity is undefined, so ``0.0`` is returned instead of
        performing a 0/0 division (which would produce ``nan`` and a
        RuntimeWarning).
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction; avoid dividing by zero.
        return 0.0
    return np.dot(a, b) / denominator

# Embed the three phrases first, then compare the first one against the others.
vector1 = embeddings.embed_query("artificial intelligence")
vector2 = embeddings.embed_query("machine learning")
vector3 = embeddings.embed_query("pepperoni pizza")

# One similarity score per line, in the order: vector2, vector3.
for other_vector in (vector2, vector3):
    print(cosine_similarity(vector1, other_vector))



0.882098879127071
0.7421712111996903


In [5]:
# From here on, FAISS is not used because it has issues on macOS

In [6]:
# from langchain.chains import RetrievalQA
# from langchain.chat_models import ChatOpenAI

# retrever = index.as_retriever()
# retrever.search_kwargs["fetch_k"] = 20 # Get 20 results
# retrever.search_kwargs["maximal_marginal_relevance"] = True # Use MMR to rerank results
# retrever.search_kwargs["k"] = 10 

# llm = ChatOpenAI()

# chain = RetrievalQA(
#     llm=llm,
#     retriever=retrever,
#     verbose=True,
# )

# Loading data into a vector database

In [9]:
import pinecone, os
from langchain.vectorstores.pinecone import Pinecone

# Initialise the Pinecone client with credentials read from the environment.
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),
    environment=os.getenv("PINECONE_ENVIRONMENT"),
)

# Wrap the "langchain-demo" index as a LangChain vector store.
demo_index = pinecone.Index("langchain-demo")
vecdb = Pinecone(demo_index, embeddings, "text")  # "text" is the DB text field

# Upload the chunked documents; the returned IDs are shown as the cell output.
# NOTE(review): re-running this cell presumably uploads the same chunks again
# under new IDs — confirm before re-executing.
vecdb.add_documents(data)



['3719e133-99c8-4595-98e4-9a93ee22c70e',
 '48a2cee6-300a-4bd4-821d-f69abf41c455',
 '48d94cf6-edb6-4650-89f5-0d9ad09db4e6',
 '6995750c-618a-44b2-aa11-c9f5c8e44c57',
 '27e428fa-2343-4b68-8d2f-5cd2432b18a8',
 '6598a2c2-ec06-4af1-a225-28d70033f7f7',
 'a00f434a-c8c2-4941-b9ad-804b3122612c',
 '19d3ff36-f4e0-4f26-b9aa-4a25ec2c3300',
 '72b139de-c7e3-4c62-a381-a602e6bef070',
 'e265fc2f-e769-4ab5-b8a1-069ce558cdc2',
 'e3056229-9323-4602-965a-a82dd9791e58',
 '71a1bbd1-bea4-4fcf-baa2-f2e68e54b1c8',
 'a1577831-23f4-4c84-940c-48b681dbbd8f',
 'd3b94055-d29d-4298-91ac-c24923edc654',
 '058be3cb-1366-476b-84ed-322d0aceeceb',
 '44470f27-2b94-4332-9818-26118997d7c1',
 '172e49c2-f91f-44a1-a8f7-15f7c318ebd5',
 'a17b7ce5-70d9-4417-bb38-d7ecb71b0da6',
 '19aa5b07-ab9f-4efb-8d8b-f58e094e3e48',
 'bd17ddfe-be3e-4917-b651-2430af2622ee',
 '3e12464f-91fc-4677-b099-bab21b085d91',
 '053beba5-efec-4aa3-b81b-a45eb4493236',
 'd542d592-92ca-4f0a-9dfb-cb70cfc14b47',
 '5a2e0936-a3ae-4a56-a1aa-24f3c4be959a',
 '7733c9b5-4e3a-

In [10]:

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks import StdOutCallbackHandler

# Chat model and stdout callback handler, both reused by later cells.
llm = ChatOpenAI()
handler = StdOutCallbackHandler()

# Retrieval QA chain backed by the Pinecone vector store.
pdf_retriever = vecdb.as_retriever()
chain = RetrievalQA.from_chain_type(llm=llm, retriever=pdf_retriever, verbose=True)

# Ask a question; the answer is shown as the cell output.
question = "what is a machine learning?"
chain.run(question, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
While the approach is statistical, theemphasis is on concepts rather than mathematics. Many examples are given, with a liberaluse of color graphics. It should be a valuable resource for statisticians and anyone interestedin data mining in science or industry. The book’s coverage is broad, from supervised learning(prediction) to unsupervised learning. The many topics include neural networks, supportvector machines, classification trees and boosting—the first comprehensive treatment of thistopic

The challenges in learning from data have led to a revolution in the sta-
tisticalsciences.Sincecomputationplayssuchakeyrole, itisnots

'Machine learning is a field of study and application of artificial intelligence (AI) that focuses on the development of algorithms and models that enable computers to learn and make predictions or decisions without being explicitly programmed. It involves using statistical techniques and algorithms to analyze and interpret data, identify patterns, and make informed predictions or decisions. Machine learning is often used in areas such as data mining, pattern recognition, natural language processing, and computer vision.'

# Providing sources

In [14]:
from datetime import date, timedelta
from newsapi import NewsApiClient

# Seven-day window ending today.
today = date.today()
last_week = today - timedelta(days=7)

newsapi = NewsApiClient(api_key=os.getenv("NEWSAPI_API_KEY"))

# Query parameters collected in one place, then passed as keyword arguments.
search_params = {
    "q": "artificial intelligence",
    "from_param": last_week.strftime("%Y-%m-%d"),
    "to": today.strftime("%Y-%m-%d"),
    "language": "en",
    "sort_by": "relevancy",
}
latest_news = newsapi.get_everything(**search_params)

In [15]:
# Report how many articles came back and preview only the first few,
# instead of dumping the entire API response into the notebook output.
print(len(latest_news["articles"]))
latest_news["articles"][:3]



In [16]:
from langchain.docstore.document import Document

# Turn each news article into a Document whose source is the article URL.
docs = []

for article in latest_news["articles"]:
    description = article.get("description")
    if description is None:
        print("Skipping article with no description " + article["url"])
        continue

    # Guard against a null title as well: fall back to the description alone
    # rather than failing on `None + str`.
    title = article.get("title")
    if title is None:
        page_content = description
    else:
        page_content = title + "\n\n" + description

    docs.append(Document(
        page_content=page_content,
        metadata={
            "source": article["url"]
        }
    ))

# Preview the first document only if at least one article was kept.
if docs:
    print(docs[0].page_content)
    print(docs[0].metadata)
else:
    print("No articles with a description were returned")

Skipping article with no description https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_cbd1874f-4917-49da-aea1-0f6d7ca0bee9
Skipping article with no description https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_c46a6497-8373-4184-a8e2-a6336345dbc1
Skipping article with no description https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_4b0742a6-2d10-4447-93cf-569917952906
Skipping article with no description https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_7cb8d15f-a0cd-4fe2-b0cc-0908315f8651
Skipping article with no description https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_b1b7de76-4b06-442e-9d01-17a0e2958637
Skipping article with no description https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_f4a0003d-1b7e-40b2-ad3f-d6c71ba78fe2
Skipping article with no description https://consent.yahoo.com/v2/collectConsent?sessionId=1_cc-session_fd93e5f1-3e13-4270-8e6f-8b7c092693d3
Can artificia

In [17]:
from langchain.chains import create_qa_with_sources_chain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate
from langchain.vectorstores import FAISS

# How each retrieved document is rendered before being placed in the prompt.
doc_prompt = PromptTemplate(
    template="Content: {page_content} \nSource: {source}",
    input_variables=["page_content", "source"],
)

# QA-with-sources LLM chain, wrapped in a StuffDocumentsChain that feeds the
# rendered documents in through the "context" variable.
qa_chain = create_qa_with_sources_chain(llm=llm)
final_qa_chain = StuffDocumentsChain(
    llm_chain=qa_chain,
    document_variable_name="context",
    document_prompt=doc_prompt,
)

# Index the news documents and connect retrieval to the combine step.
index = FAISS.from_documents(docs, embeddings)
news_retriever = index.as_retriever()
chain = RetrievalQA(
    retriever=news_retriever,
    combine_documents_chain=final_qa_chain,
)

In [18]:
# Ask the sources-aware chain a question; the result is shown as the cell output.
chain.run("What is the most important news about artificial intelligence from the last week?")

'{\n  "answer": "The most important news about artificial intelligence from the last week is that chip maker Intel announced its latest ambitions in AI by establishing an independent company called Articul8 AI, dedicated to generative AI. This news was reported by Quartz. Additionally, Forbes highlighted the impressive new AI software from Google in a tech showdown with OpenAI\'s Gemini. This news was reported by Forbes. Lastly, CNET emphasized the importance of getting familiar with generative AI tools like ChatGPT as they are rewriting how we live and work today. This news was reported by CNET.",\n  "sources": [\n    "https://qz.com/intels-ai-plans-xs-plunging-value-the-week-in-tech-an-1851144926",\n    "https://www.forbes.com/sites/jonmarkman/2024/01/03/tech-showdown-openais-triumph-over-googles-gemini-exposed/",\n    "https://www.cnet.com/tech/computing/features/everyday-ai-chatbots-are-rewriting-how-we-live-and-work-today/"\n  ]\n}'

# Indexing a website

In [25]:
from langchain.utilities import ApifyWrapper
from langchain.docstore.document import Document

apify = ApifyWrapper()

# Crawl the newsletter site and map every crawled item to a Document.
loader = apify.call_actor(
    actor_id="apify/website-content-crawler",
    run_input={
        "startUrls": [{"url": "https://newsletter.theaiedge.io"}],
        # Fixed typo: the key was spelled "aggressibePrune", so the option
        # presumably never took effect.
        "aggressivePrune": True,
    },
    # Items with no text become empty documents rather than failing on None.
    dataset_mapping_function=lambda item: Document(
        page_content=item["text"] or "",
        metadata={
            "source": item["url"],
        }
    ),
)

In [26]:
from langchain.indexes import VectorstoreIndexCreator

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)

# Uses OpenAI embeddings, and Chroma local index by default
index_creator = VectorstoreIndexCreator(text_splitter=text_splitter)
index = index_creator.from_loaders([loader])

# Display the resulting index wrapper as the cell output.
index

VectorStoreIndexWrapper(vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x17e2190c0>)

In [27]:
query = "What is the main subject of the aiedge newsletter?"

# The cell output is a dict with the question, the answer and its sources.
index.query_with_sources(query)

{'question': 'What is the main subject of the aiedge newsletter?',
 'answer': ' The main subject of the AiEdge Newsletter is artificial intelligence and its applications. \n',
 'sources': 'https://newsletter.theaiedge.io/'}

In [29]:
query = "What is the most recent article of the aiedge newsletter?"

# Retrieval QA over the vector store held by the index wrapper.
retriever = index.vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

qa.run(query=query, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
Keep reading with a 7-day free trial
Subscribe to The AiEdge Newsletter to keep reading this post and get 7 days of free access to the full post archives.

Keep reading with a 7-day free trial
Subscribe to The AiEdge Newsletter to keep reading this post and get 7 days of free access to the full post archives.

Keep reading with a 7-day free trial
Subscribe to The AiEdge Newsletter to keep reading this post and get 7 days of free access to the full post archives.

Keep reading with a 7-day free trial
Subscribe to The AiEdge Newsletter to keep reading this post and get 7 days of free access to the full post archives.
Human: What

"I'm sorry, but as an AI language model, I don't have access to real-time information or the ability to browse the internet. Therefore, I don't have the capability to provide you with the most recent article from the AiEdge Newsletter. You would need to subscribe to the newsletter or visit their website to get the most up-to-date information."

# Indexing GitHub

In [31]:
from langchain.document_loaders import GitLoader

def _is_python_file(file_path):
    """Return True when the path ends with ``.py``."""
    return file_path.endswith(".py")


# Load the .py files of the LangChain repository from the master branch.
loader = GitLoader(
    clone_url="https://github.com/langchain-ai/langchain",
    repo_path="./gitadata/langchain",
    branch="master",
    file_filter=_is_python_file,
)
documents = loader.load()

# Show the content of the first loaded file.
print(documents[0].page_content)

import json
import sys
import os

LANGCHAIN_DIRS = {
    "libs/core",
    "libs/langchain",
    "libs/experimental",
    "libs/community",
}

if __name__ == "__main__":
    files = sys.argv[1:]
    dirs_to_run = set()

    if len(files) == 300:
        # max diff length is 300 files - there are likely files missing
        raise ValueError("Max diff reached. Please manually run CI on changed libs.")

    for file in files:
        if any(
            file.startswith(dir_)
            for dir_ in (
                ".github/workflows",
                ".github/tools",
                ".github/actions",
                "libs/core",
                ".github/scripts/check_diff.py",
            )
        ):
            dirs_to_run.update(LANGCHAIN_DIRS)
        elif "libs/community" in file:
            dirs_to_run.update(
                ("libs/community", "libs/langchain", "libs/experimental")
            )
        elif "libs/partners" in file:
            partner_dir = file.split("/")[2]


In [32]:
# Number of documents currently held in `documents`.
len(documents)

3959

In [33]:
from langchain.text_splitter import Language

# Python-aware splitter configured with chunk_size=1000 and chunk_overlap=200.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    **splitter_options,
)

# Rebinds `documents` to the split result, so re-running this cell on its own
# splits the already-split documents again.
documents = python_splitter.split_documents(documents)

In [35]:
# Number of documents currently held in `documents`.
len(documents)

18505

In [36]:
query = "What is the stuff chain?"

# Build a FAISS index over `documents` and expose it as a retriever.
index = FAISS.from_documents(documents, embeddings)
retriever = index.as_retriever()

qa = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)

qa.run(query, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
chain = prompt | fake_llm

def _load_stuff_chain(
    llm: BaseLanguageModel,
    prompt: BasePromptTemplate = stuff_prompt.PROMPT,
    document_prompt: BasePromptTemplate = stuff_prompt.EXAMPLE_PROMPT,
    document_variable_name: str = "summaries",
    verbose: Optional[bool] = None,
    **kwargs: Any,
) -> StuffDocumentsChain:
    llm_chain = LLMChain(llm=llm, prompt=prompt, verbose=verbose)
    return StuffDocumentsChain(
        llm_chain=llm_chain,
        document_variable_name=document_variable_name,
        document_prompt=document_prompt,
        verbose=verbose,
        **kwargs,
    )

def _load_stuff_chain(
    llm

'The stuff chain is a part of a larger system called the StuffDocumentsChain. It is created by the function "_load_stuff_chain()" and consists of an LLMChain object and some additional parameters. The purpose of the stuff chain is not explicitly mentioned in the provided context, so I don\'t have enough information to give a more detailed explanation.'

In [38]:
# Request MMR re-ranking through `search_type`. The previous
# `search_kwargs["maximal_marginal_relevance"] = True` and
# `search_kwargs["distance_metric"] = "cos"` entries are not options this
# retriever acts on, so MMR was never applied. The distance-metric entry is
# dropped rather than translated.
retriever = index.as_retriever(
    search_type="mmr",
    search_kwargs={
        "fetch_k": 200,  # candidates fetched before re-ranking
        "k": 20,         # documents finally returned
    },
)

# Rebuild the QA chain on top of the reconfigured retriever.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
)

In [40]:
query = "When should I use map reduce chain?"

# Run the question through the QA chain; the handler prints the chain trace
# and the answer is shown as the cell output.
qa.run(query, callbacks=[handler])



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSystem: Use the following pieces of context to answer the user's question. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.
----------------
class MapReduceChain(Chain):
    """Map-reduce chain."""

    combine_documents_chain: BaseCombineDocumentsChain
    """Chain to use to combine documents."""
    text_splitter: TextSplitter
    """Text splitter to use."""
    input_key: str = "input_text"  #: :meta private:
    output_key: str = "output_text"  #: :meta private:

collapse_documents_chain=collapse_chain,
        token_max=token_max,
        verbose=verbose,
    )
    return MapReduceDocumentsChain(
        llm_chain=map_chain,
        document_variable_name=map_reduce_document_variable_name,
        reduce_documents_chain=reduce_documents_chain,
        verbose=

'You should use the MapReduce chain when you want to perform a map-reduce operation on a collection of documents. This chain first applies a map step to each document individually, using the specified language model chain. Then, it combines the results of the map step using a reduce step, which is typically another chain such as the ReduceDocumentsChain. This allows you to efficiently process and analyze large amounts of text data by distributing the computation across multiple documents.'