In [5]:
# Smoke test: confirm the kernel is running and can produce output.
greeting = "Hello Jupyter notebook!"
print(greeting)


Hello Jupyter notebook!


In [27]:
#import for loading env variables
from dotenv import load_dotenv
import os
#now let's import the ai stuff
import openai
import langchain
#import vectorestore we will be using
from langchain.vectorstores import FAISS
#load the easy loader for text
from langchain.document_loaders import TextLoader
#load wikipedia loader
from langchain.document_loaders import WikipediaLoader
#now import the embedding engine
from langchain.embeddings.openai import OpenAIEmbeddings
#import the text splitter used to chunk documents
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [31]:
# Load variables from a local .env file (if one exists) into the process
# environment, then read the OpenAI key that the embeddings cell needs.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Fail fast: os.getenv returns None when the variable is absent, and without
# this check the problem would only surface later, when the key is first used.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )


## Now let's load a Wikipedia document.
### LangChain has built-in loaders for many types of data. See the full list **[here](https://python.langchain.com/docs/modules/data_connection/document_loaders)**.

In [21]:
# Pull the "Klarna" article from Wikipedia. The loader is asked for at most
# one page (load_max_docs) with a content cap of 100,000 characters
# (doc_content_chars_max).
loader = WikipediaLoader(
    "Klarna",
    load_max_docs=1,
    doc_content_chars_max=100000,
)
doc = loader.load()

In [23]:
# Sanity-check the load: how many documents came back, how long the first
# one is, and a short preview of its text.
print(f"you have loaded {len(doc)} documents")
first_page_text = doc[0].page_content
print(f"the document contains {len(first_page_text)} characters")
print(f"here are the first 100 characters: {first_page_text[0:100]}")

you have loaded 1 documents
the document contains 8682 characters
here are the first 100 characters: Klarna Bank AB, commonly referred to as Klarna, is a Swedish fintech company that provides online fi


### We need to split our Wikipedia document into smaller chunks. We do this with a text splitter; LangChain offers several to choose from. Check **[this](https://python.langchain.com/docs/modules/data_connection/document_transformers/)** out for LangChain's built-in document transformer integrations.

In [24]:
# Break the loaded article into overlapping chunks. chunk_size=3000 and
# chunk_overlap=400 are handed straight to the splitter (presumably measured
# in characters, the splitter's default length function -- confirm if changed).
# RecursiveCharacterTextSplitter is imported in the notebook's imports cell.
splitter = RecursiveCharacterTextSplitter(chunk_size=3000, chunk_overlap=400)
docs = splitter.split_documents(doc)

### Let's check that it worked and that we now have multiple documents from the single Wikipedia article we loaded.

In [25]:
# Summarise the split: number of chunks and their mean length in characters.
total_chars_in_docs = sum(len(chunk.page_content) for chunk in docs)
# Guard the mean so an empty split reports 0 instead of raising ZeroDivisionError.
avg_chars_per_doc = total_chars_in_docs / len(docs) if docs else 0.0
print(f"nowe have {len(docs)} documents with an average {avg_chars_per_doc:,.0f} chars per document")

nowe have 7 documents with an average 1,296 chars per document


### Create embeddings and store them in a **[FAISS](https://python.langchain.com/docs/modules/data_connection/vectorstores/integrations/faiss)** vector store

In [30]:
# Build the OpenAI embeddings client, passing in the openai_api_key variable.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Embed every chunk and index the vectors together with their source text in
# a FAISS vector store. This step makes a call to OpenAI.
docsearch = FAISS.from_documents(documents=docs, embedding=embeddings)


100%|██████████| 1/1 [00:00<00:00,  1.23it/s]


### Before we can ask questions about our document, we need to use a retriever.
Read about retrievers **[here](https://python.langchain.com/docs/modules/data_connection/retrievers/)**