### Imports

In [15]:
import os
from dotenv import load_dotenv

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

### Create LLM

In [6]:
%pip install -qU \
    datasets==2.14.6 \
    fsspec==2023.9.2

Note: you may need to restart the kernel to use updated packages.


Ask question and get answer

In [2]:
from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.simple", split='train[:10000]')
data

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 10000
})

In [6]:
for i in data:
    print(f"{i["title"]}")

April
August
Art
A
Air
Autonomous communities of Spain
Alan Turing
Alanis Morissette
Adobe Illustrator
Andouille
Farming
Arithmetic
Addition
Australia
American English
Aquaculture
Abbreviation
Angel
Ad hominem
Native American
Apple
Abrahamic religion
Algebra
Atom
Astronomy
Architecture
Anatomy
Asteroid
Afghanistan
Angola
Argentina
Austria
Armenia
Archaeology
Application
Animal
Acceleration
Black pudding
Boot device
Boot
Bankruptcy
Breakfast sausage
Browser
Beekeeping
British English
Being
Beijing
Bottle
Berry
Boil
Beard
Black
Bubonic plague
Biology
Botany
Belgium
Brazil
Chemistry
Compound
Computer science
Computer
Chinese
Continent
Classical element
China
Country
Colchester
Cartography
Creator
Contact network
Chorizo
Creativity
Catharism
Cosmology
Church (building)
City
Cooking
Chat
Cup
Crime
Time Cube
Census of Marine Life
Maize
Civics
Calculus
Coin
Conceptual metaphor
Crust
Comedy
Comet
Cytology
Christian
Cheese
Constitution
Circle
Capitalization
Cuba
Cube
Cost of living
December
Dub

In [9]:
%pip install -qU \
  langchain \
  openai \
  pinecone-client \
  tiktoken

Note: you may need to restart the kernel to use updated packages.


In [10]:
import tiktoken

tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [11]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

tiktoken_len("hello I am a chunk of text and using the tiktoken_len function "
             "we can find the length of this chunk of text in tokens")

26

In [12]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [13]:
chunks = text_splitter.split_text(data[6]['text'])[:3]
chunks

['Alan Mathison Turing OBE FRS (London, 23 June 1912 – Wilmslow, Cheshire, 7 June 1954) was an English mathematician and computer scientist. He was born in Maida Vale, London.\n\nEarly life and family \nAlan Turing was born in Maida Vale, London on 23 June 1912. His father was part of a family of merchants from Scotland. His mother, Ethel Sara, was the daughter of an engineer.\n\nEducation \nTuring went to St. Michael\'s, a school at 20 Charles Road, St Leonards-on-sea, when he was five years old.\n"This is only a foretaste of what is to come, and only the shadow of what is going to be.” – Alan Turing.\n\nThe Stoney family were once prominent landlords, here in North Tipperary. His mother Ethel Sara Stoney (1881–1976) was daughter of Edward Waller Stoney (Borrisokane, North Tipperary) and Sarah Crawford (Cartron Abbey, Co. Longford); Protestant Anglo-Irish gentry.\n\nEducated in Dublin at Alexandra School and College; on October 1st 1907 she married Julius Mathison Turing, latter son o

In [14]:
tiktoken_len(chunks[0]), tiktoken_len(chunks[1]), tiktoken_len(chunks[2])

(299, 323, 382)

In [16]:
from langchain.embeddings.openai import OpenAIEmbeddings

model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=OPENAI_API_KEY
)

  warn_deprecated(


In [17]:
texts = [
    'this is the first chunk of text',
    'then another second chunk of text is here'
]

res = embed.embed_documents(texts)
len(res), len(res[0])

(2, 1536)

### Vector Database
To create our vector database we first need a free API key from Pinecone. Then we initialize like so:

In [18]:
from pinecone import Pinecone

# configure client
pc = Pinecone(api_key=PINECONE_API_KEY)

In [19]:
from pinecone import ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

In [20]:
import time

index_name = 'langchain-retrieval-augmentation'
existing_indexes = [
    index_info["name"] for index_info in pc.list_indexes()
]

# check if index already exists (it shouldn't if this is first time)
if index_name not in existing_indexes:
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=1536,  # dimensionality of ada 002
        metric='dotproduct',
        spec=spec
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
time.sleep(1)
# view index stats
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Indexing
We can perform the indexing task using the LangChain vector store object. But for now it is much faster to do it via the Pinecone python client directly. We will do this in batches of `100` or more.

In [21]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 100

texts = []
metadatas = []

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'wiki-id': str(record['id']),
        'source': record['url'],
        'title': record['title']
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['text'])
    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        texts = []
        metadatas = []

if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))

100%|██████████| 10000/10000 [27:06<00:00,  6.15it/s] 


In [22]:

index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 28422}},
 'total_vector_count': 28422}

### Creating a Vector Store and Querying
Now that we've build our index we can switch back over to LangChain. We start by initializing a vector store using the same index we just built. We do that like so:

In [23]:
from langchain.vectorstores import Pinecone

text_field = "text"  # the metadata field that contains our text

# initialize the vector store object
vectorstore = Pinecone(
    index, embed.embed_query, text_field
)

  warn_deprecated(


In [24]:
query = "who was Benito Mussolini?"

vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

[Document(page_content='Benito Amilcare Andrea Mussolini KSMOM GCTE (29 July 1883 – 28 April 1945) was an Italian politician and journalist. He was also the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party.\n\nBiography\n\nEarly life\nBenito Mussolini was named after Benito Juarez, a Mexican opponent of the political power of the Roman Catholic Church, by his anticlerical (a person who opposes the political interference of the Roman Catholic Church in secular affairs) father. Mussolini\'s father was a blacksmith. Before being involved in politics, Mussolini was a newspaper editor (where he learned all his propaganda skills) and elementary school teacher.\n\nAt first, Mussolini was a socialist, but when he wanted Italy to join the First World War, he was thrown out of the socialist party. He \'invented\' a new ideology, Fascism, much out of Nationalist\xa0and Conservative views.\n\nRise to power and becoming dictator\nIn 1922, he took power b

In [31]:
#moje
queryPL = "kim był mussolinii?"
vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

[Document(page_content='Benito Amilcare Andrea Mussolini KSMOM GCTE (29 July 1883 – 28 April 1945) was an Italian politician and journalist. He was also the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party.\n\nBiography\n\nEarly life\nBenito Mussolini was named after Benito Juarez, a Mexican opponent of the political power of the Roman Catholic Church, by his anticlerical (a person who opposes the political interference of the Roman Catholic Church in secular affairs) father. Mussolini\'s father was a blacksmith. Before being involved in politics, Mussolini was a newspaper editor (where he learned all his propaganda skills) and elementary school teacher.\n\nAt first, Mussolini was a socialist, but when he wanted Italy to join the First World War, he was thrown out of the socialist party. He \'invented\' a new ideology, Fascism, much out of Nationalist\xa0and Conservative views.\n\nRise to power and becoming dictator\nIn 1922, he took power b

### Generative Question-Answering
In GQA we take the query as a question that is to be answered by a LLM, but the LLM must answer the question based on the information it is seeing being returned from the `vectorstore`.

To do this we initialize a `RetrievalQA` object like so:

In [25]:
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

# completion llm
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-3.5-turbo',
    temperature=0.0
)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

  warn_deprecated(


In [26]:
qa.run(query)

  warn_deprecated(


'Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party and played a significant role in the rise of Fascism in Italy. Mussolini was known for his authoritarian rule and alliance with Adolf Hitler during World War II.'

We can also include the sources of information that the LLM is using to answer our question. We can do this using a slightly different version of `RetrievalQA` called `RetrievalQAWithSourcesChain`:

In [28]:
from langchain.chains import RetrievalQAWithSourcesChain

qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)

In [30]:

qa_with_sources(query)

  warn_deprecated(


{'question': 'who was Benito Mussolini?',
 'answer': 'Benito Mussolini was an Italian politician and journalist who served as the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party. Mussolini was known for his form of Fascism, "Italian Fascism," which was different from Hitler\'s Nazism. He wanted Italy to become a new Roman Empire and was involved in various military actions, including the occupation of Abyssinia (Ethiopia) and Albania. Mussolini was eventually deposed, captured, and executed by partisans during World War II.\n',
 'sources': 'https://simple.wikipedia.org/wiki/Benito%20Mussolini'}

In [32]:
qa_with_sources(queryPL)

{'question': 'kim był mussolinii?',
 'answer': 'Benito Mussolini was an Italian politician and journalist who was the Prime Minister of Italy from 1922 until 1943. He was the leader of the National Fascist Party. After his death, several Neo-Fascist movements have had success in Italy, the most important being the Movimento Sociale Italiano. His granddaughter Alessandra Mussolini has outspoken views similar to Fascism.\n',
 'sources': 'https://simple.wikipedia.org/wiki/Benito%20Mussolini'}

In [33]:
query2PL = "czy mussolini dopuścił się zbrodni wojennych?"
qa_with_sources(query2PL)

{'question': 'czy mussolini dopuścił się zbrodni wojennych?',
 'answer': "Józef Piłsudski was a Polish military leader and politician who played a significant role in Poland's history. There is no evidence to suggest that he committed war crimes. \n",
 'sources': 'https://simple.wikipedia.org/wiki/J%C3%B3zef%20Pi%C5%82sudski'}

Now we answer the question being asked, and return the source of this information being used by the LLM.

In [34]:
query2aPL="Czy Benito Mussolini dopuścił się zbrodni wojennych?"
qa_with_sources(query2aPL)

{'question': 'Czy Benito Mussolini dopuścił się zbrodni wojennych?',
 'answer': 'Nie wiadomo, czy Benito Mussolini dopuścił się zbrodni wojennych.\nŹRÓDŁA:',
 'sources': ''}

Delete the index to save space, but this will waste 210K / 2M WUs (Write Units) of free Pinecone plan if you need this vector again.

```pc.delete_index(index_name)```