In [None]:
import os

# Credentials are read from the environment so that real keys never have to be
# saved inside the notebook. The original placeholder strings are kept as
# fallbacks, so the three names are always defined as strings.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY_HERE")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "PINECONE_API_KEY_HERE")
PINECONE_ENV = os.environ.get("PINECONE_ENV", "PINECONE_ENV_HERE")

### Text Splitter: Split the text data into smaller chunks

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
# This is a long document we can split up. It is read into memory as a single string.
# The encoding is passed explicitly because the essay contains non-ASCII
# punctuation (e.g. an em dash); relying on the platform default codec can
# fail or garble those characters on some systems.
with open('data/guidetoinvestors.txt', encoding='utf-8') as f:
    pg_work = f.read()

# The whole file is treated as one document, so this always reports 1.
print (f"You have {len([pg_work])} document")

You have 1 document


In [5]:
import pandas as pd
import tiktoken

# Tokenizer used for the token counts in this notebook. Only one assignment is
# kept: the previous get_encoding("cl100k_base") result was overwritten on the
# very next line and never used, so encoding_for_model("gpt-4") is the value
# that was actually in effect.
tokenizer = tiktoken.encoding_for_model("gpt-4")

# One-row frame holding the unsplit document and its total token count.
demo_df = pd.DataFrame(
    {
        'text': [pg_work],
        'n_tokens': [len(tokenizer.encode(pg_work))]
    }
)

demo_df.head(1)

Unnamed: 0,text,n_tokens
0,April 2007(This essay is derived from a keynot...,8151


In [6]:
# Build the splitter with a chunk size of 2000 and no overlap between
# neighbouring chunks, then cut the essay into a list of chunk strings.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)
texts = text_splitter.split_text(pg_work)

# Preview the first five chunks.
texts[0:5]

["April 2007(This essay is derived from a keynote talk at the 2007 ASES Summit\nat Stanford.)The world of investors is a foreign one to most hackers—partly\nbecause investors are so unlike hackers, and partly because they\ntend to operate in secret.  I've been dealing with this world for\nmany years, both as a founder and an investor, and I still don't\nfully understand it.In this essay I'm going to list some of the more surprising things\nI've learned about investors.  Some I only learned in the past year.Teaching hackers how to deal with investors is probably the second\nmost important thing we do at Y Combinator.  The most important\nthing for a startup is to make something good.  But everyone knows\nthat's important.  The dangerous thing about investors is that\nhackers don't know how little they know about this strange world.1. The investors are what make a startup hub.About a year ago I tried to figure out what you'd need to reproduce\nSilicon Valley.  I decided the \ncritical in

In [7]:
# One row per chunk: the chunk text alongside its token count as measured
# by `tokenizer`.
demo_df_splitted = pd.DataFrame(
    data=dict(
        text=texts,
        n_tokens=list(map(lambda chunk: len(tokenizer.encode(chunk)), texts)),
    )
)

demo_df_splitted.head()

Unnamed: 0,text,n_tokens
0,April 2007(This essay is derived from a keynot...,443
1,companies that raise series A rounds have take...,453
2,and (c) they invest at a point where the strea...,473
3,at something and predict whether it will take ...,467
4,"companies, most of which fail, and one of whic...",463


In [8]:
# Report how many chunks the splitter produced (each chunk is counted as a "document").
print (f"You have {len(texts)} documents")

You have 20 documents


### Create Pinecone Index & Embedding Demo

In [9]:
# Name of the Pinecone index that this notebook creates (if missing) and queries.
PINECONE_INDEX = "demo-index"

In [10]:
import openai
import pinecone
from langchain.document_loaders import TextLoader

# Connect the Pinecone client using the configured API key and environment.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

# Create the index only when it does not exist yet, so re-running this cell is safe.
# dimension=1536 must match the length of the vectors stored in the index
# (presumably the text-embedding-ada-002 vectors used in this notebook - confirm).
if PINECONE_INDEX not in pinecone.list_indexes():
    pinecone.create_index(PINECONE_INDEX, dimension=1536, metric="cosine")

# Open the same index that was just checked/created, via the shared constant
# instead of a repeated string literal, so the two can never drift apart.
index = pinecone.Index(PINECONE_INDEX)
index.describe_index_stats()

  from tqdm.autonotebook import tqdm


{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 20}},
 'total_vector_count': 20}

In [12]:
def _embed_chunk(chunk):
    """Return the embedding for one text chunk.

    Sends `chunk` to `openai.Embedding.create` with the
    'text-embedding-ada-002' engine and returns the 'embedding' field of the
    first item in the response's 'data' list.
    """
    response = openai.Embedding.create(input=chunk, engine='text-embedding-ada-002')
    return response['data'][0]['embedding']


openai.api_key = OPENAI_API_KEY

# One request per row; the results are stored in a new 'embeddings' column.
demo_df_splitted['embeddings'] = demo_df_splitted['text'].apply(_embed_chunk)

In [13]:
# Preview the first rows, now including the 'embeddings' column.
demo_df_splitted.head()

Unnamed: 0,text,n_tokens,embeddings
0,April 2007(This essay is derived from a keynot...,443,"[0.028812682256102562, -0.020147008821368217, ..."
1,companies that raise series A rounds have take...,453,"[0.01580376923084259, -0.0337928831577301, 0.0..."
2,and (c) they invest at a point where the strea...,473,"[0.006701738107949495, -0.030748752877116203, ..."
3,at something and predict whether it will take ...,467,"[-0.010730039328336716, -0.01844008080661297, ..."
4,"companies, most of which fail, and one of whic...",463,"[0.008658324368298054, -0.023522228002548218, ..."


In [14]:
from uuid import uuid4

# Derive a new frame from demo_df_splitted (which is left untouched): add a
# random UUID string per row as 'id', then put the columns in a fixed order.
df = (
    demo_df_splitted
    .assign(id=[str(uuid4()) for _ in demo_df_splitted.index])
    .loc[:, ['id', 'text', 'embeddings', 'n_tokens']]
)
df.head()

Unnamed: 0,id,text,embeddings,n_tokens
0,5508c41e-b90e-4622-8348-e8127da98fef,April 2007(This essay is derived from a keynot...,"[0.028812682256102562, -0.020147008821368217, ...",443
1,f4941ada-e704-4286-9998-4e1dd900bbb7,companies that raise series A rounds have take...,"[0.01580376923084259, -0.0337928831577301, 0.0...",453
2,6ab04a7b-26bb-4a6d-9157-6435ca029470,and (c) they invest at a point where the strea...,"[0.006701738107949495, -0.030748752877116203, ...",473
3,7596d022-971f-43a9-89e8-fc80b5cc24a9,at something and predict whether it will take ...,"[-0.010730039328336716, -0.01844008080661297, ...",467
4,f1284066-5223-4ab4-8081-19a8d691fb04,"companies, most of which fail, and one of whic...","[0.008658324368298054, -0.023522228002548218, ...",463


### Retriever: Create embeddings of documents for semantic search

In [15]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding client handed to the vector store.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Build the LangChain Pinecone vector store from the chunk texts, targeting the
# index named by PINECONE_INDEX (the same constant used when the index was created)
# rather than a repeated string literal.
# NOTE(review): only the raw texts are passed here, so the precomputed
# df['embeddings'] and df['id'] columns are not used by this call; from_texts
# presumably embeds and uploads the texts itself, meaning re-runs may add
# duplicate vectors - confirm against the LangChain documentation.
retriever = Pinecone.from_texts(df.text.tolist(), embeddings, index_name=PINECONE_INDEX)

In [18]:
# The question to answer; it is used both for retrieval here and for the QA chain later.
query = "What is Silicon Valley?"

# Fetch the stored chunks most similar to the question.
docs = retriever.similarity_search(query=query)

In [19]:
# Show the text of the first document returned by the similarity search.
docs[0].page_content

"April 2007(This essay is derived from a keynote talk at the 2007 ASES Summit\nat Stanford.)The world of investors is a foreign one to most hackers—partly\nbecause investors are so unlike hackers, and partly because they\ntend to operate in secret.  I've been dealing with this world for\nmany years, both as a founder and an investor, and I still don't\nfully understand it.In this essay I'm going to list some of the more surprising things\nI've learned about investors.  Some I only learned in the past year.Teaching hackers how to deal with investors is probably the second\nmost important thing we do at Y Combinator.  The most important\nthing for a startup is to make something good.  But everyone knows\nthat's important.  The dangerous thing about investors is that\nhackers don't know how little they know about this strange world.1. The investors are what make a startup hub.About a year ago I tried to figure out what you'd need to reproduce\nSilicon Valley.  I decided the \ncritical ing

In [23]:
# Display every document returned by the similarity search.
docs

[Document(page_content="April 2007(This essay is derived from a keynote talk at the 2007 ASES Summit\nat Stanford.)The world of investors is a foreign one to most hackers—partly\nbecause investors are so unlike hackers, and partly because they\ntend to operate in secret.  I've been dealing with this world for\nmany years, both as a founder and an investor, and I still don't\nfully understand it.In this essay I'm going to list some of the more surprising things\nI've learned about investors.  Some I only learned in the past year.Teaching hackers how to deal with investors is probably the second\nmost important thing we do at Y Combinator.  The most important\nthing for a startup is to make something good.  But everyone knows\nthat's important.  The dangerous thing about investors is that\nhackers don't know how little they know about this strange world.1. The investors are what make a startup hub.About a year ago I tried to figure out what you'd need to reproduce\nSilicon Valley.  I dec

### Query the retrieved data to get answers back

In [20]:
from langchain.llms import OpenAI
from langchain.chains.question_answering import load_qa_chain

In [21]:
# Completion model used to write the answer; temperature is set to 0.
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0,
)

# Question-answering chain of type "stuff" built on top of that model.
chain = load_qa_chain(llm=llm, chain_type="stuff")

In [22]:
# Run the QA chain on the retrieved documents and the original question;
# the returned answer string is displayed as the cell output.
chain.run(input_documents=docs, question=query)

' Silicon Valley is a startup hub in the Bay Area of California, known for its high concentration of technology companies and venture capital firms.'

In [None]:
# NOTE(review): repeats the earlier `docs[0].page_content` cell and has not been
# executed (no execution count) - likely a leftover that can be removed.
docs[0].page_content