In [12]:
import os

# Take the OpenAI key from the environment so a real secret never has to be
# saved in the notebook. The original placeholder string is kept as the
# fallback when the variable is unset or empty.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "OPENAI-API-KEY"

In [13]:
import os

# Pinecone connection settings. Each value is read from an environment
# variable of the same name so credentials are not committed with the
# notebook; the original placeholder strings remain as fallbacks when a
# variable is unset or empty.
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY") or "PINECONE-API-KEY"
PINECONE_ENV = os.environ.get("PINECONE_ENV") or "PINECONE-ENV"
PINECONE_INDEX = os.environ.get("PINECONE_INDEX") or "PINECONE-INDEX"

In [14]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [15]:
# This is a long document we can split up.
# The encoding is pinned to UTF-8 so the text decodes the same way on every
# platform instead of depending on the machine's locale default.
with open('data/guidetoinvestors.txt', encoding='utf-8') as f:
    pg_work = f.read()

# The whole file is held as one string; wrapping it in a list reports it as a
# single document.
print (f"You have {len([pg_work])} document")

You have 1 document


In [16]:
import pandas as pd
import tiktoken

# Tokenizer used only for counting tokens. The earlier
# tiktoken.get_encoding("cl100k_base") assignment was overwritten on the very
# next line before being used, so only the model-based lookup is kept.
tokenizer = tiktoken.encoding_for_model("gpt-4")

# One-row frame describing the full, unsplit document and its token count.
demo_df = pd.DataFrame(
    {
        'text': [pg_work],
        'n_tokens': [len(tokenizer.encode(pg_work))]
    }
)

# Split the document into non-overlapping chunks.
# NOTE(review): chunk_size is presumably measured in characters (the splitter's
# default length function), not tokens -- confirm against the installed version.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_text(pg_work)

# One row per chunk, with the token count of each chunk alongside its text.
demo_df_splitted = pd.DataFrame(
    {
        'text': texts,
        'n_tokens': [len(tokenizer.encode(text)) for text in texts]
    }
)

demo_df_splitted.head()

Unnamed: 0,text,n_tokens
0,April 2007(This essay is derived from a keynot...,443
1,companies that raise series A rounds have take...,453
2,and (c) they invest at a point where the strea...,473
3,at something and predict whether it will take ...,467
4,"companies, most of which fail, and one of whic...",463


In [17]:
import openai
import pinecone
from langchain.document_loaders import TextLoader

pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

# Create the index only if it does not exist yet, so the cell can be re-run.
# NOTE(review): dimension=1536 is expected to match the vector length returned
# by text-embedding-ada-002 -- confirm if the embedding model is changed.
if PINECONE_INDEX not in pinecone.list_indexes():
    pinecone.create_index(PINECONE_INDEX, dimension=1536, metric="cosine")

# Connect to the same index that was just checked/created. The previous
# hardcoded "demo-index" could point at a different (or missing) index than
# the one named by PINECONE_INDEX.
index = pinecone.Index(PINECONE_INDEX)

openai.api_key = OPENAI_API_KEY

# Embed every chunk with one API request per row and store the resulting
# vector in a new 'embeddings' column on demo_df_splitted.
demo_df_splitted['embeddings'] = demo_df_splitted.text.apply(
    lambda x: openai.Embedding.create(
        input=x, engine='text-embedding-ada-002'
    )['data'][0]['embedding']
)

  from tqdm.autonotebook import tqdm


In [18]:
from uuid import uuid4

# Build a fresh frame from the embedded chunks: attach one random UUID string
# per row as 'id', then select the columns in the order id, text, embeddings,
# n_tokens. demo_df_splitted itself is left unmodified.
df = demo_df_splitted.assign(
    id=[str(uuid4()) for _ in demo_df_splitted.index]
)[['id', 'text', 'embeddings', 'n_tokens']]

df.head()

Unnamed: 0,id,text,embeddings,n_tokens
0,56f3f3cc-db56-46da-bfeb-e53556b64187,April 2007(This essay is derived from a keynot...,"[0.028928915038704872, -0.020220162346959114, ...",443
1,0082afa0-c5d2-44a3-bf90-163e78866604,companies that raise series A rounds have take...,"[0.01580376923084259, -0.0337928831577301, 0.0...",453
2,46c00591-d2b2-423b-88fa-58f03f8014c0,and (c) they invest at a point where the strea...,"[0.006701738107949495, -0.030748752877116203, ...",473
3,be263646-990c-4d15-be59-ea021961bbaf,at something and predict whether it will take ...,"[-0.010839071124792099, -0.018601154908537865,...",467
4,ee98b173-7200-4bd7-b9a7-f9a4440fe348,"companies, most of which fail, and one of whic...","[0.008925837464630604, -0.023426974192261696, ...",463


In [19]:
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Load the chunk texts into the configured Pinecone index. The index name now
# comes from PINECONE_INDEX instead of a hardcoded "demo-index", so this cell
# targets the same index the notebook is configured for.
# NOTE(review): from_texts is given the raw texts plus an embedding object, so
# it presumably embeds them itself rather than reusing df['embeddings'] --
# confirm against the installed langchain version.
retriever = Pinecone.from_texts(df.text.tolist(), embeddings, index_name=PINECONE_INDEX)

In [20]:
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.agents import AgentType, initialize_agent

# Terms for which a description is generated in the next cell.
keywords = [
    'angel investors',
    'startup hub',
    'Silicon Valley',
    'venture capitalists',
    'Google',
    'momentum investors',
    'stock',
]

# Candidate model names; the first entry is the one actually used below.
OPENAI_MODELS = ["gpt-4", "gpt-3.5-turbo"]

# temperature=0 requests the least random completions.
# NOTE(review): "gpt-4" is a chat model; consider whether a chat-model wrapper
# is the better fit for the installed langchain version -- TODO confirm.
llm = OpenAI(
    model_name=OPENAI_MODELS[0],
    temperature=0,
    openai_api_key=OPENAI_API_KEY,
)



In [21]:
# CHAIN1: Generating query for generating a detailed description for the keyword.
# The prompt and the chain do not depend on the keyword, so they are built once
# here instead of being re-created on every loop iteration.
# The template text, including its leading whitespace, is kept exactly as it
# was so the prompt sent to the model is unchanged.
CHAIN1_PROMPT = PromptTemplate(
    input_variables=["keyword", "document", "description"],
    template="""
            As a helpful knowledge provider,
            Refer to the given document: {document},
            generate a detailed description for the keyword: {keyword},
            and add the details to the description: {description} if needed.
            If context overlaps with the description, skip.
        """
)

description_chain = LLMChain(llm=llm, prompt=CHAIN1_PROMPT)

for keyword in keywords:

    # Running description for this keyword; refined once per retrieved document.
    description = ""

    # TODO: work on the retriever part
    docs = retriever.similarity_search(keyword)

    # De-duplicate the retrieved texts while keeping retrieval order. A plain
    # set() iterates strings in an order that can change between kernel runs,
    # and because each step feeds the previous description back in, that made
    # the final description non-reproducible. The name docs_set is kept for
    # any later cell that reads it; it is now an ordered list.
    docs_set = list(dict.fromkeys(doc.page_content for doc in docs))

    # Fold each document into the description: the output of one run becomes
    # the {description} input of the next.
    for doc in docs_set:
        description = description_chain.run(
            {
                "keyword": keyword,
                "document": doc,
                "description": description
            }
        )

    print("-----")
    print(f"\nGenerated description for {keyword}: \n{description}")

-----

Generated description for angel investors: 
Angel investors are high-net-worth individuals who provide financial support to early-stage startups and entrepreneurs in exchange for ownership equity or convertible debt. They play a crucial role in the growth and development of startups, as they often invest in companies at a stage when traditional venture capital firms may not be willing to take the risk. These investors typically have a strong background in entrepreneurship or business, and they use their personal wealth, experience, and network to help startups succeed.

Angel investors are considered the most critical component in creating a startup hub, as they provide the initial funding and support needed for startups to grow and attract further investment from venture capital firms. They are often more willing to take risks on innovative ideas and unproven business models, which can lead to the creation of groundbreaking technologies and successful companies.

In addition to