In [1]:
OPENAI_API_KEY = "OPENAI_API_KEY"

PINECONE_API_KEY = "PINECONE_API_KEY"
PINECONE_ENV = "asia-southeast1-gcp-free"
PINECONE_INDEX = "guidetoinvestors"

DOCUMENT = "data/guidetoinvestors.txt"

In [2]:
import openai
import pinecone
import tiktoken
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings

# load document
with open(DOCUMENT) as f: pg_work = f.read()

# tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")
tokenizer = tiktoken.encoding_for_model("gpt-4")

# split text
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
texts = text_splitter.split_text(pg_work)

# initialize Pinecone
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENV)

if PINECONE_INDEX not in pinecone.list_indexes():
    pinecone.create_index(PINECONE_INDEX, dimension=1536, metric="cosine")

index = pinecone.Index(PINECONE_INDEX)

# initialize OpenAI Embeddings
openai.api_key = OPENAI_API_KEY
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model="text-embedding-ada-002")

# initialize Pinecone retriever
retriever = Pinecone.from_texts(texts, embeddings, index_name=PINECONE_INDEX)

  from tqdm.autonotebook import tqdm


In [3]:
from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

keywords = ['angel investors', 'startup hub', 'Silicon Valley',
    'venture capitalists', 'Google', 'momentum investors', 'stock']

OPENAI_MODELS = ["gpt-4", "gpt-3.5-turbo"]
llm = OpenAI(temperature=0, model_name=OPENAI_MODELS[0], openai_api_key=OPENAI_API_KEY)



In [4]:
keyword_description_dict = {}

for keyword in keywords:

    description = ""

    docs = retriever.similarity_search(keyword)
    docs_set = set([doc.page_content for doc in docs])

    # CHAIN1: Genrating query for generating a detailed description for the keyword
    CHAIN1_PROMPT = PromptTemplate(
        input_variables=["keyword", "document", "description"],
        template="""
            As a helpful knowledge provider,
            Refer to the given document: {document},
            generate a detailed description for the keyword: {keyword},
            and add the details to the description: {description} if needed.
            If context overlaps with the description, skip.
        """
    )

    description_chain = LLMChain(llm=llm, prompt=CHAIN1_PROMPT)

    for doc in docs_set:
        description = description_chain.run(
            {
                "keyword": keyword,
                "document": doc,
                "description": description
            }
        )
    
    keyword_description_dict[keyword] = description

In [7]:
for key in keyword_description_dict.keys():
    description = keyword_description_dict[key]
    description = description.replace("\n\n", "")

    keyword_description_dict[key] = description

In [8]:
import json
print(json.dumps(keyword_description_dict, indent=4))

{
    "angel investors": "Angel investors are high-net-worth individuals who provide financial support to early-stage startups and entrepreneurs in exchange for ownership equity or convertible debt. They play a crucial role in the growth and development of startups, as they often invest in companies at a stage when traditional venture capital firms may not be willing to take the risk. These investors typically have a strong background in entrepreneurship or business, and they use their personal wealth, experience, and network to help startups succeed.Angel investors are considered the most critical component in creating a startup hub, as they provide the initial funding and support needed for startups to grow and attract further investment. They are often more willing to take risks on innovative ideas and unproven business models, which can lead to the creation of groundbreaking technologies and successful companies. In addition to financial support, angel investors often provide mento