# Import necessary packages

In [78]:
import warnings
warnings.filterwarnings('ignore')
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from DLAIUtils import Utils

import functions as func
import ast
import os
import pandas as pd
import json
import numpy as np
import copy
import tqdm

# Set up Pinecone

In [2]:
# Read the Pinecone API key through the DLAIUtils helper instead of embedding it in the notebook
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [123]:
pinecone = Pinecone(api_key=PINECONE_API_KEY)

utils = Utils()
INDEX_NAME = utils.create_dlai_index_name('vc')

# Create the index only when it does not exist yet; a handle is bound either way.
existing_index_names = [existing.name for existing in pinecone.list_indexes()]
if INDEX_NAME not in existing_index_names:
    # dimension must equal the length of the vectors upserted later
    # (presumably the output size of the embedding model used below — confirm).
    pinecone.create_index(
        name=INDEX_NAME,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )

index = pinecone.Index(INDEX_NAME)

# Gather VC's info

In [4]:
# Build the single OpenAI client that the completion and embedding helpers in this notebook call
OPENAI_API_KEY = utils.get_openai_api_key()
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [16]:
def transform_link():
    """Ask the completions endpoint to rewrite a fixed list of VC domains as full URLs.

    The domain list is embedded in the prompt; the function takes no arguments.

    Returns:
        The raw completion response from ``openai_client.completions.create``.
        The generated text is read from ``response.choices[0].text`` by the caller.
    """
    # The inner indentation and trailing whitespace are part of the prompt sent to
    # the model, so the literal is kept exactly as written.
    link_prompt = """Request:
        Convert links into standard form e.g. https://www.accel.com/. Separate them with spaces.
        Links: www.accel.com www.a16z.com www.greylock.com www.benchmark.com www.sequoiacap.com www.indexventures.com www.kpcb.com www.lsvp.com www.matrixpartners.com www.500.co www.sparkcapital.com www.insightpartners.com 
        Standardtized links:
        """
    request_options = dict(
        model="gpt-3.5-turbo-instruct",
        prompt=link_prompt,
        temperature=0,  # deterministic sampling
        max_tokens=636,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return openai_client.completions.create(**request_options)

In [17]:
# Run the link-normalisation request once and keep the raw completion response
a = transform_link()

In [23]:
# Take the generated text from the first completion choice and display it.
# (The bare `a.choices[0].text` expression that used to precede this had no effect.)
text = a.choices[0].text
text

' https://www.accel.com/ https://www.a16z.com/ https://www.greylock.com/ https://www.benchmark.com/ https://www.sequoiacap.com/ https://www.indexventures.com/ https://www.kpcb.com/ https://www.lsvp.com/ https://www.matrixpartners.com/ https://www.500.co/ https://www.sparkcapital.com/ https://www.insightpartners.com/'

In [24]:
# Split on any run of whitespace. This yields the same list as the old
# `split(' ')[1:]` when the completion starts with exactly one space, but does not
# drop the first link if that space is missing, nor leave stray newlines or empty
# strings in the list.
VC_list = text.split()

In [26]:
# Extra VC sites added by hand, already written as full https URLs with a trailing slash
other_VCs = ['https://www.nea.com/',
             'https://dragoneer.com/',
             'https://deerfield.com/',
             'https://www.khoslaventures.com/',
             'https://www.industryventures.com/']

In [27]:
# Append only the links that are not already present, so re-running this cell
# does not duplicate entries (duplicates would be scraped and stored twice later on).
VC_list = VC_list + [link for link in other_VCs if link not in VC_list]

In [28]:
# Display the combined list of VC links
VC_list

['https://www.accel.com/',
 'https://www.a16z.com/',
 'https://www.greylock.com/',
 'https://www.benchmark.com/',
 'https://www.sequoiacap.com/',
 'https://www.indexventures.com/',
 'https://www.kpcb.com/',
 'https://www.lsvp.com/',
 'https://www.matrixpartners.com/',
 'https://www.500.co/',
 'https://www.sparkcapital.com/',
 'https://www.insightpartners.com/',
 'https://www.nea.com/',
 'https://dragoneer.com/',
 'https://deerfield.com/',
 'https://www.khoslaventures.com/',
 'https://www.industryventures.com/']

In [30]:
def find_info(text):
    """Extract structured facts about a VC firm from free text.

    The model is forced to call the ``get_vc_info`` tool, so the answer comes back
    as a JSON argument string rather than prose.

    Args:
        text: Text describing the VC firm; sent verbatim as the user message.

    Returns:
        The chat completion response from ``openai_client``. The JSON string is at
        ``response.choices[0].message.tool_calls[0].function.arguments``.
    """
    tool_name = "get_vc_info"

    # The schema is defined once. `tool_choice` only needs the function name, so it
    # no longer carries a second (and slightly different) copy of the schema.
    vc_info_tool = {
        "type": "function",
        "function": {
            "name": tool_name,
            "description": "Extracts detailed information about a Venture Capital firm from given text. Fill only if values are present.",
            "parameters": {
                "type": "object",
                "properties": {
                    "name": {
                        "type": "string",
                        "description": "Name of the Venture Capital firm, e.g., Greylock Partners.",
                    },
                    "contacts": {
                        "type": "array",
                        "items": {"type": "string"},
                        # "FROMAT" typo fixed to match the wording of the duplicated schema
                        "description": "Contact information, e.g., phone : +48540234, email: myemail@mail.com, LinkedIn. : linkedin.com/username. KEEP THE FORMAT FROM EXAMPLE",
                    },
                    "investment_industries": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of industries that the company invests in.",
                    },
                    "investment_rounds": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of investment rounds that they participate/lead.",
                    },
                },
            },
        },
    }

    messages = [
        {"role": "system", "content": "Find information about VC. Find VC name, contacts, industries that they invest in, investment rounds that they participate/lead. Don't make assumptions."},
        {"role": "user", "content": text},
    ]

    chat_response = openai_client.chat.completions.create(
        model="gpt-4-turbo",  # use gpt4 because this is one time use per company, and quality is over the price
        messages=messages,
        tools=[vc_info_tool],
        tool_choice={"type": "function", "function": {"name": tool_name}},
    )
    return chat_response

In [43]:
# Try the pipeline on the first VC only; the second value returned by get_text_links is discarded
VC_text, _ = func.get_text_links(VC_list[0])

In [31]:
# Run the structured extraction on the sample VC text
info_resp = find_info(VC_text)

In [32]:
# Inspect the raw JSON argument string of the first tool call
info_resp.choices[0].message.tool_calls[0].function.arguments

'{"name":"Accel","investment_rounds":["Series A","Series B","Series C","Series D","Series E","IPO","Initial Investment","Follow-on","Acquired"],"investment_industries":["AI enabled digital transformation","AI-powered document processing","Non-human identity security","No-code computer vision","Enterprise digital intelligence","Third-Party API Security"]}'

In [35]:
# Parse the tool-call argument string into a Python dict for display
json.loads(info_resp.choices[0].message.tool_calls[0].function.arguments)

{'name': 'Accel',
 'investment_rounds': ['Series A',
  'Series B',
  'Series C',
  'Series D',
  'Series E',
  'IPO',
  'Initial Investment',
  'Follow-on',
  'Acquired'],
 'investment_industries': ['AI enabled digital transformation',
  'AI-powered document processing',
  'Non-human identity security',
  'No-code computer vision',
  'Enterprise digital intelligence',
  'Third-Party API Security']}

### Let's check how each model performs at creating a text description, and compare the similarity of the results

In [42]:
def write_description(text, model, max_tokens=4096):
    """Generate a prose description of a VC firm from text with the given chat model.

    Args:
        text: Source text about the VC firm; sent as the user message.
        model: Name of the chat model to use (e.g. ``'gpt-4-turbo'``).
        max_tokens: Upper bound on generated tokens. The previous hard-coded 8000
            is above the completion limit that OpenAI's model documentation lists
            for the models called in this notebook, so the default is 4096;
            pass a different value for models with another limit.

    Returns:
        The chat completion response from ``openai_client``. The description is at
        ``response.choices[0].message.content``.
    """
    messages = [
        {"role": "system", "content": "Write a description of VC company based on given text"},
        {"role": "user", "content": text},
    ]
    chat_response = openai_client.chat.completions.create(
        model=model,
        messages=messages,  # the missing comma here used to make the cell a SyntaxError
        max_tokens=max_tokens,
    )
    return chat_response

In [45]:
# Describe the same VC text with both models so their outputs can be compared
gpt4_desc = write_description(VC_text, 'gpt-4-turbo')
gpt35_desc = write_description(VC_text, 'gpt-3.5-turbo')

In [53]:
# Keep only the generated text of the gpt-4-turbo response
gpt4_txt = gpt4_desc.choices[0].message.content

In [54]:
# Keep only the generated text of the gpt-3.5-turbo response
gpt35_txt = gpt35_desc.choices[0].message.content

In [49]:
def get_embeddings(articles, model="text-embedding-3-small"):
    """Request embeddings for ``articles`` from the OpenAI embeddings endpoint.

    Args:
        articles: Input passed unchanged as the ``input`` argument of the request.
        model: Name of the embedding model to use.

    Returns:
        The raw embeddings response. Callers in this notebook read the vector from
        ``response.data[0].embedding``.
    """
    embedding_response = openai_client.embeddings.create(model=model, input=articles)
    return embedding_response

In [51]:
def cosine_similarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors.

    Args:
        vec1: First vector (any sequence accepted by ``np.dot``).
        vec2: Second vector of the same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms, or ``0.0``
        when either vector has zero norm (the ratio is undefined there and would
        otherwise be ``nan``).
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # Guard against 0/0: warnings are suppressed in this notebook, so a nan would
    # otherwise slip through silently.
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product

Cosine similarity: 0.9746318461970762


In [55]:
# Embed both descriptions with the default embedding model
gpt4_embed = get_embeddings(gpt4_txt)
gpt35_embed = get_embeddings(gpt35_txt)

In [59]:
# Compare the two descriptions through the cosine similarity of their embedding vectors
cosine_similarity(gpt4_embed.data[0].embedding,gpt35_embed.data[0].embedding)

0.8824363652298336

As can be seen, the cosine similarity between the embeddings of the descriptions created by gpt-3.5 and gpt-4 is quite high. However, gpt-4 is time-consuming. Here gpt-4 is used for higher-quality text. When doing a search, if the company is unknown, the gpt-3.5 description is used for time efficiency, while a gpt-4 description is generated in the background, embedded, and stored in the DB and in the VC records JSON.

<h4>Now create a JSON object in which the information gathered from each VC website is stored. This is done for a few reasons:</h4>
<p>1. To avoid generating the info each time</p>
<p>2. To be able to find the information based on the link, because Pinecone does not provide the ability to fetch by metadata (only to filter, which doesn't satisfy this task)</p>

In [69]:
# Accumulator for the per-VC records; reset it before re-running the loop that fills it
VC_record = []

In [70]:
# Build one record per VC link: scrape the site text, describe it with gpt-4-turbo,
# and extract structured info.
# The import cell ends with `import tqdm`, which rebinds the name `tqdm` to the
# module, so the progress-bar class has to be reached as `tqdm.tqdm`
# (calling the module directly raises TypeError on a fresh kernel).
for i, VC_link in enumerate(tqdm.tqdm(VC_list, desc="Processing VC links")):
    VC_text, _ = func.get_text_links(VC_link)
    VC_desc = write_description(VC_text, "gpt-4-turbo").choices[0].message.content
    # NOTE(review): this uses `func.find_info`, while the sample run above used the
    # `find_info` defined in this notebook — confirm which one is intended.
    VC_info = func.find_info(VC_text)
    VC_info = json.loads(VC_info.choices[0].message.tool_calls[0].function.arguments)
    # ids are 1-based positions in VC_list
    VC = {"id": i+1, "link": VC_link, "description": VC_desc, "info": VC_info}
    VC_record.append(VC)

Processing VC links: 100%|██████████| 17/17 [05:03<00:00, 17.83s/it]


In [86]:
# Persist the gathered records so they do not have to be regenerated
with open("VC_record.json", mode="w") as record_file:
    json.dump(VC_record, fp=record_file, indent=4)

# JSON file successfully filled with initial data. Any future request for a VC that is not yet known will also be stored in the JSON file

# Create description embeddings and store into Pinecone index

In [102]:
# Placeholder; the name is rebound by the cells that follow
embeddings = []

In [106]:
# One embeddings request per record; at this point `embeddings` holds the raw API responses
embeddings = [get_embeddings(i['description']) for i in VC_record]

In [112]:
# Replace each response with its first embedding vector.
# NOTE(review): the same name is rebound from responses to vectors, so this cell is
# presumably not safe to run twice in a row without re-running the previous one.
embeddings = [i.data[0].embedding for i in embeddings]

In [118]:
# Placeholder for the upsert payload; the next cell rebuilds the list from scratch
data = []

In [135]:
# Assemble one Pinecone vector per record: the record id as a string, the description
# embedding as values, and the extracted info dict as metadata.
# The progress bar covers only this assembly step; the upsert is a single call.
data = [
    {
        'id': str(VC_record[position]['id']),  # Convert ID to string
        'values': vector,
        'metadata': VC_record[position]['info'],
    }
    for position, vector in enumerate(tqdm.tqdm(embeddings, desc="Upserting embeddings"))
]
index.upsert(data)

Upserting embeddings: 100%|██████████| 17/17 [00:00<?, ?it/s]


{'upserted_count': 17}