In [6]:
import os
import pinecone

import custom_loaders

In [8]:
# complete rewrite of langchain git loader using parallelism
loader = custom_loaders.TurboGitLoader(
    clone_url="https://github.com/hwchase17/langchain",
    repo_path="./example_data/TurboGitLoader/",
    branch="master",
    file_filter=lambda file_path: file_path.endswith(".py")
)
data = loader.load()
print(f"Length of data from dataloader: {len(data)}")

Length of data from dataloader: 937


In [18]:
dict(data[0])

{'page_content': '"""Prompt for the router chain in the multi-prompt chain."""\r\n\r\nMULTI_PROMPT_ROUTER_TEMPLATE = """\\\r\nGiven a raw text input to a language model select the model prompt best suited for \\\r\nthe input. You will be given the names of the available prompts and a description of \\\r\nwhat the prompt is best suited for. You may also revise the original input if you \\\r\nthink that revising it will ultimately lead to a better response from the language \\\r\nmodel.\r\n\r\n<< FORMATTING >>\r\nReturn a markdown code snippet with a JSON object formatted to look like:\r\n```json\r\n{{{{\r\n    "destination": string \\\\ name of the prompt to use or "DEFAULT"\r\n    "next_inputs": string \\\\ a potentially modified version of the original input\r\n}}}}\r\n```\r\n\r\nREMEMBER: "destination" MUST be one of the candidate prompt names specified below OR \\\r\nit can be "DEFAULT" if the input is not well suited for any of the candidate prompts.\r\nREMEMBER: "next_inputs" can 

In [20]:
import tiktoken

tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [21]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""]
)

In [44]:
from langchain.embeddings.openai import OpenAIEmbeddings
from getpass import getpass

model_name = 'text-embedding-ada-002'

embed = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=getpass('OPENAI_API_KEY='),
)

In [54]:
data[2].page_content

'"""An agent designed to hold a conversation in addition to using tools."""\r\n'

In [55]:
embed_resp = embed.embed_documents(data[2].page_content)
len(embed_resp), len(embed_resp[0])

(76, 1536)

In [68]:
import pinecone

index_name = 'testindex'
pinecone.init(
    api_key=getpass('PINECONE_API_KEY'), 
    environment="us-west1-gcp-free"
)

if index_name in pinecone.list_indexes():
    print(f"Index {index_name} already exists...")
else:
    pinecone.create_index(
        name=index_name,
        metric='dotproduct',
        dimension=len(embed_resp[0])
)

Index testindex already exists...


In [70]:
index = pinecone.Index(index_name)
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 100

texts = []
metadatas = []

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        'file_name': record.metadata['file_name'],
        'file_path': record.metadata['file_path']
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record['text'])
    # create individual metadata dicts for each chunk
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        index.upsert(vectors=zip(ids, embeds, metadatas))
        texts = []
        metadatas = []

if len(texts) > 0:
    ids = [str(uuid4()) for _ in range(len(texts))]
    embeds = embed.embed_documents(texts)
    index.upsert(vectors=zip(ids, embeds, metadatas))

In [None]:
'metadata': {'file_path': 'langchain\\chains\\router\\multi_prompt_prompt.py',
  'file_name': 'multi_prompt_prompt.py',
  'file_type': '.py'}}

In [3]:
pinecone.create_index("quickstart", dimension=8, metric="euclidean")

In [5]:
index = pinecone.Index("quickstart")

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [16]:
index.upsert([
    ("A", [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]),
    ("B", [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]),
    ("C", [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]),
    ("D", [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]),
    ("E", [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5])
])

{'upserted_count': 5}

In [12]:
index.describe_index_stats()

{'dimension': 8,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 5}},
 'total_vector_count': 5}

In [13]:
index.query(
  vector=[0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3],
  top_k=3,
  include_values=True
)

{'matches': [{'id': 'C',
              'score': 0.0,
              'values': [0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3]},
             {'id': 'D',
              'score': 0.0799999237,
              'values': [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]},
             {'id': 'B',
              'score': 0.0800000429,
              'values': [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]}],
 'namespace': ''}

In [57]:
pinecone.delete_index("quickstart")


TypeError: expected string or bytes-like object