In [1]:
"""Infra Setup"""

# Step 1: If you already have an existing Pinecode DB that is unrelated to this notebook
#  and if you are on a trial version with Pinecode DB, 
# delete your existing DB (if you don't have an existing DB, don't mind this step)

# Step 2: Do you already have an existing Pinecone DB for this notebook?
has_existing_pinecone_db = True

# Step 3: have you already uploaded your embeddings to the Pinecode DB?
has_uploaded_embeddings = False

In [2]:
"""Data Setup"""
from langchain.document_loaders import TextLoader

# Step 4: what is your query?
user_query = "What SCPs are difficult to destroy?"

# Step 5: what is your dataset/prompt?
folder_name = "scp"
content = TextLoader(f'datasets/{folder_name}/content.txt').load()
prompt = TextLoader(f'datasets/{folder_name}/prompt.txt').load()

page_content = content[0].page_content
prompt_content = prompt[0].page_content

In [3]:
# Get chunk size

chunk_size_map = {
    "scp": 100,
    "PH_law_criminal": 250,
    "ben_tech_resume": 100,
}

content_chunk_size = chunk_size_map[folder_name]

In [4]:
# Load environment variables

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

True

In [5]:
# import schema for chat messages and ChatOpenAI in order to query chatmodels GPT-3.5-turbo or GPT-4

from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.chat_models import ChatOpenAI

In [6]:
# Import utility for splitting up texts and split up the explanation given above into document chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = content_chunk_size,
    chunk_overlap  = 0,
)

texts = text_splitter.create_documents([page_content])

In [7]:
# Individual text chunks can be accessed with "page_content"

texts[0].page_content

'SCP-682, also known as "The Hard-to-Destroy Reptile," is a highly dangerous and resilient creature'

In [8]:

# Import and instantiate OpenAI embeddings

from langchain.embeddings import OpenAIEmbeddings

### wont' work
# embeddings = OpenAIEmbeddings(model_name="ada")

embeddings = OpenAIEmbeddings()

In [9]:

# Import and initialize Pinecone client

import os
import pinecone
from langchain.vectorstores import Pinecone


pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),  
    environment=os.getenv('PINECONE_ENV')  
)

  from tqdm.autonotebook import tqdm


In [10]:
# Create an index in Pinecone

index_name = "benv-llm-learnings"

### if this doesn't work, create it manually
if not has_existing_pinecone_db:
    pinecone.create_index(index_name, dimension=1536,
                          metric="cosine", pods=1, pod_type="p1.x1")

In [11]:
# Upload vectors to Pinecone

### run this if you haven't already uploaded your embeddings to Pinecone

if not has_uploaded_embeddings:
    search = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [12]:
# Get the vector store (Pinecone)

### run this if you have already uploaded your embeddings to Pinecone

if has_uploaded_embeddings:
    index = pinecone.Index(index_name)
    search = Pinecone(index, embeddings.embed_query, "text")

In [13]:
# Do a simple vector similarity search

user_query = user_query

query = user_query
results = search.similarity_search(query)
results

[Document(page_content='of containment, SCP-682 is considered one of the most challenging anomalies to manage.', metadata={}),
 Document(page_content='of containment, SCP-682 is considered one of the most challenging anomalies to manage.', metadata={}),
 Document(page_content='of containment, SCP-682 is considered one of the most challenging anomalies to manage.', metadata={}),
 Document(page_content='of containment, SCP-682 is considered one of the most challenging anomalies to manage.', metadata={})]

In [14]:
documents = ', '.join([result.page_content for result in results])
documents

'of containment, SCP-682 is considered one of the most challenging anomalies to manage., of containment, SCP-682 is considered one of the most challenging anomalies to manage., of containment, SCP-682 is considered one of the most challenging anomalies to manage., of containment, SCP-682 is considered one of the most challenging anomalies to manage.'

In [15]:
# Setup ChatGPT
from langchain.chat_models import ChatOpenAI

chat = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0.3)

In [16]:
system_message = prompt_content + documents

messages = [
    SystemMessage(content=system_message),
    HumanMessage(content=user_query)
]

## Costs Money
response=chat(messages)
content=response.content

In [17]:
print(f"Question: \n{user_query}", end="\n\n")
print(f"Answer: \n{content}", end="\n\n")
print(f"Basis: \n{documents}", end="\n\n")

Question: 
What SCPs are difficult to destroy?

Answer: 
One SCP that is considered difficult to destroy is SCP-682.

Basis: 
of containment, SCP-682 is considered one of the most challenging anomalies to manage., of containment, SCP-682 is considered one of the most challenging anomalies to manage., of containment, SCP-682 is considered one of the most challenging anomalies to manage., of containment, SCP-682 is considered one of the most challenging anomalies to manage.

