# Set up runtime

Based on https://morioh.com/a/71f9de6c6fd5/build-a-custom-knowledge-chatbot-with-langchain-and-openai-api

In [29]:
%pip install python-dotenv==1.0.0
%pip install langchain==0.0.137
%pip install pinecone-client==2.2.1
%pip install openai
%pip install tiktoken

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


You need to have a file `.env` in the local directory with the following content:
```
OPENAI_API_KEY=<your OpenAI API Key>
PINECONE_API_KEY=<your pinecone API Key>
PINECONE_ENVIRONMENT=<your pinecone tunrime environment, e.g.: "gcp-starter">
PINECONE_INDEX=<index with 1024 dimensions>
```

Sign up to Open AI and create an API Key: https://platform.openai.com/account/api-keys

Sign up to Pinecone and create an environment, API Key and Index: https://app.pinecone.io

In [30]:
from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

True

# Embeddings

In [31]:
# Import utility for splitting up texts and split up the explanation given above into document chunks

from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 100,
    chunk_overlap  = 0,
)

texts = text_splitter.create_documents([explanation])

In [32]:
# Import and instantiate OpenAI embeddings

from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model_name="ada")

In [22]:
# Turn the first text chunk into a vector with the embedding

query_result = embeddings.embed_query(texts[0].page_content)
print(query_result)

[-0.012135120207298346, 0.03651650371900011, 0.009836418810505222, 0.016500125438175733, 0.02849378613800053, 0.000196873633417946, -0.025583783271630613, 0.02125919464375031, 0.03657712908915699, -0.011326785353389195, -0.0559771544071065, -0.037910883507318235, 0.03405108441692059, 0.02295669569491775, 0.028473578302163245, -0.006203966255179635, 0.03508171198429746, -0.02073377601082072, 0.0030363060313042473, 0.01356991380464602, 0.023724613945829823, -0.0063656331328292145, 0.07153759405843073, -0.003432895054158586, 0.010821576418941917, 0.011589493738531477, 0.02109752636911696, 0.040376302809397746, -0.027160035445129325, -0.04146755202164144, -0.0032661759847190577, -0.020157838719620437, 0.0415888027619552, 0.033485250112316435, 0.01759137651306445, -0.01225637001628959, 0.03298004117786916, 0.001230815341052983, -0.013337517173259665, -0.00129080892530166, 0.002485628302069939, -0.004829798027988839, 0.025664616477624774, 0.028554411508157407, -0.03506150228581515, -0.022027

In [23]:
# Import and initialize Pinecone client

import os
import pinecone
from langchain.vectorstores import Pinecone


pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),  
    environment=os.getenv('PINECONE_ENV')  
)

  from tqdm.autonotebook import tqdm


In [27]:
# Upload vectors to Pinecone

index_name = os.getenv('PINECONE_INDEX')
search = Pinecone.from_documents(texts, embeddings, index_name=index_name)

In [28]:
# Do a simple vector similarity search

query = "What is magical about an autoencoder?"
result = search.similarity_search(query)

print(result)

[Document(page_content='and easier to understand, an autoencoder can help.', metadata={}), Document(page_content='and easier to understand, an autoencoder can help.', metadata={}), Document(page_content='So, how does an autoencoder do this? It uses a type of artificial neural network to learn how to', metadata={}), Document(page_content='So, how does an autoencoder do this? It uses a type of artificial neural network to learn how to', metadata={})]


# Prompts

In [11]:
# Run basic query with OpenAI wrapper

from langchain.llms import OpenAI
llm = OpenAI(model_name="text-davinci-003")
llm("explain large language models in one sentence")

'\n\nLarge language models are machine learning models which use large amounts of data to generate statistical representations of language.'

In [None]:
# import schema for chat messages and ChatOpenAI in order to query chatmodels GPT-3.5-turbo or GPT-4

from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.chat_models import ChatOpenAI

In [None]:
chat = ChatOpenAI(model_name="gpt-3.5-turbo",temperature=0.3)
messages = [
    SystemMessage(content="You are an expert data scientist"),
    HumanMessage(content="Write a Python script that trains a neural network on simulated data ")
]
response=chat(messages)

print(response.content,end='\n')
     

In [9]:
# Import prompt and define PromptTemplate

from langchain import PromptTemplate

template = """
You are an expert data scientist with an expertise in building deep learning models. 
Explain the concept of {concept} in a couple of lines
"""

prompt = PromptTemplate(
    input_variables=["concept"],
    template=template,
)

In [12]:
# Run LLM with PromptTemplate

llm(prompt.format(concept="autoencoder"))
llm(prompt.format(concept="regularization"))

'\nRegularization is a technique used in machine learning to prevent overfitting by introducing a penalty term to the loss function that penalizes complex models. It helps to reduce the complexity of the model by forcing model weights to be smaller, which in turn reduces the variance of the model. Regularization helps to improve the generalization of the model and helps to prevent overfitting.'

In [17]:
texts[0].page_content

'Autoencoders are like robots that can help us with tasks. But instead of doing something physical,'