# How to Embed Data into Pinecone using OpenAI

## Obtaining API keys from keys.txt file and writing them to OS environment variables.

In [None]:
import os

def set_env_variables_from_file(file_path):
    """Load ``KEY=value`` pairs from a text file into ``os.environ``.

    Blank lines and lines starting with ``#`` are skipped. Every other line
    is split on its first ``=`` only, so a value that itself contains ``=``
    (for example a padded token) is kept intact. Whitespace around the key
    and around the value is stripped.

    Args:
        file_path: Path of the text file holding one ``KEY=value`` pair per
            line.

    Raises:
        ValueError: If a non-blank, non-comment line has no ``=`` or has an
            empty key.
        OSError: If the file cannot be opened.
    """
    # 'utf-8-sig' drops a leading byte-order mark if the editor wrote one, so
    # it cannot end up glued to the first key.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            key, separator, value = line.partition('=')
            key = key.strip()
            if not separator or not key:
                # Report only the position: the line content may be a secret.
                raise ValueError(
                    f"{file_path}: line {line_number} is not in KEY=value form"
                )
            os.environ[key] = value.strip()

In [None]:
# Export every KEY=value pair found in the local keys.txt file.
set_env_variables_from_file(file_path='keys.txt')

## Obtaining API keys from user's input and writing them to OS environment variables.

In [None]:
import os
import getpass

In [None]:
# Ask for each credential with getpass and export it as an environment
# variable, in the same order as the prompts are listed.
for _env_name, _prompt in (
    ("PINECONE_API_KEY", "Pinecone API Key:"),
    ("PINECONE_ENV", "Pinecone Environment:"),
    ("OPENAI_API_KEY", "OpenAI API Key:"),
):
    os.environ[_env_name] = getpass.getpass(_prompt)

## Loading documents.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

### The following code loads a single text file and splits it into chunks.

In [None]:

# TextLoader is otherwise only imported in a later cell; import it here so
# this cell runs when the notebook is executed from top to bottom.
from langchain.document_loaders import TextLoader

# Read the whole transcript file as the input document(s).
loader = TextLoader("transcripts.txt")
documents = loader.load()

# Chunk length is measured with the built-in len.
length_function = len

# The separators are the splitter's default list: ["\n\n", "\n", " ", ""].
# It tries them in order until the chunks are small enough, which keeps
# paragraphs, then sentences, then words together as long as possible.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1000,
    chunk_overlap=100,
    length_function=length_function,
)

docs = splitter.split_documents(documents)

### The following code loads all text files in a specified directory and splits them into chunks.
#### Text loader autodetects file encoding to avoid errors. 

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Ask TextLoader to autodetect each file's encoding.
text_loader_kwargs = dict(autodetect_encoding=True)

# Match every .txt file under ./transcripts, including subdirectories.
loader = DirectoryLoader(
    './transcripts',
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)
documents = loader.load()

In [None]:
# Chunk length is measured with the built-in len.
length_function = len

# Separators are tried in the listed order (paragraph break, line break,
# space, then any character) until a chunk is small enough, so paragraphs,
# sentences and words stay together as long as possible.
# This list is also the splitter's default.
splitter = RecursiveCharacterTextSplitter(
    length_function=length_function,
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""],
)

docs = splitter.split_documents(documents)

## Text embedding using OpenAI API & storing embeddings to Pinecone.

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone

In [None]:
# Embedding client created with the library defaults.
# NOTE(review): presumably it picks up the OPENAI_API_KEY environment variable
# set earlier in this notebook — confirm against the installed langchain version.
embeddings = OpenAIEmbeddings()

In [None]:
# Connect the Pinecone client with the credentials stored in the environment.
pinecone.init(
    environment=os.getenv("PINECONE_ENV"),  # next to api key in console
    api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
)

index_name = "aichatbot-alex"

# Create the index only if no index with this name exists yet.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        name=index_name,
        dimension=1536,
        metric='cosine',
    )

In [None]:
# Embed the chunks in `docs` and store the embeddings in the Pinecone index
# named by `index_name`.
# The OpenAI embedding model `text-embedding-ada-002` uses 1536 dimensions,
# which is why the index was created with dimension=1536.
docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)

## Testing database: QA.

In [None]:
# Attach to the index that already holds the embeddings instead of
# uploading the documents again.
embeddings = OpenAIEmbeddings()
index_name = "aichatbot-alex"

docsearch = Pinecone.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [None]:
# Run a similarity search against the index for a sample question.
# NOTE: this rebinds `docs`, which until now held the split chunks, to the
# search results.
query = "How to relax?"
docs = docsearch.similarity_search(query)

In [None]:
# Show the text of the first returned document. Guard against an empty
# result list so the cell does not fail with IndexError when the search
# returns nothing.
if docs:
    print(docs[0].page_content)
else:
    print("No matching documents found.")

# Adding More Transcripts to an Existing Index

## The following code loads all text files in a specified directory and splits them into chunks.

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Ask TextLoader to autodetect each file's encoding.
text_loader_kwargs = dict(autodetect_encoding=True)

# Match every .txt file under ./transcripts2, including subdirectories.
loader = DirectoryLoader(
    './transcripts2',
    glob="**/*.txt",
    loader_cls=TextLoader,
    loader_kwargs=text_loader_kwargs,
)
documents = loader.load()

# Chunk length is measured with the built-in len.
length_function = len

# Separators are tried in the listed order (paragraph break, line break,
# space, then any character) until a chunk is small enough, so paragraphs,
# sentences and words stay together as long as possible.
# This list is also the splitter's default.
splitter = RecursiveCharacterTextSplitter(
    length_function=length_function,
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""],
)

docs = splitter.split_documents(documents)

## Text embedding using OpenAI API & storing embeddings to Pinecone.

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
import pinecone

# os.getenv is used below, but os is not among this section's imports;
# import it here so the cell also runs in a fresh session.
import os

embeddings = OpenAIEmbeddings()

# Connect the Pinecone client with the credentials stored in the environment.
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment=os.getenv("PINECONE_ENV"),  # next to api key in console
)

index_name = "aichatbot-alex"

# Attach to the existing index rather than creating a new one, then add the
# newly split chunks to it.
vectorstore = Pinecone.from_existing_index(index_name, embeddings)

vectorstore.add_documents(docs)