In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
from openai import OpenAI
import os
import re

In [None]:
# Pull variables from a local .env file into the process environment.
load_dotenv()
# Report only whether the OpenAI key is present. Displaying the value itself
# would save the secret in the notebook's cell output.
bool(os.environ.get("OPENAI_ACCESS_TOKEN"))

In [None]:
# Credentials come from the environment; either value is None when its
# variable is not set.
openai_api_key = os.getenv("OPENAI_ACCESS_TOKEN")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

In [None]:
from pinecone import Pinecone, PodSpec

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=pinecone_api_key)

index_name = 'llm-for-sql-agent'
spec = PodSpec(environment='gcp-starter')

# Vector size of the index. text-embedding-3-small natively returns
# 1536-dimensional vectors; 512 is a shortened size that has to be requested
# through the embedder's `dimensions` argument, so the index and the embedder
# must agree on this number.
INDEX_DIMENSION = 512

# Create the index only when it does not exist yet, so the cell can be re-run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=INDEX_DIMENSION,
        metric='cosine',
        spec=spec
    )
# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

In [None]:
# Raw OpenAI SDK client built with the key read from the environment.
# NOTE(review): no later cell visible here references `client` - confirm it is still needed.
client = OpenAI(api_key=openai_api_key)

In [None]:
# Settings for the PDF ingestion and embedding steps.
EMBEDDING_MODEL = "text-embedding-3-small"

# Take the vector size from the live index rather than hard-coding it here.
TEXT_DIMENSION = index.describe_index_stats()["dimension"]

# The PDF path is resolved against the parent of the current working directory.
FILE_PATH = "data/rideInvoice.pdf"
full_path = os.path.join(os.path.dirname(os.getcwd()), FILE_PATH)

In [None]:
from typing import List


def load_and_process_pdf(
    file_path: str,
    chunk_size: int = 100,
    chunk_overlap: int = 0,
) -> List["Document"]:
    """Load a PDF and split it into small chunks ready for embedding.

    Args:
        file_path: Path of the PDF file handed to ``PyPDFLoader``.
        chunk_size: Chunk size passed to ``RecursiveCharacterTextSplitter``
            (defaults to 100).
        chunk_overlap: Overlap between neighbouring chunks passed to the
            splitter (defaults to 0).

    Returns:
        The chunks produced by ``split_documents``. These are LangChain
        ``Document`` objects (text plus metadata), not plain strings.
    """
    loader = PyPDFLoader(file_path)
    pages = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(pages)

In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore

# Embedder configured to emit vectors of the same size as the Pinecone index.
embeddings = OpenAIEmbeddings(
    model=EMBEDDING_MODEL,
    openai_api_key=openai_api_key,
    dimensions=TEXT_DIMENSION
)

docs = load_and_process_pdf(full_path)

# Give every chunk a stable id derived from the file name and its position.
# Without explicit ids each run stores the chunks under new random ids, so
# re-running the cell (or Restart & Run All) piles up duplicates in the index;
# with stable ids a re-run overwrites the existing vectors instead.
# Caveat: if the PDF later yields fewer chunks, the surplus old ids remain.
chunk_ids = [f"{os.path.basename(full_path)}-{i}" for i in range(len(docs))]

docsearch = PineconeVectorStore.from_documents(
    docs,
    embeddings,
    index_name=index_name,
    ids=chunk_ids,
)

In [None]:
# Sanity check: retrieve the chunks closest to a natural-language question.
query = "How much was the taxi fare"
docs = docsearch.similarity_search(query)
# Freshly upserted vectors can take a moment to become searchable, so the
# result list may be empty; indexing docs[0] unguarded would raise IndexError.
if docs:
    print(docs[0].page_content)
else:
    print("No matching chunks returned - re-run this cell once the index has finished ingesting.")

In [None]:
# Display the index statistics again, now that the PDF chunks have been uploaded.
index.describe_index_stats()

In [None]:
from langchain_pinecone import PineconeVectorStore

# Name of the metadata field in which each record's raw text is stored.
text_field = "text"

# Attach to the existing index. `text_field` is the metadata key, not content:
# it is passed as `text_key` rather than to add_texts(), which would embed and
# upsert the literal string "text" as a junk document on every run.
vectorstore = PineconeVectorStore(
    index_name=index_name,
    embedding=embeddings,
    text_key=text_field)

In [None]:
from langchain_openai import ChatOpenAI
from langchain.chains import ConversationChain
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.memory import VectorStoreRetrieverMemory
from langchain.prompts import PromptTemplate

In [None]:
# Chat model used to answer questions; temperature 0.0 is requested.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.0,
    openai_api_key=openai_api_key,
)

# Windowed buffer memory (k=2) that hands history back as message objects.
conversational_memory = ConversationBufferWindowMemory(
    k=2,
    return_messages=True,
    memory_key="chat_history",
)

# Memory backed by the vector store: the retriever returns a single match (k=1).
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})
memory = VectorStoreRetrieverMemory(retriever=retriever)

In [None]:
# Prompt text: {history} is filled in by the chain's memory, {input} by the user's message.
_DEFAULT_TEMPLATE = """The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.

Relevant pieces of previous conversation:
{history}

(You do not need to use these pieces of information if not relevant)

Current conversation:
Human: {input}
AI:"""

PROMPT = PromptTemplate(
    template=_DEFAULT_TEMPLATE,
    input_variables=["history", "input"],
)

# Chain wiring the chat model, the prompt and the vector-store-backed memory together.
conversation_with_summary = ConversationChain(
    llm=llm,
    memory=memory,
    prompt=PROMPT,
    verbose=True,
)

# Ask a question; the returned answer is displayed as the cell output.
conversation_with_summary.predict(input="How much was the taxi ride?")