# Lanchain OpenAI Setup

In [2]:
# import langchain
import os
import logging
from langchain_openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage, SystemMessage

# Import chat templates
from langchain.prompts import (
    ChatPromptTemplate,
    PromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

# Caching
from langchain.cache import InMemoryCache
from langchain.cache import SQLiteCache
from langchain.globals import set_llm_cache

# Embeddings and documents
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma

# Context compression
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [3]:
cacheType = 'in_memory'

if cacheType == 'in_memory':
    set_llm_cache(InMemoryCache())
elif cacheType == 'sqlite':
    set_llm_cache(SQLiteCache(database_path=".langchain.db"))

# Set OpenAI API key and create LLM and Chat LLM. Note that key can be stored in a separate file or as an environment variable. Refer to docs.
api_key = open('./openai_key.txt').read()
os.environ['OPENAI_API_KEY'] = api_key

logging.basicConfig(filename='example.log', encoding='utf-8', level=logging.DEBUG)

In [4]:
def get_embedding_function():
    embedding_function = OpenAIEmbeddings()
    return embedding_function


def load_text_file_into_db(file, db_name,embedding_function, chunk_size=500): 
    logging.info(f'Loading file into DB. file={file} db={db_name}, chunk_size={chunk_size}')
    loader = TextLoader(file)
    documents = loader.load()
    docs = split_documents_into_chunks(documents, chunk_size)
    db = Chroma.from_documents(docs, embedding_function, persist_directory=db_name)
    
    # save to disk
    db.persist()
    logging.info(f'Persited file into DB. file={file} db={db_name}, chunk_size={chunk_size}')
    return db


def get_chroma_db(db_path, embedding_function):
    return Chroma(persist_directory=db_path, embedding_function=embedding_function)


def split_documents_into_chunks(documents, chunk_size=500):
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(chunk_size=chunk_size)
    docs = text_splitter.split_documents(documents)
    return docs

def load_text_files_into_db(files: list, embedding_function):
    db = None
    for file in files:
        db = load_text_file_into_db('extras/01-Data-Connections/some_data/FDR_State_of_Union_1944.txt',
                    db_path, embedding_function, chunk_size=500)
    
    return db

In [5]:
db_path = './speech_new_db_1'
question = "What was Lincon's stance on slavery?"

files = ['extras/01-Data-Connections/some_data/FDR_State_of_Union_1944.txt',
         'extras/01-Data-Connections/some_data/Lincoln_State_of_Union_1862.txt']

In [6]:
embedding_function = get_embedding_function()
db = load_text_files_into_db(files, embedding_function)

In [7]:
#db = load_text_file_into_db('extras/01-Data-Connections/some_data/FDR_State_of_Union_1944.txt',
#                    db_path, embedding_function, chunk_size=500)

In [8]:
#db = load_text_file_into_db('extras/01-Data-Connections/some_data/Lincoln_State_of_Union_1862.txt',
#                    db_path, embedding_function, chunk_size=500)

In [9]:
# LLM Use compression
chat = ChatOpenAI(temperature=0)

# LLM -> LLMChainExtractor
compressor = LLMChainExtractor.from_llm(chat)

In [10]:
# Context Retriever
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor,
                                                       base_retriever=db.as_retriever())

In [11]:
# Old way without OpenAI
docs = db_connection.similarity_search(question)
docs[0].page_content[:500]

NameError: name 'db_connection' is not defined

In [None]:
# New way with context retriever
compressed_docs = compression_retriever.get_relevant_documents(question)

In [None]:
compressed_docs[0]