### **Load Environment variables from .env file**

In [2]:
#from langchain.llms import AzureOpenAI
from langchain.chat_models import AzureChatOpenAI
import openai
from dotenv import load_dotenv
import os
from IPython.display import display, HTML, JSON, Markdown

load_dotenv()
# env variables that are used by LangChain
os.environ['OPENAI_API_KEY'] = os.getenv("OPENAI_API_KEY")
os.environ['OPENAI_API_TYPE'] = "azure"
os.environ['OPENAI_API_VERSION'] = os.getenv("OPENAI_DEPLOYMENT_VERSION")
os.environ['OPENAI_API_BASE'] = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")

OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

# Configure OpenAI API
openai.api_type = "azure"
openai.api_version = os.getenv("OPENAI_DEPLOYMENT_VERSION")
openai.api_base = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
openai.api_key = os.getenv("OPENAI_API_KEY")

#### Init LLM model

In [3]:
def init_llm(model=OPENAI_MODEL_NAME,
             deployment_name=OPENAI_DEPLOYMENT_NAME,
             temperature=0,
             max_tokens=800,
             stop="<|im_end|>",
             ):

    llm = AzureChatOpenAI(deployment_name=deployment_name,
                      model=model,
                      temperature=temperature,
                      max_tokens=max_tokens,
                      model_kwargs={"stop": ["<|im_end|>"]})
    return llm

In [4]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown

#### Load data, movies reviews

In [5]:
file = "documents/imdb_clean.csv"
loader = CSVLoader(file_path=file)
# LangChain Document is created per line in CSV file, no need
docs = loader.load()
print(len(docs))

2532


In [32]:
# from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings.openai import OpenAIEmbeddings

#init embedding model
embeddings = OpenAIEmbeddings(
    model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME, chunk_size=1)

#### Create in memory vector store to use along with LLM model 


In [33]:
# init in-memory vector store
# this could take several minutes
db = DocArrayInMemorySearch.from_documents(
    docs,
    embeddings
)

KeyboardInterrupt: 

#### Run Similarity Search on the vector store

In [8]:
qdocs = db.similarity_search(
    "Shawshank redemption", k=5)
for doc in qdocs:
    display(Markdown(doc.page_content))

: 0
title: The Shawshank Redemption
director: Frank Darabont
release_year: 1994
runtime: 142
genre: Drama
rating: 9.3
metascore: 82
gross(M): 28.34

: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Drama
rating: 8.6
metascore: 61
gross(M): 136.8

: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Crime
rating: 8.6
metascore: 61
gross(M): 136.8

: 211
title: Prisoners
director: Denis Villeneuve
release_year: 2013
runtime: 153
genre: Drama
rating: 8.1
metascore: 70
gross(M): 61.0

: 3
title: Schindler's List
director: Steven Spielberg
release_year: 1993
runtime: 195
genre: Biography
rating: 9.0
metascore: 95
gross(M): 96.9

In [9]:
# prepare context for sending to LLM
# concatenate all retrieved documents 
context_for_llm = "".join([qdocs[i].page_content for i in range(len(qdocs))])
print(context_for_llm)

: 0
title: The Shawshank Redemption
director: Frank Darabont
release_year: 1994
runtime: 142
genre: Drama
rating: 9.3
metascore: 82
gross(M): 28.34: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Drama
rating: 8.6
metascore: 61
gross(M): 136.8: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Crime
rating: 8.6
metascore: 61
gross(M): 136.8: 211
title: Prisoners
director: Denis Villeneuve
release_year: 2013
runtime: 153
genre: Drama
rating: 8.1
metascore: 70
gross(M): 61.0: 3
title: Schindler's List
director: Steven Spielberg
release_year: 1993
runtime: 195
genre: Biography
rating: 9.0
metascore: 95
gross(M): 96.9


#### Call LLM to process the query

In [10]:
# Import LangChain schema classes for messages
from langchain.schema import (
    SystemMessage,
    HumanMessage,
    AIMessage
)

llm = init_llm()

user_query = f""" Which movie genre is the prevalent in this text: ```{context_for_llm}```?"""

messages = [
    SystemMessage(
        content="You're great movies expert. Answer users question about movies. Answer only based on the provided text dilimited in triple backticks.")
]
messages.append(
    HumanMessage(content=user_query))

answer = llm(messages)

display(Markdown(answer.content))

The prevalent movie genre in the provided text is Drama.

#### Using LangChain RetrievalQA 

In [11]:
# define a retiever as a vector store
retriever = db.as_retriever()

qa_stuff = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=False)

In [12]:
response = qa_stuff.run(user_query)
display(Markdown(response))

The prevalent movie genre in this text is Drama.

In [13]:
user_query = f" Summarize the movies in the text. Remove duplicated movies: {context_for_llm}?"
response = qa_stuff.run(user_query)
display(Markdown(response))

There are four movies mentioned in the text: 
1. The Shawshank Redemption (1994) directed by Frank Darabont, a drama with a runtime of 142 minutes, a rating of 9.3, a metascore of 82, and a gross of 28.34 million dollars. 
2. The Green Mile (1999) directed by Frank Darabont, a drama with a runtime of 189 minutes, a rating of 8.6, a metascore of 61, and a gross of 136.8 million dollars. 
3. Prisoners (2013) directed by Denis Villeneuve, a drama with a runtime of 153 minutes, a rating of 8.1, a metascore of 70, and a gross of 61.0 million dollars. 
4. Schindler's List (1993) directed by Steven Spielberg, a biography with a runtime of 195 minutes, a rating of 9.0, a metascore of 95, and a gross of 96.9 million dollars.