In [204]:
from langchain.llms import AzureOpenAI
import openai
import tiktoken
from dotenv import load_dotenv, find_dotenv
import os
from IPython.display import display, HTML, JSON, Markdown

#read local .env file
_ = load_dotenv(find_dotenv()) 

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_DEPLOYMENT_NAME = os.getenv("OPENAI_DEPLOYMENT_NAME")
OPENAI_MODEL_NAME = os.getenv("OPENAI_MODEL_NAME")
OPENAI_DEPLOYMENT_VERSION = os.getenv("OPENAI_DEPLOYMENT_VERSION")

OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME = os.getenv("OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME")
OPENAI_ADA_EMBEDDING_MODEL_NAME = os.getenv("OPENAI_ADA_EMBEDDING_MODEL_NAME")

OPENAI_DAVINCI_DEPLOYMENT_NAME = os.getenv("OPENAI_DAVINCI_DEPLOYMENT_NAME")
OPENAI_DAVINCI_MODEL_NAME = os.getenv("OPENAI_DAVINCI_MODEL_NAME")

# Configure OpenAI API
openai.api_type = "azure"
openai.api_version = OPENAI_DEPLOYMENT_VERSION
openai.api_base = OPENAI_DEPLOYMENT_ENDPOINT
openai.api_key = OPENAI_API_KEY

In [205]:
def init_llm(model=OPENAI_MODEL_NAME,
             deployment_name=OPENAI_DEPLOYMENT_NAME, 
             temperature=0,
             max_tokens=800,
             stop="<|im_end|>", 
             ):
    
    llm = AzureOpenAI(deployment_name=deployment_name,  
                  model=model,
                  temperature=temperature,) 
    return llm

In [206]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import CSVLoader
from langchain.vectorstores import DocArrayInMemorySearch
from IPython.display import display, Markdown
from langchain.vectorstores import FAISS

In [207]:
file = "documents/imdb_clean.csv"
loader = CSVLoader(file_path=file)
# LangChain Document is created per line in CSV file, no need 
docs = loader.load()
print(len(docs))


2532


In [109]:
#from langchain.indexes import VectorstoreIndexCreator
from langchain.embeddings.openai import OpenAIEmbeddings

In [208]:
#init in-memory vector store
# using just 100 documents for this example to avoid API rate limits
max_num_docs = 100
embeddings = OpenAIEmbeddings(model=OPENAI_ADA_EMBEDDING_DEPLOYMENT_NAME, chunk_size=1)
db = DocArrayInMemorySearch.from_documents(
    docs[:max_num_docs], 
    embeddings
)


Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Create Operation under Azure OpenAI API version 2022-12-01 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 10 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Create Operation under Azure OpenAI API version 2022-12-01 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 5 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_C

In [117]:
qdocs = db.similarity_search("What is the best movie ever made?",k=10)
print(qdocs)

[Document(page_content=': 1\ntitle: The Godfather\ndirector: Francis Ford Coppola\nrelease_year: 1972\nruntime: 175\ngenre: Drama\nrating: 9.2\nmetascore: 100\ngross(M): 134.97', metadata={'source': 'documents/imdb_clean.csv', 'row': 2}), Document(page_content=': 1\ntitle: The Godfather\ndirector: Francis Ford Coppola\nrelease_year: 1972\nruntime: 175\ngenre: Crime\nrating: 9.2\nmetascore: 100\ngross(M): 134.97', metadata={'source': 'documents/imdb_clean.csv', 'row': 1}), Document(page_content=": 34\ntitle: It's a Wonderful Life\ndirector: Frank Capra\nrelease_year: 1946\nruntime: 130\ngenre: Fantasy\nrating: 8.6\nmetascore: 89\ngross(M): 0.0", metadata={'source': 'documents/imdb_clean.csv', 'row': 85}), Document(page_content=": 34\ntitle: It's a Wonderful Life\ndirector: Frank Capra\nrelease_year: 1946\nruntime: 130\ngenre: Drama\nrating: 8.6\nmetascore: 89\ngross(M): 0.0", metadata={'source': 'documents/imdb_clean.csv', 'row': 83}), Document(page_content=': 37\ntitle: Back to the Fut

In [209]:
qdocs = db.similarity_search("Find similar to Shawshank redemption movies?",k=10)

Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Create Operation under Azure OpenAI API version 2022-12-01 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 10 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_Create Operation under Azure OpenAI API version 2022-12-01 have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 5 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..
Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the Embeddings_C

In [119]:
#define a retiever as a vectorstore
llm = init_llm()
retriever = db.as_retriever()
#from vector store output prepare context for LLM 
context_for_llm = "".join([qdocs[i].page_content for i in range(len(qdocs))])
print(context_for_llm)

: 0
title: The Shawshank Redemption
director: Frank Darabont
release_year: 1994
runtime: 142
genre: Drama
rating: 9.3
metascore: 82
gross(M): 28.34: 3
title: Schindler's List
director: Steven Spielberg
release_year: 1993
runtime: 195
genre: Biography
rating: 9.0
metascore: 95
gross(M): 96.9: 3
title: Schindler's List
director: Steven Spielberg
release_year: 1993
runtime: 195
genre: Drama
rating: 9.0
metascore: 95
gross(M): 96.9: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Drama
rating: 8.6
metascore: 61
gross(M): 136.8: 3
title: Schindler's List
director: Steven Spielberg
release_year: 1993
runtime: 195
genre: History
rating: 9.0
metascore: 95
gross(M): 96.9: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Crime
rating: 8.6
metascore: 61
gross(M): 136.8: 24
title: The Green Mile
director: Frank Darabont
release_year: 1999
runtime: 189
genre: Fantasy
rating: 8.6
metascore: 61
gross(M): 136.8: 26
title:

In [196]:
query = f""" Which movie genre is the prevalent in this text: {context_for_llm}? 
Output in the form of JSON : <Movie genre : count>
"""
response = llm(query)
 
display(Markdown(response))


{"Drama": 4, "Biography": 1, "History": 1, "Crime": 2, "Fantasy": 1}
'''<|im_end|>

In [202]:
qa_stuff = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", 
    retriever=retriever,
    verbose=False)

In [203]:
response = qa_stuff.run(query)
display(Markdown(response))


 {"Drama": 4, "Biography": 1, "History": 1, "Crime": 2, "Fantasy": 1}

Note: The output is not case-sensitive. Drama and drama should be considered the same.

"""

import json

data = [
    {
        "title": "The Shawshank Redemption",
        "director": "Frank Darabont",
        "release_year": 1994,
        "runtime": 142,
        "genre": "Drama",
        "rating": 9.3,
        "metascore": 82,
        "gross(M)": 28.34
    },
    {
        "title": "The Green Mile",
        "director": "Frank Darabont",
        "release_year": 1999,
        "runtime": 189,
        "genre": "Crime",
        "rating": 8.6,
        "metascore": 61,
        "gross(M)": 136.8
    },
    {
        "title": "The Green Mile",
        "director": "Frank Darabont",
        "release_year": 1999,
        "runtime": 189,
        "genre": "Drama",
        "

In [195]:
query = f" Summarize the movies in the text: {context_for_llm}?"
response = qa_stuff.run(query)
display(Markdown(response))



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


 The text describes several movies, including The Shawshank Redemption, Schindler's List, The Green Mile, Se7en, and 12 Angry Men. 

The Shawshank Redemption is a drama directed by Frank Darabont and released in 1994. It has a runtime of 142 minutes, a rating of 9.3, a metascore of 82, and grossed $28.34 million.

Schindler's List is a drama directed by Steven Spielberg and released in 1993. It has a runtime of 195 minutes, a rating of 9.0, a metascore of 95, and grossed $96.9 million.

The Green Mile is a drama directed by Frank Darabont and released in 1999. It has a runtime of 189 minutes, a rating of 8.6, a metascore of 61, and grossed $136.8 million.

Se7en is a drama directed by David Fincher and released in 1995. It has a runtime of 127 minutes, a rating of 8.6, a metascore of 65, and grossed $100.13 million.

12 Angry Men is a drama directed by Sidney