In [1]:
from dotenv import load_dotenv
import os
import openai
import time
# Read key/value pairs from a local .env file into the process environment.
load_dotenv()
# Hand the key stored under the OPEN_AI variable to the openai client
# (this is None when the variable is not set).
openai.api_key = os.environ.get('OPEN_AI')

In [8]:
'''
This cell imports the dependencies and creates the service context.
The service context can use either OpenAI (active below) or the local
TinyLlama-1.1B-Chat model (commented out below).
'''
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import ServiceContext, llms, PromptTemplate, set_global_service_context

# Service context backed by OpenAI's chat model (ChatGPT).
# A low temperature keeps the answers close to the retrieved text.
service_context = ServiceContext.from_defaults(
    llm=llms.OpenAI(
        model="gpt-3.5-turbo",
        temperature=0.2,
    ),
)


# llm = llms.HuggingFaceLLM(
#     model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
#     tokenizer_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",)

# # For TinyLlama (local HuggingFace model)
# service_context = ServiceContext.from_defaults(
#   llm = llms.HuggingFaceLLM(
#     model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
#     tokenizer_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
#     # Plain string, not an f-string: PromptTemplate fills in {query_str} at query time
#     query_wrapper_prompt = PromptTemplate("<|system|>\nYou are a chatbot who can help students!</s>\n<|user|>\n{query_str}</s>\n<|assistant|>\n"),
#     device_map = "auto",
#   ),
#   embed_model = "local:BAAI/bge-small-en-v1.5"
# )


In [9]:
# Register the service context as the library-wide default.
set_global_service_context(service_context)

# Read every file in ./documents and embed it into a vector index.
documents = SimpleDirectoryReader("documents").load_data()
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
    show_progress=True,
)

# Write the index to disk so a later cell can reload it.
index.storage_context.persist('./index_storage/')

# Query engine: fetch the 3 most similar chunks and combine them with
# the "tree_summarize" response mode.
query_engine = index.as_query_engine(
    similarity_top_k=3,
    response_mode="tree_summarize",
)

Parsing nodes: 100%|██████████| 7/7 [00:00<00:00, 640.23it/s]
Generating embeddings: 100%|██████████| 7/7 [00:00<00:00, 19.68it/s]


In [13]:
# Define a function to put it all together and run the function
def get_response(query, engine=None):
    """Run ``query`` through a query engine and summarise the answer and its sources.

    Args:
        query: Question text handed to the engine's ``query`` method.
        engine: Optional object exposing ``query(text)``. When omitted, the
            notebook-level ``query_engine`` is used, so existing
            ``get_response(question)`` calls behave as before.

    Returns:
        dict with the keys
            'response'        -- the answer text (``response.response``),
            'processing_time' -- seconds spent inside ``engine.query``,
            'page_numbers'    -- list of ``page_label`` values found in the
                                 source metadata, in metadata order,
            'document'        -- set of ``file_name`` values found in the
                                 source metadata.
        Sources lacking a ``page_label`` or ``file_name`` entry are skipped
        for that field instead of raising ``KeyError``.
    """
    active_engine = query_engine if engine is None else engine

    # perf_counter is monotonic, so the elapsed time cannot go negative
    # if the wall clock is adjusted while the query is running.
    started = time.perf_counter()
    response = active_engine.query(query)
    time_elapsed = time.perf_counter() - started

    # metadata is expected to map a source id to that source's metadata dict;
    # treat a missing (None) mapping as "no sources".
    source_metadata = list((response.metadata or {}).values())

    page_numbers = [
        meta['page_label'] for meta in source_metadata if 'page_label' in meta
    ]
    document_labels = {
        meta['file_name'] for meta in source_metadata if 'file_name' in meta
    }
    return {
        'response': response.response,
        'processing_time': time_elapsed,
        'page_numbers': page_numbers,
        'document': document_labels,
    }

# Smoke-test the pipeline with a sample syllabus question.
print(get_response(query='What date is my first assignment due?'))

{'response': 'The first assignment is due on Tuesday.', 'processing_time': 1.5048096179962158, 'page_numbers': ['3', '6', '5'], 'document': {'PSY-GS-8875_Syllabus_S2024.pdf'}}


In [6]:
# Load the vector store from file and run the query again
from llama_index import StorageContext, load_index_from_storage

# Point a storage context at the directory the index was persisted to.
storage_context = StorageContext.from_defaults(
    persist_dir='./index_storage/'
)
# Pass the service context explicitly so the reloaded index uses the same LLM
# settings even if set_global_service_context() was not run in this kernel.
index = load_index_from_storage(storage_context, service_context=service_context)
# get_response() reads the notebook-level `query_engine`; rebuild it from the
# reloaded index, otherwise the query below would still hit the in-memory
# index and the persisted copy would never be exercised.
query_engine = index.as_query_engine(
    response_mode="tree_summarize",
    similarity_top_k=3
)
get_response('How is the class graded?')

('The class is graded based on a combination of activities and a final project. There will be 10 activities throughout the semester, each worth 14 points, which will account for 70% of the final grade. Additionally, there will be 1 final project worth 60 points, which will account for 30% of the final grade. Only the 10 best activity grades will be counted towards the final grade. The grading scale ranges from A (94-100%) to F (<60%).',
 1.7700726985931396,
 ['2', '3', '6'],
 {'PSY-GS-8875_Syllabus_S2024.pdf'})

In [7]:
# Run a small batch of sample questions and print each result.
for question in (
    'Who is the professor for PSY-GS 8875?',
    'What is the grading policy?',
    'Where can I find data?',
):
    print(get_response(question))

('The professor for PSY-GS 8875 is Alexander Christensen.', 0.6531496047973633, ['2', '1', '7'], {'PSY-GS-8875_Syllabus_S2024.pdf'})
('The grading policy for the course is as follows: There will be 12 activities and 1 final project throughout the semester. Only the 10 best activity grades will count towards the final grade. Activities are due the next Tuesday before class at 11:59:59pm. The activities are worth 70% of the final grade, while the final project is worth 30% of the final grade. The grading scale ranges from A (94-100%) to F (<60%).', 2.290818452835083, ['3', '2', '6'], {'PSY-GS-8875_Syllabus_S2024.pdf'})
('You can find data in various resources such as Kaggle datasets, Open Psychometrics, Journal of Open Psychology Data, Open Science Framework, and UCI Machine Learning Repository. Additionally, the Vanderbilt Library provides access to R books, R resources, and free R workshops.', 1.8024470806121826, ['1', '7', '4'], {'PSY-GS-8875_Syllabus_S2024.pdf'})
