# Installing the necessary libraries

In [1]:
!pip install llama-index
!pip install langchain

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting llama-index
  Downloading llama_index-0.5.23.post1.tar.gz (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.7/188.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dataclasses_json
  Downloading dataclasses_json-0.5.7-py3-none-any.whl (25 kB)
Collecting langchain==0.0.142
  Downloading langchain-0.0.142-py3-none-any.whl (548 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m548.8/548.8 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
Collecting openai>=0.26.4
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting tiktoken
  Downloading tiktoken-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━

# Defining the functions
The following code defines the functions we need to construct the index and query it

In [10]:
from llama_index import SimpleDirectoryReader, GPTListIndex, readers, GPTSimpleVectorIndex, LLMPredictor, PromptHelper, ServiceContext
from langchain import OpenAI
import sys
import os
from IPython.display import Markdown, display

def construct_index(directory_path):
    # set maximum input size
    max_input_size = 4096
    # set number of output tokens
    num_outputs = 2000
    # set maximum chunk overlap
    max_chunk_overlap = 20
    # set chunk size limit
    chunk_size_limit = 600 

    # define prompt helper
    prompt_helper = PromptHelper(max_input_size, num_outputs, max_chunk_overlap, chunk_size_limit=chunk_size_limit)

    # define LLM
    llm_predictor = LLMPredictor(llm=OpenAI(temperature=0.5, model_name="text-davinci-003", max_tokens=num_outputs))
 
    documents = SimpleDirectoryReader(directory_path).load_data()
    
    service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)
    index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)

    index.save_to_disk('index.json')

    return index

def ask_ai():
    index = GPTSimpleVectorIndex.load_from_disk('index.json')
    while True: 
        query = input("What do you want to ask? ")
        response = index.query(query)
        display(Markdown(f"Response: <b>{response.response}</b>"))

# Set OpenAI API Key

In [11]:
os.environ["OPENAI_API_KEY"] = input("Paste your OpenAI key here and hit enter:")

Paste your OpenAI key here and hit enter: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


# Constructing an index
This will take every file in the folder 'context_data', split it into chunks, and embed it with OpenAI's embeddings API.

In [12]:
construct_index("/content/context_data")

<llama_index.indices.vector_store.vector_indices.GPTSimpleVectorIndex at 0x7f6c51895eb0>

# Asking the chatbot

In [13]:
ask_ai()

What do you want to ask? what is the most known company


Response: <b>
It is not possible to answer this question with the given context information.</b>

What do you want to ask? what is the most frequent job


Response: <b>

The most frequent job mentioned in the context information is Software Engineer, with additional roles such as Cloud DevOps Engineer, Java/Python Web Developer, and Cloud Infrastructure Product Engineer also mentioned.</b>

KeyboardInterrupt: ignored