# Question Answering

In [None]:
%pip install langchain openai chromadb tiktoken pypdf llama-index

In [1]:
import os
import tomli

# The API key is read from the Streamlit secrets file instead of being typed
# into the notebook. The file is opened in binary mode for tomli.load().
with open("../.streamlit/secrets.toml", "rb") as f:
    secrets = tomli.load(f)

# Exported through the environment for the OpenAI-backed calls later on.
os.environ["OPENAI_API_KEY"] = secrets["OPENAI_API_KEY"]

## Load data (PDF reader)

In [16]:
import requests, io, pypdf
# Location of the "Impromptu" book PDF; it is downloaded with requests.get(url)
# further down in this cell.
url = 'https://www.impromptubook.com/wp-content/uploads/2023/03/impromptu-rh.pdf'

def pdf_to_pages(file):
    """Return the extracted text of every page of a PDF, in page order.

    `file` is passed straight to `pypdf.PdfReader`. The result is a list
    holding one `extract_text()` value per page.
    """
    reader = pypdf.PdfReader(file)
    return [pdf_page.extract_text() for pdf_page in reader.pages]

# Download the PDF into memory. The timeout keeps the cell from hanging on a
# dead connection, and raise_for_status() stops here on an HTTP error instead
# of handing an error page to the PDF parser.
r = requests.get(url, timeout=60)
r.raise_for_status()
f = io.BytesIO(r.content)
pages = pdf_to_pages(f)
# Index 1 is the second page of the PDF.
print(pages[1])

Impromptu
Amplifying Our Humanity 
Through AI
By Reid Hoffman  
with GPT-4


Let's save each page of the PDF to its own `.txt` file in the `impromptu` folder, so that every page can later be loaded as a separate document.

In [29]:
# Create the output folder if needed. exist_ok=True makes the cell safe to
# re-run and avoids the gap between checking for the folder and creating it.
os.makedirs("impromptu", exist_ok=True)
# One UTF-8 text file per page, named after the page's zero-based index.
for i, page in enumerate(pages):
    with open(f"impromptu/{i}.txt", "w", encoding='utf-8') as f:
        f.write(page)

In [19]:
# Stitch all pages into one string, with a newline between consecutive pages.
sep = '\n'
book = sep.join(pages)

# Peek at the first 35 characters as a sanity check.
print(book[:35])


Impromptu
Amplifying Our Humanity 


In [21]:
import tiktoken
def num_tokens(string, encoding_name='cl100k_base'):
    """Return the number of tokens in a text string.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to count with. Defaults to
            'cl100k_base', the value this function previously hard-coded, so
            existing one-argument calls behave as before.

    Returns:
        The length of the token sequence produced by encoding `string`.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Named token_count so the local no longer shadows the function's own name.
    token_count = len(encoding.encode(string))
    return token_count

# Token count of the whole book (all pages joined into one string).
num_tokens(book)

83310

## Build an index

In [36]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex
# Read the page files written to the "impromptu" folder back in as documents.
reader = SimpleDirectoryReader("impromptu")
documents = reader.load_data()

# Build the vector index over every loaded document.
index = VectorStoreIndex.from_documents(documents)

# Show one loaded document to check what the reader produced.
documents[1]

Document(id_='35131695-d4d6-4d3e-9626-2e08eb9626e3', embedding=None, metadata={'file_path': 'impromptu\\1.txt', 'file_name': '1.txt', 'file_type': 'text/plain', 'file_size': 78, 'creation_date': '2023-12-04', 'last_modified_date': '2023-12-04', 'last_accessed_date': '2023-12-04'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, hash='76752471ca4354b6b90ff26ed59cf61a48074d3b551759904d2b7232d647c4df', text='Impromptu\nAmplifying Our Humanity \nThrough AI\nBy Reid Hoffman  \nwith GPT-4', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [39]:
# Same document rendered as a plain dict, which is easier to scan than its repr.
documents[1].dict()

{'id_': '35131695-d4d6-4d3e-9626-2e08eb9626e3',
 'embedding': None,
 'metadata': {'file_path': 'impromptu\\1.txt',
  'file_name': '1.txt',
  'file_type': 'text/plain',
  'file_size': 78,
  'creation_date': '2023-12-04',
  'last_modified_date': '2023-12-04',
  'last_accessed_date': '2023-12-04'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'hash': '76752471ca4354b6b90ff26ed59cf61a48074d3b551759904d2b7232d647c4df',
 'text': 'Impromptu\nAmplifying Our Humanity \nThrough AI\nBy Reid Hoffman  \nwith GPT-4',
 'start_char_idx': None,
 'end_char_idx': None,
 'text_template': '{metadata_str}\n\n{content}',
 'metadata_template': '{key}: {value}',
 'metadata_seperator': '\n',
 'class_name': 'Document'}

In [35]:
# Save the index to disk. No target directory is passed, so the library's
# default persist location is used.
index.storage_context.persist()

## Query your data

In [54]:
# Ask the index one natural-language question and print the generated answer.
question = 'what is the potential of AI in education?'
query_engine = index.as_query_engine()
response = query_engine.query(question)
print(response)

AI has the potential to become a powerful tool in education, transforming the way we learn and deliver instruction. It can provide personalized and individualized learning experiences tailored to each student's needs and interests. AI can also assist teachers in identifying the topics and skills that students need to focus on, providing guidance and support as needed. Additionally, AI-driven tools can automate and streamline certain aspects of teaching, such as grading and content creation, freeing up teachers' time to focus on engaging and inspiring their students. However, the full potential of AI in education may be limited by factors such as cost, access, and privacy concerns.


In [55]:
# Short text summary of the source nodes attached to the response.
response.get_formatted_sources()

'> Source (Doc id: 74310496-b5d5-4d53-80e0-2c5cc4fa4542): 47Education\nthe technology will also create an educational system \nthat is less equitable and acc...\n\n> Source (Doc id: e779e205-87c6-4c3e-b835-17cd485050bd): 46Impromptu: Amplifying Our Humanity Through AI\nReid: GPT-4, there are so many more subjects I wa...'

In [56]:
# Full text of each source node attached to the response, in the order returned.
sources = [hit.node.get_text() for hit in response.source_nodes]

# Show the first 11 characters of the first source as a quick check.
print(sources[0][:11])

47Education
