In [1]:
# import logging
# import sys
# from IPython.display import Markdown, display

# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [1]:
from IPython.display import Markdown, display

## Using GPT Tree Index

In [9]:
from langchain.llms.base import LLM
from llama_index import PromptHelper
from transformers import pipeline
import torch
from typing import Any, List, Mapping, Optional


# Prompt-sizing settings for the prompt helper:
#   max_input_size    -> maximum input size
#   num_output        -> number of output tokens (also read by CustomLLM._call)
#   max_chunk_overlap -> maximum chunk overlap (alternative value noted: 20)
max_input_size, num_output, max_chunk_overlap = 1024, 128, 5
prompt_helper = PromptHelper(
    max_input_size,
    num_output,
    max_chunk_overlap,
)


class CustomLLM(LLM):
    """LangChain ``LLM`` backed by a local Hugging Face text-generation pipeline.

    The pipeline is built once, while the class body executes, and is shared by
    all instances. ``num_output`` is looked up in the notebook's globals each
    time ``_call`` runs. A later cell in this notebook defines another class
    with the same name; whichever cell ran last provides ``CustomLLM``.
    """

    # model_name = 'sberbank-ai/rugpt3small_based_on_gpt2'
    # model_name = 'EleutherAI/gpt-neo-125M'
    model_name = 'EleutherAI/gpt-neo-1.3B'
    # model_name = 'facebook/opt-iml-max-1.3b'
    # Use the first GPU when CUDA is available; otherwise fall back to the CPU
    # so that merely defining the class does not fail on a GPU-less machine.
    pipeline = pipeline(
        'text-generation',
        model=model_name,
        device='cuda:0' if torch.cuda.is_available() else 'cpu',
        model_kwargs={'torch_dtype': torch.bfloat16},
    )

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text handed to the pipeline.
            stop: Optional stop sequences. The completion is cut just before
                the earliest occurrence of any of them.

        Returns:
            Only the newly generated text, without the prompt.
        """
        prompt_length = len(prompt)
        # NOTE(review): `temperature` presumably has no effect unless sampling
        # is enabled in the generation settings -- confirm before relying on it.
        response = self.pipeline(prompt, temperature=0.9, max_new_tokens=num_output)[0]['generated_text']

        # only return newly generated tokens
        completion = response[prompt_length:]

        # Honor the caller's stop sequences instead of silently ignoring them.
        if stop:
            cut_points = [pos for pos in (completion.find(s) for s in stop if s) if pos != -1]
            if cut_points:
                completion = completion[:min(cut_points)]
        return completion

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Parameters that identify this LLM (here, only the model name)."""
        return {'name_of_model': self.model_name}

    @property
    def _llm_type(self) -> str:
        """Type tag reported for this LLM."""
        return 'custom'

In [2]:
from langchain.llms.base import LLM
from llama_index import PromptHelper
from transformers import pipeline
import torch
from typing import Any, List, Mapping, Optional
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Prompt-sizing settings for the prompt helper:
#   max_input_size    -> maximum input size
#   num_output        -> number of output tokens (also read by CustomLLM._call)
#   max_chunk_overlap -> maximum chunk overlap
max_input_size, num_output, max_chunk_overlap = 2048, 256, 20
prompt_helper = PromptHelper(
    max_input_size,
    num_output,
    max_chunk_overlap,
)


class CustomLLM(LLM):
    """LangChain ``LLM`` backed by a local seq2seq model (``google/flan-t5-large``).

    The model and tokenizer are loaded once, while the class body executes, and
    are shared by all instances. ``num_output`` is looked up in the notebook's
    globals each time ``_call`` runs. An earlier cell in this notebook defines a
    class with the same name; whichever cell ran last provides ``CustomLLM``.
    """

    model_name = 'google/flan-t5-large'
    # Place the model on the GPU when CUDA is available; otherwise keep it on
    # the CPU so that defining the class does not fail on a GPU-less machine.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to(
        'cuda' if torch.cuda.is_available() else 'cpu'
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Generate a completion for ``prompt``.

        Args:
            prompt: Text to tokenize and feed to the model.
            stop: Optional stop sequences. The completion is cut just before
                the earliest occurrence of any of them.

        Returns:
            The decoded model output with special tokens removed.
        """
        # Send the inputs to whichever device the model lives on rather than
        # assuming CUDA.
        inputs = self.tokenizer(prompt, return_tensors='pt').to(self.model.device)
        output_ids = self.model.generate(**inputs, temperature=1, max_new_tokens=num_output)
        completion = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

        # Honor the caller's stop sequences instead of silently ignoring them.
        if stop:
            cut_points = [pos for pos in (completion.find(s) for s in stop if s) if pos != -1]
            if cut_points:
                completion = completion[:min(cut_points)]
        return completion

    @property
    def _identifying_params(self) -> Mapping[str, Any]:
        """Parameters that identify this LLM (here, only the model name)."""
        return {'name_of_model': self.model_name}

    @property
    def _llm_type(self) -> str:
        """Type tag reported for this LLM."""
        return 'custom'

In [3]:
from llama_index import LLMPredictor, ServiceContext

llm_predictor = LLMPredictor(llm=CustomLLM())
# The index cells that follow pass `service_context`, so it must exist before
# they run on a fresh kernel (otherwise they raise NameError). A later cell
# rebuilds it with a Hugging Face embedding model for the vector index.
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor, prompt_helper=prompt_helper)

# Generation

In [4]:
from llama_index import SimpleDirectoryReader

# Load the source documents from the 'paul_graham_essay/data' directory.
documents = SimpleDirectoryReader(
    input_dir='paul_graham_essay/data',
).load_data()

In [10]:
from llama_index import GPTListIndex

# Build a list index over the loaded documents, ask one question, and render
# the answer in bold.
index = GPTListIndex.from_documents(
    documents,
    service_context=service_context,
)
response = index.query(
    "Answer the question: what did the author do growing up?"
)
display(Markdown("<b>{}</b>".format(response)))

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 22214 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens


<b>I was in decent shape at painting and drawing from the RISD foundation that summer, but I still don't know how I managed to pass the written exam. I remember that I answered the essay question by writing about Cezanne, and that I cranked up the intellectual level as high as I could to make the most of my limited vocabulary. [2] I'm only up to age 25 and already there are such conspicuous patterns. Here I was, yet again about to attend some august institution in the hopes of learning about some prestigious subject, and yet again about to be disappointed.</b>

In [9]:
from llama_index import GPTTreeIndex

# Build a tree index over the loaded documents, query it with a branch factor
# of 1, and render the answer in bold.
index = GPTTreeIndex.from_documents(
    documents,
    service_context=service_context,
)
response = index.query(
    "What did the author do growing up?",
    child_branch_factor=1,
)
display(Markdown("<b>{}</b>".format(response)))
# index = GPTListIndex.from_documents(documents, service_context=service_context)

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:llama_index.indices.query.tree.leaf_query:> Starting query: What did the author do growing up?
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 1595 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens


<b>writing and programming</b>

In [7]:
from llama_index import SummaryPrompt
from llama_index import SimpleDirectoryReader
from llama_index import GPTTreeIndex

# Load the documents from the 'data' directory.
documents = SimpleDirectoryReader(input_dir='data').load_data()

query_str = "What did the author do growing up?"

# Summary prompt with the question baked in; `{context_str}` is left as a
# placeholder for the prompt machinery to fill in.
SUMMARY_PROMPT_TMPL = "\n".join([
    "Context information is below. ",
    "---------------------",
    "{context_str}",
    "---------------------",
    "Given the context information and not prior knowledge, "
    f"answer the question: {query_str}",
    "",
])
SUMMARY_PROMPT = SummaryPrompt(SUMMARY_PROMPT_TMPL)

# Build the tree with the query-specific summary prompt.
index_with_query = GPTTreeIndex.from_documents(
    documents,
    service_context=service_context,
    summary_template=SUMMARY_PROMPT,
)

# directly retrieve response from root nodes instead of traversing tree
response = index_with_query.query(query_str, mode="retrieve")
display(Markdown("<b>{}</b>".format(response)))

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens
INFO:llama_index.indices.query.tree.retrieve_query:> Starting query: What did the author do growing up?
INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 1584 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 0 tokens


<b>writing and programming</b>

## Using GPT Keyword Table Index

In [1]:
from llama_index import GPTKeywordTableIndex, SimpleDirectoryReader
from IPython.display import Markdown, display

In [12]:
# Load the documents from the 'data' directory and build a keyword-table index
# over them.
documents = SimpleDirectoryReader(input_dir='data').load_data()
index = GPTKeywordTableIndex.from_documents(
    documents,
    service_context=service_context,
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/vyacharin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 18765 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 0 tokens


In [9]:
# Query the index; set logging to DEBUG for more detailed outputs.
response = index.query(
    "What did the author do after his time at Y Combinator?"
)

> Starting query: What did the author do after his time at Y Combinator?
Extracted keywords: ['y combinator', 'combinator']
> Querying with idx: 7143669651211954504: of excluding them, because there were so many s...
> Querying with idx: 4978118451876167434: browser, and then host the resulting applicatio...
> Querying with idx: 7378313280237489139: person, and from those we picked 8 to fund. The...
> Querying with idx: 2670584622494666310: it was like living in another country, and sinc...


In [10]:
# Render the response in bold.
display(Markdown("<b>{}</b>".format(response)))

<b>

After a few years, the author decided to step away from Y Combinator to focus on other projects, such as painting and writing essays. In 2013, he handed over control of Y Combinator to Sam Altman. The author's mother passed away in 2014, and after taking some time to grieve, he returned to writing essays and working on Lisp. He continued working on Lisp until 2019, when he finally completed the project.

In 2015, the author decided to move to England with his family. They originally intended to only stay for a year, but ended up liking it so much that they remained there. The author wrote Bel while living in England. In 2019, he finally finished the project. After completing Bel, the author wrote a number of essays on various topics. He continued writing essays through 2020, but also started thinking about other things he could work on.</b>

In [5]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import GPTSimpleVectorIndex, LangchainEmbedding

# Wrap LangChain's Hugging Face embeddings so llama_index can use them.
embed_model = LangchainEmbedding(HuggingFaceEmbeddings())

# Service context combining the custom LLM, the prompt helper and the
# embedding model; `chunk_size_limit` is not passed.
llm_predictor = LLMPredictor(llm=CustomLLM())
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor, prompt_helper=prompt_helper, embed_model=embed_model
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
INFO:sentence_transformers.SentenceTransformer:Use pytorch device: cuda


In [16]:
from llama_index import GPTVectorStoreIndex
# Build a vector-store index over the loaded documents.
index = GPTVectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total LLM token usage: 0 tokens
INFO:llama_index.token_counter.token_counter:> [build_index_from_nodes] Total embedding token usage: 17617 tokens


In [19]:
# The query uses the same embed_model as the index.
response = index.query("What did the author do growing up?", mode="embedding", verbose=False)
print(response)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:llama_index.token_counter.token_counter:> [query] Total LLM token usage: 3875 tokens
INFO:llama_index.token_counter.token_counter:> [query] Total embedding token usage: 8 tokens


While I was a student at the Accademia I started painting still lives in my bedroom at night. These paintings were tiny, because the room was, and because I painted them on leftover scraps of canvas, which was all I could afford at the time. Painting still
