In [None]:
! pip install openai langchain
! pip install llama_index llama-index-embeddings-huggingface
! pip install trulens-eval

In [None]:
from IPython.display import HTML, display

def set_css(info=None):
  """Display a small stylesheet that makes ``<pre>`` elements wrap long lines.

  The function is meant to be used as an IPython ``pre_run_cell`` callback.

  Args:
    info: Optional object that IPython may pass to ``pre_run_cell``
      callbacks. It is accepted only so the function can be called both with
      and without that argument; its value is never read.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
# Register set_css as a 'pre_run_cell' callback so the stylesheet is emitted
# again each time a cell is run.
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Load IPython's autoreload extension and switch it to mode 2 so that edits
# to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### SET ENVIRONMENT VARIABLES:

In [None]:
import os
from google.colab import userdata

def get_openai_api_key():
    """Copy the OpenAI API key from Colab ``userdata`` into the environment.

    Reads the entry named ``OPENAI_API_KEY`` through ``userdata.get``, stores
    it in ``os.environ["OPENAI_API_KEY"]`` and returns the value as read back
    from the environment.
    """
    secret = userdata.get('OPENAI_API_KEY')
    os.environ["OPENAI_API_KEY"] = secret
    return os.environ.get("OPENAI_API_KEY")


def get_hf_api_key():
    """Copy the Hugging Face token from Colab ``userdata`` into the environment.

    Reads the entry named ``HF_TOKEN`` through ``userdata.get``, stores it in
    ``os.environ["HUGGINGFACE_API_KEY"]`` and returns the value as read back
    from the environment.
    """
    secret = userdata.get('HF_TOKEN')
    os.environ["HUGGINGFACE_API_KEY"] = secret
    return os.environ.get("HUGGINGFACE_API_KEY")

# 1. EXPLORING THE WORKINGS OF WINDOW RETRIEVAL COMPONENTS:

### A. The SentenceWindowNodeParser:

In [None]:
from llama_index.core import Document
from llama_index.core.node_parser import SentenceWindowNodeParser

In [None]:
# How many neighbouring sentences are kept on each side of a sentence when
# its context window is built.
window_size = 2

# The parser stores the surrounding window and the sentence itself in each
# node's metadata, under the two keys named here.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=window_size,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

node_parser

SentenceWindowNodeParser(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x79c37e02e3b0>, id_func=<function default_id_func at 0x79c450244d30>, sentence_splitter=<function split_by_sentence_tokenizer.<locals>.split at 0x79c37e027490>, window_size=2, window_metadata_key='window', original_text_metadata_key='original_text')

In [None]:
# Toy document made of 5 short sentences.
txt = 'hello. how are you? I am fine. That is good to hear. I am good too!'

# Wrap the text in a Document and let the parser turn it into nodes.
nodes = node_parser.get_nodes_from_documents([Document(text=txt)])

In [None]:
type(nodes)

list

In [None]:
len(nodes)

5

In [None]:
type(nodes[0])

In [None]:
nodes[0]

TextNode(id_='9b172235-5c49-4419-a14e-2d873883924e', embedding=None, metadata={'window': 'hello.  how are you?  I am fine. ', 'original_text': 'hello. '}, excluded_embed_metadata_keys=['window', 'original_text'], excluded_llm_metadata_keys=['window', 'original_text'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='67cc248c-488f-4a78-aad7-cc96b6f21a05', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='31d6729954681d9227f9d6ee7fa74dd574bef67c7e05f8ded015bb781d9ae5c0'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='c6271c05-18fc-4387-b93a-c08671071225', node_type=<ObjectType.TEXT: '1'>, metadata={'window': 'hello.  how are you?  I am fine.  That is good to hear. ', 'original_text': 'how are you? '}, hash='4e3923bc7fbc3db63920f513a68bb00e2e5b0c9b17ca50110aa6a0fd9c50b728')}, text='hello. ', start_char_idx=0, end_char_idx=7, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [None]:
nodes[0].text

'hello. '

In [None]:
nodes[0].metadata

{'window': 'hello.  how are you?  I am fine. ', 'original_text': 'hello. '}

In [None]:
[x.text for x in nodes]

['hello. ',
 'how are you? ',
 'I am fine. ',
 'That is good to hear. ',
 'I am good too!']

In [None]:
# Show the sentence stored in one node next to the context window stored in
# that same node's metadata.
node_idx = 0

# Both values must be read from nodes[node_idx]; otherwise the printed window
# would not belong to the printed sentence.
original_text = nodes[node_idx].metadata['original_text']
window = nodes[node_idx].metadata['window']

print(f'original_text: {original_text}')
print(f'window: {window}')

original_text: hello. 
window: hello.  how are you?  I am fine. 


In [None]:
# Show the sentence stored in one node next to the context window stored in
# that same node's metadata (here: a node from the middle of the document).
node_idx = 2

# Both values must be read from nodes[node_idx]; otherwise the printed window
# would not belong to the printed sentence.
original_text = nodes[node_idx].metadata['original_text']
window = nodes[node_idx].metadata['window']

print(f'original_text: {original_text}')
print(f'window: {window}')

original_text: I am fine. 
window: hello.  how are you?  I am fine. 


In [None]:
# Show the sentence stored in one node next to the context window stored in
# that same node's metadata (here: the last node of the document).
node_idx = -1

# Both values must be read from nodes[node_idx]; otherwise the printed window
# would not belong to the printed sentence.
original_text = nodes[node_idx].metadata['original_text']
window = nodes[node_idx].metadata['window']

print(f'original_text: {original_text}')
print(f'window: {window}')

original_text: I am good too!
window: hello.  how are you?  I am fine. 


### B. The MetadataReplacementPostProcessor:
This takes a value stored in the metadata and replaces a node's text with that value.  
This is done after retrieving the nodes and before sending the nodes to the LLM.

In [None]:
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor

# Post-processor configured to use the value stored under the "window"
# metadata key.
postproc = MetadataReplacementPostProcessor(target_metadata_key="window")

In [None]:
from llama_index.core.schema import NodeWithScore
from copy import deepcopy

# Wrap every node parsed from the previous example sentence in a
# NodeWithScore carrying a fixed score of 1.0.
scored_nodes = [NodeWithScore(node=node, score=1.0) for node in nodes]
# Keep an independent deep copy of each node so the pre-processing text can
# still be inspected afterwards.
nodes_old = [deepcopy(node) for node in nodes]

In [None]:
nodes_old[1].text

'how are you? '

In [None]:
# Run the metadata-replacement post-processor over the scored nodes.
# NOTE(review): nodes_old was deep-copied beforehand, which suggests this call
# modifies the nodes in place — confirm before relying on `nodes` afterwards.
replaced_nodes = postproc.postprocess_nodes(scored_nodes)

# Display the second node's text after replacement (compare with nodes_old[1].text).
replaced_nodes[1].text

'hello.  how are you?  I am fine.  That is good to hear. '

### C. The SentenceTransformerRerank model:

In [None]:
from llama_index.core.indices.postprocessor import SentenceTransformerRerank

# Reranker model: BAAI/bge-reranker-base
# Model card: https://huggingface.co/BAAI/bge-reranker-base
# top_n=2 presumably limits the result to the two best-ranked nodes — confirm
# against the LlamaIndex documentation.
rerank = SentenceTransformerRerank(
    model="BAAI/bge-reranker-base",
    top_n=2,
)

In [None]:
from llama_index.core import QueryBundle
from llama_index.core.schema import TextNode, NodeWithScore

query = QueryBundle("I want a dog.")

# The starting scores are deliberately "wrong": the node mentioning a dog is
# the relevant one for this query, yet it is given the lower score. The
# reranker is expected to correct that ordering.
scored_nodes = [
    NodeWithScore(node=TextNode(text=text), score=score)
    for text, score in (("This is a cat", 0.6), ("This is a dog", 0.4))
]

In [None]:
# Re-score the two demo nodes against the query with the reranker.
reranked_nodes = rerank.postprocess_nodes(scored_nodes, query_bundle=query)

In [None]:
print([(x.text, x.score) for x in reranked_nodes])

[('This is a dog', 0.9182736), ('This is a cat', 0.0014040753)]
