# RAG from scratch

### References

* https://towardsdatascience.com/local-rag-from-scratch-3afc6d3dea08
* https://iamajithkumar.medium.com/how-to-use-chroma-to-build-your-first-similarity-search-5c054bfd5add


### Chunky

In [73]:
import re
import os
import uuid

from transformers import AutoTokenizer, AutoModel

In [74]:
from transformers import AutoModel, AutoTokenizer
import torch

model_name = "BAAI/bge-small-en-v1.5"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

tokenizer.save_pretrained("model/tokenizer")
model.save_pretrained("model/embedding")

In [75]:
def recursivelyChunkTillOptimumSize(
        chunks, 
        tokenizer, 
        chunk_size = 1024, 
        separator = None, 
        secondary_chunking_regex = None ):

    modifiedChunks = []
    refined = []
    current_chunk = ""
    # Split all words of size greater than chunk size into smaller chunks
    while len(chunks) > 0:
        chunk = chunks.pop(0)
        chunk = chunk.strip()

        if len(tokenizer.tokenize(chunk)) > chunk_size:
            sub_chunks = re.split(secondary_chunking_regex, current_chunk)
            chunks = sub_chunks + chunks
        else:
            modifiedChunks.append(chunk)

    # Join small chunks ot make big chunks
    current_chunk = ""
    for chunk in modifiedChunks:
        new_chunk = current_chunk + (separator if current_chunk else '') + chunk

        if len(tokenizer.tokenize(new_chunk)) <= chunk_size:
            current_chunk = new_chunk
        else:
            if current_chunk:
                refined.append(current_chunk)
            current_chunk = chunk

    if current_chunk:
        refined.append(current_chunk)

    return refined    
    
def chunk( 
        text, 
        tokenizer, 
        paragraph_separator = "\n\n", 
        chunk_size = 1024, 
        separator = " ", 
        secondary_chunking_regex = r'\S+?[\.,;!?]',
        chunk_overlap = 0 ):
    # Divide documents into paragraphs
    paragraphs = re.split(paragraph_separator, text)
    all_chunks = []

    for paragraph in paragraphs:
        words = paragraph.split(separator)
        chunks = recursivelyChunkTillOptimumSize(
            words, 
            tokenizer, 
            chunk_size = chunk_size, 
            separator = separator, 
            secondary_chunking_regex = secondary_chunking_regex 
        )
        all_chunks.extend(chunks)

    if chunk_overlap == 0:
        return all_chunks

    final_chunks = []
    for idx, chunk in enumerate(all_chunks):
        if idx == 0:
            final_chunks.append(chunk)
            continue
        # split the overlap between end of last chunk and start of next chunk
        overlap_chunk = all_chunks[idx - 1][overlap_chunk // 2: ] + all_chunks[idx][0: overlap_chunk // 2]
        final_chunks.extend( [overlap_chunk, chunk] )

    return final_chunks    

### Indexing

* Statisticals -> bag of words, sparse locations, etc
* Machine learned -> embedding

In [76]:
def compute_embeddings(chunk, tokenizer, model):
    inputs = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True) 
    
    # Generate the embeddings 
    with torch.no_grad():    
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).squeeze()

    return embeddings.tolist()

In [88]:
import chromadb
chroma_client = chromadb.PersistentClient(path="vector_store")
collection = chroma_client.create_collection(name="embeddings", metadata={"hnsw:space": "cosine"}, get_or_create = True)


In [89]:
text = "Traditional sleigh bed crafted in rich walnut wood, featuring a curved headboard and footboard with intricate grain details. Queen size, includes a plush, supportive mattress. Produced by Heritage Bed Co. Dimensions: 65\"W x 85\"L x 50\"H."
chunks = chunk(text, tokenizer, chunk_overlap = 512)
for created_chunk in chunks:
    embeddings = compute_embeddings(created_chunk, tokenizer, model)
    collection.add(str(uuid.uuid1()), embeddings, {"sku": 1}, created_chunk )

In [100]:
text = "Size of the Traditional sleigh bed with walnut wood"
chunks = chunk(text, tokenizer, chunk_overlap = 512)
embeddings = []

for created_chunk in chunks:
    embedding = compute_embeddings(created_chunk, tokenizer, model)
    embeddings.append(embedding)

res = collection.query(
    query_embeddings=embeddings,
    n_results=100,
    include=['distances', 'documents', 'metadatas']
)


Number of requested results 100 is greater than number of elements in index 1, updating n_results = 1


In [96]:
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

qa_tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
qa_model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [106]:
def generateResponse(question, context, tokenizer, model):
    print(question)
    print(context)
    input_ids = tokenizer.encode(question, context)
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    sep_idx = tokens.index('[SEP]')
    token_type_ids = [0 for i in range(sep_idx+1)] + [1 for i in range(sep_idx+1,len(tokens))]

    out = model(torch.tensor([input_ids]), # The tokens representing our input text.
                token_type_ids=torch.tensor([token_type_ids]))

    start_logits,end_logits = out['start_logits'],out['end_logits']
    answer_start = torch.argmax(start_logits)
    answer_end = torch.argmax(end_logits)
    ans = ''.join(tokens[answer_start:answer_end])

    return ans


In [108]:
print(
    generateResponse(
        "What is the dimensions of the walnut sleigh", 
        "".join(res['documents'][0]),
        qa_tokenizer,
        qa_model
    )
)


What is the dimensions of the walnut sleigh
Traditional sleigh bed crafted in rich walnut wood, featuring a curved headboard and footboard with intricate grain details. Queen size, includes a plush, supportive mattress. Produced by Heritage Bed Co. Dimensions: 65"W x 85"L x 50"H.
65"wx85"lx50"
