<a href="https://colab.research.google.com/github/vsingh9076/Generative_AI/blob/main/Data_Preparation/Tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import tiktoken
import math


def get_token_size(document, model):
    """Return the number of tokens in ``document`` under ``model``'s encoding.

    The encoding is looked up with ``tiktoken.encoding_for_model(model)``
    and the count is the length of the encoded token list.
    """
    return len(tiktoken.encoding_for_model(model).encode(document))

def naive_chunker(document, chunk_size, model):
    """Split ``document`` into consecutive chunks of ``chunk_size`` tokens.

    The document is tokenized with the encoding tiktoken associates with
    ``model``; each run of ``chunk_size`` tokens is decoded back to text.
    Every chunk holds exactly ``chunk_size`` tokens except possibly the
    last one, which holds whatever remains.

    Args:
        document: Text to split.
        chunk_size: Number of tokens per chunk; must be positive.
        model: Model name understood by ``tiktoken.encoding_for_model``.

    Returns:
        List of decoded text chunks in document order. Empty when the
        document encodes to zero tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # A negative step would make range() empty and silently drop the whole
    # document; a zero step raises an unhelpful range() error. Fail clearly.
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    tokenizer = tiktoken.encoding_for_model(model)
    document_tokens = tokenizer.encode(document)

    return [
        tokenizer.decode(document_tokens[start:start + chunk_size])
        for start in range(0, len(document_tokens), chunk_size)
    ]


def auto_chunker(document, max_chunk_size, model):
    """Split ``document`` into the fewest chunks that fit, balanced in size.

    The document is tokenized with the encoding tiktoken associates with
    ``model``. The number of chunks is the smallest count for which no
    chunk exceeds ``max_chunk_size`` tokens; the tokens are then spread
    over those chunks so that chunk sizes differ by at most one token,
    with the longer chunks placed first.

    Args:
        document: Text to split.
        max_chunk_size: Upper bound on tokens per chunk; must be positive.
        model: Model name understood by ``tiktoken.encoding_for_model``.

    Returns:
        List of decoded text chunks in document order. Empty when the
        document encodes to zero tokens.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive number")

    tokenizer = tiktoken.encoding_for_model(model)
    document_tokens = tokenizer.encode(document)
    document_size = len(document_tokens)

    # With no tokens the chunk count would be zero and the size computation
    # below would divide by zero; an empty document simply has no chunks.
    if document_size == 0:
        return []

    # Fewest chunks that keep every chunk within max_chunk_size.
    chunk_count = math.ceil(document_size / max_chunk_size)
    # Every chunk gets base_size tokens; the first longer_chunk_count chunks
    # get one extra, so the sizes always add up to document_size.
    base_size, longer_chunk_count = divmod(document_size, chunk_count)

    chunks = []
    chunk_start = 0
    for index in range(chunk_count):
        extra = 1 if index < longer_chunk_count else 0
        chunk_end = chunk_start + base_size + extra
        chunks.append(tokenizer.decode(document_tokens[chunk_start:chunk_end]))
        chunk_start = chunk_end

    return chunks

In [5]:
# Model name handed to every tokenizer helper call made by comparison().
MODEL='gpt-3.5-turbo'

def comparison(test_doc, chunk_size):
    """Print the per-chunk token counts of both chunkers for ``test_doc``.

    Reports the document's token count and the requested chunk size, then
    the list of chunk token counts produced by ``naive_chunker`` and by
    ``auto_chunker``, followed by a separator line. All token counts use
    the module-level ``MODEL``.
    """
    def token_counts(chunk_list):
        # Token count of each decoded chunk, measured by re-encoding it.
        return [get_token_size(chunk, MODEL) for chunk in chunk_list]

    print('document_size:', get_token_size(test_doc, MODEL))
    print('max_chunk_size:', chunk_size)
    print('Naive chunking size list: ',
          token_counts(naive_chunker(test_doc, chunk_size, MODEL)))
    print('Auto chunking size list: ',
          token_counts(auto_chunker(test_doc, chunk_size, MODEL)))
    print('----------------------------------------')


# Maximum number of tokens per chunk used for every comparison below.
CHUNK_SIZE=5
# Building block for the test documents; in the recorded output a document
# of 10 repetitions reports document_size 10, i.e. one token per "abcd".
single_token_text="abcd"

# Compare both chunkers on documents of 10 through 15 repetitions, covering
# sizes that are and are not exact multiples of CHUNK_SIZE.
for i in range(10, 16):
    test_doc=single_token_text*i
    print('document:',test_doc)
    comparison(test_doc, chunk_size=CHUNK_SIZE)

document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd
document_size: 10
max_chunk_size: 5
Naive chunking size list:  [5, 5]
Auto chunking size list:  [5, 5]
----------------------------------------
document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd
document_size: 11
max_chunk_size: 5
Naive chunking size list:  [5, 5, 1]
Auto chunking size list:  [4, 4, 3]
----------------------------------------
document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd
document_size: 12
max_chunk_size: 5
Naive chunking size list:  [5, 5, 2]
Auto chunking size list:  [4, 4, 4]
----------------------------------------
document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd
document_size: 13
max_chunk_size: 5
Naive chunking size list:  [5, 5, 3]
Auto chunking size list:  [5, 4, 4]
----------------------------------------
document: abcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcdabcd
document_size: 14
max_chunk_size: 5
Naive chunking size list:  [5, 5, 4]
Auto chunking size list:  [5, 5, 4]
--