In [None]:
# --- Inputs -----------------------------------------------------------------
# Every input is optional; leave it as None to skip it.

# A literal string to count tokens for
text = None
# Path of a single file to count tokens for
file_path = None
# Path of a folder; files in it and in its subfolders are counted
folder_path = None
# File-name fragments: a file is counted when its name contains at least one
# of them (e.g. ['.md']). Set to None to count every file in the folder.
file_filter = ['.md']

# --- Model ------------------------------------------------------------------
# Name of the model whose tokenizer is used to count tokens.
# - Hugging Face models: the user/model name from the Hugging Face model hub,
#   see https://huggingface.co/models
# - OpenAI models: the model name from the OpenAI API, see
#   https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/model.py#L22-L72
model_name = 'microsoft/Phi-3.5-mini-instruct'
# model_name = 'gpt-4o'

In [None]:
import os
import pandas as pd
import tiktoken
from tokenizers import Tokenizer

# Module-level tokenizer state, filled in once the model has been selected
# (only one of the two tokenizers is set, depending on the model name):
#   tokenizer_hf - tokenizer from the Hugging Face model hub
#   encoding_oai - tokenizer (encoding) for an OpenAI model
#   tokenizer_fn - function that calculates the number of tokens in a text
tokenizer_hf = encoding_oai = tokenizer_fn = None

def get_num_tokens_hf(text: str) -> int:
    """
    Count the tokens of a text with the Hugging Face tokenizer in ``tokenizer_hf``

    The text is encoded with the tokenizer's default settings and the length
    of the resulting token list is returned.
    """
    return len(tokenizer_hf.encode(text).tokens)

def get_num_tokens_oai(text: str) -> int:
    """
    Get number of tokens in a text using OpenAI tokenizer

    Args:
        text: Text to tokenize with the module-level ``encoding_oai``.

    Returns:
        Number of token ids produced for the text.
    """
    # By default tiktoken raises ValueError when the text contains the literal
    # form of a special token (e.g. '<|endoftext|>'). When counting tokens of
    # arbitrary documents such strings are ordinary content, so disable that
    # check and let them be encoded as normal text.
    encoded = encoding_oai.encode(text, disallowed_special=())
    return len(encoded)

def get_num_tokens_from_file(file_path: str) -> int:
    """
    Get number of tokens in a text file

    The whole file is read, decoded as UTF-8 and passed to the currently
    selected ``tokenizer_fn``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        Number of tokens ``tokenizer_fn`` reports for the file content.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: without an encoding argument open() falls
    # back to the platform's locale encoding, so the same file could decode
    # differently (or fail to decode) depending on the machine.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return tokenizer_fn(text)

def get_num_tokens_from_folder(folder_path: str, file_filter=None) -> list:
    """
    Count the tokens of the files found under a folder

    The folder is walked recursively. When ``file_filter`` is given, a file is
    counted only if its name contains at least one of the filter strings;
    every other file is reported as skipped. With ``file_filter=None`` all
    files are counted.

    Returns a list of ``(file_path, tokens)`` tuples in the order the files
    were visited.
    """
    results = []
    for current_dir, _, names in os.walk(folder_path):
        for name in names:
            # No filter means every file is wanted
            wanted = file_filter is None
            if not wanted:
                wanted = any(fragment in name for fragment in file_filter)
            if not wanted:
                print(f'Skipping {name}')
                continue
            full_path = os.path.join(current_dir, name)
            results.append((full_path, get_num_tokens_from_file(full_path)))
    return results

# Pick the tokenizer backend from the model name: a name containing '/' is
# loaded as a Hugging Face "user/model" name, any other name is looked up
# as an OpenAI model.
if '/' not in model_name:
    encoding_oai = tiktoken.encoding_for_model(model_name)
    tokenizer_fn = get_num_tokens_oai
else:
    tokenizer_hf = Tokenizer.from_pretrained(model_name)
    tokenizer_fn = get_num_tokens_hf

In [None]:
# Count the tokens of the configured string; skipped when `text` is None or empty
if text:
    num_tokens = tokenizer_fn(text)
    print(f'Number of tokens in text: {num_tokens}')

In [None]:
# Count the tokens of the configured file; skipped when `file_path` is None or empty
if file_path:
    num_tokens = get_num_tokens_from_file(file_path)
    print(f'{file_path}: {num_tokens}')

In [None]:
if folder_path:
    # Count tokens for every matching file and order the results by file path
    tokens_info = get_num_tokens_from_folder(folder_path, file_filter)
    tokens_info_df = pd.DataFrame(tokens_info, columns=['file', 'tokens']).sort_values(by='file')

    # pandas display settings: keep columns on a single line, cap the column
    # width at 100 characters and put no limit on the number of rows
    pd.set_option('display.expand_frame_repr', False)
    pd.set_option('display.max_colwidth', 100)
    pd.set_option('display.max_rows', None)

    print(f'Tokens in files in folder {folder_path} using model {model_name}:\n')
    print(tokens_info_df.to_string(index=False))