# ws_version_ch2.ipynb
## WESmith 03/06/24

## Working through the book's chapter 2 in a personalized way.

## TOKENIZATION

In [None]:
# Importing the tiktoken library
import tiktoken

# Initializing a tokenizer for the 'cl100k_base' model
# This tokenizer is designed to work with the 'ada-002' embedding model
# The name `tokenizer` is reused by the later cells and by overlapping_chunks()
tokenizer = tiktoken.get_encoding("cl100k_base")

# Using the tokenizer to encode a short sample sentence
# The resulting output is a list of integers representing the encoded text
# This is the input format required for embedding using the 'ada-002' model
tokenizer.encode('hey there, you, and you')  # WS mods

## READING PDF

In [None]:
import PyPDF2
from tqdm import tqdm  # tqdm is a progress meter

# Candidate input PDFs; only ws_file is opened below
author_file = '../data/pds2.pdf'                  # WS large, 16M
ws_file     = '../data/journal.pone.0000404.pdf'  # WS much smaller, 561K

pages = []  # WS raw extracted text of every page, in page order

# Open the PDF file in read-binary mode
with open(ws_file, 'rb') as file:                 # WS

    # Create a PDF reader object
    reader = PyPDF2.PdfReader(file)

    # Per-page text after the optional ' ]' prefix has been removed
    page_bodies = []

    # Loop through each page in the PDF file
    for page in tqdm(reader.pages):
        text = page.extract_text()
        pages.append(text)   # WS

        # WS The book's code kept text[text.find(' ]')+2:] unconditionally.
        # WS str.find() returns the index of the match, or -1 when there is no match,
        # WS so on a page without ' ]' that slice was text[1:] and the first character
        # WS of the page was silently lost. Skip past the marker only when it is present.
        # WS (The author's pdf may contain ' ]' markers; this pdf has none.)
        marker = text.find(' ]')
        page_bodies.append(text[marker + 2:] if marker != -1 else text)

# Every page is preceded by a blank line; join once instead of growing a string with +=
file_txt = ''.join('\n\n' + body for body in page_bodies)

# Print the number of characters of text collected from the PDF file
print(len(file_txt))

In [None]:
# Tokenize the first page on its own, then show its first 70 characters next to its first 10 token ids
dd = tokenizer.encode(pages[0])  # WS works
pages[0][:70], dd[:10]  #WS

In [None]:
# Tokenize the concatenated text of all pages, then show its first 50 characters next to its first 10 token ids
ee = tokenizer.encode(file_txt)  # WS
file_txt[:50], ee[:10]

In [None]:
# Token ids for the word 'Subspace'; the cells below encode the fragments 'ub' and 'space' for comparison
tokenizer.encode('Subspace')

In [None]:
# Token ids for the two-newline separator that precedes every page in file_txt
tokenizer.encode('\n\n')

In [None]:
# Token ids for the fragment 'ub'
tokenizer.encode('ub')

In [None]:
# Token ids for the fragment 'space'
tokenizer.encode('space')

## HASHING

In [None]:
import hashlib

def my_hash(s):
    '''
    s: string to hash

    Returns the MD5 digest of s (encoded with str.encode() defaults) as a
    32-character lowercase hexadecimal string.
    '''
    hasher = hashlib.md5()
    hasher.update(s.encode())
    return hasher.hexdigest()

# Example call: display the hex digest of a sample string
my_hash('I love to hash it')

## OVERLAPPING CHUNKS

In [None]:
import re

# Function to split the text into chunks of a maximum number of tokens. Inspired by OpenAI
def overlapping_chunks(text, max_tokens = 500, overlapping_factor = 5):
    '''
    Split text into sentence-aligned chunks of roughly max_tokens tokens each.

    text: string to split; sentences are delimited by '.', '?' or '!'
    max_tokens: tokens we want per chunk
    overlapping_factor: number of sentences to start each chunk with that overlaps with the previous chunk
                        (0 disables the overlap)

    Returns a list of strings. Each chunk is its sentences joined with '. ' plus a closing '.',
    so the original '?' and '!' terminators are not preserved.

    A sentence that alone needs more than max_tokens tokens is dropped.
    A chunk can still exceed max_tokens when the carried-over overlap plus the next
    sentence is larger than the budget.

    Token counts come from the module-level `tokenizer`.
    '''

    # Split the text using punctuation. Strip every piece and drop the blank ones
    # (re.split yields '' after a trailing terminator) so the re-joined chunks do not
    # contain doubled spaces or stray ' .' pseudo-sentences
    pieces = [piece.strip() for piece in re.split(r'[.?!]', text)]
    sentences = [piece for piece in pieces if piece]

    # Get the number of tokens for each sentence (the leading space mirrors the '. ' joiner)
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    chunk = []          # (sentence, token count) pairs of the chunk being built
    tokens_so_far = 0   # budget already used by `chunk`
    unflushed = 0       # sentences in `chunk` that no emitted chunk contains yet

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # The current sentence does not fit: emit the chunk and seed the next one with
        # the overlap. Nothing is emitted when no new sentence was added since the last
        # emit; otherwise an oversized sentence would produce a bare '.' chunk or a
        # duplicate chunk made only of overlap sentences
        if tokens_so_far + token > max_tokens and unflushed:
            chunks.append(". ".join(kept for kept, _ in chunk) + ".")
            chunk = chunk[-overlapping_factor:] if overlapping_factor > 0 else []
            # Reuse the stored counts so the overlap is charged exactly like new
            # sentences (count + 1 each) without tokenizing it a second time
            tokens_so_far = sum(count + 1 for _, count in chunk)
            unflushed = 0

        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append((sentence, token))
        tokens_so_far += token + 1
        unflushed += 1

    # Emit the tail only if it holds sentences that were not emitted already
    if unflushed:
        chunks.append(". ".join(kept for kept, _ in chunk) + ".")

    return chunks

In [None]:
# Chunk the PDF text with the default settings (max_tokens=500, overlapping_factor=5)
chunks = overlapping_chunks(file_txt)

In [None]:
# Number of chunks produced from the PDF text
len(chunks)

In [None]:
# Token count of the chunk at index 10, to compare against the max_tokens budget
len(tokenizer.encode(chunks[10]))

In [None]:
# Re-split the first two chunks with the same punctuation pattern overlapping_chunks() uses,
# so the end of chunk 0 can be compared with the start of chunk 1 in the next two cells
sentences_1 = re.split(r'[.?!]', chunks[0])
sentences_2 = re.split(r'[.?!]', chunks[1])

In [None]:
# Print the last seven pieces of the first chunk, one per paragraph
for k in sentences_1[-7:]: print(k + '\n')  # WS overlap is at the level of sentences

In [None]:
# Print the first seven pieces of the second chunk, for comparison with the tail of the first chunk
for k in sentences_2[:7]: print(k + '\n')

## TEXT SOURCES FROM INTERNET

In [None]:
from urllib.request import urlopen

# A textbook about insects
# The response is used as a context manager so the HTTP connection is closed once the body is read
with urlopen('https://www.gutenberg.org/cache/epub/10834/pg10834.txt') as response:
    text = response.read().decode()

In [None]:
# Second Project Gutenberg text; the response is used as a context manager so the
# HTTP connection is closed once the body is read
with urlopen('https://www.gutenberg.org/cache/epub/30155/pg30155.txt') as response:
    Einstein = response.read().decode()

In [None]:
# Preview characters 1000 through 2999 of the second downloaded text
Einstein[1000:3000]

In [None]:
# Preview the first 1000 characters of the insect textbook
text[:1000]

In [None]:
# Chunk the downloaded text with the sentence overlap switched off,
# then report how many chunks there are and their mean size in tokens
split = overlapping_chunks(text, overlapping_factor=0)
avg_length = sum(map(len, map(tokenizer.encode, split))) / len(split)
print(f'non-overlapping chunking approach has {len(split)} documents with average length {avg_length:.1f} tokens')

## STATISTICS

In [None]:
# Importing the Counter and re libraries
from collections import Counter
import re

# Find all runs of one or more whitespace characters in 'text'
matches = re.findall(r'[\s]{1,}', text)

# The 30 most frequent whitespace runs that occur in the document
most_common_spaces = Counter(matches).most_common(30)

# Print the most common whitespace runs and their frequencies
print(most_common_spaces)

In [None]:
# Split on a custom delimiter and keep only the pieces longer than 50 characters
split = [piece for piece in text.split('\r\n\r\n') if len(piece) > 50]  # WS using \r\n\r\n for this text sample

# Mean size of the kept pieces, measured in tokens
avg_length = sum(map(len, map(tokenizer.encode, split))) / len(split)
print(f'custom delimiter approach has {len(split)} documents with average length {avg_length:.1f} tokens')

In [None]:
# Inspect one of the kept pieces (index 20) from the custom-delimiter split
split[20]

## SENTENCE EMBEDDING

In [None]:
# see https://pypi.org/project/sentence-transformers/
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model named "all-MiniLM-L6-v2"; `model` is reused by the cells below
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Three example sentences, embedded in a single encode() call;
# the next cell pairs each sentence with its entry in sentence_embeddings
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",]
sentence_embeddings = model.encode(sentences)

In [None]:
# Print each example sentence followed by the first 20 values of its embedding
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding[0:20])
    print("")

In [None]:
# Length of the embedding produced for the second example sentence
len(sentence_embeddings[1])

In [None]:
# Show the second chunk of the PDF text before embedding the chunks
chunks[1]

In [None]:
# Number of PDF chunks that will be embedded
len(chunks)

In [None]:
# Embed every PDF chunk with the same model used for the example sentences
chunks_encoded = model.encode(chunks)

In [None]:
# Number of embeddings returned; compare with len(chunks) above
len(chunks_encoded)

In [None]:
# Element count of the embedding for the chunk at index 3
# (presumably a NumPy array, given the .size attribute - TODO confirm)
chunks_encoded[3].size