# Counting Tokens

This notebook counts the tokens in the papers/excerpts that we will be analyzing, splits any text that exceeds the token limit into smaller chunks, and computes an embedding for each resulting text.

In [None]:
import tiktoken
import pandas as pd
import numpy as np
import re

In [None]:
# Tokenizer that tiktoken associates with the 'text-embedding-3-small' model;
# the token-counting and chunking functions below encode text with it.
encoding = tiktoken.encoding_for_model('text-embedding-3-small')

In [None]:
# Load the papers table from the parent directory; later cells read its `text` column.
papers = pd.read_csv('../papers.csv')

In [None]:
def n_tokens(text):
    """Return the number of tokens in ``text`` according to the notebook's ``encoding``.

    Parameters
    ----------
    text : str
        The text to tokenize. Any non-string value (for example the NaN that
        pandas stores for a missing cell) is treated as "no text".

    Returns
    -------
    int or float
        The token count, or ``np.nan`` when ``text`` is not a string or the
        encoder refuses it.
    """
    # Missing values are not strings, so there is nothing to tokenize
    if not isinstance(text, str):
        return np.nan

    try:
        return len(encoding.encode(text))
    except Exception:
        # Best effort: a text the encoder rejects is reported as NaN rather than
        # aborting the whole column. Unlike a bare `except:`, this still lets
        # KeyboardInterrupt / SystemExit propagate.
        return np.nan

In [None]:
# Store each paper's token count in a new `n_tokens` column (NaN where counting failed)
papers['n_tokens'] = papers.text.apply(n_tokens)

In [None]:
# Summary statistics of the per-paper token counts
papers.n_tokens.describe()

In [None]:
# Largest number of tokens allowed in a single text before it is split into chunks.
# NOTE(review): presumably chosen to stay under the embedding model's input limit — confirm.
max_tokens = 8000

In [None]:
# Append the sentences of one chunk to `chunks` as a single period-terminated string
def _append_chunk(chunks, sentences):
    # Nothing but empty/blank sentences: do not emit a chunk that is just "."
    if not any(sentence.strip() for sentence in sentences):
        return

    joined = ". ".join(sentences).rstrip()

    # The last sentence of a text usually keeps its own period (the split only
    # consumes periods that are followed by whitespace), so avoid ending in ".."
    chunks.append(joined if joined.endswith(".") else joined + ".")


# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens):
    """Split ``text`` into sentence-aligned chunks of at most about ``max_tokens`` tokens.

    Parameters
    ----------
    text : str
        The text to split. Sentences are delimited by a period followed by whitespace.
    max_tokens : int
        Token budget per chunk, measured with the notebook's ``encoding``.

    Returns
    -------
    list of str
        The chunks in their original order, including the final, partially
        filled one. A single sentence that alone exceeds ``max_tokens`` cannot
        fit in any chunk and is left out.
    """

    # Split the text into sentences
    sentences = re.split(r"\.\s+", text)

    # Get the number of tokens for each sentence (counted with the leading space
    # it will have once joined to the previous sentence)
    token_counts = [len(encoding.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, token_counts):

        # If adding this sentence would exceed the budget, close the current chunk
        # and start a new one
        if tokens_so_far + token > max_tokens:
            _append_chunk(chunks, chunk)
            chunk = []
            tokens_so_far = 0

        # A sentence that is larger than the whole budget can never fit: skip it
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to
        # the total (the extra 1 presumably accounts for the ". " separator)
        chunk.append(sentence)
        tokens_so_far += token + 1

    # Keep whatever is left over; without this the tail of every text would be lost
    _append_chunk(chunks, chunk)

    return chunks

In [None]:
# Collects the texts to embed: short texts as-is, long texts as several chunks
shortened = []

In [None]:
# Start from an empty list so that re-running this cell does not duplicate entries
shortened = []

# Loop through the dataframe
for text, token_count in zip(papers['text'], papers['n_tokens']):

    # If the text is missing, go to the next row. pandas represents missing
    # values as NaN rather than None, so `is None` would not catch them.
    if pd.isna(text):
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if token_count > max_tokens:
        shortened.extend(split_into_many(text, max_tokens=max_tokens))

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append(text)

In [None]:
# Number of texts after splitting
len(shortened)

In [None]:
# (rows, columns) of the original papers table, for comparison with the count above
papers.shape

In [None]:
# Put the collected texts into a one-column dataframe and count the tokens of each
papers_shortened = pd.DataFrame(shortened, columns=['text'])
papers_shortened['n_tokens'] = papers_shortened.text.apply(n_tokens)
# Summary statistics of the token counts after splitting
papers_shortened.n_tokens.describe()

In [None]:
# Preview the first rows of the shortened texts and their token counts
papers_shortened.head()

In [None]:
# (rows, columns) of the shortened table
papers_shortened.shape

In [None]:
# Preview the shortened table once more before requesting embeddings
papers_shortened.head()

In [None]:
from openai import OpenAI
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text``, or ``None`` if it cannot be obtained.

    Parameters
    ----------
    text : str
        The text to embed.
    model : str
        Name of the embedding model passed to the embeddings endpoint.

    Returns
    -------
    list or None
        The ``embedding`` field of the first item in the response, or ``None``
        when ``text`` is missing/empty or the request fails.
    """
    # A missing value (NaN/None) or an empty string has nothing to embed,
    # so do not spend a request on it
    if not isinstance(text, str) or not text:
        return None

    try:
        return client.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception as err:
        # Best effort: one failed request must not abort the whole column, but
        # report it so that None embeddings in the output can be explained.
        # Unlike a bare `except:`, KeyboardInterrupt still stops the run.
        print(f"Embedding request failed: {err!r}")
        return None

papers_shortened['embedding'] = papers_shortened.text.apply(get_embedding)

In [None]:
# (rows, columns) of the shortened table now that the `embedding` column was added
papers_shortened.shape

In [None]:
# Preview the original papers table
papers.head()

In [None]:
# Save the texts, token counts and embeddings to papers_embedded.csv in the working directory.
# NOTE(review): to_csv writes the row index as an extra first column by default — confirm
# that whatever reads this file expects it.
papers_shortened.to_csv('papers_embedded.csv')

In [None]:
# Count missing values per column; an embedding is None when get_embedding failed
papers_shortened.isna().sum()

In [None]:
# Final preview of the table, including the `embedding` column
papers_shortened.head()