# Counting Tokens

This notebook is dedicated to exploring the number of tokens in the papers/excerpts that we will be analyzing.

In [1]:
import tiktoken
import pandas as pd
import numpy as np
import re

In [2]:
encoding = tiktoken.encoding_for_model('text-embedding-3-small')

In [3]:
papers = pd.read_csv('../papers.csv')

In [4]:
def n_tokens(text):
    try:
        n_tokens = len(encoding.encode(text))
    except:
        n_tokens = np.nan
    return n_tokens

In [5]:
papers['n_tokens'] = papers.text.apply(n_tokens)

In [6]:
papers.n_tokens.describe()

count     20277.000000
mean      10168.549835
std        6816.756597
min           1.000000
25%        6374.000000
50%        9823.000000
75%       12609.000000
max      299550.000000
Name: n_tokens, dtype: float64

In [7]:
max_tokens = 8000

In [8]:
# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens):

    # Split the text into sentences
    sentences = re.split(r"\.\s+", text)

    # Get the number of tokens for each sentence
    n_tokens = [len(encoding.encode(" " + sentence)) for sentence in sentences]

    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    return chunks

In [9]:
shortened = []

In [10]:
# Loop through the dataframe
for row in papers.iterrows():

    # If the text is None, go to the next row
    if row[1]['text'] is None:
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row[1]['n_tokens'] > max_tokens:
        shortened += split_into_many(row[1]['text'])

    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append( row[1]['text'] )

In [11]:
len(shortened)

22997

In [12]:
papers.shape

(20286, 4)

In [13]:
papers_shortened = pd.DataFrame(shortened, columns=['text'])
papers_shortened['n_tokens'] = papers_shortened.text.apply(n_tokens)
papers_shortened.n_tokens.describe()

count    22988.000000
mean      6875.241039
std       1878.560148
min          1.000000
25%       6584.750000
50%       7899.000000
75%       7945.000000
max       8001.000000
Name: n_tokens, dtype: float64

In [14]:
papers_shortened.head()

Unnamed: 0,text,n_tokens
0,573 \n\nBIT - SERIAL NEURAL NETWORKS \n\nAlan...,7959.0
1,1 \n\nCONNECTIVITY VERSUS ENTROPY \n\nYaser S...,5220.0
2,278 \n\nTHE HOPFIELD MODEL WITH MUL TI-LEVEL N...,4445.0
3,442 \n\nAlan Lapedes \nRobert Farber \n\nThe...,7942.0
4,740 \n\nSPATIAL ORGANIZATION OF NEURAL NEn...,7980.0


In [15]:
papers_shortened.shape

(22997, 2)

In [None]:
papers_shortened.head()

In [None]:
from openai import OpenAI
client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
   try:
      embedding = client.embeddings.create(input = [text], model=model).data[0].embedding
   except:
      embedding = None
   return embedding

papers_shortened['embedding'] = papers_shortened.text.apply(lambda x: get_embedding(x))

In [None]:
papers_shortened.shape

In [None]:
papers.head()

In [None]:
papers_shortened.to_csv('papers_embedded.csv')

In [None]:
papers_shortened.isna().sum()

In [None]:
papers_shortened.head()