This is a notebook for quickly calculating embeddings for a Pandas DataFrame using OpenAI's API.  
It serves a practical purpose: I calculate a lot of embeddings, and doing so can take a long time.  

I found that simply using the `apply` function takes ages, because each cell results in a separate API call.  
Another issue is that you usually need to be careful about the size of the texts.  
The last issue is that when running in Colab/Databricks you have limited resources.  

This notebook addresses all three by:
1. taking pandas rows in groups (the group size is controlled with **df_batch_limit**)
2. generating sliding windows for each row in a group (controlled with **window_size** and **window_overlap**; be careful: **window_size** cannot be larger than the API's maximum input size, and the notebook assumes you know that limit)
3. packing the sliced group into batches that are sent to OpenAI's API (at the moment it supports up to 1000 texts at once; you can change that with **api_batch_limit**)  
   Note: the chunks of a single row always go into the same batch, which causes a small inefficiency and a limitation, because batches are not always fully packed. This can be a problem in 2 cases:
   - if texts are very long, the batches might have big "holes" at the end
   - if any single text (chunked into windows) produces more windows than **api_batch_limit**, this notebook will simply not work
4. redistributing the results for those batches back to their rows (since each row was divided into windows, the row embedding is the mean of its window embeddings, divided by its L2 norm)

In [None]:
# Install the `datasets` and `tiktoken` packages that are imported in the next cell.
%pip install datasets tiktoken

In [None]:
import os
from google.colab import userdata
import numpy as np
import pandas as pd
import tiktoken
from datasets import load_dataset
from typing import Literal
from scipy.spatial import distance

In [None]:
# Copy the API key from Colab's user-data store into the process environment,
# so that the `OpenAI()` client (constructed without arguments further down) can pick it up.
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [None]:
from openai import OpenAI
# Shared API client used by the embedding code further down; it is created
# without explicit credentials.
client = OpenAI()

""" Example usage
response = client.embeddings.create(
    input=["Your text string goes here",
           "some other text"],
    model="text-embedding-3-small"
)

print(response.data[0].embedding)
"""

In [None]:
# Load the "product" configuration of the example dataset; its 'train' split
# is converted to a DataFrame in the next cell.
ds = load_dataset("qgyd2021/e_commerce_customer_service", "product")

In [None]:
# Turn the training split into a pandas DataFrame.
df = ds['train'].to_pandas()
# Keep only the rows whose 'overview' text is present and non-empty.
df = df.loc[df['overview'].notna() & df['overview'].ne('')]

In [None]:
# max length (in tokens) of a single embedding input
window_size = 512
# how many tokens each window shares with the previous one
window_overlap = 128
# Fail early on settings that would make the sliding window stall or move
# backwards instead of failing later, in the middle of processing.
if window_size <= 0:
    raise ValueError(f"window_size must be positive, got {window_size}")
if not 0 <= window_overlap < window_size:
    raise ValueError(
        f"window_overlap must satisfy 0 <= window_overlap < window_size, "
        f"got window_overlap={window_overlap}, window_size={window_size}"
    )
# distance (in tokens) between the starts of two consecutive windows
window_step = window_size - window_overlap

# max rows in a group to process together
df_batch_limit = 100
# max texts sent to the OpenAI API in a single request
api_batch_limit = 1000
if df_batch_limit <= 0 or api_batch_limit <= 0:
    raise ValueError("df_batch_limit and api_batch_limit must be positive")
# column for which embeddings will be generated
embedding_source_column = 'overview'

In [None]:
# Module-level handle to tiktoken's "cl100k_base" encoding.
# NOTE(review): process_group below obtains its own encoding object, so it
# does not read this name.
tokenizer = tiktoken.get_encoding("cl100k_base")

In [None]:


# Number the rows positionally and cut that numbering into consecutive runs
# of df_batch_limit rows; the run number becomes the temporary batch ID.
df['batch_id'] = np.floor_divide(np.arange(len(df)), df_batch_limit)

# Group by batch ID and apply your function to each group
def _split_into_windows(tokens, size, step):
    """Cut *tokens* into windows of at most *size* tokens, one every *step* tokens.

    An empty token sequence yields no windows. Otherwise the number of windows
    is the smallest one whose last window reaches the end of *tokens*, so a
    window is never fully contained in the previous one.
    """
    if not tokens:
        return []
    overlap = size - step
    # Integer ceiling division; any non-empty text gets at least one window.
    window_count = max(1, -(-(len(tokens) - overlap) // step))
    return [tokens[start:start + size] for start in range(0, window_count * step, step)]


def _pack_rows_into_requests(rows_windows, limit):
    """Greedily pack consecutive rows into API requests of at most *limit* texts.

    All windows of one row always land in the same request. Each returned dict
    holds 'total_size' (number of texts), 'slice_sizes' (window count per row,
    in row order) and 'api_package' (the flat list of texts to send).
    """
    packages = []
    for row_windows in rows_windows:
        row_size = len(row_windows)
        if packages and packages[-1]['total_size'] + row_size <= limit:
            current = packages[-1]
            current['total_size'] += row_size
            current['slice_sizes'].append(row_size)
            current['api_package'].extend(row_windows)
        else:
            packages.append({
                'total_size': row_size,
                'slice_sizes': [row_size],
                # Copy so that later extend() calls never touch the caller's list.
                'api_package': list(row_windows),
            })
    return packages


def _combine_window_embeddings(window_embeddings, combination_method):
    """Reduce the window embeddings of one row to a single L2-normalised vector.

    'mean' averages the windows element-wise, 'max' takes the element-wise
    maximum. A row without windows yields NaN; a zero vector is returned
    unnormalised to avoid dividing by zero.
    """
    if len(window_embeddings) == 0:
        return np.nan
    stacked = np.asarray(window_embeddings, dtype=float)
    if combination_method == 'mean':
        combined = stacked.mean(axis=0)
    elif combination_method == 'max':
        combined = stacked.max(axis=0)
    else:
        raise ValueError(f"Unknown combination_method: {combination_method!r}")
    norm = np.linalg.norm(combined)
    return combined / norm if norm > 0 else combined


def process_group(group, *, combination_method: Literal['mean', 'max'] = 'mean',
                  model: str = "text-embedding-3-small"):
    """Compute one embedding per row of *group*.

    Each text in the ``embedding_source_column`` column is tokenised, cut into
    sliding windows (``window_size`` / ``window_step``), and the windows of all
    rows are sent to the embeddings API in requests of at most
    ``api_batch_limit`` texts. The window embeddings of each row are then
    combined into a single L2-normalised vector.

    Args:
        group: DataFrame holding the rows of one batch.
        combination_method: how window embeddings of a row are combined
            ('mean' or 'max').
        model: name of the embedding model passed to the API.

    Returns:
        A Series indexed like *group* with one embedding array per row
        (NaN for rows whose text produced no tokens).

    Raises:
        ValueError: if the window settings are inconsistent, if a single row
            needs more than ``api_batch_limit`` windows, or if
            *combination_method* is unknown.
        RuntimeError: if the API returns a different number of embeddings
            than texts were sent.
    """
    if not 0 < window_step <= window_size:
        raise ValueError(
            f"window_step must satisfy 0 < window_step <= window_size, "
            f"got window_step={window_step}, window_size={window_size}"
        )
    if combination_method not in ('mean', 'max'):
        # Checked up front so that no API call is spent on an invalid request.
        raise ValueError(f"Unknown combination_method: {combination_method!r}")

    source_texts = group[embedding_source_column].tolist()
    encoding = tiktoken.get_encoding("cl100k_base")

    # One list of decoded window texts per row.
    rows_windows = [
        [encoding.decode(window)
         for window in _split_into_windows(encoding.encode(text), window_size, window_step)]
        for text in source_texts
    ]
    if any(len(row_windows) > api_batch_limit for row_windows in rows_windows):
        raise ValueError(f"Single row/text exceeds api batch limit of {api_batch_limit}")

    results = []
    for package in _pack_rows_into_requests(rows_windows, api_batch_limit):
        texts = package['api_package']
        if texts:
            response = client.embeddings.create(input=texts, model=model)
            # Order by the reported index so embeddings line up with the inputs.
            embeddings = [item.embedding
                          for item in sorted(response.data, key=lambda item: item.index)]
        else:
            # Nothing to embed (only rows without tokens): skip the API call.
            embeddings = []
        if len(embeddings) != package['total_size']:
            raise RuntimeError(
                f"Expected {package['total_size']} embeddings, got {len(embeddings)}"
            )

        # Hand the flat list of window embeddings back to the rows it came from.
        offset = 0
        for row_size in package['slice_sizes']:
            results.append(
                _combine_window_embeddings(embeddings[offset:offset + row_size],
                                           combination_method)
            )
            offset += row_size

    # Return a Series with the results
    return pd.Series(results, index=group.index)

# Apply the function to each group and assign to new column.
# The batches are iterated explicitly instead of going through
# ``groupby(...).apply``: when there is only a single batch, ``apply`` can
# return a one-row DataFrame instead of a Series, which breaks the assignment.
embedding_column = f'{embedding_source_column}_embeddings'
batch_results = [process_group(group) for _, group in df.groupby('batch_id')]
if batch_results:
    # Every result is indexed like its rows, so the assignment aligns by index.
    df[embedding_column] = pd.concat(batch_results)
else:
    # Empty frame: still create the column so later cells find it.
    df[embedding_column] = pd.Series(index=df.index, dtype=object)
# Remove the temporary batch_id column if desired
df.drop('batch_id', axis=1, inplace=True)

In [None]:
# Derive the column name from the configuration instead of hard-coding it,
# so this cell keeps working when embedding_source_column is changed.
embedding_column = f'{embedding_source_column}_embeddings'
first_embedding = df.iloc[0][embedding_column]
second_embedding = df.iloc[1][embedding_column]
print(first_embedding)
print(second_embedding)
# Cosine similarity of the two embeddings (1 - cosine distance).
print(1 - distance.cosine(first_embedding, second_embedding))

In [None]:

# Sanity check: print the L2 norm of up to the first 100 embeddings.
# head() keeps this safe for frames with fewer than 100 rows, and the column
# name follows embedding_source_column instead of being hard-coded.
for embedding in df[f'{embedding_source_column}_embeddings'].head(100):
    print(np.linalg.norm(embedding))