Here, we'll have a look at [WildChat Dataset](https://wildchat.allen.ai/). 

[Huggingface dataset](https://huggingface.co/datasets/allenai/WildChat-1M) (hugging face stub: `allenai/WildChat-1M`)


In [1]:
from imbed_data_prep.wildchat import *

In [2]:
dacc = WildchatDacc()

In [3]:
df = dacc.embeddable_df
df.shape

(3884385, 58)

In [4]:
df.columns[:20]

Index(['conversation_hash', 'model', 'timestamp', 'turn', 'language', 'toxic',
       'redacted', 'state', 'country', 'hashed_ip', 'header',
       'conversation.content', 'conversation.country',
       'conversation.hashed_ip', 'conversation.header',
       'conversation.language', 'conversation.redacted', 'conversation.role',
       'conversation.state', 'conversation.timestamp'],
      dtype='object')

In [5]:
df.language.value_counts()

language
English     2016583
Chinese      701008
Russian      457522
French       131610
Spanish      115372
             ...   
Armenian         30
Marathi          28
Tamil            14
Gujarati         14
Georgian          4
Name: count, Length: 74, dtype: int64

In [151]:
df = df[df.language == 'English']
segments = df['conversation.content'].to_list()

In [152]:
ordered_segment_lengths = sorted(df['segment_length'], reverse=True)
ordered_segment_lengths[:10]

[74660, 50685, 50006, 43920, 43918, 43918, 43846, 43409, 43173, 43097]

In [153]:
ordered_segment_lengths = sorted(df['token_count'], reverse=True)
ordered_segment_lengths[:10]

[8191, 8190, 8187, 8186, 8169, 8169, 8167, 8167, 8165, 8165]

In [154]:
total_n_chars_in_millions = sum(df['segment_length']) / 1e6
print(f"{total_n_chars_in_millions:.0f} MBs of text")

2734 MBs of text


In [155]:
from math import floor

def get_a_batch_size(
    total_n_chars_in_millions,
    *,
    max_upload_size_mb=100,
    max_n_items_in_batch=50000,
    margin=1.2,
):
    mean_batch_size = total_n_chars_in_millions / max_upload_size_mb
    target_number_of_batches = floor(mean_batch_size * margin)
    batch_size = min(max_n_items_in_batch, floor(len(df) / target_number_of_batches))
    return batch_size

batch_size = get_a_batch_size(total_n_chars_in_millions)
batch_size

50000

### Sending batches of embedding requests

In [156]:
from oa import OaStores

oa_stores = OaStores()

In [236]:
from imbed import fixed_step_chunker
from lkj import print_progress, clog as _clog
from functools import partial 

clog = partial(_clog, log_func=print_progress)


def length_if_sizable(segments):
    if hasattr(segments, '__len__'):
        return len(segments)
    else:
        return None
    

def create_embedding_task_batches(segments, batch_size, *, oa_stores=None, verbose=True):
    log = clog(verbose)
    if oa_stores is None:
        from oa import OaStores
        oa_stores = OaStores()

    total_length = length_if_sizable(segments)
    if total_length is not None:
        total_n_batches = int(total_length / batch_size) + 1
    else:
        total_n_batches = '<size unknown>'

    try:
        for i, batch in enumerate(fixed_step_chunker(segments, batch_size)):
            log(f"Uploading batch {i+1} of {total_n_batches}")
            file_id = oa_stores.files.create_embedding_task(batch)
            log(f"  Launching batch for {file_id=}")
            batch_id = oa_stores.batches.append(file_id)
            log(f"  {batch_id.id=}, {batch_id.created_at=}")
            yield batch_id
    except Exception as e:
        print(f"Error: {e}")


In [250]:
# This is the main call to create the batches for embeddings.
# I commented out to avoid running it again (it's long and expensive)

# batch_ids = list(
#     create_embedding_task_batches(segments, 2000, oa_stores=oa_stores)
# ) # 29mn

## Aggregating the embedding chunks

In [2]:
from oa.stores import OaStores
from oa.batches import get_segments_and_embeddings
from imbed_data_prep.wildchat import *

dacc = WildchatDacc()
s = OaStores()

In [11]:
from lkj import print_with_timestamp
import pandas as pd 
from oa.batches import batch_info_to_segments_and_embeddings

min_date = 1724503218 - 3600 * 24

# cumul_df = pd.DataFrame()

import os
rootdir = '/Users/thorwhalen/Downloads/wildchat_temp'
filepath_for = lambda i: os.path.join(rootdir, f"wildchat_embeddings_{i:03d}.parquet")


for i, batch_info in enumerate(s.batches_base):
    # if i < 1010:  # 370, 1009, 1010
    #     continue
    if batch_info.output_file_id is not None:
        print_with_timestamp(f"{i=}, {batch_info=}")
        if batch_info.created_at < min_date:
            break
        if batch_info.endpoint == '/v1/embeddings' and batch_info.status == 'completed':
            segments, embeddings = batch_info_to_segments_and_embeddings(s.jsonl_files, batch_info)
            df = pd.DataFrame(dict(zip(['segment', 'embedding'], [segments, embeddings])))
            df.to_parquet(filepath_for(i))
            # cumul_df = pd.concat([cumul_df, df])
            # dacc.saves['segment_and_embeddings.parquet'] = cumul_df



(11)09:47:17 - i=1041, batch_info=Batch(id='batch_U7vTZ6lJizqjsgx73Z3gpkQY', completion_window='24h', created_at=1724501174, endpoint='/v1/embeddings', input_file_id='file-zRhEu8zzG6Z9A98H5dQqIfvc', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1724501189, error_file_id=None, errors=None, expired_at=None, expires_at=1724587574, failed_at=None, finalizing_at=1724501186, in_progress_at=1724501174, metadata=None, output_file_id='file-xcI7Dnd1d8E6eAUB2K1yR3j7', request_counts=BatchRequestCounts(completed=1, failed=0, total=1))
(11)09:47:23 - i=1043, batch_info=Batch(id='batch_A8sNqtVWlCYsOcHKqid089wT', completion_window='24h', created_at=1724500991, endpoint='/v1/embeddings', input_file_id='file-ddYc1OXWFRorqpTb8dQjqCln', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1724501004, error_file_id=None, errors=None, expired_at=None, expires_at=1724587391, failed_at=None, finalizing_at=1724501001, in_progress_at=

## Aggregating the pieces

In [2]:
# TODO: Make a tool for saving chunks to a folder, then aggregating, then saving the aggregated file and deleting the chunks.
# Concatenate all the parquet files into one dataframe.

from tabled import DfFiles, ColumnOrientedMapping
from dol import filt_iter, cached_keys, Pipe
from functools import partial
import re

mk_df_files_aggregate = Pipe(
    DfFiles,
    filt_iter.suffixes('.parquet'),
    cached_keys(keys_cache=partial(sorted, key=lambda k: int(re.compile(r'(\d+)').search(k).group(1)))), 
)

df_files = mk_df_files_aggregate('/Users/thorwhalen/Downloads/wildchat_temp/')
wildchat_df2 = ColumnOrientedMapping(df_files).df()  # 22s
print(wildchat_df2.shape)
wildchat_df2.head()
len(df_files)

(1291010, 2)


647

In [3]:
wildchat_df1 = dacc.saves['segment_and_embeddings.parquet']  # 36s

In [4]:
wildchat_df = pd.concat([wildchat_df1, wildchat_df2])
del wildchat_df1, wildchat_df2

In [5]:
dacc.saves['segment_and_embeddings_2.parquet']  = wildchat_df

## Picking up the pieces

In [None]:
from imbed_data_prep.wildchat import *

dacc = WildchatDacc()

In [3]:
embeddable_df = dacc.saves['embeddable_df.parquet']
wildchat_df = dacc.saves['segment_and_embeddings_2.parquet']

In [4]:
embeddable_df.shape, wildchat_df.shape

((3884385, 58), (2031010, 2))

In [5]:
wildchat_df = wildchat_df.drop_duplicates(subset='segment')

In [6]:
t = embeddable_df[embeddable_df['language'] == 'English']
t = t[['id_', 'conversation.content']]
# del embeddable_df
t.shape

(2016583, 2)

In [7]:
# join wildchat_df (over 'conversation.content') to t (over 'segments').
# If t has some segments values that are not in wildchat_df, we keep them, but put NaNs in the columns of wildchat_df

t = t.set_index('conversation.content')
t = t.join(wildchat_df.set_index('segment'), how='left')
t = t.reset_index()
t = t.rename(columns={'index': 'segment'})
t = t.drop_duplicates(subset='id_')
t.shape

(2016583, 3)

In [8]:
lidx = t['embedding'].isna()
w = t[lidx]
redo_segments = {k: v for k, v in zip(w['id_'], w['conversation.content'])}
# redo_segments = w['conversation.content'].to_list()

print(f"{len(redo_segments)=} segments not found that had no embeddings")

import oa

redo_embedding = oa.embeddings(redo_segments)

len(redo_segments)=561 segments not found that had no embeddings


In [11]:
w = w.copy()
w['embedding'] = list(redo_embedding.values())
t = pd.concat([t[~lidx], w])
t = t.sort_values('id_')
t.shape

(2016583, 3)

In [12]:
t = t.sort_values('id_')
del t['conversation.content']

In [13]:
# offload some big stuff to not run out of memory
del wildchat_df
del embeddable_df

In [14]:
# Finally: Save this thing
# dacc.saves['embeddings.parquet'] = t

## Computing Planar embeddings

In [18]:
from imbed_data_prep.wildchat import WildchatDacc

dacc = WildchatDacc()

In [None]:
embeddings_df = dacc.saves['embeddings.parquet']

In [30]:
import imbed

planar_embeddings_ = imbed.planar_embeddings(
    embeddings_df.embedding, embeddings_func='ncvis'
)

In [73]:
dacc.saves['planar_embeddings.parquet'] = imbed.util.planar_embeddings_dict_to_df(
    planar_embeddings_
)

In [74]:
dacc.saves['planar_embeddings.parquet']

Unnamed: 0_level_0,x,y
id_,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3.207581,19.062687
1,-11.124226,9.500543
2,1.524697,1.585643
3,1.467232,1.227258
4,-8.419129,3.024352
...,...,...
2016578,-16.623129,1.531909
2016579,4.847935,5.949535
2016580,-15.696042,7.779152
2016581,-8.650246,7.840590


## Computing clusters

In [1]:
from imbed.data_prep import kmeans_cluster_indices
from imbed_data_prep.wildchat import WildchatDacc
import numpy as np
import pandas as pd


fibonacci_sequence = [5, 8, 13, 21, 34]

def clusters_df(embeddings, n_clusters = fibonacci_sequence):
    if isinstance(embeddings, pd.DataFrame):
        if 'embedding' in embeddings.columns:
            embeddings = embeddings.embedding
        embeddings = np.array(embeddings.to_list())
    def gen():
        for k in n_clusters:
            yield f"cluster_{k:02.0f}", kmeans_cluster_indices(embeddings, n_clusters=k)

    return pd.DataFrame(dict(gen()))


In [2]:
dacc = WildchatDacc()

embeddings = dacc.saves['embeddings.parquet']
ids = embeddings.id_

# make a np array of the embeddings
embeddings = np.array(embeddings.embedding.to_list())

clusters_df_ = clusters_df(embeddings)

# set the indices of the clusters_df_ to the same as the embeddings
clusters_df_ = clusters_df_.set_index(ids)

dacc.saves['clusters.parquet'] = clusters_df_

In [8]:
df = dacc.embeddable_df

In [79]:
from lkj import unique_affixes

df = df.rename(columns=dict(
    zip(
        df.columns, 
        unique_affixes(df.columns, ingress=lambda x: x.split('.'), egress=lambda x: '.'.join(x), suffix=True)
    )
))

In [78]:
df.iloc[0]

conversation_hash                                          c9ec5b440fbdd2a269333dd241f32f64
model                                                                            gpt-4-0314
timestamp                                                         2023-04-09 00:02:53+00:00
turn                                                                                      1
language                                                                            English
toxic                                                                                 False
redacted                                                                              False
state                                                                                 Texas
country                                                                       United States
hashed_ip                                 22fd87ba9b98f3d379b23c7b52961f2d4a8505127e58b3...
header                                    {'accept-language': 'en-US,en;q=0.9,es

In [None]:
df['conversation.redacted'].value_counts()

In [89]:
t = df[df['conversation.redacted'] == True]
t.iloc[0]

conversation_hash                                          06f64a7c60c0d6d3e703fe639c95009b
model                                                                            gpt-4-0314
timestamp                                                         2023-04-09 05:38:32+00:00
turn                                                                                      2
language                                                                            English
toxic                                                                                 False
redacted                                                                               True
state                                                         Province of Negros Occidental
country                                                                         Philippines
hashed_ip                                 c3a33ecad4d97759bef84668ceaf078ba91fd317d61878...
header                                    {'accept-language': 'en-GB,en-US;q=0.9

In [153]:
dacc.saves['segments.parquet'] = df[['id_', 'content']]

In [152]:
cols = [
    'id_', 'timestamp', 'role', 'language', 
    'conversation.redacted', 'country', 'token_count']
dacc.saves['metadata.parquet'] = df[cols]

In [151]:
score_cols = ['category_scores.harassment',
 'category_scores.harassment_threatening',
 'category_scores.hate',
 'category_scores.hate_threatening',
 'category_scores.self-harm',
 'category_scores.self_harm',
 'category_scores.self_harm_instructions',
 'category_scores.self_harm_intent',
 'category_scores.sexual',
 'category_scores.sexual_minors',
 'category_scores.violence',
 'category_scores.violence_graphic']

repl = {k: k[len('category_scores.'):] for k in score_cols}

score_cols.extend(
    ['identity_attack', 'insult', 'obscene', 'severe_toxicity', 'sexual_explicit', 'threat', 'toxicity']
)
t = df[score_cols]
t = t.rename(columns=repl)
t.columns
dacc.saves['scores.parquet'] = t


In [129]:
df['identity_attack'].value_counts()

identity_attack
0.000147    11760
0.000109     5860
0.001088     3690
0.000113     2610
0.000185     2488
            ...  
0.000243        1
0.000493        1
0.005238        1
0.005935        1
0.000838        1
Name: count, Length: 2568530, dtype: int64

In [125]:
t = df[[c for c in df.columns if c.startswith('category_scores.')]]
(t > 0.20).sum()

category_scores.harassment                 7486
category_scores.harassment_threatening       81
category_scores.hate                        362
category_scores.hate_threatening              0
category_scores.self-harm                   552
category_scores.self_harm                   552
category_scores.self_harm_instructions        0
category_scores.self_harm_intent             40
category_scores.sexual                    19237
category_scores.sexual_minors              1812
category_scores.violence                  31140
category_scores.violence_graphic           1540
dtype: int64

In [118]:
t

categories.harassment  categories.harassment_threatening  categories.hate  categories.hate_threatening  categories.self-harm  categories.self_harm  categories.self_harm_instructions  categories.self_harm_intent  categories.sexual  categories.sexual_minors  categories.violence  categories.violence_graphic
False                  False                              False            False                        False                 False                 False                              False                        False              False                     False                False                          3884385
Name: count, dtype: int64

In [95]:
w = dacc.train_data

README.md:   0%|          | 0.00/10.8k [00:00<?, ?B/s]

Using the latest cached version of the dataset since allenai/WildChat-1M couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /Users/thorwhalen/.cache/huggingface/datasets/allenai___wild_chat-1_m/default/0.0.0/a524e8201e98dfd07445f613736700f85b7d3918 (last modified on Thu Jul 25 16:00:42 2024).


In [98]:
w.iloc[0]

conversation_hash                       c9ec5b440fbdd2a269333dd241f32f64
model                                                         gpt-4-0314
timestamp                                      2023-04-09 00:02:53+00:00
conversation           [{'content': 'Hey there! Are you familiar with...
turn                                                                   1
language                                                         English
openai_moderation      [{'categories': {'harassment': False, 'harassm...
detoxify_moderation    [{'identity_attack': 0.00020589135237969458, '...
toxic                                                              False
redacted                                                           False
state                                                              Texas
country                                                    United States
hashed_ip              22fd87ba9b98f3d379b23c7b52961f2d4a8505127e58b3...
header                 {'accept-language': 'en-US,e

toxic
False    840774
Name: count, dtype: int64

In [100]:
w.redacted.value_counts()

redacted
False    834278
True       6496
Name: count, dtype: int64

## More scrap

In [5]:
from oa.util import utc_int_to_iso_date

utc_int_to_iso_date(1724503218)

'2024-08-24T12:40:18+00:00'

In [6]:
utc_int_to_iso_date(1724503218 - 3600 * 24)

'2024-08-23T12:40:18+00:00'

In [None]:
dacc.saves['segments_and_embeddings']

In [None]:
batch_infos = list(s.batches_base)

In [None]:
import matplotlib.pyplot as plt

t = filter(lambda x: x.endpoint == '/v1/embeddings', batch_infos)
plt.plot(sorted([x.created_at for x in t]))

In [254]:
batch = next(iter(oa_stores.batches_base))
batch.to_dict()

{'id': 'batch_YY7gAytldnVxPRy3vgnlbsbT',
 'completion_window': '24h',
 'created_at': 1724503218,
 'endpoint': '/v1/embeddings',
 'input_file_id': 'file-d1dB20OYOXmnE3ENznpwvVZd',
 'object': 'batch',
 'status': 'completed',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1724503296,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1724589618,
 'failed_at': None,
 'finalizing_at': 1724503293,
 'in_progress_at': 1724503219,
 'metadata': None,
 'output_file_id': 'file-g1iSPVYMatQRuCRQAkIGFBUA',
 'request_counts': {'completed': 1, 'failed': 0, 'total': 1}}

In [258]:
# reload oa.batches module
from importlib import reload
import oa.batches

reload(oa.batches)


<module 'oa.batches' from '/Users/thorwhalen/Dropbox/py/proj/t/oa/oa/batches.py'>

In [266]:
from oa.batches import get_output_file_data

w = get_output_file_data(t, oa_stores=oa_stores)

In [265]:
ww = w['embeddings_batch_id-1724501443969629952']
type(ww), len(ww)

(list, 1536)

In [None]:
class BatchStatus:
    def __init__(self, batch, *, oa_stores):
        self.batch = batch
        self.oa_stores = oa_stores
        self.batch_obj = self.oa_stores.batches[self.batch]

    @property
    def batch_obj(self):
        return self.oa_stores.batches[self.batch]

    def __call__(self, batch_id):
        return self.oa_stores.batches.get(batch_id)

In [247]:
from oa.util import utc_int_to_iso_date

def output_file(batch_id, *, oa_stores):
    batch_obj = oa_stores.batches[batch]
    if batch_obj.status == 'completed':
        return oa_stores.data_files[batch_obj.output_file_id]
    else:
        if batch_obj.status == 'failed':
            error_obj = ValueError(
                f"Batch {batch_id} failed "
                "at {utc_int_to_iso_date(batch_obj.failed_at)}. "
                "Check out {batch_obj.error_file_id} for more information."
            )
            
        elif batch_obj.status == 'in_progress   ':
            # TODO: add more conditions.
            ...

        error_obj.batch_obj = batch_obj  # add batch_obj as attribute of error_obj
        raise error_obj
    


In [246]:
from oa.util import *
from oa.stores import *
utc_int_to_iso_date(1724501445)

'2024-08-24T13:10:45'

In [244]:
t = oa_stores.batches[batch]
t.to_dict()

{'id': 'batch_pqJWKffJftwx5HUUliO0Rk51',
 'completion_window': '24h',
 'created_at': 1724501445,
 'endpoint': '/v1/embeddings',
 'input_file_id': 'file-cEyBYep7TSw18WA5OPVOusP5',
 'object': 'batch',
 'status': 'completed',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1724501460,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1724587845,
 'failed_at': None,
 'finalizing_at': 1724501457,
 'in_progress_at': 1724501445,
 'metadata': None,
 'output_file_id': 'file-z7nbiv2g95IVVbeS0HMjyg9W',
 'request_counts': {'completed': 1, 'failed': 0, 'total': 1}}

In [None]:
oa_stores.files[t.output_file_id]

In [None]:
oa_stores.files.client.files

## Scrap demo

In [49]:
from oa.stores import OaStores

s = OaStores()

file_objs = list(s.files_base)
batch_objs = list(s.batches_base)

In [62]:
print(f"{len(file_objs)=}, {len(batch_objs)=}")

len(file_objs)=2142, len(batch_objs)=1133


In [52]:
t = s.json_files[file_objs[0]]

In [54]:
for k, v in s.json_files.items():
    break

In [57]:
from lkj import truncate_dict_values

truncate_dict_values(v)

{'id': 'batch_req_0qWxQ4rcAnzo9lEUMZR7xsgo',
 'custom_id': 'embeddings_batch_id-1724503217185829120',
 'response': {'status_code': 200,
  'request_id': 'dacf8a2486eb1d6a7070852bf04d292d',
  'body': {'object': 'list',
   'data': [{'object': 'embedding',
     'index': 0,
     'embedding': [0.017829118, -0.0066988687]},
    {'object': 'embedding',
     'index': 1,
     'embedding': [0.0010772116, -0.0015551111]}],
   'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 629349, 'total_tokens': 629349}}},
 'error': None}

In [66]:
from collections import Counter

print(f"{Counter(x.purpose for x in file_objs)=}")
print(f"{Counter(x.status for x in file_objs)=}")

Counter(x.purpose for x in file_objs)=Counter({'batch': 1075, 'batch_output': 1067})
Counter(x.status for x in file_objs)=Counter({'processed': 2142})


In [63]:
batch_outputs = list(filter(lambda x: x.purpose=='batch_output', file_objs))
print(f"{len(batch_outputs)=}")

len(batch_outputs)=1067


In [71]:
batch_output_file = batch_outputs[0]
batch_output_file_contents = s.json_files[batch_output_file]

truncate_dict_values(batch_output_file_contents)

{'id': 'batch_req_0qWxQ4rcAnzo9lEUMZR7xsgo',
 'custom_id': 'embeddings_batch_id-1724503217185829120',
 'response': {'status_code': 200,
  'request_id': 'dacf8a2486eb1d6a7070852bf04d292d',
  'body': {'object': 'list',
   'data': [{'object': 'embedding',
     'index': 0,
     'embedding': [0.017829118, -0.0066988687]},
    {'object': 'embedding',
     'index': 1,
     'embedding': [0.0010772116, -0.0015551111]}],
   'model': 'text-embedding-3-small',
   'usage': {'prompt_tokens': 629349, 'total_tokens': 629349}}},
 'error': None}

In [82]:
from oa.oa_types import EmbeddingResponse

t = EmbeddingResponse(**batch_output_file_contents)
type(t)

oa.oa_types.Response

In [92]:
t.response.body.data[0].keys()

dict_keys(['object', 'index', 'embedding'])

## Checking the files

In [5]:
from oa.stores import OaFilesBase
import pandas as pd
from operator import methodcaller

files_base = OaFilesBase()
all_files_df = pd.DataFrame(map(methodcaller('to_dict'), files_base))
print(f"{all_files_df.shape=}")
all_files_df.head()

all_files_df.shape=(2142, 8)


Unnamed: 0,id,bytes,created_at,filename,object,purpose,status,status_details
0,file-g1iSPVYMatQRuCRQAkIGFBUA,41629954,1724503295,batch_YY7gAytldnVxPRy3vgnlbsbT_output.jsonl,file,batch_output,processed,
1,file-JUOe0LxPKzlJDauidqVAiYHd,41629905,1724503291,batch_sV6na1dUQabLp5jEEjTODCKX_output.jsonl,file,batch_output,processed,
2,file-zqP3q2hDPVwOYp1Mw8lYAV38,41628327,1724503265,batch_31Ks4sc5G78nOYlbbjo2QXaQ_output.jsonl,file,batch_output,processed,
3,file-QmvvLF1ToOcDtZhRNqo0CxQX,41628789,1724503261,batch_C4fvfp1QCYG0zDWLMhDnPC0Z_output.jsonl,file,batch_output,processed,
4,file-5X41zNzZWqqhOmksQsmmfqrI,41633793,1724503259,batch_O1YgvIBmEs60RbsXPqkv9MB9_output.jsonl,file,batch_output,processed,


## Checking on errors

In [1]:
from lkj import value_in_interval
from dol import Pipe
from operator import attrgetter
from oa import OaStores
from oa.stores import OaDacc
import pandas as pd 
from oa.batches import batches_within_range, request_counts

dacc = OaDacc()
s = dacc.s


In [3]:
date_when_requests_were_made = 1724501445

In [None]:
check_batch_requests = Pipe(batches_within_range, request_counts)
check_batch_requests(s.batches_base, date_when_requests_were_made)

In [4]:
batch = next(iter(batches_within_range(s.batches_base, min_date=date_when_requests_were_made)))
batch

Batch(id='batch_YY7gAytldnVxPRy3vgnlbsbT', completion_window='24h', created_at=1724503218, endpoint='/v1/embeddings', input_file_id='file-d1dB20OYOXmnE3ENznpwvVZd', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1724503296, error_file_id=None, errors=None, expired_at=None, expires_at=1724589618, failed_at=None, finalizing_at=1724503293, in_progress_at=1724503219, metadata=None, output_file_id='file-g1iSPVYMatQRuCRQAkIGFBUA', request_counts=BatchRequestCounts(completed=1, failed=0, total=1))

In [5]:
from oa.stores import OaDacc

dacc = OaDacc()
d = dacc.get_output_file_data(batch)
type(d)


openai._legacy_response.HttpxBinaryResponseContent

In [42]:
from operator import methodcaller
from functools import partial
from oa.stores import jsonl_loads_iter
from dol import path_get

from oa.stores import response_body_data_objects, response_body_data, object_of_data

jsonl_loads_response_lines = partial(jsonl_loads_iter, get_lines=methodcaller('iter_lines'))


for line in jsonl_loads_response_lines(d):
    data_objs = list(response_body_data_objects(line))
    break

len(data_objs)

2000

In [66]:
k = 'file-h6ezcCF8qUWd5kxZB41At7Ku'

In [67]:
r = dacc.s.files_base[k]
type(r)

openai._legacy_response.HttpxBinaryResponseContent

In [163]:
# import openai._response

In [68]:
it = r.iter_lines()

In [69]:
t = next(it)
type(t)

str

In [90]:
list(json_content)

['custom_id', 'method', 'url', 'body']

In [174]:
from pydantic import BaseModel, Field, model_validator

class BodyModel(BaseModel):
    # Define the fields within the 'body' if required
    pass

class ResponseModel(BaseModel):
    status_code: int = Field(..., ge=100, le=599)  # Ensure it's a valid status code
    body: BodyModel

    @model_validator(mode='before')
    def check_status_code(cls, values):
        if values.get('status_code') != 200:
            raise ValueError("Status code is not 200.")
        return values

class JdictModel(BaseModel):
    response: ResponseModel

    @model_validator(mode='before')
    def check_response(cls, values):
        if 'response' not in values:
            raise ValueError("No 'response' key in json object.")
        return values

In [None]:
from lkj import print_progress
from dol import path_get

from oa.stores import OaDacc

dacc = OaDacc()


class OaJsonObj:
    def __init__(self, jdict):
        self.jdict = jdict

    @property
    def custom_id(self):
        return self.jdict['custom_id']
        

class OaJsonResponseObj(OaJsonObj):

    # TODO: Mechanicize this. (Json (dict) validation (pydantic?) or routing)
    def validate(self):
        if 'response' not in self.jdict:
            raise ValueError("No 'response' key in json object.")
        if 'status_code' not in self.jdict['response']:
            raise ValueError("No 'status_code' key in 'response' key.")
        if self.jdict['response']['status_code'] != 200:
            raise ValueError("Status code is not 200.")
        if 'body' not in self.jdict['response']:
            raise ValueError("No 'body' key in 'response' key.")
        return True   
    
    @property
    def response(self):
        return self.jdict['response']


def json_object(jdict):
    custom_id = jdict['custom_id']
    if 'response' in jdict:
        response = jdict['response']
        if response['status_code'] == 200:
            body = response['body']
            return {
                'custom_id': custom_id,
                'type': 'response',
                'object': body['object'],
                'n_data': len(body['data']),
            }
        else:
            error = jdict['error']
            return {
                'custom_id': custom_id,
                'type': 'response_error',
                'error': error,
            }
    else:
        body = jdict['body']
        return {
            'custom_id': custom_id,
            'type': 'input',
            'input_type': type(body['input']),
            'n_input': len(body['input']),
            'n_input_0': len(body['input'][0]),
        }

for i, id_ in enumerate(sorted(all_files_df.id.to_list()[21:], reverse=True)):
    print_progress(f"---> {i=}: {id_}")
    jdict = dacc.s.json_files[id_]
    print(json_info(jdict))
    print("")
    


In [41]:
import lkj

In [36]:
len('A string li...o long'), len('A string...too long')

(20, 19)

In [None]:
from functools import partial
from typing import List, Dict, ClassVar, Type, Optional, Any
from abc import abstractmethod
from pprint import pformat

from pydantic import BaseModel, Field, ValidationError
from lkj import truncate_dict_values

    
truncate_dict_list_values = partial(truncate_dict_values, max_string_size=50)


# Base class for all JSON responses
class JsonResponse(BaseModel):
    # Class-level registry for dynamically registering subclasses
    _response_registry: ClassVar[Dict[str, Type['JsonResponse']]] = {}

    custom_id: str

    @abstractmethod
    def value(self):
        """Extract main data from the JSON response"""
        pass

    @classmethod
    def register_response_type(cls, name: str, response_cls: Type['JsonResponse']):
        """Register a new response type to the registry"""
        cls._response_registry[name] = response_cls

    @classmethod
    def from_dict(cls, data: Dict) -> 'JsonResponse':
        """Determine which response type to instantiate by attempting validation"""
        for response_type, response_cls in cls._response_registry.items():
            try:
                return response_cls.model_validate(data)
            except ValidationError:
                continue
        
        raise ValueError(
            "No valid response type found for this data: \n"
            f"{pformat(truncate_dict_values(data))}"
        )

# Successful response model
class ResponseBodyModel(BaseModel):
    object: str
    data: List[dict]

class ResponseModel(BaseModel):
    status_code: int = Field(..., ge=100, le=599)
    body: ResponseBodyModel

class ResponseSuccessModel(JsonResponse):
    response: ResponseModel

    def value(self):
        return {
            'object': self.response.body.object,
            'n_data': len(self.response.body.data),
        }


class ErrorDetailsModel(BaseModel):
    message: str
    type: str
    param: Optional[Any]
    code: Optional[Any]

class ErrorBodyModel(BaseModel):
    error: ErrorDetailsModel

class ResponseModel(BaseModel):
    status_code: int = Field(..., eq=400)
    body: ErrorBodyModel

class ErrorResponseModel(BaseModel):
    response: ResponseModel


# Input data model
class InputBodyModel(BaseModel):
    input: List[List[int]]

class InputModel(JsonResponse):
    body: InputBodyModel

    def value(self):
        return {
            'input_type': type(self.body.input),
            'n_input': len(self.body.input),
            'n_input_0': len(self.body.input[0]),
        }




# Registering the subclasses with appropriate names
JsonResponse.register_response_type('response', ResponseSuccessModel)
JsonResponse.register_response_type('response_error', ErrorResponseModel)
JsonResponse.register_response_type('input', InputModel)



In [6]:
from functools import partial
from typing import List, Dict, ClassVar, Type, Optional, Any
from abc import abstractmethod
from pprint import pformat

from pydantic import BaseModel, Field, ValidationError
from lkj import truncate_dict_values

    
truncate_dict_list_values = partial(truncate_dict_values, max_string_size=50)


# Base class for all JSON responses
class JsonResponse(BaseModel):
    # Class-level registry for dynamically registering subclasses
    _response_registry: ClassVar[Dict[str, Type['JsonResponse']]] = {}

    custom_id: str

    @abstractmethod
    def value(self):
        """Extract main data from the JSON response"""
        pass

    @classmethod
    def register_response_type(cls, name: str, response_cls: Type['JsonResponse']):
        """Register a new response type to the registry"""
        cls._response_registry[name] = response_cls

    @classmethod
    def from_dict(cls, data: Dict) -> 'JsonResponse':
        """Determine which response type to instantiate by attempting validation"""
        for response_type, response_cls in cls._response_registry.items():
            try:
                return response_cls.model_validate(data)
            except ValidationError:
                continue
        
        raise ValueError(
            "No valid response type found for this data: \n"
            f"{pformat(truncate_dict_values(data))}"
        )

# Successful response model
class ResponseBodyModel(BaseModel):
    object: str
    data: List[dict]

class ResponseModel(BaseModel):
    status_code: int = Field(..., ge=100, le=599)
    body: ResponseBodyModel

class ResponseSuccessModel(JsonResponse):
    response: ResponseModel

    def value(self):
        return {
            'object': self.response.body.object,
            'n_data': len(self.response.body.data),
        }


class ErrorDetailsModel(BaseModel):
    message: str
    type: str
    param: Optional[Any]
    code: Optional[Any]

class ErrorBodyModel(BaseModel):
    error: ErrorDetailsModel

class ResponseModel(BaseModel):
    status_code: int = Field(..., eq=400)
    body: ErrorBodyModel

class ErrorResponseModel(BaseModel):
    response: ResponseModel


# Input data model
class InputBodyModel(BaseModel):
    input: List[List[int]]

class InputModel(JsonResponse):
    body: InputBodyModel

    def value(self):
        return {
            'input_type': type(self.body.input),
            'n_input': len(self.body.input),
            'n_input_0': len(self.body.input[0]),
        }




# Registering the subclasses with appropriate names
JsonResponse.register_response_type('response', ResponseSuccessModel)
JsonResponse.register_response_type('response_error', ErrorResponseModel)
JsonResponse.register_response_type('input', InputModel)



In [45]:
ValidationError

pydantic_core._pydantic_core.ValidationError

In [None]:
def is_valid_wrt_model(json_obj, model):
    try:
        model(**json_obj)
        return True
    except ValidationError as e:
        return False

In [44]:
class InputBody(BaseModel):
    input: List[str]
    model: str


class BatchRequest(BaseModel):
    custom_id: str
    method: str
    url: str
    body: InputBody



    
is_valid_wrt_model(jdict, BatchRequest)

True

In [26]:
def truncate_dict_list_values(d: dict, max_list_size: int=2) -> dict:
    """Returns a new dictionary whose list values where reduced to a max_list_size.
    Note that the output will have the same nested keys structure, but when 
    a value is a list, only the first max_list_size elements will be kept.
    

    """
    if isinstance(d, dict):
        return {k: truncate_dict_list_values(v, max_list_size) for k, v in d.items()}
    elif isinstance(d, list):
        return [truncate_dict_list_values(v, max_list_size) for v in d[:max_list_size]]
    else:
        return d
    
t = truncate_dict_list_values(jdict, max_list_size=2)


In [27]:
t

{'custom_id': 'embeddings_batch_id-1724501970634621952',
 'method': 'POST',
 'url': '/v1/embeddings',
 'body': {'input': ['latitude and longitude?',
   "What’s the difference between 'any more' and 'anymore'?"],
  'model': 'text-embedding-3-small'}}

In [13]:
import qo
import json

qo.copy_to_clipboard(json.dumps(jdict))

In [None]:
t = r.json()
assert t['response']['status_code'] == 400

assert t['response']['body']['error'] == {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.",
 'type': 'invalid_request_error',
 'param': None,
 'code': None}

error_response = ErrorResponseModel.model_validate(t)
error_response

t['response']['body']['error']


{'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.",
 'type': 'invalid_request_error',
 'param': None,
 'code': None}

In [246]:
from lkj import print_progress
from dol import path_get

from oa.stores import OaDacc

dacc = OaDacc()

def json_info(jdict):
    custom_id = jdict['custom_id']
    if 'response' in jdict:
        # response kind
        response = jdict['response']
        if response['status_code'] == 200:
            body = response['body']
            return {
                'custom_id': custom_id,
                'type': 'response',
                'object': body['object'],
                'n_data': len(body['data']),
            }
        else:
            # response_error kind
            error = jdict['error']
            return {
                'custom_id': custom_id,
                'type': 'response_error',
                'error': jdict['error'],
            }
    else:
        body = jdict['body']
        # response_error kind
        return {
            'custom_id': custom_id,
            'type': 'input',
            'input_type': type(body['input']),
            'n_input': len(body['input']),
            'n_input_0': len(body['input'][0]),
        }

for i, id_ in enumerate(sorted(all_files_df.id.to_list()[21:], reverse=True)):
    print_progress(f"---> {i=}: {id_}")
    jdict = dacc.s.json_files[id_]
    print(JsonResponse.from_dict(jdict))
    print("")
    

(28)18:58:44 - ---> i=0: file-zvibCKnvuH4aUFijR7ad35fB


TypeError: descriptor 'alias' for 'FieldInfo' objects doesn't apply to a 'FieldInfo' object

In [198]:
from dol import path_get
from dol.paths import paths_getter

from functools import partial


def return_none(x):
    return None


extract_response_info = paths_getter(
    {
        'custom_id': 'custom_id',
        'status_code': 'response.status_code',
        'error': 'error',
        'response.body': 'response.body',
        'body': 'body',
    },
    on_error=return_none,
)


def determine_kind(jdict):
    info = extract_response_info(jdict)

    if info['status_code']:
        if (
            info['status_code'] == 200
            and info['response.body']
            and 'data' in info['response.body']
        ):
            return 'response'
        elif info['status_code'] != 200 and info['error']:
            return 'response_error'
    elif info['body'] and 'input' in info['body']:
        return 'input'

    raise ValueError("Could not determine kind of JSON object")

extract_response_info(r.json())
    

{'custom_id': 'embeddings_batch_id-1724496832063141888',
 'status_code': 400,
 'error': None,
 'response.body': {'error': {'message': "'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.",
   'type': 'invalid_request_error',
   'param': None,
   'code': None}},
 'body': None}

In [130]:
from lkj import print_progress
from dol import path_get

from oa.stores import OaDacc

dacc = OaDacc()

def json_info(jdict):
    custom_id = jdict['custom_id']
    if 'response' in jdict:
        # response kind
        response = jdict['response']
        if response['status_code'] == 200:
            body = response['body']
            return {
                'custom_id': custom_id,
                'type': 'response',
                'object': body['object'],
                'n_data': len(body['data']),
            }
        else:
            # response_error kind
            error = jdict['error']
            return {
                'custom_id': custom_id,
                'type': 'response_error',
                'error': jdict['error'],
            }
    else:
        body = jdict['body']
        # response_error kind
        return {
            'custom_id': custom_id,
            'type': 'input',
            'input_type': type(body['input']),
            'n_input': len(body['input']),
            'n_input_0': len(body['input'][0]),
        }

for i, id_ in enumerate(sorted(all_files_df.id.to_list()[21:], reverse=True)):
    print_progress(f"---> {i=}: {id_}")
    jdict = dacc.s.json_files[id_]
    print(json_info(jdict))
    print("")
    


(28)13:49:55 - ---> i=0: file-zvibCKnvuH4aUFijR7ad35fB
{'custom_id': 'embeddings_batch_id-1724501970634621952', 'type': 'input', 'input_type': <class 'list'>, 'n_input': 2000, 'n_input_0': 23}

(28)13:49:57 - ---> i=1: file-zvSJANC824VN2BnxLBjuPCsE
{'custom_id': 'embeddings_batch_id-1724501975561022976', 'type': 'input', 'input_type': <class 'list'>, 'n_input': 2000, 'n_input_0': 584}

(28)13:49:58 - ---> i=2: file-zvIT8muZ1BANCvj70vhY36ih
{'custom_id': 'embeddings_batch_id-1724501630836592128', 'type': 'response', 'object': 'list', 'n_data': 2000}

(28)13:50:02 - ---> i=3: file-zrqNUeHJciihWh7sRNRYjJNR
{'custom_id': 'embeddings_batch_id-1724502280773035008', 'type': 'response', 'object': 'list', 'n_data': 2000}

(28)13:50:05 - ---> i=4: file-zrJoJ1IA1Tj3Wjzl0CAaPDaV
{'custom_id': 'embeddings_batch_id-1724496832063141888', 'type': 'response_error', 'error': None}

(28)13:50:06 - ---> i=5: file-zpQQmn9PlANQGsPOwfooEEWZ
{'custom_id': 'embeddings_batch_id-1724502884452224000', 'type': 'in

KeyboardInterrupt: 

In [129]:
json_info(jdict)

{'custom_id': 'embeddings_batch_id-1724501630836592128',
 'type': 'response',
 'object': 'list',
 'n_data': 2000}

In [127]:
print(list(jdict))
list(jdict['response'])


['id', 'custom_id', 'response', 'error']


['status_code', 'request_id', 'body']

In [112]:
rj = dacc.s.files_base['file-zrqNUeHJciihWh7sRNRYjJNR']


AttributeError: 'bytes' object has no attribute 'json'

In [124]:
list(rj['response'])#['body']

['status_code', 'request_id', 'body']

In [105]:
rj['error']

In [102]:
from dol.paths import paths_getter

f = paths_getter({x: x for x in ['custom_id', 'body.input.0']})
f(rj)

{'custom_id': 'embeddings_batch_id-1724501970634621952',
 'body.input.0': 'latitude and longitude?'}

In [80]:
all_files_df.iloc[-1]

id                file-DVT6hjyjOfGSPvGS6sIMmV7b
bytes                                        75
created_at                           1723223404
filename                      tmp8qb8dvh2.jsonl
object                                     file
purpose                                   batch
status                                processed
status_details                             None
Name: 2141, dtype: object

In [75]:

tt = r.json()

from oa.stores import print_some_jsonl_line_fields

print_some_jsonl_line_fields(tt)

list(line)=['id', 'custom_id', 'response', 'error']
list(line['response'])=['status_code', 'request_id', 'body']
list(line['response']['body'])=['object', 'data', 'model', 'usage']
line['response']['body']['object']='list'
len(line['response']['body']['data'])=4
list(line['response']['body']['data'][0])=['object', 'index', 'embedding']


In [78]:
len(tt['response']['body']['data'][0]['embedding'])

1536

In [72]:
tt['id'], tt['custom_id']
ttt = tt['response']['body']
list(ttt)
data = ttt['data']
len(data)

4

In [86]:
from oa.stores import print_some_jsonl_line_fields

print_some_jsonl_line_fields(line)


list(line)=['id', 'custom_id', 'response', 'error']
list(line['response'])=['status_code', 'request_id', 'body']
list(line['response']['body'])=['object', 'data', 'model', 'usage']
line['response']['body']['object']='list'
len(line['response']['body']['data'])=2000
list(line['response']['body']['data'][0])=['object', 'index', 'embedding']


In [6]:
ttt = tt['response']
list(ttt)  # ['status_code', 'request_id', 'body']
w = ttt['body']
list(w)  # ['object', 'data', 'model', 'usage']

['object', 'data', 'model', 'usage']

In [9]:
len(w['data'])

2000

In [11]:
bytes.splitlines(b'sd\nlkjf')

[b'sd', b'lkjf']

In [1]:
from oa.stores import OaStores

oa_stores = OaStores()

w = oa_stores.files_base[batch.output_file_id]

NameError: name 'batch' is not defined

In [79]:
tt = t[0]
len(tt)
list(tt)

['id', 'custom_id', 'response', 'error']

In [71]:
from oa.stores import jsonl_dumps, jsonl_loads

# t = jsonl_loads(d.content)

# t = list(map(json.loads, d.content.decode().split('\n')))

t = list(map(json.loads, d.iter_lines()))
t[0]

KeyboardInterrupt: 

JSONDecodeError: Extra data: line 3 column 1 (char 43)

In [43]:
type(batch)
from openai.types.batch import Batch
from openai.types import EmbeddingCreateParams

t = list(filter(callable, (getattr(Batch, x) for x in vars(Batch) if not x.startswith('_'))))
t = [f"{x.__name__}{Sig(x)}" for x in t]
t
    

[]

In [13]:
batch.to_dict()

{'id': 'batch_YY7gAytldnVxPRy3vgnlbsbT',
 'completion_window': '24h',
 'created_at': 1724503218,
 'endpoint': '/v1/embeddings',
 'input_file_id': 'file-d1dB20OYOXmnE3ENznpwvVZd',
 'object': 'batch',
 'status': 'completed',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1724503296,
 'error_file_id': None,
 'errors': None,
 'expired_at': None,
 'expires_at': 1724589618,
 'failed_at': None,
 'finalizing_at': 1724503293,
 'in_progress_at': 1724503219,
 'metadata': None,
 'output_file_id': 'file-g1iSPVYMatQRuCRQAkIGFBUA',
 'request_counts': {'completed': 1, 'failed': 0, 'total': 1}}

In [114]:
from oa import OaStores

s = OaStores()
batch_id = 'batch_SG05bpxRRsw9u2WtlR9qQOgK'
batch = s.batches[batch_id]
batch.to_dict()

{'id': 'batch_SG05bpxRRsw9u2WtlR9qQOgK',
 'completion_window': '24h',
 'created_at': 1724496845,
 'endpoint': '/v1/embeddings',
 'input_file_id': 'file-i4LQNnei6RZBGaIVlWLbhYtH',
 'object': 'batch',
 'status': 'completed',
 'cancelled_at': None,
 'cancelling_at': None,
 'completed_at': 1724496870,
 'error_file_id': 'file-zrJoJ1IA1Tj3Wjzl0CAaPDaV',
 'errors': None,
 'expired_at': None,
 'expires_at': 1724583245,
 'failed_at': None,
 'finalizing_at': 1724496870,
 'in_progress_at': 1724496847,
 'metadata': None,
 'output_file_id': None,
 'request_counts': {'completed': 0, 'failed': 1, 'total': 1}}

In [115]:
from dol import path_get
f = s.files_base[batch.error_file_id]
r = f.json()
body = r['response']['body']
print(body['error']['message'])

'$.input' is invalid. Please check the API reference: https://platform.openai.com/docs/api-reference.


In [124]:
t = s.files_base[batch.input_file_id]
type(t.content)

bytes

In [127]:
from oa.stores import jsonl_loads_iter

w = list(jsonl_loads_iter(t))
if len(w) == 1:
    w = w[0]
list(w)

['custom_id', 'method', 'url', 'body']

In [134]:
ww = w['body']['input']
'.input' in ''.join(ww)

True

In [135]:
''.join(ww).index('.input')

543006

In [145]:
has_input_string = list(filter(None, map(lambda x: x if '.input' in x else None, ww)))
has_input_string[1]


'这是你昨天帮我写的聊天室，能不能帮我改一下样式让这个页面看起来科技感一些，还有兼容一下移动端<html lang="en">\n<head>\n    <meta charset="UTF-8">\n    <meta name="viewport" content="width=device-width, initial-scale=1.0">\n    <title>Vue Chat</title>\n    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/element-ui/2.15.6/theme-chalk/index.css">\n    <script src="https://cdnjs.cloudflare.com/ajax/libs/vue/2.6.14/vue.min.js"></script>\n    <script src="https://cdnjs.cloudflare.com/ajax/libs/axios/0.23.0/axios.min.js"></script>\n    <script src="https://cdnjs.cloudflare.com/ajax/libs/element-ui/2.15.6/index.js"></script>\n    <style>\n        .chat-container {\n            display: flex;\n            height: 100vh;\n            width: 100%;\n        }\n        .user-list {\n            width: 25%;\n            border-right: 1px solid #f0f0f0;\n        }\n        .chat {\n            width: 75%;\n            display: flex;\n            flex-direction: column;\n            justify-content: space-between;\n           

In [137]:
''.join(ww)[(543006-100):(543006+100)]

"rives to honor Sitara's memory through her continued growth and dedication to the path of the hunter.input = What's the weather like today? temperature=0.1The weather today is quite cold with a temper"

## Misc

In [16]:
import oa

t = oa.embeddings(segments[-1])

In [18]:
df.language.value_counts()

language
English     2033782
Chinese      702062
Russian      461567
French       132346
Spanish      115912
             ...   
Armenian         30
Marathi          28
Tamil            14
Gujarati         14
Georgian          4
Name: count, Length: 74, dtype: int64

# Appendix

## Making the raw data

In [2]:
from collections import Counter
from imbed.base import HugfaceDaccBase

d = HugfaceDaccBase("allenai/WildChat-1M", name='wildchat')


In [58]:
df = d.train_data

In [59]:
df.shape

(840774, 14)

In [60]:
df.iloc[0]

conversation_hash                       c9ec5b440fbdd2a269333dd241f32f64
model                                                         gpt-4-0314
timestamp                                      2023-04-09 00:02:53+00:00
conversation           [{'content': 'Hey there! Are you familiar with...
turn                                                                   1
language                                                         English
openai_moderation      [{'categories': {'harassment': False, 'harassm...
detoxify_moderation    [{'identity_attack': 0.00020589135237969458, '...
toxic                                                              False
redacted                                                           False
state                                                              Texas
country                                                    United States
hashed_ip              22fd87ba9b98f3d379b23c7b52961f2d4a8505127e58b3...
header                 {'accept-language': 'en-US,e

In [64]:
pd.Series(dict(Counter(df.model).most_common()))

gpt-3.5-turbo-0613    371341
gpt-3.5-turbo-0301    195161
gpt-4-1106-preview    100972
gpt-4-0125-preview     62581
gpt-3.5-turbo-0125     57426
gpt-4-0314             53291
gpt-4-0613                 2
dtype: int64

In [66]:
t[0]

{'content': 'Hey there! Are you familiar with reality shifting? So, I’m refining a foolproof method for reality shifting and want to pick a destination. Want to help me? I’m thinking something pretty personalized. There are a few things that are required of my destination. 1. The quest. I have to have a clear overarching goal in my reality, and don’t make it too crazy. It should be more along the lines of “save the president’s daughter” or “escape this weird wacky sinister place” NOT “get an artifact that literally controls reality”. Seriously, don’t make me fetch an artifact, or fetch anything. Instead, make me DO something. 2. Babes. I need pretty girls. 3. The entry. I need to get to lose consciousness in order to begin my journey in my desired reality, preferably by having it knocked out by one of the aforementioned babes. 4. Action. It needs to be cool. 5. Unconsciousness. Myself and the babes need to pass out in this place, preferably by being knocked out in some way or fainting.

In [67]:
from tabled import expand_rows

w = expand_rows(df.iloc[:10], ['conversation', 'openai_moderation', 'detoxify_moderation'])
w.shape

(24, 14)

In [68]:
w.conversation.iloc[0]

{'content': 'Hey there! Are you familiar with reality shifting? So, I’m refining a foolproof method for reality shifting and want to pick a destination. Want to help me? I’m thinking something pretty personalized. There are a few things that are required of my destination. 1. The quest. I have to have a clear overarching goal in my reality, and don’t make it too crazy. It should be more along the lines of “save the president’s daughter” or “escape this weird wacky sinister place” NOT “get an artifact that literally controls reality”. Seriously, don’t make me fetch an artifact, or fetch anything. Instead, make me DO something. 2. Babes. I need pretty girls. 3. The entry. I need to get to lose consciousness in order to begin my journey in my desired reality, preferably by having it knocked out by one of the aforementioned babes. 4. Action. It needs to be cool. 5. Unconsciousness. Myself and the babes need to pass out in this place, preferably by being knocked out in some way or fainting.

In [69]:
w.openai_moderation.iloc[0]

{'categories': {'harassment': False,
  'harassment/threatening': False,
  'harassment_threatening': False,
  'hate': False,
  'hate/threatening': False,
  'hate_threatening': False,
  'self-harm': False,
  'self-harm/instructions': False,
  'self-harm/intent': False,
  'self_harm': False,
  'self_harm_instructions': False,
  'self_harm_intent': False,
  'sexual': False,
  'sexual/minors': False,
  'sexual_minors': False,
  'violence': False,
  'violence/graphic': False,
  'violence_graphic': False},
 'category_scores': {'harassment': 0.000484861753648147,
  'harassment/threatening': 0.00012471186346374452,
  'harassment_threatening': 0.00012471186346374452,
  'hate': 0.000457498652394861,
  'hate/threatening': 1.3141398994775955e-05,
  'hate_threatening': 1.3141398994775955e-05,
  'self-harm': 5.52945930394344e-05,
  'self-harm/instructions': 1.1666821819744655e-06,
  'self-harm/intent': 1.4811689652560744e-05,
  'self_harm': 5.52945930394344e-05,
  'self_harm_instructions': 1.16668218

In [70]:
w.detoxify_moderation.iloc[0]

{'identity_attack': 0.00020589135237969458,
 'insult': 0.002148400293663144,
 'obscene': 0.0004123652761336416,
 'severe_toxicity': 3.0470857382169925e-05,
 'sexual_explicit': 0.00012746300490107387,
 'threat': 6.507332727778703e-05,
 'toxicity': 0.005422386806458235}

... and from there we expand rows and columns so that we can have a flat dataframe.
This is born `expand_wildchat_data(df)` and `dacc.expanded_train`.

## Adding specific props and methods

In [1]:
from imbed_data_prep.wildchat import *

d = WildchatDacc()

In [2]:
list(d.saves)

['test.parquet',
 'wildchat_train.parquet',
 'test..parquet',
 'expanded_train.parquet']

In [72]:
# d.language_conversation_counts  # computes on train_data (raw data)

English     481268
Chinese     119303
Russian      87127
French       26977
Spanish      20447
             ...  
Marathi         11
Armenian         8
Tamil            2
Gujarati         2
Georgian         1
Length: 74, dtype: int64

In [7]:
d.language_turn_counts  # ~10s, for loading expanded_train

language
English     2033782
Chinese      702062
Russian      461567
French       132346
Spanish      115912
             ...   
Armenian         30
Marathi          28
Tamil            14
Gujarati         14
Georgian          4
Name: count, Length: 74, dtype: int64

In [10]:
df = d.expanded_en

In [41]:
df.shape

(2033782, 54)

In [13]:
all_content = '\n'.join(df['conversation.content'])
len(all_content)  # 3_008_604_331

3008604331

In [15]:
import oa

oa.num_tokens(all_content)  # = 673_443_658 (takes 5mn30 to compute)

673443658

In [42]:
0.02 * 673443658 / 1e6

13.46887316

In [32]:
import oa

# oa.compute_price('ada v2', 673443658)
oa.base.model_information_dict

oa.compute_price('text-embedding-3-small', 673443658)

13.468873160000001

In [50]:
from oa.util import embeddings_models

embeddings_models

{'text-embedding-3-small': {'price_per_million_tokens': 0.02,
  'pages_per_dollar': 62500,
  'performance_on_mteb_eval': 62.3,
  'max_input': 8191},
 'text-embedding-3-large': {'price_per_million_tokens': 0.13,
  'pages_per_dollar': 9615,
  'performance_on_mteb_eval': 64.6,
  'max_input': 8191},
 'text-embedding-ada-002': {'price_per_million_tokens': 0.1,
  'pages_per_dollar': 12500,
  'performance_on_mteb_eval': 61.0,
  'max_input': 8191},
 'batch__text-embedding-3-small': {'price_per_million_tokens': 0.01,
  'pages_per_dollar': 100,
  'performance_on_mteb_eval': 62.3,
  'max_input': 8191},
 'batch__text-embedding-3-large': {'price_per_million_tokens': 0.065,
  'pages_per_dollar': 15,
  'performance_on_mteb_eval': 64.6,
  'max_input': 8191},
 'batch__text-embedding-ada-002': {'price_per_million_tokens': 0.05,
  'pages_per_dollar': 20,
  'performance_on_mteb_eval': 61.0,
  'max_input': 8191}}

In [52]:
# oa.embeddings('dog')

In [6]:
data_for_cosmo = d.data_with_planar_embeddings()

Making dataframe from parquet bytes
Loading bytes of raw data from /Users/thorwhalen/Dropbox/_odata/figiri/github-repos.parquet
... Done
Dropping duplicate nameWithOwner (github stub)...
... Dropped 209524 duplicates
... Done


In [6]:
# d.cache['github_repo_for_cosmos.parquet'] = data_for_cosmo

In [19]:
# get numpy array of segment vectors
# the d.segment_vectors is a series whose elements are numpy arrays
# We want to have a 2 dimentional array with the same number of rows as the series
# and the number of columns as the length of the numpy arrays

import numpy as np

X = np.vstack(d.segment_vectors.tolist())
X.shape

(3065063, 256)

In [15]:
from sklearn.cluster import KMeans
d.embeddings_matrix

(3065063, 256)

In [9]:
t = d.cluster_indices

Computing cluster indices for num of clusters: (5, 8, 13, 21, 34)
... Done


In [24]:
df.iloc[0]

conversation_hash                                                            c9ec5b440fbdd2a269333dd241f32f64
model                                                                                              gpt-4-0314
timestamp                                                                           2023-04-09 00:02:53+00:00
turn                                                                                                        1
language                                                                                              English
toxic                                                                                                   False
redacted                                                                                                False
state                                                                                                   Texas
country                                                                                         United States
hashed_ip 

In [25]:
t = df[['conversation_hash', 'turn']].groupby('conversation_hash')

# see if the turn number is the same for each group

tt = t['turn'].nunique()
               

In [34]:
w = t['turn']
w

<pandas.core.groupby.generic.SeriesGroupBy object at 0x60e4ef460>

In [46]:
ww = df[df.conversation_hash == df.conversation_hash.iloc[10000]]
ww.shape

(18, 54)

In [47]:
ww

Unnamed: 0,conversation_hash,model,timestamp,turn,language,toxic,redacted,state,country,hashed_ip,...,openai_moderation.category_scores.hate,openai_moderation.category_scores.hate_threatening,openai_moderation.category_scores.self-harm,openai_moderation.category_scores.self_harm,openai_moderation.category_scores.self_harm_instructions,openai_moderation.category_scores.self_harm_intent,openai_moderation.category_scores.sexual,openai_moderation.category_scores.sexual_minors,openai_moderation.category_scores.violence,openai_moderation.category_scores.violence_graphic
10022,b6c9e101545621fe21b57cf071ea4fbb,gpt-3.5-turbo-0301,2023-04-10 00:19:13+00:00,9,English,False,False,,United Kingdom,655c90bccccbfe5d45155d8ba640968d3979be04a574f5...,...,8.560066e-06,1.177902e-07,5.698167e-08,5.698167e-08,7.000239e-07,3.92402e-08,1.607948e-05,1.403812e-07,4e-06,8.308701e-07
10023,b6c9e101545621fe21b57cf071ea4fbb,gpt-3.5-turbo-0301,2023-04-10 00:19:13+00:00,9,English,False,False,,United Kingdom,655c90bccccbfe5d45155d8ba640968d3979be04a574f5...,...,3.599308e-06,1.932818e-08,1.549814e-08,1.549814e-08,2.338652e-08,6.392519e-09,3.556191e-06,4.100534e-08,3e-06,9.448757e-07
10024,b6c9e101545621fe21b57cf071ea4fbb,gpt-3.5-turbo-0301,2023-04-10 00:19:13+00:00,9,English,False,False,,United Kingdom,655c90bccccbfe5d45155d8ba640968d3979be04a574f5...,...,0.0007716891,9.781449e-06,5.611879e-07,5.611879e-07,0.0001037431,6.468735e-06,3.651887e-07,6.270196e-08,1.8e-05,8.82046e-06
10025,b6c9e101545621fe21b57cf071ea4fbb,gpt-3.5-turbo-0301,2023-04-10 00:19:13+00:00,9,English,False,False,,United Kingdom,655c90bccccbfe5d45155d8ba640968d3979be04a574f5...,...,2.318166e-05,1.200334e-08,2.156982e-07,2.156982e-07,5.506181e-08,5.02872e-07,2.864497e-06,3.68043e-08,4.8e-05,3.79702e-05
10026,b6c9e101545621fe21b57cf071ea4fbb,gpt-3.5-turbo-0301,2023-04-10 00:19:13+00:00,9,English,False,False,,United Kingdom,655c90bccccbfe5d45155d8ba640968d3979be04a574f5...,...,8.972065e-05,3.721726e-08,6.696445e-08,6.696445e-08,2.632944e-09,1.321221e-08,3.964782e-05,5.612077e-07,7e-06,9.867542e-06
10027,b6c9e101545621fe21b57cf071ea4fbb,gpt-3.5-turbo-0301,2023-04-10 00:19:13+00:00,9,English,False,False,,United Kingdom,655c90bccccbfe5d45155d8ba640968d3979be04a574f5...,...,2.800229e-05,1.644585e-07,4.765648e-06,4.765648e-06,2.606096e-07,6.785344e-06,0.0002763604,1.187764e-05,0.00015,3.415837e-05
10028,b6c9e101545621fe21b57cf071ea4fbb,gpt-3.5-turbo-0301,2023-04-10 00:19:13+00:00,9,English,False,False,,United Kingdom,655c90bccccbfe5d45155d8ba640968d3979be04a574f5...,...,8.1868e-06,5.734495e-08,1.032312e-07,1.032312e-07,5.211108e-06,1.527691e-07,4.321943e-05,5.01063e-07,1.3e-05,1.206811e-06
10029,b6c9e101545621fe21b57cf071ea4fbb,gpt-3.5-turbo-0301,2023-04-10 00:19:13+00:00,9,English,False,False,,United Kingdom,655c90bccccbfe5d45155d8ba640968d3979be04a574f5...,...,2.822489e-05,6.462722e-08,3.939725e-07,3.939725e-07,3.200054e-08,2.077721e-08,2.953003e-06,7.248196e-08,5e-06,5.167577e-06
10030,b6c9e101545621fe21b57cf071ea4fbb,gpt-3.5-turbo-0301,2023-04-10 00:19:13+00:00,9,English,False,False,,United Kingdom,655c90bccccbfe5d45155d8ba640968d3979be04a574f5...,...,9.061228e-06,3.345552e-08,9.246688e-07,9.246688e-07,4.029054e-07,9.376783e-07,0.0002356749,1.261401e-05,5.7e-05,7.687549e-06
10031,b6c9e101545621fe21b57cf071ea4fbb,gpt-3.5-turbo-0301,2023-04-10 00:19:13+00:00,9,English,False,False,,United Kingdom,655c90bccccbfe5d45155d8ba640968d3979be04a574f5...,...,5.081911e-07,1.963464e-07,5.633334e-06,5.633334e-06,1.39987e-06,1.684211e-05,0.0005790507,1.82382e-05,9e-05,3.037581e-05


In [51]:
df[['conversation.content', 'conversation.role']]

Unnamed: 0,conversation.content,conversation.role
0,Hey there! Are you familiar with reality shift...,user
1,Hey there! I'm more than happy to help you pla...,assistant
2,Crea una imagen de una mujer corriente por la ...,user
3,"Como inteligencia artificial basada en texto, ...",assistant
4,Aceede a Amazon y dame una lista de lo TV más ...,user
...,...,...
3930079,In the haunting expanse of Red Dead Redemption...,assistant
3930080,"Write an interesting, riveting, action-packed,...",user
3930081,"In the twilight of 1955 AD, amidst a world int...",assistant
3930082,در مورد دوران پس از ظهور امام زمان انشا بنویس.,user


In [58]:
from collections import Counter
import pandas as pd

w = pd.Series(dict(Counter(df.language).most_common()))
w

English     2033782
Chinese      702062
Russian      461567
French       132346
Spanish      115912
             ...   
Armenian         30
Marathi          28
Tamil            14
Gujarati         14
Georgian          4
Length: 74, dtype: int64

In [60]:
w.iloc[:30]

English       2033782
Chinese        702062
Russian        461567
French         132346
Spanish        115912
Arabic          63734
German          53699
Portuguese      53488
Turkish         31256
Italian         29293
Vietnamese      27032
Korean          24622
Persian         22541
Polish          21539
Indonesian      19593
Japanese        16773
Dutch            9701
Ukrainian        8792
Latin            8219
Hungarian        7301
Maori            6766
Romanian         4571
Sotho            4560
Nolang           4282
Tagalog          3804
Afrikaans        3391
Czech            3174
Greek            3151
Malay            2165
Swedish          1969
dtype: int64

# Data prep misc

In [None]:
from imbed_data_prep.github_repos import *

d = GithubReposData()

## Can we take the github stub (nameWithOwner) as an id?

In [18]:
assert len(d.raw_data.nameWithOwner.dropna()) == len(d.raw_data), (
    "We assumed that there's no missing data in the 'nameWithOwner' column"
)

Making dataframe from parquet bytes
Loading bytes of raw data from /Users/thorwhalen/Dropbox/_odata/figiri/github-repos.parquet


In [24]:
assert d.raw_data.nameWithOwner.nunique() == len(d.raw_data), (
    "We assumed that the 'nameWithOwner' were unique"
)

AssertionError: We assumed that the 'nameWithOwner' were unique

### Let's look at the duplicates

In [25]:
from collections import Counter

Counter(d.raw_data.nameWithOwner).most_common(10)

[('plapadoo/ledger-jira-sync', 4),
 ('wasp-platform/wasp', 4),
 ('dimdenGD/OldTweetDeck', 3),
 ('TravisWThompson1/Makefile_Example_CUDA_CPP_To_Executable', 3),
 ('Roave/EmailTemplates', 3),
 ('grigio/meteor-weuse', 3),
 ('laewahn/CutCAD', 3),
 ('cockroachdb/cdc-sink', 3),
 ('AP-Atul/Torpido', 3),
 ('althonos/pyhmmer', 3)]

In [26]:
d.raw_data[d.raw_data.nameWithOwner == "plapadoo/ledger-jira-sync"]

Unnamed: 0,owner,name,stars,forks,watchers,isFork,isArchived,languages,languageCount,topics,...,createdAt,pushedAt,defaultBranchCommitCount,license,assignableUserCount,codeOfConduct,forkingAllowed,nameWithOwner,parent,embedding
2117813,plapadoo,ledger-jira-sync,6,0,6,False,False,"Python: 8602, Nix: 1530",2,"ledger: 15, python: 277441, accounting: 67, bo...",...,2018-05-19T08:00:12Z,2019-03-05T17:55:25Z,9.0,"BSD 3-Clause ""New"" or ""Revised"" License",3,,True,plapadoo/ledger-jira-sync,,"[-0.04487917, 0.025739525, 0.106529795, -0.135..."
2117814,plapadoo,ledger-jira-sync,6,0,6,False,False,"Python: 8602, Nix: 1530",2,"ledger: 15, python: 277441, accounting: 67, bo...",...,2018-05-19T08:00:12Z,2019-03-05T17:55:25Z,9.0,"BSD 3-Clause ""New"" or ""Revised"" License",3,,True,plapadoo/ledger-jira-sync,,"[-0.04487917, 0.025739525, 0.106529795, -0.135..."
2117815,plapadoo,ledger-jira-sync,6,0,6,False,False,"Python: 8602, Nix: 1530",2,"ledger: 15, python: 277441, accounting: 67, bo...",...,2018-05-19T08:00:12Z,2019-03-05T17:55:25Z,9.0,"BSD 3-Clause ""New"" or ""Revised"" License",3,,True,plapadoo/ledger-jira-sync,,"[-0.04487917, 0.025739525, 0.106529795, -0.135..."
2117816,plapadoo,ledger-jira-sync,5,0,6,False,False,"Python: 8602, Nix: 1530",2,"ledger: 15, python: 277500, accounting: 67, bo...",...,2018-05-19T08:00:12Z,2019-03-05T17:55:25Z,9.0,"BSD 3-Clause ""New"" or ""Revised"" License",3,,True,plapadoo/ledger-jira-sync,,"[-0.04487917, 0.025739525, 0.106529795, -0.135..."


It looks like the meta data is exactly the same, so the non-unique entries might be duplicates. Let's drop any rows that are exact duplicates (except for the embeddings, which are numpy arrays, so not hashable, and could have slight variations). 

In [35]:
# drop duplicates, ignoring the embedding column (which is not hashable)
t = d.raw_data.drop_duplicates(subset=d.raw_data.columns.difference(["embedding"]))
assert d.raw_data.shape[1] == t.shape[1], "Assumed no columns were dropped"
print(f"{len(d.raw_data) - len(t)} rows were dropped")
d.raw_data.shape, t.shape

207147 rows were dropped


((3274587, 26), (3067440, 26))

In [36]:
# Still some duplicate nameWithOwner, with different stars etc.
# So we'll just drop duplicate nameWithOwner
assert t.nameWithOwner.nunique() == len(t), (
    "We assumed that the 'nameWithOwner' were unique"
)

AssertionError: We assumed that the 'nameWithOwner' were unique

In [39]:
Counter(t.nameWithOwner).most_common(10)

[('dimdenGD/OldTweetDeck', 3),
 ('weaigc/gradio-chatbot', 3),
 ('MoyuruAizawa/Cropify', 3),
 ('Lakr233/BBackupp', 3),
 ('daboigbae/react-native-template', 3),
 ('esp-rs/esp-mbedtls', 3),
 ('Nahid1911/space-travelers-proj-react', 3),
 ('Rel1cx/eslint-plugin-react-ts', 3),
 ('redknotmiaoyuqiao/EyerH264Decoder', 2),
 ('ShuifaHe/STM32', 2)]

In [40]:
d.raw_data[d.raw_data.nameWithOwner == 'dimdenGD/OldTweetDeck']

Unnamed: 0,owner,name,stars,forks,watchers,isFork,isArchived,languages,languageCount,topics,...,createdAt,pushedAt,defaultBranchCommitCount,license,assignableUserCount,codeOfConduct,forkingAllowed,nameWithOwner,parent,embedding
8747,dimdenGD,OldTweetDeck,129,5,10,False,False,JavaScript: 5093,1,,...,2023-08-17T00:31:05Z,2023-08-18T19:55:14Z,14.0,MIT License,1,,True,dimdenGD/OldTweetDeck,,"[-0.093423545, -0.07952896, -0.038683187, -0.0..."
8748,dimdenGD,OldTweetDeck,97,4,8,False,False,JavaScript: 5093,1,,...,2023-08-17T00:31:05Z,2023-08-18T16:44:39Z,13.0,MIT License,1,,True,dimdenGD/OldTweetDeck,,"[-0.093423545, -0.07952896, -0.038683187, -0.0..."
8749,dimdenGD,OldTweetDeck,128,5,10,False,False,JavaScript: 5093,1,,...,2023-08-17T00:31:05Z,2023-08-18T19:55:14Z,14.0,MIT License,1,,True,dimdenGD/OldTweetDeck,,"[-0.093423545, -0.07952896, -0.038683187, -0.0..."


## Different ways to get an array out of a dataframe

In [3]:
import pandas as pd
import numpy as np
t = pd.DataFrame(np.random.randint(0, 10000, (10000, 2)), columns=['a', 'b'])

In [12]:
%timeit tt = t.a.to_numpy()  # 3.38 µs ± 10.1 ns

3.38 µs ± 10.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [13]:
%timeit tt = np.array(t.a.to_list())  # 351 µs ± 2.92 µs

351 µs ± 2.92 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


# Scrap: Github Repos Embeddings

In [1]:
from imbed_data_prep.github_repos import _DFLT_RAW_DATA_FILEPATH, _DFLT_CACHE_DIR
from imbed_data_prep.github_repos import *

In [1]:
log_flag_attr = 'verbose'
args = ()
getattr(next(iter(args), None), log_flag_attr, False)

False

In [4]:
from lkj import log_calls

class MyClass:
    def __init__(self, verbose=False):
        self.verbose = verbose

    @log_calls(log_flag_attr='verbose')
    def foo(self):
        print("Executing foo")

# Example usage
obj = MyClass(verbose=True)
obj.foo()  # This will log


Calling foo...
Executing foo
.... Done calling foo


# Scrap: 

In [None]:
# An attempt to use a different method than fixed size chunks, hoping we'll 
# never go over the API limits (max MBs, max number of items (embeddings), etc.)
# Here, we (try) to iterate over the items, and release maximal bundles of items that
# meet a condition.
# To implement, should look into itertools and creek

from typing import Callable, Iterable, TypeVar, Union


Item = TypeVar('Item')
Bundle = TypeVar('Bundle')


def return_none(item: Item) -> None:
    return None

def smart_bundles(
        items: Iterable[Item], 
        bundle_release_condition: Callable[[Bundle], bool],
        *,
        init_bundle: Callable[[], Bundle] = list, 
        update_bundle: Callable[[Bundle, Item], Union[Bundle, None]] = list.append, 
        exceptional_item_callback: Callable[[Item], None] = return_none,
    ):
    """ 
    Generate bundles of items that (number of items) maximal and meet a condition.

    A bundle is just an aggregate of items that can be initialized and updated. 
    
    The bundle_release_condition is a function that takes a bundle and returns True 
    if the bundle is releasable. 
    When a non-relasable bundle is found, the function will not release it, but 
    instead release the last bundle (which should be releasable), and take the item
    that tipped the last bundle over the edge and make it the first item in the a 
    new bundle.

    There is one special case to this: What if a bundle with one item doesn't meet 
    the criteria for realease? In this case, there's no "previous bundle" to release.
    This is where the `exceptional_item_callback` comes in play. 
    It will be called with the item that didn't fit in the bundle, and then be skipped
    (i.e. a new bundle will be started with the next item).

    This `exceptional_item_callback` allows us to control what happens when an item
    doesn't fit in the bundle. This could be things like raising or logging the 
    exception, saving the item to a file, etc.
    Note that the result of the callback is yielded, so if you want to skip such items,
    you should make sure that `exceptional_item_callback` returns None, since 
    the function will not yield None.

    Args:
        items: The items to bundle.
        bundle_release_condition: A function that takes a bundle and returns True 
            if the bundle is releasable.
        init_bundle: A function that returns a new bundle.
        update_bundle: A function that takes a bundle and an item, and returns the 
            updated bundle, or returns None (in which case it is assumed that the
            bundle was updated in place).
        exceptional_item_callback: A function that takes an item that didn't fit in 
            the bundle.

    Example:
    
    >>> def bundle_release_condition(bundle, max_sum=9):
    ...     return sum(bundle) <= max_sum
    >>> items = [1, 2, 3, 4, 5, 6, 7, 10, 1, 2]
    >>> list(smart_bundles(items, bundle_release_condition))
    [[1, 2, 3], [4, 5], [6], [7], [1, 2]]

    >>> list(smart_bundles(items, 
    ...     lambda x: x >= 9,
    ...     init_bundle=lambda: 0, 
    ...     update_bundle=lambda x, y: x + y,
    ...     exceptional_item_callback=lambda x: print(f"Exceptional item: {x}")))
    ... )
    Exceptional item: 10
    [[1, 2, 3], [4, 5], [6], [7], [1, 2]]
    
    """
