# OpenAI backend

## single word openai embeddings

Here, we'll compute OpenAI embeddings for the most frequent English words and upload them to a GitHub repository in various formats (which we will compare): csv, zipped csv, pickle, and possibly recode.

In [41]:
def chunker(seq, size):
    """Lazily split ``seq`` into consecutive slices of at most ``size`` items.

    ``seq`` must support ``len`` and slicing; the last slice may be shorter
    than ``size``.
    """
    return (seq[start:start + size] for start in range(0, len(seq), size))

from oa import openai

def oa_embeddings(terms, engine='text-embedding-ada-002'):
    """Fetch embeddings for one term or a collection of terms.

    A single string is treated as a one-element list. All terms are sent in
    one ``openai.Embedding.create`` call, and the items of the response's
    ``'data'`` field are paired with the terms by position.

    Returns a dict mapping each term to the ``'embedding'`` entry of its
    response item.
    """
    batch = [terms] if isinstance(terms, str) else terms
    response = openai.Embedding.create(input=batch, engine=engine)
    return {
        term: item['embedding'] for term, item in zip(batch, response['data'])
    }

from py2store import PickleStore

def get_and_save_embeddings(terms, save_store=None, chk_size=2000):
    """Compute embeddings for ``terms`` chunk by chunk and save one item per chunk.

    Args:
        terms: Iterable of terms to embed (iterating a mapping yields its keys).
        save_store: Mutable mapping the chunks are written to. Defaults to a
            ``PickleStore('.')`` created at call time.
        chk_size: Number of terms sent to ``oa_embeddings`` per request and
            stored per saved item.

    Each chunk is stored under ``openai_word_embeddings_{first}_{last}.pkl``
    where ``first``/``last`` are the nominal chunk bounds (``last`` is not
    clamped to ``len(terms)`` for a short final chunk).
    """
    from lkj import print_progress

    # Build the default store lazily instead of at function-definition time.
    if save_store is None:
        save_store = PickleStore('.')

    terms = list(terms)
    n = int(len(terms) / chk_size)
    for i, chk in enumerate(chunker(terms, chk_size)):
        print_progress(f"({i}/{n})")
        first = i * chk_size
        last = (i + 1) * chk_size
        save_store[f'openai_word_embeddings_{first}_{last}.pkl'] = oa_embeddings(chk)

        

In [3]:
from idiom import most_frequent_words

words = most_frequent_words()
list(words)[:10]

['the', 'of', 'and', 'to', 'a', 'in', 'for', 'is', 'on', 'that']

In [4]:
list(words.items())[:10]

[('the', 23135851162),
 ('of', 13151942776),
 ('and', 12997637966),
 ('to', 12136980858),
 ('a', 9081174698),
 ('in', 8469404971),
 ('for', 5933321709),
 ('is', 4705743816),
 ('on', 3750423199),
 ('that', 3400031103)]

In [42]:
# get_and_save_embeddings(words)
# 3m41s to run on my 100_000 word list!
    

In [3]:
from typing import Mapping, MutableMapping
# Go through all src data in groups of `aggregate_chk_size` chunks (5 by default), merge each group's dicts into one, and save the result in targ
def rechunk_and_save(
        src: Mapping,
        targ: MutableMapping,
        *,
        aggregate_chk_size = 5,
        original_chk_size = 2000
):
    """Merge consecutive groups of ``src`` chunks and store them in ``targ``.

    The keys of ``src`` are taken in iteration order and must be
    ``(first, last)`` pairs; the values must be dicts. Every run of
    ``aggregate_chk_size`` keys is merged into a single dict, which is saved
    under ``(i * size, (i + 1) * size)`` with
    ``size = original_chk_size * aggregate_chk_size`` and ``i`` the group index.
    """
    keys = list(src)
    merged_size = original_chk_size * aggregate_chk_size
    index_groups = chunker(range(len(src)), aggregate_chk_size)
    for group_idx, positions in enumerate(index_groups):
        group_keys = keys[positions[0]:positions[-1] + 1]
        merged = {}
        for first, last in group_keys:
            merged.update(src[first, last])
        start = group_idx * merged_size
        targ[start, start + merged_size] = merged


from dol import KeyTemplate

# Template for the chunk file names; both fields are restricted to digits.
# Used below to convert between file-name strings and (first, last) tuples.
st = KeyTemplate(
    'openai_word_embeddings_{first}_{last}.pkl', field_patterns=dict(first='[0-9]+', last='[0-9]+')
)

from py2store import PickleStore
from dol import wrap_kvs, filt_iter, Pipe, cached_keys

# Store wrapper applied to the pickle stores below:
#   1. keep only keys ending in '.pkl';
#   2. expose keys as (first, last) integer tuples, converting to and from the
#      underlying file-name strings through the `st` template.
embeddings_wrapper = Pipe(
    filt_iter(filt=lambda k: k.endswith('.pkl')),
    wrap_kvs(
        key_of_id=lambda x: tuple(map(int, st.str_to_tuple(x))), 
        id_of_key=st.tuple_to_str
    ),
)



In [5]:
# Source store: the existing chunk pickles under ./embeddings.
src = embeddings_wrapper(PickleStore('./embeddings'))
# Presumably makes iteration over src go through a sorted, cached key list
# (keys_cache=sorted) -- TODO confirm against dol.cached_keys.
src = cached_keys(src, keys_cache=sorted)
# Target store: where the merged chunks would be written.
targ = embeddings_wrapper(PickleStore('./embeddings_2'))

# Left disabled so that re-running the cell does not rewrite ./embeddings_2.
# rechunk_and_save(src, targ)

In [48]:
import numpy as np
from dol import wrap_kvs

wordvec_store = embeddings_wrapper(PickleStore('./embeddings'))
# Merge the first three chunks (keys (0, 10000), (10000, 20000), (20000, 30000))
# into one plain dict. dict(**a, **b) raises TypeError if a key is repeated
# across chunks or is not a string.
wordvec = dict(**wordvec_store[0, 10000], **wordvec_store[10000, 20000], **wordvec_store[20000, 30000])
# wordvec = wrap_kvs(wordvec, obj_of_data=np.array)

In [7]:
from typing import Mapping

Word2Vec = Mapping[str, list]

def try_wordvec_arthimetic(
        wordvec: Mapping,
        obj_word_1 = 'king',
        obj_word_2 = 'queen',
        feature_word_1 = 'man',
        feature_word_2 = 'woman',
):
    """Print cosine similarities illustrating word-vector arithmetic.

    First prints the pairwise similarities between the four words, then builds
    ``obj_word_1 - feature_word_1 + feature_word_2`` and prints the similarity
    of that new vector to each of the four words.

    Args:
        wordvec: Mapping from word to its vector; must contain all four words.
        obj_word_1, obj_word_2: The two "object" words (e.g. king / queen).
        feature_word_1, feature_word_2: The two "feature" words (e.g. man / woman).
    """
    from scipy.spatial.distance import cosine
    import numpy as np

    def similarity(a, b):
        # scipy's `cosine` is a distance; turn it into a similarity.
        return 1 - cosine(a, b)

    word_pairs = (
        (obj_word_1, obj_word_2),
        (obj_word_1, feature_word_1),
        (obj_word_2, feature_word_1),
        (obj_word_2, feature_word_2),
        (feature_word_1, feature_word_2),
    )
    for left, right in word_pairs:
        score = similarity(wordvec[left], wordvec[right])
        print(f'"{left}" and "{right}" similarity: {score:.4f}')
    print("")

    analogy = (
        np.array(wordvec[obj_word_1])
        - np.array(wordvec[feature_word_1])
        + np.array(wordvec[feature_word_2])
    )
    for word in (obj_word_1, feature_word_1, feature_word_2, obj_word_2):
        score = similarity(analogy, wordvec[word])
        print(f'New vector and "{word}" similarity: {score:.4f}')
    print("\n")

try_wordvec_arthimetic(wordvec)
try_wordvec_arthimetic(wordvec, 'paris', 'rome', 'france', 'italy')

    

"king" and "queen" similarity: 0.9155
"king" and "man" similarity: 0.8138
"queen" and "man" similarity: 0.8259
"queen" and "woman" similarity: 0.8799
"man" and "woman" similarity: 0.9029

New vector and "king" similarity: 0.9155
New vector and "man" similarity: 0.6542
New vector and "woman" similarity: 0.8342
New vector and "queen" similarity: 0.8848


"paris" and "rome" similarity: 0.8460
"paris" and "france" similarity: 0.8926
"rome" and "france" similarity: 0.8258
"rome" and "italy" similarity: 0.8579
"france" and "italy" similarity: 0.8816

New vector and "paris" similarity: 0.8903
New vector and "france" similarity: 0.7291
New vector and "italy" similarity: 0.9006
New vector and "rome" similarity: 0.8270




In [36]:
from typing import Mapping

Word2Vec = Mapping[str, list]

def try_wordvec_arthimetic(t: Mapping):
    """Print king/queen/man/woman cosine similarities using self-describing f-strings.

    ``t`` maps words to vectors and must contain 'king', 'queen', 'man' and
    'woman'. The pairwise similarities are printed first, then the
    similarities of ``king - man + woman`` to each of the four words.

    The ``{expr=}`` f-string form echoes the expression text itself, so the
    printed labels are tied to the exact expressions written below.
    """
    from scipy.spatial.distance import cosine
    # Imported locally so the function does not depend on an earlier cell
    # having bound a global `np`.
    import numpy as np

    def cosine_similarity(vec1, vec2):
        # scipy's `cosine` is a distance; turn it into a similarity.
        return 1 - cosine(vec1, vec2)

    print(f"{cosine_similarity(t['king'], t['queen'])=}")
    print(f"{cosine_similarity(t['king'], t['man'])=}")
    print(f"{cosine_similarity(t['queen'], t['man'])=}")
    print(f"{cosine_similarity(t['queen'], t['woman'])=}")
    print(f"{cosine_similarity(t['man'], t['woman'])=}")
    print("")
    v = np.array(t['king']) - np.array(t['man']) + np.array(t['woman'])

    print(f"{cosine_similarity(v, t['king'])=}")
    print(f"{cosine_similarity(v, t['man'])=}")
    print(f"{cosine_similarity(v, t['woman'])=}")
    print(f"{cosine_similarity(v, t['queen'])=}")

cosine_similarity(t['king'], t['queen'])=0.9154539883981325
cosine_similarity(t['king'], t['man'])=0.8138182688759626
cosine_similarity(t['queen'], t['man'])=0.8259629343785966
cosine_similarity(t['queen'], t['woman'])=0.8798956693397046
cosine_similarity(t['man'], t['woman'])=0.9029889578791404

cosine_similarity(v, t['king'])=0.9156055782123844
cosine_similarity(v, t['man'])=0.6543302273978167
cosine_similarity(v, t['woman'])=0.8342070103171549
cosine_similarity(v, t['queen'])=0.8848948703426839


In [49]:
import time
import pickle
import gzip
import os

def get_default_codecs():
    """Build the default codec registry used by the storage benchmark.

    Returns:
        A dict ``{name: {'encoder': enc, 'decoder': dec}}`` where
        ``enc(filename, data)`` writes ``data`` (a dict of key -> vector) to
        ``filename`` and ``dec(filename)`` reads it back as a dict.

    A codec is only registered when the modules it needs can be imported.
    """
    codecs = {}

    # Attempt to add gzipped pickle codec
    try:
        import pickle
        import gzip

        def _gzipped_pickle_encode(filename, data):
            # Close the gzip stream explicitly so the file is complete on disk
            # before its size is measured or it is read back.
            with gzip.open(filename, 'wb') as fp:
                pickle.dump(data, fp)

        def _gzipped_pickle_decode(filename):
            # NOTE: unpickling can execute arbitrary code; only use on trusted files.
            with gzip.open(filename, 'rb') as fp:
                return pickle.load(fp)

        codecs['gzipped_pickle'] = {
            'encoder': _gzipped_pickle_encode,
            'decoder': _gzipped_pickle_decode,
        }
    except ImportError:
        pass

    # Attempt to add numpy npz codec
    try:
        import numpy as np

        def _npz_encode(filename, data):
            # Writing through an open file object stops numpy from appending
            # '.npz' to the name. Keys and vectors are stored as two plain
            # arrays so any key is safe and nothing has to be pickled.
            # Assumes all vectors have the same length.
            with open(filename, 'wb') as fp:
                np.savez_compressed(
                    fp,
                    keys=np.array(list(data)),
                    values=np.array(list(data.values())),
                )

        def _npz_decode(filename):
            with np.load(filename) as npz:
                keys = npz['keys'].tolist()
                values = npz['values']
            return dict(zip(keys, values))

        codecs['numpy_npz'] = {
            'encoder': _npz_encode,
            'decoder': _npz_decode,
        }
    except ImportError:
        pass

    # Attempt to add hdf5 codec
    # try:
    #     import h5py
    #     def _hdf5_encode(data, filename):
    #         with h5py.File(filename, 'w') as f:
    #             for key, values in data.items():
    #                 f.create_dataset(key, data=values, compression="gzip")

    #     def _hdf5_decode(filename):
    #         with h5py.File(filename, 'r') as f:
    #             return {key: f[key][()] for key in f.keys()}

    #     codecs['hdf5'] = {
    #         'encoder': lambda f, d: _hdf5_encode(d, f),
    #         'decoder': lambda f: _hdf5_decode(f)
    #     }
    # except ImportError:
    #     pass

    # Attempt to add parquet codec
    try:
        import pandas as pd

        def _parquet_encode(filename, data):
            # One column per key; rows are the vector components.
            frame = pd.DataFrame.from_dict(data, orient='index').transpose()
            frame.to_parquet(filename, compression='gzip')

        def _parquet_decode(filename):
            return pd.read_parquet(filename).to_dict(orient='list')

        codecs['parquet'] = {
            'encoder': _parquet_encode,
            'decoder': _parquet_decode,
        }
    except ImportError:
        pass

    return codecs


# You can then use this function in your benchmark_storage function
# codecs = get_default_codecs()
# results = benchmark_storage(data, codecs)


def benchmark_storage(data, codecs=None, *, verbose=True):
    """Time and size a round trip of ``data`` through each codec.

    Args:
        data: Dict of key -> vector to encode.
        codecs: Mapping ``{name: {'encoder': enc, 'decoder': dec}}`` with
            ``enc(filename, data)`` and ``dec(filename)``. Defaults to
            ``get_default_codecs()``.
        verbose: Whether to log which codec is being benchmarked.

    Returns:
        Dict ``{name: stats}`` with keys ``decoded_n_bytes``,
        ``encoded_n_bytes``, ``encoding_time_in_seconds`` and
        ``decoding_time_in_seconds``. A codec that raises is reported with a
        printed error and left out of the result.

    Each codec writes to ``temp_{name}.data`` in the current directory; the
    file is removed afterwards whether or not the codec succeeded.
    """
    from functools import partial

    try:
        from lkj import print_progress, clog
    except ImportError:
        # lkj is only used for progress logging; fall back to plain print.
        def _clog(msg):
            if verbose:
                print(msg)
    else:
        _clog = partial(clog, verbose, log_func=print_progress)

    if codecs is None:
        codecs = get_default_codecs()

    results = {}
    for name, codec in codecs.items():
        _clog(f'Benchmarking {name}')

        filename = f'temp_{name}.data'

        try:
            # perf_counter is monotonic, unlike wall-clock time.time().
            start_time = time.perf_counter()
            codec['encoder'](filename, data)
            encoding_time = time.perf_counter() - start_time

            encoded_size = os.path.getsize(filename)

            start_time = time.perf_counter()
            decoded_data = codec['decoder'](filename)
            decoding_time = time.perf_counter() - start_time

            # Assuming float64 (8 bytes per float)
            decoded_size = sum(len(v) * 8 for v in decoded_data.values())

            results[name] = {
                'decoded_n_bytes': decoded_size,
                'encoded_n_bytes': encoded_size,
                'encoding_time_in_seconds': encoding_time,
                'decoding_time_in_seconds': decoding_time
            }
        except Exception as e:
            # Best effort: one failing codec must not abort the whole benchmark.
            print(f"Error: {e}")
        finally:
            # Always clean up; the encoder may have failed before creating the file.
            try:
                os.remove(filename)
            except FileNotFoundError:
                pass

    return results


# Example usage
# data = {"key1": [0.1, 0.2, ...], "key2": [1.1, 1.2, ...]}
# results = benchmark_storage(data)
# print(results)


In [57]:
import pandas as pd
# results = benchmark_storage(wordvec)
# pd.DataFrame.from_dict(results)

# 	                        gzipped_csv	gzipped_pickle	parquet
# decoded_n_bytes	            1.310808e+06	1.310808e+06	1.310808e+06
# encoded_n_bytes	            3.925119e+08	1.761167e+08	2.718476e+08
# encoding_time_in_seconds	1.388668e+02	2.779556e+02	3.537151e+01
# decoding_time_in_seconds	8.359523e+00	2.618939e+00	5.103618e+00

Unnamed: 0,gzipped_csv,gzipped_pickle,parquet
decoded_n_bytes,1310808.0,1310808.0,1310808.0
encoded_n_bytes,392511900.0,176116700.0,271847600.0
encoding_time_in_seconds,138.8668,277.9556,35.37151
decoding_time_in_seconds,8.359523,2.618939,5.103618


In [1]:
from dol import ValueCodecs, Pipe, Files, KeyTemplate

# Each wrapper maps file names to (from_word, to_word) integer keys via the
# key template, and un-gzips + unpickles the stored values.
# The templates are raw strings so that `\d` reaches the template as a regex
# pattern instead of being an invalid string escape sequence.
# The source names use 5-digit zero-padded indices, the target names 6-digit ones.
src_wrap = Pipe(
    KeyTemplate(
        r'openai_word_embeddings_{from_word:05.0f:\d+}_{to_word:05.0f:\d+}.pkl.gz',
        from_str_funcs=dict(from_word=int, to_word=int),
    ).key_codec(), 
    ValueCodecs.pickle() + ValueCodecs.gzip()
)

targ_wrap = Pipe(
    KeyTemplate(
        r'openai_word_embeddings_{from_word:06.0f:\d+}_{to_word:06.0f:\d+}.pkl.gz',
        from_str_funcs=dict(from_word=int, to_word=int),
    ).key_codec(), 
    ValueCodecs.pickle() + ValueCodecs.gzip()
)

src = src_wrap(Files('./embeddings'))
targ = targ_wrap(Files('./embeddings_2'))
# Peek at the first item of the source store.
k, v = src.head()

Files(rootdir='./embeddings_2', subpath='', pattern_for_field=None, max_levels=None, include_hidden=False, assert_rootdir_existence=False)

In [2]:
from slang import fixed_step_chunker
from functools import partial

# Re-split every source chunk into pieces of `chk_size` items and save each
# piece in `targ` under its own (start, end) key.
chk_size = 2500
# NOTE: this rebinds the module-level name `chunker`, replacing the slicing
# helper defined at the top of the notebook.
chunker = partial(fixed_step_chunker, chk_size=chk_size)
for k in sorted(src):
    v = src[k]
    # Only the start index of the source key is needed; new end indices are
    # derived from chk_size.
    start_idx, _ = k
    for i, chk in enumerate(chunker(v.items())):
        new_start_idx = start_idx + i * chk_size
        new_end_idx = new_start_idx + chk_size
        # presumably each chk is a sequence of (word, vector) pairs -- TODO confirm
        targ[new_start_idx, new_end_idx] = dict(chk)
        



  snip_of_unichr_code = (nan * ones(unichr_code_of_snip.max() + 1)).astype(int)


In [93]:
list(src)

[(90000, 100000),
 (0, 10000),
 (50000, 60000),
 (60000, 70000),
 (70000, 80000),
 (10000, 20000),
 (40000, 50000),
 (30000, 40000),
 (80000, 90000),
 (20000, 30000)]

In [94]:
t  = src[0, 10000]

## tiktoken


In [7]:
import tiktoken
print(dir(tiktoken))

['Encoding', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '_tiktoken', 'core', 'encoding_for_model', 'encoding_name_for_model', 'get_encoding', 'list_encoding_names', 'model', 'registry']


In [4]:
import tiktoken

EMBEDDING_MODEL = "text-embedding-ada-002"
GPT_MODEL = "gpt-3.5-turbo"

def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Count the tokens ``text`` encodes to with the tiktoken encoding for ``model``."""
    token_ids = tiktoken.encoding_for_model(model).encode(text)
    return len(token_ids)

num_tokens('hi my name is anticonstitutional')


tiktoken.encoding_for_model(GPT_MODEL).encode('hi my name is anticonstitutional')


[6151, 856, 836, 374, 3276, 1965, 10663, 278]

In [5]:
tiktoken.encoding_for_model(GPT_MODEL).encode('the be to of and')


[1820, 387, 311, 315, 323]

In [7]:
t = tiktoken.encoding_for_model(GPT_MODEL)
dir(t)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_core_bpe',
 '_encode_bytes',
 '_encode_only_native_bpe',
 '_encode_single_piece',
 '_mergeable_ranks',
 '_pat_str',
 '_special_tokens',
 'decode',
 'decode_batch',
 'decode_bytes',
 'decode_bytes_batch',
 'decode_single_token_bytes',
 'decode_tokens_bytes',
 'decode_with_offsets',
 'encode',
 'encode_batch',
 'encode_ordinary',
 'encode_ordinary_batch',
 'encode_single_token',
 'encode_with_unstable',
 'eot_token',
 'max_token_value',
 'n_vocab',
 'name',
 'special_tokens_set',
 'token_byte_values']

In [20]:
print(f"{t.max_token_value=}, {t.n_vocab=}")

t.max_token_value=100276, t.n_vocab=100277


In [14]:
t.decode(range(100))

'!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~������'

In [19]:
list(map(lambda x: t.decode([x]), (range(300, 400))))

['as',
 'el',
 'ct',
 'nd',
 ' in',
 ' h',
 'ent',
 'id',
 ' n',
 'am',
 '           ',
 ' to',
 ' re',
 '--',
 ' {',
 ' of',
 'om',
 ');\n',
 'im',
 '\r\n',
 ' (',
 'il',
 '//',
 ' and',
 'ur',
 'se',
 ' l',
 'ex',
 ' S',
 'ad',
 ' "',
 'ch',
 'ut',
 'if',
 '**',
 ' }',
 'em',
 'ol',
 '                ',
 'th',
 ')\n',
 ' {\n',
 ' g',
 'ig',
 'iv',
 ',\n',
 'ce',
 'od',
 ' v',
 'ate',
 ' T',
 'ag',
 'ay',
 ' *',
 'ot',
 'us',
 ' C',
 ' st',
 ' I',
 'un',
 'ul',
 'ue',
 ' A',
 'ow',
 " '",
 'ew',
 ' <',
 'ation',
 '()',
 ' for',
 'ab',
 'ort',
 'um',
 'ame',
 ' is',
 'pe',
 'tr',
 'ck',
 '�',
 ' y',
 'ist',
 '----',
 '.\n\n',
 'he',
 ' e',
 'lo',
 ' M',
 ' be',
 'ers',
 ' on',
 ' con',
 'ap',
 'ub',
 ' P',
 '               ',
 'ass',
 'int',
 '>\n',
 'ly',
 'urn']

In [16]:
t.decode([1820, 387, 311, 315, 323])

'the be to of and'