In [1]:
import datasets

# Read the raw sample file; every line of the file becomes one record.
with open('./texts/txt_samples.txt', 'r') as file:
    line = file.readlines()

# Drop every ';' and newline character from each raw line.
lines = [raw.replace(';', '').replace('\n', '') for raw in line]

# Wrap the cleaned lines in a Dataset with a single 'data' column.
dataset = datasets.Dataset.from_dict({'data': lines})
print(dataset)

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['data'],
    num_rows: 113
})


In [2]:
import tiktoken
# Look up the encoding tiktoken associates with the chat model and keep its
# name so the tokenizer can be re-created by name later.
chat_model = 'gpt-3.5-turbo'
encoding_method = tiktoken.encoding_for_model(chat_model)
encoding_method_name = encoding_method.name

In [3]:
import tiktoken

# Tokenizer built from the encoding name resolved earlier in the notebook.
tokenizer = tiktoken.get_encoding(encoding_method_name)


def tiktoken_len(text):
    """Return how many tokens `text` encodes to with the notebook-level `tokenizer`.

    An empty `disallowed_special` is passed to `encode`, i.e. no special-token
    text is listed as disallowed (see the tiktoken documentation for details).
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Separators handed to the splitter, from coarsest (blank line) to finest
# (empty string).
splitter_separators = ["\n\n", "\n", " ", ""]

# Chunk sizes and overlaps are measured with tiktoken_len, i.e. in tokens.
text_splitter = RecursiveCharacterTextSplitter(
    separators=splitter_separators,
    length_function=tiktoken_len,
    chunk_size=400,
    chunk_overlap=20,
)

In [5]:
# Split the first record's text and keep at most the first three chunks.
first_record_text = dataset[0]['data']
chunks = text_splitter.split_text(first_record_text)[:3]

In [6]:
import os

# Read the OpenAI key from the environment instead of editing it into the
# notebook, so a real key never ends up in the saved file.
# The original 'None' placeholder is kept as the fallback so this cell still
# runs when the variable is unset; set OPENAI_API_KEY before running the
# embedding cells.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY', 'None')

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding client used for every embed_documents call in this notebook.
model_name = 'text-embedding-ada-002'
embed = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, model=model_name)

In [8]:
# Smoke test: embed two short strings and show (number of vectors, vector length).
# The vector length of `res[0]` is reused below as the index dimension.
texts = ['this is the first chunk of text', 'then another second chunk of text is here']
res = embed.embed_documents(texts)
(len(res), len(res[0]))

(2, 1536)

In [9]:
import pinecone
import os

# Credentials come from the environment so that no secret is stored in the
# notebook. Raises KeyError if PINECONE_API_KEY is not set.
# NOTE(review): an API key was previously hardcoded in this cell; treat that
# key as exposed and rotate it.
pinecone.init(
    api_key=os.environ["PINECONE_API_KEY"],
    # Falls back to the environment this notebook was originally written against.
    environment=os.environ.get("PINECONE_ENVIRONMENT", "northamerica-northeast1-gcp"),
)

In [10]:
index_name = 'alyarz-pinecone'

# Create the index only when it is not already listed for this account.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    pinecone.create_index(
        name=index_name,
        dimension=len(res[0]),  # length of one embedding vector (1536 in the run above)
        metric='cosine',
    )

In [11]:
# Handle to the index named `index_name`; the upsert loop below writes through it.
index = pinecone.Index(index_name)

In [12]:
from tqdm.auto import tqdm
from uuid import uuid4

# Flush to Pinecone once at least this many chunks have accumulated.
batch_limit = 100


def _flush_batch(batch_texts, batch_metadatas):
    """Embed one batch of chunk texts and upsert the vectors into `index`.

    Every chunk gets a fresh random UUID string as its vector id. Relies on
    the notebook-level `embed` and `index` objects.

    Args:
        batch_texts: list of chunk strings to embed.
        batch_metadatas: list of metadata dicts, one per entry of `batch_texts`.
    """
    ids = [str(uuid4()) for _ in batch_texts]
    embeds = embed.embed_documents(batch_texts)
    index.upsert(vectors=zip(ids, embeds, batch_metadatas))


texts = []
metadatas = []

for record in tqdm(dataset):
    # metadata fields shared by every chunk of this record
    metadata = {
        'data': str(record['data']),
    }
    # split the record text into chunks
    record_texts = text_splitter.split_text(record['data'])
    # one metadata dict per chunk: chunk position, chunk text, shared fields
    record_metadatas = [{
        "chunk": j, "text": text, **metadata
    } for j, text in enumerate(record_texts)]
    # add this record's chunks to the pending batch
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # once the batch is large enough, send it and start a new one
    if len(texts) >= batch_limit:
        _flush_batch(texts, metadatas)
        texts = []
        metadatas = []

# send whatever is left over after the last full batch
if texts:
    _flush_batch(texts, metadatas)

100%|██████████| 113/113 [00:06<00:00, 17.05it/s]
