In [7]:
%%capture
# Install dependencies with the %pip magic so they land in the environment of
# the running kernel; `!pip` runs in a subshell and may resolve to a different
# Python installation than the one executing this notebook.

%pip install tiktoken==0.3.3
%pip install tqdm

In [None]:
# Use %pip (not !pip) so the package is installed into the running kernel's
# environment.
# NOTE(review): transformers is unpinned and is not imported in the visible
# cells of this notebook -- pin a version or drop this cell once confirmed.
%pip install transformers

In [9]:
from tqdm import tqdm
import tiktoken
import requests
import logging
import os

In [10]:
# Shared tiktoken encoding ('cl100k_base'); the cells below call its
# encode()/decode() methods to split documents into token chunks.
tokenizer = tiktoken.get_encoding('cl100k_base')

In [11]:
# Root directory that doc_iterator() walks to find the input documents.
DOC_DIR_PATH = './docs'

In [18]:
# Notebook logger: DEBUG level, messages echoed to the cell output through a
# StreamHandler.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object on every call, so re-running this
# cell used to stack another handler and print every message multiple times.
# Only attach a StreamHandler when none is present yet.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

In [12]:
def num_tokens_from_doc(doc: str) -> int:
    """
    Return the number of tokens in a text string.

    The text is encoded with the notebook-level ``tokenizer`` (created with
    ``tiktoken.get_encoding``) and the length of the resulting token list is
    returned.

    Args:
        doc: Text to tokenize.

    Returns:
        The number of tokens produced by ``tokenizer.encode(doc)``.
    """
    # Use the notebook-level `tokenizer`; the previous body referenced the
    # undefined name `encoding` and raised NameError on every call.
    return len(tokenizer.encode(doc))

In [21]:
# Number of tokens per passage window when documents are split into chunks.
CHUNK_SIZE = 256

In [14]:
def doc_iterator(dir_path: str):
    """
    Yield ``(filename, contents)`` for every regular file under ``dir_path``.

    The directory tree is walked recursively with ``os.walk``. Only the bare
    file name (not the path) is yielded, so files with the same name in
    different sub-directories are indistinguishable to the caller.

    Args:
        dir_path: Root directory to scan. A missing directory yields nothing.

    Yields:
        Tuples of ``(filename, file_contents)`` where ``file_contents`` is the
        whole file decoded as UTF-8 text.

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    for root, _, filenames in os.walk(dir_path):
        for filename in filenames:
            file_path = os.path.join(root, filename)
            # Skip entries that are not regular files (e.g. broken symlinks).
            if not os.path.isfile(file_path):
                continue
            # Explicit encoding: the platform default is locale dependent and
            # would make the tokenization differ between machines.
            with open(file_path, 'r', encoding='utf-8') as file:
                file_contents = file.read()
            # Yield after the file is closed so the handle is not held open
            # while the consumer processes the document.
            yield filename, file_contents

In [22]:
# Split every document under DOC_DIR_PATH into passages of exactly CHUNK_SIZE
# tokens and write each passage to its own file in the chunk directory.
n_docs = 0
n_passages = 0

# Directory that receives one file per passage. Created up front so a fresh
# checkout does not fail with FileNotFoundError on the first write.
CHUNK_DIR_PATH = './chunks'
os.makedirs(CHUNK_DIR_PATH, exist_ok=True)

for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
    print(f"doc_name: {doc_name}")
    # Strip only the final extension; splitting on the first '.' mapped names
    # such as 'a.b.txt' and 'a.c.txt' to the same id and overwrote chunks.
    doc_id = os.path.splitext(doc_name)[0]
    tokens = tokenizer.encode(doc)
    chunk_id = 1  # 1-based passage counter, restarted for every document
    n_docs += 1
    for i in range(0, len(tokens), CHUNK_SIZE):
        chunk_tokens = tokens[i: i+CHUNK_SIZE]
        # Only full-size windows are kept: a trailing window shorter than
        # CHUNK_SIZE is dropped. Compare against CHUNK_SIZE (not a literal)
        # so changing the constant keeps the filter in sync.
        if len(chunk_tokens) < CHUNK_SIZE:
            continue
        chunk = tokenizer.decode(chunk_tokens)
        chunk_path = os.path.join(CHUNK_DIR_PATH, f'{doc_id}_{chunk_id}')
        # Explicit encoding so non-ASCII passages are written identically on
        # every platform.
        with open(chunk_path, 'w', encoding='utf-8') as f:
            f.write(chunk)
        chunk_id += 1
        n_passages += 1
logger.info(f'{n_docs} documents segmented into {n_passages} passages')

1it [00:00,  6.97it/s]
1 documents segmented into 15 passages


doc_name: Cleanroom_FAQ.txt


In [48]:
import json
import boto3

# boto3 client for the "sagemaker-runtime" service; passed to get_embedding().
smr_client = boto3.client("sagemaker-runtime")
# Name of the endpoint that get_embedding() invokes (read there as a global).
# NOTE(review): hard-coded, timestamped endpoint name -- confirm it still exists
# before re-running.
endpoint_name = "st-paraphrase-mpnet-base-v2-2023-04-14-04-17-29-625-endpoint"

def get_embedding(smr_client, text_input):
    """
    Invoke the endpoint named by the global ``endpoint_name`` for one text.

    The request is a JSON document with the text wrapped in a one-element
    ``"inputs"`` list plus a fixed ``"parameters"`` mapping.

    NOTE(review): the parameter names look like text-generation settings;
    whether the endpoint reads them is not visible from this notebook.

    Args:
        smr_client: Client object exposing ``invoke_endpoint`` (the notebook
            passes a boto3 "sagemaker-runtime" client).
        text_input: Text to send to the endpoint.

    Returns:
        The response body decoded as a UTF-8 string (not parsed).
    """
    target_endpoint = endpoint_name

    request_body = json.dumps(
        {
            "inputs": [text_input],
            "parameters": {
                "max_new_tokens": 50,
                "temperature": 0,
                "min_length": 10,
                "no_repeat_ngram_size": 2,
            },
        }
    )

    endpoint_response = smr_client.invoke_endpoint(
        EndpointName=target_endpoint,
        Body=request_body,
        ContentType="application/json",
    )

    raw_payload = endpoint_response['Body'].read()
    return raw_payload.decode('utf8')

In [52]:
# Smoke test: request an embedding for the second "Question" line of the FAQ
# document and print the raw endpoint response.
for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
    # Every document other than the FAQ file is skipped.
    if doc_name != "Cleanroom_FAQ.txt":
        continue
    lines = doc.splitlines()
    # Partition the file by line prefix into question and answer lines.
    q_lines = [text for text in lines if text.startswith('Question')]
    a_lines = [text for text in lines if text.startswith('Answer')]
    second_question_response = get_embedding(smr_client, q_lines[1])
    print(second_question_response)
        
        

2it [00:00,  8.52it/s]

b'{\n  "sentence_embeddings":[\n    [\n      0.01183790247887373,\n      0.0629643127322197,\n      -0.010602585040032864,\n      0.00385498208925128,\n      -0.08580745756626129,\n      0.06613323092460632,\n      0.06986038386821747,\n      0.06612421572208405,\n      -0.07077949494123459,\n      0.10049458593130112,\n      0.2561466693878174,\n      -0.05845275893807411,\n      0.060686249285936356,\n      0.06303957104682922,\n      -0.14984124898910522,\n      -0.014457563869655132,\n      0.029539579525589943,\n      0.10507740080356598,\n      -0.017891407012939453,\n      0.04731087386608124,\n      0.06511484086513519,\n      -0.11624763906002045,\n      -0.002014192519709468,\n      0.01344772893935442,\n      -0.024276848882436752,\n      -0.057371679693460464,\n      -0.0956144779920578,\n      -0.05627714842557907,\n      0.04029081016778946,\n      -0.05579474940896034,\n      -0.018311265856027603,\n      0.06787178665399551,\n      0.013893834315240383,\n      0.1188027


