In [7]:
%%capture

!pip install tiktoken==0.3.3
!pip install tqdm

In [None]:
!pip install transformers

In [4]:
from tqdm import tqdm
import tiktoken
import requests
import logging
import os

In [5]:
# Shared tiktoken encoder (cl100k_base) used below to encode documents into
# tokens and to decode token slices back into text.
tokenizer = tiktoken.get_encoding('cl100k_base')

In [6]:
# Root directory that doc_iterator walks to load the source documents.
DOC_DIR_PATH = './docs'

In [7]:
# Configure the 'sagemaker' logger to emit DEBUG-and-above records through a
# StreamHandler.
logger = logging.getLogger('sagemaker')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object on every call, so re-running this
# cell would otherwise stack another StreamHandler and duplicate every message.
if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
    logger.addHandler(logging.StreamHandler())

In [8]:
def num_tokens_from_doc(doc: str, encoding=None) -> int:
    """
    Return the number of tokens in a text string.

    Args:
        doc: Text to tokenize.
        encoding: Optional encoder exposing ``encode(str)``; when omitted the
            notebook-level ``tokenizer`` is used.

    Returns:
        Length of the token sequence produced for ``doc``.
    """
    # The previous body referenced an undefined name ``encoding`` and raised
    # NameError; the encoder created in this notebook is called ``tokenizer``.
    encoder = tokenizer if encoding is None else encoding
    return len(encoder.encode(doc))

In [9]:
# Number of tokens per chunk when documents are split (step size and slice
# length of the chunking loop).
CHUNK_SIZE = 256

In [10]:
def doc_iterator(dir_path: str):
    """
    Recursively yield ``(filename, contents)`` for every regular file under ``dir_path``.

    Args:
        dir_path: Root directory to walk.

    Yields:
        Tuples of the file's base name (without its directory) and its full
        text, decoded as UTF-8.
    """
    for root, dirnames, filenames in os.walk(dir_path):
        # os.walk returns entries in arbitrary order; sorting in place makes the
        # traversal (and anything numbered from it) reproducible across runs.
        dirnames.sort()
        for filename in sorted(filenames):
            file_path = os.path.join(root, filename)
            if not os.path.isfile(file_path):
                continue
            # Decode explicitly as UTF-8 so non-ASCII documents do not depend on
            # the platform's default encoding.
            with open(file_path, 'r', encoding='utf-8') as file:
                file_contents = file.read()
            # Yield after the handle is closed so it is not held open while the
            # consumer processes the document.
            yield filename, file_contents

按照token切分文档

In [12]:
!mkdir -p chunks

In [None]:
# Split every document under DOC_DIR_PATH into CHUNK_SIZE-token passages and
# write each passage to ./chunks/<doc_id>_<chunk_id>.
n_docs = 0
n_passages = 0

# Create the output directory here so the cell does not depend on a separate
# shell step having been run first.
os.makedirs('./chunks', exist_ok=True)

for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
    print(f"doc_name: {doc_name}")
    # splitext strips only the final extension, so "a.b.txt" and "a.c.txt" keep
    # distinct ids instead of both collapsing to "a".
    doc_id = os.path.splitext(doc_name)[0]
    tokens = tokenizer.encode(doc)
    chunk_id = 1
    n_docs += 1
    for i in range(0, len(tokens), CHUNK_SIZE):
        chunk_tokens = tokens[i: i+CHUNK_SIZE]
        # Only full-size chunks are kept; compare against CHUNK_SIZE rather than a
        # literal 256 so the filter stays correct when the constant is changed.
        # NOTE(review): this drops the trailing partial chunk and any document
        # shorter than CHUNK_SIZE tokens — confirm that is intended.
        if len(chunk_tokens) < CHUNK_SIZE:
            continue
        chunk = tokenizer.decode(chunk_tokens)
        # Write as UTF-8 explicitly so non-ASCII passages do not depend on the
        # platform's default encoding.
        with open(f'./chunks/{doc_id}_{chunk_id}', 'w', encoding='utf-8') as f:
            f.write(chunk)
        chunk_id += 1
        n_passages += 1
logger.info(f'{n_docs} documents segmented into {n_passages} passages')

In [20]:
import json
import boto3

# SageMaker runtime client passed to the embedding helpers to invoke endpoints.
smr_client = boto3.client("sagemaker-runtime")
# Name of the endpoint invoked by get_embedding.
endpoint_name = "st-paraphrase-mpnet-base-v2-2023-04-14-04-17-29-625-endpoint"

def get_embedding(smr_client, text_input):
    """
    Invoke the endpoint named by ``endpoint_name`` with a single text input.

    Args:
        smr_client: Client object exposing ``invoke_endpoint``.
        text_input: Text sent as the only element of the ``inputs`` list.

    Returns:
        The response body decoded as a UTF-8 string.
    """
    request_body = json.dumps({
        "inputs": [text_input],
        "parameters": {
            "max_new_tokens": 50,
            "temperature": 0,
            "min_length": 10,
            "no_repeat_ngram_size": 2,
        },
    })
    response = smr_client.invoke_endpoint(
        EndpointName=endpoint_name,
        Body=request_body,
        ContentType="application/json",
    )
    raw_body = response['Body'].read()
    return raw_body.decode('utf8')

测试paraphrase-mpnet-base-v2的sentence2embedding模型

In [29]:
import json
import numpy as np

def parseJson2vector(input:str):
    """
    Decode a JSON document and return the value stored under "sentence_embeddings".

    The value is returned exactly as ``json.loads`` produced it; no conversion
    to a numpy array is performed here.
    """
    return json.loads(input)["sentence_embeddings"]

def calulate_cosine(vector1,vector2):
    """
    Return the cosine similarity of two vectors: their dot product divided by
    the product of their Euclidean norms.
    """
    dot_product = np.dot(vector1, vector2)
    norm_product = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    return dot_product / norm_product

def calulate_cosine_between_sentence_pair(smr_client, q_str, a_str):
    """
    Embed two sentences through ``get_embedding`` and return their cosine similarity.

    Each response is parsed with ``parseJson2vector`` and its first element is
    used as the sentence vector.
    """
    # The generator is consumed left to right, so the question is embedded
    # before the answer.
    q_vec, a_vec = (
        parseJson2vector(get_embedding(smr_client, sentence))[0]
        for sentence in (q_str, a_str)
    )
    return calulate_cosine(q_vec, a_vec)

In [None]:
# For each question line in Cleanroom_FAQ.txt, print the answer line with the
# highest cosine similarity according to the get_embedding endpoint.
for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
    if doc_name != "Cleanroom_FAQ.txt":
        continue
    lines = doc.splitlines()
    q_lines = [ line for line in lines if line.startswith('Question') ]
    a_lines = [ line for line in lines if line.startswith('Answer') ]
    # Embed every answer once up front. Scoring each (question, answer) pair
    # through the pairwise helper re-embeds both texts every time, i.e.
    # 2 * len(q_lines) * len(a_lines) endpoint calls instead of
    # len(q_lines) + len(a_lines).
    a_vecs = [ parseJson2vector(get_embedding(smr_client, a_line))[0] for a_line in a_lines ]
    for q_line in q_lines:
        q_vec = parseJson2vector(get_embedding(smr_client, q_line))[0]
        # Starting from 0.0 means only strictly positive similarities can win;
        # with no positive match the empty string is printed.
        max_cos = 0.0
        max_a_line = ""
        for a_line, a_vec in zip(a_lines, a_vecs):
            cos_val = calulate_cosine(q_vec, a_vec)
            if cos_val > max_cos:
                max_cos = cos_val
                max_a_line = a_line
        print(f'{max_cos} | {q_line} | {max_a_line}')

测试bloomz的sentence2embedding模型

In [None]:
# Name of the SageMaker endpoint invoked by get_bloomz_embedding.
TEXT_EMBEDDING_MODEL_ENDPOINT_NAME='huggingface-textembedding-bloom-7b1-fp1-2023-04-13-11-29-28-700'

def get_bloomz_embedding(smr_client, text_input):
    """
    Request an embedding for one text from TEXT_EMBEDDING_MODEL_ENDPOINT_NAME.

    Args:
        smr_client: Client object exposing ``invoke_endpoint``.
        text_input: Text sent as the only element of ``text_inputs``.

    Returns:
        The first element of the ``embedding`` field in the JSON response.
    """
    request_bytes = json.dumps({'text_inputs': [text_input]}).encode('utf-8')
    response = smr_client.invoke_endpoint(
        EndpointName=TEXT_EMBEDDING_MODEL_ENDPOINT_NAME,
        ContentType='application/json',
        Body=request_bytes,
    )
    parsed = json.loads(response['Body'].read())
    return parsed['embedding'][0]

In [None]:
# Smoke test: embed one sample question (Chinese for "How many participants can
# a single AWS Clean Rooms collaboration have?"); the cell's value is the
# embedding returned by the endpoint.
get_bloomz_embedding(smr_client, '请问AWS Clean Rooms的一个协作中可以有多少个参与方?')

In [None]:
def calulate_cosine_between_sentence_pair2(smr_client, q_str, a_str):
    """
    Embed two texts with ``get_bloomz_embedding`` and return their cosine similarity.
    """
    # The list is built left to right: question first, then answer.
    embeddings = [get_bloomz_embedding(smr_client, text) for text in (q_str, a_str)]
    return calulate_cosine(*embeddings)

# For each question line in Cleanroom_FAQ.txt, print the chunk under
# CHUNK_DIR_PATH whose get_bloomz_embedding vector has the highest cosine
# similarity to the question.
CHUNK_DIR_PATH='./chunks'
for doc_name, doc in tqdm(doc_iterator(DOC_DIR_PATH)):
    if doc_name != "Cleanroom_FAQ.txt":
        continue
    lines = doc.splitlines()
    q_lines = [ line for line in lines if line.startswith('Question') ]
    a_lines = [ line for line in lines if line.startswith('Answer') ]
    # Read and embed every chunk once. Doing this inside the question loop
    # re-reads the chunk directory and re-embeds both the question and the chunk
    # for every (question, chunk) pair. A separate loop variable is used so the
    # outer ``doc_name`` is not rebound.
    chunk_embeddings = [
        (a_doc, get_bloomz_embedding(smr_client, a_doc))
        for chunk_name, a_doc in tqdm(doc_iterator(CHUNK_DIR_PATH))
    ]
    for q_line in q_lines:
        q_vec = get_bloomz_embedding(smr_client, q_line)
        # Starting from 0.0 means only strictly positive similarities can win;
        # with no positive match the empty string is printed.
        max_cos = 0.0
        max_a_doc = ""
        for a_doc, a_vec in chunk_embeddings:
            cos_val = calulate_cosine(q_vec, a_vec)
            if cos_val > max_cos:
                max_cos = cos_val
                max_a_doc = a_doc
        print(f'{max_cos} | {q_line} | {max_a_doc}')