In [None]:
# Enable the autoreload extension in mode 2 so that edits to imported modules
# are picked up live, without restarting the kernel.
%load_ext autoreload 
%autoreload 2

In [None]:
!pip install python_docx

In [None]:
#Sagemaker Endpoint Deploy
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# Execution role of this notebook, handed to the model below.
role = sagemaker.get_execution_role()

# Hugging Face Hub settings, passed to the model as environment variables.
# Model catalogue: https://huggingface.co/models
hub = dict(
    HF_MODEL_ID='shibing624/text2vec-base-chinese',
    HF_TASK='feature-extraction',
)

# Model definition: framework versions plus the Hub settings above.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    transformers_version='4.17.0',
    pytorch_version='1.10.2',
    py_version='py38',
)

# Deploy the model behind a named SageMaker inference endpoint.
predictor = huggingface_model.deploy(
    initial_instance_count=1,       # number of instances
    instance_type='ml.p3.2xlarge',  # alternative tried earlier: 'ml.m5.xlarge'
    endpoint_name='huggingface-inference-text2vec-base-chinese-v1',
)

In [None]:
# Inference testing
import time

# Predictor bound to the endpoint by its name.
hfp = sagemaker.huggingface.model.HuggingFacePredictor('huggingface-inference-text2vec-base-chinese-v1')

# Rough latency check: send the same 200-character text ten times and print
# the total elapsed wall-clock seconds. The responses themselves are discarded.
t0 = time.time()
for i in range(10):
    hfp.predict({'inputs': '打印' * 100})[0][0][0]
print(time.time() - t0)

In [None]:
#Preprocess Data
import os
import docx
import pandas as pd
from docx import Document
import sagemaker
from sagemaker.huggingface import HuggingFaceModel
import json
import boto3
import requests

def is_all_black(s):
    """Return True when `s` is empty or consists only of whitespace.

    The name is kept for existing callers; it means "blank". Whitespace is
    whatever `str.strip()` removes, so tabs, newlines and the full-width
    space (U+3000) count as blank too, not only the ASCII space.

    Args:
        s: the string to test.

    Returns:
        bool: True for blank input, False as soon as there is any other text.
    """
    return not s.strip()

def read_doc(path):
    """Split a .docx document into sentences, one DataFrame row per sentence.

    For every paragraph, '. ' is replaced by '。' and the text is then split
    on '。'. Each piece that is neither empty nor blank becomes a row.

    Args:
        path: location of the document; passed to `Document` and `get_title`.

    Returns:
        pandas.DataFrame with the columns, in this order:
            title         title derived from the file name (same on every row)
            paragraph     original, unmodified text of the paragraph
            sentence      one piece of the split paragraph
            paragraph_id  index of the paragraph within the document
            sentence_id   index of the piece within the split paragraph
                          (skipped blank pieces still consume an index)
        The frame has zero rows when the document contains no usable text.
    """
    title = get_title(path)
    rows = []

    # Read `.paragraphs` exactly once: python-docx rebuilds the paragraph list
    # on every access, so indexing it inside the loop is quadratic.
    for i, paragraph in enumerate(Document(path).paragraphs):
        raw_text = paragraph.text
        pieces = raw_text.replace('. ', '。').split('。')
        for j, piece in enumerate(pieces):
            if piece != '' and not is_all_black(piece):
                rows.append((title, raw_text, piece, i, j))

    return pd.DataFrame(
        rows,
        columns=['title', 'paragraph', 'sentence', 'paragraph_id', 'sentence_id'],
    )

def get_title(path):
    """Derive a document title from its file name.

    The directory and the extension are dropped, and the full-width dash
    '——' is treated like '-'. If the remaining stem contains a dash, the
    title is the text between the first and the second dash
    (e.g. '001-Guide-v2.docx' -> 'Guide'); otherwise it is the whole stem.

    Args:
        path: path of the document file.

    Returns:
        str: the extracted title.
    """
    stem = os.path.split(os.path.splitext(path)[0])[1]
    parts = stem.replace('——', '-').split('-')
    # No dash in the name: fall back to the stem itself. This replaces the
    # former bare `except:`, which existed only to catch this IndexError.
    return parts[1] if len(parts) > 1 else stem

In [None]:
# Predictor bound to the embedding endpoint by name; get_vector() below reads
# this module-level variable to send its requests.
hfp = sagemaker.huggingface.model.HuggingFacePredictor('huggingface-inference-text2vec-base-chinese-v1')

def get_vector(q):
    """Return the embedding of text `q` from the SageMaker endpoint.

    Texts longer than 400 characters are not sent to the endpoint; a
    placeholder list of 768 values, all -1000, is returned for them instead.
    For shorter texts the result is element [0][0][0] of the response that
    the module-level predictor `hfp` returns for the one-item batch `[q]`.
    """
    too_long = len(q) > 400
    if not too_long:
        response = hfp.predict({'inputs': [q]})
        return response[0][0][0]
    return [-1000] * 768

def embbeding(df):
    """Add stringified embedding vectors for the title and every sentence.

    `df` is modified in place and also returned. Two columns are added (or
    overwritten): 'title_vector' and 'sentence_vector', each holding
    `str(get_vector(...))`.

    Args:
        df: DataFrame whose column 0 is the title and column 2 the sentence
            (the layout produced by `read_doc`). The title is embedded once,
            from the first row, and that vector is reused for every row.

    Returns:
        The same DataFrame. An empty frame just gets the two (empty) columns;
        previously it raised IndexError when reading the first row.
    """
    df['title_vector'] = ''
    df['sentence_vector'] = ''

    total = len(df)
    if total == 0:
        # Nothing to embed; reading row 0 would fail.
        return df

    # Write by column name rather than by hard-coded positions 5 and 6, so
    # the result stays correct even if `df` carries extra columns.
    df['title_vector'] = str(get_vector(df.iloc[0, 0]))

    sentence_vectors = []
    for i in range(total):
        sentence_vectors.append(str(get_vector(df.iloc[i, 2])))
        # i + 1 so that the counter ends at "total out of total".
        print('\r embbeding %i out of %i finished'%(i + 1, total), end='')
    df['sentence_vector'] = sentence_vectors
    return df

### Input your customized index_name

In [None]:
#import data to OpenSearch
import boto3
import requests
import json
import os
# Connection settings for OpenSearch, read from AWS Secrets Manager.
sm_client = boto3.client('secretsmanager')
# The 'opensearch-host-url' secret is parsed as JSON and its 'host' key is used.
host_url = sm_client.get_secret_value(SecretId='opensearch-host-url')['SecretString']
urldata= json.loads(host_url)
host = urldata.get('host') # cluster endpoint, for example: my-test-domain.us-east-1.es.amazonaws.com/
region = os.getenv('AWS_REGION', '') # e.g. cn-north-1; empty string when the variable is unset
index_name = "docs"     # name of the target index; change it to your own, the default is 'docs'
# The 'opensearch-master-user' secret is parsed as JSON with 'username' and 'password' keys.
master_user = sm_client.get_secret_value(SecretId='opensearch-master-user')['SecretString']
data= json.loads(master_user)
username = data.get('username')
password = data.get('password')

# NOTE(review): `region`, `service` and `credentials` are not used anywhere in
# this cell; presumably left over from request signing - confirm before removing.
service = 'es'
credentials = boto3.Session().get_credentials()


# Username/password pair passed as `auth=` to requests in import_data().
awsauth = (username, password)


# `host` is concatenated as-is, so it has to end with '/' already.
url = host+'_bulk'

headers = { "Content-Type": "application/json" }

def import_data(df, id_start=0, before_import=0):
    """Bulk-index the rows of `df` into OpenSearch.

    Builds one `_bulk` request body (an "index" action line followed by a
    document line for every row) and POSTs it to the module-level `url`,
    using the module-level `awsauth`, `headers` and `index_name`.

    Args:
        df: rows to index. Columns are read by position: 0 title,
            1 paragraph, 2 sentence, 3 paragraph_id, 4 sentence_id,
            5 title_vector, 6 sentence_vector. The two vector columns must
            hold JSON text, which is parsed back into lists.
        id_start: offset of the first row of `df` within its document.
        before_import: number of rows already indexed from earlier documents.
            Row k of `df` gets `_id` = str(k + id_start + before_import).

    Returns:
        The `requests.Response` of the bulk call, or None when `df` is empty
        (no request is sent, because an empty bulk body is not a valid request).

    Raises:
        requests.HTTPError: the server answered with an error status.
        RuntimeError: the bulk response reports that some items failed.
    """
    if len(df) == 0:
        return None

    # Collect the lines and join once instead of growing a string in the loop.
    lines = []
    for offset in range(len(df)):
        doc_id = str(offset + id_start + before_import)
        action = {"index": {"_index": index_name, "_id": doc_id}}
        document = {"title": str(df.iloc[offset, 0]),
                    "paragraph": str(df.iloc[offset, 1]),
                    "sentence": str(df.iloc[offset, 2]),
                    "paragraph_id": str(df.iloc[offset, 3]),
                    "sentence_id": str(df.iloc[offset, 4]),
                    "title_vector": json.loads(df.iloc[offset, 5]),
                    "sentence_vector": json.loads(df.iloc[offset, 6])}
        lines.append(json.dumps(action, ensure_ascii=False))
        lines.append(json.dumps(document, ensure_ascii=False))
    # Every line, including the last one, must end with a newline.
    payloads = "\n".join(lines) + "\n"

    r = requests.post(url, auth=awsauth, headers=headers, data=payloads.encode())
    # Surface failures instead of silently dropping the batch.
    r.raise_for_status()
    if r.json().get('errors'):
        raise RuntimeError('OpenSearch bulk import reported item errors: ' + r.text[:500])
    return r

### Input your customized folder_path

In [None]:
#Preprocess Data and Import

hfp = sagemaker.huggingface.model.HuggingFacePredictor('huggingface-inference-text2vec-base-chinese-v1')

# Folder that holds the .docx files; must be set before running this cell.
folder_path = ''

# Rows per bulk request. Named batch_size rather than `slice` so the builtin
# `slice` is not shadowed for the rest of the notebook.
batch_size = 10

# Only .docx files can be parsed; skip everything else in the folder
# (sub-folders, checkpoints, Word's '~$' lock files).
names = [
    name for name in os.listdir(folder_path)
    if name.lower().endswith('.docx') and not name.startswith('~$')
]

# Running count of rows already indexed, so `_id` values stay unique across files.
before_import = 0
for j, name in enumerate(names):
    df = read_doc(os.path.join(folder_path, name))
    if len(df) == 0:
        # A document without text has nothing to embed or index.
        print(' file %i out of %i skipped (no text)'%(j + 1, len(names)))
        continue
    df = embbeding(df)

    # Ceiling division: no empty trailing batch when len(df) is a multiple
    # of batch_size.
    n_batches = (len(df) + batch_size - 1) // batch_size
    for i in range(n_batches):
        import_data(df[batch_size*i:batch_size*(i+1)], batch_size*i, before_import)
        print('\r import %i out of %i finished'%(i + 1, n_batches), end='')
    before_import += len(df)
    print(' file %i out of %i finished'%(j + 1, len(names)))
    

In [None]:
# Create the DynamoDB table
client = boto3.client('dynamodb', region_name='us-west-2')

try:
    resp = client.create_table(
        TableName="FeedbackRecordsSEWCFAQ",
        # Composite primary key: SearchInputs (HASH) plus _id (RANGE).
        KeySchema=[
            {"AttributeName": "SearchInputs", "KeyType": "HASH"},
            {"AttributeName": "_id", "KeyType": "RANGE"},
        ],
        # Every attribute named in KeySchema must be declared here; both are strings.
        AttributeDefinitions=[
            {"AttributeName": "SearchInputs", "AttributeType": "S"},
            {"AttributeName": "_id", "AttributeType": "S"},
        ],
        # Read and write capacity are provisioned independently.
        ProvisionedThroughput={"ReadCapacityUnits": 50, "WriteCapacityUnits": 50},
    )
    print("Table created successfully!")
except Exception as e:
    # Report the failure and let the notebook continue.
    print("Error creating table:")
    print(e)