In [46]:
# SageMaker endpoint deployment
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# Execution role of the current SageMaker session; passed to the model below.
role = sagemaker.get_execution_role()

# Environment for the inference container: the Hub model to load
# (see https://huggingface.co/models) and the pipeline task to serve.
hub = dict(
    HF_MODEL_ID='shibing624/text2vec-base-chinese',
    HF_TASK='feature-extraction',
)

# Describe the Hugging Face model with pinned framework versions.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    py_version='py38',
    pytorch_version='1.10.2',
    transformers_version='4.17.0',
)

# Deploy to a SageMaker inference endpoint. The preprocessing cell
# connects to the endpoint using this exact name.
predictor = huggingface_model.deploy(
    instance_type='ml.m5.xlarge',   # EC2 instance type
    initial_instance_count=1,       # number of instances
    endpoint_name='huggingface-inference-text2vec-base-chinese-v1',
)

-----!

In [2]:
#Preprocess Data
import pandas as pd
import sagemaker
from sagemaker.huggingface import HuggingFaceModel
import json
import boto3
import requests
import json

# Predictor bound to the embedding endpoint, looked up by its name.
hfp = sagemaker.huggingface.model.HuggingFacePredictor(
    'huggingface-inference-text2vec-base-chinese-v1'
)

def get_vector(q):
    """Send one text to the endpoint and return element [0][0][0] of the reply.

    Args:
        q: The text to embed; it is wrapped in a one-element ``inputs`` list.

    Returns:
        ``response[0][0][0]`` of the endpoint's reply.
        NOTE(review): presumably the first token's embedding for the single
        input -- confirm against the endpoint's actual response nesting.
    """
    request = {'inputs': [q]}
    response = hfp.predict(request)
    return response[0][0][0]

# Load the FAQ material and attach an embedding (stored as a string) to each row.
# NOTE(review): the path ends in .csv but is parsed with read_excel -- confirm the
# file really is an Excel workbook before switching readers.
path = 'material.csv'
df = pd.read_excel(path)
# Assumes the sheet has exactly two columns, so this new column lands at
# position 2, which the loop below writes to -- TODO confirm.
df['question_vector'] = ''
data = df['question'].tolist()
total = len(df)
for i in range(total):
    # Column 0 is the text sent for embedding; the vector is stringified so
    # it can be stored in the (string-typed) column 2.
    df.iloc[i, 2] = str(get_vector(df.iloc[i, 0]))
    # Report the number of completed rows (i + 1) so the last line reads
    # "N out of N finished" rather than stopping one short.
    print('\r%i out of %i finished'%(i + 1, total), end='')

2672 out of 2673 finished

In [41]:
# Import data into OpenSearch
import boto3
import requests
import json


# --- Connection settings: fill these in before running this cell ---
host = ''  # cluster endpoint, for example: my-test-domain.us-east-1.es.amazonaws.com/
region = 'us-east-1'  # e.g. us-west-1
index_name = "sewc_faq"
username = ""
password = ""

# Service identifier and session credentials. The bulk request in this cell
# authenticates with the username/password pair instead.
service = 'es'
credentials = boto3.Session().get_credentials()

# (username, password) tuple handed to requests as its `auth` argument.
awsauth = username, password

# Bulk endpoint. `host` is concatenated as-is, so it must already end with '/'.
url = f'{host}_bulk'

headers = dict([("Content-Type", "application/json")])

def import_data(df, id_start=0):
    """Bulk-index the rows of ``df`` into the OpenSearch index ``index_name``.

    Each row produces two newline-delimited JSON lines: an ``index`` action
    whose ``_id`` is ``id_start`` plus the row's position in ``df``, followed by
    the document itself. Columns are read by position: 0 = question,
    1 = answer, 2 = question vector stored as a JSON-parsable string.

    Args:
        df: DataFrame slice to upload.
        id_start: Document id assigned to the first row of ``df``.

    Returns:
        The parsed bulk response, or ``None`` when ``df`` has no rows
        (nothing is sent in that case).

    Raises:
        requests.HTTPError: If the bulk request returns an HTTP error status.
        RuntimeError: If the bulk response reports per-document failures.
    """
    row_count = len(df)
    if row_count == 0:
        # An empty bulk body is rejected by the server; skip the round trip.
        return None

    lines = []
    for offset in range(row_count):
        action = {"index": {"_index": index_name, "_id": str(id_start + offset)}}
        document = {
            "question": str(df.iloc[offset, 0]),
            "answer": str(df.iloc[offset, 1]),
            "question_vector": json.loads(df.iloc[offset, 2]),
        }
        lines.append(json.dumps(action, ensure_ascii=False))
        lines.append(json.dumps(document, ensure_ascii=False))
    # The bulk body is newline-delimited and must end with a newline.
    payloads = "\n".join(lines) + "\n"

    r = requests.post(url, auth=awsauth, headers=headers, data=payloads.encode())
    # Surface failures instead of silently dropping the response.
    r.raise_for_status()
    result = r.json()
    if result.get("errors"):
        failed = [
            item["index"]
            for item in result.get("items", [])
            if "error" in item.get("index", {})
        ]
        raise RuntimeError(
            "Bulk import reported %d failed document(s); first failures: %s"
            % (len(failed), failed[:3])
        )
    return result

# Rows sent per bulk request. Named `batch_size` rather than `slice` so the
# builtin `slice` is not shadowed for the rest of the notebook.
batch_size = 100
# Step through the frame by start offset; this never produces a trailing empty
# batch when the row count is an exact multiple of batch_size.
for start in range(0, len(df), batch_size):
    # The start offset doubles as the first document id of the batch.
    import_data(df[start:start + batch_size], start)
    print(start)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600


In [53]:
# Create the DynamoDB table
client = boto3.client('dynamodb', region_name='us-east-1')
table_name = "FeedbackRecordsSEWCFAQ"

try:
    resp = client.create_table(
        TableName=table_name,
        # Composite primary key: SearchInputs is the partition (HASH) key and
        # _id is the sort (RANGE) key.
        KeySchema=[
            {
                "AttributeName": "SearchInputs",
                "KeyType": "HASH"
            },
            {
                "AttributeName": "_id",
                "KeyType": "RANGE"
            }
        ],
        # Any attributes used in KeySchema or Indexes must be declared in AttributeDefinitions
        AttributeDefinitions=[
            {
                "AttributeName": "SearchInputs",
                "AttributeType": "S"
            },
            {
                "AttributeName": "_id",
                "AttributeType": "S"
            }
        ],
        # ProvisionedThroughput controls the amount of data you can read or write to DynamoDB per second.
        # You can control read and write capacity independently.
        ProvisionedThroughput={
            "ReadCapacityUnits": 50,
            "WriteCapacityUnits": 50
        }
    )
    # create_table returns while the table is still being created; wait until
    # it exists before reporting success.
    client.get_waiter('table_exists').wait(TableName=table_name)
    print("Table created successfully!")
except client.exceptions.ResourceInUseException:
    # Raised when the table already exists (or is mid-creation), e.g. when
    # this cell is re-run; treat it as a no-op rather than an error.
    print("Table already exists or is being created; skipping creation.")
except Exception as e:
    print("Error creating table:")
    print(e)

Table created successfully!
