<b>Mount EFS to model dir.</b>

In [1]:
%%sh 

mkdir model

mkdir: cannot create directory ‘model’: File exists


In [2]:
%%sh

sudo mount -t nfs \
    -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 \
    fs-xxxxx.efs.ap-southeast-2.amazonaws.com:/ \
    ./model

In [3]:
#!sudo umount -l ./model
#!sudo mount -t efs fs-aeced997:/ ./model

In [4]:
%%sh
sudo chmod go+rw ./model

## Installing packages 

In [5]:
%%sh
pip install sentence-transformers
pip install elasticsearch
pip install requests_aws4auth
pip install kaggle

Process is terminated.


In [6]:
# %%sh
# wget https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/distilbert-base-nli-mean-tokens.zip
# unzip distilbert-base-nli-mean-tokens.zip -d distilbert

In [7]:
#from sentence_transformers import SentenceTransformer
#model = SentenceTransformer('/home/ec2-user/SageMaker/distilbert',device="cpu")

## Downstream task

The number of hidden units of Bert is either 768 or 1024, To change the dimension, you must add a dense layer after the pooling

In [None]:
from sentence_transformers import models, losses, SentenceTransformer

word_embedding_model = models.DistilBERT('distilbert-base-uncased')

pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                            pooling_mode_mean_tokens=True,
                            pooling_mode_cls_token=False,
                            pooling_mode_max_tokens=False)
# reduce dim from 768 to 256
dense_model = models.Dense(in_features=768, out_features=256)
transformer = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

In [None]:
sentences = ['Where is the edge of the Universe?']
sentence_embeddings = transformer.encode(sentences)

<b>Save sentence embedder to mounted EFS directory, hence rest api can use it for embedding.</b>

In [None]:
transformer.save("model/transformer-v1/")

In [None]:
local_transformer =  SentenceTransformer('/home/ec2-user/SageMaker/model/transformer-v1/')

In [None]:
_sentences = ['Where is the edge of the Universe?']
_sentence_embeddings = local_transformer.encode(_sentences)

In [None]:
assert _sentence_embeddings[0].tolist() == sentence_embeddings[0].tolist()

## Download datasets
Please be aware of the following requirements about ackonwledgment, copyright and availability, cited from the dataset description page.

<blockquote>Question Pairs Dataset on kaggle via @KaggleDatasets https://kaggle.com/quora/question-pairs-dataset?utm_medium=social&utm_campaign=kaggle-dataset-share&utm_source=twitter</blockquote>

In [None]:
%%sh
mkdir /home/ec2-user/.kaggle/
cp /home/ec2-user/SageMaker/kaggle.json /home/ec2-user/.kaggle/
ls /home/ec2-user/.kaggle/
chmod 600 /home/ec2-user/.kaggle/kaggle.json

In [None]:
import pandas as pd
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()
api.dataset_download_files("quora/question-pairs-dataset", path='quora_dataset', unzip=True)

## Preprocessing

In [None]:
import pandas as pd

pd.set_option('display.max_colwidth', -1)
df = pd.read_csv("quora_dataset/questions.csv", usecols=["qid1", "question1"], index_col=False)
df = df.sample(frac=1).reset_index(drop=True)
df_questions_imp = df[:3000]

<br>Creat an index with the index.knn setting and add one or more fields of the knn_vector data type.</b>

In [None]:
import boto3
from requests_aws4auth import AWS4Auth
from elasticsearch import Elasticsearch, RequestsHttpConnection

region = 'ap-southeast-2'
service = 'es'
ssm = boto3.client('ssm', region_name=region)
es_parameter = ssm.get_parameter(Name='/KNNSearch/ESUrl')
es_host = es_parameter['Parameter']['Value']
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key,
                   region, service, session_token=credentials.token)
es = Elasticsearch(
    hosts=[{'host': es_host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

knn_index = {
    "settings": {
        "index.knn": True
    },
    "mappings": {
        "properties": {
            "question_vector": {
                "type": "knn_vector",
                "dimension": 256
            }
        }
    }
}

es.indices.create(index="questions",body=knn_index,ignore=400)

In [None]:
#!curl -X DELETE "https://vpc-knn-search-infra-es-c6mx5v7cqowwbuus3h5ek2v5kq.ap-southeast-2.es.amazonaws.com/questions"

## Records Indexing
<b>Store the actual data or document.</b>

In [None]:
def es_import(df):
    for index, row in df.iterrows():
        vectors = local_transformer.encode([row["question1"]])
        print(row["question1"])
#         print(vectors[0].tolist())
        es.index(index='questions',
                 id=row["qid1"], 
                 body={"question_vector": vectors[0].tolist(), 
                       "question": row["question1"]})

In [None]:
es_import(df_questions_imp)

In [None]:
## Testing

<b>Testing knn search from rest api deployed on ECS.</b>

In [None]:
!curl -d 'question=How can you fight depression?' http://knn-s-Publi-C8MSNTB6EVFM-207238135.ap-southeast-2.elb.amazonaws.com/search

In [None]:
sentence_embeddings = local_transformer.encode(["Does the Universe Have an Edge?"])
es.search(index="questions",
                    body={
                        "size": 5,
                        "_source": {
                            "exclude": ["question_vector"]
                        },
                        "min_score": 0.2,
                        "query": {
                            "knn": {
                                "question_vector": {
                                    "vector": sentence_embeddings[0].tolist(),
                                    "k": 5
                                }
                            }
                        }
                    })

<b>Copy sentence embedder to S3 as backup.</b>

In [None]:
!aws s3 cp /home/ec2-user/SageMaker/model/transformer-v1/  s3://aiyi.fuzzysearch/transformer-model/ --recursive