In [1]:
import boto3
import re
from sagemaker import get_execution_role
from io import StringIO

# Look up the SageMaker execution role for this notebook; it is passed to the
# PyTorch estimator as `role=` in the Training section.
role = get_execution_role()

In [2]:
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import os
import sys
import time
import json
from IPython.display import display
from time import strftime, gmtime
import sagemaker
from sagemaker.pytorch import PyTorch

# Test locally

In [3]:
import pickle
from sentence_transformers import SentenceTransformer
from node import node

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Directory that holds the pickled artifacts read by the local test
# (`modules.pickle` and `tree.pkl`).
model_dir = './source_dir'

In [5]:
# Rebuild the sentence encoder from its pickled module list.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
modules_path = os.path.join(model_dir, 'modules.pickle')
with open(modules_path, 'rb') as modules_file:
    modules = pickle.load(modules_file)

vectorizer = SentenceTransformer(modules=modules)

In [6]:
# Restore the pickled search tree.
# NOTE(review): presumably unpickling relies on the `node` class imported
# earlier in the notebook -- confirm. Only load trusted pickle files.
tree_path = os.path.join(model_dir, 'tree.pkl')
with open(tree_path, 'rb') as tree_file:
    tree = pickle.load(tree_file)

In [7]:
# Encode one sample query sentence; `embeddings[0]` is used for the search below.
text = [
    'It still looks brand new too!! Really great!!',
]
embeddings = vectorizer.encode(text)

In [8]:
# Query the tree with the single query embedding, then store each hit under
# a 'result<i>' key in the order the search returned them.
hits = tree.binary_search(embeddings[0])
response = {}
for idx, hit in enumerate(hits):
    response[f'result{idx}'] = hit

In [9]:
# Print every hit beneath its result key.
for key, hit in response.items():
    print(f'- {key}', hit, sep='\n')

- result0
{'review_id': 'R14A9UA4963BHV', 'product_id': 'B00N762OVC', 'review_body': 'Took to Universal Studios and they work well at both parks for the Harry Potter Interactive areas.  These are the same ones you can buy at Universal Studios.', 'sentence': 'these are the same ones you can buy at universal studios'}
- result1
{'review_id': 'RSMVI4CNQFOQ', 'product_id': 'B00RJNM9Q4', 'review_body': 'Took to Universal Studios and they work well at both parks for the Harry Potter Interactive areas.  These are the same ones you can buy at Universal Studios.', 'sentence': 'these are the same ones you can buy at universal studios'}
- result2
{'review_id': 'R1LIZTMDTFHQWJ', 'product_id': 'B00XTRG5UA', 'review_body': 'It was exactly what I expected! Received it and its in mint condition and is brand new! Very happy!', 'sentence': 'received it and its in mint condition and is brand new'}


# Training

In [12]:
# Configure the SageMaker PyTorch training job.
estimator = PyTorch(
    role=role,
    # Code handed to the job: `entry_point.py` inside `source_dir`, with the
    # `node` directory listed as an extra dependency.
    source_dir='source_dir',
    entry_point='entry_point.py',
    dependencies=['node'],
    # Framework version and hardware: one ml.m4.xlarge instance.
    framework_version='1.3.1',
    train_instance_type='ml.m4.xlarge',
    train_instance_count=1,
)

In [13]:
# Run the training job. fit() is called without arguments, so no input
# channels are passed explicitly; the job's log is streamed into the cell output.
estimator.fit()

2020-01-11 09:59:44 Starting - Starting the training job...
2020-01-11 09:59:45 Starting - Launching requested ML instances.........
2020-01-11 10:01:15 Starting - Preparing the instances for training......
2020-01-11 10:02:23 Downloading - Downloading input data...
2020-01-11 10:02:58 Training - Downloading the training image..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2020-01-11 10:03:25,652 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2020-01-11 10:03:25,655 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-01-11 10:03:25,669 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2020-01-11 10:03:25,886 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m

2020-01-11 10:03:24 Training - Training image download completed. Training in p

# Inference

In [14]:
# Deploy the trained model on one ml.m4.xlarge instance and keep the
# returned predictor for the queries below.
predictor = estimator.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

---------------------------------------------------------------------------------------------------!

In [15]:
from sagemaker.predictor import json_serializer, json_deserializer

# Exchange JSON with the endpoint: requests go out through json_serializer
# with an 'application/json' content type, and responses are decoded by
# json_deserializer.
predictor.content_type = 'application/json'
predictor.serializer = json_serializer
predictor.deserializer = json_deserializer

In [23]:
# Send one query sentence to the endpoint.
# NOTE(review): `n_items` presumably caps the number of results returned --
# confirm against the inference handler in entry_point.py.
request = dict(query='it still looks brand new too', n_items=10)
response = predictor.predict(request)

In [24]:
# Print each returned item as "- field" / value line pairs, with a dashed
# rule before every item.
rule = '-' * 10
for item in response.values():
    print(rule)
    for field, content in item.items():
        print(f'- {field}', f'{content}', sep='\n')

----------
- review_id
R2JFFVNUQIFY7J
- product_id
B00DSIQS6G
- review_body
My son loves closing these; he's just learning how they open. The toy is very solid and stands up to his using his feet on it, standing on it, throwing it around, etc. I don't know how long he'll continue to play with it but for now it gets a lot of attention from my 1 year old.
- sentence
the toy is very solid and stands up to his using his feet on it, standing on it, throwing it around, etc
----------
- review_id
R2YJOKZYDR789Z
- product_id
B00FIX22YQ
- review_body
This is quality wooden product. My 2 years old love playing with them. At this stage, she likes to sort all the same color or same shape. really recommend.
- sentence
at this stage, she likes to sort all the same color or same shape
----------
- review_id
RJTE957HDA1SQ
- product_id
B00DJ49AHI
- review_body
Great dress.  Has been washed multiple times, and still remains intact. My niece absolutely adores it and I think she actually wore it on a dail

In [25]:
# Tear down the endpoint created by estimator.deploy() now that the demo
# queries are done.
predictor.delete_endpoint()