In [1]:
!pip install sagemaker==1.72.0

Collecting sagemaker==1.72.0
  Downloading sagemaker-1.72.0.tar.gz (297 kB)
[K     |████████████████████████████████| 297 kB 3.2 MB/s eta 0:00:01
Collecting smdebug-rulesconfig==0.1.4
  Downloading smdebug_rulesconfig-0.1.4-py2.py3-none-any.whl (10 kB)
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-1.72.0-py2.py3-none-any.whl size=386358 sha256=1358dcfb968ba0e981d5a076a9cd027e94666c3ff26dfaa3adf6609a83f36aaf
  Stored in directory: /home/ec2-user/.cache/pip/wheels/c3/58/70/85faf4437568bfaa4c419937569ba1fe54d44c5db42406bbd7
Successfully built sagemaker
Installing collected packages: smdebug-rulesconfig, sagemaker
  Attempting uninstall: smdebug-rulesconfig
    Found existing installation: smdebug-rulesconfig 1.0.0
    Uninstalling smdebug-rulesconfig-1.0.0:
      Successfully uninstalled smdebug-rulesconfig-1.0.0
  Attempting uninstall: sagemaker
    Found existing install

##### Downloading the data

In [2]:
%mkdir data
!wget -O data/aclImdb_v1.tar.gz http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -zxf data/aclImdb_v1.tar.gz -C data

mkdir: cannot create directory ‘data’: File exists
--2021-01-05 05:24:12--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘data/aclImdb_v1.tar.gz’


2021-01-05 05:24:40 (2.86 MB/s) - ‘data/aclImdb_v1.tar.gz’ saved [84125825/84125825]



##### Preparing and Preprocessing the Data

In [3]:
import os
import glob

def read_imdb_data(data_dir='data/aclImdb'):
    """Read the raw IMDB reviews and their sentiment labels from disk.

    Files are looked up as ``<data_dir>/<train|test>/<pos|neg>/*.txt``.

    Args:
        data_dir: Root directory of the extracted dataset.

    Returns:
        A ``(data, labels)`` tuple of nested dicts keyed first by
        ``'train'``/``'test'`` and then by ``'pos'``/``'neg'``. ``data`` holds
        the review texts and ``labels`` the matching integers (1 for ``'pos'``,
        0 for ``'neg'``), in the same order. A directory that does not exist
        simply yields empty lists, because ``glob`` finds no files.
    """
    data = {}
    labels = {}

    for data_type in ['train', 'test']:
        data[data_type] = {}
        labels[data_type] = {}

        for sentiment in ['pos', 'neg']:
            data[data_type][sentiment] = []
            labels[data_type][sentiment] = []

            path = os.path.join(data_dir, data_type, sentiment, '*.txt')
            files = glob.glob(path)

            for f in files:
                # Decode explicitly as UTF-8 so the result does not depend on
                # the platform's default locale encoding.
                with open(f, encoding='utf-8') as review:
                    data[data_type][sentiment].append(review.read())
                    labels[data_type][sentiment].append(1 if sentiment == 'pos' else 0)

            assert len(data[data_type][sentiment]) == len(labels[data_type][sentiment]), \
                "{}/{} data size does not match labels size".format(data_type, sentiment)

    return data, labels

In [4]:
data, labels = read_imdb_data()
print('IMDB reviews: train => {} pos / {} neg, test => {} pos / {} neg'.format(
                     len(data['train']['pos']), len(data['train']['neg']),
                     len(data['test']['pos']), len(data['test']['neg'])))

IMDB reviews: train => 12500 pos / 12500 neg, test => 12500 pos / 12500 neg


In [5]:
from sklearn.utils import shuffle

def prepare_imdb_data(data, labels, random_state=None):
    """Prepare training and test sets from IMDB movie reviews.

    Args:
        data: Nested dict of review texts, indexed as ``data[split][sentiment]``
            with splits ``'train'``/``'test'`` and sentiments ``'pos'``/``'neg'``.
        labels: Nested dict of labels with the same layout as ``data``.
        random_state: Forwarded to ``sklearn.utils.shuffle`` for both shuffles.
            Pass an int for a reproducible ordering; the default ``None``
            keeps the previous behaviour (a different order on every call).

    Returns:
        ``(data_train, data_test, labels_train, labels_test)``. Each data list
        is shuffled together with its labels, so they stay aligned.
    """
    # Combine positive and negative reviews and labels.
    data_train = data['train']['pos'] + data['train']['neg']
    data_test = data['test']['pos'] + data['test']['neg']

    labels_train = labels['train']['pos'] + labels['train']['neg']
    labels_test = labels['test']['pos'] + labels['test']['neg']

    # Shuffle reviews and labels in unison.
    data_train, labels_train = shuffle(data_train, labels_train, random_state=random_state)
    data_test, labels_test = shuffle(data_test, labels_test, random_state=random_state)

    return data_train, data_test, labels_train, labels_test

In [6]:
train_X, test_X, train_y, test_y = prepare_imdb_data(data, labels)
print('IMDB reviews (combined): train = {}, test = {}'.format(len(train_X), len(test_X)))

IMDB reviews (combined): train = 25000, test = 25000


In [7]:
# Drop the references to the raw review dicts; the combined, shuffled data now
# lives in train_X / test_X / train_y / test_y.
data, labels = None, None

In [8]:
train_X[100], train_y[100]

("A Nightmare on Elm Street: The Dream Child, the fifth installment in the Nightmare on Elm Street series and the worst sequel ever in the series, even worse than A Nightmare on Elm Street 2. I was lucky enough to get the Nightmare on Elm Street DVD box set for my birthday and I watched all the sequels. The dream child was the worst without a doubt, I was surprised too since they were doing so well with the last two sequels. But I guess they just lost the charm, the story was just ridicules and I wasn't happy with where it went. Alice just became more annoying, she's not Nancy or Kirsten, so her carrying this film on her own didn't work for me. Freddy is also loosing his scare, this was just getting a bit silly.<br /><br />Alice is back and she's carrying a child, she couldn't be happier with her life. But Freddy is also back and he's not going to be too light on her since she defeated him so easily in the fourth movie. But anyways, he wants her child and to be born into the world agai

##### Processing the data

In [9]:
import re

# Raw strings keep the backslashes for the regex engine and avoid Python's
# "invalid escape sequence" warning; the patterns match exactly as before.
# Punctuation that is deleted outright.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
# Paired HTML line breaks, hyphens and slashes, each replaced by a single space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

def review_to_words(review):
    """Normalise a raw review into a lower-cased, punctuation-free string.

    The text is lower-cased, the characters matched by ``REPLACE_NO_SPACE`` are
    removed, then every match of ``REPLACE_WITH_SPACE`` becomes one space.

    Note: only a *pair* of ``<br />`` tags is matched as a line break. A lone
    ``<br />`` is not removed; only its ``/`` is turned into a space.

    Args:
        review: The review text.

    Returns:
        The cleaned text as a single string (not a list of tokens).
    """
    words = REPLACE_NO_SPACE.sub("", review.lower())
    words = REPLACE_WITH_SPACE.sub(" ", words)
    return words

In [10]:
[review_to_words(train_X[100])]

['a nightmare on elm street the dream child the fifth installment in the nightmare on elm street series and the worst sequel ever in the series even worse than a nightmare on elm street 2 i was lucky enough to get the nightmare on elm street dvd box set for my birthday and i watched all the sequels the dream child was the worst without a doubt i was surprised too since they were doing so well with the last two sequels but i guess they just lost the charm the story was just ridicules and i wasnt happy with where it went alice just became more annoying shes not nancy or kirsten so her carrying this film on her own didnt work for me freddy is also loosing his scare this was just getting a bit silly alice is back and shes carrying a child she couldnt be happier with her life but freddy is also back and hes not going to be too light on her since she defeated him so easily in the fourth movie but anyways he wants her child and to be born into the world again did you ever wonder if freddy had

In [11]:
import pickle

cache_dir = os.path.join('cache', 'sentiment_web_app')
os.makedirs(cache_dir, exist_ok=True)

def preprocess_data(data_train, data_test, labels_train, labels_test,
                    cache_dir=cache_dir, cache_file='preprocessed_data.pkl'):
    """Convert each review to words; read from cache if available.

    Args:
        data_train: Raw training reviews.
        data_test: Raw test reviews.
        labels_train: Labels aligned with ``data_train``.
        labels_test: Labels aligned with ``data_test``.
        cache_dir: Directory holding the cache file.
        cache_file: Cache file name, or ``None`` to disable caching.

    Returns:
        ``(words_train, words_test, labels_train, labels_test)``.

        On a cache hit, all four values come from the cache and the arguments
        are ignored. Delete the cache file after changing or reshuffling the
        input data.
    """
    # If caching is enabled, try to read the cache first.
    cache_data = None
    if cache_file is not None:
        try:
            # NOTE(security): pickle.load can execute arbitrary code; only load
            # cache files this notebook wrote itself.
            with open(os.path.join(cache_dir, cache_file), 'rb') as f:
                cache_data = pickle.load(f)
            print("Read preprocessed data from cache file: ", cache_file)
        except FileNotFoundError:
            # No cache yet: expected on the first run.
            pass
        except Exception as err:
            # Stay best-effort (recompute), but report an unreadable cache
            # instead of hiding it. Ctrl-C is no longer swallowed.
            cache_data = None
            print("Could not read cache file {} ({!r}); recomputing.".format(cache_file, err))

    if cache_data is None:
        # Cache miss: preprocess training and test data.
        words_train = [review_to_words(review) for review in data_train]
        words_test = [review_to_words(review) for review in data_test]

        # Write the result to the cache for future runs.
        if cache_file is not None:
            cache_data = dict(words_train=words_train, words_test=words_test,
                              labels_train=labels_train, labels_test=labels_test)
            with open(os.path.join(cache_dir, cache_file), 'wb') as f:
                pickle.dump(cache_data, f)
            print('Wrote preprocessed data to cached file: ', cache_file)
    else:
        # Unpack data loaded from the cache file.
        words_train, words_test, labels_train, labels_test = (cache_data['words_train'],
                                 cache_data['words_test'], cache_data['labels_train'], cache_data['labels_test'])

    return words_train, words_test, labels_train, labels_test

In [12]:
# preprocess the data
train_X, test_X, train_y, test_y = preprocess_data(train_X, test_X, train_y, test_y)

Read preprocessed data from cache file:  preprocessed_data.pkl


In [13]:
train_X[100] == review_to_words(train_X[100])

True

#### Extract the Bag-of-Words features.

In [14]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

import joblib # enhanced version of pickle that is more effiecient for storing Numpy arrays.

def extract_BoW_features(words_train, words_test, vocabulary_size=5000,
                         cache_dir=cache_dir, cache_file='bow_features.pkl'):
    """Extract Bag-of-Words features for documents already preprocessed into words.

    Args:
        words_train: Preprocessed training documents; the vectorizer is fitted
            on these only.
        words_test: Preprocessed test documents; transformed with the fitted
            vectorizer.
        vocabulary_size: Passed to ``CountVectorizer`` as ``max_features``.
        cache_dir: Directory holding the cache file.
        cache_file: Cache file name, or ``None`` to disable caching.

    Returns:
        ``(feature_train, feature_test, vocabulary)``: two dense count arrays
        and the vectorizer's ``vocabulary_`` mapping.

        On a cache hit, all three values come from the cache and the arguments
        are ignored. Delete the cache file after changing the input data.
    """
    # If caching is enabled, try to read the cache first.
    cache_data = None
    if cache_file is not None:
        try:
            # NOTE(security): joblib.load unpickles data and can execute
            # arbitrary code; only load cache files this notebook wrote itself.
            with open(os.path.join(cache_dir, cache_file), 'rb') as f:
                cache_data = joblib.load(f)
            print("Read features from cache file: ", cache_file)
        except FileNotFoundError:
            # No cache yet: expected on the first run.
            pass
        except Exception as err:
            # Stay best-effort (recompute), but report an unreadable cache
            # instead of hiding it. Ctrl-C is no longer swallowed.
            cache_data = None
            print("Could not read cache file {} ({!r}); recomputing.".format(cache_file, err))

    if cache_data is None:
        # Cache miss: fit on the training documents and transform both sets.
        vectorizer = CountVectorizer(max_features=vocabulary_size)

        feature_train = vectorizer.fit_transform(words_train).toarray()
        feature_test = vectorizer.transform(words_test).toarray()

        # Bind the vocabulary unconditionally. It used to be set only inside
        # the caching branch, so cache_file=None raised UnboundLocalError.
        vocabulary = vectorizer.vocabulary_

        # Write the result to the cache for future runs.
        if cache_file is not None:
            cache_data = dict(feature_train=feature_train,
                              feature_test=feature_test,
                              vocabulary=vocabulary)

            with open(os.path.join(cache_dir, cache_file), 'wb') as f:
                joblib.dump(cache_data, f)

            print('Wrote features & Vocabulary to cache file: ', cache_file)
    else:
        # Unpack data loaded from the cache file.
        feature_train, feature_test, vocabulary = (cache_data['feature_train'],
                                                   cache_data['feature_test'],
                                                   cache_data['vocabulary'])
    # Return both the extracted features and the vocabulary.
    return feature_train, feature_test, vocabulary

In [15]:
# Extract the Bag-of-Words for both training and test datasets.
train_X, test_X, vocabulary = extract_BoW_features(train_X, test_X)

Read features from cache file:  bow_features.pkl


In [16]:
a = np.where(train_X[100] == 1)

In [17]:
a

(array([  67,  311,  382,  403,  418,  742,  890, 1100, 1254, 1302, 1468,
        1538, 1690, 1783, 1848, 1852, 1864, 1901, 1931, 1964, 2043, 2052,
        2181, 2366, 2373, 2533, 2626, 2661, 2713, 2775, 2904, 2951, 3013,
        3043, 3106, 3141, 3154, 3298, 3761, 3798, 3809, 3811, 3861, 4420,
        4424, 4427, 4431, 4438, 4495, 4522, 4525, 4558, 4683, 4723, 4811,
        4862, 4868, 4871, 4888, 4918]),)

In [18]:
train_X[100][133], train_X[100][1912], train_X[100][4095], train_X[100][4991]

(0, 0, 0, 0)

##### Upload data to S3

In [19]:
import pandas as pd

# Hold out the first 10,000 training rows as a validation set; the remaining
# rows stay as training data. Features and labels are sliced at the same
# position so they remain aligned.
# NOTE: this cell rebinds train_X / train_y, so running it twice slices the
# already-reduced data again. Re-run the feature-extraction cell first.
val_X = pd.DataFrame(train_X[:10000])
train_X = pd.DataFrame(train_X[10000:])

val_y = pd.DataFrame(train_y[:10000])
train_y = pd.DataFrame(train_y[10000:])

In [20]:
len(val_X)

10000

In [21]:
# Make sure the directory for the CSV files exists. exist_ok=True makes the
# cell safe to re-run and avoids the check-then-create race.
data_dir = 'data/sentiment_web_app'
os.makedirs(data_dir, exist_ok=True)

In [22]:
pd.DataFrame(test_X).to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

In [23]:
pd.concat([val_y, val_X], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)

In [24]:
pd.concat([train_y, train_X], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [25]:
# The feature matrices and the train/validation labels have been written to
# CSV above, so drop the in-memory references. test_y is not cleared: it is
# used later to score the predictions.
test_X = train_X = val_X = train_y = val_y = None

Uploading the test, validation and training CSV files to S3

In [26]:
import sagemaker

# Session object used for the uploads here and for the estimator later on.
session = sagemaker.Session()

# S3 key prefix under which this notebook's files are uploaded.
prefix = 'sentiment-web-app'

# Upload the three CSV files. The returned locations are kept because the
# training job (train_loc, val_loc) and the batch transform (test_loc) take
# them as input further down.
test_loc = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_loc = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_loc = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

##### Creating the XGBoost model

In [27]:
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

role = get_execution_role()
container = get_image_uri(session.boto_region_name, 'xgboost')
op_path = 's3://{}/{}/output'.format(session.default_bucket(), prefix)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').


In [28]:
#  create the Sagemaker estimator object
xgb = sagemaker.estimator.Estimator(container,
                                    role,
                                    train_instance_count=1,
                                    train_instance_type='ml.m4.xlarge',
                                    output_path=op_path,
                                    sagemaker_session=session)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [29]:
# Set the algorithm-specific hyperparameters.
# objective='binary:logistic' fits the 0/1 sentiment labels stored in the first
# CSV column; the model then returns a score that later cells round to 0 or 1.
# num_round=500 is an upper bound: with early_stopping_rounds=10, training is
# expected to stop earlier once the validation metric stops improving
# (see the XGBoost parameter docs to confirm the exact rule).
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500)

Fit the XGBoost model.

In [30]:
s3_ip_train = sagemaker.s3_input(s3_data=train_loc, content_type='csv')
s3_ip_validation = sagemaker.s3_input(s3_data=val_loc, content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [31]:
xgb.fit({'train': s3_ip_train, 'validation': s3_ip_validation})

2021-01-05 05:26:50 Starting - Starting the training job...
2021-01-05 05:26:52 Starting - Launching requested ML instances......
2021-01-05 05:27:55 Starting - Preparing the instances for training...
2021-01-05 05:28:51 Downloading - Downloading input data......
2021-01-05 05:29:42 Training - Training image download completed. Training in progress.[34mArguments: train[0m
[34m[2021-01-05:05:29:43:INFO] Running standalone xgboost training.[0m
[34m[2021-01-05:05:29:43:INFO] File size need to be processed in the node: 238.5mb. Available memory size in the node: 8436.83mb[0m
[34m[2021-01-05:05:29:43:INFO] Determined delimiter of CSV input is ','[0m
[34m[05:29:43] S3DistributionType set as FullyReplicated[0m
[34m[05:29:45] 15000x5000 matrix with 75000000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-01-05:05:29:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[05:29:45] S3DistributionType set as FullyReplicated[0m
[3

[34m[05:30:46] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 16 pruned nodes, max_depth=5[0m
[34m[43]#011train-error:0.1456#011validation-error:0.1819[0m
[34m[05:30:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 32 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[44]#011train-error:0.144867#011validation-error:0.1808[0m
[34m[05:30:49] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[45]#011train-error:0.143933#011validation-error:0.18[0m
[34m[05:30:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 36 extra nodes, 6 pruned nodes, max_depth=5[0m
[34m[46]#011train-error:0.142333#011validation-error:0.1794[0m
[34m[05:30:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 18 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[47]#011train-error:0.141467#011validation-error:0.1796[0m
[34m[05:30:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 

[34m[05:31:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[91]#011train-error:0.1068#011validation-error:0.1581[0m
[34m[05:31:50] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 4 pruned nodes, max_depth=5[0m
[34m[92]#011train-error:0.105667#011validation-error:0.1574[0m
[34m[05:31:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 16 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[93]#011train-error:0.105667#011validation-error:0.157[0m
[34m[05:31:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 22 extra nodes, 4 pruned nodes, max_depth=5[0m
[34m[94]#011train-error:0.1056#011validation-error:0.1558[0m
[34m[05:31:54] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 24 extra nodes, 14 pruned nodes, max_depth=5[0m
[34m[95]#011train-error:0.104467#011validation-error:0.1554[0m
[34m[05:31:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 1

[34m[05:32:51] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 6 pruned nodes, max_depth=5[0m
[34m[139]#011train-error:0.0866#011validation-error:0.1443[0m
[34m[05:32:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 4 pruned nodes, max_depth=5[0m
[34m[140]#011train-error:0.086467#011validation-error:0.1443[0m
[34m[05:32:53] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 18 pruned nodes, max_depth=5[0m
[34m[141]#011train-error:0.085067#011validation-error:0.1443[0m
[34m[05:32:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 8 pruned nodes, max_depth=5[0m
[34m[142]#011train-error:0.0838#011validation-error:0.1454[0m
[34m[05:32:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 26 extra nodes, 4 pruned nodes, max_depth=5[0m
[34m[143]#011train-error:0.083#011validation-error:0.1446[0m
[34m[05:32:57] src/tree/updater_prune.cc:74: tree pruning end, 1 roots

###### Testing the model.

In [32]:
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [33]:
xgb_transformer.transform(test_loc, content_type='text/csv', split_type='Line')

In [34]:
xgb_transformer.wait()

...............................[32m2021-01-05T05:38:27.852:[sagemaker logs]: MaxConcurrentTransforms=4, MaxPayloadInMB=6, BatchStrategy=MULTI_RECORD[0m
[34mArguments: serve[0m
[34m[2021-01-05 05:38:27 +0000] [1] [INFO] Starting gunicorn 19.7.1[0m
[34m[2021-01-05 05:38:27 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-01-05 05:38:27 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-01-05 05:38:27 +0000] [36] [INFO] Booting worker with pid: 36[0m
[34m[2021-01-05 05:38:27 +0000] [37] [INFO] Booting worker with pid: 37[0m
[35mArguments: serve[0m
[35m[2021-01-05 05:38:27 +0000] [1] [INFO] Starting gunicorn 19.7.1[0m
[35m[2021-01-05 05:38:27 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2021-01-05 05:38:27 +0000] [1] [INFO] Using worker: gevent[0m
[35m[2021-01-05 05:38:27 +0000] [36] [INFO] Booting worker with pid: 36[0m
[35m[2021-01-05 05:38:27 +0000] [37] [INFO] Booting worker with pid: 37[0m
[34m[2021-01-05:05:38:27:INFO

[34m[2021-01-05:05:38:45:INFO] Sniff delimiter as ','[0m
[34m[2021-01-05:05:38:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-01-05:05:38:45:INFO] Sniff delimiter as ','[0m
[34m[2021-01-05:05:38:45:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-01-05:05:38:45:INFO] Sniff delimiter as ','[0m
[35m[2021-01-05:05:38:45:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-01-05:05:38:45:INFO] Sniff delimiter as ','[0m
[35m[2021-01-05:05:38:45:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-01-05:05:38:47:INFO] Sniff delimiter as ','[0m
[34m[2021-01-05:05:38:47:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-01-05:05:38:47:INFO] Sniff delimiter as ','[0m
[34m[2021-01-05:05:38:47:INFO] Determined delimiter of CSV input is ','[0m
[34m[2021-01-05:05:38:47:INFO] Sniff delimiter as ','[0m
[34m[2021-01-05:05:38:47:INFO] Determined delimiter of CSV input is ','[0m
[35m[2021-01-05:05:38:47:INFO] Sniff delimiter 

In [35]:
# copy the file to data directory
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

Completed 256.0 KiB/369.5 KiB (2.7 MiB/s) with 1 file(s) remainingCompleted 369.5 KiB/369.5 KiB (3.8 MiB/s) with 1 file(s) remainingdownload: s3://sagemaker-ap-south-1-878116372356/xgboost-2021-01-05-05-33-34-681/test.csv.out to data/sentiment_web_app/test.csv.out


In [36]:
# Load the batch-transform output copied from S3 (no header row) and round
# each value to the nearest integer so it can be compared with the 0/1 labels
# in test_y. The values are presumably probabilities in [0, 1]; confirm.
prediction = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
prediction = [round(num) for num in prediction.squeeze().values]

In [37]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, prediction)

0.85188

##### Deploying the model.

In [38]:
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type = 'ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Using already existing model: xgboost-2021-01-05-05-26-50-317


-------------!

Test the model again, this time through the deployed endpoint.

In [39]:
from sagemaker.predictor import csv_serializer

# We need to tell the endpoint what format the data we are sending is in so that SageMaker can perform the serialization.
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

In [40]:
def predict(data, rows=512):
    """Score `data` against the deployed endpoint in chunks.

    The input is split along its first axis into ``int(len / rows + 1)`` chunks
    with ``np.array_split``. Each chunk is sent to ``xgb_predictor``, and the
    comma-separated text responses are joined and parsed into one array.

    Args:
        data: Array of feature rows (must expose ``.shape``).
        rows: Approximate maximum number of rows sent per request.

    Returns:
        A 1-D numpy array with the parsed predictions, in input order.
    """
    chunk_count = int(data.shape[0] / float(rows) + 1)
    responses = []
    for chunk in np.array_split(data, chunk_count):
        responses.append(xgb_predictor.predict(chunk).decode('utf-8'))

    return np.fromstring(','.join(responses), sep=',')

In [41]:
test_X = pd.read_csv(os.path.join(data_dir, 'test.csv'), header=None).values

predictions = predict(test_X)
predictions = [round(num) for num in predictions]

In [42]:
from sklearn.metrics import accuracy_score
accuracy_score(test_y, predictions)

0.85188

Cleaning Up

In [67]:
xgb_predictor.delete_endpoint()

###### Putting our model to work.

In [70]:
print(str(vocabulary))



Processing a single review

In [44]:
test_review = "Nothing but a disgusting materialistic pageant of glistening abed remote control greed zombies, totally devoid of any heart or heat. A romantic comedy that has zero romantic chemestry and zero laughs!"

In [45]:
test_words = review_to_words(test_review)
print(test_words)

nothing but a disgusting materialistic pageant of glistening abed remote control greed zombies totally devoid of any heart or heat a romantic comedy that has zero romantic chemestry and zero laughs


In [46]:
def bow_encoding(words, vocabulary):
    """Encode a preprocessed review as a bag-of-words count vector.

    Args:
        words: Whitespace-separated words.
        vocabulary: Mapping from word to its position in the output vector.

    Returns:
        A list of ``len(vocabulary)`` integers. Entry ``vocabulary[w]`` counts
        how often ``w`` occurs in ``words``; unknown words are ignored.
    """
    counts = [0] * len(vocabulary)  # one slot per vocabulary word, all zero
    known_tokens = (token for token in words.split() if token in vocabulary)
    for token in known_tokens:
        counts[vocabulary[token]] += 1
    return counts

In [48]:
test_bow = bow_encoding(test_words, vocabulary)
# print(test_bow)
len(test_bow)

5000

In [71]:
# 
xgb_predictor = xgb.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Using already existing model: xgboost-2021-01-05-05-26-50-317


-------------!

In [72]:
import boto3
runtime = boto3.Session().client('sagemaker-runtime')

In [73]:
xgb_predictor.endpoint

'xgboost-2021-01-05-05-26-50-317'

In [74]:
response = runtime.invoke_endpoint(EndpointName = xgb_predictor.endpoint, # The name of the endpoint we created
                                   ContentType = 'text/csv',                     # The data format that is expected
                                   Body = ','.join([str(val) for val in test_bow]).encode('utf-8'))

In [75]:
print(response)

{'ResponseMetadata': {'RequestId': 'fcd89aed-d691-4696-8728-96ddb8f35c73', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': 'fcd89aed-d691-4696-8728-96ddb8f35c73', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Tue, 5 Jan 2021 06:40:56 GMT', 'content-type': 'text/csv; charset=utf-8', 'content-length': '14'}, 'RetryAttempts': 0}, 'ContentType': 'text/csv; charset=utf-8', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7f2e636c9c88>}


In [76]:
print(response['Body'].read().decode('utf-8'))

0.505412817001


#### Setting up a Lambda function
<br> a: Create an IAM role for the Lambda function
<br> b: Create the Lambda function
<br> c: Set up API Gateway
<br> d: Deploy our web app with the API Gateway URL.

In [None]:
# We need to use the low-level library to interact with SageMaker since the SageMaker API
# is not available natively through Lambda.
import boto3

# And we need the regular expression library to do some of the data processing
import re

# Raw strings keep the backslashes for the regex engine and avoid Python's
# "invalid escape sequence" warning; the patterns match exactly as before.
# Punctuation that is deleted outright.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])")
# Paired HTML line breaks, hyphens and slashes, each replaced by a single space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")

def review_to_words(review):
    """Normalise a raw review into a lower-cased, punctuation-free string.

    The text is lower-cased, the characters matched by ``REPLACE_NO_SPACE`` are
    removed, then every match of ``REPLACE_WITH_SPACE`` becomes one space.
    This must stay identical to the preprocessing used when the model was
    trained, otherwise the endpoint receives differently encoded input.

    Args:
        review: The review text.

    Returns:
        The cleaned text as a single string (not a list of tokens).
    """
    words = REPLACE_NO_SPACE.sub("", review.lower())
    words = REPLACE_WITH_SPACE.sub(" ", words)
    return words

def bow_encoding(words, vocabulary):
    """Encode a preprocessed review as a bag-of-words count vector.

    Args:
        words: Whitespace-separated words.
        vocabulary: Mapping from word to its position in the output vector.

    Returns:
        A list of ``len(vocabulary)`` integers. Entry ``vocabulary[w]`` counts
        how often ``w`` occurs in ``words``; unknown words are ignored.
    """
    counts = [0] * len(vocabulary)  # one slot per vocabulary word, all zero
    known_tokens = (token for token in words.split() if token in vocabulary)
    for token in known_tokens:
        counts[vocabulary[token]] += 1
    return counts


def lambda_handler(event, context):
    """Lambda entry point: classify the review in ``event['body']``.

    The review is cleaned with ``review_to_words`` and encoded with
    ``bow_encoding``. The counts are sent as one CSV line to the SageMaker
    endpoint, and the returned score is rounded to an integer.

    Args:
        event: Invocation payload; the review text is read from ``event['body']``.
        context: Lambda context object (unused).

    Returns:
        A response dict with status code 200, plain-text/CORS headers, and the
        rounded prediction as a string in ``'body'``.
    """
    # Placeholder: replace with the vocabulary dict printed by the notebook.
    vocab = "*** ACTUAL VOCABULARY GOES HERE ***"

    # Clean the incoming review and turn it into word counts.
    bow = bow_encoding(review_to_words(event['body']), vocab)

    # The SageMaker runtime client is what lets us call the deployed endpoint.
    runtime = boto3.Session().client('sagemaker-runtime')

    # Serialise the counts as a single CSV row.
    payload = ','.join([str(val) for val in bow]).encode('utf-8')

    # Placeholder endpoint name: replace with the name of the deployed endpoint.
    response = runtime.invoke_endpoint(EndpointName='***ENDPOINT NAME HERE***',
                                       ContentType='text/csv',
                                       Body=payload)

    # The body of the HTTP response carries the inference result as text.
    score_text = response['Body'].read().decode('utf-8')

    # Round so the web app only ever receives '1' or '0'.
    label = round(float(score_text))

    reply_headers = {'Content-Type' : 'text/plain', 'Access-Control-Allow-Origin' : '*'}
    return {
        'statusCode' : 200,
        'headers' : reply_headers,
        'body' : str(label)
    }

In [77]:
#  name of the endpoint
xgb_predictor.endpoint

'xgboost-2021-01-05-05-26-50-317'

In [None]:
# the string of vocabulary for the lambda function.
print(str(vocabulary))

##### Delete the endpoint

In [78]:
xgb_predictor.delete_endpoint()

Optional/must: Clean Up.