In [7]:
import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role
role = get_execution_role()
prefix = 'sagemaker/DEMO-xgboost-dm'  # NOTE(review): not used in the visible cells; presumably an S3 key prefix - confirm

# Each region has its own XGBoost container image.
containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'}

# Region of the current boto3 session; this is None when no region is configured.
my_region = boto3.session.Session().region_name

# Fail with an actionable message instead of a bare KeyError (unknown region)
# or a TypeError from string concatenation (region is None).
if my_region not in containers:
    raise ValueError(
        "No XGBoost container is registered for region {!r}; supported regions: {}".format(
            my_region, ", ".join(sorted(containers))
        )
    )

print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the eu-west-1 region. You will use the 685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [8]:
import boto3
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle

In [12]:
# Resource handles for the two AWS services used by the cells below:
# S3 stores the serialised model, DynamoDB holds the review records.
s3 = boto3.resource(service_name="s3")

dynamodb = boto3.resource(service_name="dynamodb")

In [11]:
# Train an LDA topic model on the review texts stored in DynamoDB and upload
# the pickled model to S3.

# Get data from database.
# A single Scan call returns at most one page of results; DynamoDB signals that
# more data is available through "LastEvaluatedKey", so keep scanning until it
# is absent. Otherwise large tables are silently truncated.
table = dynamodb.Table("consumer_reviews_PS5")
items = []
scan_kwargs = {}
while True:
    response = table.scan(**scan_kwargs)
    items.extend(response.get("Items", []))
    last_key = response.get("LastEvaluatedKey")
    if not last_key:
        break
    scan_kwargs["ExclusiveStartKey"] = last_key

# An empty table would otherwise surface as an opaque KeyError on 'review_text'.
if not items:
    raise ValueError("DynamoDB table 'consumer_reviews_PS5' returned no items; nothing to train on.")

# Keep only text from reviews
data = pd.DataFrame(items)
col_titles = data['review_text'].copy()

# The vectorizer object will be used to transform text to vector form.
# Raw string: the pattern contains regex escapes (\w, \$, \d, \S) that are
# invalid escape sequences in a normal string literal.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')

# Apply transformation
tf = vectorizer.fit_transform(col_titles).toarray()

# tf_feature_names tells us what word each column in the matrix represents.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when available and fall back for older versions. Kept as a list either way.
if hasattr(vectorizer, "get_feature_names_out"):
    tf_feature_names = vectorizer.get_feature_names_out().tolist()
else:
    tf_feature_names = vectorizer.get_feature_names()

# Select number of topics
number_of_topics = 10

# LDA object (fixed random_state so repeated runs give the same model)
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

# Fit data to model
model.fit(tf)

# Serialise
model_pickle = pickle.dumps(model)

# Put serialised object to s3
# NOTE(review): the name `object` shadows the Python builtin; it is kept here
# only in case later cells reference it - consider renaming.
bucket_name = 'ucl-msin0166-2021-individual-tmakloklas'
object = s3.Object(bucket_name, 'LDA_topic_model_sagemaker.pkl')
object.put(Body=model_pickle)

{'ResponseMetadata': {'RequestId': 'JWGKB4Z0GZBFB7XP',
  'HostId': 'DL1hJfdV0aP398IXdJWGzZR/rpZGS8R/y24ZNrAKPVYZiDwsNce1tSCY745NPTvD1deaUvSHfc8=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'DL1hJfdV0aP398IXdJWGzZR/rpZGS8R/y24ZNrAKPVYZiDwsNce1tSCY745NPTvD1deaUvSHfc8=',
   'x-amz-request-id': 'JWGKB4Z0GZBFB7XP',
   'date': 'Sun, 02 May 2021 10:58:59 GMT',
   'etag': '"73989d6483f0cdf11c10259e11e651dc"',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 1},
 'ETag': '"73989d6483f0cdf11c10259e11e651dc"'}