### Background Processing for Personalize Lab

This notebook runs the lab's time-consuming steps in the background so that they do not disrupt the flow of the lab.

**Input needed:**

**Prefix** - A prefix to apply to S3 buckets generated by the lab.  Must be globally unique and lowercase (S3 bucket names may contain only lowercase letters, numbers, dots, and hyphens).  Suggestion is to use your initials and the current date, e.g. **cv-3-20-2020**

In [None]:

# Replace the placeholder with your own prefix; it becomes the leading part of the
# S3 bucket name that is assembled from `prefix` further down in this notebook.
prefix = "{initials}-{date}"


In [1]:
import boto3

import json
import numpy as np
import pandas as pd
import time

#If you have to rerun the notebook, increment this suffix
bg_suffix = 'background-1'

# Replace with the name of your S3 bucket
bucket = prefix + "-personalize-lab-" + bg_suffix

%store bg_suffix
%store bucket


# Clients for the Personalize service and for its runtime (recommendation) API.
personalize = boto3.client(service_name='personalize')
personalize_runtime = boto3.client(service_name='personalize-runtime')

s3 = boto3.client('s3')

# Only create the bucket when the lookup reports no creation date for it,
# which keeps this step safe to rerun.
if boto3.resource('s3').Bucket(bucket).creation_date is None:
    print("Creating bucket: {}".format(bucket))
    create_bucket_kwargs = {"ACL": "private", "Bucket": bucket}
    # Outside us-east-1, S3 rejects CreateBucket unless the target region is named
    # explicitly; us-east-1 itself must NOT be passed as a LocationConstraint.
    bucket_region = s3.meta.region_name
    if bucket_region and bucket_region != "us-east-1":
        create_bucket_kwargs["CreateBucketConfiguration"] = {
            "LocationConstraint": bucket_region
        }
    s3.create_bucket(**create_bucket_kwargs)
    
filename = 'gas_station_data_5k'

!wget -N https://bp-personalize-lab-2020.s3.amazonaws.com/gas_station_data_5k.zip
!unzip -o gas_station_data_5k.zip
data = pd.read_csv('./gas_station_data_5k.csv', sep=',', names=['USER_ID', 'ITEM_ID', 'TIMESTAMP'])
pd.set_option('display.max_rows', 5)

filename = "gas_station_data.csv"

data = data[data['ITEM_ID'] < 15]                  # keep only non-alcoholic beverages
data = data[['USER_ID', 'ITEM_ID', 'TIMESTAMP']]   # select columns that match the columns in the schema below
data['TIMESTAMP'] = data['TIMESTAMP'] + 660833618  # make purchases end 1st April 2019 rather than 23rd April 1998
data.to_csv(filename, index=False)

boto3.Session().resource('s3').Bucket(bucket).Object(filename).upload_file(filename)

schema_name = "bp-pers-schema" + bg_suffix

schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {
            "name": "USER_ID",
            "type": "string"
        },
        {
            "name": "ITEM_ID",
            "type": "string"
        },
        {
            "name": "TIMESTAMP",
            "type": "long"
        }
    ],
    "version": "1.0"
}

create_schema_response = personalize.create_schema(
    name = schema_name,
    schema = json.dumps(schema)
)

schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

%store schema_arn


dataset_name = "bp-pers-dataset-group" + bg_suffix

create_dataset_group_response = personalize.create_dataset_group(
    name = dataset_name
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

%store dataset_group_arn



# Poll every 15 seconds until the dataset group reaches a terminal status,
# giving up after 3 hours.
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(
        datasetGroupArn = dataset_group_arn
    )
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(15)

# Stop here on failure or timeout instead of letting the following steps fail
# against a dataset group that never became usable.
if status != "ACTIVE":
    raise RuntimeError(
        "Dataset group did not become ACTIVE (last status: {})".format(status)
    )
    
    
    
    
dataset_type = "INTERACTIONS"

create_dataset_response = personalize.create_dataset(
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = schema_arn,
    name="bp-pers-dataset-" + bg_suffix
)

dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

%store dataset_arn


# ARNs covered by the policy: the bucket itself and every object in it.
bucket_arn = "arn:aws:s3:::{}".format(bucket)
bucket_objects_arn = "arn:aws:s3:::{}/*".format(bucket)

# Single statement allowing the Personalize service principal to list the bucket
# and read its objects.
personalize_read_statement = {
    "Sid": "PersonalizeS3BucketAccessPolicy",
    "Effect": "Allow",
    "Principal": {"Service": "personalize.amazonaws.com"},
    "Action": ["s3:GetObject", "s3:ListBucket"],
    "Resource": [bucket_arn, bucket_objects_arn],
}

policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [personalize_read_statement],
}

# Trailing semicolon keeps the API response out of the cell output.
s3.put_bucket_policy(Bucket=bucket, Policy=json.dumps(policy));




iam = boto3.client("iam")

role_name = "PersonalizeRoleForLab"

# list_roles returns one page of roles per call, so walk every page; otherwise a
# role beyond the first page is never found. The match stays a substring test on
# the ARN, as before.
matching_role_arns = [
    role['Arn']
    for page in iam.get_paginator('list_roles').paginate()
    for role in page['Roles']
    if role_name in role['Arn']
]

# Fail with a clear message instead of a NameError on `role_arn` further down.
if not matching_role_arns:
    raise RuntimeError(
        "No IAM role whose ARN contains '{}' was found".format(role_name)
    )

# Keep the last match, mirroring the previous loop's "last assignment wins" behavior.
role_arn = matching_role_arns[-1]
print(role_arn)


dataSource = {"dataLocation": "s3://{}/{}".format(bucket, filename)}
dataSource

create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = "bp-personalize-import-job" + bg_suffix,
    datasetArn = dataset_arn,
    dataSource = {"dataLocation": "s3://{}/{}".format(bucket, filename)},
    roleArn = role_arn
)

dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

%store dataset_import_job_arn


# Poll once a minute until the import job reaches a terminal status,
# giving up after 3 hours.
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_import_job_arn
    )

    dataset_import_job = describe_dataset_import_job_response["datasetImportJob"]
    # Prefer the status of the latest job run when the response carries one;
    # otherwise fall back to the job's own status.
    if "latestDatasetImportJobRun" not in dataset_import_job:
        status = dataset_import_job["status"]
        print("DatasetImportJob: {}".format(status))
    else:
        status = dataset_import_job["latestDatasetImportJobRun"]["status"]
        print("LatestDatasetImportJobRun: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

# Stop here on failure or timeout instead of training on data that was never imported.
if status != "ACTIVE":
    raise RuntimeError(
        "Dataset import job did not become ACTIVE (last status: {})".format(status)
    )
    
    
    
    

# Candidate recipe ARNs; only one of them is used to train the solution.
recipe_list = [
    "arn:aws:personalize:::recipe/aws-hrnn",
    "arn:aws:personalize:::recipe/aws-hrnn-coldstart",
    "arn:aws:personalize:::recipe/aws-hrnn-metadata",
    "arn:aws:personalize:::recipe/aws-personalized-ranking",
    "arn:aws:personalize:::recipe/aws-popularity-count",
    "arn:aws:personalize:::recipe/aws-sims",
]

# Position in recipe_list of the recipe this lab trains with (the first entry).
selected_recipe_index = 0
recipe_arn = recipe_list[selected_recipe_index]
print(recipe_arn)




create_solution_response = personalize.create_solution(
    name = "bp-beverage-solution" + bg_suffix,
    datasetGroupArn = dataset_group_arn,
    recipeArn = recipe_arn
)

solution_arn = create_solution_response['solutionArn']
print(json.dumps(create_solution_response, indent=2))

%store solution_arn


create_solution_version_response = personalize.create_solution_version(
    solutionArn = solution_arn
)

solution_version_arn = create_solution_version_response['solutionVersionArn']
print(json.dumps(create_solution_version_response, indent=2))

%store solution_version_arn

# Poll once a minute until the solution version reaches a terminal status,
# giving up after 3 hours.
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_solution_version_response = personalize.describe_solution_version(
        solutionVersionArn = solution_version_arn
    )
    status = describe_solution_version_response["solutionVersion"]["status"]
    print("SolutionVersion: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

# Stop here on failure or timeout instead of requesting metrics and a campaign
# for a solution version that never became usable.
if status != "ACTIVE":
    raise RuntimeError(
        "Solution version did not become ACTIVE (last status: {})".format(status)
    )
    
    
    
    
    
# Fetch and pretty-print the metrics reported for the solution version.
get_solution_metrics_response = personalize.get_solution_metrics(solutionVersionArn=solution_version_arn)
print(json.dumps(get_solution_metrics_response, indent=2))




create_campaign_response = personalize.create_campaign(
    name = "bp-beverage-campaign" + bg_suffix,
    solutionVersionArn = solution_version_arn,
    minProvisionedTPS = 1
)

campaign_arn = create_campaign_response['campaignArn']
print(json.dumps(create_campaign_response, indent=2))

%store campaign_arn


# Poll once a minute until the campaign reaches a terminal status,
# giving up after 3 hours.
status = None
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_campaign_response = personalize.describe_campaign(
        campaignArn = campaign_arn
    )
    status = describe_campaign_response["campaign"]["status"]
    print("Campaign: {}".format(status))

    if status in ("ACTIVE", "CREATE FAILED"):
        break

    time.sleep(60)

# Stop here on failure or timeout instead of requesting recommendations from a
# campaign that never became usable.
if status != "ACTIVE":
    raise RuntimeError(
        "Campaign did not become ACTIVE (last status: {})".format(status)
    )
    
    
    
# Item catalogue: beverages.csv is read without a header row, keeping only its
# first two columns, which are then labelled as item id and title.
# NOTE(review): this notebook does not download beverages.csv itself - confirm it
# is already present in the working directory.
items = pd.read_csv('./beverages.csv', sep=',', usecols=[0,1], header=None)
items.columns = ['ITEM_ID', 'TITLE']

# Pick one random interaction and look up the title of the item it refers to.
sampled_interaction = data.sample().values[0]
user_id, item_id, _ = sampled_interaction
rows_for_item = items.loc[items['ITEM_ID'] == item_id]
item_title = rows_for_item.values[0][-1]  # last column is TITLE

for label, value in (("USER", user_id), ("ITEM", item_title)):
    print("{}: {}".format(label, value))

items



# Ask the campaign for recommendations for the sampled user.
get_recommendations_response = personalize_runtime.get_recommendations(
    campaignArn = campaign_arn,
    userId = str(user_id)
)

item_list = get_recommendations_response['itemList']

# Translate each recommended itemId into its title. The id is converted with the
# builtin int() before matching against ITEM_ID; the former np.int alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24.
title_list = [
    items.loc[items['ITEM_ID'] == int(item['itemId'])].values[0][-1]
    for item in item_list
]

print("Recommendations: {}".format(json.dumps(title_list, indent=2)))


# Every picture lives under the same S3 prefix, so only the file name varies per title.
image_base_url = "https://bp-personalize-lab-2020.s3.amazonaws.com/images/"

# (beverage title, image file name) pairs, in display order.
image_files = [
    ("Coca-Cola/Pepsi", "coke-pepsi.jpg"),
    ("Sprite/7Up", "7up.jpg"),
    ("Fanta", "fanta.jpg"),
    ("DrPepper/MrPibb", "drpepper-mypibb.jpg"),
    ("Mountain Dew/Sierra Mist", "mtdew-smist.jpg"),
    ("Root Beer", "rootbeer.jpg"),
    ("Energy Drink", "energydrink.jpg"),
    ("Sports Drink", "sportsdrink.jpg"),
    ("Coffee, Bottled", "coffeebottle.jpg"),
    ("Coffee, Store", "coffeestore.jpg"),
    ("Tea", "tea.jpg"),
    ("Juice", "juice.jpg"),
    ("Milk", "milk.jpg"),
    ("Water, Bottled", "water.jpg"),
    ("Domestic Beer", "beerdomestic.jpg"),
    ("Craft Beer", "beercraft.jpg"),
    ("Wine", "wine.jpg"),
    ("Misc Alcoholic Beverage", "miscalcohol.png"),
    ("Other 1", "other.jpg"),
    ("Other 2", "other.jpg"),
    ("Other 3", "other.jpg"),
    ("Other 4", "other.jpg"),
    ("Other 5", "other.jpg"),
    ("Other 6", "other.jpg"),
]

# Mapping of beverage title -> {"url": <image URL>}.
images = {
    title: {"url": image_base_url + file_name}
    for title, file_name in image_files
}

# Persist the images mapping with %store so it can be restored later via `%store -r`.
%store images

Stored 'bg_suffix' (str)
Stored 'bucket' (str)
Creating bucket: cjv-bp-personalize-lab-background1
--2020-03-16 16:12:32--  https://bp-personalize-lab-2020.s3.amazonaws.com/gas_station_data_5k.zip
Resolving bp-personalize-lab-2020.s3.amazonaws.com (bp-personalize-lab-2020.s3.amazonaws.com)... 52.217.43.164
Connecting to bp-personalize-lab-2020.s3.amazonaws.com (bp-personalize-lab-2020.s3.amazonaws.com)|52.217.43.164|:443... connected.
HTTP request sent, awaiting response... 304 Not Modified
File ‘gas_station_data_5k.zip’ not modified on server. Omitting download.

Archive:  gas_station_data_5k.zip
  inflating: gas_station_data_5k.csv  
{
  "schemaArn": "arn:aws:personalize:us-east-1:471551772664:schema/bp-pers-schemabackground1",
  "ResponseMetadata": {
    "RequestId": "282dd3d6-20d9-49e7-b464-ce620844d21a",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 16 Mar 2020 16:12:33 GMT",
      "x-amzn-requestid": "282dd