In [1]:
import time
from time import sleep

import json
import boto3
import pandas as pd

from datetime import datetime

In [2]:
data_dir = "data"
!mkdir $data_dir

!cd $data_dir && wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
!cd $data_dir && unzip ml-latest-small.zip
dataset_dir = data_dir + "/ml-latest-small/"
!ls $dataset_dir

--2021-02-15 00:23:05--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2021-02-15 00:23:06 (3.93 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  
links.csv  movies.csv  ratings.csv  README.txt	tags.csv


In [3]:
original_data = pd.read_csv(dataset_dir + '/ratings.csv')
print(original_data.info())

original_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB
None


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
watched_df = original_data.copy()
watched_df = watched_df[watched_df['rating'] > 3]
watched_df = watched_df[['userId', 'movieId', 'timestamp']]
watched_df['EVENT_TYPE'] = 'watch'

clicked_df = original_data.copy()
clicked_df = clicked_df[clicked_df['rating'] > 1]
clicked_df = clicked_df[['userId', 'movieId', 'timestamp']]
clicked_df['EVENT_TYPE'] = 'click'

interactions_df = clicked_df.copy()
interactions_df = interactions_df.append(watched_df)
interactions_df.sort_values("timestamp", axis=0, ascending=True,
                            inplace=True, na_position='last')

In [6]:
interactions_df.rename(columns={'userId':'USER_ID',
                                'movieId':'ITEM_ID',
                                'timestamp':'TIMESTAMP'
                               }, inplace=True)
interactions_filename = "interactions.csv"
interactions_df.to_csv((data_dir+"/"+interactions_filename), index=False, float_format='%.0f')

In [10]:
# configure the sdk to personalize
personalize = boto3.client('personalize')
personalize_runtime = boto3.client('personalize-runtime')

create_dataset_group_response = personalize.create_dataset_group(
    name = "personalize-demo-movielens-xinli"
)

dataset_group_arn = create_dataset_group_response['datasetGroupArn']
print(json.dumps(create_dataset_group_response, indent=2))

{
  "datasetGroupArn": "arn:aws:personalize:us-west-2:900019131056:dataset-group/personalize-demo-movielens-xinli",
  "ResponseMetadata": {
    "RequestId": "069327a9-6d89-459b-95dd-0ec0f038fb70",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 15 Feb 2021 00:37:42 GMT",
      "x-amzn-requestid": "069327a9-6d89-459b-95dd-0ec0f038fb70",
      "content-length": "111",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [11]:
%%time
max_time = time.time() + 3*60*60
while time.time() < max_time:
    describe_dataset_group_response = personalize.describe_dataset_group(datasetGroupArn=dataset_group_arn)
    status = describe_dataset_group_response["datasetGroup"]["status"]
    print("DatasetGroup: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATED FAILED":
        break
    
    time.sleep(60)

DatasetGroup: ACTIVE
CPU times: user 12.5 ms, sys: 1.5 ms, total: 14 ms
Wall time: 65 ms


In [12]:
interactions_schema = {
    "type": "record",
    "name": "Interactions",
    "namespace": "com.amazonaws.personalize.schema",
    "fields": [
        {
            "name": "USER_ID",
            "type": "string"
        },
        {
            "name": "ITEM_ID",
            "type": "string"
        },
        {
            "name": "EVENT_TYPE",
            "type": "string"
        },
        {
            "name": "TIMESTAMP",
            "type": "long"
        }
    ],
    "version": "1.0"
}

create_schema_response = personalize.create_schema(
    name = "personalize-demo-movielens-interactions",
    schema = json.dumps(interactions_schema)
)

interaction_schema_arn = create_schema_response['schemaArn']
print(json.dumps(create_schema_response, indent=2))

{
  "schemaArn": "arn:aws:personalize:us-west-2:900019131056:schema/personalize-demo-movielens-interactions",
  "ResponseMetadata": {
    "RequestId": "7eb7f2d9-e8e0-4ec8-9539-28cb68fa4d66",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 15 Feb 2021 00:44:29 GMT",
      "x-amzn-requestid": "7eb7f2d9-e8e0-4ec8-9539-28cb68fa4d66",
      "content-length": "105",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [13]:
dataset_type = "INTERACTIONS"
create_dataset_response = personalize.create_dataset(
    name = "personalize-demo-movielens-ints",
    datasetType = dataset_type,
    datasetGroupArn = dataset_group_arn,
    schemaArn = interaction_schema_arn
)

interactions_dataset_arn = create_dataset_response['datasetArn']
print(json.dumps(create_dataset_response, indent=2))

{
  "datasetArn": "arn:aws:personalize:us-west-2:900019131056:dataset/personalize-demo-movielens-xinli/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "f731eec3-5d6b-41c4-98e3-f48ee51876b6",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 15 Feb 2021 00:46:14 GMT",
      "x-amzn-requestid": "f731eec3-5d6b-41c4-98e3-f48ee51876b6",
      "content-length": "113",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [15]:
session = boto3.session.Session()
region = session.region_name
s3 = boto3.client('s3')
account_id = boto3.client('sts').get_caller_identity().get('Account')
bucket_name = account_id + "-" + region + "-" + "personalizedemoml"

print(bucket_name)

900019131056-us-west-2-personalizedemoml


In [16]:
if region == 'us-east-1':
    s3.create_bucket(Bucket=bucket_name)
else:
    s3.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={'LocationConstraint': region}
    )

In [17]:
interactions_file_path = data_dir + "/" + interactions_filename
boto3.Session().resource('s3').Bucket(bucket_name).Object(interactions_filename).upload_file(interactions_file_path)
interactions_s3DataPath = "s3://" + bucket_name + "/" + interactions_filename

In [18]:
policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [
        {
            "Sid": "PersonalizeS3BucketAccessPolicy",
            "Effect": "Allow",
            "Principal": {
                "Service": "personalize.amazonaws.com"
            },
            "Action": [
                "s3:*Object",
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::{}".format(bucket_name),
                "arn:aws:s3:::{}/*".format(bucket_name)
            ]
        }
    ]
}

In [19]:
s3.put_bucket_policy(Bucket=bucket_name, Policy=json.dumps(policy))

{'ResponseMetadata': {'RequestId': 'FF2B1B693020F9D0',
  'HostId': '6KTfPWnmt0Fim/Hq6bdE5D6IXBqxakMRpx/Q6dflGD1kIXy3TAnw8NW4wp+dcpJWc7nOBneiEck=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': '6KTfPWnmt0Fim/Hq6bdE5D6IXBqxakMRpx/Q6dflGD1kIXy3TAnw8NW4wp+dcpJWc7nOBneiEck=',
   'x-amz-request-id': 'FF2B1B693020F9D0',
   'date': 'Mon, 15 Feb 2021 01:13:24 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

In [20]:
iam = boto3.client("iam")

role_name = "PersonalizeRolePOC"
assume_role_policy_document = {
    "Version": "2012-10-17",
    "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "personalize.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
    ]
}

In [26]:
create_role_response = iam.create_role(
    RoleName = role_name,
    AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
)

policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonPersonalizeFullAccess"
iam.attach_role_policy(
    RoleName = role_name,
    PolicyArn = policy_arn
)

# add s3 support
iam.attach_role_policy(
    PolicyArn='arn:aws:iam::aws:policy/AmazonS3FullAccess',
    RoleName=role_name
)

# wait for a minute to allow IAM role policy attachment to propagate
time.sleep(60)

role_arn = create_role_response["Role"]["Arn"]
print(role_arn)

EntityAlreadyExistsException: An error occurred (EntityAlreadyExists) when calling the CreateRole operation: Role with name PersonalizeRolePOC already exists.

In [28]:
create_dataset_import_job_response = personalize.create_dataset_import_job(
    jobName = "personalize-demo-import2",
    datasetArn = interactions_dataset_arn,
    dataSource = {
        "dataLocation": "s3://{}/{}".format(bucket_name, interactions_filename)
    },
    roleArn = role_arn
)

dataset_import_job_arn = create_dataset_import_job_response['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_response, indent=2))

{
  "datasetImportJobArn": "arn:aws:personalize:us-west-2:900019131056:dataset-import-job/personalize-demo-import2",
  "ResponseMetadata": {
    "RequestId": "b3d4778f-c4ca-4967-90a1-c40cffc8349a",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 15 Feb 2021 01:53:19 GMT",
      "x-amzn-requestid": "b3d4778f-c4ca-4967-90a1-c40cffc8349a",
      "content-length": "112",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [29]:
%%time
max_time = time.time() + 6*60*60 # 6 hours
while time.time() < max_time:
    describe_dataset_import_job_response = personalize.describe_dataset_import_job(
        datasetImportJobArn = dataset_import_job_arn
    )
    status = describe_dataset_import_job_response["datasetImportJob"]['status']
    print("DatasetImportJob: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

DatasetImportJob: CREATE PENDING
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: CREATE IN_PROGRESS
DatasetImportJob: ACTIVE
CPU times: user 32.5 ms, sys: 5.1 ms, total: 37.6 ms
Wall time: 4min


In [30]:
# aws-user-personalization selected for demo purposes
recipe_arn = "arn:aws:personalize:::recipe/aws-user-personalization"

In [32]:
create_solution_response = personalize.create_solution(
    name = "personalize-demo-soln-user-personalization",
    datasetGroupArn = dataset_group_arn,
    recipeArn = recipe_arn
)

solution_arn = create_solution_response['solutionArn']
print(json.dumps(create_solution_response, indent=2))

{
  "solutionArn": "arn:aws:personalize:us-west-2:900019131056:solution/personalize-demo-soln-user-personalization",
  "ResponseMetadata": {
    "RequestId": "a1b66cea-03bb-4b0d-b92a-11393ebd4654",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 15 Feb 2021 02:14:02 GMT",
      "x-amzn-requestid": "a1b66cea-03bb-4b0d-b92a-11393ebd4654",
      "content-length": "112",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [34]:
create_solution_version_response = personalize.create_solution_version(
    solutionArn = solution_arn
)

solution_version_arn = create_solution_version_response['solutionVersionArn']
print(json.dumps(create_solution_version_response, indent=2))

{
  "solutionVersionArn": "arn:aws:personalize:us-west-2:900019131056:solution/personalize-demo-soln-user-personalization/9c0ce0db",
  "ResponseMetadata": {
    "RequestId": "bce2324b-ae0c-40ba-be0c-ff9c408e6a4d",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "content-type": "application/x-amz-json-1.1",
      "date": "Mon, 15 Feb 2021 02:15:28 GMT",
      "x-amzn-requestid": "bce2324b-ae0c-40ba-be0c-ff9c408e6a4d",
      "content-length": "128",
      "connection": "keep-alive"
    },
    "RetryAttempts": 0
  }
}


In [36]:
%%time
max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_solution_version_response = personalize.describe_solution_version(
        solutionVersionArn = solution_version_arn
    )
    status = describe_solution_version_response["solutionVersion"]["status"]
    print("SolutionVersion: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS
SolutionVersion: CREATE IN_PROGRESS


KeyboardInterrupt: 

In [37]:
get_solution_metrics_response = personalize.get_solution_metrics(
    solutionVersionArn = solution_version_arn
)

print(json.dumps(get_solution_metrics_response, indent=2))

ResourceInUseException: An error occurred (ResourceInUseException) when calling the GetSolutionMetrics operation: Cannot get metrics for solution since it is not in ACTIVE state.

In [None]:
create_campaign_response = personalize.create_campaign(
    name = "personalize-demo-camp",
    solutionVersionArn = solution_version_arn,
    minProvisionedTPS = 1,
    campaignConfig = {
        "itemExplorationConfig": {
            "explorationWeight": "0.3",
            "explorationItemAgeCutOff": "30"
        }
    }
)

campaign_arn = create_campaign_response['campaignArn']
print(json.dumps(create_campaign_response, indent=2))

max_time = time.time() + 3*60*60 # 3 hours
while time.time() < max_time:
    describe_campaign_response = personalize.describe_campaign(
        campaignArn = campaign_arn
    )
    status = describe_campaign_response["campaign"]["status"]
    print("Campaign: {}".format(status))
    
    if status == "ACTIVE" or status == "CREATE FAILED":
        break
        
    time.sleep(60)

In [38]:
# build a map to convert a movie id to the movie title
movies = pd.read_csv(dataset_dir + '/movies.csv', usecols=[0,1])
movies['movieId'] = movies['movieId'].astype(str)
movie_map = dict(movies.values)

# get a random user
user_id, item_id = interactions_df[['SER_ID', 'ITEM_ID']].sample().values[0]
get_recommendations_response = personalize_runtime.get_recommendations(
    campaignArn = campaign_ar,
    userId = str(user_id),
)

# udpate df rendering
pd.set_option('display.max_rows', 30)

print("Recommendations for user: ", user_id)

item_list = get_recommendations_response['itemList']
recommendation_list = []

for item in item_list:
    title = movie_map[item['itemId']]
    recommendation_list.append(title)
    
recommendations_df = pd.DataFrame(recommendation_list, columns = ['OriginalRecs'])
recommendations_df.head()

KeyError: "['SER_ID'] not in index"