#### S3 Upload
- 정제되 파일 읽기 및 확인
- Personalize가 사용할 S3 Bucket 생성, Policy 적용
- record가 너무 많으며, 학습속도에 문제가 있으므로 10 만 건으로 샘플링. 단. event_cnt가 최소 3개 되도록 설정 (성능을 위해서)
- 샘플 데이터에서 evnet_cnt drop 후 s3에 uplaod

In [1]:
import pandas as pd
import datetime
import numpy as np
pd.set_option('display.width', 1000)

In [2]:
interactions = pd.read_csv('./data/cleaned_user_interactions.csv')
print(interactions.shape)
interactions.head()

(5493809, 10)


Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE,PRICE,BRAND,CAT_0,CAT_1,CAT_2,EVENT_CNT
0,533326659,17300764,1671161000.0,cart,0.017464,lanvin,apparel,shoes,sandals,10
1,533326659,17300814,1671986000.0,cart,0.009719,lacoste,apparel,shoes,sandals,10
2,522355747,4804056,1667261000.0,cart,0.062099,apple,electronics,audio,headphone,8
3,522355747,100086235,1676053000.0,cart,0.044348,samsung,construction,tools,light,8
4,522355747,100086203,1676053000.0,cart,0.044348,samsung,construction,tools,light,8


In [3]:
insteractins_sample = interactions.sample(n=150000).query('EVENT_CNT >=3')
# insteractins_sample['EVENT_CNT'].value_counts())

record_cnt = insteractins_sample.shape[0]
print("sample record count before : ", record_cnt)
if record_cnt >=100000:
  insteractins_sample = insteractins_sample.iloc[:100000, :]
else:
  print("need resampling")

print("sample length: {}".format(len(insteractins_sample)))

sample record count before :  136743
sample length: 100000


In [4]:
insteractins_sample.isnull().sum()

USER_ID       0
ITEM_ID       0
TIMESTAMP     0
EVENT_TYPE    0
PRICE         0
BRAND         0
CAT_0         0
CAT_1         0
CAT_2         0
EVENT_CNT     0
dtype: int64

In [5]:
# 추천에 필요없는 EVENT_CNT column drop
insteractins_sample.drop(columns=['EVENT_CNT'], inplace=True)
insteractins_sample.head()

Unnamed: 0,USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE,PRICE,BRAND,CAT_0,CAT_1,CAT_2
817529,513748663,3701284,1670520000.0,cart,0.009692,elenberg,appliances,environment,vacuum
1707441,550455144,17300822,1671534000.0,purchase,0.017584,gucci,apparel,shoes,sandals
820181,512626294,1004250,1672385000.0,cart,0.318797,apple,construction,tools,light
3683276,551200527,1004250,1673273000.0,purchase,0.314779,apple,construction,tools,light
2266268,547738636,1004767,1675048000.0,purchase,0.08744,samsung,construction,tools,light


In [6]:
insteractins_sample['TIMESTAMP'].isnull().sum()

0

In [24]:
#  칼럼 타입 변환
insteractins_sample = insteractins_sample.astype({'USER_ID': 'string', "ITEM_ID": "string", "TIMESTAMP": "long"})

In [25]:
# insteractions sampling data을 파일로 저장
insteractins_sample.to_csv('./data/interactions-sample.csv', index=False)

In [27]:
# read sampled interactions file
sampled_intreactions = pd.read_csv('./data/interactions-sample.csv')
#  칼럼 타입 변환
sampled_intreactions = sampled_intreactions.astype({'USER_ID': 'string', "ITEM_ID": "string", "TIMESTAMP": "long"})
sampled_intreactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   USER_ID     100000 non-null  string 
 1   ITEM_ID     100000 non-null  string 
 2   TIMESTAMP   100000 non-null  int64  
 3   EVENT_TYPE  100000 non-null  object 
 4   PRICE       100000 non-null  float64
 5   BRAND       100000 non-null  object 
 6   CAT_0       100000 non-null  object 
 7   CAT_1       100000 non-null  object 
 8   CAT_2       100000 non-null  object 
dtypes: float64(1), int64(1), object(5), string(2)
memory usage: 6.9+ MB


#### Create Bucket and policy

In [9]:
import boto3
from botocore.exceptions import ClientError
import numpy as np
import string
import random
import json
import time

#### Configure S3 bucket and an IAM Role

In [11]:
random.seed(2023)
suffix = ''.join(random.choice(string.ascii_lowercase) for i in range(5))
# bucket  생성
bucket_name = "osungmart-personalize-" + suffix
region = 'ap-northeast-2'
# print(bucket_name)
s3 = boto3.client("s3")
try:
  create_bucket_resp = s3.create_bucket(
    Bucket = bucket_name,
    CreateBucketConfiguration = {
      'LocationConstraint' : region
    },
  )
except ClientError as err:
  if err.response['Error']['Code'] == 'BucketAlreadyOwnedByYou':
    print(f"Buket {bucket_name} alread exist")
  else:
    raise

Buket osungmart-personalize-mwomk alread exist


Upload data to S3

In [12]:
data_dir = 'data'
interactions_filename = data_dir + '/interactions-sample.csv'
try: 
  boto3.Session().resource('s3').Bucket(bucket_name).Object(interactions_filename).upload_file(interactions_filename)
except ClientError as e:
  print(e)

#### Dataset Group and Dataset

In [13]:
personalize = boto3.client('personalize')

In [14]:
# Create Dataset Group
dataset_group_name = "osungmart-dataset-group" + suffix

create_dataset_group_resp = personalize.create_dataset_group(
  name = dataset_group_name
)

dataset_group_arn = create_dataset_group_resp['datasetGroupArn']
print(json.dumps(create_dataset_group_resp, indent=2))

{
  "datasetGroupArn": "arn:aws:personalize:ap-northeast-2:532805286864:dataset-group/osungmart-dataset-groupmwomk",
  "ResponseMetadata": {
    "RequestId": "3fb997fe-e4ad-47f3-a983-b5498ba1d65c",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Thu, 02 Feb 2023 08:44:08 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "112",
      "connection": "keep-alive",
      "x-amzn-requestid": "3fb997fe-e4ad-47f3-a983-b5498ba1d65c"
    },
    "RetryAttempts": 0
  }
}


In [15]:
# dataset group 생성 모니터링
status = None
max_time = time.time()* 3*60*60 # 3h
while time.time() < max_time:
  describe_dataset_group_resp = personalize.describe_dataset_group(
    datasetGroupArn = dataset_group_arn
  )
  status = describe_dataset_group_resp['datasetGroup']['status']
  print("DatasetGroup: {}".format(status))

  if status == "ACTIVE" or status == 'CREATE FAILED':
    break

  time.sleep(10)

DatasetGroup: CREATE PENDING
DatasetGroup: ACTIVE


#### dataset 생성
interactions dataset

In [16]:
schema_name = "osungmart-interactions-schema"

In [28]:
# delete schema
# personalize.delete_schema(
#   schemaArn = "arn:aws:personalize:ap-northeast-2:532805286864:schema/osungmart-interactions-schema"
# )

{'ResponseMetadata': {'RequestId': '888198d6-a6de-4a83-85d4-f98de391122e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Thu, 02 Feb 2023 07:59:30 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'connection': 'keep-alive',
   'x-amzn-requestid': '888198d6-a6de-4a83-85d4-f98de391122e'},
  'RetryAttempts': 0}}

In [17]:
# schema 생성
# USER_ID,ITEM_ID,TIMESTAMP,EVENT_TYPE,PRICE,BRAND,CAT_0,CAT_1,CAT_2


schema = {
  "type": "record",
  "name": "Interactions",
  "namespace": "com.amazonaws.personalize.schema",
  "fields": [
    {
      "name": "ITEM_ID",
      "type": "string"
    },
    {
      "name": "USER_ID",
      "type": "string"
    },
    {
      "name": "TIMESTAMP",
      "type": "long"
    },
    {
      "name": "EVENT_TYPE",
      "type": "string"
    },
    {
      "name": "PRICE",
      "type": "float"
    },
    {
      "name":"BRAND",
      "type": "string",
      "categorical": True
    },
    {
      "name": "CAT_0",
      "type": "string",
      "categorical": True
    },
    {
      "name": "CAT_1",
      "type": "string",
      "categorical": True
    },
    {
      "name": "CAT_2",
      "type": "string",
      "categorical": True
    },
  ]
}

create_schema_resp = personalize.create_schema(
  name = schema_name,
  schema = json.dumps(schema)
)

schema_arn = create_schema_resp['schemaArn']
print(json.dumps(create_schema_resp, indent=2))

{
  "schemaArn": "arn:aws:personalize:ap-northeast-2:532805286864:schema/osungmart-interactions-schema",
  "ResponseMetadata": {
    "RequestId": "d85717da-d601-48f1-bfef-a300789e2e21",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Thu, 02 Feb 2023 08:44:53 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "100",
      "connection": "keep-alive",
      "x-amzn-requestid": "d85717da-d601-48f1-bfef-a300789e2e21"
    },
    "RetryAttempts": 0
  }
}


In [18]:
# create dataset
dataset_type = "INTERACTIONS"
create_dataset_resp = personalize.create_dataset(
  datasetType = dataset_type,
  datasetGroupArn = dataset_group_arn,
  schemaArn = schema_arn,
  name = "osugnmart-dataset-interactions"
)

interactions_dataset_arn = create_dataset_resp['datasetArn']
print(json.dumps(create_dataset_resp, indent=2))


{
  "datasetArn": "arn:aws:personalize:ap-northeast-2:532805286864:dataset/osungmart-dataset-groupmwomk/INTERACTIONS",
  "ResponseMetadata": {
    "RequestId": "6b5037bd-589a-4410-880f-cde71dc3ebab",
    "HTTPStatusCode": 200,
    "HTTPHeaders": {
      "date": "Thu, 02 Feb 2023 08:44:56 GMT",
      "content-type": "application/x-amz-json-1.1",
      "content-length": "114",
      "connection": "keep-alive",
      "x-amzn-requestid": "6b5037bd-589a-4410-880f-cde71dc3ebab"
    },
    "RetryAttempts": 0
  }
}


#### S3 Bucket policy 설정
peronalize service가 s3 bucket에 접근할 수 있도록 bucket permission 설정

In [37]:
policy = {
    "Version": "2012-10-17",
    "Id": "PersonalizeS3BucketAccessPolicy",
    "Statement": [
        {
            "Sid": "PersonalizeS3BucketAccessPolicy",
            "Effect": "Allow",
            "Principal": {
                "Service": "personalize.amazonaws.com",
            },
            "Action": [
                "s3:GetObject",
                "s3:ListBucket"
            ],
            "Resource": [
                "arn:aws:s3:::{}".format(bucket_name),
                "arn:aws:s3:::{}/*".format(bucket_name)
            ]
        }
    ]
}

s3.put_bucket_policy(Bucket=bucket_name, Policy=json.dumps(policy))

{'ResponseMetadata': {'RequestId': 'Z32PVWR5FQ6HM9N0',
  'HostId': 'yG/6ZWtvbnSBUFhONIi8quRAURWz2hSqqs9US4j3XO3dM9KAUyuzpdmHPurcM09z4fsoy07F+t4=',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'x-amz-id-2': 'yG/6ZWtvbnSBUFhONIi8quRAURWz2hSqqs9US4j3XO3dM9KAUyuzpdmHPurcM09z4fsoy07F+t4=',
   'x-amz-request-id': 'Z32PVWR5FQ6HM9N0',
   'date': 'Thu, 02 Feb 2023 04:46:26 GMT',
   'server': 'AmazonS3'},
  'RetryAttempts': 0}}

#### IAM Role 생성

In [20]:
iam = boto3.client("iam")

role_name = "PersonalizeS3-Role" + suffix
assume_role_policy_document = {
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Service": "personalize.amazonaws.com"
      },
      "Action": "sts:AssumeRole"
    }
  ]
}

try:
  create_role_resp = iam.create_role(
    RoleName = role_name,
    AssumeRolePolicyDocument = json.dumps(assume_role_policy_document)
  )

  iam.attach_role_policy(
    RoleName = role_name,
    PolicyArn = "arn:aws:iam::aws:policy/AmazonS3ReadOnlyAccess"
  )

  role_arn = create_role_resp['Role']['Arn']
except ClientError as e:
  if e.response['Error']['Code'] == 'EntityAlreadyExists':
    role_arn = iam.get_role(RoleName=role_name)['Role']['Arn']
  else:
    raise

time.sleep(45)
print(role_arn)

arn:aws:iam::532805286864:role/PersonalizeS3-Rolemwomk


In [16]:
# dataset_group_arn = "arn:aws:personalize:ap-northeast-2:532805286864:dataset-group/osungmart-dataset-groupmwomk"
# bucket_name = "osungmart-personalize-mwomk"
# interactions_filename ="data/interactions-sample.csv"
# role_arn = "arn:aws:iam::532805286864:role/PersonalizeS3-Rolemwomk"
# print(dataset_group_arn)

arn:aws:personalize:ap-northeast-2:532805286864:dataset-group/osungmart-dataset-groupmwomk


#### Dataset import jobs 생성
upload s3 data to dataset

In [28]:
# import the interactions data
create_dataset_import_job_resp = personalize.create_dataset_import_job(
  jobName = "osungmart-dataset-import-job-interactions",
  datasetArn = interactions_dataset_arn,
  dataSource = {
    "dataLocation": "s3://{}/{}".format(bucket_name, interactions_filename)
  },
  roleArn = role_arn
)

dataset_import_job_arn = create_dataset_import_job_resp['datasetImportJobArn']
print(json.dumps(create_dataset_import_job_resp, indent=2))

InvalidInputException: An error occurred (InvalidInputException) when calling the CreateDatasetImportJob operation: Input csv has rows that do not conform to the dataset schema. Please ensure all required data fields are present and that they are of the type specified in the schema.