In [51]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import time
import os
import boto3
import sagemaker
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sagemaker.amazon.amazon_estimator import get_image_uri
import io
import sagemaker.amazon.common as smac
import scipy.sparse as sp
from sklearn.preprocessing import LabelEncoder
import json

In [52]:
# Execution role, SageMaker session and the S3 layout shared by the cells below
role = sagemaker.get_execution_role()

sess = sagemaker.Session()
bucket = "gcu-sg02-007-ml2"
prefix = "builtin-notebooks/Recomendation-Machine/Explicit"
print(f"role: {role} bucket: {bucket}")

# Training channel: object name, key prefix and the full S3 folder URI
train_key = 'train.protobuf'
train_prefix = f"{prefix}/train"
s3_train = f"s3://{bucket}/{prefix}/train/"

# Test channel: object name and key prefix
test_key = 'test.protobuf'
test_prefix = f"{prefix}/test"

# S3 output location (passed to the estimator as its output path)
output_prefix = f"s3://{bucket}/{prefix}/output"
my_region = boto3.session.Session().region_name

print(my_region)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
role: arn:aws:iam::629515838455:role/service-role/AmazonSageMaker-ExecutionRole-20231201T000262 bucket: gcu-sg02-007-ml2
ap-northeast-2


In [53]:
# Load the raw viewing log. header=None makes pandas treat the first line as data,
# so the column names are supplied explicitly.
col_name = [
    "User_ID",
    "Stream_ID",
    "Streamer_username",
    "Time_start",
    "Time_stop",
]
df = pd.read_csv('./100k_a.csv', names=col_name, header=None)
df.head()

Unnamed: 0,User_ID,Stream_ID,Streamer_username,Time_start,Time_stop
0,1,33842865744,mithrain,154,156
1,1,33846768288,alptv,166,169
2,1,33886469056,mithrain,587,588
3,1,33887624992,wtcn,589,591
4,1,33890145056,jrokezftw,591,594


In [54]:
# Largest stop time and smallest start time in the log
# (the printed labels read "max measured time" / "min measured time")
print(f"최대 측정 시간 : {df['Time_stop'].max()}")
print(f"최소 측정 시간 : {df['Time_start'].min()}")

최대 측정 시간 : 6148
최소 측정 시간 : 0


In [55]:
# Add the rating column: watch duration (Time_stop - Time_start) scaled by 10

df["rating"] = df["Time_stop"].sub(df["Time_start"]).mul(10)
df.head()

Unnamed: 0,User_ID,Stream_ID,Streamer_username,Time_start,Time_stop,rating
0,1,33842865744,mithrain,154,156,20
1,1,33846768288,alptv,166,169,30
2,1,33886469056,mithrain,587,588,10
3,1,33887624992,wtcn,589,591,20
4,1,33890145056,jrokezftw,591,594,30


In [56]:
# Collapse to one row per (User_ID, Streamer_username) pair, summing that pair's ratings
df = df.groupby(['User_ID', 'Streamer_username'], as_index=False)['rating'].sum()

df

Unnamed: 0,User_ID,Streamer_username,rating
0,1,alptv,30
1,1,berkriptepe,30
2,1,elraenn,20
3,1,eraymaskulen,10
4,1,esl_csgo,10
...,...,...,...
1505153,100000,mckytv,10
1505154,100000,natehill,10
1505155,100000,ninja,30
1505156,100000,replays,10


In [57]:
# Smallest and largest aggregated rating, one per line
print(df["rating"].min(), df["rating"].max(), sep="\n")

10
6340


In [58]:
# Keep a reproducible random subset of 100,000 rows (seeded with 42)
df = df.sample(random_state=42, n=100000)

In [59]:
# df = df.drop(['Stream_ID', 'Time_start', 'Time_stop'], axis = 1)
# df.dropna(inplace=True)
# df

In [60]:
# # Create a LabelEncoder for each column
# user_id_encoder = LabelEncoder()
# streamer_username_encoder = LabelEncoder()

# # Transform the data with the LabelEncoders
# df['User_ID'] = user_id_encoder.fit_transform(df['User_ID'])
# df['Streamer_username'] = streamer_username_encoder.fit_transform(df['Streamer_username'])

# df

In [61]:
# Hold out 20% of the rows for testing; each part gets a fresh 0..n-1 index
train_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
)

In [62]:
# One-hot encode the (User_ID, Streamer_username) pair into sparse float32 feature matrices.
#
# The encoder is fitted on the full sampled frame (train + test rows) so both matrices
# share one column layout; handle_unknown='ignore' encodes unseen categories as all zeros.
#
# The `sparse=True` keyword is intentionally not passed: sparse output is already the
# default, and `sparse` was deprecated in scikit-learn 1.2 (renamed `sparse_output`) and
# removed in 1.4, where passing it raises a TypeError.
feature_cols = ['User_ID', 'Streamer_username']

enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df[feature_cols])

X_train_OH = enc.transform(train_df[feature_cols]).astype('float32')
Y_train_OH = train_df['rating'].astype('float32')

X_test_OH = enc.transform(test_df[feature_cols]).astype('float32')
Y_test_OH = test_df['rating'].astype('float32')



In [63]:
# Distinct users and streamers learned by the encoder (one category array per input column)
user_ids, streamer_usernames = enc.categories_

print("User_IDs:", user_ids)
print("Streamer_usernames:", streamer_usernames)


User_IDs: [     2      7     14 ...  99997  99999 100000]
Streamer_usernames: ['030_hi' '080808080' '0_doublezero_0' ... 'zzzerrr' 'zzzireael'
 'zzztantrikazzz']


In [64]:
# Text dump of the non-zero entries of both sparse matrices, train first
print(X_train_OH, X_test_OH, sep="\n")

  (0, 20060)	1.0
  (0, 65418)	1.0
  (1, 17878)	1.0
  (1, 82138)	1.0
  (2, 13104)	1.0
  (2, 71128)	1.0
  (3, 36499)	1.0
  (3, 77585)	1.0
  (4, 22688)	1.0
  (4, 82166)	1.0
  (5, 32560)	1.0
  (5, 64781)	1.0
  (6, 47248)	1.0
  (6, 67002)	1.0
  (7, 25214)	1.0
  (7, 76043)	1.0
  (8, 20818)	1.0
  (8, 70442)	1.0
  (9, 33288)	1.0
  (9, 71539)	1.0
  (10, 14118)	1.0
  (10, 76526)	1.0
  (11, 46378)	1.0
  (11, 61261)	1.0
  (12, 44053)	1.0
  :	:
  (79987, 67969)	1.0
  (79988, 4298)	1.0
  (79988, 70145)	1.0
  (79989, 24959)	1.0
  (79989, 65660)	1.0
  (79990, 17155)	1.0
  (79990, 66507)	1.0
  (79991, 29248)	1.0
  (79991, 56865)	1.0
  (79992, 23542)	1.0
  (79992, 62179)	1.0
  (79993, 34313)	1.0
  (79993, 62212)	1.0
  (79994, 20976)	1.0
  (79994, 64435)	1.0
  (79995, 24490)	1.0
  (79995, 67300)	1.0
  (79996, 33960)	1.0
  (79996, 68279)	1.0
  (79997, 9440)	1.0
  (79997, 78306)	1.0
  (79998, 41314)	1.0
  (79998, 53175)	1.0
  (79999, 46273)	1.0
  (79999, 78347)	1.0
  (0, 6301)	1.0
  (0, 55976)	1.0
  (1, 48

In [65]:
# Label vectors ordered by index, train first
print(Y_train_OH.sort_index(), Y_test_OH.sort_index(), sep="\n")

0        700.0
1         10.0
2        120.0
3         70.0
4         10.0
         ...  
79995     30.0
79996    110.0
79997     20.0
79998    370.0
79999     10.0
Name: rating, Length: 80000, dtype: float32
0        30.0
1        10.0
2        20.0
3        50.0
4        80.0
         ... 
19995    10.0
19996    30.0
19997    10.0
19998    50.0
19999    20.0
Name: rating, Length: 20000, dtype: float32


In [66]:
# Width of the one-hot matrix (used later as the model's feature_dim) and training row count
columns = X_train_OH.shape[1]

print(f"Columns:{columns} ")
print(f"Rows:{X_train_OH.shape[0]} ")

Columns:82636 
Rows:80000 


In [67]:
# train_df.to_csv('train_data.csv', mode="w", header=False, index=False)
# test_df.to_csv('test_data.csv', mode="w", header=False, index=False)

In [68]:
# save data at bucket
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    """Serialize features and labels to RecordIO-protobuf and upload the result to S3.

    Args:
        X: scipy sparse feature matrix, one row per example.
        Y: label vector aligned with the rows of ``X`` by position (NumPy array,
            list or pandas Series), or ``None`` to write features only.
        bucket: name of the destination S3 bucket.
        prefix: key prefix ("folder") inside the bucket.
        key: object name appended to ``prefix``.

    Returns:
        The ``s3://bucket/prefix/key`` URI of the uploaded object.
    """
    # Hand the writer a plain NumPy array. The SDK writer is understood to look labels up
    # by row number; with a pandas Series that lookup goes through the index labels, so a
    # Series whose index is not 0..n-1 (e.g. straight out of train_test_split) would yield
    # wrong labels or a KeyError. Converting here makes the pairing purely positional.
    labels = None if Y is None else np.asarray(Y)

    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, labels)
    buf.seek(0)  # rewind so the upload starts from the first byte

    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket, obj)
  
# Upload both splits and keep the S3 URIs for the training channels
train_data = writeDatasetToProtobuf(
    X=X_train_OH, Y=Y_train_OH, bucket=bucket, prefix=train_prefix, key=train_key
)
test_data = writeDatasetToProtobuf(
    X=X_test_OH, Y=Y_test_OH, bucket=bucket, prefix=test_prefix, key=test_key
)

print(train_data, test_data, sep="\n")
print(f'Output: {output_prefix}')

s3://gcu-sg02-007-ml2/builtin-notebooks/Recomendation-Machine/Explicit/train/train.protobuf
s3://gcu-sg02-007-ml2/builtin-notebooks/Recomendation-Machine/Explicit/test/test.protobuf
Output: s3://gcu-sg02-007-ml2/builtin-notebooks/Recomendation-Machine/Explicit/output


In [69]:
# Upload to the bucket
# boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train_data.csv')).upload_file('train_data.csv')
# boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test_data.csv')).upload_file('test_data.csv')

In [70]:
# s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_data, content_type="text/csv")
# s3_input_test = sagemaker.inputs.TrainingInput(s3_data=test_data, content_type='text/csv')

In [71]:
# Factorization Machines estimator, written against the SageMaker Python SDK v2 API.
#
# - The container image is looked up with sagemaker.image_uris.retrieve(), which replaces
#   the deprecated get_image_uri() helper.
# - instance_count / instance_type replace the deprecated train_instance_count /
#   train_instance_type keyword names.
# - The session created in the configuration cell (`sess`) is reused instead of
#   constructing a second one.
instance_type = 'ml.m5.large'  # also used for the endpoint when the model is deployed

fm_image_uri = sagemaker.image_uris.retrieve(
    "factorization-machines", boto3.Session().region_name
)

fm = sagemaker.estimator.Estimator(
    fm_image_uri,
    role,
    instance_count=1,
    instance_type=instance_type,
    output_path=output_prefix,
    sagemaker_session=sess,
)

# feature_dim must equal the width of the one-hot matrix computed above (`columns`)
fm.set_hyperparameters(feature_dim=columns,
                       predictor_type='regressor',
                       mini_batch_size=1000,
                       num_factors=64,
                       epochs=100)

See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker.image_uris:Same images used for training and inference. Defaulting to image scope: inference.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [72]:
# Channel name -> S3 URI of the protobuf object uploaded for that split
data_channels = dict(train=train_data, test=test_data)

# Start the training job with log output enabled
fm.fit(logs=True, inputs=data_channels)

INFO:sagemaker:Creating training-job with name: factorization-machines-2023-12-02-10-18-06-961


2023-12-02 10:18:07 Starting - Starting the training job...
2023-12-02 10:18:23 Starting - Preparing the instances for training......
2023-12-02 10:19:29 Downloading - Downloading input data......
2023-12-02 10:20:09 Training - Downloading the training image........[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
[34m[12/02/2023 10:21:51 INFO 139702296962880] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.00001', 'bias_init_method': 'normal', 'bias_init_sigma': '0.01', 'linear_init_method': 'normal', 'linear_init_sigma': '0.01', 'factors_init_method': 'normal', 'factors_init_sigma': '0.001', 'batch_me

In [74]:
# Deploy the trained model to an endpoint with a fixed, known name
endpoint_name = 'sg-matrix-factorization'
fm_predictor = fm.deploy(
    endpoint_name=endpoint_name,
    instance_type=instance_type,
    initial_instance_count=1,
)

# Read the name back from the predictor so later cells use the name it reports
endpoint_name = fm_predictor.endpoint_name
print(f"Endpoints name: {endpoint_name}")


INFO:sagemaker:Creating model with name: factorization-machines-2023-12-02-10-24-26-357
INFO:sagemaker:Creating endpoint-config with name sg-matrix-factorization
INFO:sagemaker:Creating endpoint with name sg-matrix-factorization


-------!Endpoints name: sg-matrix-factorization


In [75]:
# Score the held-out set against the deployed endpoint.
#
# Rows are sent in fixed-size chunks rather than as one request: InvokeEndpoint enforces a
# maximum request body size, so a single call with the whole matrix stops working once the
# test set grows. X_test_OH is already float32, so no extra cast/copy is made here.
PREDICT_BATCH_ROWS = 5000

# make SageMaker runtime client ('sagemaker-runtime' is the documented service name)
client = boto3.client('sagemaker-runtime')

# predict, chunk by chunk, keeping the original row order
scored_rows = []
for start in range(0, X_test_OH.shape[0], PREDICT_BATCH_ROWS):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X_test_OH[start:start + PREDICT_BATCH_ROWS])
    buf.seek(0)

    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/x-recordio-protobuf',
        Body=buf.read()
    )
    scored_rows.extend(json.loads(response['Body'].read().decode())['predictions'])

# result: same {"predictions": [...]} structure a single response has
predictions = {'predictions': scored_rows}


In [88]:
# Turn the endpoint response into an array of predicted scores
predictions_list = [pred['score'] for pred in predictions["predictions"]]

predictions_array = np.array(predictions_list)
Y_test_array = Y_test_OH.values

# Fail loudly if the endpoint returned a different number of rows than were sent,
# instead of relying on (or being surprised by) NumPy broadcasting below.
assert len(predictions_array) == len(Y_test_array), "prediction/label count mismatch"

# Mean absolute error; the per-row absolute error is computed once and reused
diff = np.abs(predictions_array - Y_test_array)
mean_difference = diff.mean()
print(diff)
print(int(np.count_nonzero(diff < 20)))  # number of rows predicted within 20 of the label
print(len(diff))
print("평균 차이:", mean_difference)  # label reads "mean difference"

[34.94242859 55.3748703  45.21507263 ... 55.28605652 14.54455566
 43.73901367]
2507
20000
평균 차이: 64.01425802650452


In [73]:
# endpoint delete
# fm_predictor.delete_endpoint()

INFO:sagemaker:Deleting endpoint configuration with name: sg-matrix-factorization
INFO:sagemaker:Deleting endpoint with name: sg-matrix-factorization
