In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.datasets import dump_svmlight_file


import boto3
import sagemaker.amazon.common as smac

Matplotlib is building the font cache; this may take a moment.


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


### Download Movie dataset

In [2]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

--2024-05-22 01:18:12--  http://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2024-05-22 01:18:13 (3.68 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]



In [3]:
ls

data_preprocessing.ipynb  ml-latest-small.zip  ratings.csv
links.csv                 movies.csv           tags.csv


In [4]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [5]:
df_movies = pd.read_csv(r'ml-latest-small/movies.csv')

In [6]:
df_movies.shape

(9742, 3)

In [7]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
genre_list = df_movies.genres.map(lambda value: value.split('|'))

In [10]:
genre_list[:10]

0    [Adventure, Animation, Children, Comedy, Fantasy]
1                       [Adventure, Children, Fantasy]
2                                    [Comedy, Romance]
3                             [Comedy, Drama, Romance]
4                                             [Comedy]
5                            [Action, Crime, Thriller]
6                                    [Comedy, Romance]
7                                [Adventure, Children]
8                                             [Action]
9                        [Action, Adventure, Thriller]
Name: genres, dtype: object

In [11]:
def get_unique_genres (genre_list):
    """Return every distinct genre name, sorted in ascending order.

    genre_list is an iterable whose elements are themselves iterables of
    genre names (one inner collection per movie).
    """
    distinct = {name for per_movie in genre_list for name in per_movie}
    ordered = list(distinct)
    ordered.sort()
    return ordered

In [12]:
genre = get_unique_genres(genre_list)
genre

['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western']

In [13]:
# Table of genre for each movie
# One row per movie, one column per genre. The frame is created already
# filled with integer zeros so no object-dtype NaN frame has to be
# downcast by fillna afterwards (that downcast is deprecated in pandas).
df_genre = pd.DataFrame(0, index=range(df_movies.shape[0]), columns=genre)

In [14]:
df_genre = df_genre.fillna(0)
df_genre.shape
df_genre.head()

  df_genre = df_genre.fillna(0)


Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
# Fill genre for each movie
# str.get_dummies splits the '|' separated genres string and builds the 0/1
# indicator columns in one vectorised step; reindex pins the column order to
# the `genre` list used elsewhere in the notebook.
df_genre = (
    df_movies['genres']
    .str.get_dummies(sep='|')
    .reindex(columns=genre, fill_value=0)
)

df_genre.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
# Some movies don't have genre listed
df_genre[df_genre['(no genres listed)'] > 0].head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
8517,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8684,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8687,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8782,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8836,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
# Merge with movie description
# Genre columns left over from an earlier run of this cell are dropped first;
# without that, re-running the cell fails because join refuses overlapping
# column names.
df_movies = df_movies.drop(columns=genre, errors='ignore').join(df_genre)
df_movies.head()

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
df_movies.to_csv(r'ml-latest-small/movies_genre.csv', index=False)

### Load ratings

In [19]:
df_ratings = pd.read_csv(r'ml-latest-small/ratings.csv')

In [20]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [21]:
print(df_ratings.userId.unique().shape)
print(df_ratings.movieId.unique().shape)

(610,)
(9724,)


In [22]:
# Remove the timestamp column. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to run more than once.
df_ratings = df_ratings.drop(columns=['timestamp'], errors='ignore')

In [23]:
# Merge rating and movie description
df_movie_ratings = pd.merge(df_ratings,df_movies,on='movieId')
df_movie_ratings.head(2)

Unnamed: 0,userId,movieId,rating,title,genres,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [24]:
df_movie_ratings.tail(2)

Unnamed: 0,userId,movieId,rating,title,genres,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
100834,610,168252,5.0,Logan (2017),Action|Sci-Fi,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
100835,610,170875,3.0,The Fate of the Furious (2017),Action|Crime|Drama|Thriller,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### Training and Validation Set

In [25]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset
# Seed NumPy's global generator before the shuffle below
np.random.seed(5)
# The index labels are shuffled and then used as *positions* by iloc
# NOTE(review): this assumes df_movie_ratings has the default 0..n-1 index - confirm
l = list(df_movie_ratings.index)
np.random.shuffle(l)
df = df_movie_ratings.iloc[l]

In [26]:
rows = df.shape[0]
train = int(.7 * rows)
test = rows-train

In [27]:
rows,train,test

(100836, 70585, 30251)

In [28]:
df.head(2)

Unnamed: 0,userId,movieId,rating,title,genres,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
92163,597,11,3.0,"American President, The (1995)",Comedy|Drama|Romance,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
71427,459,72998,5.0,Avatar (2009),Action|Adventure|Sci-Fi|IMAX,0,1,1,0,0,...,0,0,1,0,0,0,1,0,0,0


In [29]:
# SageMaker Factorization Machine expects all columns to be of float32
# Let's get the target variable as float32
# to_numpy() yields the 1-D ndarray directly; Series.ravel() is deprecated
# in recent pandas and emits a FutureWarning.
y = df['rating'].astype(np.float32).to_numpy()
y

  y = df['rating'].astype(np.float32).ravel()


array([3. , 5. , 1. , ..., 3. , 3.5, 3. ], dtype=float32)

In [30]:
y.dtype

dtype('float32')

In [31]:
# We will create two different training datasets.
# Training 1: rating, user id, movie id
# Training 2: rating, user id, movie id, and movie genre attributes
columns_user_movie = ['userId','movieId']
columns_all = columns_user_movie + genre

In [32]:
# Keep a CSV copy of rating, user id and movie id for both splits:
# the first `train` rows go to the train file, the remaining rows to the test file
df[['rating','userId','movieId']].iloc[:train].to_csv(r'ml-latest-small/user_movie_train.csv', index=False)
df[['rating','userId','movieId']].iloc[train:].to_csv(r'ml-latest-small/user_movie_test.csv', index=False)

In [33]:
# One Hot Encode
# Training 1: user id, movie id
# Training 2: user id, movie id, and movie genre attributes
encoder = preprocessing.OneHotEncoder(dtype=np.float32)

In [34]:
X = encoder.fit_transform(df[columns_user_movie])

In [35]:
df.userId.unique().shape, df.movieId.unique().shape

((610,), (9724,))

In [36]:
# Write Dimensions - we need it for training and prediction
# Number of unique users and movies
dim_movie = df.userId.unique().shape[0] + df.movieId.unique().shape[0]

# The value written here is later used as feature_dim and as the request
# 'shape', so it has to equal the width of the one-hot encoded matrix X.
assert dim_movie == X.shape[1], (
    f'dimension mismatch: {dim_movie} unique ids vs {X.shape[1]} encoded columns'
)

with open(r'ml-latest-small/movie_dimension.txt','w') as f:
    f.write(str(dim_movie))

In [37]:
X.shape[1]

10334

In [38]:
# Create a sparse matrix recordio file
def write_sparse_recordio_file (filename, x, y=None):
    """Write matrix x, with optional labels y, to `filename`.

    The file is opened in binary write mode and the encoding is delegated to
    smac.write_spmatrix_to_sparse_tensor.
    """
    with open(filename, 'wb') as recordio_out:
        smac.write_spmatrix_to_sparse_tensor(recordio_out, x, y)

In [39]:
# Training recordIO file
write_sparse_recordio_file(r'ml-latest-small/user_movie_train.recordio',X[:train],y[:train])

In [40]:
# Test recordIO file
write_sparse_recordio_file(r'ml-latest-small/user_movie_test.recordio',X[train:],y[train:])

### Cloud Training

In [41]:
import numpy as np
import pandas as pd

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

In [42]:
# Specify your bucket name
bucket_name = 'swati-ml-sagemaker'
training_file_key = 'movie/user_movie_train.recordio'
test_file_key = 'movie/user_movie_test.recordio'

# S3 URIs derived from the bucket name and object keys above
s3_model_output_location = f's3://{bucket_name}/movie/model'
s3_training_file_location = f's3://{bucket_name}/{training_file_key}'
s3_test_file_location = f's3://{bucket_name}/{test_file_key}'

In [43]:
# Read Dimension: Number of unique users + Number of unique movies in our dataset
dim_movie = 0

# Update movie dimension - from file used for training 
with open(r'ml-latest-small/movie_dimension.txt','r') as f:
    dim_movie = int(f.read())
    
dim_movie

10334

In [44]:
print(s3_model_output_location)
print(s3_training_file_location)
print(s3_test_file_location)

s3://swati-ml-sagemaker/movie/model
s3://swati-ml-sagemaker/movie/user_movie_train.recordio
s3://swati-ml-sagemaker/movie/user_movie_test.recordio


In [45]:
def write_to_s3(filename, bucket, key):
    """Upload the local file `filename` to object `key` in S3 bucket `bucket`.

    Returns whatever upload_fileobj returns.
    """
    # Binary mode: the file content is handed to boto3 as a file object
    with open(filename, 'rb') as payload:
        target = boto3.Session().resource('s3').Bucket(bucket).Object(key)
        return target.upload_fileobj(payload)

In [46]:
write_to_s3(r'ml-latest-small/user_movie_train.recordio',bucket_name,training_file_key)
write_to_s3(r'ml-latest-small/user_movie_test.recordio',bucket_name,test_file_key)

### Training Algorithm Docker Image

In [47]:
# We will use spot for training
use_spot_instances = True
max_run = 3600 # in seconds

# max_wait is only set when spot instances are used
if use_spot_instances:
    max_wait = 3600 # in seconds
else:
    max_wait = None

job_name = 'fm-movie-v1'

# A checkpoint location is only configured for spot training
checkpoint_s3_uri = (
    f's3://{bucket_name}/movie/checkpoints/{job_name}' if use_spot_instances else None
)

print (f'Checkpoint uri: {checkpoint_s3_uri}')

Checkpoint uri: s3://swati-ml-sagemaker/movie/checkpoints/fm-movie-v1


In [48]:
sess = sagemaker.Session()
role = get_execution_role()
print(role)

arn:aws:iam::637423580352:role/service-role/AmazonSageMaker-ExecutionRole-20240325T165146


In [49]:
# Use factorization-machines
container = sagemaker.image_uris.retrieve("factorization-machines",sess.boto_region_name)

print (f'Using FM Container {container}')

Using FM Container 382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:1


### Build Model

In [50]:
estimator = sagemaker.estimator.Estimator(container,
                                          role,                                        
                                          instance_count=1, 
                                          instance_type='ml.m5.xlarge',
                                          output_path=s3_model_output_location,
                                          sagemaker_session=sess,
                                          base_job_name = job_name,
                                          use_spot_instances=use_spot_instances,
                                          max_run=max_run,
                                          max_wait=max_wait,
                                          checkpoint_s3_uri=checkpoint_s3_uri)

### New Configuration After Tuning

In [51]:
estimator.set_hyperparameters(feature_dim=dim_movie,
                              num_factors=8,
                              predictor_type='regressor', 
                              mini_batch_size=1000,
                              epochs=500,
                              bias_init_method='normal',
                              bias_lr=0.010000000000000004,
                              factors_init_method='normal',
                              factors_lr=0.00012163193136767434,
                              linear_init_method='normal',
                              linear_lr=0.00010000000000000009)

In [52]:
estimator.hyperparameters()

{'feature_dim': 10334,
 'num_factors': 8,
 'predictor_type': 'regressor',
 'mini_batch_size': 1000,
 'epochs': 500,
 'bias_init_method': 'normal',
 'bias_lr': 0.010000000000000004,
 'factors_init_method': 'normal',
 'factors_lr': 0.00012163193136767434,
 'linear_init_method': 'normal',
 'linear_lr': 0.00010000000000000009}

### Train the model

In [54]:
estimator.fit({'train':s3_training_file_location, 'test': s3_test_file_location})

INFO:sagemaker:Creating training-job with name: fm-movie-v1-2024-05-22-04-39-37-105


2024-05-22 04:39:37 Starting - Starting the training job...
2024-05-22 04:39:52 Starting - Preparing the instances for training...
2024-05-22 04:40:34 Downloading - Downloading the training image.....................
2024-05-22 04:43:55 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  if num_device is 1 and 'dist' not in kvstore:[0m
[34m[05/22/2024 04:44:04 INFO 140374101440320] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.00001', 'bias_init_method': 'normal', 'bias_init_sigma': '0.01', 'linear_init_method': 'normal', 'linear_init_sigma': '0.01', 'factors_init_method': 'normal', '

[34m[2024-05-22 04:44:13.904] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 62, "duration": 259, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:44:13 INFO 140374101440320] #quality_metric: host=algo-1, epoch=30, train rmse <loss>=0.993936349476479[0m
[34m[05/22/2024 04:44:13 INFO 140374101440320] #quality_metric: host=algo-1, epoch=30, train mse <loss>=0.9879094668106294[0m
[34m[05/22/2024 04:44:13 INFO 140374101440320] #quality_metric: host=algo-1, epoch=30, train absolute_loss <loss>=0.782980528065856[0m
[34m#metrics {"StartTime": 1716353053.6429179, "EndTime": 1716353053.905269, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 261.7621421813965, "count": 1, "min": 261.7621421813965, "max": 261.7621421813965}}}[0m
[34m[05/22/2024 04:44:13 INFO 140374101440320] #progress_metric: host=algo-1, completed 6.2 % of epochs[0m
[34m#metrics {

[34m[2024-05-22 04:44:23.845] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 134, "duration": 269, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:44:23 INFO 140374101440320] #quality_metric: host=algo-1, epoch=66, train rmse <loss>=0.904985720861917[0m
[34m[05/22/2024 04:44:23 INFO 140374101440320] #quality_metric: host=algo-1, epoch=66, train mse <loss>=0.8189991549639635[0m
[34m[05/22/2024 04:44:23 INFO 140374101440320] #quality_metric: host=algo-1, epoch=66, train absolute_loss <loss>=0.6946435426523988[0m
[34m#metrics {"StartTime": 1716353063.5742202, "EndTime": 1716353063.8464816, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 271.8493938446045, "count": 1, "min": 271.8493938446045, "max": 271.8493938446045}}}[0m
[34m[05/22/2024 04:44:23 INFO 140374101440320] #progress_metric: host=algo-1, completed 13.4 % of epochs[0m
[34m#metri

[34m[2024-05-22 04:44:33.985] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 208, "duration": 268, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:44:33 INFO 140374101440320] #quality_metric: host=algo-1, epoch=103, train rmse <loss>=0.8691740916270012[0m
[34m[05/22/2024 04:44:33 INFO 140374101440320] #quality_metric: host=algo-1, epoch=103, train mse <loss>=0.7554636015556228[0m
[34m[05/22/2024 04:44:33 INFO 140374101440320] #quality_metric: host=algo-1, epoch=103, train absolute_loss <loss>=0.6614377647722272[0m
[34m#metrics {"StartTime": 1716353073.7153227, "EndTime": 1716353073.9859629, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 270.0676918029785, "count": 1, "min": 270.0676918029785, "max": 270.0676918029785}}}[0m
[34m[05/22/2024 04:44:33 INFO 140374101440320] #progress_metric: host=algo-1, completed 20.8 % of epochs[0m
[34m#m

[34m[2024-05-22 04:44:44.111] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 282, "duration": 278, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:44:44 INFO 140374101440320] #quality_metric: host=algo-1, epoch=140, train rmse <loss>=0.8499261316092955[0m
[34m[05/22/2024 04:44:44 INFO 140374101440320] #quality_metric: host=algo-1, epoch=140, train mse <loss>=0.7223744291923415[0m
[34m[05/22/2024 04:44:44 INFO 140374101440320] #quality_metric: host=algo-1, epoch=140, train absolute_loss <loss>=0.6438357165699273[0m
[34m#metrics {"StartTime": 1716353083.831161, "EndTime": 1716353084.112464, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 280.6134223937988, "count": 1, "min": 280.6134223937988, "max": 280.6134223937988}}}[0m
[34m[05/22/2024 04:44:44 INFO 140374101440320] #progress_metric: host=algo-1, completed 28.2 % of epochs[0m
[34m#met

[34m[2024-05-22 04:44:54.063] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 354, "duration": 289, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:44:54 INFO 140374101440320] #quality_metric: host=algo-1, epoch=176, train rmse <loss>=0.8376087117362124[0m
[34m[05/22/2024 04:44:54 INFO 140374101440320] #quality_metric: host=algo-1, epoch=176, train mse <loss>=0.7015883539763974[0m
[34m[05/22/2024 04:44:54 INFO 140374101440320] #quality_metric: host=algo-1, epoch=176, train absolute_loss <loss>=0.632697090256382[0m
[34m#metrics {"StartTime": 1716353093.771583, "EndTime": 1716353094.063723, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 291.5372848510742, "count": 1, "min": 291.5372848510742, "max": 291.5372848510742}}}[0m
[34m[05/22/2024 04:44:54 INFO 140374101440320] #progress_metric: host=algo-1, completed 35.4 % of epochs[0m
[34m#metr

[34m[2024-05-22 04:45:04.136] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 422, "duration": 333, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:45:04 INFO 140374101440320] #quality_metric: host=algo-1, epoch=210, train rmse <loss>=0.8288078160514374[0m
[34m[05/22/2024 04:45:04 INFO 140374101440320] #quality_metric: host=algo-1, epoch=210, train mse <loss>=0.6869223959479533[0m
[34m[05/22/2024 04:45:04 INFO 140374101440320] #quality_metric: host=algo-1, epoch=210, train absolute_loss <loss>=0.6248632735131492[0m
[34m#metrics {"StartTime": 1716353103.7996507, "EndTime": 1716353104.1369464, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 336.58361434936523, "count": 1, "min": 336.58361434936523, "max": 336.58361434936523}}}[0m
[34m[05/22/2024 04:45:04 INFO 140374101440320] #progress_metric: host=algo-1, completed 42.2 % of epochs[0m
[34

[34m[2024-05-22 04:45:14.013] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 494, "duration": 267, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:45:14 INFO 140374101440320] #quality_metric: host=algo-1, epoch=246, train rmse <loss>=0.8210646540178914[0m
[34m[05/22/2024 04:45:14 INFO 140374101440320] #quality_metric: host=algo-1, epoch=246, train mse <loss>=0.6741471660775198[0m
[34m[05/22/2024 04:45:14 INFO 140374101440320] #quality_metric: host=algo-1, epoch=246, train absolute_loss <loss>=0.6180756792954996[0m
[34m#metrics {"StartTime": 1716353113.7435539, "EndTime": 1716353114.0139146, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 270.078182220459, "count": 1, "min": 270.078182220459, "max": 270.078182220459}}}[0m
[34m[05/22/2024 04:45:14 INFO 140374101440320] #progress_metric: host=algo-1, completed 49.4 % of epochs[0m
[34m#metr

[34m[2024-05-22 04:45:24.093] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 568, "duration": 267, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:45:24 INFO 140374101440320] #quality_metric: host=algo-1, epoch=283, train rmse <loss>=0.813988149445067[0m
[34m[05/22/2024 04:45:24 INFO 140374101440320] #quality_metric: host=algo-1, epoch=283, train mse <loss>=0.6625767074370048[0m
[34m[05/22/2024 04:45:24 INFO 140374101440320] #quality_metric: host=algo-1, epoch=283, train absolute_loss <loss>=0.6119228825099031[0m
[34m#metrics {"StartTime": 1716353123.823544, "EndTime": 1716353124.0944753, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 270.3213691711426, "count": 1, "min": 270.3213691711426, "max": 270.3213691711426}}}[0m
[34m[05/22/2024 04:45:24 INFO 140374101440320] #progress_metric: host=algo-1, completed 56.8 % of epochs[0m
[34m#met

[34m[2024-05-22 04:45:39.124] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 678, "duration": 274, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:45:39 INFO 140374101440320] #quality_metric: host=algo-1, epoch=338, train rmse <loss>=0.8042262668641584[0m
[34m[05/22/2024 04:45:39 INFO 140374101440320] #quality_metric: host=algo-1, epoch=338, train mse <loss>=0.6467798883142606[0m
[34m[05/22/2024 04:45:39 INFO 140374101440320] #quality_metric: host=algo-1, epoch=338, train absolute_loss <loss>=0.6035726198008363[0m
[34m#metrics {"StartTime": 1716353138.848219, "EndTime": 1716353139.1253345, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 276.49855613708496, "count": 1, "min": 276.49855613708496, "max": 276.49855613708496}}}[0m
[34m[05/22/2024 04:45:39 INFO 140374101440320] #progress_metric: host=algo-1, completed 67.8 % of epochs[0m
[34m

[34m[2024-05-22 04:45:49.223] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 752, "duration": 260, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:45:49 INFO 140374101440320] #quality_metric: host=algo-1, epoch=375, train rmse <loss>=0.7979339828990496[0m
[34m[05/22/2024 04:45:49 INFO 140374101440320] #quality_metric: host=algo-1, epoch=375, train mse <loss>=0.6366986410651408[0m
[34m[05/22/2024 04:45:49 INFO 140374101440320] #quality_metric: host=algo-1, epoch=375, train absolute_loss <loss>=0.5982720457265075[0m
[34m#metrics {"StartTime": 1716353148.9602566, "EndTime": 1716353149.2235236, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 262.82644271850586, "count": 1, "min": 262.82644271850586, "max": 262.82644271850586}}}[0m
[34m[05/22/2024 04:45:49 INFO 140374101440320] #progress_metric: host=algo-1, completed 75.2 % of epochs[0m
[34

[34m[2024-05-22 04:45:59.101] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 824, "duration": 288, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:45:59 INFO 140374101440320] #quality_metric: host=algo-1, epoch=411, train rmse <loss>=0.7919923821306919[0m
[34m[05/22/2024 04:45:59 INFO 140374101440320] #quality_metric: host=algo-1, epoch=411, train mse <loss>=0.627251933353048[0m
[34m[05/22/2024 04:45:59 INFO 140374101440320] #quality_metric: host=algo-1, epoch=411, train absolute_loss <loss>=0.5933122507014744[0m
[34m#metrics {"StartTime": 1716353158.8103604, "EndTime": 1716353159.1015286, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 290.4942035675049, "count": 1, "min": 290.4942035675049, "max": 290.4942035675049}}}[0m
[34m[05/22/2024 04:45:59 INFO 140374101440320] #progress_metric: host=algo-1, completed 82.4 % of epochs[0m
[34m#me

[34m[2024-05-22 04:46:09.164] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 892, "duration": 274, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:46:09 INFO 140374101440320] #quality_metric: host=algo-1, epoch=445, train rmse <loss>=0.7865371996367619[0m
[34m[05/22/2024 04:46:09 INFO 140374101440320] #quality_metric: host=algo-1, epoch=445, train mse <loss>=0.6186407664124395[0m
[34m[05/22/2024 04:46:09 INFO 140374101440320] #quality_metric: host=algo-1, epoch=445, train absolute_loss <loss>=0.5887444690113336[0m
[34m#metrics {"StartTime": 1716353168.8872945, "EndTime": 1716353169.1648202, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 276.9632339477539, "count": 1, "min": 276.9632339477539, "max": 276.9632339477539}}}[0m
[34m[05/22/2024 04:46:09 INFO 140374101440320] #progress_metric: host=algo-1, completed 89.2 % of epochs[0m
[34m#m

[34m[2024-05-22 04:46:19.327] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 966, "duration": 281, "num_examples": 71, "num_bytes": 4517440}[0m
[34m[05/22/2024 04:46:19 INFO 140374101440320] #quality_metric: host=algo-1, epoch=482, train rmse <loss>=0.7807170516579821[0m
[34m[05/22/2024 04:46:19 INFO 140374101440320] #quality_metric: host=algo-1, epoch=482, train mse <loss>=0.6095191147495324[0m
[34m[05/22/2024 04:46:19 INFO 140374101440320] #quality_metric: host=algo-1, epoch=482, train absolute_loss <loss>=0.5838550354863556[0m
[34m#metrics {"StartTime": 1716353179.0444057, "EndTime": 1716353179.328327, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 283.56361389160156, "count": 1, "min": 283.56361389160156, "max": 283.56361389160156}}}[0m
[34m[05/22/2024 04:46:19 INFO 140374101440320] #progress_metric: host=algo-1, completed 96.6 % of epochs[0m
[34m


2024-05-22 04:46:39 Uploading - Uploading generated training model
2024-05-22 04:46:39 Completed - Training job completed
Training seconds: 385
Billable seconds: 172
Managed Spot Training savings: 55.3%


### Deploy the Model

In [55]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m5.xlarge',
                             endpoint_name = job_name)

INFO:sagemaker:Creating model with name: fm-movie-v1-2024-05-22-04-47-56-577
INFO:sagemaker:Creating endpoint-config with name fm-movie-v1
INFO:sagemaker:Creating endpoint with name fm-movie-v1


----------!

### Run Predictions

In [56]:
# Create libSVM formatted files (a convenient text format).
# Each line is stored as: rating user_index:value movie_index:value
#  For example: 5.0 314:1 215:1  (the user at index 314 and the movie at index 215 of the one hot encoded table have a rating of 5)

# These files can be used for two purposes:
#   1. training directly with the libFM binary in local mode
#   2. running inference against the SageMaker endpoint, since only the sparse
#      input (the active column indices) has to be sent to the prediction service

# Same train/test split as the recordio files: the first `train` rows are the
# training set, the remaining rows are the test set.
dump_svmlight_file(X[:train],y[:train],r'ml-latest-small/user_movie_train.svm')
dump_svmlight_file(X[train:],y[train:],r'ml-latest-small/user_movie_test.svm')

In [57]:
import json

def fm_sparse_serializer(data, feature_dim=None):
    """Serialize rows of active column indices into a sparse JSON request body.

    Args:
        data: iterable of rows. Each row lists the one-hot column indices that
            are set for one instance; a row may be a NumPy array, list or tuple.
        feature_dim: total number of columns, reported as 'shape' for every
            instance. When omitted, the notebook-level `dim_movie` is used
            (looked up at call time).

    Returns:
        A JSON string of the form
        {"instances": [{"data": {"features": {"keys": [...], "shape": [n], "values": [...]}}}, ...]}
        where every listed key gets the value 1.
    """
    if feature_dim is None:
        feature_dim = dim_movie

    instances = []
    for row in data:
        # asarray lets plain lists/tuples through; tolist() yields native ints
        keys = np.asarray(row).tolist()
        instances.append({'data': {'features': {'keys': keys,
                                                'shape': [int(feature_dim)],
                                                'values': [1] * len(keys)}}})
    return json.dumps({'instances': instances})

In [58]:
# SDK 2
from sagemaker.deserializers import JSONDeserializer

In [59]:
# Specify custom serializer
predictor.serializer.serialize = fm_sparse_serializer
predictor.serializer.content_type = 'application/json'

predictor.deserializer = JSONDeserializer()

In [60]:
import numpy as np
fm_sparse_serializer([np.array([341,1416])])

'{"instances": [{"data": {"features": {"keys": [341, 1416], "shape": [10334], "values": [1, 1]}}}]}'

In [61]:
# Let's test with a few entries from the test file
# The movie dataset is updated regularly, so instead of hard coding indices
# we read actual values from the file written above

# After split() each line looks like: ['2.5', '426:1', '943:1']
# i.e. actual rating, user column index, movie column index
# (the numbers before ':' are column indices of the one hot encoded matrix,
#  not the raw userId / movieId values)

with open(r'ml-latest-small/user_movie_test.svm','r') as f:
    for i in range(3):
        rating = f.readline().split()
        print(f"Movie {rating}")
        # Keep only the column index in front of ':' for each feature
        userID = rating[1].split(':')[0]
        movieID = rating[2].split(':')[0]
        predicted_rating = predictor.predict([np.array([int(userID),int(movieID)])])
        print(f'  Actual Rating:\t{rating[0]}')
        print(f"  Predicted Rating:\t{predicted_rating['predictions'][0]['score']}")
        print()

Movie ['4', '291:1', '1028:1']
  Actual Rating:	4
  Predicted Rating:	3.9805212020874023

Movie ['4', '579:1', '2570:1']
  Actual Rating:	4
  Predicted Rating:	3.3526663780212402

Movie ['4', '231:1', '3822:1']
  Actual Rating:	4
  Predicted Rating:	3.0857391357421875



In [73]:
# Create a predictor and point to an existing endpoint

endpoint_name = 'fm-movie-v1'
predictor = sagemaker.predictor.Predictor(endpoint_name=endpoint_name)

In [74]:
# Custom serializer
def fm_sparse_serializer(data, feature_dim=None):
    """Serialize rows of active column indices into a sparse JSON request body.

    Args:
        data: iterable of rows. Each row lists the one-hot column indices that
            are set for one instance; a row may be a NumPy array, list or tuple.
        feature_dim: total number of columns, reported as 'shape' for every
            instance. When omitted, the notebook-level `dim_movie` is used
            (looked up at call time).

    Returns:
        A JSON string of the form
        {"instances": [{"data": {"features": {"keys": [...], "shape": [n], "values": [...]}}}, ...]}
        where every listed key gets the value 1.
    """
    if feature_dim is None:
        feature_dim = dim_movie

    instances = []
    for row in data:
        # asarray lets plain lists/tuples through; tolist() yields native ints
        keys = np.asarray(row).tolist()
        instances.append({'data': {'features': {'keys': keys,
                                                'shape': [int(feature_dim)],
                                                'values': [1] * len(keys)}}})
    return json.dumps({'instances': instances})

In [75]:
# Specify custom serializer
predictor.serializer.serialize = fm_sparse_serializer
predictor.serializer.content_type = 'application/json'

predictor.deserializer = JSONDeserializer()

In [76]:
# Read Dimension: Number of unique users + Number of unique movies in our dataset
dim_movie = 0

# Update movie dimension - from file used for training 
with open(r'ml-latest-small/movie_dimension.txt','r') as f:
    dim_movie = int(f.read())

In [77]:
print(fm_sparse_serializer([np.array([341,1416]),np.array([209,2640]),np.array([164,1346])]))

{"instances": [{"data": {"features": {"keys": [341, 1416], "shape": [10334], "values": [1, 1]}}}, {"data": {"features": {"keys": [209, 2640], "shape": [10334], "values": [1, 1]}}}, {"data": {"features": {"keys": [164, 1346], "shape": [10334], "values": [1, 1]}}}]}


In [78]:
# Test libSVM
# Load the test file in svm format. '5 341:1 1416:1'
test_file = r'ml-latest-small/user_movie_test.svm'

In [79]:
df_test = pd.read_csv(test_file, sep=' ', names=['rating','user_index','movie_index'])
df_test

Unnamed: 0,rating,user_index,movie_index
0,4.0,291:1,1028:1
1,4.0,579:1,2570:1
2,4.0,231:1,3822:1
3,4.0,17:1,3598:1
4,4.0,413:1,4894:1
...,...,...,...
30246,2.0,386:1,1609:1
30247,3.0,38:1,2152:1
30248,3.0,134:1,1884:1
30249,3.5,118:1,7429:1


In [80]:
# update column to contain only the one hot encoded index
# Values look like '291:1'; keep the part before ':' as an int.
# Converting through str first keeps the cell safe to re-run: once a column
# already holds ints, calling .split on them directly would raise.
df_test['user_index'] = df_test['user_index'].astype(str).str.split(':').str[0].astype(int)
df_test['movie_index'] = df_test['movie_index'].astype(str).str.split(':').str[0].astype(int)

In [81]:
df_test.shape

(30251, 3)

In [82]:
# For a large number of predictions, split the input into chunks and
# query the prediction service once per chunk.
# array_split is convenient to specify how many splits are needed
def get_predictions(predictor, arr_features, num_splits=100):
    """Score every row of arr_features through predictor, chunk by chunk.

    Args:
        predictor: object whose predict(chunk) returns a mapping with a
            'predictions' list of {'score': ...} entries, one per input row.
        arr_features: array of feature rows; it is split along the first axis.
        num_splits: number of chunks requested from np.array_split
            (default 100, the value previously hard coded).

    Returns:
        List of scores in the same order as the rows of arr_features.
    """
    predictions = []
    for chunk in np.array_split(arr_features, num_splits):
        # array_split yields empty chunks when there are fewer rows than splits
        if chunk.shape[0] == 0:
            continue
        print (chunk.shape, end=' ')
        result = predictor.predict(chunk)
        predictions.extend(entry['score'] for entry in result['predictions'])
    return predictions

In [83]:
# Time the chunked prediction over every (user_index, movie_index) row of df_test
%time predictions = get_predictions(predictor, df_test[['user_index','movie_index']].values)

(303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (303, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) (302, 2) CPU times: user 502 ms, sys: 13 ms, total: 515 ms
Wall time: 5.8 s
