In [2]:
%%sh
wget http://files.grouplens.org/datasets/movielens/ml-100k.zip

--2021-04-07 07:56:41--  http://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4.7M) [application/zip]
Saving to: ‘ml-100k.zip.1’

     0K .......... .......... .......... .......... ..........  1% 1.32M 4s
    50K .......... .......... .......... .......... ..........  2% 2.29M 3s
   100K .......... .......... .......... .......... ..........  3% 88.4M 2s
   150K .......... .......... .......... .......... ..........  4%  164M 1s
   200K .......... .......... .......... .......... ..........  5% 2.36M 1s
   250K .......... .......... .......... .......... ..........  6% 86.9M 1s
   300K .......... .......... .......... .......... ..........  7% 2.36M 1s
   350K .......... .......... .......... .......... ..........  8% 60.7M 1s
   400K .......... .......... ....

In [3]:
from zipfile import ZipFile
zip = ZipFile('ml-100k.zip')
zip.extractall()

In [4]:
%cd ml-100k
!shuf ua.base -o ua.base.shuffled
!head -5 ua.base.shuffled

/root/ml-100k
916	1070	4	880844202
52	288	3	882922454
90	18	3	891383687
747	428	3	888640046
321	709	4	879441308


In [5]:
num_users = 943
num_movies = 1682
num_features = num_users+num_movies
num_ratings_train = 90570
num_ratings_test = 9430

In [6]:
import csv
import numpy as np
from scipy.sparse import lil_matrix
def loadDataset(filename, lines, columns):
    X = lil_matrix((lines, columns)).astype('float32')
    Y = []
    line=0
    with open(filename,'r') as f:
        samples=csv.reader(f,delimiter='\t')
        for userId,movieId,rating,timestamp in samples:
            X[line,int(userId)-1] = 1
            X[line,int(num_users)+int(movieId)-1] = 1
            Y.append(int(rating))
            line=line+1
    Y=np.array(Y).astype('float32')
    return X,Y

In [7]:
X_train, Y_train = loadDataset('ua.base.shuffled',
                               num_ratings_train,
                               num_features)
X_test, Y_test = loadDataset('ua.test',
                             num_ratings_test,
                             num_features)

In [8]:
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(90570, 2625)
(90570,)
(9430, 2625)
(9430,)


In [11]:
import io, boto3
import sagemaker.amazon.common as smac
def writeDatasetToProtobuf(X, Y, bucket, prefix, key):
    buf = io.BytesIO()
    smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
    buf.seek(0)
    obj = '{}/{}'.format(prefix, key)
    boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)
    return 's3://{}/{}'.format(bucket,obj)

In [12]:
import sagemaker

bucket = sagemaker.Session().default_bucket()

prefix = 'fm-movielens'

train_key = 'train.protobuf'

train_prefix = '{}/{}'.format(prefix, 'train')

test_key = 'test.protobuf'

test_prefix = '{}/{}'.format(prefix, 'test')

In [13]:
output_prefix = 's3://{}/{}/output'.format(bucket,prefix)
train_data = writeDatasetToProtobuf(X_train, Y_train,bucket, train_prefix, train_key)
test_data = writeDatasetToProtobuf(X_test, Y_test,bucket, test_prefix, test_key)

In [14]:
from sagemaker import image_uris
region=boto3.Session().region_name
container=image_uris.retrieve('factorization-machines',region)

fm=sagemaker.estimator.Estimator(
    container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=output_prefix)

fm.set_hyperparameters(
    feature_dim=num_features,
    predictor_type='regressor',
    num_factors=64,
    epochs=10)

In [15]:
fm.fit({'train': train_data, 'test': test_data})

2021-04-07 08:08:03 Starting - Starting the training job...
2021-04-07 08:08:26 Starting - Launching requested ML instancesProfilerReport-1617782882: InProgress
......
2021-04-07 08:09:28 Starting - Preparing the instances for training......
2021-04-07 08:10:26 Downloading - Downloading input data
2021-04-07 08:10:26 Training - Downloading the training image..
2021-04-07 08:10:56 Uploading - Uploading generated training model[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from collections import Mapping, MutableMapping, Sequence[0m
  """[0m
  """[0m
[34m[04/07/2021 08:10:47 INFO 140665436149568] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.0000

In [27]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

endpoint_name = 'fm-movielens-100k'
class FMSerializer(JSONSerializer):
    def serialize(self,data):
        js = {"instances":[]}
        for row in data:
            js["instances"].append({"features":
                                   row.tolist()})
            return json.dumps(js)

In [29]:
fm_predictor = fm.deploy(
initial_instance_count = 1,
instance_type = "ml.t2.medium",
serializer = FMSerializer(),
deserializer = JSONDeserializer()
)

-----------------!

In [30]:
result = fm_predictor.predict(X_test[:3].toarray())
print(result)

{'predictions': [{'score': 3.4194469451904297}]}


In [31]:
fm_predictor.delete_endpoint()

In [33]:
import boto3
from sagemaker import image_uris
region = boto3.Session().region_name
container = image_uris.retrieve('pca', region)

pca = sagemaker.estimator.Estimator(
    container,
    role=sagemaker.get_execution_role(),
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=output_prefix)

In [34]:
pca.set_hyperparameters(feature_dim=num_features,
                        num_components=64,
                        mini_batch_size=1024)

In [35]:
pca.fit({'train': train_data, 'test': test_data})

2021-04-07 08:56:09 Starting - Starting the training job...
2021-04-07 08:56:32 Starting - Launching requested ML instancesProfilerReport-1617785769: InProgress
......
2021-04-07 08:57:32 Starting - Preparing the instances for training...
2021-04-07 08:58:04 Downloading - Downloading input data...
2021-04-07 08:58:32 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[04/07/2021 08:58:32 INFO 139658605082432] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'algorithm_mode': 'regular', 'subtract_mean': 'true', 'extra_components': '-1', 'force_dense': 'true', 'epochs': 1, '_log_level': 'info', '_kvstore': 'dist_sync', '_num_kv_servers': 'auto', '_num_gpus': 'auto'}[0m
[34m[04/07/2021 08:58:32 INFO 139658605082432] Merging with provided configuration from /opt/ml/input/config/hyperparam

In [49]:
pca_predictor = pca.deploy(
    endpoint_name='pca-movielens-100k1',
    instance_type='ml.t2.medium',
    initial_instance_count=1,
    serializer = FMSerializer(),
    deserializer = JSONDeserializer())

---------------!

In [50]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

endpoint_name = 'fm-movielens-100k1'
class FMSerializer(JSONSerializer):
    def serialize(self,data):
        js = {"instances":[]}
        for row in data:
            js["instances"].append({"features":
                                   row.tolist()})
            return json.dumps(js)

In [51]:
result = pca_predictor.predict(X_test[0].toarray())
print(result)

{'projections': [{'projection': [-0.008711372502148151, 0.0019895541481673717, 0.002355781616643071, 0.012406938709318638, -0.0069608548656105995, -0.009556426666676998, 0.0070395139046013355, 0.0014258784940466285, -0.014954577200114727, 0.006284230388700962, 0.001228088280186057, 0.0033577263820916414, -0.005306658800691366, 0.003560103476047516, -0.005722153931856155, 0.0018947564531117678, -0.018347417935729027, 0.005859722383320332, -0.0051197693683207035, 0.005412592086941004, 0.002981008030474186, -0.0070180222392082214, -0.004825756885111332, 0.0006951577961444855, -0.002631745534017682, 0.0026822059880942106, -0.00016326206969097257, -0.002161189913749695, 0.007496879436075687, -0.010350828990340233, 0.009461312554776669, -0.007941177114844322, 0.008525246754288673, -0.005494360346347094, 0.002860172651708126, -0.00023960997350513935, 0.00014624283357989043, -0.005788157694041729, 0.010191304609179497, -0.0024550503585487604, 0.005202359054237604, -0.0032088235020637512, -0.00

In [52]:
pca_predictor.delete_endpoint()

In [54]:
ep_name= "pca-movielens-100k"
import boto3
sm = boto3.Session().client('sagemaker')
sm.delete_endpoint(EndpointName=ep_name)

{'ResponseMetadata': {'RequestId': 'a66aebd0-2afd-419e-881e-5274d38f4818',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'a66aebd0-2afd-419e-881e-5274d38f4818',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Wed, 07 Apr 2021 09:26:42 GMT'},
  'RetryAttempts': 0}}