# Using Sagemaker built in Algo - Factorization Machine we will build the recommender system. 

### We will use the data prepared in previous script as starting point

In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import time

import boto3
#import sagemaker
#import sagemaker.amazon.common as smac

from scipy.sparse import csr_matrix, hstack, save_npz, load_npz
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 50)
pd.set_option('max_colwidth', 50)  # default is 50

In [4]:
print("numpy version:", np.__version__)
print("pandas version:", pd.__version__)

numpy version: 1.20.3
pandas version: 1.0.1


# Read Dataset

In [5]:
# Set "usecols" to prevent the following error:
# ParserError: Error tokenizing data. C error: Expected 15 fields in line 1598, saw 22

df_rank = pd.read_csv("fm_preprocessed_filtered_with_attributes.csv")



print("Total records:", df_rank.shape[0], "\n")
#print("Sample records:\n")
df_rank.sample(5)

Total records: 3347127 



Unnamed: 0,index,customer_id,article_id,rating,product_group_name,detail_desc
2685810,2685810,734127,842062003,1,Garment Lower body,"High-waisted, calf-length skirt in an airy vis..."
833504,833504,158537,854619003,1,Swimwear,Fully lined bikini bottoms. Mid waist with wid...
2049984,2049984,483185,372860001,1,Socks & Tights,Fine-knit trainer socks in a soft cotton blend.
358900,358900,64893,832458002,1,Garment Upper body,"Fitted top in stretch, ribbed jersey with a wi..."
298175,298175,53793,839496002,1,Garment Upper body,Fitted top in jersey with a low-cut back and l...


In [6]:
df_rank["detail_desc"].fillna("", inplace=True)

In [7]:
print("Unique customers:", df_rank["customer_id"].nunique())
print("Unique products:", df_rank["article_id"].nunique())

Unique customers: 680578
Unique products: 1684


## Prepare the sparse matrix
#### For categorical columns use OneHotEncoder
#### For text columns use TfIdf for text embeddings from the corpus

In [8]:
ohe = OneHotEncoder(handle_unknown = "ignore")
ohe_cols = ["customer_id", "article_id","product_group_name"]
ohe_features = ohe.fit_transform(df_rank[ohe_cols])
ohe_features

<3347127x682272 sparse matrix of type '<class 'numpy.float64'>'
	with 10041381 stored elements in Compressed Sparse Row format>

In [9]:
df_rank.detail_desc.isna().sum()

0

In [10]:
vectorizer = TfidfVectorizer(min_df=2)  
vectorizer.fit(df_rank["detail_desc"].unique())
tfidf_features = vectorizer.transform(df_rank["detail_desc"])
tfidf_features

<3347127x482 sparse matrix of type '<class 'numpy.float64'>'
	with 65490369 stored elements in Compressed Sparse Row format>

In [11]:
X = hstack([ohe_features,tfidf_features], format="csr", dtype="float32")
X

<3347127x682754 sparse matrix of type '<class 'numpy.float32'>'
	with 75531750 stored elements in Compressed Sparse Row format>

In [12]:
y = df_rank["rating"].values.astype("float32")
y

array([1., 1., 1., ..., 5., 5., 5.], dtype=float32)

In [13]:
total = X.shape[0] * X.shape[1]
non_zero = X.nnz
sparsity = (total - non_zero) / total

print("Total elements:", total)
print("Non-zero elements:", non_zero)
print("Sparsity:", round(sparsity*100, 4), "%")


Total elements: 2285264347758
Non-zero elements: 75531750
Sparsity: 99.9967 %


### Spilt the data in training and testing set. The rating is the prediction label

In [15]:
# By default, shuffle=True.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=73)           

print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape, "\n")
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (2677701, 682754)
Shape of y_train: (2677701,) 

Shape of X_test: (669426, 682754)
Shape of y_test: (669426,)


In [16]:
feature_dim = X.shape[1]


In [17]:

import boto3
import sagemaker
import sagemaker.amazon.common as smac


### Function to create sparse RecordIO file.

In [18]:


def write_sparse_recordio_file (filename, X, y=None):
    with open(filename, 'wb') as f:
        smac.write_spmatrix_to_sparse_tensor (f, X, y)

In [19]:
# Function to upload file to S3.
# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.upload_fileobj

def upload_to_s3(filename, bucket, prefix, key):
    with open(filename,'rb') as f: # Read in binary mode
        boto3.Session().resource('s3').Bucket(bucket).Object(f"{prefix}/{key}").upload_fileobj(f)
        return f"s3://{bucket}/{prefix}/{key}"

In [20]:
write_sparse_recordio_file("fm_train_filtered_sparse.recordio", X_train, y_train)
write_sparse_recordio_file("fm_test_filtered_sparse.recordio", X_test, y_test)

In [21]:
role = sagemaker.get_execution_role()

### Uploading the train and test RecordIO files to S3.

In [22]:


sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

prefix = "filteredfm"
train_key = "fm_train_filtered_sparse.recordio"
test_key = "fm_test_filtered_sparse.recordio"
output_location = f"s3://{bucket}/{prefix}/output"

train_file_location = upload_to_s3("fm_train_filtered_sparse.recordio", bucket, prefix, train_key)
test_file_location = upload_to_s3("fm_test_filtered_sparse.recordio", bucket, prefix, test_key)

print("SageMaker version:", sagemaker.__version__)
print("Region:", region)
print("Bucket:", bucket)
print("train file location:", train_file_location)
print("test file location:", test_file_location)
print("model output location:", output_location)

SageMaker version: 2.70.0
Region: ap-south-1
Bucket: sagemaker-ap-south-1-659144925604
train file location: s3://sagemaker-ap-south-1-659144925604/filteredfm/fm_train_filtered_sparse.recordio
test file location: s3://sagemaker-ap-south-1-659144925604/filteredfm/fm_test_filtered_sparse.recordio
model output location: s3://sagemaker-ap-south-1-659144925604/filteredfm/output


In [23]:
job_name = 'fm-job-recommender-v1'
job_name

'fm-job-recommender-v1'

In [24]:
# https://github.com/aws-samples/amazon-sagemaker-managed-spot-training/blob/main/xgboost_built_in_managed_spot_training_checkpointing/xgboost_built_in_managed_spot_training_checkpointing.ipynb
    
use_spot_instances = False
max_run = 3600                                   # set to 60 mins
max_wait = 3600 if use_spot_instances else None  # set to 60 mins (must be equal or greater than max_run)
   
checkpoint_s3_uri = (f"s3://{bucket}/{prefix}/checkpoints/{job_name}" if use_spot_instances
                     else None)
    
print(f"Checkpoint uri: {checkpoint_s3_uri}")

Checkpoint uri: None


In [25]:
role = sagemaker.get_execution_role()
container = sagemaker.image_uris.retrieve("factorization-machines", region=region)
container

'991648021394.dkr.ecr.ap-south-1.amazonaws.com/factorization-machines:1'

### Define estimator as built in Factorization machine model in Sagemaker

In [26]:
estimator = sagemaker.estimator.Estimator(    
    container,
    role,
    instance_count = 1,
    instance_type = "ml.m4.xlarge",   # Or "ml.c5.xlarge",
    output_path = output_location,
    sagemaker_session = sess,
    base_job_name = job_name,
    use_spot_instances = use_spot_instances,
    max_run = max_run,
    max_wait = max_wait,
    checkpoint_s3_uri = checkpoint_s3_uri
)

In [27]:
feature_dim

682754

In [28]:
estimator.set_hyperparameters(
    feature_dim = feature_dim,
    num_factors = 64,  
    predictor_type = "regressor",
    epochs = 5,      
    mini_batch_size = 2000,  
)

estimator.hyperparameters()

{'feature_dim': 682754,
 'num_factors': 64,
 'predictor_type': 'regressor',
 'epochs': 5,
 'mini_batch_size': 2000}

### Start trainig job in Sagemaker

In [29]:
estimator.fit({'train':train_file_location, 
               'test':test_file_location})

2022-04-05 11:41:24 Starting - Starting the training job...
2022-04-05 11:41:49 Starting - Preparing the instances for trainingProfilerReport-1649158884: InProgress
.........
2022-04-05 11:43:08 Downloading - Downloading input data...
2022-04-05 11:43:48 Training - Downloading the training image...
2022-04-05 11:44:25 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from collections import Mapping, MutableMapping, Sequence[0m
  """[0m
  """[0m
[34m[04/05/2022 11:44:30 INFO 140669154469696 integration.py:636] worker started[0m
[34m[04/05/2022 11:44:30 INFO 140669154469696] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01'

In [30]:
job_name = estimator.latest_training_job.job_name

sagemaker_boto_client = boto3.Session(region_name=region).client("sagemaker")
training_job_info = sagemaker_boto_client.describe_training_job(TrainingJobName = job_name)
training_job_info

{'TrainingJobName': 'fm-job-recommender-v1-2022-04-05-11-41-24-492',
 'TrainingJobArn': 'arn:aws:sagemaker:ap-south-1:659144925604:training-job/fm-job-recommender-v1-2022-04-05-11-41-24-492',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-ap-south-1-659144925604/filteredfm/output/fm-job-recommender-v1-2022-04-05-11-41-24-492/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'epochs': '5',
  'feature_dim': '682754',
  'mini_batch_size': '2000',
  'num_factors': '64',
  'predictor_type': 'regressor'},
 'AlgorithmSpecification': {'TrainingImage': '991648021394.dkr.ecr.ap-south-1.amazonaws.com/factorization-machines:1',
  'TrainingInputMode': 'File',
  'MetricDefinitions': [{'Name': 'train:rmse:epoch',
    'Regex': '#quality_metric: host=\\S+, epoch=\\S+, train rmse <loss>=(\\S+)'},
   {'Name': 'train:progress',
    'Regex': '#progress_metric: host=\\S+, completed (\\S+) %'},
   {'Name': 'test:binary_f_beta',
    'Regex

### Define the serializer and desrializer for prediction request and parsing response. this will be used by inference endpoint for request and response

In [31]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer
import json

class fm_json_serializer(JSONSerializer):
    def serialize(self, data):
        js = {"instances": []}
        for row in data:
            js["instances"].append({"features": row.tolist()})
        return json.dumps(js)

In [32]:
predictor = estimator.deploy(initial_instance_count = 1,
                             instance_type = "ml.m5.xlarge",
                             endpoint_name = job_name,
                             serializer = fm_json_serializer(),
                             deserializer = JSONDeserializer(),
                            )

-----!

In [42]:
df_rank.groupby("customer_id").count()["article_id"].sort_values(ascending=False).head(30)

customer_id
13592     166
20817     127
59752     119
51858     109
121825    104
175000     95
436100     95
372271     94
23465      92
113247     91
117899     89
89926      88
296306     88
340784     87
3130       87
335884     86
23691      85
319709     85
17148      85
15002      84
2468       84
419726     83
9178       83
21563      82
8975       82
940        81
97838      81
22849      80
167657     80
48896      80
Name: article_id, dtype: int64

### Prepare a sample data for prediction

In [43]:
# Select one of the top customers from above.
# This customer have records for both product categories.
sample_customer = 13592 # 42799904, 50623001, 16528195, 35178127, 18167714

# The existing product ratings given by the selected customer.
df[df["customer_id"] == sample_customer]  

Unnamed: 0,customer_id,article_id,rating
77654,13592,253448003,1
77655,13592,456163064,1
77656,13592,456163069,1
77657,13592,506098007,1
77658,13592,547780001,1
...,...,...,...
2931440,13592,913030001,2
2931441,13592,915526002,2
2931442,13592,933706001,2
3338200,13592,777099001,4


In [44]:
trending = df_rank.copy()
trending = (trending.groupby(["article_id"])
            .nunique()["customer_id"]
            .sort_values(ascending=False)
            .reset_index()            
           )            
trending = trending.rename(columns={'customer_id': 'unique_customers'})
trending

Unnamed: 0,article_id,unique_customers
0,706016001,13238
1,759871002,10717
2,610776002,10699
3,720125001,10626
4,372860001,10173
...,...,...
1679,750330003,916
1680,570004009,907
1681,189634001,882
1682,717464002,853


In [45]:
df_sample=df_rank.head(1)

In [46]:
df_sample['article_id']=706016001
df_sample['customer_id']=sample_customer

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [47]:
df_sample

Unnamed: 0,index,customer_id,article_id,rating,product_group_name,detail_desc
0,0,13592,706016001,1,Garment Upper body,"Fitted, long-sleeved, polo-neck top in soft jersey."


### Prepare the sparse matrix using the One hot encoding and tfidf for the sample data

In [48]:
# ohe = OneHotEncoder(handle_unknown = "ignore")
# ohe_cols = ["customer_id", "article_id"]
# ohe.fit(df[ohe_cols])
ohe_features = ohe.transform(df_sample[ohe_cols])
ohe_features

<1x682272 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [49]:
tfidf_sample = vectorizer.transform(df_sample["detail_desc"])
tfidf_sample

<1x482 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [50]:
X_trending = hstack([ohe_features,tfidf_sample], format="csr", dtype="float32")
X_trending.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Run the prediction for the sample data

In [51]:
result = predictor.predict(X_trending.toarray())
result

{'predictions': [{'score': 1.2170072793960571}]}

### We have deployed recommendation model based on Sagemaker inbuilt Factorization machine.