# Using Sagemaker built in Algo - Factorization Machine we will build the recommender system. 

### We will use the data prepared in previous script as starting point
### We will use Sagemaker's inbuilt algorithm, Factorization Machine to build a recommender system
### FM expects input sparse data in recordio protobuf format
### The sparse matrix will be prepared with customerid, article id and product group name as categorical features and text embeddings through the description of the product
### We will built a real time predictor for testing in Sagemaker and also create a batch transformer for processing multiple records through a batch job

In [127]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import time

import boto3
#import sagemaker
#import sagemaker.amazon.common as smac

from scipy.sparse import csr_matrix, hstack, save_npz, load_npz
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 50)
pd.set_option('max_colwidth', 50)  # default is 50

In [128]:
print("numpy version:", np.__version__)
print("pandas version:", pd.__version__)

numpy version: 1.20.3
pandas version: 1.0.1


# Read Dataset
### The data was already created in notebook#2

In [129]:
# Set "usecols" to prevent the following error:
# ParserError: Error tokenizing data. C error: Expected 15 fields in line 1598, saw 22

df_rank = pd.read_csv("fm_preprocessed_filtered_with_attributes.csv")



print("Total records:", df_rank.shape[0], "\n")
#print("Sample records:\n")
df_rank.sample(5)

Total records: 3347127 



Unnamed: 0,index,customer_id,article_id,rating,product_group_name,detail_desc
1836366,1836366,415107,568597007,1,Garment Lower body,Suit trousers in a stretch weave with a regula...
1014691,1014691,197828,832453003,1,Garment Upper body,"Fitted, off-the-shoulder top in smocked cotton..."
889016,889016,170254,821746001,1,Garment Lower body,Sports tights in fast-drying functional fabric...
2590949,2590949,686243,562245018,1,Garment Lower body,"5-pocket jeans in washed, superstretch denim w..."
1283356,1283356,260467,811925011,1,Swimwear,"Lined, non-wired bikini top with wide shoulder..."


In [130]:
df_rank["detail_desc"].fillna("", inplace=True)

In [131]:
print("Unique customers:", df_rank["customer_id"].nunique())
print("Unique products:", df_rank["article_id"].nunique())

Unique customers: 680578
Unique products: 1684


## Prepare the sparse matrix
#### For categorical columns use OneHotEncoder
#### For text columns use TfIdf for text embeddings from the corpus

In [132]:
ohe = OneHotEncoder(handle_unknown = "ignore")
ohe_cols = ["customer_id", "article_id","product_group_name"]
ohe_features = ohe.fit_transform(df_rank[ohe_cols])
ohe_features

<3347127x682272 sparse matrix of type '<class 'numpy.float64'>'
	with 10041381 stored elements in Compressed Sparse Row format>

In [133]:
df_rank.detail_desc.isna().sum()

0

In [134]:
vectorizer = TfidfVectorizer(min_df=2)  
vectorizer.fit(df_rank["detail_desc"].unique())
tfidf_features = vectorizer.transform(df_rank["detail_desc"])
tfidf_features

<3347127x482 sparse matrix of type '<class 'numpy.float64'>'
	with 65490369 stored elements in Compressed Sparse Row format>

In [135]:
X = hstack([ohe_features,tfidf_features], format="csr", dtype="float32")
X

<3347127x682754 sparse matrix of type '<class 'numpy.float32'>'
	with 75531750 stored elements in Compressed Sparse Row format>

In [136]:
y = df_rank["rating"].values.astype("float32")
y

array([1., 1., 1., ..., 5., 5., 5.], dtype=float32)

In [137]:
total = X.shape[0] * X.shape[1]
non_zero = X.nnz
sparsity = (total - non_zero) / total

print("Total elements:", total)
print("Non-zero elements:", non_zero)
print("Sparsity:", round(sparsity*100, 4), "%")


Total elements: 2285264347758
Non-zero elements: 75531750
Sparsity: 99.9967 %


### Spilt the data in training and testing set. The rating is the prediction label

In [138]:
# By default, shuffle=True.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=73)           

print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape, "\n")
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (2677701, 682754)
Shape of y_train: (2677701,) 

Shape of X_test: (669426, 682754)
Shape of y_test: (669426,)


In [139]:
feature_dim = X.shape[1]


In [140]:

import boto3
import sagemaker
import sagemaker.amazon.common as smac


### Function to create sparse RecordIO file.

In [141]:


def write_sparse_recordio_file (filename, X, y=None):
    with open(filename, 'wb') as f:
        smac.write_spmatrix_to_sparse_tensor (f, X, y)

### Function to upload file to S3

In [142]:
# Function to upload file to S3.
# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-uploading-files.html
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.upload_fileobj

def upload_to_s3(filename, bucket, prefix, key):
    with open(filename,'rb') as f: # Read in binary mode
        boto3.Session().resource('s3').Bucket(bucket).Object(f"{prefix}/{key}").upload_fileobj(f)
        return f"s3://{bucket}/{prefix}/{key}"

In [143]:
write_sparse_recordio_file("fm_train_filtered_sparse.recordio", X_train, y_train)
write_sparse_recordio_file("fm_test_filtered_sparse.recordio", X_test, y_test)

In [144]:
role = sagemaker.get_execution_role()

### Uploading the train and test RecordIO files to S3.

In [145]:


sess = sagemaker.Session()
region = sess.boto_region_name
bucket = sess.default_bucket()

prefix = "filteredfm"
train_key = "fm_train_filtered_sparse.recordio"
test_key = "fm_test_filtered_sparse.recordio"
output_location = f"s3://{bucket}/{prefix}/output"

train_file_location = upload_to_s3("fm_train_filtered_sparse.recordio", bucket, prefix, train_key)
test_file_location = upload_to_s3("fm_test_filtered_sparse.recordio", bucket, prefix, test_key)

print("SageMaker version:", sagemaker.__version__)
print("Region:", region)
print("Bucket:", bucket)
print("train file location:", train_file_location)
print("test file location:", test_file_location)
print("model output location:", output_location)

SageMaker version: 2.70.0
Region: ap-south-1
Bucket: sagemaker-ap-south-1-659144925604
train file location: s3://sagemaker-ap-south-1-659144925604/filteredfm/fm_train_filtered_sparse.recordio
test file location: s3://sagemaker-ap-south-1-659144925604/filteredfm/fm_test_filtered_sparse.recordio
model output location: s3://sagemaker-ap-south-1-659144925604/filteredfm/output


In [146]:
job_name = 'fm-job-recommender-v1'
job_name

'fm-job-recommender-v1'

In [147]:
# https://github.com/aws-samples/amazon-sagemaker-managed-spot-training/blob/main/xgboost_built_in_managed_spot_training_checkpointing/xgboost_built_in_managed_spot_training_checkpointing.ipynb
    
use_spot_instances = False
max_run = 3600                                   # set to 60 mins
max_wait = 3600 if use_spot_instances else None  # set to 60 mins (must be equal or greater than max_run)
   
checkpoint_s3_uri = (f"s3://{bucket}/{prefix}/checkpoints/{job_name}" if use_spot_instances
                     else None)
    
print(f"Checkpoint uri: {checkpoint_s3_uri}")

Checkpoint uri: None


In [148]:
role = sagemaker.get_execution_role()
container = sagemaker.image_uris.retrieve("factorization-machines", region=region)
container

'991648021394.dkr.ecr.ap-south-1.amazonaws.com/factorization-machines:1'

### Define estimator as built in Factorization machine model in Sagemaker

In [149]:
estimator = sagemaker.estimator.Estimator(    
    container,
    role,
    instance_count = 1,
    instance_type = "ml.m4.xlarge",   # Or "ml.c5.xlarge",
    output_path = output_location,
    sagemaker_session = sess,
    base_job_name = job_name,
    use_spot_instances = use_spot_instances,
    max_run = max_run,
    max_wait = max_wait,
    checkpoint_s3_uri = checkpoint_s3_uri
)

In [150]:
feature_dim

682754

### set the hyperparameters of FM

In [151]:
estimator.set_hyperparameters(
    feature_dim = feature_dim,
    num_factors = 64,  
    predictor_type = "regressor",
    epochs = 5,      
    mini_batch_size = 2000,  
)

estimator.hyperparameters()

{'feature_dim': 682754,
 'num_factors': 64,
 'predictor_type': 'regressor',
 'epochs': 5,
 'mini_batch_size': 2000}

### Start trainig job in Sagemaker

In [152]:
estimator.fit({'train':train_file_location, 
               'test':test_file_location})

2022-04-17 09:34:51 Starting - Starting the training job...
2022-04-17 09:35:16 Starting - Preparing the instances for trainingProfilerReport-1650188091: InProgress
.........
2022-04-17 09:36:36 Downloading - Downloading input data...
2022-04-17 09:37:15 Training - Downloading the training image.....[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
  from collections import Mapping, MutableMapping, Sequence[0m
  """[0m
  """[0m
[34m[04/17/2022 09:38:04 INFO 140272607053632 integration.py:636] worker started[0m
[34m[04/17/2022 09:38:04 INFO 140272607053632] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.00001', 'bias_init_method': 'normal', 'bias_in

In [153]:
job_name = estimator.latest_training_job.job_name

sagemaker_boto_client = boto3.Session(region_name=region).client("sagemaker")
training_job_info = sagemaker_boto_client.describe_training_job(TrainingJobName = job_name)
training_job_info

{'TrainingJobName': 'fm-job-recommender-v1-2022-04-17-09-34-51-355',
 'TrainingJobArn': 'arn:aws:sagemaker:ap-south-1:659144925604:training-job/fm-job-recommender-v1-2022-04-17-09-34-51-355',
 'ModelArtifacts': {'S3ModelArtifacts': 's3://sagemaker-ap-south-1-659144925604/filteredfm/output/fm-job-recommender-v1-2022-04-17-09-34-51-355/output/model.tar.gz'},
 'TrainingJobStatus': 'Completed',
 'SecondaryStatus': 'Completed',
 'HyperParameters': {'epochs': '5',
  'feature_dim': '682754',
  'mini_batch_size': '2000',
  'num_factors': '64',
  'predictor_type': 'regressor'},
 'AlgorithmSpecification': {'TrainingImage': '991648021394.dkr.ecr.ap-south-1.amazonaws.com/factorization-machines:1',
  'TrainingInputMode': 'File',
  'MetricDefinitions': [{'Name': 'train:rmse:epoch',
    'Regex': '#quality_metric: host=\\S+, epoch=\\S+, train rmse <loss>=(\\S+)'},
   {'Name': 'train:progress',
    'Regex': '#progress_metric: host=\\S+, completed (\\S+) %'},
   {'Name': 'test:binary_f_beta',
    'Regex

### Define the serializer and desrializer for prediction request and parsing response. this will be used by inference endpoint for request and response

In [154]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer
import json

class fm_json_serializer(JSONSerializer):
    def serialize(self, data):
        js = {"instances": []}
        for row in data:
            js["instances"].append({"features": row.tolist()})
        return json.dumps(js)

In [155]:
predictor = estimator.deploy(initial_instance_count = 1,
                             instance_type = "ml.m5.xlarge",
                             endpoint_name = job_name,
                             serializer = fm_json_serializer(),
                             deserializer = JSONDeserializer(),
                            )

-----!

### Prepare a sample data for prediction

In [156]:
trending = df_rank.copy()
trending = (trending.groupby(["article_id"])
            .nunique()["customer_id"]
            .sort_values(ascending=False)
            .reset_index()            
           )            
trending = trending.rename(columns={'customer_id': 'unique_customers'})
trending

Unnamed: 0,article_id,unique_customers
0,706016001,13238
1,759871002,10717
2,610776002,10699
3,720125001,10626
4,372860001,10173
...,...,...
1679,750330003,916
1680,570004009,907
1681,189634001,882
1682,717464002,853


In [157]:
df_sample=df_rank.head(1)

In [158]:
df_sample['article_id']=706016001
df_sample['customer_id']=13592

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [159]:
df_sample

Unnamed: 0,index,customer_id,article_id,rating,product_group_name,detail_desc
0,0,13592,706016001,1,Garment Upper body,"Fitted, long-sleeved, polo-neck top in soft je..."


### Prepare the sparse matrix using the One hot encoding and tfidf for the sample data

In [160]:
# ohe = OneHotEncoder(handle_unknown = "ignore")
# ohe_cols = ["customer_id", "article_id"]
# ohe.fit(df[ohe_cols])
ohe_features = ohe.transform(df_sample[ohe_cols])
ohe_features

<1x682272 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Row format>

In [161]:
tfidf_sample = vectorizer.transform(df_sample["detail_desc"])
tfidf_sample

<1x482 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [162]:
X_trending = hstack([ohe_features,tfidf_sample], format="csr", dtype="float32")
X_trending.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Run the prediction for the sample data

In [163]:
result = predictor.predict(X_trending.toarray())
result

{'predictions': [{'score': 1.1960450410842896}]}

### We have deployed recommendation model based on Sagemaker inbuilt Factorization machine.

### Now lets prepare the data for the inference job. Lets pick 15 random customer and 15 most popular items
### Then we will create input data as each customer and item combination (10*10 combinations)
### Then use this data to get the ranking for each combination
### For each customer we can arrange data for top picks

In [164]:
customers_int=pd.DataFrame(pd.read_csv("input_customers_rec.csv")["customer_id"]).head(15)

In [165]:
pop_items=pd.DataFrame(pd.read_csv("popular_items.csv").article_id).head(15)

In [166]:
print(customers_int.columns)
print(pop_items.columns)

Index(['customer_id'], dtype='object')
Index(['article_id'], dtype='object')


In [167]:
customers_int['key'] = 1
pop_items['key'] = 1
  
# to obtain the cross join we will merge 
# on the key and drop it.
batch_input = pd.merge(customers_int, pop_items, on ='key').drop("key", 1)
  
batch_input.to_csv("batch_input.csv")

In [168]:
s3_client = boto3.client('s3')
import io

### Add the items metadata for input data

In [169]:
file_key_art = 'kaggle/articles.csv'
obj_art = s3_client.get_object(Bucket=bucket, Key=file_key_art)
df_articles = pd.read_csv(io.BytesIO(obj_art['Body'].read()))
df_articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,1,Dusty Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,4,Dark,5,Black,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,3,Light,9,White,1339,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [170]:
batch_input_meta=pd.merge(batch_input,df_articles,on='article_id',how='left').reset_index()
batch_input_meta


Unnamed: 0,index,customer_id,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,0,115966,706016001,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,9,Black,4,Dark,5,Black,1747,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...
1,1,115966,610776002,610776,Tilly (1),255,T-shirt,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,T-shirt in lightweight jersey with a rounded h...
2,2,115966,751471001,751471,Pluto RW slacks (1),272,Trousers,Garment Lower body,1010016,Solid,9,Black,4,Dark,5,Black,1722,Trouser,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1009,Trousers,Ankle-length cigarette trousers in a stretch w...
3,3,115966,759871002,759871,Tilda tank,253,Vest top,Garment Upper body,1010016,Solid,9,Black,4,Dark,5,Black,3936,EQ Divided Basics,D,Divided,2,Divided,80,Divided Complements Other,1002,Jersey Basic,"Cropped, fitted top in cotton jersey with narr..."
4,4,115966,720125001,720125,SUPREME RW tights,273,Leggings/Tights,Garment Lower body,1010016,Solid,9,Black,4,Dark,5,Black,8310,Ladies Sport Bottoms,S,Sport,26,Sport,5,Ladies H&M Sport,1005,Jersey Fancy,Sports tights in fast-drying functional fabric...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
220,220,214566,706016002,706016,Jade HW Skinny Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,71,Light Blue,3,Light,2,Blue,1747,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,High-waisted jeans in washed superstretch deni...
221,221,214566,599580055,599580,Timeless Midrise Brief,59,Swimwear bottom,Swimwear,1010006,Dot,93,Dark Green,2,Medium Dusty,19,Green,4242,Swimwear,B,Lingeries/Tights,1,Ladieswear,60,"Womens Swimwear, beachwear",1018,Swimwear,Fully lined bikini bottoms with a mid waist an...
222,222,214566,448509014,448509,Perrie Slim Mom Denim TRS,272,Trousers,Garment Lower body,1010016,Solid,72,Blue,3,Light,2,Blue,1747,Trousers,D,Divided,2,Divided,53,Divided Collection,1009,Trousers,"5-pocket, ankle-length jeans in washed, sturdy..."
223,223,214566,741356002,741356,Pamela Shorts HW,274,Shorts,Garment Lower body,1010023,Denim,72,Blue,2,Medium Dusty,2,Blue,1723,Shorts,A,Ladieswear,1,Ladieswear,15,Womens Everyday Collection,1025,Shorts,"Short, 5-pocket shorts in washed denim with a ..."


### For input sparse data use same One Hot Encoder and TFIDF vectorizer from the training dataset

In [171]:
ohe_features = ohe.transform(batch_input_meta[ohe_cols])

tfidf_sample = vectorizer.transform(batch_input_meta["detail_desc"])

X_batch = hstack([ohe_features,tfidf_sample], format="csr", dtype="float32")
X_batch.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.1834833 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.17198625,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

### Save sparse data in recordio protobuf format and save in S3

In [172]:
write_sparse_recordio_file("batch_input.recordio", X_batch)

In [173]:
prefix = "batchfilter"
input_key = "batch_input.recordio"
output_location = f"s3://{bucket}/{prefix}/output"

batch_file_location = upload_to_s3("batch_input.recordio", bucket, prefix, input_key)

print("SageMaker version:", sagemaker.__version__)
print("Region:", region)
print("Bucket:", bucket)
print("train file location:", batch_file_location)

SageMaker version: 2.70.0
Region: ap-south-1
Bucket: sagemaker-ap-south-1-659144925604
train file location: s3://sagemaker-ap-south-1-659144925604/batchfilter/batch_input.recordio


### Run the batch transform job and retrieve the results

In [174]:
fm_transformer = estimator.transformer(
    instance_type='ml.c4.xlarge', 
    instance_count=1, 
    strategy="MultiRecord", 
    output_path="s3://{}/transform/".format(bucket)
)

In [175]:
fm_transformer.transform(
    data="s3://{}/batchfilter/".format(bucket), 
    data_type='S3Prefix', 
    content_type="application/x-recordio-protobuf")

............................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
  from collections import Mapping, MutableMapping, Sequence[0m
  """[0m
  """[0m
  from google.protobuf.pyext import _message[0m
[34m[04/17/2022 09:53:00 INFO 140146484217664] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[04/17/2022 09:53:00 INFO 140146484217664] loading entry points[0m
[34m[04/17/2022 09:53:00 INFO 140146484217664] loaded request iterator application/json[0m
[34m[04/17/2022 09:53:00 INFO 140146484217664] loaded request iterator application/jsonlines[0m
[34m[04/17/2022 09:53:00 INFO 140146484217664] loaded request iterator application/x-recordio-protobuf[0m
[34m[04/17/2022 09:53:00 INFO 140146484217664] loaded response encoder application/json[0m
[34m[04/17/2022 09:53:00 INFO 140146484217664] loaded response encoder application/jsonlines[0m
[34m[04/17/2022 09:53:00 INFO 14014648421

In [176]:
def download_from_s3(bucket, key):
    s3 = boto3.resource('s3')
    obj = s3.Object( bucket, key)
    content = obj.get()['Body'].read()
    return content

In [177]:
key = 'transform/batch_input.recordio.out'
response = download_from_s3(bucket, key)
result = [json.loads(row)["score"] for row in response.split(b"\n") if len(row) > 0]
    
    

In [178]:
result_batch=pd.DataFrame(result)
result_batch.shape

(225, 1)

In [179]:
final_result=batch_input.copy()

In [180]:
final_result["score"]=result_batch

In [181]:
final_result.head()

Unnamed: 0,customer_id,article_id,score
0,115966,706016001,1.429199
1,115966,610776002,1.307121
2,115966,751471001,1.334001
3,115966,759871002,1.216511
4,115966,720125001,1.192747
