In [2]:
import sagemaker
import boto3
import re
import pandas as pd
import numpy as np
import io
import os
import sys
import time
import json
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker import get_execution_role

In [22]:
# --- Notebook configuration -------------------------------------------------
# Where the data lives: the bucket holds the raw CSV at its root, and
# `prefix` is the key prefix used for everything this notebook writes.
bucket = "sagemaker-eu-north-1-998821356241"
prefix = "deployment"
file_name = "amazon_reviews_3.csv"

# AWS handles: the SageMaker session and the IAM role of this notebook instance.
sess = sagemaker.Session()
role = get_execution_role()

In [23]:
# Load the raw CSV from S3 into a DataFrame.
# The object key is just `file_name`, i.e. the file sits at the bucket root
# (the `prefix` is not part of this path).
data_uri = f"s3://{bucket}/{file_name}"
dataset = pd.read_csv(filepath_or_buffer=data_uri)

In [24]:
# Show the column index of the freshly loaded frame so the leftover
# 'Unnamed: *' index columns and the free-text columns are visible.
dataset.columns

Index(['Unnamed: 0', 'Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0.1.1',
       'RATING', 'VERIFIED_PURCHASE', 'REVIEW_TITLE', 'REVIEW_TEXT',
       'NUM_NOUNS', 'NUM_VERBS', 'NUM_ADJECTIVES', 'NUM_ADVERBS',
       'REVIEW_LENGTH', 'SENTIMENT_SCORE', 'TITLE_LENGTH', 'AVERAGE_RATING',
       'RATING_DEVIATION', 'NUM_REVIEWS', 'READABILITY_FRE',
       'SENTIMENT_CATEGORY_ENCODED', 'RATING_CATEGORY_ENCODED',
       'COHERENT_ENCODED', 'AVG_WORD_LENGTH', 'LABEL_ENCODED',
       'NUM_NAMED_ENTITIES', 'CAPITAL_CHAR_COUNT', 'PUNCTUATION_COUNT',
       'PREPROCESSED_REVIEW_TEXT'],
      dtype='object')

In [26]:
# Show the columns that remain once only numerical features are left.
# NOTE(review): this cell's execution count (26) is higher than that of the
# column-drop cell placed after it (25), so the recorded output reflects the
# frame AFTER that drop. Run top-to-bottom on a fresh kernel, this cell would
# list the raw columns instead — consider moving it below the drop cell.
dataset.columns

Index(['RATING', 'VERIFIED_PURCHASE', 'NUM_NOUNS', 'NUM_VERBS',
       'NUM_ADJECTIVES', 'NUM_ADVERBS', 'REVIEW_LENGTH', 'SENTIMENT_SCORE',
       'TITLE_LENGTH', 'AVERAGE_RATING', 'RATING_DEVIATION', 'NUM_REVIEWS',
       'READABILITY_FRE', 'SENTIMENT_CATEGORY_ENCODED',
       'RATING_CATEGORY_ENCODED', 'COHERENT_ENCODED', 'AVG_WORD_LENGTH',
       'LABEL_ENCODED', 'NUM_NAMED_ENTITIES', 'CAPITAL_CHAR_COUNT',
       'PUNCTUATION_COUNT'],
      dtype='object')

In [25]:
# Remove every column the model cannot use, by NAME and in a single call.
#
# Why not `dataset.drop(dataset.columns[0], axis=1)`: a positional drop is not
# safe to re-run — the second time, position 0 is a real feature (RATING), and
# it is removed before the following name-based drop fails with a KeyError.
# Dropping by name with errors="ignore" makes a re-run a harmless no-op.

# Leftover index column that read_csv surfaces as the first column.
unnamed_index_columns = ['Unnamed: 0']
# Further leftover index columns from earlier CSV round-trips.
columns_to_remove = ['Unnamed: 0.2',
                     'Unnamed: 0.1', 'Unnamed: 0.1.1']
# Free-text features, removed for now so that only numerical features remain.
text_columns = ['PREPROCESSED_REVIEW_TEXT',
                'REVIEW_TEXT', 'REVIEW_TITLE']

dataset = dataset.drop(
    columns=unnamed_index_columns + columns_to_remove + text_columns,
    errors="ignore",
)

In [28]:
# SageMaker's built-in XGBoost reads CSV input without a header row and treats
# the FIRST column as the target. As loaded, the first column is RATING, so the
# label has to be moved to the front before the splits are written.
# NOTE(review): LABEL_ENCODED is assumed to be the binary target for
# objective='binary:logistic' — confirm. A missing column raises KeyError here.
label_column = "LABEL_ENCODED"
feature_columns = [col for col in dataset.columns if col != label_column]
shuffled = dataset[[label_column] + feature_columns].sample(
    frac=1, random_state=1729
)

# 70% train / 20% validation / 10% test, cut by position on the shuffled frame.
# Plain iloc slicing yields the same rows as np.split with these cut points,
# without relying on np.split's DataFrame handling (deprecated in recent pandas).
train_end = int(0.7 * len(shuffled))
validation_end = int(0.9 * len(shuffled))
train_data = shuffled.iloc[:train_end]
validation_data = shuffled.iloc[train_end:validation_end]
test_data = shuffled.iloc[validation_end:]

# No header and no index: the training container expects bare values only.
# The test split is written locally but not uploaded; note that its first
# column is the label as well.
train_data.to_csv("train.csv", header=False, index=False)
validation_data.to_csv("validation.csv", header=False, index=False)
test_data.to_csv("test.csv", header=False, index=False)

# S3 keys always use "/" as separator, so build them explicitly instead of
# with os.path.join (which is OS-dependent). One bucket handle serves both uploads.
s3_bucket = boto3.Session().resource("s3").Bucket(bucket)
s3_bucket.Object(f"{prefix}/train/train.csv").upload_file("train.csv")
s3_bucket.Object(f"{prefix}/validation/validation.csv").upload_file("validation.csv")

In [29]:
# Resolve the ECR image URI of the built-in XGBoost algorithm for the region
# of the default boto3 session, then show it.
# NOTE(review): "latest" is a floating tag — consider pinning an explicit
# version so reruns use the same container.
aws_region = boto3.Session().region_name
container = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=aws_region,
    version="latest",
)
display(container)

INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


'669576153137.dkr.ecr.eu-north-1.amazonaws.com/xgboost:latest'

In [30]:
# S3 locations of the uploaded splits, built once and reused below.
# Both end in "/" so each is matched as a folder: s3_data is used as a key-name
# prefix, and without the trailing slash ".../train" would also match sibling
# keys such as ".../train_old/...".
print_train_data = "s3://{}/{}/train/".format(bucket, prefix)
# NOTE(review): despite its name, this is the VALIDATION location (the name is
# kept so any later cell that refers to it keeps working).
print_test_data = "s3://{}/{}/validation/".format(bucket, prefix)

# Channel definitions handed to Estimator.fit; the files are header-less CSV.
s3_input_train = TrainingInput(s3_data=print_train_data, content_type="csv")
s3_input_validation = TrainingInput(s3_data=print_test_data, content_type="csv")

In [31]:
# Session used by the estimator below.
sess = sagemaker.Session()

# Estimator for the XGBoost container: one ml.c5.2xlarge instance, model
# artifacts written under <bucket>/<prefix>/output.
xgb = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.c5.2xlarge',
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=sess,
)

# Hyperparameters for a binary classifier trained for 100 boosting rounds.
xgb_hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    num_round=100,
)
xgb.set_hyperparameters(**xgb_hyperparameters)

# Start the training job with the train and validation channels.
training_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb.fit(training_channels)



INFO:sagemaker:Creating training-job with name: xgboost-2023-06-28-07-51-47-094


2023-06-28 07:51:47 Starting - Starting the training job...
2023-06-28 07:52:02 Starting - Preparing the instances for training............
2023-06-28 07:54:00 Downloading - Downloading input data.

KeyboardInterrupt: 

In [None]:
# Deploy the estimator on a single ml.c5.2xlarge instance; requests are
# serialized as CSV.
# NOTE(review): the recorded output of the fit cell ends in KeyboardInterrupt —
# presumably deploy needs a completed training job, so re-run fit first.
csv_serializer = CSVSerializer()
xgb_predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.c5.2xlarge",
    serializer=csv_serializer,
)