# Grabbing Our Dataset and Splitting It Into Train and Test

Before running this notebook, install XGBoost 1.7.1 so the locally loaded model matches the version used for training: `pip install xgboost==1.7.1`

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the UCI red-wine quality table. The source file is semicolon-delimited
# and has a header row, so columns can be addressed by name.
data = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", delimiter=";")

# The CSVs below are written WITHOUT a header and are consumed by SageMaker's
# built-in XGBoost, which treats the first column of header-less CSV input as
# the label. In the raw file `quality` is the last column, so move it to the
# front; otherwise the model is trained to predict the first feature instead
# of wine quality.
target_column = "quality"
feature_columns = [column for column in data.columns if column != target_column]
data = data[[target_column] + feature_columns]
assert data.columns[0] == target_column, "label must be the first CSV column"

# Splitting the dataset into training and testing sets
# (80/20 split; the fixed random_state keeps the split reproducible).
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Header-less, index-less CSV: label first, then the feature columns.
train_data.to_csv("train.csv", header=False, index=False)
test_data.to_csv("test.csv", header=False, index=False)
data.to_csv("whole_wine.csv", header=False, index=False)

# Preview only the first rows rather than dumping the whole frame.
test_data.head()

# Setting Up SageMaker

This cell creates a SageMaker session and retrieves the notebook's IAM execution role (which gives this notebook the permissions it needs to access S3). The last three lines upload our CSV files to S3.

In [None]:
import sagemaker
from sagemaker import get_execution_role

# SageMaker session plus the IAM execution role attached to this notebook.
session = sagemaker.Session()
role = get_execution_role()

# (local file, S3 key prefix) pairs, uploaded in this order.
uploads = [
    ('train.csv', 'wine-quality/train'),
    ('test.csv', 'wine-quality/test'),
    ('whole_wine.csv', 'wine-quality/whole_data'),
]

# upload_data returns the S3 URI of each uploaded object; keep one name per file.
train_uri, test_uri, data_uri = [
    session.upload_data(path=local_file, key_prefix=s3_prefix)
    for local_file, s3_prefix in uploads
]

# Training The Model

We first retrieve the Docker image URI for XGBoost from Amazon SageMaker's pre-built algorithms.

Then an estimator is created which specifies how we want the model to be trained. Then we set our hyperparameters where we choose regression as our form of learning.

Finally we start our training where we define the source (train_uri) and properties (content_type) of our training data and feed it into the .fit() method

In [None]:
from sagemaker.inputs import TrainingInput
from sagemaker.image_uris import retrieve

# Look up the URI of SageMaker's pre-built XGBoost 1.7-1 container image for
# the region this session is bound to.
xgboost_image = retrieve("xgboost", session.boto_region_name, version="1.7-1")

# The estimator describes HOW to train: which image to run, under which IAM
# role, and on what hardware (a single ml.m4.xlarge instance).
xgboost_estimator = sagemaker.estimator.Estimator(
    image_uri=xgboost_image,
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    sagemaker_session=session,
)

# Regression with squared-error loss, 100 boosting rounds.
# "reg:squarederror" is the current name of the objective formerly called
# "reg:linear"; the old alias is deprecated and triggers a warning on
# XGBoost 1.x, while the loss being optimized is the same.
xgboost_estimator.set_hyperparameters(
    objective="reg:squarederror",
    num_round=100,
)

# Point the "train" channel at the uploaded CSV and launch the training job.
train_input = TrainingInput(train_uri, content_type="csv")
xgboost_estimator.fit({"train": train_input})

# Extracting and Testing The Model

After training, our model was stored in S3. In these last blocks of code we download the model from S3 and load it so we can check its accuracy (measured as RMSE on the test set).

In [None]:
import boto3
import tarfile
import xgboost

# Show which local xgboost version will be used to load the trained model.
print(xgboost.__version__)

# S3 URI of the model archive produced by the training job.
s3_model_artifacts = xgboost_estimator.model_data

# Strip the scheme, then split at the first "/" into bucket and object key.
bucket, _, key = s3_model_artifacts.replace("s3://", "").partition("/")

# Use boto3 to download the file
s3 = boto3.client("s3", region_name=session.boto_region_name)
s3.download_file(bucket, key, "model.tar.gz")

# Unpack the archive into the current working directory.
with tarfile.open("model.tar.gz", "r:gz") as archive:
    archive.extractall()

In [None]:
# Load the trained booster from the file unpacked out of model.tar.gz.
model_path = "xgboost-model"
model = xgboost.Booster()
model.load_model(model_path)

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Reload the held-out split from disk. The file has no header row, so pandas
# labels the columns 0..n-1.
test_data = pd.read_csv('test.csv', header=None)

# Column 0 is treated as the label; every remaining column is a feature.
label_column = 0
y_test = test_data[label_column].to_numpy()
X_test = test_data.drop(columns=label_column).to_numpy()

# Wrap the feature matrix in xgboost's DMatrix and score it with the booster.
dtest = xgboost.DMatrix(X_test)
predictions = model.predict(dtest)

# Root-mean-squared error between the labels and the predictions.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse}")