# Optimization of Machine Learning Model Hyperparameters with Amazon SageMaker

### Install and Load Packages

In [2]:
!pip install -q watermark

In [3]:
# Load (or reload) the watermark IPython extension, then print the author stamp
%reload_ext watermark
%watermark -a "Zelly Irigon"

Author: Zelly Irigon



In [5]:
!pip install -q seaborn

In [6]:
!pip install -q sagemaker

In [10]:
# Imports
# Standard library
import io #data manipulation
import math 
import os #to read environment variables
import pickle #to save the ML model in the pickle format
import warnings

# Third-party
import boto3 #to interact to s3 via python code
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
import seaborn as sns
import sklearn
import xgboost as xgb #framework to build the ML model
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter #optimization of hyperparameters
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
warnings.filterwarnings('ignore')

In [11]:
# Load and Organise Data
# Name of the S3 bucket the dataset CSV files are downloaded from.
# Set the WINE_DATA_BUCKET environment variable to run the notebook against a
# different bucket/account; when it is unset the original project bucket is
# used, so existing runs behave exactly as before.
bucket_name = os.environ.get('WINE_DATA_BUCKET', 'sagemaker-studio-767397741239-wq8e2ijxnf9')

In [12]:
# Create the boto3 S3 client used below to download the dataset files
s3_client = boto3.client('s3')

In [14]:
# Object keys of the CSV datasets this project downloads from the S3 bucket
files = [
    'dataset_1.csv',
    'dataset_2.csv',
]

In [16]:
# Download every listed S3 object into the working directory.
# The local name is built from the position in `files` (dataset_1.csv,
# dataset_2.csv, ...), not from the object key itself.
for position, s3_key in enumerate(files, start=1):
    s3_client.download_file(bucket_name, s3_key, f'dataset_{position}.csv')

In [17]:
# Load the two downloaded CSV files into dataframes.
# read_csv is called with its default ',' separator; the cells that follow
# index a single column named after the whole ';'-joined header line and
# split it into fields.
white_csv_path, red_csv_path = 'dataset_1.csv', 'dataset_2.csv'
white_wine_df = pd.read_csv(white_csv_path)
red_wine_df = pd.read_csv(red_csv_path)

In [20]:
# Split the single ';'-joined column of the white-wine frame into one column
# per field. The key is the raw header line exactly as it was loaded.
white_header_line = 'fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'
white_joined_rows = white_wine_df[white_header_line]
white_wine_df = white_joined_rows.str.split(pat=';', expand=True)

In [21]:
# Split the single ';'-joined column of the red-wine frame into one column
# per field. The key is the raw header line exactly as it was loaded.
red_header_line = 'fixed acidity;"volatile acidity";"citric acid";"residual sugar";"chlorides";"free sulfur dioxide";"total sulfur dioxide";"density";"pH";"sulphates";"alcohol";"quality"'
red_joined_rows = red_wine_df[red_header_line]
red_wine_df = red_joined_rows.str.split(pat=';', expand=True)

In [22]:
# Clean column labels, in file order, applied to both wine dataframes below
column_names = [
    'fixed acidity', 'volatile acidity', 'citric acid',
    'residual sugar', 'chlorides', 'free sulfur dioxide',
    'total sulfur dioxide', 'density', 'pH',
    'sulphates', 'alcohol', 'quality',
]

In [24]:
# Give both frames the same readable column labels
for wine_frame in (white_wine_df, red_wine_df):
    wine_frame.columns = column_names

In [25]:
# Preview the first rows of the white-wine frame after the column rename
white_wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45,170,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14,132,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30,97,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47,186,0.9956,3.19,0.4,9.9,6


In [27]:
# Preview the first rows of the red-wine frame after the column rename
red_wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5


In [29]:
# Tag every row with the kind of wine it came from so the two sources remain
# distinguishable once the frames are combined
for wine_frame, type_label in ((white_wine_df, 'white'), (red_wine_df, 'red')):
    wine_frame['wine_type'] = type_label

In [30]:
# Stack the white-wine rows on top of the red-wine rows and renumber the index
wine_frames = [white_wine_df, red_wine_df]
merged_df = pd.concat(wine_frames, ignore_index=True)

In [31]:
# Row and column count of the combined frame
merged_df.shape

(6497, 13)

In [32]:
# Spot-check 5 random rows; no random_state is passed, so the rows shown
# differ from run to run
merged_df.sample(5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
5866,9.0,0.4,0.43,2.4,0.068,29,46,0.9943,3.2,0.6,12.2,6,red
5997,8.6,0.52,0.38,1.5,0.096,5,18,0.99666,3.2,0.52,9.4,5,red
1616,6.9,0.34,0.49,7.3,0.045,61,206,0.9957,3.09,0.4,9.0,6,white
291,5.9,0.15,0.31,5.8,0.041,53,155,0.9945,3.52,0.46,10.5,6,white
4285,6.0,0.2,0.26,6.8,0.049,22,93,0.9928,3.15,0.42,11.0,6,white


In [34]:
# Inspect dtypes: every column is still `object` here because the values were
# produced by splitting strings
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   fixed acidity         6497 non-null   object
 1   volatile acidity      6497 non-null   object
 2   citric acid           6497 non-null   object
 3   residual sugar        6497 non-null   object
 4   chlorides             6497 non-null   object
 5   free sulfur dioxide   6497 non-null   object
 6   total sulfur dioxide  6497 non-null   object
 7   density               6497 non-null   object
 8   pH                    6497 non-null   object
 9   sulphates             6497 non-null   object
 10  alcohol               6497 non-null   object
 11  quality               6497 non-null   object
 12  wine_type             6497 non-null   object
dtypes: object(13)
memory usage: 660.0+ KB


In [35]:
# Convert every column except 'wine_type' from string to float;
# 'wine_type' stays as text.
numeric_columns = [name for name in merged_df.columns if name != 'wine_type']
merged_df[numeric_columns] = merged_df[numeric_columns].astype(float)

In [38]:
# Spot-check 10 random rows after the float conversion (no random_state, so
# the rows shown differ from run to run)
merged_df.sample(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_type
4320,6.4,0.26,0.35,7.7,0.056,45.0,191.0,0.99527,3.16,0.5,9.5,5.0,white
4255,6.0,0.33,0.26,5.1,0.051,16.0,119.0,0.99416,3.15,0.41,9.2,5.0,white
3843,7.5,0.34,0.28,4.0,0.028,46.0,100.0,0.98958,3.2,0.5,13.2,7.0,white
633,6.0,0.29,0.2,12.6,0.045,45.0,187.0,0.9972,3.33,0.42,9.5,5.0,white
6344,6.9,0.63,0.02,1.9,0.078,18.0,30.0,0.99712,3.4,0.75,9.8,5.0,red
5123,7.7,0.43,0.25,2.6,0.073,29.0,63.0,0.99615,3.37,0.58,10.5,6.0,red
5842,8.3,0.3,0.49,3.8,0.09,11.0,24.0,0.99498,3.27,0.64,12.1,7.0,red
1478,7.9,0.22,0.24,4.6,0.044,39.0,159.0,0.9927,2.99,0.28,11.5,6.0,white
904,6.9,0.21,0.28,2.4,0.056,49.0,159.0,0.9944,3.02,0.47,8.8,8.0,white
4520,5.8,0.31,0.31,7.5,0.052,55.0,230.0,0.9949,3.19,0.46,9.8,5.0,white


In [37]:
# Confirm the conversion: all columns except 'wine_type' should now be float64
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         6497 non-null   float64
 1   volatile acidity      6497 non-null   float64
 2   citric acid           6497 non-null   float64
 3   residual sugar        6497 non-null   float64
 4   chlorides             6497 non-null   float64
 5   free sulfur dioxide   6497 non-null   float64
 6   total sulfur dioxide  6497 non-null   float64
 7   density               6497 non-null   float64
 8   pH                    6497 non-null   float64
 9   sulphates             6497 non-null   float64
 10  alcohol               6497 non-null   float64
 11  quality               6497 non-null   float64
 12  wine_type             6497 non-null   object 
dtypes: float64(12), object(1)
memory usage: 660.0+ KB
