In [None]:
!pip install xgboost --user

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# SageMaker session and the execution role returned for this notebook.
sess = sagemaker.Session()
role = get_execution_role()

# Storage location: the session's default bucket, under a workshop-specific key prefix.
bucket = sess.default_bucket()
prefix = 'gcr_sagemaker_workshop/classification_regression/xgboost'

# Region of the default boto3 session; used later to look up the algorithm container.
region_name = boto3.Session().region_name

In [None]:
%matplotlib inline

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing 
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets
import xgboost as xgb
from xgboost import plot_importance
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math
from io import StringIO
from sagemaker.predictor import csv_serializer
from sklearn.metrics import mean_squared_error

import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning raised after
# this cell (deprecations, data-conversion notices, ...) is hidden for the whole notebook.
warnings.filterwarnings('ignore')

In [None]:
# Load the feature table and the label table; both CSV files are indexed by their
# 'index' column.
df_features, df_labels = (
    pd.read_csv(csv_path, index_col='index')
    for csv_path in ('features.csv', 'labels.csv')
)

In [None]:
# Prepare datasets: hold out 30% of the rows for testing. The fixed random_state
# keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_features,
    df_labels,
    test_size=0.3,
    random_state=1,
)

# Model 1: XGBoost

## Use native XGBoost

In [None]:
# Wrap both splits (features + labels) in DMatrix objects for native xgboost.
data_train = xgb.DMatrix(X_train, label=y_train)
data_test = xgb.DMatrix(X_test, label=y_test)

# Number of boosting rounds and the booster parameters.
n_round = 300
param = dict(max_depth=6, eta=0.3, objective='reg:squarederror')

# Datasets passed as `evals` so they are evaluated during training.
watchlist = [(data_test, 'test'), (data_train, 'train')]

booster = xgb.train(
    param,
    data_train,
    num_boost_round=n_round,
    evals=watchlist,
)

In [None]:
# Evaluation: mean squared error of the native booster on the held-out test data.
y = data_test.get_label()
y_predicted = booster.predict(data_test)
mean_squared_error(y, y_predicted)

In [None]:
# Feature importance: create a large canvas, fetch the 'weight' scores as a dict,
# and draw the importance chart of the trained booster on that canvas.
fig, ax = plt.subplots(figsize=(30, 30))
feature_importances_dict = booster.get_score(importance_type='weight')
plot_importance(booster, ax=ax)

## Use SageMaker built-in XGBoost

In [None]:
# Data preparation for the SageMaker training job.
# The labels are the left side of the join, so their column(s) come first in the frame.
df_combined = df_labels.join(df_features)

# Seed for the random splits below, so that re-running the notebook produces the same
# train/validation/test sets (same value as the random_state of the train_test_split call).
split_seed = 1

# train: a random 70% of the rows
df_train = df_combined.sample(int(0.7 * len(df_combined)), random_state=split_seed)

# validation: a random half of the rows that are not in the training set
df_validation_test = df_combined[~df_combined.index.isin(df_train.index)]
df_validation = df_validation_test.sample(
    int(len(df_validation_test) / 2), random_state=split_seed
)

# test: the rows that are in neither the training nor the validation set
df_test = df_validation_test[~df_validation_test.index.isin(df_validation.index)]
# Keep the first column (the label) aside for scoring the predictions later.
df_test_label = df_test.iloc[:, 0]
# Non-inplace drop: df_test is a boolean-mask slice of another frame, and mutating such
# a slice in place triggers pandas' SettingWithCopyWarning.
df_test = df_test.drop(columns=['label'])

In [None]:
# Write each split to a local CSV file without a header row and without the index
# column, then report how many rows each split contains.
for frame, csv_name in (
    (df_train, 'df_train.csv'),
    (df_validation, 'df_validation.csv'),
    (df_test, 'df_test.csv'),
):
    frame.to_csv(csv_name, header=False, index=None)

print('train set length is {}'.format(len(df_train)))
print('validation set length is {}'.format(len(df_validation)))
print('test set length is {}'.format(len(df_test)))

In [None]:
# Upload the three local CSV files to the S3 bucket, each under its own sub-folder
# of the workshop prefix.
csv_buffer = StringIO()  # NOTE(review): not used by any cell visible in this notebook
s3_client = boto3.client('s3')

for local_file, key_template in (
    ('./df_train.csv', '{}/train/xgboost_train.csv'),
    ('./df_validation.csv', '{}/validation/xgboost_validation.csv'),
    ('./df_test.csv', '{}/test/xgboost_test.csv'),
):
    s3_client.upload_file(local_file, bucket, key_template.format(prefix))

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
# Resolve the image URI for the 'xgboost' algorithm in the region of this notebook's
# boto3 session (region_name).
container = get_image_uri(region_name, 'xgboost')
# Show which image was resolved.
print(container)

In [None]:
# Training: configure an estimator for the XGBoost container, set its hyperparameters,
# and fit it on the train/validation CSV files stored in S3.
output_location = 's3://{}/{}/xgboost-output'.format(bucket, prefix)
sagemaker_xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.c5.xlarge',
    output_path=output_location,
    sagemaker_session=sess,
)

hyperparameters = {
    'eta': 0.15,
    'max_depth': 5,
    # 'subsample': 0.9,  (currently disabled)
    'eval_metric': 'rmse',
    'objective': 'reg:linear',
    'num_round': 300,
}
sagemaker_xgb.set_hyperparameters(**hyperparameters)

now_time = "2019-09-03"

# Input channels: both point at CSV objects below the same bucket/prefix.
s3_input_train = sagemaker.s3_input(
    s3_data='s3://{}/{}/{}'.format(bucket, prefix, 'train/xgboost_train.csv'),
    content_type='text/csv',
)
s3_input_validation = sagemaker.s3_input(
    s3_data='s3://{}/{}/{}'.format(bucket, prefix, 'validation/xgboost_validation.csv'),
    content_type='text/csv',
)

sagemaker_xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

# Evaluate XGBoost in SageMaker

In [None]:
# Use batch transform to predict the test dataset.
# Results of the transform job are written below this S3 location ...
batch_output = 's3://{}/{}/batch-inference'.format(bucket, prefix)
# ... and the test data is read from this S3 prefix.
batch_input = 's3://{}/{}/test'.format(bucket, prefix)

# Create a transformer from the trained estimator.
transformer = sagemaker_xgb.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    output_path=batch_output,
)

# Start the transform over the CSV data, split line by line.
transformer.transform(
    data=batch_input,
    data_type='S3Prefix',
    content_type='text/csv',
    split_type='Line',
)

# Block until the transform job has finished.
transformer.wait()

In [None]:
# Load the prediction result written by the batch transform job; the file is read
# without a header row.
prediction_key = '{}/batch-inference/xgboost_test.csv.out'.format(prefix)
obj = s3_client.get_object(Bucket=bucket, Key=prediction_key)
df_pre = pd.read_csv(obj['Body'], header=None)

In [None]:
# Evaluation of the SageMaker model: flatten the batch-transform predictions and the
# labels that were set aside from the test split, then compute the mean squared error.
sm_y_pre = df_pre.values.flatten()
sm_y = df_test_label.values.flatten()
# Score the SageMaker predictions. `y` / `y_predicted` belong to the native xgboost
# booster, so using them here would only repeat that model's score.
mean_squared_error(sm_y, sm_y_pre)

# Model 2: Naive Bayes

In [None]:
# Fit a Gaussian naive Bayes model on the training split, then predict the test split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)

In [None]:
# Mean squared error of the naive Bayes predictions against `y`, the test labels that
# were read back from the xgboost test DMatrix earlier in the notebook.
mean_squared_error(y_true=y, y_pred=y_pred)