# Deploy Sagemaker

## Importar dados

In [16]:
from io import BytesIO
from zipfile import ZipFile
import requests
import pandas as pd

url = 'https://archive-beta.ics.uci.edu/static/public/697/predict+students+dropout+and+academic+success.zip'
dataset_file = 'data.csv'
filename = requests.get(url).content
zip_file = ZipFile( BytesIO(filename), 'r' )
df = pd.read_csv(zip_file.open(dataset_file),sep=';')
df

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


# Preparar dados

In [17]:
import sagemaker
from sagemaker.image_uris import retrieve
from sklearn.model_selection import train_test_split
import boto3
import io, os

bucket='educationai'

# df.rename(columns={"Target": "class"})
class_mapper = {b'Dropout':0,b'Graduate':1,b'Enrolled':2}
df['class']=df['Target'].replace(class_mapper)

cols = df.columns.tolist()
cols = cols[-1:] + cols[:-1]
df = df[cols]

train, test_and_validate = train_test_split(df, test_size=0.2, random_state=42, stratify=df['class'])
test, validate = train_test_split(test_and_validate, test_size=0.5, random_state=42, stratify=test_and_validate['class'])

prefix='students_prediction'

train_file='students_train.csv'
test_file='students_test.csv'
validate_file='students_validate.csv'

s3_resource = boto3.Session().resource('s3')
def upload_s3_csv(filename, folder, dataframe):
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False )
    s3_resource.Bucket(bucket).Object(os.path.join(prefix, folder, filename)).put(Body=csv_buffer.getvalue())

upload_s3_csv(train_file, 'train', train)
upload_s3_csv(test_file, 'test', test)
upload_s3_csv(validate_file, 'validate', validate)

INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole


# Criar modelo

In [18]:
container = retrieve('xgboost',boto3.Session().region_name,'1.0-1')
s3_output_location="s3://{}/{}/output/".format(bucket,prefix)

hyperparams={"num_round":"42",
             "eval_metric": "auc",
             "objective": "binary:logistic"}

xgb_model=sagemaker.estimator.Estimator(container,
                                       sagemaker.get_execution_role(),
                                       instance_count=1,
                                       instance_type='ml.m4.xlarge',
                                       output_path=s3_output_location,
                                        hyperparameters=hyperparams,
                                        sagemaker_session=sagemaker.Session())

train_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train/".format(bucket,prefix,train_file),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validate/".format(bucket,prefix,validate_file),
    content_type='text/csv')

data_channels = {'train': train_channel, 'validation': validate_channel}

xgb_model.fit(inputs=data_channels, logs=False)

print('ready for hosting!')

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2023-04-27-10-39-25-675



2023-04-27 10:39:26 Starting - Starting the training job......
2023-04-27 10:40:01 Starting - Preparing the instances for training.............
2023-04-27 10:41:14 Downloading - Downloading input data.....
2023-04-27 10:41:45 Training - Downloading the training image.......
2023-04-27 10:42:25 Training - Training image download completed. Training in progress.............
2023-04-27 10:43:29 Uploading - Uploading generated training model
2023-04-27 10:43:35 Failed - Training job failed


UnexpectedStatusException: Error for Training job sagemaker-xgboost-2023-04-27-10-39-25-675: Failed. Reason: AlgorithmError: framework error: 
Traceback (most recent call last):
  File "/miniconda3/lib/python3.7/site-packages/sagemaker_xgboost_container/algorithm_mode/train.py", line 226, in train_job
    verbose_eval=False)
  File "/miniconda3/lib/python3.7/site-packages/xgboost/training.py", line 209, in train
    xgb_model=xgb_model, callbacks=callbacks)
  File "/miniconda3/lib/python3.7/site-packages/xgboost/training.py", line 84, in _train_internal
    bst_eval_set = bst.eval_set(evals, i, feval)
  File "/miniconda3/lib/python3.7/site-packages/xgboost/core.py", line 1314, in eval_set
    ctypes.byref(msg)))
  File "/miniconda3/lib/python3.7/site-packages/xgboost/core.py", line 189, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [10:42:42] /workspace/src/metric/rank_metric.cc:212: Check failed: dat[1] > 0.0f (0 vs. 0) : AUC: the dataset only contains pos or neg samples
Stack trace:
  [bt] (0) /miniconda3/lib/python3.7/site-packages/xgboost/./lib/libxgboost.so(dml

## Deploy modelo

In [None]:
xgb_predictor = xgb_model.deploy(initial_instance_count=1,
                serializer = sagemaker.serializers.CSVSerializer(),
                instance_type='ml.m4.xlarge')

## Realizar previsões

In [None]:
test.shape

In [None]:
test.head(5)

# Remover classe

In [None]:
row = test.iloc[0:1,1:]
row.head()

### Preparar dados para previsão

In [None]:
batch_X_csv_buffer = io.StringIO()
row.to_csv(batch_X_csv_buffer, header=False, index=False)
test_row = batch_X_csv_buffer.getvalue()
print(test_row)

### Realizar previsão

In [None]:
xgb_predictor.predict(test_row)

### Analisar resultados

In [None]:
test.head(5)

## Encerrar deploy do modelo

In [None]:
xgb_predictor.delete_endpoint(delete_endpoint_config=True)

# Realizar transformação batch

### Obter dados

In [None]:
batch_X = test.iloc[:,1:];
batch_X.head()

### Preparar dados

In [None]:
batch_X_file='batch-in.csv'
upload_s3_csv(batch_X_file, 'batch-in', batch_X)

### Configurar modelo

In [None]:
batch_output = "s3://{}/{}/batch-out/".format(bucket,prefix)
batch_input = "s3://{}/{}/batch-in/{}".format(bucket,prefix,batch_X_file)

xgb_transformer = xgb_model.transformer(instance_count=1,
                                       instance_type='ml.m4.xlarge',
                                       strategy='MultiRecord',
                                       assemble_with='Line',
                                       output_path=batch_output)

xgb_transformer.transform(data=batch_input,
                         data_type='S3Prefix',
                         content_type='text/csv',
                         split_type='Line')
xgb_transformer.wait()

### Armazenar resultados


In [None]:
s3 = boto3.client('s3')
obj = s3.get_object(Bucket=bucket, Key="{}/batch-out/{}".format(prefix,'batch-in.csv.out'))
target_predicted = pd.read_csv(io.BytesIO(obj['Body'].read()),',',names=['class'])
target_predicted.head(5)

### Ajustar dados 

Converter probabilidade em *0* ou *1*.

The first table output will be the *predicted values*, and the second table output is the *original test data*.

In [None]:
def binary_convert(x):
    threshold = 0.65
    if x > threshold:
        return 1
    else:
        return 0

target_predicted['binary'] = target_predicted['class'].apply(binary_convert)

print(target_predicted.head(10))
test.head(10)

**Note:** The *threshold* in the **binary_convert** function is set to *.65*.

# Fim de notebook