# Deploy SageMaker

## Importar dados

In [1]:
from io import BytesIO
from zipfile import ZipFile
import requests
import pandas as pd

# UCI "Predict students' dropout and academic success" archive; the CSV inside
# the zip is semicolon-separated.
url = 'https://archive-beta.ics.uci.edu/static/public/697/predict+students+dropout+and+academic+success.zip'
dataset_file = 'data.csv'

# Fail fast on a hung connection or an HTTP error page instead of handing
# non-zip bytes to ZipFile (which would surface as an obscure BadZipFile).
response = requests.get(url, timeout=60)
response.raise_for_status()

# Context managers close the archive and the member handle once the CSV has
# been parsed into memory.
with ZipFile(BytesIO(response.content), 'r') as zip_file:
    with zip_file.open(dataset_file) as csv_member:
        df = pd.read_csv(csv_member, sep=';')
df


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,17,5,171,1,1,122.0,1,19,12,...,0,0,0,0,0.000000,0,10.8,1.4,1.74,Dropout
1,1,15,1,9254,1,1,160.0,1,1,3,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,9070,1,1,122.0,1,37,37,...,0,6,0,0,0.000000,0,10.8,1.4,1.74,Dropout
3,1,17,2,9773,1,1,122.0,1,38,37,...,0,6,10,5,12.400000,0,9.4,-0.8,-3.12,Graduate
4,2,39,1,8014,0,1,100.0,1,37,38,...,0,6,6,6,13.000000,0,13.9,-0.3,0.79,Graduate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,6,9773,1,1,125.0,1,1,1,...,0,6,8,5,12.666667,0,15.5,2.8,-4.06,Graduate
4420,1,1,2,9773,1,1,120.0,105,1,1,...,0,6,6,2,11.000000,0,11.1,0.6,2.02,Dropout
4421,1,1,1,9500,1,1,154.0,1,37,37,...,0,8,9,1,13.500000,0,13.9,-0.3,0.79,Dropout
4422,1,1,1,9147,1,1,180.0,1,37,37,...,0,5,6,5,12.000000,0,9.4,-0.8,-3.12,Graduate


## Preparar dados

In [2]:
import sagemaker
from sagemaker.image_uris import retrieve
from sklearn.model_selection import train_test_split
import boto3
import io, os
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker import get_execution_role

# S3 bucket that receives the prepared CSV splits.
bucket='educationai'

# Integer codes for the textual labels stored in the 'Target' column.
class_mapper = {'Dropout':0,'Graduate':1,'Enrolled':2}

# Encode the label and move it to the first column. The splits are uploaded
# without a header, later cells strip column 0 to obtain the features, and
# SageMaker's CSV training format expects the target in the first column.
# The guard makes the cell safe to re-run: once 'Target' has been replaced by
# 'class' there is nothing left to do, instead of failing with a KeyError.
if 'Target' in df.columns:
    labels = df['Target'].map(class_mapper)
    if labels.isna().any():
        # Surface unexpected labels here rather than as a training failure.
        raise ValueError("Unmapped values found in 'Target' column")
    df = df.drop(columns='Target')
    df.insert(0, 'class', labels)

# 80% train, then the remaining 20% halved into test and validate. Both
# splits are stratified on the label and seeded for reproducibility.
train, test_and_validate = train_test_split(df, test_size=0.2, random_state=42, stratify=df['class'])
test, validate = train_test_split(test_and_validate, test_size=0.5, random_state=42, stratify=test_and_validate['class'])

# Key prefix shared by every object this notebook writes to the bucket.
prefix='students_prediction'

train_file='students_train.csv'
test_file='students_test.csv'
validate_file='students_validate.csv'

s3_resource = boto3.Session().resource('s3')
def upload_s3_csv(filename, folder, dataframe):
    """Write ``dataframe`` as a headerless, index-free CSV object in S3.

    The object is stored in the module-level ``bucket`` under the key
    ``<prefix>/<folder>/<filename>``.

    Args:
        filename: Object name (last key component).
        folder: Sub-folder under the module-level ``prefix``.
        dataframe: pandas DataFrame to serialize.
    """
    csv_buffer = io.StringIO()
    dataframe.to_csv(csv_buffer, header=False, index=False )
    # S3 keys are always '/'-separated; os.path.join would insert the local
    # OS separator (a backslash on Windows) and produce the wrong key.
    object_key = '/'.join((prefix, folder, filename))
    s3_resource.Bucket(bucket).Object(object_key).put(Body=csv_buffer.getvalue())

# Push each split into its own folder under the shared prefix, in the order
# train -> test -> validate.
for split_file, split_folder, split_frame in (
    (train_file, 'train', train),
    (test_file, 'test', test),
    (validate_file, 'validate', validate),
):
    upload_s3_csv(split_file, split_folder, split_frame)
df

Unnamed: 0,class,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
0,0,1,17,5,171,1,1,122.0,1,19,...,0,0,0,0,0,0.000000,0,10.8,1.4,1.74
1,1,1,15,1,9254,1,1,160.0,1,1,...,0,0,6,6,6,13.666667,0,13.9,-0.3,0.79
2,0,1,1,5,9070,1,1,122.0,1,37,...,0,0,6,0,0,0.000000,0,10.8,1.4,1.74
3,1,1,17,2,9773,1,1,122.0,1,38,...,0,0,6,10,5,12.400000,0,9.4,-0.8,-3.12
4,1,2,39,1,8014,0,1,100.0,1,37,...,0,0,6,6,6,13.000000,0,13.9,-0.3,0.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,1,1,1,6,9773,1,1,125.0,1,1,...,0,0,6,8,5,12.666667,0,15.5,2.8,-4.06
4420,0,1,1,2,9773,1,1,120.0,105,1,...,0,0,6,6,2,11.000000,0,11.1,0.6,2.02
4421,0,1,1,1,9500,1,1,154.0,1,37,...,0,0,8,9,1,13.500000,0,13.9,-0.3,0.79
4422,1,1,1,1,9147,1,1,180.0,1,37,...,0,0,5,6,5,12.000000,0,9.4,-0.8,-3.12


## Criar modelo

In [3]:
# Where SageMaker writes the trained model artifact.
s3_output_location="s3://{}/{}/output/".format(bucket,prefix)

# feature_dim must match the number of feature columns: every column of the
# training split except the leading 'class' label.
hyperparams = {"feature_dim": train.shape[1] - 1, "k": 7, "sample_size": 2000, "predictor_type": "classifier"}

# image_uris.retrieve is the SDK v2 replacement for the deprecated
# get_image_uri (which only emitted a rename warning here).
model = sagemaker.estimator.Estimator(
    retrieve(framework="knn", region=boto3.Session().region_name),
    get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.2xlarge",
    output_path=s3_output_location,
    hyperparameters=hyperparams,
    sagemaker_session=sagemaker.Session(),
)

# Each channel points at the S3 folder holding the matching split, so the
# individual file names are not part of the URI.
train_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/train/".format(bucket,prefix),
    content_type='text/csv')

validate_channel = sagemaker.inputs.TrainingInput(
    "s3://{}/{}/validate/".format(bucket,prefix),
    content_type='text/csv')

# NOTE(review): SageMaker's k-NN documentation describes 'train' and 'test'
# channels; confirm that a channel named 'validation' is actually consumed.
data_channels = {'train': train_channel, 'validation': validate_channel}

# logs=False suppresses the streamed training log; only progress is shown.
model.fit(inputs=data_channels, logs=False)

print('ready for hosting!')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
INFO:sagemaker:Creating training-job with name: knn-2023-04-27-15-25-51-333



2023-04-27 15:25:51 Starting - Starting the training job..
2023-04-27 15:26:07 Starting - Preparing the instances for training.......
2023-04-27 15:26:52 Downloading - Downloading input data....
2023-04-27 15:27:12 Training - Downloading the training image...................................
2023-04-27 15:30:18 Training - Training image download completed. Training in progress........
2023-04-27 15:30:59 Uploading - Uploading generated training model.
2023-04-27 15:31:09 Completed - Training job completed
ready for hosting!


## Deploy modelo

In [4]:
# Host the trained model on a single real-time endpoint instance; request
# payloads are serialized as CSV.
csv_serializer = sagemaker.serializers.CSVSerializer()
predictor = model.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
    serializer=csv_serializer,
)

INFO:sagemaker:Creating model with name: knn-2023-04-27-15-31-09-842
INFO:sagemaker:Creating endpoint-config with name knn-2023-04-27-15-31-09-842
INFO:sagemaker:Creating endpoint with name knn-2023-04-27-15-31-09-842


------------!

## Realizar previsões

In [5]:
# (rows, columns) of the held-out test split; the column count includes the
# leading 'class' label.
test.shape

(442, 37)

In [6]:
# Preview the first five test records (head() defaults to 5 rows).
test.head()

Unnamed: 0,class,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
4204,0,1,1,1,9238,1,1,123.0,1,34,...,0,0,6,10,4,11.5,0,7.6,2.6,0.32
1279,1,1,17,1,9238,1,1,120.0,1,19,...,0,0,6,8,6,12.625,0,13.9,-0.3,0.79
2111,1,1,17,1,9003,1,1,131.0,1,19,...,0,0,6,8,6,14.333333,0,12.4,0.5,1.79
677,2,1,1,1,9853,1,1,133.0,1,37,...,0,0,6,6,4,12.0,0,15.5,2.8,-4.06
2579,1,1,1,1,9238,1,1,148.0,1,37,...,0,0,6,6,6,12.833333,0,10.8,1.4,1.74


### Remover classe

In [7]:
# Keep the first test record as a one-row frame, then slice off column 0
# (the 'class' label) so only the feature columns remain.
first_record = test.iloc[:1]
row = first_record.iloc[:, 1:]
row.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,Father's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
4204,1,1,1,9238,1,1,123.0,1,34,38,...,0,0,6,10,4,11.5,0,7.6,2.6,0.32


### Preparar dados para previsão

In [8]:
# Render the single feature row as one headerless, index-free CSV line;
# to_csv returns the text directly when no target buffer is given.
test_row = row.to_csv(header=False, index=False)
print(test_row)

1,1,1,9238,1,1,123.0,1,34,38,0,7,120.2,1,0,0,1,0,0,20,0,0,6,9,5,11.8,0,0,6,10,4,11.5,0,7.6,2.6,0.32



### Realizar previsão

In [9]:
# Send the CSV line to the deployed endpoint. No deserializer was configured
# at deploy time, so the JSON reply is displayed as raw bytes.
predictor.predict(test_row)

b'{"predictions": [{"predicted_label": 1.0}]}'

### Analisar resultados

In [10]:
# Show the labelled test records again so the true 'class' of the first row
# can be compared with the predicted label (head() defaults to 5 rows).
test.head()

Unnamed: 0,class,Marital status,Application mode,Application order,Course,Daytime/evening attendance\t,Previous qualification,Previous qualification (grade),Nacionality,Mother's qualification,...,Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP
4204,0,1,1,1,9238,1,1,123.0,1,34,...,0,0,6,10,4,11.5,0,7.6,2.6,0.32
1279,1,1,17,1,9238,1,1,120.0,1,19,...,0,0,6,8,6,12.625,0,13.9,-0.3,0.79
2111,1,1,17,1,9003,1,1,131.0,1,19,...,0,0,6,8,6,14.333333,0,12.4,0.5,1.79
677,2,1,1,1,9853,1,1,133.0,1,37,...,0,0,6,6,4,12.0,0,15.5,2.8,-4.06
2579,1,1,1,1,9238,1,1,148.0,1,37,...,0,0,6,6,6,12.833333,0,10.8,1.4,1.74


## Encerrar deploy do modelo

In [11]:
# Teardown for the endpoint created by model.deploy, including its endpoint
# configuration.
# NOTE(review): presumably left commented out on purpose so "Run All" does not
# remove a live endpoint. Confirm, then uncomment and run when finished.
# predictor.delete_endpoint(delete_endpoint_config=True)

## Fim do notebook