In [1]:
# import numpy and pandas libraries for working with data
import numpy as np
import pandas as pd# Read in csv and store in a pandas dataframe
df = pd.read_csv('./data/linear-learner-MatchData.csv', sep=',' )

To verify that the csv was read correctly you can execute df.head() to get a list of the top 5 entries in your dataframe.

In [2]:
df.head(5)

Unnamed: 0,id,game_id,player_name,team,player_id,K,H,D,CP,UP,...,G,B,GA,G%,TG%,AF,CD_RP,T50,MG,SI
0,1,1,Jack Riewoldt,RICH,577,12,6,18,5,14,...,4,2,2,57,96,100,17.7,0,322,13
1,2,1,Alex Rance,RICH,576,9,6,15,8,5,...,0,0,0,0,100,55,15.3,0,327,4
2,3,1,Corey Ellis,RICH,555,6,6,12,7,6,...,0,0,1,0,72,52,10.0,0,174,2
3,4,1,Brandon Ellis,RICH,554,11,7,18,3,14,...,0,1,0,0,79,69,2.5,0,417,6
4,5,1,Nick Vlastuin,RICH,584,10,2,12,3,7,...,0,0,0,0,80,47,5.1,0,257,4


The csv has a lot of data that we don’t need right now, we should create a dataframe with only the information we care about. Let’s create a new pandas df with only the columns we require for the excercise.

In [3]:
# keep player name for readability and manual checking
data = df.loc[:, ['player_name', 'K', 'H', 'M', 'T', 'G', 'B', 'HO', 'FF', 'FA', 'AF']]# Remove player name as it is irrelevant for calcs
playerStats = data.loc[:, ['K', 'H','M','T','G','B','HO','FF','FA']]# confirm we got the data we wanted
data.head(10)

Unnamed: 0,player_name,K,H,M,T,G,B,HO,FF,FA,AF
0,Jack Riewoldt,12,6,7,1,4,2,0,1,0,100
1,Alex Rance,9,6,3,1,0,0,0,3,0,55
2,Corey Ellis,6,6,2,4,0,0,0,0,0,52
3,Brandon Ellis,11,7,3,3,0,1,0,0,0,69
4,Nick Vlastuin,10,2,3,3,0,0,0,1,3,47
5,Jacob Townsend,6,3,5,1,4,2,0,0,1,66
6,Jayden Short,12,4,2,1,0,1,0,0,1,52
7,Toby Nankervis,7,7,2,5,0,0,33,2,5,81
8,Kamdyn McIntosh,19,0,5,1,0,1,0,0,1,74
9,Dustin Martin,20,12,5,1,1,3,0,1,1,110


We now have an array of all the relevant player stats for every game of AFL in the 2018 season so far as well as the Fantasy Points that the player scored.

Now AFL fantasy points are calculated by the following formula:

Kick (3), Handball (2), Mark (3), Tackle (4), Goal (6), Behind (1), Hit Out (1), Free Kick For (1), Free Kick Against (-3)

I’ve ordered these in the same order as our array so that we can create a weightings array in this order.

In [4]:
weightings = [3, 2, 3, 4, 6, 1, 1, 1, -3]

Before we run any ML algorithms we should verify that our data and weighting array are valid. Lets write a simple function to confirm this.

This function will take an array of player stats and a vector of weights and multiply each stat by the relevant weight and sum them together to give us calculated Fantasy Points.

In [5]:
def calculate_fantasy_points(playerStats, Weightings):
    return np.dot(playerStats, np.transpose(weightings))

Now we can calculate fantasy points based on the weightings vector we have created and verify that they are indeed the correct weights.

In [6]:
# Calculate Fantasy Points
data['calculated'] = calculate_fantasy_points(playerStats, weightings)# Get the difference between actual points and predicted
data['diff'] = data['AF'] - calculate_fantasy_points(playerStats, weightings)# Take the sum of the difference over all data points and verify that is is zero
data['diff'].sum()

0

At this stage we see that indeed, the weighting vector we created above is correct and does generate the Fantasy Points we would expect. The next step is to see if the SageMaker Linear Learner can find that weighting vector if it was unknown to us.

##### Using SageMaker Linear Learner
The first thing we need to do is to prepare the data in a format that SageMaker can use. The Linear Learner requires a numpy array of type float32.

In [7]:
# Kicks, handballs, goals etc
modelData = np.array(data.iloc[:, 1:10]).astype('float32')

# Actual Fantasy Points
target = np.array(data.iloc[:, 10]).astype('float32')

#Verify that the conversion worked
print(modelData[0])

[12.  6.  7.  1.  4.  2.  0.  1.  0.]


Next we need to import some librarys to communicate with the ML instances
!!! Don't forget to change the bucket and prefix name as you like.!!!

In [8]:
import boto3
import sagemaker
import io
import os
import sagemaker.amazon.common as smac

# Create new sagemaker session
sess = sagemaker.Session()

# S3 bucket to export results to
bucket = "gcr-sm-workshop-henanwan"
prefix = "AFLFantasy/test"

In [9]:
# Use the IO buffer as dataset is small
buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, modelData, target)
buf.seek(0)

key = 'linearlearner'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buf)

s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))

output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

uploaded training data location: s3://gcr-sm-workshop-henanwan/AFLFantasy/test/train/linearlearner
training artifacts will be uploaded to: s3://gcr-sm-workshop-henanwan/AFLFantasy/test/output


In [10]:
# Use all regions for ML model
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/linear-learner:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:latest',
              'ap-northeast-1': '351501993468.dkr.ecr.ap-northeast-1.amazonaws.com/linear-learner:latest'}

Now that we’ve done some setup and configuration, we can look at running the model.

In [11]:
from sagemaker import get_execution_role
# role = get_execution_role()
role = 'arn:aws:iam::579019700964:role/service-role/AmazonSageMaker-ExecutionRole-20190429T111678'

linear = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],
                                       role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.c4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sess)


We need to set some model parameters for this model. Specifically we need to tell the linear learner that we have 9 parameters to fit, that we want a regression model, and most importantly we do not want to normalise the data.

In [12]:
linear.set_hyperparameters(feature_dim=9,
                           predictor_type='regressor',
                           normalize_data=False)

Now we are ready to deploy our model to an instance to run the linear learner and get results. To deploy this model we simply run the following codes.
This will take a couple of minutes to provision and run and will let you know when it’s done.

In [13]:
linear.fit({'train': s3_train_data})
linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.m4.xlarge')

2019-09-03 05:52:10 Starting - Starting the training job...
2019-09-03 05:52:10 Starting - Launching requested ML instances...
2019-09-03 05:53:12 Starting - Preparing the instances for training......
2019-09-03 05:54:26 Downloading - Downloading input data...
2019-09-03 05:55:05 Training - Training image download completed. Training in progress.
2019-09-03 05:55:05 Uploading - Uploading generated training model
2019-09-03 05:55:05 Completed - Training job completed

[31mDocker entrypoint called with argument(s): train[0m
[31m[09/03/2019 05:54:55 INFO 139895032825664] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bi

Billable seconds: 40
-------------------------------------------------------------------------------------------!

##### Accessing the results
Once the model has been trained, we can send new data to the model and obtain predictions. In this case we are just going to send it the training data back and see how close it got to finding the correct weights.

In [14]:
# Set up
from sagemaker.predictor import csv_serializer, json_deserializer

linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

Pass all the data to the predictor and get all the results back

In [15]:
predictions = []
for array in modelData:
    result = linear_predictor.predict(array)
    predictions += [r['score'] for r in result['predictions']]

In [16]:
predictions = np.array(predictions)
# Push into our pandas dataframe
data['Predicted'] = predictions.astype(int)
data.head(5)

Unnamed: 0,player_name,K,H,M,T,G,B,HO,FF,FA,AF,calculated,diff,Predicted
0,Jack Riewoldt,12,6,7,1,4,2,0,1,0,100,100,0,99
1,Alex Rance,9,6,3,1,0,0,0,3,0,55,55,0,54
2,Corey Ellis,6,6,2,4,0,0,0,0,0,52,52,0,52
3,Brandon Ellis,11,7,3,3,0,1,0,0,0,69,69,0,68
4,Nick Vlastuin,10,2,3,3,0,0,0,1,3,47,47,0,46


In [17]:
# envaluation
y_predicted = data['Predicted']
y = data['calculated']

In [18]:
def evaluationModel(y,y_pre):
    accuracy_cnt = 0
    for i,j in zip(y,y_pre):
        if abs(i-j) <= 1:
            accuracy_cnt += 1
    accuracy_rate = float(accuracy_cnt) / len(y_predicted)
    print ('test samples：{0}'.format(len(y_predicted)))
    print ('accurate samples：{0}'.format(accuracy_cnt))
    print ('accuracy rate：{0:.3f}'.format((accuracy_rate)))   

In [19]:
evaluationModel(y,y_predicted)

test samples：5522
accurate samples：5486
accuracy rate：0.993


The results were very close.
And Sagemaker make it possible to deliver the ease of setting up the model, and just a little domain knowledge required to run this simple regression.

##### (Optional) Delete the Endpoint

If you're ready to be done with this notebook, please run the delete_endpoint line in the cell below. This will remove the hosted endpoint you created and avoid any charges from a stray instance being left on.


In [20]:
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)