In [1]:
#import libraries

import pandas as pd
import numpy as np
import boto3

In [2]:
#import data

s3 = boto3.client('s3')

bucket = 'flood-prediction-master-dataset'
key = 'final-dataset-15-min/final_data_15_min.csv'

obj = s3.get_object(Bucket= bucket,Key= key)

dataset_15min = pd.read_csv(obj['Body'])

In [3]:
#drop existing index column

dataset_15min.drop(['Unnamed: 0'],axis=1,inplace=True)

#setting datetime datatype from string. Default is string when reading from csv

dataset_15min['timerecorded'] = pd.to_datetime(dataset_15min['timerecorded'])

#rearranging columns

dataset_15min = dataset_15min[['timerecorded','river','rain','temperature','wind_direction','wind_speed','source']]  

#splitting GAN and sensor data

gan_file = dataset_15min.loc[dataset_15min['source']=='GAN']
sensor_file = dataset_15min.loc[dataset_15min['source']=='SENSOR']

In [4]:
#makes the GAN datetime go ahead by 1 month. June - July sensor data. June to august is summer. Hence GAN 1 month ahead.

gan_file['timerecorded']  = gan_file['timerecorded'] + pd.DateOffset(months=1) 

#merging both files and resetting index

dataset_15min = sensor_file.append(gan_file)
dataset_15min.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [5]:
dataset_15min

Unnamed: 0,timerecorded,river,rain,temperature,wind_direction,wind_speed,source
0,2020-06-12 00:00:00,4.540,0.0,11.53,7.0,5.272,SENSOR
1,2020-06-12 00:15:00,4.932,0.0,11.61,4.0,5.554,SENSOR
2,2020-06-12 00:30:00,5.332,0.0,11.71,8.0,5.099,SENSOR
3,2020-06-12 00:45:00,5.702,0.0,11.84,5.0,5.201,SENSOR
4,2020-06-12 01:00:00,6.078,0.0,11.94,4.0,4.914,SENSOR
...,...,...,...,...,...,...,...
5511,2020-08-11 13:45:00,7.109,0.1,16.83,268.0,5.325,GAN
5512,2020-08-11 14:00:00,7.202,0.0,16.41,267.0,6.108,GAN
5513,2020-08-11 14:15:00,7.332,0.0,16.35,268.0,6.334,GAN
5514,2020-08-11 14:30:00,7.500,0.0,17.46,267.0,6.048,GAN


In [6]:
# adding the datetime column value as a feature. River level being time dependent, datetime column value is saved as
# continuous columns.

dataset_15min['dayofweek'] = dataset_15min['timerecorded'].dt.dayofweek
dataset_15min['hour'] = dataset_15min['timerecorded'].dt.hour
dataset_15min['minute'] = dataset_15min['timerecorded'].dt.minute
dataset_15min['month'] = dataset_15min['timerecorded'].dt.month
dataset_15min['year'] = dataset_15min['timerecorded'].dt.year
dataset_15min['dayofmonth'] = dataset_15min['timerecorded'].dt.day
dataset_15min['dayofyear'] = dataset_15min['timerecorded'].dt.dayofyear

In [7]:
dataset_15min.shape

(5516, 14)

In [8]:
train_dataset = dataset_15min[:5240]
test_dataset = dataset_15min[5240:]

In [9]:
train_dataset.head()

Unnamed: 0,timerecorded,river,rain,temperature,wind_direction,wind_speed,source,dayofweek,hour,minute,month,year,dayofmonth,dayofyear
0,2020-06-12 00:00:00,4.54,0.0,11.53,7.0,5.272,SENSOR,4,0,0,6,2020,12,164
1,2020-06-12 00:15:00,4.932,0.0,11.61,4.0,5.554,SENSOR,4,0,15,6,2020,12,164
2,2020-06-12 00:30:00,5.332,0.0,11.71,8.0,5.099,SENSOR,4,0,30,6,2020,12,164
3,2020-06-12 00:45:00,5.702,0.0,11.84,5.0,5.201,SENSOR,4,0,45,6,2020,12,164
4,2020-06-12 01:00:00,6.078,0.0,11.94,4.0,4.914,SENSOR,4,1,0,6,2020,12,164


In [10]:
test_dataset.head()

Unnamed: 0,timerecorded,river,rain,temperature,wind_direction,wind_speed,source,dayofweek,hour,minute,month,year,dayofmonth,dayofyear
5240,2020-08-08 18:00:00,2.541,0.4,13.72,75.0,3.751,GAN,5,18,0,8,2020,8,221
5241,2020-08-08 18:15:00,1.889,0.2,13.53,72.0,2.543,GAN,5,18,15,8,2020,8,221
5242,2020-08-08 18:30:00,2.066,0.4,13.46,81.0,3.953,GAN,5,18,30,8,2020,8,221
5243,2020-08-08 18:45:00,1.808,0.2,13.38,72.0,3.148,GAN,5,18,45,8,2020,8,221
5244,2020-08-08 19:00:00,1.691,0.2,13.21,65.0,2.324,GAN,5,19,0,8,2020,8,221


In [11]:
# removing dependent columns from test dataset. timerecorded is not required for prediction but for further processes.

y_test = test_dataset[['timerecorded','river']]
test_dataset.drop(['timerecorded','river'],axis=1,inplace=True)

# converting training and testing datasets into csv files
train_dataset.drop(['timerecorded','source'],axis=1,inplace=True)
train_dataset.to_csv("train.csv",header=None,index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [12]:
# import libraries

import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role and assign S3 bucket

role = get_execution_role()
prefix = 'predictions-15-min'
bucket_name = 'flood-prediction-master-dataset' # bucket where data needs to be stored and retrieved

containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'} # each region has its XGBoost container

my_region = boto3.session.Session().region_name # region of the instance
print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [13]:
# setting hyperparameters, bucket and session data

sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(containers[my_region],
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
                                    sagemaker_session=sess)
xgb.set_hyperparameters(eta=0.06,
                        silent=0,
                        early_stopping_rounds=5,
                        objective='reg:linear',
                        num_round=1000)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [14]:
# saving data to S3. SageMaker will take training data from s3
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')

trainpath = sagemaker.s3_input(s3_data='s3://{}/{}/train/'.format(bucket_name, prefix), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [15]:
# training the model

xgb.fit({'train': trainpath})

2020-08-16 13:44:40 Starting - Starting the training job...
2020-08-16 13:44:42 Starting - Launching requested ML instances......
2020-08-16 13:45:54 Starting - Preparing the instances for training......
2020-08-16 13:47:07 Downloading - Downloading input data
2020-08-16 13:47:07 Training - Downloading the training image...
2020-08-16 13:47:39 Uploading - Uploading generated training model
2020-08-16 13:47:39 Completed - Training job completed
[34mArguments: train[0m
[34m[2020-08-16:13:47:27:INFO] Running standalone xgboost training.[0m
[34m[2020-08-16:13:47:27:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2020-08-16:13:47:27:INFO] File size need to be processed in the node: 0.27mb. Available memory size in the node: 8496.56mb[0m
[34m[2020-08-16:13:47:27:INFO] Determined delimiter of CSV input is ','[0m
[34m[13:47:27] S3DistributionType set as FullyReplicated[0m
[34m[13:47:27] 5240x11 matrix with 57640 entries loaded from /opt/ml/input/data/train?format=

In [16]:
# deploying to a endpoint

xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


---------------!

In [17]:
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type

In [18]:
# removing unrequired columns

test_dataset.drop(['source'],axis=1,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [19]:
#predictions contains ML predictions
#see how to add column name to prediction output
predictions = xgb_predictor.predict(test_dataset.values).decode('utf-8') # prediction
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array

outcome = pd.DataFrame(predictions_array)
outcome.rename(columns={0:"river"},inplace=True)


rf_final = y_test['timerecorded'].to_frame()
rf_final.reset_index(drop=True,inplace=True)
rf_final['river'] = outcome['river'].astype(float)
rf_final['source'] = 'XGB'

rf_final.to_csv("xgboost_predictions_15_min.csv")

y_test['source'] = 'ACTUAL'

y_test.to_csv("actual_15_min.csv")

sess.upload_data(
    path='xgboost_predictions_15_min.csv', bucket=bucket_name,
    key_prefix='predictions-15-min')

sess.upload_data(
    path='actual_15_min.csv', bucket=bucket_name,
    key_prefix='predictions-15-min')

print("Success!")

Success!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
