In [1]:
#import libraries

import pandas as pd
import numpy as np
import boto3

In [2]:
#import data

s3 = boto3.client('s3')

bucket = 'flood-prediction-master-dataset'
key = 'final-dataset-1-hr/final_data_1_hr.csv'

obj = s3.get_object(Bucket= bucket,Key= key)

dataset_1hr = pd.read_csv(obj['Body'])

In [3]:
#drop existing index column

dataset_1hr.drop(['Unnamed: 0'],axis=1,inplace=True)

#setting datetime datatype from string. Default is string when reading from csv

dataset_1hr['timerecorded'] = pd.to_datetime(dataset_1hr['timerecorded'])

#rearranging columns

dataset_1hr = dataset_1hr[['timerecorded','river','rain','temperature','wind_direction','wind_speed','source']]  

#splitting GAN and sensor data

gan_file = dataset_1hr.loc[dataset_1hr['source']=='GAN']
sensor_file = dataset_1hr.loc[dataset_1hr['source']=='SENSOR']

In [4]:
#makes the GAN datetime go ahead by 1 month. June - July sensor data. June to august is summer. Hence GAN 1 month ahead.

gan_file['timerecorded']  = gan_file['timerecorded'] + pd.DateOffset(months=1) 

#merging both files and resetting index

dataset_1hr = sensor_file.append(gan_file)
dataset_1hr.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [5]:
dataset_1hr

Unnamed: 0,timerecorded,river,rain,temperature,wind_direction,wind_speed,source
0,2020-06-12 00:00:00,5.702,0.0,11.67,6.0,5.282,SENSOR
1,2020-06-12 01:00:00,7.077,0.0,12.08,7.5,4.806,SENSOR
2,2020-06-12 02:00:00,7.881,0.0,12.35,8.3,3.607,SENSOR
3,2020-06-12 03:00:00,8.036,0.0,12.39,216.8,2.554,SENSOR
4,2020-06-12 04:00:00,7.920,0.0,12.30,10.5,4.524,SENSOR
...,...,...,...,...,...,...,...
1379,2020-08-11 10:00:00,4.028,0.0,16.96,264.3,5.898,GAN
1380,2020-08-11 11:00:00,5.234,0.0,17.22,263.3,5.929,GAN
1381,2020-08-11 12:00:00,6.189,0.0,17.50,266.8,5.007,GAN
1382,2020-08-11 13:00:00,6.743,0.2,17.44,268.0,5.341,GAN


In [6]:
# adding the datetime column value as a feature. River level being time dependent, datetime column value is saved as
# continuous columns.

dataset_1hr['dayofweek'] = dataset_1hr['timerecorded'].dt.dayofweek
dataset_1hr['hour'] = dataset_1hr['timerecorded'].dt.hour
dataset_1hr['minute'] = dataset_1hr['timerecorded'].dt.minute
dataset_1hr['month'] = dataset_1hr['timerecorded'].dt.month
dataset_1hr['year'] = dataset_1hr['timerecorded'].dt.year
dataset_1hr['dayofmonth'] = dataset_1hr['timerecorded'].dt.day
dataset_1hr['dayofyear'] = dataset_1hr['timerecorded'].dt.dayofyear

In [7]:
dataset_1hr.shape

(1384, 14)

In [8]:
train_dataset = dataset_1hr[:1315] 
test_dataset = dataset_1hr[1315:] 

In [9]:
train_dataset.tail()

Unnamed: 0,timerecorded,river,rain,temperature,wind_direction,wind_speed,source,dayofweek,hour,minute,month,year,dayofmonth,dayofyear
1310,2020-08-08 13:00:00,8.898,0.0,16.39,69.3,3.223,GAN,5,13,0,8,2020,8,221
1311,2020-08-08 14:00:00,7.891,0.0,15.68,76.8,3.154,GAN,5,14,0,8,2020,8,221
1312,2020-08-08 15:00:00,6.545,0.5,15.51,94.5,3.498,GAN,5,15,0,8,2020,8,221
1313,2020-08-08 16:00:00,5.611,0.0,15.15,90.0,2.845,GAN,5,16,0,8,2020,8,221
1314,2020-08-08 17:00:00,4.503,0.2,14.41,75.8,2.836,GAN,5,17,0,8,2020,8,221


In [10]:
test_dataset.head()

Unnamed: 0,timerecorded,river,rain,temperature,wind_direction,wind_speed,source,dayofweek,hour,minute,month,year,dayofmonth,dayofyear
1315,2020-08-08 18:00:00,2.879,1.0,14.56,75.0,3.349,GAN,5,18,0,8,2020,8,221
1316,2020-08-08 19:00:00,2.212,2.4,12.84,49.8,2.69,GAN,5,19,0,8,2020,8,221
1317,2020-08-08 20:00:00,2.95,2.4,10.99,37.5,2.894,GAN,5,20,0,8,2020,8,221
1318,2020-08-08 21:00:00,4.92,3.4,14.06,59.5,2.556,GAN,5,21,0,8,2020,8,221
1319,2020-08-08 22:00:00,7.0,1.4,13.52,69.5,2.903,GAN,5,22,0,8,2020,8,221


In [11]:
# removing dependent columns from test dataset. timerecorded is not required for prediction but for further processes.

y_test = test_dataset[['timerecorded','river']]
test_dataset.drop(['timerecorded','river'],axis=1,inplace=True)

# converting training and testing datasets into csv files
train_dataset.drop(['timerecorded','source'],axis=1,inplace=True)
train_dataset.to_csv("train.csv",header=None,index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [12]:
# import libraries

import boto3, re, sys, math, json, os, sagemaker, urllib.request
from sagemaker import get_execution_role
import numpy as np                                
import pandas as pd                               
import matplotlib.pyplot as plt                   
from IPython.display import Image                 
from IPython.display import display               
from time import gmtime, strftime                 
from sagemaker.predictor import csv_serializer   

# Define IAM role and assign S3 bucket

role = get_execution_role()
prefix = 'predictions-1-hr'
bucket_name = 'flood-prediction-master-dataset' # bucket where data needs to be stored and retrieved

containers = {'us-west-2': '433757028032.dkr.ecr.us-west-2.amazonaws.com/xgboost:latest',
              'us-east-1': '811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest',
              'us-east-2': '825641698319.dkr.ecr.us-east-2.amazonaws.com/xgboost:latest',
              'eu-west-1': '685385470294.dkr.ecr.eu-west-1.amazonaws.com/xgboost:latest'} # each region has its XGBoost container

my_region = boto3.session.Session().region_name # region of the instance
print("Success - the MySageMakerInstance is in the " + my_region + " region. You will use the " + containers[my_region] + " container for your SageMaker endpoint.")

Success - the MySageMakerInstance is in the us-east-1 region. You will use the 811284229777.dkr.ecr.us-east-1.amazonaws.com/xgboost:latest container for your SageMaker endpoint.


In [13]:
# setting hyperparameters, bucket and session data

sess = sagemaker.Session()

xgb = sagemaker.estimator.Estimator(containers[my_region],
                                    role, 
                                    train_instance_count=1, 
                                    train_instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(bucket_name, prefix),
                                    sagemaker_session=sess)
xgb.set_hyperparameters(eta=0.06,
                        silent=0,
                        early_stopping_rounds=5,
                        objective='reg:linear',
                        num_round=1000)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [14]:
# saving data to S3. SageMaker will take training data from s3
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')

trainpath = sagemaker.s3_input(s3_data='s3://{}/{}/train/'.format(bucket_name, prefix), content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [15]:
# training the model

xgb.fit({'train': trainpath})

2020-08-16 13:44:10 Starting - Starting the training job...
2020-08-16 13:44:12 Starting - Launching requested ML instances......
2020-08-16 13:45:24 Starting - Preparing the instances for training......
2020-08-16 13:46:37 Downloading - Downloading input data
2020-08-16 13:46:37 Training - Downloading the training image...
2020-08-16 13:47:14 Uploading - Uploading generated training model
2020-08-16 13:47:14 Completed - Training job completed
[34mArguments: train[0m
[34m[2020-08-16:13:46:57:INFO] Running standalone xgboost training.[0m
[34m[2020-08-16:13:46:57:INFO] Path /opt/ml/input/data/validation does not exist![0m
[34m[2020-08-16:13:46:57:INFO] File size need to be processed in the node: 0.07mb. Available memory size in the node: 8485.46mb[0m
[34m[2020-08-16:13:46:57:INFO] Determined delimiter of CSV input is ','[0m
[34m[13:46:57] S3DistributionType set as FullyReplicated[0m
[34m[13:46:57] 1315x11 matrix with 14465 entries loaded from /opt/ml/input/data/train?format=

In [16]:
# deploying to a endpoint

xgb_predictor = xgb.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


---------------!

In [17]:
xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type

In [18]:
# removing unrequired columns

test_dataset.drop(['source'],axis=1,inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [21]:
#predictions contains ML predictions
#see how to add column name to prediction output
predictions = xgb_predictor.predict(test_dataset.values).decode('utf-8') # prediction
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array

outcome = pd.DataFrame(predictions_array)
outcome.rename(columns={0:"river"},inplace=True)


rf_final = y_test['timerecorded'].to_frame()
rf_final.reset_index(drop=True,inplace=True)
rf_final['river'] = outcome['river'].astype(float)
rf_final['source'] = 'XGB'

rf_final.to_csv("xgboost_predictions_1_hr.csv")

y_test['source'] = 'ACTUAL'

y_test.to_csv("actual_1_hr.csv")

sess.upload_data(
    path='xgboost_predictions_1_hr.csv', bucket=bucket_name,
    key_prefix='predictions-1-hr')

sess.upload_data(
    path='actual_1_hr.csv', bucket=bucket_name,
    key_prefix='predictions-1-hr')

print("Success!")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Success!
