In [1]:
#import libraries

import pandas as pd
import numpy as np
import boto3

In [2]:
#import data

s3 = boto3.client('s3')

bucket = 'flood-prediction-master-dataset'
key = 'final-dataset-15-min/final_data_15_min.csv'

obj = s3.get_object(Bucket= bucket,Key= key)

dataset_15min = pd.read_csv(obj['Body'])

In [3]:
#drop existing index column

dataset_15min.drop(['Unnamed: 0'],axis=1,inplace=True)

#setting datetime datatype from string. Default is string when reading from csv

dataset_15min['timerecorded'] = pd.to_datetime(dataset_15min['timerecorded'])

#rearranging columns

dataset_15min = dataset_15min[['timerecorded','river','rain','temperature','wind_direction','wind_speed','source']]  

#splitting GAN and sensor data

gan_file = dataset_15min.loc[dataset_15min['source']=='GAN']
sensor_file = dataset_15min.loc[dataset_15min['source']=='SENSOR']

In [4]:
#makes the GAN datetime go ahead by 1 month. June - July sensor data. June to august is summer. Hence GAN 1 month ahead.

gan_file['timerecorded']  = gan_file['timerecorded'] + pd.DateOffset(months=1)

#merging both files and resetting index

dataset_15min = sensor_file.append(gan_file)
dataset_15min.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


In [5]:
dataset_15min

Unnamed: 0,timerecorded,river,rain,temperature,wind_direction,wind_speed,source
0,2020-06-12 00:00:00,4.540,0.0,11.53,7.0,5.272,SENSOR
1,2020-06-12 00:15:00,4.932,0.0,11.61,4.0,5.554,SENSOR
2,2020-06-12 00:30:00,5.332,0.0,11.71,8.0,5.099,SENSOR
3,2020-06-12 00:45:00,5.702,0.0,11.84,5.0,5.201,SENSOR
4,2020-06-12 01:00:00,6.078,0.0,11.94,4.0,4.914,SENSOR
...,...,...,...,...,...,...,...
5511,2020-08-11 13:45:00,7.109,0.1,16.83,268.0,5.325,GAN
5512,2020-08-11 14:00:00,7.202,0.0,16.41,267.0,6.108,GAN
5513,2020-08-11 14:15:00,7.332,0.0,16.35,268.0,6.334,GAN
5514,2020-08-11 14:30:00,7.500,0.0,17.46,267.0,6.048,GAN


In [6]:
# adding the datetime column value as a feature. River level being time dependent, datetime column value is saved as
# continuous columns.

dataset_15min['dayofweek'] = dataset_15min['timerecorded'].dt.dayofweek
dataset_15min['hour'] = dataset_15min['timerecorded'].dt.hour
dataset_15min['minute'] = dataset_15min['timerecorded'].dt.minute
dataset_15min['month'] = dataset_15min['timerecorded'].dt.month
dataset_15min['year'] = dataset_15min['timerecorded'].dt.year
dataset_15min['dayofmonth'] = dataset_15min['timerecorded'].dt.day
dataset_15min['dayofyear'] = dataset_15min['timerecorded'].dt.dayofyear

In [7]:
dataset_15min.shape

(5516, 14)

In [8]:
#splitting into train and test dataset

train_dataset = dataset_15min[:5240]
test_dataset = dataset_15min[5240:]

In [9]:
train_dataset.head()

Unnamed: 0,timerecorded,river,rain,temperature,wind_direction,wind_speed,source,dayofweek,hour,minute,month,year,dayofmonth,dayofyear
0,2020-06-12 00:00:00,4.54,0.0,11.53,7.0,5.272,SENSOR,4,0,0,6,2020,12,164
1,2020-06-12 00:15:00,4.932,0.0,11.61,4.0,5.554,SENSOR,4,0,15,6,2020,12,164
2,2020-06-12 00:30:00,5.332,0.0,11.71,8.0,5.099,SENSOR,4,0,30,6,2020,12,164
3,2020-06-12 00:45:00,5.702,0.0,11.84,5.0,5.201,SENSOR,4,0,45,6,2020,12,164
4,2020-06-12 01:00:00,6.078,0.0,11.94,4.0,4.914,SENSOR,4,1,0,6,2020,12,164


In [10]:
test_dataset.head()

Unnamed: 0,timerecorded,river,rain,temperature,wind_direction,wind_speed,source,dayofweek,hour,minute,month,year,dayofmonth,dayofyear
5240,2020-08-08 18:00:00,2.541,0.4,13.72,75.0,3.751,GAN,5,18,0,8,2020,8,221
5241,2020-08-08 18:15:00,1.889,0.2,13.53,72.0,2.543,GAN,5,18,15,8,2020,8,221
5242,2020-08-08 18:30:00,2.066,0.4,13.46,81.0,3.953,GAN,5,18,30,8,2020,8,221
5243,2020-08-08 18:45:00,1.808,0.2,13.38,72.0,3.148,GAN,5,18,45,8,2020,8,221
5244,2020-08-08 19:00:00,1.691,0.2,13.21,65.0,2.324,GAN,5,19,0,8,2020,8,221


In [11]:
# removing dependent columns from test dataset. timerecorded is not required for prediction but for further processes.  

y_test = test_dataset[['timerecorded','river']]
test_dataset.drop(['timerecorded','river'],axis=1,inplace=True)

# converting training and testing datasets into csv files

train_dataset.to_csv("train_dataset.csv")
test_dataset.to_csv("test_dataset.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [12]:
import datetime
import tarfile

import boto3 # AWS SDK for python. Provides low-level access to AWS services
from sagemaker import get_execution_role
import sagemaker

m_boto3 = boto3.client('sagemaker') 

sess = sagemaker.Session()

region = sess.boto_session.region_name

bucket = 'flood-prediction-master-dataset'  #  Bucket to store and retrieve data

print('Using bucket ' + bucket)

Using bucket flood-prediction-master-dataset


In [13]:
# saving data to S3. SageMaker will take training data from s3

trainpath = sess.upload_data(
    path='train_dataset.csv', bucket=bucket,
    key_prefix='predictions-15-min')

testpath = sess.upload_data(
    path='test_dataset.csv', bucket=bucket,
    key_prefix='predictions-15-min')

In [14]:
%%writefile rftimeseries15min.py

#doing by scripting

import argparse
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import joblib

def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "rfmodel.joblib"))
    return clf

if __name__ =='__main__':

    print('extracting arguments')
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
        
    parser.add_argument('--n-estimators', type=int, default=1500)
    parser.add_argument('--max-leaf-nodes', type=int, default=15)
    

    # Data, model, and output directories
    parser.add_argument('--model_dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--train-file', type=str, default='train_dataset.csv')
    parser.add_argument('--test-file', type=str, default='test_dataset.csv')
    
    args, _ = parser.parse_known_args()
    

    print('reading data')
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print('building training and testing datasets')
    
    
    #since only one file is accepted as a script parameter, the predictors and target are segregated here
    y_train = train_df['river']
    
    # remove unrequired columns
    train_df.drop(['river'],axis=1,inplace=True)
    train_df.drop(['Unnamed: 0'],axis=1,inplace=True)
    train_df.drop(['timerecorded'],axis=1,inplace=True)
    train_df.drop(['source'],axis=1,inplace=True)
    
    X_train = train_df
    
    test_df.drop(['Unnamed: 0'],axis=1,inplace=True)
    test_df.drop(['source'],axis=1,inplace=True)
    
    X_test = test_df
    
    
    # train
    print('training model')
    model = RandomForestRegressor(
        n_estimators=args.n_estimators,
        max_leaf_nodes =args.max_leaf_nodes,
        n_jobs=-1)
    
    model.fit(X_train,y_train)
     
    # persist model
    
    path = os.path.join(args.model_dir, "rfmodel.joblib")
    joblib.dump(model, path)
    print('model persisted at ' + path)
    
    # predicting value. It will not predict from the below code when deployed to AWS ML EC2.
    # but is required so that it can have a code when predict is called. It analyses the no of test parameters and its dtypes 
    # The print in this script are shown in CloudWatch.
    
    print('validating model')
    predictions = model.predict(X_test)

Overwriting rftimeseries15min.py


In [15]:
# use of Estimator from the SageMaker Python SDK. stating the script and hyperparameters

from sagemaker.sklearn.estimator import SKLearn

sklearn_estimator = SKLearn(
    entry_point='rftimeseries15min.py',
    role = get_execution_role(),
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    framework_version='0.23-1',
    base_job_name='randomforest-15-min',
    hyperparameters = {
                        'n-estimators': 2000,
                        'max-leaf-nodes': 20
                       })

In [16]:
# launch training job, with asynchronous call

sklearn_estimator.fit({'train':trainpath, 'test': testpath}, wait=False)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [17]:
# after training the model is created which is used for prediction. Here the model is generated. The path is displayed. 

sklearn_estimator.latest_training_job.wait(logs='None')
artifact = m_boto3.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name)['ModelArtifacts']['S3ModelArtifacts']

print('Model artifact persisted at ' + artifact)


2020-08-16 13:43:07 Starting - Launching requested ML instances..............
2020-08-16 13:44:21 Starting - Preparing the instances for training...........
2020-08-16 13:45:20 Downloading - Downloading input data......
2020-08-16 13:45:54 Training - Downloading the training image........
2020-08-16 13:46:41 Training - Training image download completed. Training in progress...
2020-08-16 13:46:57 Uploading - Uploading generated training model.
2020-08-16 13:47:04 Completed - Training job completed
Model artifact persisted at s3://sagemaker-us-east-1-884654660367/randomforest-15-min-2020-08-16-13-43-04-693/output/model.tar.gz


In [18]:
# An EC2 model is deployed based on the script and model

predictor = sklearn_estimator.deploy(instance_type='ml.m4.xlarge',initial_instance_count=1)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


-----------------!

In [19]:
# removing unrequired columns

test_dataset.drop(['source'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [20]:
# "outcome" contains ML predictions. rf_final has the datetime value for each prediction taken from y_test.
# "rf_final" is then provided prediction values saved as a column. Also source column states the algorithm name.
# By just counting the number of predictions above flood level, a better algorithm can be decided, 
# but the datetime column will help analyze the delay between two algorithms.  

outcome = pd.DataFrame(predictor.predict(test_dataset))
outcome.rename(columns={0:"river"},inplace=True)


rf_final = y_test['timerecorded'].to_frame()
rf_final.reset_index(drop=True,inplace=True)
rf_final['river'] = outcome['river'].astype(float)
rf_final['source'] = 'RF'

# saving as a csv file locally
rf_final.to_csv("random_forest_predictions_15_min.csv")

# saving file to s3
sess.upload_data(
    path='random_forest_predictions_15_min.csv', bucket=bucket,
    key_prefix='predictions-15-min')

print("Success!")

Success!
