In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score,mean_absolute_error
import boto3

In [3]:
#importing predicted data

s3 = boto3.client('s3')

bucket = 'flood-prediction-master-dataset'

key = 'predictions-15-min/actual_15_min.csv'
obj = s3.get_object(Bucket= bucket,Key= key)
actual = pd.read_csv(obj['Body'])

key = 'predictions-15-min/xgboost_predictions_15_min.csv'
obj = s3.get_object(Bucket= bucket,Key= key)
xgb_predict = pd.read_csv(obj['Body'])

key = 'predictions-15-min/random_forest_predictions_15_min.csv'
obj = s3.get_object(Bucket= bucket,Key= key)
rf_predict = pd.read_csv(obj['Body'])


In [4]:
#dropping unnexessary columns

actual.drop(['Unnamed: 0','source'],axis=1,inplace=True)
xgb_predict.drop(['Unnamed: 0','source'],axis=1,inplace=True)
rf_predict.drop(['Unnamed: 0','source'],axis=1,inplace=True)

In [5]:
actual

Unnamed: 0,timerecorded,river
0,08-08-20 18:00,2.541
1,08-08-20 18:15,1.889
2,08-08-20 18:30,2.066
3,08-08-20 18:45,1.808
4,08-08-20 19:00,1.691
...,...,...
271,11-08-20 13:45,7.109
272,11-08-20 14:00,7.202
273,11-08-20 14:15,7.332
274,11-08-20 14:30,7.500


In [6]:
xgb_predict

Unnamed: 0,timerecorded,river
0,08-08-20 18:00,2.454
1,08-08-20 18:15,2.039
2,08-08-20 18:30,2.000
3,08-08-20 18:45,1.854
4,08-08-20 19:00,1.771
...,...,...
271,11-08-20 13:45,7.020
272,11-08-20 14:00,7.178
273,11-08-20 14:15,7.289
274,11-08-20 14:30,7.447


In [7]:
rf_predict

Unnamed: 0,timerecorded,river
0,08-08-20 18:00,2.881
1,08-08-20 18:15,1.956
2,08-08-20 18:30,2.030
3,08-08-20 18:45,1.724
4,08-08-20 19:00,1.443
...,...,...
271,11-08-20 13:45,7.441
272,11-08-20 14:00,7.406
273,11-08-20 14:15,7.235
274,11-08-20 14:30,7.374


In [8]:
# saving difference between actual and predicted river value

xgb_difference = pd.DataFrame(actual['river'] - xgb_predict['river'])

rf_difference = pd.DataFrame(actual['river'] - rf_predict['river'])

print("Sum of Prediction Error in XGBoost: "+str(xgb_difference['river'].sum())+
      " Highest Error in Prediction in XGBoost: "+str(xgb_difference['river'].max()))

print("Sum of Prediction Error in Random Forest: "+str(rf_difference['river'].sum())+
      " Highest Error in Prediction in Random Forest: "+str(rf_difference['river'].max()))

print("Flood Triggered by XGBoost within "+str((xgb_predict[xgb_predict["river"]>=6.2000].index[0])//4)+" Hours."+
      " Flood Triggered by Random Forest within "+str((rf_predict[rf_predict["river"]>=6.2000].index[0])//4)+" Hours.")

Sum of Prediction Error in XGBoost: -7.980999999999995 Highest Error in Prediction in XGBoost: 0.3799999999999999
Sum of Prediction Error in Random Forest: 21.583000000000006 Highest Error in Prediction in Random Forest: 0.742
Flood Triggered by XGBoost within 5 Hours. Flood Triggered by Random Forest within 5 Hours.


In [9]:
# CREATING TABLE OF TRIGGERS. filtering columns/instances with river level above flood level 

actual_flood = actual[['timerecorded','river']]

In [10]:
# joining xgboost and random forest columns with same index to compare the triggers 

xgb_compare = actual_flood.join(xgb_predict,lsuffix='_actual',rsuffix='_xgboost')

all_combined = xgb_compare.join(rf_predict)

all_combined.drop(['timerecorded_xgboost','timerecorded'],axis=1,inplace=True)
all_combined = all_combined.rename(columns={"river":"river_rf"})

In [11]:
# creating table that has either actual, xgboost prediction or random forest prediction greater than flood threshold value

possible_flood = all_combined[(all_combined['river_actual']>=6.200) | (all_combined['river_xgboost']>=6.200) | (all_combined['river_rf']>=6.200)]

In [12]:
possible_flood['xgb_status'] = 'NONE'
possible_flood['rf_status'] = 'NONE'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [13]:
# HIT means triggered rightly MISS means not triggered FALSE wrongly triggered

possible_flood['xgb_status'] = np.where( (possible_flood['river_actual']>=6.200) & (possible_flood['river_xgboost']>=6.200),'HIT', possible_flood['xgb_status'])
possible_flood['xgb_status'] = np.where( (possible_flood['river_actual']>=6.200) & (possible_flood['river_xgboost']<6.200),'MISS', possible_flood['xgb_status'])
possible_flood['xgb_status'] = np.where( (possible_flood['river_actual']<6.200) & (possible_flood['river_xgboost']>=6.200),'FALSE', possible_flood['xgb_status'])


possible_flood['rf_status'] = np.where( (possible_flood['river_actual']>=6.200) & (possible_flood['river_rf']>=6.200),'HIT', possible_flood['rf_status'])
possible_flood['rf_status'] = np.where( (possible_flood['river_actual']>=6.200) & (possible_flood['river_rf']<6.200),'MISS', possible_flood['rf_status'])
possible_flood['rf_status'] = np.where( (possible_flood['river_actual']<6.200) & (possible_flood['river_rf']>=6.200),'FALSE', possible_flood['rf_status'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata

In [14]:
possible_flood.reset_index(drop=True,inplace=True)
possible_flood

Unnamed: 0,timerecorded_actual,river_actual,river_xgboost,river_rf,xgb_status,rf_status
0,08-08-20 23:15,7.103,6.893,6.785,HIT,HIT
1,08-08-20 23:30,7.030,7.142,7.327,HIT,HIT
2,08-08-20 23:45,7.705,7.697,7.670,HIT,HIT
3,09-08-20 0:00,8.410,8.501,8.020,HIT,HIT
4,09-08-20 0:15,8.638,8.666,8.375,HIT,HIT
...,...,...,...,...,...,...
98,11-08-20 13:45,7.109,7.020,7.441,HIT,HIT
99,11-08-20 14:00,7.202,7.178,7.406,HIT,HIT
100,11-08-20 14:15,7.332,7.289,7.235,HIT,HIT
101,11-08-20 14:30,7.500,7.447,7.374,HIT,HIT


In [17]:
# variables are used to avoid making print statement long and complex

xgb_hits = possible_flood['xgb_status'].loc[possible_flood['xgb_status'] == 'HIT'].count()
xgb_misses = possible_flood['xgb_status'].loc[possible_flood['xgb_status'] == 'MISS'].count()
xgb_false = possible_flood['xgb_status'].loc[possible_flood['xgb_status'] == 'FALSE'].count()

rf_hits = possible_flood['rf_status'].loc[possible_flood['rf_status'] == 'HIT'].count()
rf_misses = possible_flood['rf_status'].loc[possible_flood['rf_status'] == 'MISS'].count()
rf_false = possible_flood['rf_status'].loc[possible_flood['rf_status'] == 'FALSE'].count()

actual_triggers = xgb_hits + xgb_misses


print(" Total Actual Flood Triggers are "+ str(actual_triggers)+ ".\n Out of which XGBoost Triggered "+ str(xgb_hits)+
      ",missed " + str(xgb_misses)+ " and false positives are "+ str(xgb_false) + ". "+
      " XGBoost Efficiency in triggering is " + str(round(xgb_hits/actual_triggers*100,2))+"%.\n"+
      " Random Forest Triggered "+ str(rf_hits) + ",missed "+ str(rf_misses) + " and false positives are "+ str(rf_false)+ ". "+
      " Random Forest Efficiency in triggereing is "+ str(round(rf_hits/actual_triggers*100,2)) +"%.\n"
      " Total Time saved by XGBoost by better prediction and triggering is: "+ 
      str(round((xgb_hits-rf_hits)/4,2)) +" Hours."
)

 Total Actual Flood Triggers are 103.
 Out of which XGBoost Triggered 102,missed 1 and false positives are 0.  XGBoost Efficiency in triggering is 99.03%.
 Random Forest Triggered 94,missed 9 and false positives are 0.  Random Forest Efficiency in triggereing is 91.26%.
 Total Time saved by XGBoost by better prediction and triggering is: 2.0 Hours.


In [19]:
print("r2 score is: "+str(round(r2_score(actual["river"].values,xgb_predict["river"].values),4)*100))

r2 score is: 99.4


In [18]:
print("r2 score is: "+str(round(r2_score(actual["river"].values,rf_predict["river"].values),4)*100))

r2 score is: 98.75
