In [20]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score,mean_absolute_error
import boto3

In [21]:
#importing predicted data

s3 = boto3.client('s3')

bucket = 'flood-prediction-master-dataset'

key = 'predictions-1-hr/actual_1_hr.csv'
obj = s3.get_object(Bucket= bucket,Key= key)
actual = pd.read_csv(obj['Body'])

key = 'predictions-1-hr/xgboost_predictions_1_hr.csv'
obj = s3.get_object(Bucket= bucket,Key= key)
xgb_predict = pd.read_csv(obj['Body'])

key = 'predictions-1-hr/random_forest_predictions_1_hr.csv'
obj = s3.get_object(Bucket= bucket,Key= key)
rf_predict = pd.read_csv(obj['Body'])

In [22]:
#dropping unnexessary columns

actual.drop(['Unnamed: 0','source'],axis=1,inplace=True)
xgb_predict.drop(['Unnamed: 0','source'],axis=1,inplace=True)
rf_predict.drop(['Unnamed: 0','source'],axis=1,inplace=True)

In [23]:
actual

Unnamed: 0,timerecorded,river
0,08-08-20 18:00,2.879
1,08-08-20 19:00,2.212
2,08-08-20 20:00,2.950
3,08-08-20 21:00,4.920
4,08-08-20 22:00,7.000
...,...,...
64,11-08-20 10:00,4.028
65,11-08-20 11:00,5.234
66,11-08-20 12:00,6.189
67,11-08-20 13:00,6.743


In [24]:
xgb_predict

Unnamed: 0,timerecorded,river
0,08-08-20 18:00,3.437
1,08-08-20 19:00,2.749
2,08-08-20 20:00,2.301
3,08-08-20 21:00,3.786
4,08-08-20 22:00,6.347
...,...,...
64,11-08-20 10:00,3.898
65,11-08-20 11:00,5.104
66,11-08-20 12:00,5.671
67,11-08-20 13:00,6.442


In [25]:
rf_predict

Unnamed: 0,timerecorded,river
0,08-08-20 18:00,3.341
1,08-08-20 19:00,3.966
2,08-08-20 20:00,2.003
3,08-08-20 21:00,3.316
4,08-08-20 22:00,5.844
...,...,...
64,11-08-20 10:00,3.554
65,11-08-20 11:00,4.628
66,11-08-20 12:00,5.647
67,11-08-20 13:00,6.109


In [26]:
# saving difference between actual and predicted river value

xgb_difference = pd.DataFrame(actual['river'] - xgb_predict['river'])

rf_difference = pd.DataFrame(actual['river'] - rf_predict['river'])

print("Sum of Prediction Error in XGBoost: "+str(xgb_difference['river'].sum())+
      " Highest Error in Prediction in XGBoost: "+str(xgb_difference['river'].max()))

print("Sum of Prediction Error in Random Forest: "+str(rf_difference['river'].sum())+
      " Highest Error in Prediction in Random Forest: "+str(rf_difference['river'].max()))

print("Flood Triggered by XGBoost within "+str(xgb_predict[xgb_predict["river"]>=6.2000].index[0])+" Hours."+
      " Flood Triggered by Random Forest within "+str(rf_predict[rf_predict["river"]>=6.2000].index[0])+" Hours.")

Sum of Prediction Error in XGBoost: -1.8619999999999997 Highest Error in Prediction in XGBoost: 1.3270000000000004
Sum of Prediction Error in Random Forest: 24.796000000000006 Highest Error in Prediction in Random Forest: 1.6039999999999996
Flood Triggered by XGBoost within 4 Hours. Flood Triggered by Random Forest within 5 Hours.


In [27]:
# CREATING TABLE OF TRIGGERS. filtering columns/instances with river level above flood level 

actual_flood = actual[['timerecorded','river']]

In [28]:
# joining xgboost and random forest columns with same index to compare the triggers 

xgb_compare = actual_flood.join(xgb_predict,lsuffix='_actual',rsuffix='_xgboost')

all_combined = xgb_compare.join(rf_predict)

all_combined.drop(['timerecorded_xgboost','timerecorded'],axis=1,inplace=True)
all_combined = all_combined.rename(columns={"river":"river_rf"})

In [29]:
# creating table that has either actual, xgboost prediction or random forest prediction greater than flood threshold value

possible_flood = all_combined[(all_combined['river_actual']>=6.200) | (all_combined['river_xgboost']>=6.200) | (all_combined['river_rf']>=6.200)]

In [30]:
possible_flood['xgb_status'] = 'NONE'
possible_flood['rf_status'] = 'NONE'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


In [31]:
# HIT means triggered rightly MISS means not triggered FALSE wrongly triggered

possible_flood['xgb_status'] = np.where( (possible_flood['river_actual']>=6.200) & (possible_flood['river_xgboost']>=6.200),'HIT', possible_flood['xgb_status'])
possible_flood['xgb_status'] = np.where( (possible_flood['river_actual']>=6.200) & (possible_flood['river_xgboost']<6.200),'MISS', possible_flood['xgb_status'])
possible_flood['xgb_status'] = np.where( (possible_flood['river_actual']<6.200) & (possible_flood['river_xgboost']>=6.200),'FALSE', possible_flood['xgb_status'])


possible_flood['rf_status'] = np.where( (possible_flood['river_actual']>=6.200) & (possible_flood['river_rf']>=6.200),'HIT', possible_flood['rf_status'])
possible_flood['rf_status'] = np.where( (possible_flood['river_actual']>=6.200) & (possible_flood['river_rf']<6.200),'MISS', possible_flood['rf_status'])
possible_flood['rf_status'] = np.where( (possible_flood['river_actual']<6.200) & (possible_flood['river_rf']>=6.200),'FALSE', possible_flood['rf_status'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata

In [32]:
possible_flood.reset_index(drop=True,inplace=True)
possible_flood

Unnamed: 0,timerecorded_actual,river_actual,river_xgboost,river_rf,xgb_status,rf_status
0,08-08-20 22:00,7.0,6.347,5.844,HIT,MISS
1,08-08-20 23:00,8.441,7.991,7.36,HIT,HIT
2,09-08-20 0:00,9.211,8.646,8.714,HIT,HIT
3,09-08-20 1:00,8.903,8.625,8.611,HIT,HIT
4,09-08-20 2:00,8.405,8.52,8.288,HIT,HIT
5,09-08-20 3:00,7.011,7.622,7.443,HIT,HIT
6,09-08-20 11:00,7.147,7.09,6.689,HIT,HIT
7,09-08-20 12:00,8.036,8.333,7.647,HIT,HIT
8,09-08-20 13:00,8.974,9.263,8.225,HIT,HIT
9,09-08-20 14:00,8.555,8.771,8.536,HIT,HIT


In [33]:
# variables are used to avoid making print statement long and complex

xgb_hits = possible_flood['xgb_status'].loc[possible_flood['xgb_status'] == 'HIT'].count()
xgb_misses = possible_flood['xgb_status'].loc[possible_flood['xgb_status'] == 'MISS'].count()
xgb_false = possible_flood['xgb_status'].loc[possible_flood['xgb_status'] == 'FALSE'].count()

rf_hits = possible_flood['rf_status'].loc[possible_flood['rf_status'] == 'HIT'].count()
rf_misses = possible_flood['rf_status'].loc[possible_flood['rf_status'] == 'MISS'].count()
rf_false = possible_flood['rf_status'].loc[possible_flood['rf_status'] == 'FALSE'].count()

actual_triggers = xgb_hits + xgb_misses


print(" Total Actual Flood Triggers are "+ str(actual_triggers)+ ".\n Out of which XGBoost Triggered "+ str(xgb_hits)+
      ",missed " + str(xgb_misses)+ " and false positives are "+ str(xgb_false) + ". "+
      " XGBoost Efficiency in triggering is " + str(round(xgb_hits/actual_triggers*100,2))+"%.\n"+
      " Random Forest Triggered "+ str(rf_hits) + ",missed "+ str(rf_misses) + " and false positives are "+ str(rf_false)+ ". "+
      " Random Forest Efficiency in triggereing is "+ str(round(rf_hits/actual_triggers*100,2)) +"%.\n"
      " Total Time saved by XGBoost by better prediction and triggering is: "+ 
      str(xgb_hits-rf_hits) +" Hours."
)

 Total Actual Flood Triggers are 29.
 Out of which XGBoost Triggered 27,missed 2 and false positives are 1.  XGBoost Efficiency in triggering is 93.1%.
 Random Forest Triggered 21,missed 8 and false positives are 1.  Random Forest Efficiency in triggereing is 72.41%.
 Total Time saved by XGBoost by better prediction and triggering is: 6 Hours.


In [28]:
print("r2 score is: "+str(round(r2_score(actual["river"].values,xgb_predict["river"].values),4)*100))

r2 score is: 94.01


In [29]:
print("r2 score is: "+str(round(r2_score(actual["river"].values,rf_predict["river"].values),4)*100))

r2 score is: 85.75
