<a href="https://colab.research.google.com/github/uteyechea/crime-prediction-using-artificial-intelligence/blob/master/Part5_Black_Box_Testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Part 5: Black-Box Testing

Test the predicted crime sequence against the real sequence of crimes.

## 5.1 Dependencies, Google Drive mount and system path
Import the packages used to test the RNN predictions, mount Google Drive, and add the project's `libs` folder to the import path.

In [None]:
# Standard library
import os
import gc

# Third-party
import pandas as pd
from scipy import stats

# Colab-only: mount Google Drive so the project folder is reachable from the VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Root folder of the project on the mounted drive; the data paths in later cells are built from it.
path='/content/drive/My Drive/Colab Notebooks/crime_prediction'

# Make the project's own modules (stored under <path>/libs) importable.
import sys
sys.path.append(path+'/libs')
import autocorr as ac


Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


##5.1 RNN input sequence parameters

The time series was windowed using an `end_date` and a look-back period. All highly correlated windows were placed in sequence, and that sequence was fed into the RNN for training.

In [None]:
#Input sequence parameters
#Other candidate end dates (disabled):
#end_date='2019-01-01'
#end_date='2018-10-31'
#end_date='2018-09-07'
#Anchor date: the a priori window ends on it, the a posteriori window starts the day after.
end_date='2018-11-01'
#Number of daily periods requested for each window.
lookback_periods=10
#Column of the test data file that is analysed.
column_name='zone11'
#NOTE(review): not referenced by the visible cells; the correlation() call passes
#min_correlation=0.9 literally. Confirm which threshold is intended.
min_correlation=0.75

##5.2 Import test data 
Import the test data sequence, i.e. the real data sequence that was unknown to the RNN at training time.

In [None]:
# Load the real (observed) series. 'Date' is parsed as datetime and used as the index,
# so later cells can select windows by date with .loc.
test_file_path=os.path.join(path,'data','theft.csv')
file=pd.read_csv(test_file_path,sep=',',parse_dates=['Date'],index_col='Date')
# Sanity check shown as the cell output: True if any cell of the frame is null.
file.isnull().values.any() # nulls?

False

In [None]:
# Build the real (test) sequence around the anchor date `end_date`.
dataframe=file
timestamp=end_date

# A priori window: `lookback_periods` days ending on `timestamp` (inclusive).
# A negative frequency walks backwards from `timestamp`, so the slice is reversed
# afterwards to restore chronological order.
apriori_dates=pd.date_range(start=timestamp,periods=lookback_periods,freq='-1D')
apriori_window=dataframe.loc[apriori_dates,column_name].iloc[::-1]

# A posteriori window: the days strictly after `timestamp`.
# Dropping the first generated date is what `closed='right'` used to do; that keyword
# was removed from date_range in pandas 2.0, so the slice keeps the cell working on
# both old and new pandas. The window therefore has lookback_periods-1 entries.
aposteriori_dates=pd.date_range(start=timestamp,periods=lookback_periods,freq='1D')[1:]
aposteriori_window=dataframe.loc[aposteriori_dates,column_name]

# Series.append was removed in pandas 2.0; pd.concat stacks the two windows identically.
test_window=pd.concat([apriori_window,aposteriori_window])

In [None]:
# Display the real sequence: a priori window followed by the a posteriori window.
test_window



2018-10-23    0.214
2018-10-24    0.286
2018-10-25    0.107
2018-10-26    0.214
2018-10-27    0.250
2018-10-28    0.214
2018-10-29    0.214
2018-10-30    0.286
2018-10-31    0.286
2018-11-01    0.143
2018-11-02    0.214
2018-11-03    0.071
2018-11-04    0.143
2018-11-05    0.143
2018-11-06    0.214
2018-11-07    0.143
2018-11-08    0.071
2018-11-09    0.179
2018-11-10    0.250
Freq: D, Name: zone11, dtype: float64

##5.3 Import predicted data

Data sequence generated by the RNN as the most likely future crime sequence. 

In [None]:
# Load the sequence generated by the RNN.
# NOTE(review): read_csv treats the first line as a header by default; if rnn_output.txt
# has no header row, the first predicted value becomes the column name. Confirm, or
# pass header=None.
prediction_file_path=os.path.join(path,'data','prediction','rnn_output.txt')
prediction_windows=pd.read_csv(prediction_file_path,sep=',')
print(prediction_windows.shape)

(33059, 1)


Remove non-numeric values. The RNN can occasionally emit a number in an invalid format, for example `1.324.234`.

In [None]:
# Coerce the first column to numbers; values that cannot be parsed become NaN.
numeric_predictions=pd.to_numeric(prediction_windows.iloc[:,0], errors='coerce')

# Assign by column label on a copy instead of through `.iloc[:,0] = ...`.
# Recent pandas versions write `.iloc` column assignments into the existing column in
# place, which can leave it with dtype `object`; replacing the column by label gives it
# the numeric dtype. Working on a copy also keeps the cell safe to re-run.
prediction_windows=prediction_windows.copy()
prediction_windows[prediction_windows.columns[0]]=numeric_predictions

# Drop the rows that failed to parse.
prediction_windows=prediction_windows.dropna()
prediction_windows.shape

(32940, 1)

Verify all predicted data to be free of nulls

In [None]:
# True if any null value is left in the cleaned predictions (expected: False).
prediction_windows.isnull().values.any()

False

##5.4 Estimate error in RNN output.

For a given `end_date` we compare the predicted sequence against the real sequence over a period of N time units.

Check that both sequences are pandas Series.

In [None]:
# Inspect both sequences side by side: shapes first, then Python types.
sequences=(test_window,prediction_windows.iloc[:,0])
for sequence in sequences:
  print(sequence.shape)
for sequence in sequences:
  print(type(sequence))

(19,)
(32940,)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [None]:
# Work with the first column of the cleaned predictions as a Series.
prediction=prediction_windows.iloc[:,0]

In [None]:
# Both sequences must be pandas Series for the slicing and .corr() calls that follow.
# isinstance states that requirement directly (comparing type() results only checks that
# the two types match each other) and the message names the offending variable.
assert isinstance(test_window,pd.Series), 'test_window must be a pandas Series, got %s' % type(test_window)
assert isinstance(prediction,pd.Series), 'prediction must be a pandas Series, got %s' % type(prediction)

In [None]:
def correlation(apriori_window,aposteriori_window,rnn_output_series,periods,min_correlation,verbose=True):
  """Find windows of the RNN output that resemble the known past sequence.

  A window of `periods` consecutive values is slid over `rnn_output_series`. At every
  position `i` the `periods` values before `i` are correlated (Series.corr) with
  `apriori_window`. When that correlation is strictly greater than `min_correlation`,
  the values from `i` onwards (up to `periods` of them) are correlated with
  `aposteriori_window` and the position is recorded.

  Args:
    apriori_window: pandas Series with the known past values.
    aposteriori_window: pandas Series with the real values that follow the past window.
    rnn_output_series: pandas Series with the RNN output; it is sliced by position.
    periods: length of the sliding window.
    min_correlation: exclusive lower bound for the correlation with `apriori_window`.
    verbose: when True (default) print `i ro ro2` for every match found.

  Returns:
    List of tuples `(ro, ro2, i)` in order of appearance, where `ro` is the correlation
    of the past windows, `ro2` the correlation of the future windows and `i` the position
    in `rnn_output_series` where the future window starts.

  Notes:
    * Series.corr aligns both Series on their index, so every window is re-indexed from
      0 and only overlapping positions are compared. Windows of different length are
      therefore allowed; near the end of the series the future window is shorter.
    * An undefined correlation (NaN) never exceeds the threshold and is skipped.
    * Errors raised while slicing or correlating are no longer swallowed; they propagate
      to the caller instead of being replaced by a generic message.
  """
  known_past=apriori_window.reset_index(drop=True)
  known_future=aposteriori_window.reset_index(drop=True)

  matches=[]
  for i in range(periods,len(rnn_output_series)):
    # .iloc makes the positional slicing explicit, whatever the index of the series is.
    past_predicted_window=rnn_output_series.iloc[i-periods:i].reset_index(drop=True)
    ro=known_past.corr(past_predicted_window)

    # `not ro > x` (instead of `ro <= x`) also rejects NaN.
    if not ro > min_correlation:
      continue

    future_predicted_window=rnn_output_series.iloc[i:i+periods].reset_index(drop=True)
    ro2=known_future.corr(future_predicted_window)
    if verbose:
      print(i,ro,ro2)
    matches.append((ro,ro2,i))

  return matches


In [None]:
"""
aposteriori_window=[1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,8,9,10]
future_predicted_window=[1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,8,9,10]

a=pd.Series(aposteriori_window)
b=pd.Series(future_predicted_window)

print(a[10-10:10])

a.corr(b)
"""

'\naposteriori_window=[1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,8,9,10]\nfuture_predicted_window=[1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,8,9,10]\n\na=pd.Series(aposteriori_window)\nb=pd.Series(future_predicted_window)\n\nprint(a[10-10:10])\n\na.corr(b)\n'

In [None]:
# Scan the whole prediction series for windows that resemble the known a priori window.
# NOTE(review): the threshold is passed literally as 0.9; the `min_correlation` variable
# set in the parameters cell (0.75) is not used here. Confirm which value is intended.
index_at_rnn_output=correlation(apriori_window,aposteriori_window,rnn_output_series=prediction,periods=len(apriori_window),min_correlation=0.9)

387 0.9216543554747596 -0.32612055421192515
1483 0.9216726954188158 -0.33409971350316364
4035 0.9216726954188158 0.39210772001567246
4420 0.9216726954188158 -0.33409971350316364
4530 0.9308371776103368 -0.0011916273577526658
5952 0.9216726954188158 0.011464213439662694
7375 0.9999999999999999 0.7281808419721604
9119 0.9402357001607139 0.18134557638269708
9210 0.9999999999999999 0.06401768493392142
12773 0.9216726954188158 -0.33409971350316364
22879 0.9100775187009009 0.16195766083435523
23385 0.9216726954188158 -0.016137311784445666
26489 0.9005045998777164 0.18134557638269708
26605 0.9216726954188158 -0.33409971350316364
29171 0.9005045998777164 0.18134557638269708
30533 0.9999999999999999 -0.027075520012682985


In [None]:
#index_at_rnn_output_df=pd.DataFrame(index_at_rnn_output)

In [None]:
#index_at_rnn_output_mean=index_at_rnn_output_df[0].mean()
#index_at_rnn_output_mean

In [None]:
#min_avg_diff=min([abs(x - index_at_rnn_output_mean) for x in index_at_rnn_output_df[0]]) #[x - n for x in a]

#min_avg_diff=[]
#for i,x in enumerate(index_at_rnn_output_df[0]):
#  min_avg_diff.append(abs(x - index_at_rnn_output_mean))
#  print(i,abs(x - index_at_rnn_output_mean))

In [None]:
#index_at_rnn_output.index(min_avg_diff)

In [None]:

#Rank the matches returned by correlation(). Each entry is a tuple (ro, ro2, index), so
#tuples compare by a priori correlation first, ties broken by ro2, then by index.
if not index_at_rnn_output:
  raise ValueError('correlation() returned no matching window; try a lower min_correlation')

#Sort a copy: index_at_rnn_output is left untouched, so this cell can be re-run safely.
ranked_matches=sorted(index_at_rnn_output,reverse=True)

#Best match: highest correlation between the a priori window and the RNN output.
best_match=ranked_matches[0]
max_apriori_corr_at_index=best_match[2]
print(best_match)
print(max_apriori_corr_at_index)

#Second-best match, reported for comparison only. max_apriori_corr_at_index keeps
#pointing at the best match instead of being overwritten.
if len(ranked_matches)>1:
  second_best_match=ranked_matches[1]
  second_apriori_corr_at_index=second_best_match[2]
  print(second_best_match)
  print(second_apriori_corr_at_index)



(0.9999999999999999, 0.7281808419721604, 7375)
7375
(0.9999999999999999, 0.06401768493392142, 9210)
9210


In [None]:
#Position in `prediction` where the selected future window starts.
#NOTE(review): hard-coded for this run; the dynamic alternative is max_apriori_corr_at_index.
match_position=7375
window_length=len(apriori_window)

#Take window_length values before the match and window_length-1 values from it onwards.
first_position=match_position-window_length
last_position=match_position+window_length-1
target_window=prediction[first_position:last_position]

Line chart: real sequence (CPD) vs. predicted sequence (Algorithm)

In [None]:
# Long-format data for plotting: one row per (date, series), tagged by `code`.

# Real sequence ('CPD'), indexed by its own dates.
df=pd.DataFrame({'value':list(test_window),'code':'CPD'},index=test_window.index)

# Predicted sequence ('Algorithm'). It takes the dates of the real sequence so both lines
# share the x axis; pandas raises if the two windows differ in length.
df1=pd.DataFrame({'value':list(target_window),'code':'Algorithm'},index=test_window.index)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way
# (real rows first, predicted rows after).
df3=pd.concat([df,df1])

# NOTE(review): the file name is appended directly to the 'plots' path, so the file is
# written as <path>/data/prediction/plots<end_date>.txt (no separator). Kept as is;
# confirm whether a file inside a 'plots' folder was intended.
path_to_save_plot=os.path.join(path,'data','prediction','plots')
df3.to_csv(path_to_save_plot+str(end_date)+'.txt')


In [None]:
import plotly.express as px

# Axis/legend captions, keyed by the column (or argument) they replace.
axis_labels={
    "value": "Normalized crime count",
    "code": "CS",
    "x": "Date",
}

# One line per value of `code`, plotted against the date index.
fig=px.line(df3, x=df3.index, y="value", color='code', labels=axis_labels)

# Centered title at the top of the figure.
title_settings={
    'text': "Last date known to the system: "+str(end_date),
    'y': 0.95,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
}

# Legend anchored to the top-left corner of the plotting area.
legend_settings=dict(yanchor="top", y=0.99, xanchor="left", x=0.01)

fig.update_layout(title=title_settings, legend=legend_settings)

fig.show()

In [None]:
# Scatter plot of the real sequence (x) against the selected predicted window (y).
# x and y given as array_like objects
import plotly.express as px
fig = px.scatter(x=test_window, y=target_window)
fig.show()

In [None]:
from math import sqrt

from sklearn.metrics import mean_squared_error

test=df['value']
predictions=df1['value']

# Mean squared error computed directly with pandas. This is the MSE; its square root,
# not its square, is the RMS error.
mse=test.sub(predictions).pow(2).mean()
print(mse)

# Per-date relative error of the prediction with respect to the real value
# (kept in a variable instead of being computed and thrown away).
relative_error=(test-predictions)/test

# Root mean squared error via scikit-learn, cross-checked against the pandas value.
rmse=sqrt(mean_squared_error(test, predictions))
assert abs(rmse-sqrt(mse))<1e-9, 'pandas and scikit-learn disagree on the RMSE'
print('RMSE: %.3f' % rmse)



0.003554789473684211
1.2636528202216068e-05
RMSE: 0.060


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# df3 holds the real rows first (len(test_window) of them) and the predicted rows after.
n_real=len(test_window)
real_rows=df3.iloc[:n_real]
predicted_rows=df3.iloc[n_real:]

subplot_title='Last date known to the system '+str(end_date)
fig=make_subplots(rows=3, cols=1, subplot_titles=(subplot_title,)*3)

# Row 1: real and predicted sequences as two separate, named traces.
fig.add_trace(go.Scatter(
    x=list(real_rows.index),
    y=list(real_rows['value']),
    name='CPD',
), row=1, col=1)
fig.add_trace(go.Scatter(
    x=list(predicted_rows.index),
    y=list(predicted_rows['value']),
    name='Algorithm',
), row=1, col=1)

# Rows 2 and 3: the whole of df3 as a single trace each.
# NOTE(review): both rows show the same data; confirm whether row 3 was meant to differ.
for subplot_row in (2, 3):
  fig.add_trace(go.Scatter(
      x=list(df3.index),
      y=list(df3['value']),
  ), row=subplot_row, col=1)

# Same axis captions on every subplot.
for subplot_row in (1, 2, 3):
  fig.update_xaxes(title_text="Date", row=subplot_row, col=1)
  fig.update_yaxes(title_text="Normalized crime count", row=subplot_row, col=1)

fig.update_layout(height=800, width=800, title_text="Stacked Subplots")
fig.show()

In [None]:
# First half of df3 (the rows tagged 'CPD') and second half (the rows tagged 'Algorithm'),
# each re-indexed with the dates of test_window.
# NOTE(review): a cell only displays its last expression, so the result of the first line
# is discarded; neither result is assigned, so df3 itself is unchanged.
df3[:int(len(df3)/2)].set_index(test_window.index,drop=True)
df3[int(len(df3)/2):].set_index(test_window.index,drop=True)

Unnamed: 0,value,code
2018-10-23,0.214,Algorithm
2018-10-24,0.286,Algorithm
2018-10-25,0.107,Algorithm
2018-10-26,0.214,Algorithm
2018-10-27,0.25,Algorithm
2018-10-28,0.214,Algorithm
2018-10-29,0.214,Algorithm
2018-10-30,0.286,Algorithm
2018-10-31,0.286,Algorithm
2018-11-01,0.143,Algorithm


In [None]:
# Display the combined frame: real rows first, predicted rows after.
df3

Unnamed: 0,value,code
2018-10-23,0.214,CPD
2018-10-24,0.286,CPD
2018-10-25,0.107,CPD
2018-10-26,0.214,CPD
2018-10-27,0.25,CPD
2018-10-28,0.214,CPD
2018-10-29,0.214,CPD
2018-10-30,0.286,CPD
2018-10-31,0.286,CPD
2018-11-01,0.143,CPD


In [None]:
# All plotted values as a plain list (real values followed by predicted values).
list(df3['value'])

[0.214,
 0.28600000000000003,
 0.107,
 0.214,
 0.25,
 0.214,
 0.214,
 0.28600000000000003,
 0.28600000000000003,
 0.14300000000000002,
 0.214,
 0.071,
 0.14300000000000002,
 0.14300000000000002,
 0.214,
 0.14300000000000002,
 0.071,
 0.179,
 0.25,
 0.214,
 0.28600000000000003,
 0.107,
 0.214,
 0.25,
 0.214,
 0.214,
 0.28600000000000003,
 0.28600000000000003,
 0.14300000000000002,
 0.214,
 0.071,
 0.14300000000000002,
 0.14300000000000002,
 0.214,
 0.14300000000000002,
 0.071,
 0.429,
 0.321]