<a href="https://colab.research.google.com/github/uteyechea/crime-prediction-using-artificial-intelligence/blob/master/temporal_correlation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import gc

import pandas as pd
from scipy import stats

from google.colab import drive
drive.mount('/content/drive', force_remount=True)

path='/content/drive/My Drive/Colab Notebooks/crime_prediction'

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


Get a window of length t-n, where t is the end datetime and n is the length of the temporal window in a data series.

In [2]:
file_path=os.path.join(path,'data','theft.csv')
file=pd.read_csv(file_path,sep=',',parse_dates=['Date'],index_col='Date')

In [28]:
file.isnull().values.any() # No nulls

False

In [102]:
def get_window(dataframe,t,n):
  #if type(t) == int elif type(t)==str then use .loc, instead of iloc. For the time being, we will just assume t and n are integers 
  return dataframe.iloc[t-n:t,:]

def get_correlation(dataframe1,dataframe2):
  assert dataframe1.shape==dataframe2.shape,'Dataframes must have the same shape'
  ro=dataframe1.corrwith(dataframe2.set_index(dataframe1.index),axis=0) #Force alignment. Make sure size is the same for both dataframes
  return ro.mean() #For the time being we will stop with a general correlation among all zones

def get_correlated_endof_sequence_timestamp(dataframe,t,n=10,min_correlation=0.5):
  endof_sequence_timestamp=[]
  dataframe2=get_window(dataframe,t,n)
  for epoch in reversed(range(2*n,t)):
    dataframe1=get_window(dataframe,epoch-n,n)
    correlation=get_correlation(dataframe1,dataframe2)
    if correlation >= min_correlation:
      #record datetime value. We will use this datetime value to generate all sequences that will go as input to the RNN
      #print(dataframe.index[epoch]) 
      endof_sequence_timestamp.append(dataframe.index[epoch])
  return endof_sequence_timestamp

def get_correlated_dataframe_slice(dataframe,endof_sequence_timestamp,n):
  input={}
  output={}
  for timestamp in endof_sequence_timestamp:
    try:
      input[timestamp]=file.loc[pd.date_range(start=timestamp,periods=n,freq='-1D')]
      output[timestamp]=file.loc[pd.date_range(start=timestamp,periods=n+1,freq='1D',closed='right')]
    except:
      print('Missing dates at ', str(timestamp))
  #df with all inputs and outputs  
  return input,output

def get_correlated_series(dataframe,column_name,endof_sequence_timestamp):
  sequence={}
  for timestamp in endof_sequence_timestamp:
    input[timestamp]=file.loc[pd.date_range(start=timestamp,periods=n,freq='-1D'),column_name]
    output[timestamp]=file.loc[pd.date_range(start=timestamp,periods=n+1,freq='1D',closed='right'),column_name]

def get_IO_series(dataframe,periods,min_correlation=0.5):
  endof_sequence_timestamp=get_correlated_endof_sequence_timestamp(dataframe,len(dataframe),periods,min_correlation)
  input,output=get_correlated_dataframe_slice(dataframe,endof_sequence_timestamp,periods)
  return input,output  



In [84]:
print(input.update(output))

In [103]:
input,output=get_IO_series(file,periods=10,min_correlation=0.25)

Missing dates at  2001-12-18 00:00:00
Missing dates at  2001-09-18 00:00:00
Missing dates at  2001-08-24 00:00:00
Missing dates at  2001-05-20 00:00:00


It would be ideal to save a list of dates for each zone where the correlation is high. Nevertheless, we will demote this as further work. 

In [108]:
from collections import defaultdict
sequence=defaultdict(list)
for key in input: #Change from input to timestamp sequence
  try:
    #sequence[key].append(input[key][::-1])
    #sequence[key].append(output[key])
    #sequence[key]=input[key][::-1]+output[key]
    sequence[key]= input[key][::-1].append(output[key])
  except:
    print('Error with key',str(key))

Error with key 2001-08-24 00:00:00


In [109]:
sequence

defaultdict(list,
            {Timestamp('2002-02-17 00:00:00'):                zone1     zone2     zone3  ...   zone17    zone18    zone19
             2002-02-08 -1.835100 -2.006852 -2.110972  ... -2.14365 -1.750917 -2.537583
             2002-02-09 -1.614799 -1.817576 -2.110972  ... -1.98516 -1.750917 -2.537583
             2002-02-10 -1.614799 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
             2002-02-11 -1.835100 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
             2002-02-12 -1.835100 -1.817576 -2.242696  ... -2.14365 -1.750917 -2.537583
             2002-02-13 -1.614799 -1.817576 -2.110972  ... -2.14365 -1.750917 -2.456650
             2002-02-14 -1.835100 -2.006852 -2.242696  ... -1.98516 -1.750917 -2.294784
             2002-02-15 -1.835100 -2.006852 -2.110972  ... -2.14365 -1.750917 -2.537583
             2002-02-16 -1.835100 -1.817576 -2.242696  ... -2.14365 -1.750917 -2.456650
             2002-02-17 -1.835100 -2.006852 -2.242696  ... -2.14365 

In [101]:
sequence

defaultdict(list,
            {Timestamp('2001-08-24 00:00:00'): [             zone1     zone2     zone3  ...   zone17    zone18    zone19
              2001-08-15 -1.8351 -1.817576 -2.110972  ... -2.14365 -1.750917 -2.537583
              2001-08-16 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
              2001-08-17 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.432137 -2.537583
              2001-08-18 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.456650
              2001-08-19 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
              2001-08-20 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.432137 -2.456650
              2001-08-21 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
              2001-08-22 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
              2001-08-23 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
              2001-08-24 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -

In [22]:
type(file.loc[:,'zone11'])

pandas.core.series.Series

In [71]:
endof_sequence_timestamp=get_correlated_endof_sequence_timestamp(file,len(file),n=10,min_correlation=0.25)

In [72]:
endof_sequence_timestamp

[Timestamp('2020-03-08 00:00:00'),
 Timestamp('2020-02-24 00:00:00'),
 Timestamp('2019-11-20 00:00:00'),
 Timestamp('2019-03-28 00:00:00'),
 Timestamp('2018-08-26 00:00:00'),
 Timestamp('2018-07-18 00:00:00'),
 Timestamp('2018-06-16 00:00:00'),
 Timestamp('2018-04-30 00:00:00'),
 Timestamp('2017-12-07 00:00:00'),
 Timestamp('2017-10-25 00:00:00'),
 Timestamp('2015-08-09 00:00:00'),
 Timestamp('2014-11-04 00:00:00'),
 Timestamp('2014-08-18 00:00:00'),
 Timestamp('2013-01-17 00:00:00'),
 Timestamp('2012-07-30 00:00:00'),
 Timestamp('2012-07-06 00:00:00'),
 Timestamp('2012-01-08 00:00:00'),
 Timestamp('2011-12-01 00:00:00'),
 Timestamp('2009-05-17 00:00:00'),
 Timestamp('2009-02-25 00:00:00'),
 Timestamp('2008-06-09 00:00:00'),
 Timestamp('2007-10-14 00:00:00'),
 Timestamp('2007-02-26 00:00:00'),
 Timestamp('2007-01-18 00:00:00'),
 Timestamp('2005-04-02 00:00:00'),
 Timestamp('2004-09-03 00:00:00'),
 Timestamp('2004-05-17 00:00:00'),
 Timestamp('2003-06-05 00:00:00'),
 Timestamp('2002-06-

In [78]:
get_correlated_dataframe_slice(file,endof_sequence_timestamp,n=10)

Missing dates at  2001-12-18 00:00:00
Missing dates at  2001-09-18 00:00:00
Missing dates at  2001-08-24 00:00:00
Missing dates at  2001-05-20 00:00:00


({Timestamp('2001-08-24 00:00:00'):              zone1     zone2     zone3  ...   zone17    zone18    zone19
  2001-08-24 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
  2001-08-23 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
  2001-08-22 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
  2001-08-21 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
  2001-08-20 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.432137 -2.456650
  2001-08-19 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
  2001-08-18 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.456650
  2001-08-17 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.432137 -2.537583
  2001-08-16 -1.8351 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
  2001-08-15 -1.8351 -1.817576 -2.110972  ... -2.14365 -1.750917 -2.537583
  
  [10 rows x 19 columns],
  Timestamp('2002-02-17 00:00:00'):                zone1     zone2     zone3  ...   zone17    zone18    zone19
 

In [75]:
for timestamp in endof_sequence_timestamp:
  try:
    print(timestamp)
    file.loc[pd.date_range(start=timestamp,periods=10,freq='1D')]
  except:
    print('missing dates in dataframe')

2020-03-08 00:00:00
2020-02-24 00:00:00
2019-11-20 00:00:00
2019-03-28 00:00:00
2018-08-26 00:00:00
2018-07-18 00:00:00
2018-06-16 00:00:00
2018-04-30 00:00:00
2017-12-07 00:00:00
2017-10-25 00:00:00
2015-08-09 00:00:00
2014-11-04 00:00:00
2014-08-18 00:00:00
2013-01-17 00:00:00
2012-07-30 00:00:00
2012-07-06 00:00:00
2012-01-08 00:00:00
2011-12-01 00:00:00
2009-05-17 00:00:00
2009-02-25 00:00:00
2008-06-09 00:00:00
2007-10-14 00:00:00
2007-02-26 00:00:00
2007-01-18 00:00:00
2005-04-02 00:00:00
2004-09-03 00:00:00
2004-05-17 00:00:00
2003-06-05 00:00:00
2002-06-06 00:00:00
2002-02-17 00:00:00
2001-12-18 00:00:00
2001-09-18 00:00:00
missing dates in dataframe
2001-08-24 00:00:00
missing dates in dataframe
2001-05-20 00:00:00
missing dates in dataframe


In [68]:
endof_sequence_timestamp

[]

In [56]:
file.loc['2016-11-04':'2016-11-13']

Unnamed: 0_level_0,zone1,zone2,zone3,zone4,zone5,zone6,zone7,zone8,zone9,zone10,zone11,zone12,zone13,zone14,zone15,zone16,zone17,zone18,zone19
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2016-11-04,0.367908,-1.060472,0.391791,-0.050849,0.658983,-1.352133,-0.064352,-1.258164,0.598476,-0.315154,0.179659,-0.288685,-0.736737,0.555634,-0.866708,-0.516249,0.392189,-0.157016,1.428137
2016-11-05,-0.953897,-0.68192,0.786964,-0.050849,0.658983,-0.493738,-1.564538,0.471272,-0.412465,-0.063167,-0.215602,0.466208,-0.249816,-0.049163,-0.419606,-1.109473,0.867659,-0.157016,1.832803
2016-11-06,-1.174198,-1.060472,-0.793728,-0.050849,-1.0368,-0.493738,-0.814445,-0.022852,-0.412465,-0.819129,-0.808493,-0.791946,0.724027,-0.049163,-1.090259,-0.318507,0.709169,-1.113357,1.266271
2016-11-07,0.808509,-0.68192,-0.793728,-0.050849,0.023065,-0.27914,-0.626922,-0.516977,-0.008089,-0.063167,-0.413232,-0.288685,-0.006355,1.160432,0.027496,-0.318507,0.075209,-0.794576,-0.109591
2016-11-08,-0.072694,0.26446,-1.45235,-0.050849,-0.824827,-0.708337,-1.001968,0.22421,-0.412465,-1.071117,-0.808493,-1.295207,-0.006355,-0.049163,-0.419606,-0.911732,-0.241771,0.161764,-1.242654
2016-11-09,0.147607,0.075184,-0.003382,-0.050849,0.658983,-0.27914,-0.626922,-0.022852,0.396288,-0.063167,0.97018,1.2211,-0.980198,0.555634,0.251047,-1.109473,-0.083281,-0.475796,-0.35239
2016-11-10,-0.733596,-0.492644,-0.003382,-0.050849,-0.612854,0.364656,-1.939584,-0.022852,-0.614653,-0.567142,0.77255,-1.043577,-0.493276,-0.049163,-0.643157,-1.109473,1.184639,-0.157016,0.861606
2016-11-11,-0.953897,-0.871196,0.260066,-0.050849,0.023065,-0.493738,-0.251875,0.718335,-0.210277,-0.315154,-0.610863,-0.037054,-0.736737,-0.65396,0.027496,0.076976,-0.083281,1.436885,0.69974
2016-11-12,-0.513295,-0.114092,-0.398555,-0.050849,-0.400881,0.579255,-0.814445,-0.022852,-1.625594,-1.575092,-0.017971,-0.288685,-0.980198,-0.049163,0.698149,-0.71399,0.709169,0.480545,0.942539
2016-11-13,-0.292995,-0.68192,-0.53028,-0.050849,-0.400881,-0.708337,-0.626922,-0.516977,0.1941,-0.063167,-1.006123,-1.295207,-0.493276,-0.049163,-1.31381,-0.71399,0.550679,-1.432137,-0.271457


In [55]:
pd.date_range(start='2016-11-04 00:00:00',periods=7,freq='1D')

DatetimeIndex(['2016-11-04', '2016-11-05', '2016-11-06', '2016-11-07',
               '2016-11-08', '2016-11-09', '2016-11-10'],
              dtype='datetime64[ns]', freq='D')

In [59]:
file.loc[pd.date_range(start='2016-11-04 00:00:00',periods=7,freq='1D')]

Unnamed: 0,zone1,zone2,zone3,zone4,zone5,zone6,zone7,zone8,zone9,zone10,zone11,zone12,zone13,zone14,zone15,zone16,zone17,zone18,zone19
2016-11-04,0.367908,-1.060472,0.391791,-0.050849,0.658983,-1.352133,-0.064352,-1.258164,0.598476,-0.315154,0.179659,-0.288685,-0.736737,0.555634,-0.866708,-0.516249,0.392189,-0.157016,1.428137
2016-11-05,-0.953897,-0.68192,0.786964,-0.050849,0.658983,-0.493738,-1.564538,0.471272,-0.412465,-0.063167,-0.215602,0.466208,-0.249816,-0.049163,-0.419606,-1.109473,0.867659,-0.157016,1.832803
2016-11-06,-1.174198,-1.060472,-0.793728,-0.050849,-1.0368,-0.493738,-0.814445,-0.022852,-0.412465,-0.819129,-0.808493,-0.791946,0.724027,-0.049163,-1.090259,-0.318507,0.709169,-1.113357,1.266271
2016-11-07,0.808509,-0.68192,-0.793728,-0.050849,0.023065,-0.27914,-0.626922,-0.516977,-0.008089,-0.063167,-0.413232,-0.288685,-0.006355,1.160432,0.027496,-0.318507,0.075209,-0.794576,-0.109591
2016-11-08,-0.072694,0.26446,-1.45235,-0.050849,-0.824827,-0.708337,-1.001968,0.22421,-0.412465,-1.071117,-0.808493,-1.295207,-0.006355,-0.049163,-0.419606,-0.911732,-0.241771,0.161764,-1.242654
2016-11-09,0.147607,0.075184,-0.003382,-0.050849,0.658983,-0.27914,-0.626922,-0.022852,0.396288,-0.063167,0.97018,1.2211,-0.980198,0.555634,0.251047,-1.109473,-0.083281,-0.475796,-0.35239
2016-11-10,-0.733596,-0.492644,-0.003382,-0.050849,-0.612854,0.364656,-1.939584,-0.022852,-0.614653,-0.567142,0.77255,-1.043577,-0.493276,-0.049163,-0.643157,-1.109473,1.184639,-0.157016,0.861606


In [79]:
input

{}

In [80]:
output

{}