<a href="https://colab.research.google.com/github/uteyechea/crime-prediction-using-artificial-intelligence/blob/master/temporal_autocorrelation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os
import gc

import pandas as pd
from scipy import stats

# Mount Google Drive (Colab only); force_remount=True remounts even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Project root inside the mounted Drive; later cells join data paths onto it.
path='/content/drive/My Drive/Colab Notebooks/crime_prediction'

Mounted at /content/drive


Get a window of length n that ends at t, i.e. the rows in the range [t-n, t), where t marks the end of the window and n is the length of the temporal window in a data series.

In [6]:
# Load the theft data; the 'Date' column is parsed as datetimes and becomes the index.
file_path=os.path.join(path,'data','theft.csv')
file=pd.read_csv(file_path,sep=',',parse_dates=['Date'],index_col='Date')

In [7]:
# Sanity check: evaluates to True if any cell of the loaded frame is null.
file.isnull().values.any() # No nulls

False

In [8]:
# Positional index of a column given its label (the conversion series_window applies to string columns).
file.columns.get_loc('zone4')

4

In [9]:
# Reverse lookup: the column label stored at a given position.
file.columns[3]

'zone3'

In [10]:
# The exact-type comparison series_window uses to tell a column label from a position.
type('zone3')==str

True

Series correlation

In [13]:
def series_window(dataframe,end,periods,column):
  """Return the `periods` rows of one column that end just before row `end`.

  Parameters
  ----------
  dataframe : pandas.DataFrame
    Frame to slice positionally.
  end : int
    Exclusive positional end of the window (as used by ``iloc``).
  periods : int
    Number of rows in the window.
  column : str or int
    Column label (converted with ``columns.get_loc``) or positional index.

  Returns
  -------
  pandas.Series
    ``dataframe.iloc[end-periods:end, column]``. The window is shorter than
    `periods` when fewer than `periods` rows exist before `end`.
  """
  # isinstance also accepts str subclasses, which the exact-type comparison
  # rejected and then handed to iloc as if they were positions.
  if isinstance(column,str):
    column=dataframe.columns.get_loc(column)
  start=end-periods
  # A negative start combined with a non-negative end makes iloc count from
  # the end of the frame and return the wrong (often empty) rows, so clamp
  # the start to the first row instead. Negative `end` values keep their
  # usual from-the-end meaning.
  if end>=0 and start<0:
    start=0
  return dataframe.iloc[start:end,column]

def series_correlation(series1,series2):
  """Correlate two series position by position, ignoring their index labels.

  Both inputs get a fresh 0..len-1 index first, so ``Series.corr`` pairs the
  values by position instead of by their original labels.

  Returns the value of ``Series.corr`` called with its default arguments.
  """
  left=series1.reset_index(drop=True)
  right=series2.reset_index(drop=True)
  return left.corr(right)

def correlated_series_timestamp(dataframe,end,periods,column,min_correlation):
  """Collect index labels whose earlier window correlates with the reference window.

  The reference window is the `periods` rows of `column` ending at position
  `end`. Positions are scanned from ``end-1`` down to ``2*periods``; for each
  position ``epoch`` the candidate window is the `periods` rows ending at
  position ``epoch-periods``. When its correlation with the reference window
  is at least `min_correlation`, ``dataframe.index[epoch]`` is recorded.

  Returns a list of index labels, latest position first. These labels are
  later used to generate the sequences that go as input to the RNN.
  """
  reference=series_window(dataframe,end,periods,column)
  # NOTE(review): the candidate window ends `periods` rows before `epoch`,
  # while the label that gets recorded is the one at `epoch` itself --
  # confirm this offset is intended.
  return [
    dataframe.index[epoch]
    for epoch in reversed(range(2*periods,end))
    if series_correlation(
      series_window(dataframe,epoch-periods,periods,column),
      reference,
    ) >= min_correlation
  ]

def correlated_series_sequence(dataframe,end,periods,column,min_correlation):
  """Cut the slices of `column` before and after every correlated timestamp.

  Timestamps come from ``correlated_series_timestamp`` called with the same
  arguments. `dataframe` must be indexed by dates, since the slices are
  selected by daily date labels.

  Parameters
  ----------
  dataframe : pandas.DataFrame
  end, periods, min_correlation
    Forwarded to ``correlated_series_timestamp``; `periods` also sets the
    slice lengths.
  column : str or int
    Column label, or a position that is translated to its label.

  Returns
  -------
  (dict, dict)
    ``input[timestamp]`` holds the `periods` daily values ending at
    `timestamp`, ordered from `timestamp` backwards.
    ``output[timestamp]`` holds the ``periods-1`` daily values that follow
    `timestamp`.
    A timestamp for which any required date is missing from the index is
    reported with a printed message and left out of BOTH dicts.
  """
  timestamps=correlated_series_timestamp(dataframe,end,periods,column,min_correlation)
  input={}
  output={}
  if isinstance(column,int):
    column=dataframe.columns[column]
  for timestamp in timestamps:
    past_dates=pd.date_range(start=timestamp,periods=periods,freq='-1D')
    # Dropping the first element excludes `timestamp` itself. This replaces
    # date_range(..., closed='right'), a keyword removed in pandas 2.0 whose
    # TypeError the former bare `except:` reported as a missing date.
    # NOTE(review): this yields periods-1 dates, whereas
    # get_correlated_dataframe_slice requests n+1 dates to obtain n --
    # confirm which length is intended.
    future_dates=pd.date_range(start=timestamp,periods=periods,freq='1D')[1:]
    try:
      past=dataframe.loc[past_dates,column]
      future=dataframe.loc[future_dates,column]
    except KeyError:
      # Only a genuinely absent date label is treated as "missing"; any other
      # error now propagates instead of being silently swallowed.
      print('Missing date in DataFrame')
      continue
    # Store both halves together so the two dicts always share the same keys.
    input[timestamp]=past
    output[timestamp]=future
  return input,output

def get_IO_series(dataframe,end,periods,column,min_correlation):
  """Assemble one chronological series per correlated timestamp.

  For every timestamp returned by ``correlated_series_timestamp`` the input
  slice (stored from the timestamp backwards) is reversed into chronological
  order and the output slice is appended after it.

  Parameters are forwarded unchanged to ``correlated_series_timestamp`` and
  ``correlated_series_sequence``.

  Returns
  -------
  dict
    Maps each timestamp to its combined ``pandas.Series``. Timestamps lacking
    an input or output slice are reported with a printed message and skipped.
  """
  timestamps=correlated_series_timestamp(dataframe,end,periods,column,min_correlation)
  input,output=correlated_series_sequence(dataframe,end,periods,column,min_correlation)
  sequence={}
  for timestamp in timestamps: #Iterate over the timestamps rather than the input keys
    try:
      past,future=input[timestamp],output[timestamp]
    except KeyError:
      print('Error with key',str(timestamp))
      continue
    # pd.concat replaces Series.append, which was removed in pandas 2.0; under
    # the former bare `except:` that removal showed up as an error for every key.
    sequence[timestamp]=pd.concat([past[::-1],future])
  return sequence

def save_to_file(sequence,file_path):
  """Append every entry of `sequence` to the CSV file at `file_path`.

  Each value of the mapping is written with ``to_csv`` (no header, no index)
  and is followed by an empty line. The file is opened in append mode, so
  content already present in it is kept.
  """
  with open(file_path,'a') as handle:
    for block in sequence.values():
      block.to_csv(handle,mode='a',header=False,index=False)
      handle.write('\n')

In [22]:
# Build the sequences for column position 11 with 10-row windows, keeping windows whose
# correlation is at least 0.85; the reference window ends 10 rows before the end of the data.
sequence=get_IO_series(file,end=len(file)-10,periods=10,column=11,min_correlation=0.85)

Missing date in DataFrame
Missing date in DataFrame
Error with key 2001-07-06 00:00:00
Error with key 2001-04-02 00:00:00


In [23]:
# Number of timestamps for which a combined sequence was assembled.
len(sequence)

5

In [27]:
# Append the series-based sequences to the training file.
# Note that `file_path` is rebound here; it previously pointed at the source CSV.
file_path=os.path.join(path,'data','training','theft2.csv')
save_to_file(sequence,file_path)

In [28]:
# Display the assembled sequences (a dict keyed by timestamp).
sequence

{Timestamp('2004-07-19 00:00:00'): 2004-07-10    0.62500
 2004-07-11    0.37500
 2004-07-12    0.50000
 2004-07-13    0.62500
 2004-07-14    0.65625
 2004-07-15    0.40625
 2004-07-16    0.65625
 2004-07-17    0.40625
 2004-07-18    0.46875
 2004-07-19    0.46875
 2004-07-20    0.37500
 2004-07-21    0.50000
 2004-07-22    0.28125
 2004-07-23    0.46875
 2004-07-24    0.46875
 2004-07-25    0.53125
 2004-07-26    0.34375
 2004-07-27    0.65625
 2004-07-28    0.31250
 Freq: D, Name: zone11, dtype: float64,
 Timestamp('2007-07-14 00:00:00'): 2007-07-05    0.65625
 2007-07-06    0.59375
 2007-07-07    0.53125
 2007-07-08    0.31250
 2007-07-09    0.62500
 2007-07-10    0.43750
 2007-07-11    0.53125
 2007-07-12    0.43750
 2007-07-13    0.56250
 2007-07-14    0.21875
 2007-07-15    0.37500
 2007-07-16    0.59375
 2007-07-17    0.43750
 2007-07-18    0.31250
 2007-07-19    0.53125
 2007-07-20    0.37500
 2007-07-21    0.34375
 2007-07-22    0.43750
 2007-07-23    0.40625
 Freq: D, Name: zo

DataFrame correlation

In [None]:
def get_window(dataframe,t,n):
  """Return the `n` rows (all columns) that end just before row `t`.

  Parameters
  ----------
  dataframe : pandas.DataFrame
  t : int
    Exclusive positional end of the window.
  n : int
    Number of rows in the window.

  Returns
  -------
  pandas.DataFrame
    ``dataframe.iloc[t-n:t, :]``. The window is shorter than `n` when fewer
    than `n` rows exist before `t`.
  """
  #if type(t) == int elif type(t)==str then use .loc, instead of iloc. For the time being, we will just assume t and n are integers
  start=t-n
  # A negative start combined with a non-negative end makes iloc count from
  # the end of the frame and return the wrong rows, so clamp the start to the
  # first row instead. Negative `t` values keep their from-the-end meaning.
  if t>=0 and start<0:
    start=0
  return dataframe.iloc[start:t,:]

def get_correlation(dataframe1,dataframe2):
  """Mean column-wise correlation between two frames of identical shape.

  `dataframe2` is given the index of `dataframe1` so that ``corrwith`` pairs
  the rows by position; the per-column correlations are then averaged into a
  single number covering all columns.

  Raises AssertionError when the two shapes differ.
  """
  assert dataframe1.shape==dataframe2.shape,'Dataframes must have the same shape'
  aligned=dataframe2.set_index(dataframe1.index) # force row alignment
  per_column=dataframe1.corrwith(aligned,axis=0)
  return per_column.mean()

def get_correlated_endof_sequence_timestamp(dataframe,t,n=10,min_correlation=0.5):
  """Collect index labels whose earlier window correlates with the reference window.

  The reference window is the `n` rows ending at position `t`. Positions are
  scanned from ``t-1`` down to ``2*n``; for each position ``epoch`` the
  candidate window is the `n` rows ending at position ``epoch-n``. When its
  correlation with the reference window is at least `min_correlation`,
  ``dataframe.index[epoch]`` is recorded.

  Returns a list of index labels, latest position first. These labels are
  later used to generate the sequences that go as input to the RNN.
  """
  reference=get_window(dataframe,t,n)
  # NOTE(review): the candidate window ends `n` rows before `epoch`, while the
  # label that gets recorded is the one at `epoch` itself -- confirm intended.
  return [
    dataframe.index[epoch]
    for epoch in reversed(range(2*n,t))
    if get_correlation(get_window(dataframe,epoch-n,n),reference) >= min_correlation
  ]

def get_correlated_dataframe_slice(dataframe,endof_sequence_timestamp,n):
  """Cut the rows before and after each timestamp out of a date-indexed frame.

  Parameters
  ----------
  dataframe : pandas.DataFrame
    Must be indexed by dates, since rows are selected by daily date labels.
  endof_sequence_timestamp : iterable of timestamps
    Anchor dates to slice around.
  n : int
    Number of days in each slice.

  Returns
  -------
  (dict, dict)
    ``input[timestamp]`` holds the `n` daily rows ending at `timestamp`,
    ordered from `timestamp` backwards.
    ``output[timestamp]`` holds the `n` daily rows that follow `timestamp`.
    A timestamp for which any required date is missing from the index is
    reported with a printed message and left out of BOTH dicts.
  """
  input={}
  output={}
  for timestamp in endof_sequence_timestamp:
    past_dates=pd.date_range(start=timestamp,periods=n,freq='-1D')
    # n+1 dates starting at `timestamp`, minus the first one, gives the n days
    # after it. This replaces date_range(..., closed='right'), a keyword
    # removed in pandas 2.0 whose TypeError the former bare `except:` reported
    # as missing dates.
    future_dates=pd.date_range(start=timestamp,periods=n+1,freq='1D')[1:]
    try:
      past=dataframe.loc[past_dates]
      future=dataframe.loc[future_dates]
    except KeyError:
      # Only a genuinely absent date label is treated as "missing"; any other
      # error now propagates instead of being silently swallowed.
      print('Missing dates at ', str(timestamp))
      continue
    # Store both halves together so the two dicts always share the same keys.
    input[timestamp]=past
    output[timestamp]=future
  return input,output

def get_IO_sequence(dataframe,periods,min_correlation=0.5):
  """Assemble one chronological frame per correlated timestamp.

  The reference window is the last `periods` rows of `dataframe` (the search
  is run with ``t=len(dataframe)``). For every correlated timestamp the input
  slice (stored from the timestamp backwards) is reversed into chronological
  order and the output slice is appended after it.

  Parameters
  ----------
  dataframe : pandas.DataFrame
    Date-indexed frame.
  periods : int
    Window length, also the length of each input and output slice.
  min_correlation : float
    Minimum mean correlation for a window to be kept.

  Returns
  -------
  dict
    Maps each timestamp to its combined ``pandas.DataFrame``. Timestamps
    lacking an input or output slice are reported with a printed message and
    skipped.
  """
  endof_sequence_timestamp=get_correlated_endof_sequence_timestamp(dataframe,len(dataframe),periods,min_correlation)
  input,output=get_correlated_dataframe_slice(dataframe,endof_sequence_timestamp,periods)
  sequence={}
  for key in endof_sequence_timestamp: #Iterate over the timestamps rather than the input keys
    try:
      past,future=input[key],output[key]
    except KeyError:
      print('Error with key',str(key))
      continue
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # under the former bare `except:` that removal showed up as an error for
    # every key.
    sequence[key]=pd.concat([past[::-1],future])
  return sequence

def save_to_file(sequence,file_path):
  """Append every entry of `sequence` to the CSV file at `file_path`.

  Each entry is written with ``to_csv`` (no header, no index) and is followed
  by an empty line. The file is opened in append mode, so existing content
  is kept.
  """
  # NOTE(review): this redefines the same-named, equivalent function declared
  # earlier in the notebook; the later definition is the one in effect.
  with open(file_path,'a') as out:
    for key in sequence:
      frame=sequence[key]
      frame.to_csv(out,mode='a',header=False,index=False)
      out.write('\n')

It would be ideal to save, for each zone, a list of the dates where the correlation is high. Nevertheless, we will defer this to future work.

In [None]:
# Build the all-column sequences with 10-row windows and a minimum mean correlation of 0.25.
# This rebinds `sequence`, replacing the series-based result from the earlier cell.
sequence=get_IO_sequence(file,periods=10,min_correlation=0.25)

Missing dates at  2001-12-18 00:00:00
Missing dates at  2001-09-18 00:00:00
Missing dates at  2001-08-24 00:00:00
Missing dates at  2001-05-20 00:00:00
Error with key 2001-12-18 00:00:00
Error with key 2001-09-18 00:00:00
Error with key 2001-08-24 00:00:00
Error with key 2001-05-20 00:00:00


In [None]:
# Display the assembled sequences (a dict keyed by timestamp).
sequence

{Timestamp('2002-02-17 00:00:00'):                zone1     zone2     zone3  ...   zone17    zone18    zone19
 2002-02-08 -1.835100 -2.006852 -2.110972  ... -2.14365 -1.750917 -2.537583
 2002-02-09 -1.614799 -1.817576 -2.110972  ... -1.98516 -1.750917 -2.537583
 2002-02-10 -1.614799 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
 2002-02-11 -1.835100 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
 2002-02-12 -1.835100 -1.817576 -2.242696  ... -2.14365 -1.750917 -2.537583
 2002-02-13 -1.614799 -1.817576 -2.110972  ... -2.14365 -1.750917 -2.456650
 2002-02-14 -1.835100 -2.006852 -2.242696  ... -1.98516 -1.750917 -2.294784
 2002-02-15 -1.835100 -2.006852 -2.110972  ... -2.14365 -1.750917 -2.537583
 2002-02-16 -1.835100 -1.817576 -2.242696  ... -2.14365 -1.750917 -2.456650
 2002-02-17 -1.835100 -2.006852 -2.242696  ... -2.14365 -1.750917 -2.537583
 2002-02-18 -1.835100 -2.006852 -2.242696  ... -2.14365 -1.432137 -2.537583
 2002-02-19 -1.835100 -2.006852 -2.110972  ... -2.1436

In [None]:
# Append the DataFrame-based sequences to the training file.
# NOTE(review): this is the same target path the series-based cell writes to, and
# save_to_file opens the file in append mode, so both outputs accumulate in one
# file and re-running the cell adds the rows again -- confirm this is intended.
file_path=os.path.join(path,'data','training','theft2.csv')
save_to_file(sequence,file_path)