In [1]:
import pandas as pd
import numpy as np
import datetime
import sklearn
from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.express as px
import plotly.graph_objects as go
import fbprophet
# Notebook-wide setting: make pandas' .plot() accessor render with plotly
# rather than the default matplotlib backend.
pd.options.plotting.backend = "plotly"
from plotly.subplots import make_subplots
from pandas.plotting import lag_plot
from matplotlib import pyplot
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

from source import *


### Importing the data
The data imported below was created by a function that reads each CSV file, creates a dataframe for each and then merges them.  Columns are converted to appropriate data types and any mismatches are fixed before merging.  This is important because Leeds City Council changed the file formats several times, which led to differences in column names and, potentially, data types.

In [2]:
# Load the pre-merged, gzip-compressed footfall CSV.
#   * the first column of the file becomes the index
#   * 'Date' and 'DateTime' are parsed as datetimes
#   * the BRC year / week-number columns are forced to integers
footfalldf_imported = pd.read_csv(
    "../data/footfall_merged.csv.gz",
    index_col=[0],
    parse_dates=['Date', 'DateTime'],
    dtype=dict(BRCYear=int, BRCWeekNum=int),
)


### Cleaning the data
The next step in the pipeline is to check for duplicates and remove them.  Initial data exploration revealed errors in some of the CSV files where individual records had been duplicated.  In some instances, the same records existed in several different files; for example, records dated early July appeared towards the end of the June CSV.

The cameras didn't all come online at the same time; the last one started recording on 27th August 2008.  To ensure meaningful comparability, any records before this date have been removed.

Finally, one of the cameras appears to have moved location on 31st May 2015, from Commercial Street at Lush to Commercial Street at Sharps.  The records from these two locations are combined and renamed to Commercial Street Combined.

In [3]:
# Cleaning pipeline: pass the already-imported frame through each cleaning
# helper (all provided by `from source import *`), one step per line and in
# the same order as before. Always starts from `footfalldf_imported`, so
# re-running this cell gives the same result.
footfalldf = footfalldf_imported.pipe(start_pipeline)
# '2008-08-27' is the date the last camera came online (see the notes above);
# presumably set_start_date discards earlier records -- confirm in `source`.
footfalldf = footfalldf.pipe(set_start_date, '2008-08-27')
footfalldf = footfalldf.pipe(combine_cameras)
footfalldf = footfalldf.pipe(check_remove_dup)
footfalldf = footfalldf.pipe(remove_new_cameras)
footfalldf = footfalldf.pipe(create_BRC_MonthNum)

# Month names in calendar order, kept handy in case the month ordering is
# ever lost when resampling or plotting.
Months = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]

Footfall hasn't changed when combining cameras
There are 0 duplicates left


#### Resample Data

First, resample the cleaned dataframe into daily, weekly and monthly total footfall.

In [4]:
def _resample_total_footfall(df, freq):
    """Total the 'Count' column of `df` per `freq` period of 'DateTime'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime 'DateTime' column and a numeric 'Count' column.
    freq : str
        pandas offset alias used both for grouping and for `asfreq`
        (this notebook uses 'D', 'W' and 'M').

    Returns
    -------
    pandas.DataFrame
        One 'Count' column indexed by period; periods whose total is 0 are
        absent from the result.
    """
    totals = df.groupby([pd.Grouper(key='DateTime', freq=freq)])['Count'].sum().to_frame()
    # Periods with a zero total (which includes bins with no records at all,
    # since an empty sum is 0) are treated as missing and removed.
    totals = totals.drop(totals[totals['Count'] == 0].index)
    # asfreq() re-inserts the removed periods as NaN rows and dropna() removes
    # them again, so gaps are dropped, NOT imputed. The round-trip is kept so
    # the result (including any int -> float upcast it causes) stays the same.
    # To impute instead, swap dropna() for .interpolate(method='time').
    return totals.asfreq(freq).dropna()


# Daily, monthly and weekly total footfall, all built with the same recipe.
day = _resample_total_footfall(footfalldf, 'D')
month = _resample_total_footfall(footfalldf, 'M')
week = _resample_total_footfall(footfalldf, 'W')
