# Step 2: Merging of grid and emergency calls data

**Goal of the step**: Merge the ambulance calls and the grid dataframes based on their geographical features, perform required data preparation and add the neighbourhood data to the dataframe.

**Step overview:**

1. Loading of the dataframes
2. Merging of the dataframes
3. Preparation of the data
4. Merging of the neighbourhood data

"m1" is the dataframe containing data about ambulance calls, and "m2" is the dataframe containing data about the grid.

In [None]:
import pandas as pd
import osmnx
import geopandas as gpd
import rioxarray
import xarray
import datashader
import contextily as cx
from shapely import geometry
from shapely import wkt
import matplotlib.pyplot as plt
import seaborn
from pysal.viz import splot
from splot.esda import plot_moran
import contextily
from pysal.explore import esda
from pysal.lib import weights
from numpy.random import seed
import os
import seaborn as sns
from scipy.stats import skew
import numpy as np

## 1. Data loading

In [None]:
# Load the ambulance-call records (m1) and discard the 'Unnamed: 0'
# column stored in the CSV

m1 = pd.read_csv("data/m1.csv").drop(columns='Unnamed: 0')

In [None]:
# Load the grid (m2). The folder name is written in lower case, like every
# other path in this notebook, so the read also works on case-sensitive
# file systems.

m2 = pd.read_csv("data/grid.csv")

## 2. Data merging

In [None]:
# Parse the call time stamps into pandas datetime values

m1 = m1.assign(pmeTimeStamp=pd.to_datetime(m1['pmeTimeStamp']))

In [None]:
# Parse the WKT strings of the calls into geometries and wrap the table
# in a GeoDataFrame that uses them as its geometry column

m1 = gpd.GeoDataFrame(
    m1.assign(geometry=gpd.GeoSeries.from_wkt(m1['geometry'])),
    geometry='geometry',
)

In [None]:
# Parse the WKT strings of the grid into geometries and wrap the table in
# a GeoDataFrame.
#
# A CSV carries no CRS metadata, so the CRS is declared explicitly here;
# otherwise the spatial join below compares a CRS-aware layer (the calls,
# reprojected to EPSG:28992) with a CRS-less one.
# NOTE(review): EPSG:28992 is assumed because the calls are reprojected to
# it right before being joined with the grid — confirm the grid coordinates
# are indeed in that CRS.

m2['geometry'] = gpd.GeoSeries.from_wkt(m2['geometry'])
m2 = gpd.GeoDataFrame(m2, geometry='geometry', crs='EPSG:28992')

In [None]:
# Declare the CRS of the call coordinates as EPSG:4326
# (this only labels the data, no coordinates are transformed)

m1 = m1.set_crs('EPSG:4326')

In [None]:
# Reproject the call coordinates to EPSG:28992

m1 = m1.to_crs(epsg=28992)

In [None]:
# Spatial join: attach to every call the attributes of the grid cell it
# lies within (default inner join, so calls outside every cell are dropped).
# `predicate` replaces the deprecated `op` keyword of geopandas.sjoin.

m1_m2 = gpd.sjoin(m1, m2, predicate='within')

In [None]:
# Print a summary of the joined dataframe
m1_m2.info()

In [None]:
# Count the joined calls per grid cell
calls = m1_m2.groupby('n_grid').size()

In [None]:
# Move the index back into a regular column
calls = calls.reset_index()

In [None]:
# Name the two columns: grid cell id and number of calls
calls = calls.set_axis(['n_grid', 'n_calls'], axis='columns')

In [None]:
# Attach the grid attributes to the call counts (inner merge on n_grid,
# so only cells present in both tables are kept)
df = calls.merge(m2, on='n_grid')

In [None]:
# Turn the merged table into a GeoDataFrame using its 'geometry' column
df = gpd.GeoDataFrame(df, geometry='geometry')

In [None]:
# Map of the number of calls per cell, classified with Fisher-Jenks breaks
df.plot(
    column='n_calls',
    scheme='fisher_jenks',
    legend=True,
)

## 3. Data preparation

In [None]:
# Histogram of the number of calls per cell, using 4 bins
df.n_calls.hist(bins=4)

In [None]:
# Pairwise scatter plots of the dataframe columns
sns.pairplot(data=df)

The number of calls appears to be significantly skewed.

In [None]:
# Box plot of the number of calls per cell
plt.boxplot(x=df['n_calls'])

In [None]:
# Bias-corrected sample skewness of the number of calls
n_calls_skewness = skew(df['n_calls'], bias=False)
print(n_calls_skewness)

In [None]:
# Pairwise scatter plots of the dataframe columns (same plot as earlier in this section)
sns.pairplot(data=df)

As expected, the skewness is high (3.9). To improve the model, a transformation such as log or square root can be applied to the feature, after which the outliers can be taken into account.

The maximum value is too distant from the rest of the distribution, so it will be removed from the dataframe.

In [None]:
# Drop every row whose n_calls equals the maximum (all ties are removed).
# An explicit copy is taken because new columns are assigned to df later;
# assigning into a filtered slice triggers pandas' SettingWithCopyWarning.
df = df.loc[df['n_calls'].ne(df['n_calls'].max())].copy()

In [None]:
# Bias-corrected sample skewness of the number of calls, recomputed on the current df
n_calls_skewness = skew(df['n_calls'], bias=False)
print(n_calls_skewness)

The skewness has already improved significantly.

We can apply a square root transformation.

In [None]:
# Square root of the number of calls, stored as 'sqrt_calls'
df = df.assign(sqrt_calls=np.sqrt(df['n_calls']))

In [None]:
# Bias-corrected sample skewness of the square-root-transformed calls
sqrt_calls_skewness = skew(df['sqrt_calls'], bias=False)
print(sqrt_calls_skewness)

In [None]:
# Save the current dataframe to CSV, without the index
df.to_csv(path_or_buf='data/df2.csv', index=False)

In [None]:
# Display the dataframe
df

In [None]:
# Square-root transformation of the number of calls.
# The column is derived from n_calls again, so re-running from this cell
# resets the standardised values written by the following step.

df = df.assign(sqrt_calls=np.sqrt(df['n_calls']))

In [None]:
# Standardise the square root of the number of calls:
# subtract the mean and divide by the (sample) standard deviation

sqrt_mean = df['sqrt_calls'].mean()
sqrt_std = df['sqrt_calls'].std()
df['sqrt_calls'] = (df['sqrt_calls'] - sqrt_mean) / sqrt_std

In [None]:
# Box plot of the 'sqrt_calls' column
plt.boxplot(x=df['sqrt_calls'])

In [None]:
# Bias-corrected sample skewness of the 'sqrt_calls' column
sqrt_calls_skewness = skew(df['sqrt_calls'], bias=False)
print(sqrt_calls_skewness)

The skewness is still significant. The boxplot shows a clear outlier, which will be removed from the analysis.

In [None]:
# Drop every row whose sqrt_calls equals the current maximum
df = df.loc[df['sqrt_calls'].ne(df['sqrt_calls'].max())]

In [None]:
# Bias-corrected sample skewness of 'sqrt_calls', recomputed on the current df
sqrt_calls_skewness = skew(df['sqrt_calls'], bias=False)
print(sqrt_calls_skewness)

Now the skewness is under control.

## 4. Merging of the neighbourhood data

In [None]:
# Read the neighbourhood polygons, then reproject them to EPSG:4326

neigh = gpd.read_file('data/neighborhoods_2018.json')
neigh = neigh.to_crs('EPSG:4326')

In [None]:
# Relabel the layer's CRS as EPSG:28992 WITHOUT transforming coordinates.
# set_crs(..., allow_override=True) is the supported way to replace an
# existing CRS; plain attribute assignment over an existing CRS is flagged
# as unsafe by geopandas.
# NOTE(review): the loading step reprojects to EPSG:4326 first, so this
# relabelling only yields valid EPSG:28992 geometries if the file's
# coordinates were mislabelled to begin with. If the file is correctly
# georeferenced, neigh.to_crs('EPSG:28992') is what is needed instead —
# confirm against the data.
neigh = neigh.set_crs('EPSG:28992', allow_override=True)

In [None]:
# Keep only the neighbourhood code and the geometry

relevant_columns = ['BU_CODE', 'geometry']
neigh = neigh[relevant_columns]

In [None]:
# Left spatial join: one output row per (grid cell, intersecting
# neighbourhood) pair; cells that intersect no neighbourhood are kept.
# `predicate` replaces the deprecated `op` keyword of geopandas.sjoin.

sjoin = gpd.sjoin(df, neigh, how='left', predicate='intersects')

Since each cell in the grid can partially overlap several neighbourhoods, each cell will be assigned to the neighbourhood with which it shares the largest overlapping area.

In [None]:
# Intersect the grid cells with the neighbourhoods: each output row is the
# part of one cell that falls inside one neighbourhood

merged = gpd.overlay(df1=df, df2=neigh, how='intersection')

In [None]:
# Area of every intersection piece, stored in the 'area' column

merged = merged.assign(area=merged.geometry.area)

In [None]:
# Keep, for every grid cell, the intersection piece with the largest area,
# i.e. assign each cell to the neighbourhood it overlaps most.
#
# Whole rows are kept via drop_duplicates: groupby().first() returns the
# first NON-NULL value of each column separately, so it can combine values
# from different rows when a column contains missing values.
# The result is ordered by n_grid, has a fresh index, and has n_grid as its
# leading column.

largest_overlap = (
    merged.sort_values(['n_grid', 'area'], ascending=[True, False])
    .drop_duplicates(subset='n_grid', keep='first')
    .reset_index(drop=True)
)
other_columns = [col for col in largest_overlap.columns if col != 'n_grid']
merged = largest_overlap[['n_grid'] + other_columns]

In [None]:
# Draw the neighbourhood outlines (black) and, on top of them, the
# outlines of the selected cell pieces (red) on one 10x10 figure

fig, ax = plt.subplots(figsize=(10, 10))
for layer, edge in ((neigh, 'black'), (merged, 'r')):
    layer.plot(ax=ax, color='none', edgecolor=edge)

In [None]:
# Save the final dataframe to CSV, without the index
merged.to_csv(path_or_buf='data/df.csv', index=False)