# EDA & Visualisation

In [None]:
%reset

In [None]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports
import os
import timeit
import numpy as np
import pandas as pd
import seaborn as sns
from math import sqrt
from datetime import date
import holidays
# Apply seaborn's default theme (note: matplotlib's rcParams are reset to defaults further down).
sns.set()
import warnings
# Silence every warning for the rest of the session, deprecation notices included.
warnings.filterwarnings("ignore")

# To plot pretty figures
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
# Restore matplotlib's built-in defaults, then layer the notebook-wide overrides on top.
mpl.rcParams.update(mpl.rcParamsDefault)
mpl.rcParams["font.family"] = "serif"
# NOTE(review): the family above is "serif", so this sans-serif preference may never be consulted — confirm intent.
mpl.rcParams["font.sans-serif"] = "Verdana"
# Large default marker size; a later plotting cell lowers it to 5.
mpl.rcParams["lines.markersize"] = 20

## Incident File

Incident data of Kwinana Fwy (Inner) northbound from 2018-01-01 00:00:00 to 2018-10-25 23:59:00

In [None]:
# Read the incident log, drop rows flagged as duplicate entries, and renumber the rows.
df1 = (
    pd.read_csv('data/Kwinana_Fwy_Historic_Incidents_2018_Wide.csv')
    .loc[lambda frame: frame['Incident_Type'] != 'Duplicate Entry']
    .reset_index(drop=True)
)

# Parse the start and end timestamps.
for stamp_col in ('WST_Start', 'WST_End'):
    df1[stamp_col] = pd.to_datetime(df1[stamp_col])

# Insert a line break after every slash in the incident-type labels
# (presumably so long labels wrap on chart axes).
df1['Incident_Type'] = df1['Incident_Type'].str.replace('/', '/\n')
df1.info()

In [None]:
# Count the incidents per type once and reuse the result for both axes of the chart.
type_counts = df1.Incident_Type.value_counts()

fig, ax = plt.subplots(figsize=(6, 6))
ax = sns.barplot(x=type_counts, y=type_counts.index)
ax.set_xlabel('Frequency')
ax.set_xlim(right=200)

# Write each bar's count just past its right-hand end, vertically centred on the bar.
for bar in ax.patches:
    bar_len = bar.get_width()
    bar_mid = bar.get_y() + bar.get_height() / 2
    ax.text(bar_len + 1, bar_mid, int(bar_len), ha="left", va="center")
#plt.savefig('fig/incident_type_1.png', bbox_inches="tight")
plt.show()

Because the Special Event, Pothole / Road Surface Damage, and Hazmat (including spills) categories did not affect traffic congestion, they are combined into a single category called 'Special Event / Pothole / Hazmat'.

Flooding and Storm are likewise combined into a single category called 'Flooding / Storm'.

In [None]:
# Each key is the combined label; its list holds the original labels folded into it.
merged_labels = {
    'Special Event /\nPothole / Hazmat': ['Special Event',
                                          'Pothole /\n Road Surface Damage',
                                          'Hazmat (including spills)'],
    'Flooding /\nStorm': ['Flooding', 'Storm'],
}

for combined, members in merged_labels.items():
    df1['Incident_Type'] = df1['Incident_Type'].replace(members, combined)

In [None]:
# Bar colours, cycled over the incident types.
palette = ['#4053d3', '#00b25d', '#b51d14', '#ddb310']

# Count incidents per type once; the percentages below use the real total
# rather than a hard-coded row count, so they stay correct if the data changes.
type_counts = df1.Incident_Type.value_counts()
total_incidents = type_counts.sum()

plt.rcParams.update({'font.size': 16})
fig, ax = plt.subplots(figsize=(10,8))
ax = sns.barplot(x=type_counts,
                 y=type_counts.index,
                 palette=palette)
ax.set_ylabel('Incident Type', size=18)
ax.set_xlabel('Frequency', size=18)
ax.set_xlim(right=200)

# Annotate every bar with its count and its share of all incidents.
for p in ax.patches:
    width = p.get_width()
    share = np.round(width / total_incidents * 100, 1)
    ax.text(width + 12,
            p.get_y() + p.get_height()/2,
            str(int(width)) + '\n(' + str(share) + '%)',
            ha="center",
            va="center")
plt.savefig('fig/incident_type.png', bbox_inches="tight")
plt.show()

In [None]:
# Fold these categories into a single 'Miscellaneous' bucket.
minor_types = [
    'Flooding /\nStorm',
    'Special Event /\nPothole / Hazmat',
    'Animal /\n Livestock',
    'Vehicle Fire',
]
df1['Incident_Type'] = df1['Incident_Type'].replace(minor_types, 'Miscellaneous')

In [None]:
# Cross-tabulate the incidents: one row per incident type, one column per congestion value,
# plus a trailing 'sum' column holding the row totals.
temp2 = df1.groupby(['Incident_Type', 'Congestion']).size().unstack()
temp2['sum'] = temp2.sum(axis=1)

plt.rcParams.update({'figure.figsize': (10, 8), 'font.size': 16})

cmp = mpl.colors.ListedColormap(['#efe645', '#e1562c', '#537eff'])

# Sort once (smallest row total first) and reuse the result for the bars and every label.
ranked = temp2.sort_values(by='sum')
running = ranked.cumsum(axis=1)
grand_total = temp2['sum'].sum()

ax = ranked.iloc[:, :-1].plot(kind='barh', stacked=True, width=0.7, cmap=cmp)
ax.set_xlim(right=200)
ax.set_ylabel('Incident Type', size=18)
ax.set_xlabel('Frequency', size=18)
ax.legend(title='')

# Row totals (count and share of the grand total), written to the right of each bar.
for row_pos, row_total in enumerate(ranked['sum']):
    ax.text(row_total + 12, row_pos,
            f"{int(row_total)}\n({np.round(row_total / grand_total * 100, 1)}%)",
            va='center', ha='center', fontsize=14)

# Segment labels centred inside each stacked segment; missing cells and
# segments of 4 or fewer incidents are left unlabelled.
for column in temp2.columns[:-1]:
    for row_pos, (right_edge, count) in enumerate(zip(running[column], ranked[column])):
        if np.isnan(right_edge) or not count > 4:
            continue
        ax.text(right_edge - count / 2, row_pos,
                f"{int(count)}\n({np.round(count / grand_total * 100, 1)}%)",
                va='center', ha='center', rotation=20, fontsize=12)

plt.savefig('fig/incident_congestion_2.png', bbox_inches="tight")
plt.show()

In [None]:
# Strip the left/right/centre qualifier: each key is the generic label,
# its list holds the specific labels that collapse into it.
condition_groups = {
    'Emergency Lane Blocked': ['Left Emergency Lane Blocked',
                               'Right Emergency Lane Blocked'],
    'Lane(s) Blocked': ['Left Lane(s) Blocked',
                        'Right Lane(s) Blocked',
                        'Centre Lane(s) Blocked',
                        'Left Centre Lane(s) Blocked',
                        'Right Centre Lane(s) Blocked',
                        'Bus Lane Blocked'],
    'Turning Pocket Blocked': ['Left Turning Pocket Blocked',
                               'Right Turning Pocket Blocked'],
}

for generic, specifics in condition_groups.items():
    df1['TrafficCondition'] = df1['TrafficCondition'].replace(specifics, generic)

In [None]:
# Cross-tabulate the incidents: one row per incident type, one column per traffic condition,
# plus a trailing 'sum' column holding the row totals.
temp2 = df1.groupby(['Incident_Type', 'TrafficCondition']).size().unstack()
temp2['sum'] = temp2.sum(axis=1)

plt.rcParams.update({'figure.figsize': (10, 8), 'font.size': 16})

cmp = mpl.colors.ListedColormap(['#00cb85', '#efe645', '#537eff',
                                '#e1562c', '#00e3ff'])

# Sort once (smallest row total first) and reuse the result for the bars and every label.
ranked = temp2.sort_values(by='sum')
running = ranked.cumsum(axis=1)
grand_total = temp2['sum'].sum()

ax = ranked.iloc[:, :-1].plot(kind='barh', stacked=True, width=0.7, cmap=cmp)
ax.set_xlim(right=200)
ax.set_ylabel('Incident Type', size=18)
ax.set_xlabel('Frequency', size=18)
ax.legend(title='Traffic Condition')

# Row totals (count and share of the grand total), written to the right of each bar.
for row_pos, row_total in enumerate(ranked['sum']):
    ax.text(row_total + 12, row_pos,
            f"{int(row_total)}\n({np.round(row_total / grand_total * 100, 1)}%)",
            va='center', ha='center', fontsize=14)

# Segment labels centred inside each stacked segment; missing cells and
# segments of 6 or fewer incidents are left unlabelled.
for column in temp2.columns[:-1]:
    for row_pos, (right_edge, count) in enumerate(zip(running[column], ranked[column])):
        if np.isnan(right_edge) or not count > 6:
            continue
        ax.text(right_edge - count / 2, row_pos,
                f"{int(count)}\n({np.round(count / grand_total * 100, 1)}%)",
                va='center', ha='center', rotation=20, fontsize=12)

#plt.savefig('fig/incident_condition_2.png', bbox_inches="tight")
plt.show()

In [None]:
# Cross-tabulate the incidents: one row per traffic condition, one column per congestion value,
# plus a trailing 'sum' column holding the row totals.
temp2 = df1.groupby(['TrafficCondition', 'Congestion']).size().unstack()
temp2['sum'] = temp2.sum(axis=1)

plt.rcParams.update({'figure.figsize': (10, 8), 'font.size': 16})

cmp = mpl.colors.ListedColormap(['#efe645', '#e1562c', '#537eff'])

# Sort once (smallest row total first) and reuse the result for the bars and every label.
ranked = temp2.sort_values(by='sum')
running = ranked.cumsum(axis=1)
grand_total = temp2['sum'].sum()

ax = ranked.iloc[:, :-1].plot(kind='barh', stacked=True, width=0.7, cmap=cmp)
ax.set_xlim(right=200)
ax.set_ylabel('Traffic Condition', size=18)
ax.set_xlabel('Frequency', size=18)
ax.legend(title='')

# Row totals (count and share of the grand total), written to the right of each bar.
for row_pos, row_total in enumerate(ranked['sum']):
    ax.text(row_total + 12, row_pos,
            f"{int(row_total)}\n({np.round(row_total / grand_total * 100, 1)}%)",
            va='center', ha='center', fontsize=14)

# Segment labels centred inside each stacked segment; missing cells and
# segments of 6 or fewer incidents are left unlabelled.
for column in temp2.columns[:-1]:
    for row_pos, (right_edge, count) in enumerate(zip(running[column], ranked[column])):
        if np.isnan(right_edge) or not count > 6:
            continue
        ax.text(right_edge - count / 2, row_pos,
                f"{int(count)}\n({np.round(count / grand_total * 100, 1)}%)",
                va='center', ha='center', rotation=20, fontsize=12)

#plt.savefig('fig/condition_congestion.png', bbox_inches="tight")
plt.show()

In [None]:
def func2(a):
    """Map a latitude to the label of the link whose latitude band contains it.

    Bands are half-open: band k runs from its own lower bound (inclusive) up to
    the lower bound of band k+1 (exclusive). Band 14 has no upper limit.

    Parameters
    ----------
    a : float
        Latitude value to classify.

    Returns
    -------
    str
        "1" .. "14" for a latitude at or above the first lower bound, otherwise
        "Other" (this includes NaN, which compares false against every bound).
    """
    # Lower bound of each band, in ascending order; position k-1 belongs to link k.
    lower_bounds = (
        -32.091154, -32.080696, -32.074042, -32.071075, -32.057092,
        -32.052286, -32.043637, -32.040758, -32.030254, -32.012242,
        -32.010690, -32.003147, -31.969905, -31.966753,
    )
    # Scan from the highest band down; the first bound the value reaches is its band.
    for link_no in range(len(lower_bounds), 0, -1):
        if a >= lower_bounds[link_no - 1]:
            return str(link_no)
    return "Other"

# Label every incident with the link whose latitude band contains it (func2 returns text).
df1['ID'] = df1['Lat'].apply(func2)
# Convert the textual labels to integers; an "Other" label would make this cast raise.
df1['ID'] = df1['ID'].astype(int)
df1.head()

In [None]:
# Summary statistics of the incident table, which now includes the link ID column.
df1.describe()

In [None]:
# Cross-tabulate the incidents: one row per link ID, one column per incident type,
# plus a trailing 'sum' column holding the row totals.
temp2 = df1.groupby(['ID', 'Incident_Type']).size().unstack()
temp2['sum'] = temp2.sum(axis=1)

plt.rcParams.update({'figure.figsize': (10, 10), 'font.size': 16})

cmp = mpl.colors.ListedColormap(['#4053d3', '#00b25d', '#ddb310', '#b51d14'])

# Sort once (smallest row total first) and reuse the result for the bars and every label.
ranked = temp2.sort_values(by='sum')
running = ranked.cumsum(axis=1)
grand_total = temp2['sum'].sum()

ax = ranked.iloc[:, :-1].plot(kind='barh', stacked=True, width=0.7, cmap=cmp)
ax.set_xlim(right=90)
ax.set_ylabel('Link', size=18)
ax.set_xlabel('Frequency', size=18)
ax.legend(title='Incident Type')

# Row totals (count and share of the grand total), written to the right of each bar.
for row_pos, row_total in enumerate(ranked['sum']):
    ax.text(row_total + 5, row_pos,
            f"{int(row_total)}\n({np.round(row_total / grand_total * 100, 1)}%)",
            va='center', ha='center', fontsize=12)

# Segment labels centred inside each stacked segment; missing cells and
# segments of fewer than 5 incidents are left unlabelled.
for column in temp2.columns[:-1]:
    for row_pos, (right_edge, count) in enumerate(zip(running[column], ranked[column])):
        if np.isnan(right_edge) or not count >= 5:
            continue
        ax.text(right_edge - count / 2, row_pos - .01,
                f"{int(count)}\n({np.round(count / grand_total * 100, 1)}%)",
                va='center', ha='center', rotation=15, fontsize=11)

#plt.savefig('fig/incident_link_2.png', bbox_inches="tight")
plt.show()

In [None]:
# Release the cross-tab that was built for the stacked bar charts.
del temp2

In [None]:
import matplotlib.cm as cm
import matplotlib.colors as mcolors

def colorbar_index(ncolors, cmap):
    """Attach a discrete colour bar labelled 1..ncolors to the current Axes.

    Parameters
    ----------
    ncolors : int
        Number of discrete colours (and tick labels) on the bar.
    cmap : str or matplotlib colormap
        Colormap to discretise; passed through ``cmap_discretize``.
    """
    cmap = cmap_discretize(cmap, ncolors)
    # Stand-alone mappable: it carries only the colormap and limits, no plotted data.
    mappable = cm.ScalarMappable(cmap=cmap)
    mappable.set_array([])
    mappable.set_clim(-0.5, ncolors+0.5)
    # The mappable is not attached to any Axes, so name the Axes to take space from
    # explicitly; recent Matplotlib releases refuse to guess it.
    colorbar = plt.colorbar(mappable, ax=plt.gca())
    colorbar.set_ticks(np.linspace(0, ncolors, ncolors))
    colorbar.set_ticklabels(range(1, ncolors+1))
    colorbar.set_label('Link')
    
def cmap_discretize(cmap, N):
    """Return a discrete colormap from the continuous colormap cmap.

        cmap: colormap instance, eg. cm.jet, or the name of a registered colormap.
        N: number of colors.

    The result is a LinearSegmentedColormap named "<cmap.name>_<N>" built from
    the red, green and blue channels only.

    Example
        x = resize(arange(100), (5,100))
        djet = cmap_discretize(cm.jet, 5)
        imshow(x, cmap=djet)
    """

    # isinstance also accepts str subclasses, which an exact type comparison rejects.
    if isinstance(cmap, str):
        cmap = plt.get_cmap(cmap)
    # Sample N colours evenly across the map; the four trailing zeros add padding rows,
    # so the lookups at row N and row -1 below land on valid entries.
    colors_i = np.concatenate((np.linspace(0, 1., N), (0.,0.,0.,0.)))
    colors_rgba = cmap(colors_i)
    indices = np.linspace(0, 1., N+1)
    cdict = {}
    for ki,key in enumerate(('red','green','blue')):
        # Each (x, y0, y1) entry makes the channel jump from colour i-1 to colour i at anchor x.
        cdict[key] = [ (indices[i], colors_rgba[i-1,ki], colors_rgba[i,ki])
                       for i in range(N+1) ]
    # Return colormap object.
    return mcolors.LinearSegmentedColormap(cmap.name + "_%d"%N, cdict, 1024)


# Colours for the link IDs.
link_colours = ['#ebac23', '#b80058', '#008cf9',
                '#006e00', '#00bbad', '#d163e6',
                '#b24502', '#ff9287', '#5954d6',
                '#00c6f8', '#878500', '#00a76c',
                '#bdbdbd', '#000078', '#b51d14']
cmp = mpl.colors.ListedColormap(link_colours)

# Scatter the incidents by location: marker size from 'Duration', colour from the link ID.
scatter_opts = dict(kind='scatter', x='Long', y='Lat', alpha=0.5,
                    s='Duration', c='ID', label='Duration',
                    cmap=cmp, colorbar=False, rot=45)
df1.plot(**scatter_opts)

# Show plain coordinate values on the ticks instead of an offset notation.
plt.ticklabel_format(useOffset=False)
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend(markerscale=0.2)

# The built-in colour bar is switched off above; draw the discrete one instead.
colorbar_index(ncolors=14, cmap=cmp)
#plt.savefig('fig/incident_loc_link_2.png', bbox_inches="tight")
plt.show()

## Traffic + Incident data

In [None]:
# Read the merged traffic + incident table (first CSV column is the row index)
# and parse the DateTime column into timestamps.
df = pd.read_csv('data/LAD+incident2.csv', index_col=0)
df['DateTime'] = pd.to_datetime(df['DateTime'])
df.info()

In [None]:
# Preview the first rows of the traffic + incident table.
df.head()

In [None]:
# Summary statistics for the columns at positions 3-5, rendered with four decimals.
four_dp = '{0:.4f}'.format
df.iloc[:, 3:6].describe().apply(lambda col: col.map(four_dp))

In [None]:
# Pairwise correlations over the columns at positions 2-5.
# NOTE(review): this is one column more than the describe() cell above (positions 3-5) — confirm it is intended.
df.iloc[:,2:6].corr()

- Moderate positive correlation between volume and occupancy
- High negative correlation between speed and occupancy

In [None]:
# pd.plotting.scatter_matrix(df.iloc[:,3:6], alpha=0.05)
# plt.show()

In [None]:
# variables = ['Volume', 'Speed', 'Occupancy']
# g = sns.PairGrid(df, hue="Volume", vars=variables)
# g.map_diag(sns.histplot, hue=None, color=".3")
# g.map_offdiag(sns.scatterplot)
# g.add_legend()

In [None]:
# Box plots of the columns at positions 3-5, drawn on one shared axis.
df.iloc[:, 3:6].plot(kind='box', figsize=(6, 6))
plt.show()

In [None]:
plt.rcParams.update({'font.size': 14,
                    'lines.markersize': 5})

# plt.subplots(1, 3) returns an array of three Axes, so each variable gets its own
# panel (and its own y-scale) instead of calling boxplot on the array itself.
fig, ax = plt.subplots(1, 3, figsize=(6,8))
for panel, column in zip(ax, ('Volume', 'Speed', 'Occupancy')):
    panel.boxplot(df[column])
    panel.set_title(column)

# Keep the three narrow panels' tick labels from overlapping.
fig.tight_layout()
plt.show()

### Prepare the data for deep learning algorithms

In [None]:
# Drop Congestion column
# Remove the Congestion column from df in place.
df.drop(columns='Congestion', inplace=True)

In [None]:
# NaN TrafficCondition == 'All Lanes Open'

# Each key is the label to keep; its list holds the raw values mapped onto it.
# A missing value is treated as 'All Lanes Open'.
canonical_conditions = {
    'All Lanes Open': [
        np.nan,
        'Lane Closures Unknown, All Lanes Open',
    ],
    'Emergency Lane Blocked': [
        'All Lanes Open, Emergency Lane Blocked',
        'Emergency Lane Blocked, Emergency Lane Blocked',
    ],
    'Lane(s) Blocked': [
        'Lane Closures Unknown, Lane(s) Blocked',
        'Lane(s) Blocked, All Lanes Open',
        'All Lanes Open, Lane(s) Blocked',
        'Lane(s) Blocked, Lane(s) Blocked',
    ],
}

# Invert into a raw -> canonical lookup and apply it in a single replace.
condition_map = {raw: keep
                 for keep, raws in canonical_conditions.items()
                 for raw in raws}
df['TrafficCondition'] = df['TrafficCondition'].replace(condition_map)

df.TrafficCondition.value_counts()

In [None]:
# Frequency of each incident-type label before the consolidation in the next cell.
df.Incident_Type.value_counts()

In [None]:
# Fold these single-incident labels into 'Miscellaneous'.
misc_members = [
    'Flooding /\nStorm',
    'Special Event /\nPothole / Hazmat',
    'Animal /\n Livestock',
    'Vehicle Fire',
]
df['Incident_Type'] = df['Incident_Type'].replace(misc_members, 'Miscellaneous')

# Rewrite the combined (two-incident) labels listed here so that
# 'Miscellaneous' always comes second.
combined_relabels = {
    'Flooding /\nStorm, Road Crash': 'Road Crash, Miscellaneous',
    'Miscellaneous, Break Down /\n Tow Away': 'Break Down /\n Tow Away, Miscellaneous',
    'Flooding /\nStorm, Break Down /\n Tow Away': 'Break Down /\n Tow Away, Miscellaneous',
}
df['Incident_Type'] = df['Incident_Type'].replace(combined_relabels)
df.Incident_Type.value_counts()

In [None]:
# Number of incidents per row = number of ', ' separators + 1; rows without a label count as 0.
separator_hits = df['Incident_Type'].str.count(', ')
df['Num_Incidents'] = (separator_hits + 1).replace(np.nan, 0)
df['Num_Incidents'].value_counts()

In [None]:
# Split the combined label at the first ', ' only, giving the first and second incident type.
# `n` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df[['Incident_Type1','Incident_Type2']]=df['Incident_Type'].str.split(', ', n=1, expand=True)

In [None]:
# Rows with no first and/or second incident get the explicit label 'No Incidents'.
for split_col in ('Incident_Type1', 'Incident_Type2'):
    df[split_col] = df[split_col].replace(np.nan, 'No Incidents')

df.Incident_Type1.value_counts()

In [None]:
# Frequency of each second-incident label.
df.Incident_Type2.value_counts()

In [None]:
# Drop Incident_Type column
# Remove the combined Incident_Type column from df in place;
# Incident_Type1 / Incident_Type2 now carry the split labels.
df.drop(columns='Incident_Type', inplace=True)

In [None]:
# Check the column set and dtypes after the incident-type columns were reworked.
df.info()

#### Link 11

Consider

- Link 11 and Link 12 (downstream)
- Incident_Type1, Incident_Type2
    + No Incidents
    + Break Down / Tow Away
    + Debris / Trees / Lost Loads
    + Road Crash
    + Miscellaneous: everything else

In [None]:
# Keep only the rows recorded for link 11.
df11 = df.loc[df['ID'] == 11]
df11.info()

In [None]:
# Drop the link identifier (every row here is link 11) and index the rows by timestamp.
# Rebinding instead of dropping in place avoids mutating a slice of df.
df11 = df11.drop(columns='ID').set_index('DateTime')
df11.index = pd.to_datetime(df11.index)

# Index.is_monotonic was removed in pandas 2.0; is_monotonic_increasing is the
# equivalent check and also exists in older releases.
if not df11.index.is_monotonic_increasing:
    df11 = df11.sort_index()

df11.info()

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Binarise TrafficCondition: the new columns are named after the encoder's classes
# and aligned to df11's index before being joined on.
encoder = LabelBinarizer()
condition_flags = pd.DataFrame(
    encoder.fit_transform(df11['TrafficCondition']),
    columns=encoder.classes_,
    index=df11.index,
)
df11 = df11.join(condition_flags)
df11.info()

In [311]:
# Column overview of df11 after encoding; the recorded output below lists the indicator columns.
df11.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 429120 entries, 2018-01-01 00:00:00 to 2018-10-25 23:59:00
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Length                  429120 non-null  float64
 1   Volume                  429120 non-null  float64
 2   Speed                   429120 non-null  float64
 3   Occupancy               429120 non-null  float64
 4   TrafficCondition        429120 non-null  object 
 5   Num_Incidents           429120 non-null  float64
 6   Incident_Type1          429120 non-null  object 
 7   Incident_Type2          429120 non-null  object 
 8   All Lanes Open          429120 non-null  int32  
 9   Emergency Lane Blocked  429120 non-null  int32  
 10  Lane Closures Unknown   429120 non-null  int32  
 11  Lane(s) Blocked         429120 non-null  int32  
 12  Turning Pocket Blocked  429120 non-null  int32  
dtypes: float64(5), int32(5), object(3)
memor

In [308]:
# Frequency of each first-incident label on link 11.
df11.Incident_Type1.value_counts()

No Incidents                       426712
Break Down /\n Tow Away              1023
Debris /\n Trees /\n Lost Loads       833
Road Crash                            297
Miscellaneous                         255
Name: Incident_Type1, dtype: int64

In [310]:
# Frequency of each second-incident label on link 11.
df11.Incident_Type2.value_counts()

No Incidents    429110
Road Crash          10
Name: Incident_Type2, dtype: int64

In [None]:
# Write the prepared link 11 frame to CSV (the DateTime index is written along with the columns).
df11.to_csv('data/df11.csv')