# Exploration

This notebook explores the data in the traffic dataset provided by ETHZ.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import contextily as cx

## Load the data

This section loads the detector, link, and traffic CSV files into dataframes, then renames and reorders their columns.

In [2]:
# Detector metadata and link geometry are read with pandas' default options.
detectors, links = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/detectors.csv', 'dataset/links.csv')
)
# low_memory=False: parse the traffic file in one pass instead of in chunks.
traffic = pd.read_csv('dataset/smalltraffic.csv', low_memory=False)

### Cleanup

In [3]:
# New column name -> raw column it is copied from. The copies are appended at
# the end of the frame (in this order) and the raw columns are then dropped.
detector_aliases = {
    'det_id': 'detid',
    'link_id': 'linkid',
    'type': 'fclass',
    'city': 'citycode',
}

detectors = detectors.assign(
    **{alias: detectors[raw] for alias, raw in detector_aliases.items()}
).drop(columns=list(detector_aliases.values()))

In [4]:
# Expose the raw id/city columns under the names used for the detectors frame,
# then drop the raw columns together with 'piece' and 'group'.
links = (
    links
    .assign(link_id=links['linkid'], city=links['citycode'])
    .drop(columns=['linkid', 'citycode', 'piece', 'group'])
)

In [5]:
# Build one timestamp per measurement: calendar day + `interval`, where
# `interval` is a number of seconds into that day. Adding a timedelta avoids
# formatting and re-parsing an "H:M:S" string for every row, and an interval
# of 86400 rolls over to the next day instead of failing to parse as an hour.
seconds_into_day = pd.to_timedelta(traffic['interval'], unit='s')
traffic['datetime'] = pd.to_datetime(traffic['day'], format='%Y-%m-%d') + seconds_into_day
traffic['det_id'] = traffic['detid']

# Period lengths in seconds for the cyclic time encodings below.
day = 24 * 60 * 60
week = 7 * day
# NOTE(review): a 365-day year ignores leap days, so the yearly phase drifts
# slightly between years. Kept as-is so previously computed values still match.
year = 365 * day

date_time = traffic['datetime']
# Seconds since 1970-01-01 for the whole column at once; the timestamps are
# timezone-naive, so no timezone conversion is involved.
timestamps = (date_time - pd.Timestamp('1970-01-01')).dt.total_seconds()

# Encode the position within each period as a (sin, cos) pair so that the end
# of a period sits next to its start.
for period_name, period in (('day', day), ('week', week), ('year', year)):
    phase = timestamps * (2 * np.pi / period)
    traffic[f'{period_name}_sin'] = np.sin(phase)
    traffic[f'{period_name}_cos'] = np.cos(phase)

# The raw columns are no longer needed once 'datetime' and 'det_id' exist.
traffic = traffic.drop(columns=['day', 'interval', 'detid', 'error'])

In [6]:
# Keep exactly these detector columns, in this order, then preview the frame.
detector_columns = [
    'det_id', 'link_id', 'city', 'road', 'type', 'lanes',
    'limit', 'lat', 'long', 'length', 'pos',
]
detectors = detectors.loc[:, detector_columns]
detectors.head()

Unnamed: 0,det_id,link_id,city,road,type,lanes,limit,lat,long,length,pos
0,U1-52G,72.0,augsburg,Gögginger Straße,secondary,1.0,50,48.359957,10.889553,0.196037,0.005512
1,U1-51G,73.0,augsburg,Gögginger Straße,secondary,1.0,50,48.359945,10.889601,0.130039,0.004013
2,U1-52L,70.0,augsburg,Gögginger Straße,secondary,1.0,50,48.359876,10.889356,0.155863,0.022228
3,U1-51L,71.0,augsburg,Gögginger Straße,secondary,1.0,50,48.359862,10.889396,0.197675,0.021889
4,U1-62,68.0,augsburg,Rosenaustraße,secondary,1.0,50,48.360578,10.889361,0.065183,0.024465


In [7]:
# Keep exactly these link columns, in this order, then preview the frame.
link_columns = ['link_id', 'order', 'city', 'lat', 'long']
links = links.loc[:, link_columns]
links.head()

Unnamed: 0,link_id,order,city,lat,long
0,0,1,augsburg,48.361079,10.891016
1,0,2,augsburg,48.360993,10.890894
2,0,3,augsburg,48.360853,10.890642
3,0,4,augsburg,48.360719,10.890404
4,0,5,augsburg,48.360497,10.889994


In [8]:
# Identifier and measurement columns first, followed by the cyclic time
# encodings; then preview the frame.
traffic_columns = [
    'det_id', 'datetime', 'city', 'flow', 'occ', 'speed',
    'day_sin', 'day_cos',
    'week_sin', 'week_cos',
    'year_sin', 'year_cos',
]
traffic = traffic.loc[:, traffic_columns]
traffic.head()

Unnamed: 0,det_id,datetime,city,flow,occ,speed,day_sin,day_cos,week_sin,week_cos,year_sin,year_cos
0,06.X-2li,2017-05-06 00:00:00,augsburg,12,0.0,,-8.531351e-13,1.0,0.974928,-0.222521,0.705584,-0.708627
1,06.X-2li,2017-05-06 00:05:00,augsburg,12,0.0,,0.02181489,0.999762,0.97423,-0.225558,0.705541,-0.708669
2,06.X-2li,2017-05-06 00:10:00,augsburg,12,0.0,,0.04361939,0.999048,0.973522,-0.228594,0.705499,-0.708711
3,06.X-2li,2017-05-06 00:15:00,augsburg,16,0.0,,0.06540313,0.997859,0.972805,-0.231627,0.705457,-0.708753
4,06.X-2li,2017-05-06 00:20:00,augsburg,16,0.0,,0.08715574,0.996195,0.972078,-0.234657,0.705414,-0.708795


### Filtering

Here we filter out cities with insufficient data. In particular, we keep only the cities for which both flow and occupancy measurements are available.

In [9]:
cities = detectors['city'].unique()

# Measurement columns whose availability is checked for every city.
measurement_columns = ['flow', 'occ', 'speed']

# Maps each city to a boolean Series, indexed by measurement column, that is
# True when the column holds at least one non-null value.
available_columns = {}
for city in cities:
    # Only the three measurement columns are inspected, so the other columns of
    # {city}.csv are not parsed. low_memory=False (as for smalltraffic.csv)
    # infers each dtype from the whole column.
    # NOTE(review): this presumably silences the warning that one city file
    # triggered on this read_csv call (likely a mixed-dtype warning) - confirm.
    city_data = pd.read_csv(
        f'dataset/cities/{city}.csv',
        usecols=measurement_columns,
        low_memory=False,
    )

    # Check which of the columns ['flow', 'occ', 'speed'] are not empty
    available_columns[city] = city_data[measurement_columns].notnull().any()

    # One line per city; an unavailable column leaves an empty slot so the
    # printed layout is the same for every city.
    present = ' '.join(
        column if available_columns[city][column] else ''
        for column in measurement_columns
    )
    print(f"{city}: {present}")

augsburg: flow occ 
basel: flow occ 
bern: flow occ 
birmingham: flow  speed
bolton: flow occ speed
bordeaux: flow occ 
bremen: flow occ 
cagliari: flow occ 
constance: flow occ speed
darmstadt: flow occ 
essen: flow occ speed
frankfurt: flow occ 
graz: flow occ 
groningen: flow occ speed
hamburg: flow occ 
innsbruck: flow  speed
kassel: flow occ 
london: flow occ 
losangeles: flow occ 
luzern: flow occ 
madrid: flow occ 
melbourne: flow  
manchester: flow occ speed
marseille: flow occ 
munich: flow occ 
paris: flow occ 
rotterdam: flow occ speed
santander: flow occ 
speyer: flow occ 
strasbourg: flow occ 
stuttgart: flow occ 


  city_data = pd.read_csv(f'dataset/cities/{city}.csv')


taipeh: flow occ 
tokyo:   
torino: flow occ speed
toronto: flow occ 
toulouse: flow occ 
utrecht: flow  
vilnius: flow occ 
wolfsburg: flow occ 
zurich: flow occ 


In [10]:
# Cities dropped by hand even though they report both flow and occupancy.
# NOTE(review): the reason for excluding them is not recorded in this notebook
# - TODO document it.
excluded_cities = {'losangeles', 'toronto', 'taipeh'}

# Filtering in a single pass (rather than calling list.remove afterwards) keeps
# this cell re-runnable: list.remove raises ValueError once `cities` has
# already been replaced by the filtered list.
cities_with_flow_occ = [
    city
    for city in cities
    if available_columns[city]['flow']
    and available_columns[city]['occ']
    and city not in excluded_cities
]

cities = cities_with_flow_occ

In [11]:
# Distinct values of the detectors' 'type' column, sorted in ascending order.
unique_road_types = np.sort(detectors['type'].unique())
unique_road_types

array(['living_street', 'motorway', 'motorway_link', 'other', 'primary',
       'primary_link', 'residential', 'secondary', 'secondary_link',
       'service', 'tertiary', 'tertiary_link', 'trunk', 'trunk_link'],
      dtype=object)