## EDA Smart City

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN
# Raise matplotlib's agg.path.chunksize so the Agg renderer draws very long
# line paths in chunks of this many vertices.
mpl.rcParams['agg.path.chunksize'] = 10000

# Apply seaborn's default theme to the matplotlib figures created afterwards.
sns.set()

## Traffic information

- Run the *Traffic Processing* file first to aggregate the raw traffic data and generate the CSV file loaded below.

In [None]:
# Numeric traffic measurements reused by the sampling, rolling and
# aggregation cells.
numeric_cols = [
    'vehicle_count',
    'avg_speed',
    'avg_measured_time',
    'median_measured_time',
]

# Load the aggregated traffic CSV, parse its timestamp column and use it as
# the index.
traffic_df = pd.read_csv("../data/aarhus_traffic_data_aug_nov_2014.csv")
traffic_df = traffic_df.assign(
    timestamp=pd.to_datetime(traffic_df['timestamp'])
).set_index('timestamp')
traffic_df.head()

In [None]:

# Summary statistics of the full traffic frame.
traffic_df.describe()

In [None]:
# Draw a reproducible 10% random sample of the traffic rows (fixed seed) and
# put it back in index order before summarising it.
sampled_data = (
    traffic_df
    .sample(frac=0.1, random_state=42)
    .sort_index()
)
sampled_data.describe()

In [None]:
# Overview: one subplot per numeric column of the sampled data.
sampled_data.loc[:, numeric_cols].plot(subplots=True)

In [None]:
# Weekly rolling average of the sampled measurements.
# `sampled_data` is a random 10% subset of the rows, so its observations are
# irregularly spaced and a fixed row count (previously 7 * 24 rows) does not
# span one week. A time-based window over the sorted DatetimeIndex does.
# NOTE(review): offset windows require a monotonic datetime index without NaT;
# `sampled_data` is sorted by index when it is created — confirm no NaT rows.
window_size = '7D'  # 7 calendar days
df_rolling = sampled_data[numeric_cols].rolling(window=window_size, min_periods=1).mean()
print(df_rolling.columns)

# Expose the timestamp as a regular column for the long-format reshape.
df_rolling = df_rolling.reset_index()
df_rolling.head()


In [None]:
# Long format: one row per (timestamp, variable) pair so seaborn can colour
# the lines by variable.
long_rolling_avg_sampled = pd.melt(
    df_rolling,
    id_vars='timestamp',
    var_name='Variable',
    value_name='Value',
)

# One line per measured variable.
plt.figure(figsize=(15, 6))
sns.lineplot(
    x='timestamp',
    y='Value',
    hue='Variable',
    data=long_rolling_avg_sampled,
)
plt.title('Traffic Data Moving Average (Sampled)', fontsize=16)
plt.xlabel('Timestamp', fontsize=14)
plt.ylabel('Moving Average', fontsize=14)
plt.legend(title='Variable', loc='upper right')
plt.grid(True)
plt.show()

In [None]:

# Weekly means of the numeric columns. Despite its name, `month_means` is
# grouped with a one-week frequency ("1W").
month_means = (
    sampled_data
    .reset_index()
    .groupby(pd.Grouper(key="timestamp", freq="1W"))[numeric_cols]
    .mean()
    .reset_index()
)
print(month_means.head())

# One subplot per variable against the week timestamp.
month_means.plot(
    x='timestamp',
    subplots=True,
    figsize=(20, 5),
    title="Weekly Average",
)


In [None]:
# Mean vehicle count for every (hour, day) combination.
# NOTE(review): relies on `hour` and `day` columns being present in the
# loaded CSV; they are not derived in this notebook — confirm.
heatmap_data = pd.pivot_table(
    traffic_df,
    index='hour',
    columns='day',
    values='vehicle_count',
    aggfunc='mean',
)

# Render the pivot as a heatmap.
plt.figure(figsize=(10, 5))
sns.heatmap(data=heatmap_data, cmap='coolwarm')
plt.title('Hourly Traffic Heatmap - Full timeframe')
plt.show()

In [None]:
# Cluster the sampled observations on (vehicle_count, avg_speed) to try to
# identify congestion patterns.
# NOTE(review): the features are passed to DBSCAN unscaled, so eps=0.5 is in
# raw units of both columns — confirm this is intended.
X = sampled_data[['vehicle_count', 'avg_speed']].values
clustering = DBSCAN(eps=0.5, min_samples=10)
sampled_data['cluster'] = clustering.fit_predict(X)

# Scatter of the two features, coloured by the assigned cluster label.
sns.scatterplot(
    x='vehicle_count',
    y='avg_speed',
    hue='cluster',
    data=sampled_data,
)
plt.title('Traffic Behavior Clusters')
plt.show()

## Parking information



In [None]:
# Load the geolocated parking CSV; `updatetime` is parsed with mixed
# per-element format inference and becomes the index.
parking_df = pd.read_csv("../data/aarhus_parking_geolocated.csv")
parking_df = parking_df.assign(
    updatetime=pd.to_datetime(parking_df['updatetime'], format='mixed')
).set_index('updatetime')
parking_df.head()

In [None]:
# Mean parked-vehicle count per calendar date (rows) and hour of day (columns).
heatmap_data = parking_df.pivot_table(
    values='vehiclecount',
    index=parking_df.index.date,
    columns=parking_df.index.hour,
    aggfunc='mean',
)

# Plot heatmap. The title previously said "Traffic" although this cell plots
# parking data; the axes are labelled explicitly because the pivot is built
# from index-derived arrays rather than named columns.
plt.figure(figsize=(10, 5))
sns.heatmap(heatmap_data, cmap='coolwarm')
plt.title('Hourly Parking Heatmap')
plt.xlabel('Hour of day')
plt.ylabel('Date')
plt.show()

### Connecting Traffic and Parking

In [None]:
# The parking data defines the analysis window: its first and last timestamps.
date_start, date_end = parking_df.index.min(), parking_df.index.max()

print(f'Period covered: {date_start} - {date_end}')

In [None]:
# Restrict the traffic data to the period covered by the parking data.
date_sample_traffic = traffic_df.loc[
    (traffic_df.index >= date_start) & (traffic_df.index <= date_end)
]

# Bring both series onto a common 15-minute grid.
# The traffic series is resampled from the period-restricted frame; it was
# previously resampled from the full `traffic_df`, which left the filter
# above unused. '15min' replaces the deprecated '15T' alias.
traffic_resampled = date_sample_traffic['vehicle_count'].resample('15min').mean()
parking_resampled = parking_df['vehiclecount'].resample('15min').mean()

# Align each traffic bin with the nearest parking bin and give the parking
# column a descriptive name.
combined_traffic_parking_df = pd.merge_asof(
    traffic_resampled.sort_index(),
    parking_resampled.sort_index(),
    left_index=True,
    right_index=True,
    direction='nearest',
).rename(columns={"vehiclecount": "parking_occupancy"})

# Fill the remaining gaps in each column with that column's median.
median_values = combined_traffic_parking_df.median()
print(median_values)
combined_traffic_parking_df = combined_traffic_parking_df.fillna(median_values)

combined_traffic_parking_df

In [None]:
fig, ax1 = plt.subplots(figsize=(15, 6))

# Left axis: 15-minute mean traffic vehicle count.
ax1.plot(
    combined_traffic_parking_df.index,
    combined_traffic_parking_df['vehicle_count'],
    color='orange',
    label='Traffic',
)
ax1.set(ylabel='Vehicle Count')

# Right axis: parking series on its own scale, sharing the time axis.
# NOTE(review): the label says "%", but the plotted column is the resampled
# `vehiclecount` — confirm the unit.
ax2 = ax1.twinx()
ax2.plot(
    combined_traffic_parking_df.index,
    combined_traffic_parking_df['parking_occupancy'],
    color='green',
    label='Parking',
)
ax2.set(ylabel='Parking Occupancy (%)')

plt.title('Traffic vs Parking Occupancy')
fig.legend(loc='upper right')
plt.show()


## Weather Information


In [None]:
# Load the combined weather CSV and index it by the parsed `Datetime` column.
weather_df = pd.read_csv("../data/aarhus_weather_data_combined.csv")
weather_df = weather_df.assign(
    Datetime=pd.to_datetime(weather_df['Datetime'])
).set_index('Datetime')
weather_df.head()

In [None]:
# One subplot per weather variable, in chronological order.
weather_df.sort_index().plot(
    subplots=True,
    figsize=(10, 7),
    fontsize=8,
)

### Checking dataset date alignment

In [None]:
# Print the first and last timestamp of each dataset to check their overlap.
for label, frame in (
    ("Traffic", traffic_df),
    ("Weather", weather_df),
    ("Parking", parking_df),
):
    print(f"{label} Date Range:", frame.index.min(), "-", frame.index.max())

## Correlations

In [None]:
# Attach the nearest weather observation to every traffic/parking row, then
# left-join the raw parking rows on exact timestamp equality.
# NOTE(review): if `parking_df` holds several rows per timestamp, the left
# join repeats the matching combined rows — confirm this is intended.
combined_weather_traffic_parking_df = pd.merge(
    pd.merge_asof(
        left=combined_traffic_parking_df.sort_index(),
        right=weather_df.sort_index(),
        left_index=True,
        right_index=True,
        direction='nearest',
    ),
    parking_df,
    how='left',
    left_index=True,
    right_index=True,
)

combined_weather_traffic_parking_df.head()

In [None]:
# Pairwise correlations between traffic, parking and the selected weather
# columns.
corr_cols = [
    'vehicle_count',
    'parking_occupancy',
    'tempm',
    'hum',
    'wspdm',
]
corr_matrix = combined_weather_traffic_parking_df.loc[:, corr_cols].corr()

# Annotated heatmap with the colour scale centred on zero.
plt.figure(figsize=(10, 4))
sns.heatmap(data=corr_matrix, cmap='coolwarm', center=0, annot=True)
plt.title('Weather-Traffic-Parking Correlation Matrix')
plt.show()

In [None]:
fig, ax1 = plt.subplots(figsize=(10, 5))

# Left axis: traffic (solid) and parking (dashed) share one scale.
ax1.plot(
    combined_weather_traffic_parking_df.index,
    combined_weather_traffic_parking_df['vehicle_count'],
    label='Traffic',
    color='blue',
)
ax1.plot(
    combined_weather_traffic_parking_df.index,
    combined_weather_traffic_parking_df['parking_occupancy'],
    label='Parking',
    color='green',
    linestyle='--',
)
ax1.set_ylabel('Traffic/Parking Units', color='navy')

# Right axis: the `tempm` weather column on its own scale.
ax2 = ax1.twinx()
ax2.plot(
    combined_weather_traffic_parking_df.index,
    combined_weather_traffic_parking_df['tempm'],
    label='Temperature',
    color='red',
)
ax2.set_ylabel('°C', color='red')

plt.title('Integrated Traffic-Parking-Weather Trends')
fig.legend(loc='upper left')
plt.show()

In [None]:
# Hourly means for both datasets so they share a frequency.
traffic_hourly = traffic_df['vehicle_count'].resample('H').mean()
weather_hourly = weather_df.resample('H').mean()

# Pair every traffic hour with the nearest weather hour.
combined = pd.merge_asof(
    left=traffic_hourly.to_frame(),
    right=weather_hourly,
    left_index=True,
    right_index=True,
    direction='nearest',
)

combined

In [None]:
# Pairwise correlations between hourly traffic and the selected weather
# columns.
corr_matrix = combined.loc[
    :, ['vehicle_count', 'tempm', 'hum', 'wspdm', 'vism']
].corr()

# Annotated heatmap with the colour scale centred on zero.
plt.figure(figsize=(10, 6))
sns.heatmap(data=corr_matrix, cmap='icefire', center=0, annot=True)
plt.title('Weather-Traffic Correlation Matrix')
plt.show()

In [None]:
fig, ax1 = plt.subplots(figsize=(18, 6))

# Left axis: hourly mean vehicle count in blue.
ax1.plot(combined.index, combined['vehicle_count'], color='b', label='Traffic')
ax1.set_ylabel('Vehicle Count', color='b')

# Right axis: the `tempm` column in red on its own scale.
ax2 = ax1.twinx()
ax2.plot(combined.index, combined['tempm'], color='r', label='Temperature')
ax2.set_ylabel('°C', color='r')

# Title and a figure-level legend covering both axes.
plt.title('Traffic vs Temperature Trends')
fig.legend(loc='upper left')
plt.show()

## Pollution information


In [None]:
# Load the pollution CSV and index it by the parsed timestamp column.
pollution_df = pd.read_csv("../data/aarhus_pollution_data_aug_oct_2014.csv")
pollution_df = pollution_df.assign(
    timestamp=pd.to_datetime(pollution_df['timestamp'])
).set_index('timestamp')
pollution_df.head()

## Pollution and Traffic Data (number of cars)

In [None]:
# Hourly means for both datasets so they share a frequency.
traffic_hourly = traffic_df['vehicle_count'].resample('H').mean()
pollution_hourly = pollution_df.resample('H').mean()

# Pair every traffic hour with the nearest pollution hour.
# Note: this rebinds `combined`, which earlier held the traffic/weather merge.
combined = pd.merge_asof(
    left=traffic_hourly.to_frame(),
    right=pollution_hourly,
    left_index=True,
    right_index=True,
    direction='nearest',
)

combined

In [None]:
# Print the first and last timestamp of both datasets to check their overlap.
for label, frame in (("Traffic", traffic_df), ("Pollution", pollution_df)):
    print(f"{label} Date Range:", frame.index.min(), "-", frame.index.max())

In [None]:
# Pairwise correlations between hourly traffic and each pollutant column.
# The column names are reproduced exactly as they are spelled in the data.
corr_matrix = combined.loc[
    :,
    [
        'vehicle_count',
        'ozone',
        'particullate_matter',
        'carbon_monoxide',
        'sulfure_dioxide',
        'nitrogen_dioxide',
    ],
].corr()

# Annotated heatmap with the colour scale centred on zero.
plt.figure(figsize=(10, 6))
sns.heatmap(data=corr_matrix, cmap="Blues", center=0, annot=True)
plt.title('Pollution-Traffic Correlation Matrix')
plt.show()

### Pollution and location


In [None]:
import folium
from folium.plugins import HeatMap

# [latitude, longitude] used as the default map centre.
AARHUS_GEOLOCATION = [56.16182966470494, 10.197421860666816]


def generateBaseMap(default_location=AARHUS_GEOLOCATION, default_zoom=11):
    """Create an empty folium map with a scale control.

    Parameters
    ----------
    default_location : list
        [latitude, longitude] the map is centred on.
    default_zoom : int
        Initial zoom level.

    Returns
    -------
    folium.Map
        The newly created map.
    """
    return folium.Map(
        location=default_location,
        zoom_start=default_zoom,
        control_scale=True,
    )

# One row per distinct (latitude, longitude) pair found in the pollution data.
pollution_sensor = (
    pollution_df
    .groupby(["latitude", "longitude"])
    .count()
    .reset_index()
)

# NOTE(review): the name `map` shadows the Python builtin for the rest of the
# notebook; it is kept because it is a module-level name.
map = generateBaseMap()

# Drop one marker on the map for each distinct location.
for lat, lon in zip(pollution_sensor['latitude'], pollution_sensor['longitude']):
    folium.Marker(location=[lat, lon]).add_to(map)

map