## Traffic Data Predictions

In this file, we use the same methodology as we did for the training dataset, to generate the number of intersections feature for the neighborhoods where there were no monitoring stations.

In [35]:
#Import python packages including overpy
import overpy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import descartes
import geopandas as gpd
import geopy.distance
import math
import time
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas
import folium
import us
from matplotlib import cm
import matplotlib.lines as mlines

sns.set(style = 'whitegrid')
sns.set_palette('bright')
%matplotlib inline

In [43]:
# Load traffic intersection data and clean it
traffic_df = pd.read_csv("../../data/raw/intersection_locations.csv")
#Drop the first column
traffic_df.drop(columns = ['Unnamed: 0'], inplace=True)
## Rename index and intersection number
traffic_df.rename(columns = {'index':'Intersection'}, inplace=True)
### Add an empty column for distance
traffic_df['dist'] = 0.0

# Create individual dataframes
traffic_lat = traffic_df.filter(['Intersection','Latitude'], axis=1)
traffic_long = traffic_df.filter(['Intersection','Longitude'], axis=1)
traffic_dist = traffic_df.filter(['Intersection','dist'], axis=1)

# Transpose all the dataframes
traffic_lat = traffic_lat.T
traffic_long = traffic_long.T
traffic_dist  = traffic_dist.T

#Add suffix to column header based on the dataframe type
traffic_lat.columns = [str(col) + '_latitude' for col in traffic_lat.columns]
traffic_long.columns = [str(col) + '_longitude' for col in traffic_long.columns]
traffic_dist.columns = [str(col) + '_distance' for col in traffic_dist.columns]

## Remove index for each dataframe
traffic_lat.reset_index(drop=True, inplace=True)
traffic_long.reset_index(drop=True, inplace=True)
traffic_dist.reset_index(drop=True, inplace=True)

### Combine individual dataframes into one
traffic_combined = traffic_lat.join(traffic_long).join(traffic_dist)

### Sort based on column names
traffic_combined = traffic_combined.reindex(columns=sorted(traffic_combined.columns))
#Update dataframe to contain 222 rows because there are 222 rows in predict.csv
traffic_combined = traffic_combined.loc[traffic_combined.index.repeat(222)].reset_index(drop=True)

# Load Data for Neighborhoods where we will predict 

In [44]:
neighbourhhood_predict = pd.read_csv("../../data/cleaned/predict.csv")

In [45]:
neighbourhhood_predict.shape

(222, 3)

In [46]:
neighbourhhood_predict.head()

Unnamed: 0.1,Unnamed: 0,lat,lon
0,0,29.672427,-95.3197
1,1,29.672427,-95.322203
2,2,29.672427,-95.324705
3,3,29.672427,-95.327208
4,4,29.672427,-95.32971


In [47]:
predict_df = neighbourhhood_predict[['lat', 'lon']]

# Combine predict neighbourhoods data with intersections data

In [48]:
combined_predict_traffic = predict_df.join(traffic_combined)
combined_predict_traffic.head()

Unnamed: 0,lat,lon,0_distance,0_latitude,0_longitude,1000_distance,1000_latitude,1000_longitude,1001_distance,1001_latitude,...,998_longitude,999_distance,999_latitude,999_longitude,99_distance,99_latitude,99_longitude,9_distance,9_latitude,9_longitude
0,29.672427,-95.3197,0.0,29.680002,-95.458484,0.0,29.744669,-95.410643,0.0,29.668656,...,-95.467527,0.0,29.796032,-95.409997,0.0,29.748337,-95.361603,0.0,29.718541,-95.408966
1,29.672427,-95.322203,0.0,29.680002,-95.458484,0.0,29.744669,-95.410643,0.0,29.668656,...,-95.467527,0.0,29.796032,-95.409997,0.0,29.748337,-95.361603,0.0,29.718541,-95.408966
2,29.672427,-95.324705,0.0,29.680002,-95.458484,0.0,29.744669,-95.410643,0.0,29.668656,...,-95.467527,0.0,29.796032,-95.409997,0.0,29.748337,-95.361603,0.0,29.718541,-95.408966
3,29.672427,-95.327208,0.0,29.680002,-95.458484,0.0,29.744669,-95.410643,0.0,29.668656,...,-95.467527,0.0,29.796032,-95.409997,0.0,29.748337,-95.361603,0.0,29.718541,-95.408966
4,29.672427,-95.32971,0.0,29.680002,-95.458484,0.0,29.744669,-95.410643,0.0,29.668656,...,-95.467527,0.0,29.796032,-95.409997,0.0,29.748337,-95.361603,0.0,29.718541,-95.408966


In [49]:
combined_predict_traffic.shape

(222, 11861)

# Calculate distance between monitoring location and each traffic intersection

Calculate the distance between the latitude and longitude of the monitoring location from EDF data, and the latitude and longitdue of the intersection using the distance function defined below. The function uses two GPS coordinates to calculate geodesic distance.

In [50]:
def distance(origin, destination):
    lat1, lon1 = origin
    lat2, lon2 = destination
    radius = 6371 # km
    dlat = math.radians(lat2-lat1)
    dlon = math.radians(lon2-lon1)
    a = math.sin(dlat/2) * math.sin(dlat/2) + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * math.sin(dlon/2) * math.sin(dlon/2)
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1-a))
    d = radius * c

    return d

In [51]:
time1 = time.time()
for index, row in combined_predict_traffic.iterrows():
    for idx, col in enumerate(combined_predict_traffic.columns):
        if "_dist" in col:
            combined_predict_traffic.at[index,col] = distance((row.iloc[1], row.iloc[0]), (row.iloc[idx+1], row.iloc[idx+2]))*3280.84
    if index % 1000 == 0:
        print(f"Completed {index} row")
time2 = time.time()         
print(time2 - time1)

Completed 0 row
26.07384204864502


In [52]:
combined_predict_traffic.head()

Unnamed: 0,lat,lon,0_distance,0_latitude,0_longitude,1000_distance,1000_latitude,1000_longitude,1001_distance,1001_latitude,...,998_longitude,999_distance,999_latitude,999_longitude,99_distance,99_latitude,99_longitude,9_distance,9_latitude,9_longitude
0,29.672427,-95.3197,42511740.0,29.680002,-95.458484,42536530.0,29.744669,-95.410643,42509110.0,29.668656,...,-95.467527,42555220.0,29.796032,-95.409997,42539180.0,29.748337,-95.361603,42527080.0,29.718541,-95.408966
1,29.672427,-95.322203,42511180.0,29.680002,-95.458484,42535970.0,29.744669,-95.410643,42508550.0,29.668656,...,-95.467527,42554660.0,29.796032,-95.409997,42538630.0,29.748337,-95.361603,42526520.0,29.718541,-95.408966
2,29.672427,-95.324705,42510630.0,29.680002,-95.458484,42535420.0,29.744669,-95.410643,42508000.0,29.668656,...,-95.467527,42554110.0,29.796032,-95.409997,42538070.0,29.748337,-95.361603,42525970.0,29.718541,-95.408966
3,29.672427,-95.327208,42510070.0,29.680002,-95.458484,42534870.0,29.744669,-95.410643,42507440.0,29.668656,...,-95.467527,42553550.0,29.796032,-95.409997,42537520.0,29.748337,-95.361603,42525410.0,29.718541,-95.408966
4,29.672427,-95.32971,42509520.0,29.680002,-95.458484,42534310.0,29.744669,-95.410643,42506890.0,29.668656,...,-95.467527,42553000.0,29.796032,-95.409997,42536970.0,29.748337,-95.361603,42524860.0,29.718541,-95.408966


# Clean Traffic+Distance data

In [53]:
#Drop the latitude column
combined_predict_traffic = combined_predict_traffic[combined_predict_traffic.columns.drop(list(combined_predict_traffic.filter(regex='_latitude')))]
#Drop the longitude column
combined_predict_traffic = combined_predict_traffic[combined_predict_traffic.columns.drop(list(combined_predict_traffic.filter(regex='_longitude')))]

In [54]:
combined_predict_traffic.shape

(222, 3955)

In [55]:
combined_predict_traffic.head()

Unnamed: 0,lat,lon,0_distance,1000_distance,1001_distance,1002_distance,1003_distance,1004_distance,1005_distance,1006_distance,...,992_distance,993_distance,994_distance,995_distance,996_distance,997_distance,998_distance,999_distance,99_distance,9_distance
0,29.672427,-95.3197,42511740.0,42536530.0,42509110.0,42535930.0,42542240.0,42514410.0,42518170.0,42518170.0,...,42528780.0,42520910.0,42518190.0,42514840.0,42514790.0,42514850.0,42514790.0,42555220.0,42539180.0,42527080.0
1,29.672427,-95.322203,42511180.0,42535970.0,42508550.0,42535370.0,42541690.0,42513850.0,42517610.0,42517620.0,...,42528220.0,42520350.0,42517630.0,42514280.0,42514230.0,42514290.0,42514240.0,42554660.0,42538630.0,42526520.0
2,29.672427,-95.324705,42510630.0,42535420.0,42508000.0,42534820.0,42541130.0,42513290.0,42517060.0,42517070.0,...,42527670.0,42519800.0,42517080.0,42513730.0,42513680.0,42513740.0,42513680.0,42554110.0,42538070.0,42525970.0
3,29.672427,-95.327208,42510070.0,42534870.0,42507440.0,42534270.0,42540580.0,42512740.0,42516510.0,42516510.0,...,42527110.0,42519240.0,42516520.0,42513170.0,42513120.0,42513180.0,42513130.0,42553550.0,42537520.0,42525410.0
4,29.672427,-95.32971,42509520.0,42534310.0,42506890.0,42533710.0,42540030.0,42512180.0,42515960.0,42515960.0,...,42526560.0,42518690.0,42515970.0,42512620.0,42512570.0,42512630.0,42512570.0,42553000.0,42536970.0,42524860.0


# Count the number of intersections with distance less than 1000 ft

In [56]:
#Create an empty column for number of intersection
combined_predict_traffic['number_intersections'] = 0

def count_values_in_range(series, range_min, range_max):
    # "between" returns a boolean Series equivalent to left <= series <= right.
    # NA values will be treated as False.
    return series.between(left=range_min, right=range_max).sum()

range_min, range_max = 0, 1000

combined_predict_traffic['number_intersections'] = combined_predict_traffic.iloc[:,2:].apply(
    func=lambda row: count_values_in_range(row, range_min, range_max), axis=1)


#Keep only coordinates of the monitoring station and number of intersections for each in the final dataset
combined_predict_traffic = combined_predict_traffic[['lon','lat','number_intersections']]

In [57]:
combined_predict_traffic.describe()

Unnamed: 0,lon,lat,number_intersections
count,222.0,222.0,222.0
mean,-95.418311,29.715005,1.0
std,0.082467,0.039068,0.0
min,-95.526338,29.672427,1.0
25%,-95.503857,29.677936,1.0
50%,-95.364746,29.6862,1.0
75%,-95.342223,29.755712,1.0
max,-95.3197,29.763409,1.0


In [58]:
combined_predict_traffic.head()

Unnamed: 0,lon,lat,number_intersections
0,-95.3197,29.672427,1
1,-95.322203,29.672427,1
2,-95.324705,29.672427,1
3,-95.327208,29.672427,1
4,-95.32971,29.672427,1


In [59]:
#Write to a csv file
combined_predict_traffic.to_csv("../../data/cleaned/traffic_data_predict.csv")