In [114]:
# import Python data analysis and plotting tools
import numpy as np; import pandas as pd
from pandas import DataFrame, Series
import matplotlib
import matplotlib.pyplot as plt
import random
import json
import urllib, simplejson
# mapping functionality
from mpl_toolkits.basemap import Basemap

In [115]:
# Load the 2013 station and trip tables from CSV. Every column is read as text
# (dtype='unicode'); stations are indexed by "id" and trips by "trip_id".
location_data = pd.read_csv(
    'Divvy_Stations_Trips_2013/Divvy_Stations_2013.csv', dtype='unicode'
).set_index("id")
trip_data = pd.read_csv(
    'Divvy_Stations_Trips_2013/Divvy_Trips_2013.csv', dtype='unicode'
).set_index("trip_id")

In [116]:
# Landmarks labelled on each map for geographical context,
# kept together as (name, latitude, longitude) and split into parallel lists.
_LANDMARK_COORDS = [
    ('University of Chicago', 41.7897, -87.5997),
    ('Millennium Park', 41.8827, -87.6227),
    ('Willis Tower', 41.8789, -87.6358),
    ('Wrigley Field', 41.9483, -87.6556),
    ('Field Museum', 41.8663, -87.6170),
]
landmarks = [_entry[0] for _entry in _LANDMARK_COORDS]
lats = [_entry[1] for _entry in _LANDMARK_COORDS]
longs = [_entry[2] for _entry in _LANDMARK_COORDS]

In [117]:
# Drop some superfluous columns. Rebinding the name (instead of inplace=True) with
# errors='ignore' keeps this cell re-runnable: a second run no longer raises
# KeyError because the columns are already gone.
trip_data = trip_data.drop(['gender', 'birthday'], axis=1, errors='ignore')
# Make sure numeric fields are seen as correct data types (the CSVs were read as text)
_trip_int_columns = ['bikeid', 'tripduration', 'from_station_id', 'to_station_id']
trip_data[_trip_int_columns] = trip_data[_trip_int_columns].astype(int)
location_data[['longitude', 'latitude']] = location_data[['longitude', 'latitude']].astype(float)
# The station index ("id") was read as text as well. Cast its labels to int so that
# they match the integer from_station_id / to_station_id values when stations are
# joined or looked up by id. Applying int() to labels that are already ints is a no-op.
location_data = location_data.rename(index=int)

In [118]:
# Build a dataframe of trips, grouped by originating and terminal station id,
# holding the number of trips between each pair of stations (number_trips)
# and the average duration of those trips (avg_duration).
# The output columns are named with (name, function) pairs: passing a dict of
# {new_name: function} to a grouped Series is rejected by current pandas
# ("nested renamer is not supported").
trip_summary = (
    trip_data
    .groupby(['from_station_id', 'to_station_id'])['tripduration']
    .agg([('number_trips', 'size'), ('avg_duration', 'mean')])
)

In [119]:
# Tally trips per station: departures keyed by from_station_id,
# arrivals keyed by to_station_id, each as a one-column frame.
trips_from = trip_data.groupby("from_station_id").size().to_frame()
trips_to = trip_data.groupby("to_station_id").size().to_frame()

In [120]:
# Give both count tables a common index name and a descriptive column label,
# then combine them. join() is a left join on the index by default, so the
# result has one row per station present in trips_to.
for _counts, _label in ((trips_from, 'number_trips_from'), (trips_to, 'number_trips_to')):
    _counts.index.names = ["station_id"]
    _counts.columns = [_label]
trip_counts = trips_to.join(trips_from)

In [121]:
# Net flow per station: trips ending there minus trips starting there
# (positive means more arrivals than departures).
trip_counts['diff'] = trip_counts['number_trips_to'].sub(trip_counts['number_trips_from'])

In [122]:
# Join this with the table of latitudes and longitudes so that it can be plotted on a map.
# The station table was read as text, so its index labels are passed through int()
# to line up with the integer station ids that index trip_counts; without this the
# join finds no matching labels and the coordinates come out as NaN.
_station_coords = location_data[['latitude', 'longitude']].rename(index=int)
# Discard coordinates left over from an earlier run of this cell; otherwise join()
# fails on the overlapping column names when the cell is executed again.
trip_counts = (
    trip_counts
    .drop(['latitude', 'longitude'], axis=1, errors='ignore')
    .join(_station_coords)
)

In [123]:
# Map of station usage, written to mostused.png: each station gets a blue dot
# sized by the trips ending there and a red dot sized by the trips starting there.
# Make a figure
fig = plt.figure()
fig.set_size_inches(5,10)
# Map of the city of Chicago
# NOTE: the name `map` shadows the Python builtin for the rest of the session.
map = Basemap(projection='merc', lat_0=41.8, lon_0=-87.5,
    resolution = 'h', area_thresh = 0.1,
    llcrnrlon=-87.71, llcrnrlat=41.78,
    urcrnrlon=-87.575, urcrnrlat=41.98)
map.drawcoastlines()
map.drawcountries()
# Color it in
map.fillcontinents(color='#AAF2AA', lake_color="#BAE5E8")
for ix, row in trip_counts.iterrows():
    # Marker sizes scale with trip volume and are floored at 1.
    to_dotsize = row['number_trips_to']/629
    # Fixed: the departure marker was previously sized from number_trips_to as well,
    # so the red dots did not reflect departures at all.
    from_dotsize = row['number_trips_from']/652
    if to_dotsize < 1:
        to_dotsize = 1
    if from_dotsize < 1:
        from_dotsize = 1
    # Project (longitude, latitude) into map coordinates
    x, y = map(row['longitude'], row['latitude'])
    map.plot(x, y, 'bo', alpha = 0.5, markersize=to_dotsize)
    map.plot(x, y, 'ro', alpha = 0.5, markersize=from_dotsize)
# Label the landmarks for orientation
for i in range(0, len(landmarks)):
    x, y = map(longs[i],lats[i])
    plt.text(x,y, landmarks[i])
plt.savefig("mostused.png")
# Close the figure once it has been saved
plt.close(fig)

In [124]:
# Map of the net trip flow per station, written to diffs.png: green where more
# trips end than start (diff > 0), red otherwise; marker size grows with |diff|.
fig = plt.figure()
fig.set_size_inches(5,10)
map = Basemap(projection='merc', lat_0=41.8, lon_0=-87.5,
    resolution = 'h', area_thresh = 0.1,
    llcrnrlon=-87.71, llcrnrlat=41.78,
    urcrnrlon=-87.575, urcrnrlat=41.98)
map.drawcoastlines()
map.drawcountries()
map.fillcontinents(color='#AAF2AA', lake_color="#BAE5E8")
for ix, row in trip_counts.iterrows():
    net_flow = row['diff']
    # One marker-size unit per 50 trips of imbalance, never smaller than 1
    dotsize = max(abs(net_flow)/50, 1)
    color = 'go' if net_flow > 0 else 'ro'
    x, y = map(row['longitude'], row['latitude'])
    map.plot(x, y, color, alpha=0.5, markersize=dotsize)
# Label the landmarks for orientation
for i, landmark in enumerate(landmarks):
    x, y = map(longs[i], lats[i])
    plt.text(x, y, landmark)
plt.savefig("diffs.png")
plt.close(fig)

In [125]:
# Order station pairs from most to least travelled.
# DataFrame.sort() no longer exists in pandas; sort_values() is its replacement.
top_trips = trip_summary.sort_values(['number_trips'], ascending=False)

In [126]:
# Keep only the first 100 rows
top_trips = top_trips.iloc[:100]

In [127]:
# Map of the routes in top_trips, written to trips.png: one great-circle line
# per (from_station_id, to_station_id) pair, with line width proportional to the number of trips.
fig = plt.figure()
fig.set_size_inches(5,10)
map = Basemap(projection='merc', lat_0=41.8, lon_0=-87.5,
    resolution = 'h', area_thresh = 0.1,
    llcrnrlon=-87.71, llcrnrlat=41.78,
    urcrnrlon=-87.575, urcrnrlat=41.98)
map.drawcoastlines()
map.drawcountries()
map.fillcontinents(color='#AAF2AA', lake_color="#BAE5E8")
# Station coordinates keyed by integer station id. The station table was read as
# text, so its index labels are passed through int() to match the integer ids in
# top_trips' index; .loc then does a strict label lookup instead of the ambiguous
# chained [] indexing with an int key.
_station_coords = location_data[['latitude', 'longitude']].rename(index=int)
for ix, row in top_trips.iterrows():
    width = row['number_trips']/100
    # ix is the (from_station_id, to_station_id) pair
    startlat = _station_coords.loc[ix[0], 'latitude']
    startlong = _station_coords.loc[ix[0], 'longitude']
    endlat = _station_coords.loc[ix[1], 'latitude']
    endlong = _station_coords.loc[ix[1], 'longitude']
    map.drawgreatcircle(startlong,startlat,endlong,endlat, alpha=0.2, linewidth=width,color='red')
# Label the landmarks for orientation
for i in range(0, len(landmarks)):
    x, y = map(longs[i],lats[i])
    plt.text(x,y, landmarks[i])
plt.savefig("trips.png")
# Close the figure after saving, as the other map cells do, so it is not kept open
plt.close(fig)

In [128]:
# Write the selected routes to toptrips.csv
top_trips.to_csv('toptrips.csv', encoding='utf-8')
# import cleaned-up Google maps data containing travel times;
# the file's unnamed first column becomes the index, renamed to "id"
travel_times = pd.read_csv('trip_output.csv', sep=';').set_index("Unnamed: 0")
travel_times.index.names = ['id']

In [129]:
def ssxx(data):
    """Return the sum of squared deviations of `data` from its mean.

    `data` is any sized iterable of numbers; an empty one raises
    ZeroDivisionError when the mean is computed.
    """
    center = sum(data)/len(data)
    total = 0
    for observation in data:
        deviation = observation - center
        total = total + deviation ** 2
    return total
    
def ssxy(x, y):
    """Return the sum of cross-deviations: sum((x_k - mean(x)) * (y_k - mean(y))).

    x, y: label-indexed numeric collections such as pandas Series. The sum runs
    over the labels of `y`, and `x` is looked up by the same label, so the two
    need not be stored in the same order.
    """
    meanx = sum(x)/len(x)
    meany = sum(y)/len(y)
    result = 0
    # .items() replaces .iteritems(), which no longer exists on pandas Series.
    for key, y_value in y.items():
        result = result + ((x[key] - meanx) * (y_value - meany))
    return result

def ssr(x, y, b0, b1):
    """Return the sum of squared residuals of `y` about the line b0 + b1 * x.

    x, y: label-indexed numeric collections such as pandas Series; the sum runs
    over the labels of `y`, with `x` looked up by the same label.
    b0, b1: intercept and slope of the line.
    """
    result = 0
    # .items() replaces .iteritems(), which no longer exists on pandas Series.
    for key, y_value in y.items():
        residual = y_value - (b0 + b1 * x[key])
        # The ** operator replaces math.pow: `math` is never imported in this
        # notebook, so the original call raised NameError.
        result = result + residual ** 2
    return result

def regression_figure(x, y):
    """Scatter-plot `y` against `x` and overlay a fitted straight line.

    The line uses slope = ssxy(x, y) / ssxx(x) and
    intercept = mean(y) - slope * mean(x), and is drawn in black through
    11 evenly spaced points from x.min() to x.max() on the current pyplot
    figure. `x` must provide .min() and .max(). Returns None.
    """
    plt.scatter(x,y)
    slope = ssxy(x,y)/ssxx(x)
    intercept = sum(y)/len(y) - (slope * sum(x)/len(x))
    span = x.max() - x.min()
    lower = x.min()
    # Sample the fitted line at tenths of the x-range, end points included
    line_x = [lower + (step * span/10) for step in range(0, 11)]
    line_y = [intercept + slope * position for position in line_x]
    plt.plot(line_x, line_y, linewidth=2.0, color='black')

In [131]:
# Estimated biking time against estimated public transit time, with a fitted
# line, written to bike_vs_transit.png
x = travel_times['biking_time']
y = travel_times['transit_time']
regression_figure(x, y)
plt.xlabel('Estimated biking time (seconds)')
plt.ylabel('Estimated public transit time (seconds)')
plt.savefig("bike_vs_transit.png")
plt.close()

In [132]:
# Estimated biking time against the observed average rental duration, with a
# fitted line, written to bike_vs_actual.png
plt.xlabel('Estimated bike time (seconds)')
plt.ylabel('Average rental duration (seconds)')
regression_figure(travel_times['biking_time'], travel_times['avg_duration'])
# Red reference line at 1800 seconds (30 minutes)
plt.axhline(1800, color='red')
plt.savefig("bike_vs_actual.png")
plt.close()

In [134]:
# Estimated transit time against the observed average rental duration, with a
# fitted line, written to transit_vs_actual.png
plt.xlabel('Estimated transit time (seconds)')
plt.ylabel('Average rental duration (seconds)')
regression_figure(travel_times['transit_time'], travel_times['avg_duration'])
# Red reference line at 1800 seconds (30 minutes)
plt.axhline(1800, color='red')
plt.savefig("transit_vs_actual.png")
plt.close()