In [1]:
import os
import math
import json
import glob
import random
import zipfile
import numpy as np
import pandas as pd
from copy import copy
from ast import literal_eval
import matplotlib.pyplot as plt

from bokeh.io import show, output_notebook
from bokeh.tile_providers import CARTODBPOSITRON
from bokeh.plotting import figure, show, output_notebook, output_file

%matplotlib inline



In [2]:
DATA_PATH = './data'

# Unpack the archive only when no CSV is present in DATA_PATH yet.
# All paths are built from DATA_PATH rather than chdir-ing into it, so a failure
# half-way through cannot leave the kernel in the wrong working directory, and
# the context manager closes the archive even if extraction raises.
if not glob.glob(os.path.join(DATA_PATH, '*csv')):
    with zipfile.ZipFile(os.path.join(DATA_PATH, 'chicago-taxi-rides-2016.zip')) as zip_file:
        zip_file.extractall(DATA_PATH)

# Keep bare file names: later cells join them with DATA_PATH themselves.
# glob returns matches in arbitrary order, so sort before picking by position;
# otherwise index 5 could select a different file on another machine.
data_files = sorted(
    os.path.basename(path)
    for path in glob.glob(os.path.join(DATA_PATH, 'chicago_taxi_trips_*.csv'))
)
# Only a single file is analysed.
data_files = [data_files[5]]

## Extract column codings

In [5]:
biz_data = []  # not used by any cell visible in this notebook

# Only the first line of the remapping file is parsed as JSON.
# The `with` block guarantees the handle is closed (the original left it open).
with open('./data/column_remapping.json', encoding='utf8', errors='ignore') as f:
    data_dict = json.loads(next(f))

# Lookup tables: integer code stored in the CSV -> coordinate value as a float.
pickup_latitudes = {int(k): float(v) for k, v in data_dict['pickup_latitude'].items()}
pickup_longitudes = {int(k): float(v) for k, v in data_dict['pickup_longitude'].items()}
dropoff_latitudes = {int(k): float(v) for k, v in data_dict['dropoff_latitude'].items()}
dropoff_longitudes = {int(k): float(v) for k, v in data_dict['dropoff_longitude'].items()}


## Loading data

In [6]:
usecols = ['taxi_id', 'trip_start_timestamp', 'trip_seconds',
           'trip_miles', 'trip_total', 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
           'dropoff_longitude']

# Read every selected CSV, discard rows with any missing value, and concatenate
# once at the end (concatenating inside the loop re-copies the accumulated frame
# on every iteration).
frames = [
    pd.read_csv(os.path.join(DATA_PATH, file), usecols=usecols).dropna()
    for file in data_files
]
df = pd.concat(frames) if frames else pd.DataFrame()

# Replace the integer codes stored in the CSV with the real coordinates.
coord_decoders = {
    'pickup_latitude': pickup_latitudes,
    'pickup_longitude': pickup_longitudes,
    'dropoff_latitude': dropoff_latitudes,
    'dropoff_longitude': dropoff_longitudes,
}
for col, decoder in coord_decoders.items():
    df[col] = df[col].map(decoder)

In [7]:
def _hour_of_day(stamp):
    """Return the integer found in the two characters at positions -8 and -7 of `stamp`.

    Surrounding whitespace is stripped before conversion.
    NOTE(review): presumably the string ends in 'HH:MM:SS', making this the hour - confirm.
    """
    hour_text = stamp[-8:-6]
    return int(hour_text.strip())


# Overwrites the timestamp string with the extracted integer, so this cell
# cannot be re-run without reloading `df` first.
df['trip_start_timestamp'] = df['trip_start_timestamp'].map(_hour_of_day)

Unnamed: 0,taxi_id,trip_start_timestamp,trip_seconds,trip_miles,trip_total,pickup_latitude,pickup_longitude,dropoff_latitude,dropoff_longitude
0,5297.0,19,960.0,6.9,20.75,41.895033,-87.619711,41.979912,-87.664188
1,7491.0,17,3660.0,14.3,54.3,41.979071,-87.90304,41.962179,-87.645379
2,3668.0,7,1080.0,11.8,30.5,41.953582,-87.723452,41.980264,-87.913625
3,281.0,21,360.0,1.1,6.75,41.877406,-87.621972,41.895033,-87.619711
4,7985.0,17,900.0,0.1,10.0,41.8853,-87.642808,41.899156,-87.626211


## Truncating latitude and longitude to two decimal places

In [8]:
coord_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
       'dropoff_longitude']

# Truncate every coordinate toward zero to two decimals so that nearby points
# share the same value when they are grouped later on.
# np.trunc works on the whole column at once and lets NaN pass through, whereas
# the per-row int() conversion raised on NaN (e.g. a code missing from the
# remapping tables).
for col in coord_cols:
    df[col] = np.trunc(100 * df[col]) / 100.0

### Ride counts

In [9]:
# Non-null entry count of every remaining column, per value of
# trip_start_timestamp (replaced by an integer in an earlier cell).
ride_counts = df.groupby(by='trip_start_timestamp').count()

In [10]:
def merc(coords):
    """Project a (latitude, longitude) pair in degrees to Web Mercator metres.

    Parameters
    ----------
    coords : str or sequence
        Either the textual form ``'(lat, long)'`` (parsed with
        ``ast.literal_eval``) or an already-parsed ``(lat, long)`` pair.

    Returns
    -------
    tuple of float
        ``(x, y)`` on a sphere of radius 6378137 m.

    Raises
    ------
    ValueError
        From ``math.log`` when the latitude is -90 (tangent evaluates to zero).
    """
    if isinstance(coords, str):
        coords = literal_eval(coords)
    lat, lon = coords[0], coords[1]

    r_major = 6378137.000
    x = r_major * math.radians(lon)
    # The original derived the metres-per-degree scale as x / long, which raised
    # ZeroDivisionError on the prime meridian. Algebraically the whole
    # expression reduces to r_major * ln(tan(pi/4 + lat/2)), so compute that.
    y = r_major * math.log(math.tan(math.pi / 4.0 + math.radians(lat) / 2.0))

    return (x, y)

In [11]:
def merc_coords(row):
    """Render the row's pickup position as the text ``'(latitude, longitude)'``.

    `row` only needs the attributes ``pickup_latitude`` and ``pickup_longitude``.
    """
    template = '({}, {})'
    latitude = row.pickup_latitude
    longitude = row.pickup_longitude
    return template.format(latitude, longitude)

## Accumulating data over latitudes and longitudes

In [12]:
# Count rides for every (pickup latitude, pickup longitude, hour) combination.
# The count of the taxi_id column is the ride count; the result is indexed and
# sorted by trip_start_timestamp.
cc = (
    df[['pickup_latitude', 'pickup_longitude', 'trip_start_timestamp', 'taxi_id']]
    .groupby(['pickup_latitude', 'pickup_longitude', 'trip_start_timestamp'])
    .count()
    .reset_index()
    .set_index('trip_start_timestamp')
    .rename(columns={'taxi_id': 'ride_counts'})
    .sort_index()
)

In [13]:
# Bounding box (degrees, exclusive on every side) of the area kept for plotting.
LON_MIN, LON_MAX = -87.675, -87.525
LAT_MIN, LAT_MAX = 41.84, 41.92

in_box = (
    (cc.pickup_longitude > LON_MIN) & (cc.pickup_longitude < LON_MAX)
    & (cc.pickup_latitude > LAT_MIN) & (cc.pickup_latitude < LAT_MAX)
)
# Take an explicit copy: adding columns to a filtered slice otherwise triggers
# pandas' SettingWithCopyWarning and may write to a temporary.
cc = cc[in_box].copy()

# merc() expects the textual '(lat, long)' form produced by merc_coords().
merc_xy = cc.apply(merc_coords, axis=1).apply(merc)
cc['merc x coord'] = merc_xy.map(lambda xy: xy[0])
cc['merc y coord'] = merc_xy.map(lambda xy: xy[1])

# The degree columns are no longer needed once the projection is stored.
cc = cc.drop(columns=['pickup_latitude', 'pickup_longitude'])

Unnamed: 0_level_0,ride_counts,merc x coord,merc y coord
trip_start_timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,680,-9756040.0,5143021.0
0,2232,-9754927.0,5143021.0
0,2004,-9753814.0,5143021.0
0,1356,-9758267.0,5141526.0
0,70,-9758267.0,5138537.0


In [14]:
# range bounds supplied in web mercator coordinates    
p = figure(x_axis_type="mercator", y_axis_type="mercator")
p.title.text = 'Comparing Taxi demands at 08:00 and 19:00 Hours'
p.add_tile(CARTODBPOSITRON)

p.circle(x=cc.loc[8]['merc x coord'],
         y=cc.loc[8]['merc y coord'], 
         size=500*(cc.loc[8]['ride_counts'] / cc.loc[8]['ride_counts'].sum()),
         line_color="#FF0000", 
         fill_color="#FF0000",
         fill_alpha=0.5, legend='Demand at 09:00')
    
p.circle(x=cc.loc[19]['merc x coord'],
         y=cc.loc[19]['merc y coord'], 
         size=500*(cc.loc[19]['ride_counts'] / cc.loc[19]['ride_counts'].sum()),
         line_color="black", 
         fill_color="black",
         fill_alpha=0.5, legend='Demand at 19:00')
    
p.legend.location = "top_left"
# p.legend.click_policy="hide"
output_notebook()
# output_file("temporal_taxi_demands.html", title="temporal_taxi_demands")

show(p)


## Extract column codings

In [15]:
biz_data = []  # not used by any cell visible in this notebook

# Only the first line of the remapping file is parsed as JSON.
# The `with` block guarantees the handle is closed (the original left it open).
with open('./data/column_remapping.json', encoding='utf8', errors='ignore') as f:
    data_dict = json.loads(next(f))

# Lookup tables: integer code stored in the CSV -> coordinate value as a float.
pickup_latitudes = {int(k): float(v) for k, v in data_dict['pickup_latitude'].items()}
pickup_longitudes = {int(k): float(v) for k, v in data_dict['pickup_longitude'].items()}
dropoff_latitudes = {int(k): float(v) for k, v in data_dict['dropoff_latitude'].items()}
dropoff_longitudes = {int(k): float(v) for k, v in data_dict['dropoff_longitude'].items()}


In [16]:
usecols = ['taxi_id', 'trip_start_timestamp', 'trip_seconds',
           'trip_miles', 'trip_total', 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude',
           'dropoff_longitude']

# Reload the raw trips: earlier cells overwrote trip_start_timestamp and
# truncated the coordinates in `df`.
# Concatenate once at the end (concatenating inside the loop re-copies the
# accumulated frame on every iteration).
frames = [
    pd.read_csv(os.path.join(DATA_PATH, file), usecols=usecols).dropna()
    for file in data_files
]
df = pd.concat(frames) if frames else pd.DataFrame()

# Replace the integer codes stored in the CSV with the real coordinates.
coord_decoders = {
    'pickup_latitude': pickup_latitudes,
    'pickup_longitude': pickup_longitudes,
    'dropoff_latitude': dropoff_latitudes,
    'dropoff_longitude': dropoff_longitudes,
}
for col, decoder in coord_decoders.items():
    df[col] = df[col].map(decoder)

# Rows identical in every selected column are collapsed into one.
df = df.drop_duplicates()

In [17]:
# Rides per taxi (count of trip_seconds entries), busiest first.
taxi_ride_counts = (
    df.groupby('taxi_id')
      .agg({'trip_seconds': 'count'})
      .sort_values(by='trip_seconds', ascending=False)
)

# Pick four row positions at random with a fixed seed.
# NOTE(review): random.choices samples WITH replacement, so the same taxi can be
# drawn more than once, and despite the name the "best" taxis are a random
# sample - confirm this is intended.
random.seed(1105)
picked_positions = random.choices(range(taxi_ride_counts.shape[0]), k = 4)
best_taxis = taxi_ride_counts.iloc[picked_positions].reset_index()
worst_taxis = pd.DataFrame()  # placeholder: contributes no rows

# Restrict the trips to the sampled taxis, then parse the start time.
sampled_taxis = pd.concat([best_taxis, worst_taxis])
df = pd.merge(sampled_taxis, df, on='taxi_id', how='left')

df['trip_start_timestamp'] = pd.to_datetime(df['trip_start_timestamp'])

### Distance between two coordinates

In [18]:
def degree_to_rad(degree):
    """Convert an angle from degrees to radians (degree * pi / 180)."""
    scaled = degree * math.pi
    return scaled / 180

def dist_bw_coord(coord1, coord2):
    """Great-circle (haversine) distance in kilometres between two points.

    Parameters
    ----------
    coord1, coord2 : tuple
        ``(latitude, longitude)`` pairs in degrees.

    Returns
    -------
    float
        Distance along a sphere of radius 6371 km. NaN inputs yield NaN.
    """
    lat1, long1 = coord1
    lat2, long2 = coord2

    earthRadiusKm = 6371
    dLat = math.radians(lat2 - lat1)
    dLon = math.radians(long2 - long1)

    lat1 = math.radians(lat1)
    lat2 = math.radians(lat2)

    a = math.sin(dLat / 2) ** 2 + math.sin(dLon / 2) ** 2 * math.cos(lat1) * math.cos(lat2)
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which made sqrt(1 - a) raise a domain error. Clamp with
    # comparisons rather than min/max so that NaN still propagates.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return earthRadiusKm * c

In [19]:
def return_dist(row):
    """Distance in km from the row's previous drop-off point to its pickup point.

    `row` must expose pickup_latitude / pickup_longitude and
    prev_dropoff_latitude / prev_dropoff_longitude; the value comes from
    dist_bw_coord.
    """
    pickup = (row.pickup_latitude, row.pickup_longitude)
    previous_dropoff = (row.prev_dropoff_latitude, row.prev_dropoff_longitude)
    return dist_bw_coord(pickup, previous_dropoff)

In [20]:
# Despite its name this is a GroupBy object, not a DataFrame: it has one group
# per (taxi_id, calendar date of trip start).
trip_day = df['trip_start_timestamp'].dt.date
taxi_day_df = df.groupby(['taxi_id', trip_day])

In [21]:
KM_PER_MILE = 1.609344

# For every taxi and day, add up the straight-line distance between each
# pickup and the drop-off of the trip that preceded it, converted to miles.
data_points = []
for (taxi_id, day), taxi_df in taxi_day_df:
    # "Previous trip" is only meaningful in chronological order. The sort is
    # stable so trips sharing a start time keep their original order.
    trips = taxi_df.sort_values('trip_start_timestamp', kind='mergesort')

    prev_dropoff = trips[['dropoff_latitude', 'dropoff_longitude']].shift(1)
    prev_dropoff.columns = ['prev_dropoff_latitude', 'prev_dropoff_longitude']

    # The first trip of the day has no predecessor and is removed by dropna,
    # as is any row with a missing value in the selected columns.
    legs = pd.concat([trips, prev_dropoff], axis=1)[['taxi_id', 'trip_start_timestamp',
                                                     'trip_total', 'pickup_latitude', 'pickup_longitude',
                                                     'prev_dropoff_latitude', 'prev_dropoff_longitude']].dropna()

    # On an empty frame, apply(axis=1) hands back a DataFrame, so .sum() would
    # give a Series instead of a number; treat "no legs" as zero distance.
    if legs.empty:
        total_km = 0.0
    else:
        total_km = legs.apply(return_dist, axis=1).sum()

    data_points.append({'taxi_id': taxi_id, 'day': day, 'distance': total_km / KM_PER_MILE})

In [22]:
# Reshape the per-(taxi, day) records into a table: one row per day, one column per taxi.
distance_records = pd.DataFrame(data_points)
sample_taxi_bw_distances = distance_records.pivot(index='day', columns='taxi_id', values='distance')

In [23]:
# Capture the taxi-id column labels before 'day' is moved out of the index
# and becomes an ordinary column.
cols = sample_taxi_bw_distances.columns
sample_taxi_bw_distances = sample_taxi_bw_distances.reset_index(drop=False)

In [24]:
# Line chart: one line per sampled taxi, the day on the x axis.
p = figure(plot_width=800, plot_height=250, x_axis_type="datetime")
p.title.text = 'Distance between ride bookings'

palette = ['#440154', '#000000', '#084594', '#AA0000']
colors = palette[:4]

days = sample_taxi_bw_distances['day']
# zip stops at the shorter sequence, so at most len(colors) taxis are drawn.
for taxi_col, line_color in zip(cols, colors):
    p.line(x=days,
           y=sample_taxi_bw_distances[taxi_col],
           line_width=2,
           color=line_color,
           alpha=0.8)

p.xaxis.axis_label = 'Day'
p.yaxis.axis_label = 'Distance (miles)'
output_notebook()


show(p)