In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import scipy
import spacy
from collections import Counter
%matplotlib inline
sns.set_style("darkgrid")
colors = ['#651FFF', '#00B0FF', '#1DE9B6', '#00E676', '#FF9100', '#FF3D00']
sns.set_palette(colors)

In [8]:
bus = pd.read_csv('data/mta_1710.csv', low_memory=False, usecols=['RecordedAtTime','DirectionRef','PublishedLineName','OriginName','OriginLat','OriginLong','DestinationName','DestinationLat','DestinationLong','VehicleRef','VehicleLocation.Latitude','VehicleLocation.Longitude',
                                                          'NextStopPointName','ArrivalProximityText','DistanceFromStop','ExpectedArrivalTime','ScheduledArrivalTime'])

In [9]:
bus = bus.dropna()

bus['RecordedAtDateTime'] = pd.to_datetime(bus['RecordedAtTime'])
bus['day_of_week'] = bus['RecordedAtDateTime'].dt.dayofweek
bus['RecordedAtTime'] = bus['RecordedAtDateTime'].dt.time
bus['date'] = bus['RecordedAtDateTime'].dt.date

In [10]:
# DONT RUN THIS TWICE, WILL BREAK THE TIME
bus['ExpectedArrivalTime'] = pd.to_datetime(bus['ExpectedArrivalTime'], errors='coerce', utc=True)
bus['ExpectedArrivalTime'].head()

bus['ExpectedArrivalTime'] = pd.to_datetime(bus['ExpectedArrivalTime'], errors='coerce', utc=True)
sch_hours = pd.DataFrame()
sch_hours['hour'] = bus['ExpectedArrivalTime'].dt.hour
sch_hours['min'] = bus['ExpectedArrivalTime'].dt.minute

sch_hours['time'] = sch_hours['hour'].astype(str).str.cat(sch_hours['min'].astype(str))
sch_hours['time'] = sch_hours['time'].str.replace('.0','')
sch_hours['time'] = pd.to_numeric(sch_hours['time'], errors='coerce')

In [11]:
bus['ExpectedArrivalTime'] = sch_hours['time']

___

In [12]:
# DONT RUN THIS TWICE, WILL BREAK THE TIME
bus['ScheduledArrivalTime'] = pd.to_datetime(bus['ScheduledArrivalTime'], errors='coerce', utc=True)
sch_hours2 = pd.DataFrame()
sch_hours2['hour'] = bus['ScheduledArrivalTime'].dt.hour
sch_hours2['min'] = bus['ScheduledArrivalTime'].dt.minute

sch_hours2['time'] = sch_hours2['hour'].astype(str).str.cat(sch_hours2['min'].astype(str))
sch_hours2['time'] = sch_hours2['time'].str.replace('.0','')
sch_hours2['time'] = pd.to_numeric(sch_hours2['time'], errors='coerce')

In [13]:
bus['ScheduledArrivalTime'] = sch_hours2['time']

___

In [14]:
# DONT RUN THIS TWICE, WILL BREAK THE TIME
rec_hours = pd.DataFrame()
rec_hours['hour'] = bus['RecordedAtDateTime'].dt.hour
rec_hours['min'] = bus['RecordedAtDateTime'].dt.minute

rec_hours['time'] = rec_hours['hour'].astype(str).str.cat(rec_hours['min'].astype(str))
rec_hours['time'] = rec_hours['time'].str.replace('.0','')
rec_hours['time'] = pd.to_numeric(rec_hours['time'], errors='coerce')

In [15]:
bus['RecordedAtTime'] = rec_hours['time']

___

In [16]:
bus = bus.dropna()
bus['delay'] = bus['ScheduledArrivalTime'] - bus['ExpectedArrivalTime']

# Subtract all of the delays greater than two hours by 2360 so as to get them back to a reasonable delay time
bus.delay.loc[bus.delay>120] = 2360 - bus.delay.loc[bus.delay>120]

bus = bus.drop(bus[bus['delay']>60].index)
bus = bus.drop(bus[bus['delay']<-60].index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


___

In [17]:
# https://docs.google.com/spreadsheets/d/1WqLV3EeCMOZwi0oZdYICkuvN86N0S4QGTMvbhS8UU0E/edit#gid=0
bus['on_time'] = np.where((bus['delay'] > 5) | (bus['delay'] < -1), 0, 1)
bus.head(3)

Unnamed: 0,RecordedAtTime,DirectionRef,PublishedLineName,OriginName,OriginLat,OriginLong,DestinationName,DestinationLat,DestinationLong,VehicleRef,...,NextStopPointName,ArrivalProximityText,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime,RecordedAtDateTime,day_of_week,date,delay,on_time
1,3.0,0,Q58,PALMETTO ST/MYRTLE AV,40.700178,-73.910254,FLUSHING MAIN ST,40.757344,-73.829362,NYCT_3967,...,108 ST/OTIS AV,approaching,131,4.0,2333.0,2017-10-01 00:03:32,6,2017-10-01,31.0,0
2,3.0,0,M15,SOUTH ST/WHITEHALL ST,40.701536,-74.012485,EAST HARLEM 125 ST via 1 AV,40.80315,-73.932264,NYCT_5912,...,1 AV/E 77 ST,approaching,135,4.0,2351.0,2017-10-01 00:03:29,6,2017-10-01,13.0,0
4,3.0,0,B6,HARWAY AV/BAY 37 ST,40.59351,-73.993996,EAST NY NEW LOTS STA,40.66642,-73.883387,NYCT_5110,...,FLATLANDS AV/RALPH AV,< 1 stop away,271,4.0,2347.0,2017-10-01 00:03:22,6,2017-10-01,17.0,0


In [39]:
bus_s = bus.sample(frac=1, random_state=1)

ontime = bus_s.loc[bus['on_time']==1][:10000]
offtime = bus_s.loc[bus['on_time']==0][:10000]

bus = pd.concat([ontime, offtime])
bus = bus.sample(frac=1, random_state=40)

___

In [19]:
bus.columns

Index(['RecordedAtTime', 'DirectionRef', 'PublishedLineName', 'OriginName',
       'OriginLat', 'OriginLong', 'DestinationName', 'DestinationLat',
       'DestinationLong', 'VehicleRef', 'VehicleLocation.Latitude',
       'VehicleLocation.Longitude', 'NextStopPointName',
       'ArrivalProximityText', 'DistanceFromStop', 'ExpectedArrivalTime',
       'ScheduledArrivalTime', 'RecordedAtDateTime', 'day_of_week', 'date',
       'delay', 'on_time'],
      dtype='object')

In [33]:
bus['delay_code'] = np.where(bus['on_time']==1, 1,
                            np.where((((-1 > bus.delay) & (bus.delay >= -10))|((5 < bus.delay)& (bus.delay <= 10))), 2,
                                    np.where((((-10 > bus.delay)& (bus.delay >= -15))|((10 < bus.delay) & (bus.delay <= 15))), 3, 4)))
bus.head(2)         

Unnamed: 0,RecordedAtTime,DirectionRef,PublishedLineName,OriginName,OriginLat,OriginLong,DestinationName,DestinationLat,DestinationLong,VehicleRef,...,ArrivalProximityText,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime,RecordedAtDateTime,day_of_week,date,delay,on_time,delay_code
3259508,64.0,0,S44,YUKON AV/KMART DWY,40.575645,-74.16715,ST GEORGE FERRY,40.643743,-74.073214,NYCT_8309,...,< 1 stop away,186,64.0,65.0,2017-10-16 06:04:07,0,2017-10-16,1.0,1,1
5579387,642.0,0,B82,STILLWELL TERMINAL BUS LOOP,40.577079,-73.981296,SPRING CRK TWRS SEAVIEW AV via KINGS HWY,40.642991,-73.878325,NYCT_4835,...,approaching,53,643.0,625.0,2017-10-26 06:42:49,3,2017-10-26,-18.0,0,4


In [44]:
#Import Library
import folium
from folium.plugins import MarkerCluster

lat = bus['VehicleLocation.Latitude']
lon = bus['VehicleLocation.Longitude']
delays = bus['delay']
delay_code = bus['delay_code']

# add color change for map
def color_change(code):
    if(code == 1):
        return('green')
    elif(code == 2):
        return('yellow')
    elif(code == 3):
        return('orange')
    else:
        return('red')

#Create base map
map = folium.Map(location=[40.7590,-73.9845], zoom_start = 13, tiles = "CartoDB dark_matter")

#Create Cluster
marker_cluster = MarkerCluster().add_to(map)

#Add Marker
#Plot Markers
for lat, lon, delays, delay_code in zip(lat, lon, delays, delay_code):
    folium.CircleMarker(location=[lat, lon], radius = 1, popup=str(delays)+" min", fill_color=color_change(delay_code), color=color_change(delay_code), fill_opacity = 0.9).add_to(map)

#Save the map
map.save("map1.html")