In [5]:
library(tidyverse)
library(dplyr)
library(ggplot2)
library(ggthemes)
library(leaflet)

"package 'leaflet' was built under R version 3.6.3"

In [13]:
# Load the GTFS feed tables directly from the course repository.
# The source locations live in one named vector so each table's origin is
# easy to find and change.
gtfs_urls <- c(
  trips = "https://raw.githubusercontent.com/sumusa/Data-Visualization-Course/master/Client%20Work/gtfs%20data/trips.txt",
  shapes = "https://raw.githubusercontent.com/sumusa/Data-Visualization-Course/master/Client%20Work/gtfs%20data/shapes.txt",
  routes = "https://raw.githubusercontent.com/sumusa/Data-Visualization-Course/master/Client%20Work/gtfs%20data/routes.txt",
  stop_times = "https://raw.githubusercontent.com/sumusa/Data-Visualization-Course/master/Client%20Work/gtfs%20data/stop_times.txt"
)

trips <- read.csv(gtfs_urls[["trips"]])
shapes <- read.csv(gtfs_urls[["shapes"]])
routes <- read.csv(gtfs_urls[["routes"]])

# The two time columns are forced to character so they are kept as raw text
# for the hour extraction done later in the notebook.
stop_times <- read.csv(
  gtfs_urls[["stop_times"]],
  colClasses = c("arrival_time" = "character", "departure_time" = "character")
)

In [14]:
# Preview the first six rows of the raw stop_times table.
head(stop_times, n = 6L)

trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
178899,23:30:00,23:30:00,1000,1,,0,0,,1
178899,23:31:34,23:31:34,1010,2,,0,0,0.7651,0
178899,23:31:54,23:31:54,1015,3,,0,0,0.9339,0
178899,23:32:31,23:32:31,1020,4,,0,0,1.2333,0
178899,23:33:05,23:33:05,1025,5,,0,0,1.5145,0
178899,23:33:32,23:33:32,1030,6,,0,0,1.7336,0


In [15]:
# Attach trip and route attributes to every stop time, then keep only the
# columns used by the rest of the analysis.
# The join keys are spelled out so that a column which happens to share a name
# across tables can never silently become part of the key, and so the cell no
# longer relies on dplyr's "Joining, by = ..." guess.
stop_times <- stop_times %>%
  left_join(trips, by = "trip_id") %>%
  left_join(routes, by = "route_id") %>%
  select(
    route_id, route_short_name, trip_id, stop_id, service_id, arrival_time,
    departure_time, direction_id, shape_id, stop_sequence
  )

Joining, by = "trip_id"
Joining, by = "route_id"


In [19]:
# Preview the first ten rows of the joined table.
head(stop_times, n = 10)

route_id,route_short_name,trip_id,stop_id,service_id,arrival_time,departure_time,direction_id,shape_id,stop_sequence
1,1,178899,1000,1,23:30:00,23:30:00,0,14128,1
1,1,178899,1010,1,23:31:34,23:31:34,0,14128,2
1,1,178899,1015,1,23:31:54,23:31:54,0,14128,3
1,1,178899,1020,1,23:32:31,23:32:31,0,14128,4
1,1,178899,1025,1,23:33:05,23:33:05,0,14128,5
1,1,178899,1030,1,23:33:32,23:33:32,0,14128,6
1,1,178899,1035,1,23:33:56,23:33:56,0,14128,7
1,1,178899,1040,1,23:34:11,23:34:11,0,14128,8
1,1,178899,1045,1,23:34:36,23:34:36,0,14128,9
1,1,178899,1050,1,23:35:09,23:35:09,0,14128,10


In [20]:
# Rank the service_ids by the number of trips each one has, busiest first.
# count() does its own grouping, so a preceding group_by() is unnecessary and
# would only leave the result grouped; sort = TRUE orders by descending n.
trips %>%
  count(service_id, sort = TRUE)

service_id,n
5,741
1,534
2,330


In [21]:
# One-row table holding the service_id with the most trips (and its count n).
# count() groups internally and returns an ungrouped result, so head(1) is
# simply the top row of the sorted table.
bigger_service <- trips %>%
  count(service_id, sort = TRUE) %>%
  head(1)

In [22]:
# Keep only rows that are the first stop of a trip (stop_sequence 1), run in
# direction 0, and belong to the service_id stored in bigger_service.
# Separate filter() arguments are combined with a logical AND.
stop_times <- filter(
  stop_times,
  stop_sequence == 1,
  direction_id == 0,
  service_id == bigger_service[["service_id"]]
)

head(stop_times, n = 6L)

route_id,route_short_name,trip_id,stop_id,service_id,arrival_time,departure_time,direction_id,shape_id,stop_sequence
1,1,178854,1000,5,8:15:00,8:15:00,0,14127,1
1,1,178855,1000,5,8:00:00,8:00:00,0,14126,1
1,1,178856,1000,5,7:45:00,7:45:00,0,14127,1
1,1,178857,1000,5,7:30:00,7:30:00,0,14126,1
1,1,178858,1000,5,7:00:00,7:00:00,0,14125,1
1,1,178859,1000,5,6:30:00,6:30:00,0,14125,1


In [23]:
# Reduce the "H:MM:SS" clock strings to the hour of day (0-23) they fall in.
#
# The hour is taken as everything before the first colon instead of a
# fixed-width substr(x, 1, 2): with an un-padded hour such as "8:15:00" the
# first two characters are "8:", which cannot be converted and becomes NA.
# Surrounding whitespace is trimmed first. Hours of 24 and above (trips that
# run past midnight) are wrapped back onto a 24-hour clock with %%, which
# matches the former "subtract 24" rule for 24-47 and also covers 48+.
# Blank times come out as NA.
gtfs_hour <- function(clock) {
  hour_text <- sub(":.*$", "", trimws(clock))
  as.integer(hour_text) %% 24L
}

stop_times <- stop_times %>%
  mutate(
    arrival_time = gtfs_hour(arrival_time),
    departure_time = gtfs_hour(departure_time)
  )
head(stop_times)

"NAs introduced by coercion"

route_id,route_short_name,trip_id,stop_id,service_id,arrival_time,departure_time,direction_id,shape_id,stop_sequence
1,1,178854,1000,5,8,8,0,14127,1
1,1,178855,1000,5,8,8,0,14126,1
1,1,178856,1000,5,7,7,0,14127,1
1,1,178857,1000,5,7,7,0,14126,1
1,1,178858,1000,5,7,7,0,14125,1
1,1,178859,1000,5,6,6,0,14125,1


In [25]:
# Count the rows per route for each arrival hour (arrival_time holds the hour
# at this point), i.e. the number of trips per hour.
# count() groups internally and returns an ungrouped table, replacing the
# superseded group_by_at(vars(...)) and avoiding leftover groups in
# output_data. time_window is a display label such as "6:00".
output_data <- stop_times %>%
  count(route_id, route_short_name, arrival_time) %>%
  mutate(time_window = paste0(arrival_time, ":00")) %>%
  select(route_id, route_short_name, arrival_time, time_window, n)
head(output_data)

route_id,route_short_name,arrival_time,time_window,n
1,1,6,6:00,1
1,1,7,7:00,3
1,1,8,8:00,3
1,1,9,9:00,2
1,1,10,10:00,2
1,1,11,11:00,1


In [26]:
# Save the hourly trip counts to the working directory.
# NOTE(review): write.csv() also writes row names as a leading column by
# default - confirm whether downstream consumers expect that column.
write.csv(x = output_data, file = "trips_per_hour.csv")

In [30]:
# Hourly trip counts for route 1 only; the bare name on the last line
# displays the result in the notebook.
line <- filter(output_data, route_id == "1")
line

route_id,route_short_name,arrival_time,time_window,n
1,1,6,6:00,1
1,1,7,7:00,3
1,1,8,8:00,3
1,1,9,9:00,2
1,1,10,10:00,2
1,1,11,11:00,1
1,1,12,12:00,2
1,1,13,13:00,2
1,1,14,14:00,3
1,1,15,15:00,4


In [31]:
# Convert time_window to a factor whose levels follow the order in which the
# labels first appear, rather than the default alphabetical order of levels.
window_labels <- line[["time_window"]]
line[["time_window"]] <- factor(window_labels, levels = unique(window_labels))

also installing the dependencies 'XML', 'rlist', 'igraph', 'rjson'



package 'XML' successfully unpacked and MD5 sums checked
package 'rlist' successfully unpacked and MD5 sums checked
package 'igraph' successfully unpacked and MD5 sums checked
package 'rjson' successfully unpacked and MD5 sums checked
package 'highcharter' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\Sumayyah Musa\AppData\Local\Temp\Rtmp8Y0IjM\downloaded_packages
