import TLC summary statistics data

toddwschneider · Apr 3, 2016 · a21a3a5 · a21a3a5
1 parent f25b888
commit a21a3a5
Show file tree

Hide file tree

Showing 7 changed files with 201 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -63,6 +63,12 @@ These are bundled with the repository, so no need to download separately, but:
 
 [Google BigQuery](https://cloud.google.com/bigquery/) and [Amazon Redshift](https://aws.amazon.com/redshift/) would probably provide significant performance improvements over PostgreSQL. A lot of the data is already available on BigQuery, but in scattered tables, and each trip has only by latitude and longitude coordinates, not census tracts and neighborhoods. PostGIS seemed like the easiest way to map coordinates to census tracts. Once the mapping is complete, it might make sense to load the data back into BigQuery or Redshift to make the analysis faster. Note that BigQuery and Redshift cost some amount of money, while PostgreSQL and PostGIS are free.
 
+## TLC summary statistics
+
+There's a Ruby script in the `tlc_statistics/` folder to import data from the TLC's [summary statistics reports](http://www.nyc.gov/html/tlc/html/about/statistics.shtml):
+
+`ruby import_statistics_data.rb`
+
 ## Questions/issues/contact
 
 todd@toddwschneider.com, or open a GitHub issue
diff --git a/analysis/analysis.R b/analysis/analysis.R
@@ -669,3 +669,76 @@ ggplot(data = payments_split, aes(x = month, y = frac_credit, color = total_amou
   theme(legend.position = "bottom")
 add_credits()
 dev.off()
+
+# analysis of TLC summary reports
+uber_lyft_trips_per_day = tbl_df(query("
+  SELECT
+    dba_category AS category,
+    pickup_end_date AS date,
+    ROUND(SUM(total_dispatched_trips)::numeric / 7) AS trips_per_day
+  FROM fhv_weekly_reports
+  WHERE dba_category IN ('uber', 'lyft')
+    AND pickup_end_date >= '2015-01-08'
+  GROUP BY dba_category, pickup_end_date
+  ORDER BY dba_category, pickup_end_date
+"))
+
+yellow_trips_per_day = tbl_df(query("
+  SELECT
+    month AS date,
+    trips_per_day
+  FROM yellow_monthly_reports
+  ORDER BY month
+")) %>% mutate(category = "yellow")
+
+trips_per_day = bind_rows(yellow_trips_per_day, uber_lyft_trips_per_day) %>%
+  mutate(category = factor(category, levels = c("yellow", "uber", "lyft"), labels = c("Yellow Taxi", "Uber", "Lyft")))
+
+png(filename = "graphs/taxi_uber_lyft_trips_per_day.png", width = 640, height = 520)
+ggplot(data = trips_per_day, aes(x = date, y = trips_per_day, color = category)) +
+  geom_line(size = 1) +
+  scale_y_continuous("Trips per day\n", labels = unit_format("k", 1/1000, "")) +
+  scale_x_date("") +
+  scale_color_manual("", values = c("#F7B731", "#161629", "#E70B81")) +
+  title_with_subtitle("NYC Taxis Losing Market Share to Uber", "Trips per day in NYC, based on TLC summary data") +
+  theme_tws(base_size = 24) +
+  theme(legend.position = "bottom")
+add_credits()
+dev.off()
+
+png(filename = "graphs/lyft_trips_per_day.png", width = 640, height = 520)
+ggplot(data = filter(trips_per_day, category == "Lyft"), aes(x = date, y = trips_per_day)) +
+  geom_line(size = 1.5, color = "#E70B81") +
+  scale_y_continuous("Trips per day\n", labels = comma) +
+  scale_x_date("", labels = date_format("%b '%y"), minor_breaks = date_breaks("1 month")) +
+  title_with_subtitle("Lyft NYC Trips Per Day", "Based on TLC summary data") +
+  expand_limits(y = 0) +
+  theme_tws(base_size = 24) +
+  theme(legend.position = "bottom")
+add_credits()
+dev.off()
+
+uber_lyft_vehicles = tbl_df(query("
+  SELECT
+    dba_category AS category,
+    pickup_end_date AS date,
+    SUM(unique_dispatched_vehicles) AS vehicles
+  FROM fhv_weekly_reports_view
+  WHERE week_number NOT IN (1, 53)
+    AND dba_category IN ('uber', 'lyft')
+  GROUP BY dba_category, pickup_end_date
+  HAVING SUM(unreliable_vehicles_count::int) = 0
+  ORDER BY dba_category, pickup_end_date
+"))
+
+yellow_vehicles = tbl_df(query("
+  SELECT
+    month AS date,
+    trips_per_day,
+    unique_medallions AS vehicles
+  FROM yellow_monthly_reports
+  ORDER BY month
+")) %>% mutate(category = "yellow")
+
+vehicles = bind_rows(uber_lyft_vehicles, yellow_vehicles) %>%
+  mutate(category = factor(category, levels = c("yellow", "uber", "lyft"), labels = c("Yellow Taxi", "Uber", "Lyft")))
diff --git a/analysis/graphs/lyft_trips_per_day.png b/analysis/graphs/lyft_trips_per_day.png
diff --git a/analysis/graphs/taxi_uber_lyft_trips_per_day.png b/analysis/graphs/taxi_uber_lyft_trips_per_day.png
diff --git a/analysis/helpers.R b/analysis/helpers.R
@@ -19,7 +19,7 @@ add_credits = function(fontsize = 12, color = "#777777", xpos = 0.99) {
             x = xpos,
             y = 0.02,
             just = "right",
-            gp = gpar(fontsize = fontsize, col = color))
+            gp = gpar(fontsize = fontsize, col = color, fontfamily = font_family))
 }
 
 title_with_subtitle = function(title, subtitle = "") {

diff --git a/tlc_statistics/create_statistics_tables.sql b/tlc_statistics/create_statistics_tables.sql
@@ -0,0 +1,51 @@
+DROP TABLE IF EXISTS yellow_monthly_reports CASCADE;
+DROP TABLE IF EXISTS fhv_weekly_reports CASCADE;
+
+CREATE TABLE yellow_monthly_reports (
+  month date,
+  trips_per_day integer,
+  farebox_per_day integer,
+  unique_drivers integer,
+  unique_medallions integer,
+  medallions_per_day integer,
+  avg_days_medallions_on_road numeric,
+  avg_hours_per_day_per_medallion numeric,
+  avg_days_drivers_on_road numeric,
+  avg_hours_per_day_per_driver numeric,
+  avg_minutes_per_trip numeric,
+  percent_trips_paid_with_credit_card numeric
+);
+
+CREATE UNIQUE INDEX index_yellow_monthly_uniq ON yellow_monthly_reports (month);
+
+CREATE TABLE fhv_weekly_reports (
+  base_number varchar,
+  wave_number integer,
+  base_name varchar,
+  dba varchar,
+  year integer,
+  week_number integer,
+  pickup_start_date date,
+  pickup_end_date date,
+  total_dispatched_trips integer,
+  unique_dispatched_vehicles integer,
+  dba_category varchar
+);
+
+CREATE UNIQUE INDEX index_fhv_weekly_uniq ON fhv_weekly_reports (base_number, year, week_number);
+CREATE INDEX index_fhv_on_category ON fhv_weekly_reports (dba_category, pickup_end_date);
+
+CREATE VIEW fhv_weekly_reports_intermediate_view AS
+SELECT
+  *,
+  LAG(unique_dispatched_vehicles, 1) OVER (PARTITION BY base_number ORDER BY year, week_number) AS prev_uniq_vehicles,
+  LEAD(unique_dispatched_vehicles, 1) OVER (PARTITION BY base_number ORDER BY year, week_number) AS next_uniq_vehicles
+FROM fhv_weekly_reports;
+
+CREATE VIEW fhv_weekly_reports_view AS
+SELECT
+  *,
+  (unique_dispatched_vehicles > 5
+    AND unique_dispatched_vehicles::numeric / prev_uniq_vehicles > 1.4
+    AND unique_dispatched_vehicles::numeric / next_uniq_vehicles > 1.4) AS unreliable_vehicles_count
+FROM fhv_weekly_reports_intermediate_view;
diff --git a/tlc_statistics/import_statistics_data.rb b/tlc_statistics/import_statistics_data.rb
@@ -0,0 +1,70 @@
+require 'rubygems'
+require 'csv'
+require 'roo'
+require 'rest-client'
+require 'active_support'
+require 'active_support/core_ext'
+
+# yellow taxi data
+yellow_monthly_data_url = "http://www.nyc.gov/html/tlc/downloads/csv/data_reports_monthly_indicators_yellow.csv"
+yellow_monthly_data = CSV.parse(RestClient.get(yellow_monthly_data_url))
+yellow_monthly_data.shift
+
+CSV.open("yellow_monthly_data.csv", "wb") do |csv|
+  yellow_monthly_data.each do |row|
+    csv << [
+      Date.strptime(row[0], "%B-%y").end_of_month,
+      row[1].gsub(",", "").to_i,
+      row[2].gsub(",", "").to_i,
+      row[3].gsub(",", "").to_i,
+      row[4].gsub(",", "").to_i,
+      row[5].gsub(",", "").to_i,
+      row[6].to_f,
+      row[7].to_f,
+      row[8].to_f,
+      row[9].to_f,
+      row[10].presence.try(:to_f),
+      row[11].gsub("%", "").to_f
+    ]
+  end
+end
+
+# FHV weekly data (includes Uber and Lyft)
+fhv_weekly_data_url = "http://www.nyc.gov/html/tlc/downloads/excel/data_reports_weekly_fhv_base.xlsx"
+fhv = Roo::Spreadsheet.open(fhv_weekly_data_url)
+
+header_row_index = (fhv.first_row..fhv.last_row).detect do |i|
+  fhv.row(i).compact.size > 5
+end
+header_row = fhv.row(header_row_index)
+
+column_indexes_to_select = header_row.map.with_index do |name, idx|
+  idx if name.present?
+end.compact
+
+data_row_range = (header_row_index + 1)..fhv.last_row
+
+CSV.open("fhv_weekly_data.csv", "wb") do |csv|
+  data_row_range.each do |i|
+    row = fhv.row(i)
+
+    values = column_indexes_to_select.map { |col_idx| row[col_idx] }
+
+    dba_string = values[3]
+
+    values << if dba_string =~ /^uber/i
+      "uber"
+    elsif dba_string =~ /^lyft/i
+      "lyft"
+    else
+      "other"
+    end
+
+    csv << values
+  end
+end
+
+# create tables and import data
+system(%{psql nyc-taxi-data -f create_statistics_tables.sql})
+system(%{cat yellow_monthly_data.csv | psql nyc-taxi-data -c "COPY yellow_monthly_reports FROM stdin CSV;"})
+system(%{cat fhv_weekly_data.csv | psql nyc-taxi-data -c "COPY fhv_weekly_reports FROM stdin CSV;"})