Skip to content

Commit

Permalink
import TLC summary statistics data
Browse files Browse the repository at this point in the history
  • Loading branch information
toddwschneider committed Apr 3, 2016
1 parent f25b888 commit a21a3a5
Show file tree
Hide file tree
Showing 7 changed files with 201 additions and 1 deletion.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ These are bundled with the repository, so no need to download separately, but:

[Google BigQuery](https://cloud.google.com/bigquery/) and [Amazon Redshift](https://aws.amazon.com/redshift/) would probably provide significant performance improvements over PostgreSQL. A lot of the data is already available on BigQuery, but in scattered tables, and each trip has only by latitude and longitude coordinates, not census tracts and neighborhoods. PostGIS seemed like the easiest way to map coordinates to census tracts. Once the mapping is complete, it might make sense to load the data back into BigQuery or Redshift to make the analysis faster. Note that BigQuery and Redshift cost some amount of money, while PostgreSQL and PostGIS are free.

## TLC summary statistics

There's a Ruby script in the `tlc_statistics/` folder to import data from the TLC's [summary statistics reports](http://www.nyc.gov/html/tlc/html/about/statistics.shtml):

`ruby import_statistics_data.rb`

## Questions/issues/contact

todd@toddwschneider.com, or open a GitHub issue
73 changes: 73 additions & 0 deletions analysis/analysis.R
Original file line number Diff line number Diff line change
Expand Up @@ -669,3 +669,76 @@ ggplot(data = payments_split, aes(x = month, y = frac_credit, color = total_amou
theme(legend.position = "bottom")
add_credits()
dev.off()

# analysis of TLC summary reports
uber_lyft_trips_per_day = tbl_df(query("
SELECT
dba_category AS category,
pickup_end_date AS date,
ROUND(SUM(total_dispatched_trips)::numeric / 7) AS trips_per_day
FROM fhv_weekly_reports
WHERE dba_category IN ('uber', 'lyft')
AND pickup_end_date >= '2015-01-08'
GROUP BY dba_category, pickup_end_date
ORDER BY dba_category, pickup_end_date
"))

yellow_trips_per_day = tbl_df(query("
SELECT
month AS date,
trips_per_day
FROM yellow_monthly_reports
ORDER BY month
")) %>% mutate(category = "yellow")

trips_per_day = bind_rows(yellow_trips_per_day, uber_lyft_trips_per_day) %>%
mutate(category = factor(category, levels = c("yellow", "uber", "lyft"), labels = c("Yellow Taxi", "Uber", "Lyft")))

png(filename = "graphs/taxi_uber_lyft_trips_per_day.png", width = 640, height = 520)
ggplot(data = trips_per_day, aes(x = date, y = trips_per_day, color = category)) +
geom_line(size = 1) +
scale_y_continuous("Trips per day\n", labels = unit_format("k", 1/1000, "")) +
scale_x_date("") +
scale_color_manual("", values = c("#F7B731", "#161629", "#E70B81")) +
title_with_subtitle("NYC Taxis Losing Market Share to Uber", "Trips per day in NYC, based on TLC summary data") +
theme_tws(base_size = 24) +
theme(legend.position = "bottom")
add_credits()
dev.off()

png(filename = "graphs/lyft_trips_per_day.png", width = 640, height = 520)
ggplot(data = filter(trips_per_day, category == "Lyft"), aes(x = date, y = trips_per_day)) +
geom_line(size = 1.5, color = "#E70B81") +
scale_y_continuous("Trips per day\n", labels = comma) +
scale_x_date("", labels = date_format("%b '%y"), minor_breaks = date_breaks("1 month")) +
title_with_subtitle("Lyft NYC Trips Per Day", "Based on TLC summary data") +
expand_limits(y = 0) +
theme_tws(base_size = 24) +
theme(legend.position = "bottom")
add_credits()
dev.off()

uber_lyft_vehicles = tbl_df(query("
SELECT
dba_category AS category,
pickup_end_date AS date,
SUM(unique_dispatched_vehicles) AS vehicles
FROM fhv_weekly_reports_view
WHERE week_number NOT IN (1, 53)
AND dba_category IN ('uber', 'lyft')
GROUP BY dba_category, pickup_end_date
HAVING SUM(unreliable_vehicles_count::int) = 0
ORDER BY dba_category, pickup_end_date
"))

yellow_vehicles = tbl_df(query("
SELECT
month AS date,
trips_per_day,
unique_medallions AS vehicles
FROM yellow_monthly_reports
ORDER BY month
")) %>% mutate(category = "yellow")

vehicles = bind_rows(uber_lyft_vehicles, yellow_vehicles) %>%
mutate(category = factor(category, levels = c("yellow", "uber", "lyft"), labels = c("Yellow Taxi", "Uber", "Lyft")))
Binary file added analysis/graphs/lyft_trips_per_day.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added analysis/graphs/taxi_uber_lyft_trips_per_day.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion analysis/helpers.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ add_credits = function(fontsize = 12, color = "#777777", xpos = 0.99) {
x = xpos,
y = 0.02,
just = "right",
gp = gpar(fontsize = fontsize, col = color))
gp = gpar(fontsize = fontsize, col = color, fontfamily = font_family))
}

title_with_subtitle = function(title, subtitle = "") {
Expand Down
51 changes: 51 additions & 0 deletions tlc_statistics/create_statistics_tables.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
DROP TABLE IF EXISTS yellow_monthly_reports CASCADE;
DROP TABLE IF EXISTS fhv_weekly_reports CASCADE;

CREATE TABLE yellow_monthly_reports (
month date,
trips_per_day integer,
farebox_per_day integer,
unique_drivers integer,
unique_medallions integer,
medallions_per_day integer,
avg_days_medallions_on_road numeric,
avg_hours_per_day_per_medallion numeric,
avg_days_drivers_on_road numeric,
avg_hours_per_day_per_driver numeric,
avg_minutes_per_trip numeric,
percent_trips_paid_with_credit_card numeric
);

CREATE UNIQUE INDEX index_yellow_monthly_uniq ON yellow_monthly_reports (month);

CREATE TABLE fhv_weekly_reports (
base_number varchar,
wave_number integer,
base_name varchar,
dba varchar,
year integer,
week_number integer,
pickup_start_date date,
pickup_end_date date,
total_dispatched_trips integer,
unique_dispatched_vehicles integer,
dba_category varchar
);

CREATE UNIQUE INDEX index_fhv_weekly_uniq ON fhv_weekly_reports (base_number, year, week_number);
CREATE INDEX index_fhv_on_category ON fhv_weekly_reports (dba_category, pickup_end_date);

CREATE VIEW fhv_weekly_reports_intermediate_view AS
SELECT
*,
LAG(unique_dispatched_vehicles, 1) OVER (PARTITION BY base_number ORDER BY year, week_number) AS prev_uniq_vehicles,
LEAD(unique_dispatched_vehicles, 1) OVER (PARTITION BY base_number ORDER BY year, week_number) AS next_uniq_vehicles
FROM fhv_weekly_reports;

CREATE VIEW fhv_weekly_reports_view AS
SELECT
*,
(unique_dispatched_vehicles > 5
AND unique_dispatched_vehicles::numeric / prev_uniq_vehicles > 1.4
AND unique_dispatched_vehicles::numeric / next_uniq_vehicles > 1.4) AS unreliable_vehicles_count
FROM fhv_weekly_reports_intermediate_view;
70 changes: 70 additions & 0 deletions tlc_statistics/import_statistics_data.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
require 'rubygems'
require 'csv'
require 'roo'
require 'rest-client'
require 'active_support'
require 'active_support/core_ext'

# yellow taxi data
yellow_monthly_data_url = "http://www.nyc.gov/html/tlc/downloads/csv/data_reports_monthly_indicators_yellow.csv"
yellow_monthly_data = CSV.parse(RestClient.get(yellow_monthly_data_url))
yellow_monthly_data.shift

CSV.open("yellow_monthly_data.csv", "wb") do |csv|
yellow_monthly_data.each do |row|
csv << [
Date.strptime(row[0], "%B-%y").end_of_month,
row[1].gsub(",", "").to_i,
row[2].gsub(",", "").to_i,
row[3].gsub(",", "").to_i,
row[4].gsub(",", "").to_i,
row[5].gsub(",", "").to_i,
row[6].to_f,
row[7].to_f,
row[8].to_f,
row[9].to_f,
row[10].presence.try(:to_f),
row[11].gsub("%", "").to_f
]
end
end

# FHV weekly data (includes Uber and Lyft)
fhv_weekly_data_url = "http://www.nyc.gov/html/tlc/downloads/excel/data_reports_weekly_fhv_base.xlsx"
fhv = Roo::Spreadsheet.open(fhv_weekly_data_url)

header_row_index = (fhv.first_row..fhv.last_row).detect do |i|
fhv.row(i).compact.size > 5
end
header_row = fhv.row(header_row_index)

column_indexes_to_select = header_row.map.with_index do |name, idx|
idx if name.present?
end.compact

data_row_range = (header_row_index + 1)..fhv.last_row

CSV.open("fhv_weekly_data.csv", "wb") do |csv|
data_row_range.each do |i|
row = fhv.row(i)

values = column_indexes_to_select.map { |col_idx| row[col_idx] }

dba_string = values[3]

values << if dba_string =~ /^uber/i
"uber"
elsif dba_string =~ /^lyft/i
"lyft"
else
"other"
end

csv << values
end
end

# create tables and import data
system(%{psql nyc-taxi-data -f create_statistics_tables.sql})
system(%{cat yellow_monthly_data.csv | psql nyc-taxi-data -c "COPY yellow_monthly_reports FROM stdin CSV;"})
system(%{cat fhv_weekly_data.csv | psql nyc-taxi-data -c "COPY fhv_weekly_reports FROM stdin CSV;"})

0 comments on commit a21a3a5

Please sign in to comment.