### Objective: Manipulate (spatio-temporal) data through SQL

- Citibike ridership data: https://serv.cusp.nyu.edu/~hvo/files/citibike.csv
- Interactive SQL queries using Carto: https://carto.com

In [1]:
# function by @fedhere to import query results

# Default SQL API endpoint used by queryCartoDB (v2 SQL API of the
# "xoxoaseka" carto.com account).
# NOTE(review): the URL already ends in '?q=' while queryCartoDB also sends
# 'q' in the POST body -- presumably the empty URL parameter is ignored by
# the server; confirm before reusing this constant elsewhere.
SQL_SOURCE = 'https://xoxoaseka.carto.com/api/v2/sql?q='

import urllib2
import urllib
import StringIO
import ast
import pandas as pd

def queryCartoDB(query, format='CSV', source=SQL_SOURCE):
    '''Run a SQL query against a carto SQL API endpoint and return the result.

    The query is sent form-encoded in the request body and the response body
    is parsed as comma-separated text.

    Arguments:
    query - string: a valid sql query string
    format - output format requested from the API OPTIONAL (default CSV)
    source - a valid sql api endpoint OPTIONAL (default: module-level SQL_SOURCE)
    Returns:
    the result of the sql query as a pandas DataFrame
    Raises:
    ValueError - when the endpoint answers with an HTTP error; the message
                 carries the error text sent back by the server
    Any other exception (e.g. connection failures) propagates unchanged.
    NOTES:
    designed for the carto API, tested only with CSV return format'''
    import json  # local import: only needed to decode error payloads

    data = urllib.urlencode({'format': format, 'q': query})
    try:
        # supplying `data` makes urllib2 send the request as a POST
        response = urllib2.urlopen(source, data)
    except urllib2.HTTPError as e:
        body = e.read()
        try:
            # the error body is expected to look like {"error": ["msg", ...]};
            # json (unlike ast.literal_eval) also copes with true/false/null
            messages = json.loads(body)['error']
        except (ValueError, KeyError, TypeError):
            # unexpected payload: fall back to the HTTP status and raw body
            # rather than masking the failure with a parsing error
            messages = [str(e), body]
        if not isinstance(messages, list):
            # a bare string would otherwise be joined character by character
            messages = [messages]
        raise ValueError('\n'.join(messages))
    try:
        return pd.read_csv(StringIO.StringIO(response.read()), sep=',')
    finally:
        # release the connection even when CSV parsing fails
        response.close()

In [2]:
# Smoke-test query: every column of every row of the `citibike` table.
test = '''
SELECT *
FROM citibike
'''

In [3]:
# queryCartoDB already parses the CSV response with pd.read_csv and returns
# a DataFrame, so no extra parsing step is needed here.
queryCartoDB(test)

Unnamed: 0,cartodb_id,the_geom,the_geom_webmercator,gender,birth_year,bikeid,end_station_id,start_station_id,tripduration,usertype,end_station_name,start_station_name,end_station_longitude,end_station_latitude,start_station_longitude,start_station_latitude,stoptime,starttime
0,5960,,,1,1965.0,19217,462,435,351,Subscriber,W 22 St & 10 Ave,W 21 St & 6 Ave,-74.004519,40.746920,-73.994156,40.741740,2015-02-01 20:42:00+00,2015-02-01 20:36:00+00
1,9806,,,1,1971.0,17575,488,167,643,Subscriber,W 39 St & 9 Ave,E 39 St & 3 Ave,-73.993722,40.756458,-73.976049,40.748901,2015-02-03 14:50:00+00,2015-02-03 14:39:00+00
2,13113,,,1,1973.0,20313,440,332,1379,Subscriber,E 45 St & 3 Ave,Cherry St,-73.972826,40.752554,-73.979481,40.712199,2015-02-04 07:30:00+00,2015-02-04 07:07:00+00
3,1,,,2,1978.0,17131,423,521,801,Subscriber,W 54 St & 9 Ave,8 Ave & W 31 St,-73.986905,40.765849,-73.994811,40.750450,2015-02-01 00:14:00+00,2015-02-01 00:00:00+00
4,2,,,1,1993.0,21289,504,497,379,Subscriber,1 Ave & E 15 St,E 17 St & Broadway,-73.981656,40.732219,-73.990093,40.737050,2015-02-01 00:07:00+00,2015-02-01 00:00:00+00
5,3,,,2,1969.0,18903,127,281,2474,Subscriber,Barrow St & Hudson St,Grand Army Plaza & Central Park S,-74.006744,40.731724,-73.973715,40.764397,2015-02-01 00:42:00+00,2015-02-01 00:01:00+00
6,4,,,2,1985.0,21044,505,2004,818,Subscriber,6 Ave & W 33 St,6 Ave & Broome St,-73.988484,40.749013,-74.004704,40.724399,2015-02-01 00:15:00+00,2015-02-01 00:01:00+00
7,5,,,1,1957.0,19868,83,323,544,Subscriber,Atlantic Ave & Fort Greene Pl,Lawrence St & Willoughby St,-73.976323,40.683826,-73.986317,40.692362,2015-02-01 00:10:00+00,2015-02-01 00:01:00+00
8,6,,,1,1979.0,15854,2002,373,717,Subscriber,Wythe Ave & Metropolitan Ave,Willoughby Ave & Walworth St,-73.963198,40.716887,-73.953820,40.693317,2015-02-01 00:14:00+00,2015-02-01 00:02:00+00
9,7,,,1,1983.0,15173,504,352,1306,Subscriber,1 Ave & E 15 St,W 56 St & 6 Ave,-73.981656,40.732219,-73.977225,40.763406,2015-02-01 00:26:00+00,2015-02-01 00:04:00+00


### Task 1 — Familiarize with SQL Clauses

1) Sort data by start_station_id, tripduration (only checking trips with duration <= 3 hours)

2) Only show the top/last 10 records (aka head and tail in SQL). List all unique start_station_id values.

3) Aggregation functions:
- Count the number of trips (aka wc -l in SQL) 
- Find the average/min/max trip duration

In [4]:
# Per start station: number of trips and average/min/max trip duration,
# restricted to trips of at most 10800 (3 hours, with tripduration in
# seconds), sorted by ascending start_station_id and cut to 10 rows.
# NOTE(review): GROUP BY start_station_id already yields one row per
# station, so the DISTINCT keyword looks redundant here.
task1 = '''
SELECT DISTINCT start_station_id, 
       COUNT(start_station_id) as trip_counts,
       AVG(tripduration) as average_trip_duration,
       MIN(tripduration) as min_trip_duration,
       MAX(tripduration) as max_trip_duration
FROM citibike
WHERE tripduration <= 10800
GROUP BY start_station_id
ORDER BY start_station_id ASC
LIMIT 10
'''
# show the first 10 records (lowest start_station_id values, i.e. "head")
queryCartoDB(task1)

Unnamed: 0,start_station_id,trip_counts,average_trip_duration,min_trip_duration,max_trip_duration
0,72,114,742.701754,107,2099
1,79,71,659.140845,149,1916
2,82,49,445.346939,138,1581
3,83,35,783.685714,73,2647
4,116,288,543.965278,66,2081
5,119,19,1044.789474,278,2167
6,120,17,979.882353,270,2131
7,127,209,710.84689,83,5776
8,128,247,664.206478,62,2875
9,137,96,697.364583,85,2316


In [5]:
# Same per-station aggregation as task1 (trips of at most 10800, i.e.
# 3 hours in seconds), but sorted by descending start_station_id so that
# LIMIT 10 returns the other end of the list.
# NOTE(review): GROUP BY start_station_id already yields one row per
# station, so the DISTINCT keyword looks redundant here.
task1_1 = '''
SELECT DISTINCT start_station_id, 
       COUNT(start_station_id) as trip_counts,
       AVG(tripduration) as average_trip_duration,
       MIN(tripduration) as min_trip_duration,
       MAX(tripduration) as max_trip_duration
FROM citibike
WHERE tripduration <= 10800
GROUP BY start_station_id
ORDER BY start_station_id DESC
LIMIT 10
'''
# showing the last 10 records (highest start_station_id values, i.e. "tail");
# note that the rows come back in descending order
queryCartoDB(task1_1)

Unnamed: 0,start_station_id,trip_counts,average_trip_duration,min_trip_duration,max_trip_duration
0,3002,184,714.646739,79,7225
1,2023,91,771.978022,108,2454
2,2022,96,979.0625,107,2495
3,2021,141,738.815603,90,6592
4,2017,86,763.383721,109,2476
5,2012,256,714.519531,79,2026
6,2010,76,763.184211,167,2610
7,2009,107,777.794393,60,2765
8,2008,86,832.22093,94,3476
9,2006,79,1130.531646,90,3459


### Task 2 — Working with date/time

1) Selecting trips started on Feb-02-2015 only 

2) Selecting trips started on the weekends (What is the average trip duration during weekends?) 

3) Can we do the same for weekdays?

In [6]:
# Half-open time window [2015-02-02 00:00, 2015-02-03 00:00): the lower
# bound is inclusive and the upper bound exclusive, so exactly the trips
# whose starttime falls on Feb 2, 2015 are selected.
task2_1 = '''
SELECT *
FROM citibike
WHERE starttime >= '2015-02-02 00:00'
AND starttime < '2015-02-03 00:00'
'''
# showing trips started on Feb-02-2015 only
queryCartoDB(task2_1)

Unnamed: 0,cartodb_id,the_geom,the_geom_webmercator,gender,birth_year,bikeid,end_station_id,start_station_id,tripduration,usertype,end_station_name,start_station_name,end_station_longitude,end_station_latitude,start_station_longitude,start_station_latitude,stoptime,starttime
0,6442,,,1,1992.0,20684,489,442,199,Subscriber,10 Ave & W 28 St,W 27 St & 7 Ave,-74.001768,40.750664,-73.993915,40.746647,2015-02-02 00:05:00+00,2015-02-02 00:02:00+00
1,6443,,,2,1964.0,16094,349,326,418,Subscriber,Rivington St & Ridge St,E 11 St & 1 Ave,-73.983299,40.718502,-73.984267,40.729538,2015-02-02 00:10:00+00,2015-02-02 00:03:00+00
2,6444,,,1,1974.0,17362,415,279,276,Subscriber,Pearl St & Hanover Square,Peck Slip & Front St,-74.009260,40.704718,-74.001670,40.707873,2015-02-02 00:09:00+00,2015-02-02 00:04:00+00
3,6445,,,2,1992.0,15475,237,496,420,Subscriber,E 11 St & 2 Ave,E 16 St & 5 Ave,-73.986724,40.730473,-73.992390,40.737262,2015-02-02 00:13:00+00,2015-02-02 00:06:00+00
4,6446,,,1,1968.0,16272,450,477,304,Subscriber,W 49 St & 8 Ave,W 41 St & 8 Ave,-73.987882,40.762272,-73.990026,40.756405,2015-02-02 00:12:00+00,2015-02-02 00:07:00+00
5,6447,,,1,1979.0,16370,391,467,622,Subscriber,Clark St & Henry St,Dean St & 4 Ave,-73.993446,40.697601,-73.978951,40.683125,2015-02-02 00:18:00+00,2015-02-02 00:08:00+00
6,6448,,,1,1963.0,19437,482,434,371,Subscriber,W 15 St & 7 Ave,9 Ave & W 18 St,-73.999318,40.739355,-74.003664,40.743174,2015-02-02 00:15:00+00,2015-02-02 00:09:00+00
7,6449,,,1,1982.0,21211,2008,330,477,Subscriber,Little West St & 1 Pl,Reade St & Broadway,-74.016777,40.705693,-74.005628,40.714505,2015-02-02 00:19:00+00,2015-02-02 00:11:00+00
8,6450,,,2,1978.0,21417,423,521,743,Subscriber,W 54 St & 9 Ave,8 Ave & W 31 St,-73.986905,40.765849,-73.994811,40.750450,2015-02-02 00:25:00+00,2015-02-02 00:12:00+00
9,6451,,,1,1956.0,15722,453,504,599,Subscriber,W 22 St & 8 Ave,1 Ave & E 15 St,-73.999154,40.744751,-73.981656,40.732219,2015-02-02 00:22:00+00,2015-02-02 00:12:00+00


In [7]:
# showing trips started on the weekends
# In PostgreSQL, extract(DOW ...) numbers days from 0 (Sunday) to
# 6 (Saturday), so IN (0,6) keeps Sundays and Saturdays.
# NOTE(review): starttime is stored with a +00 offset; the day of week is
# presumably evaluated in the server's session time zone -- confirm that
# this matches the intended (local) calendar day.
task2_2_1 = '''
SELECT *
FROM citibike
WHERE extract(DOW from starttime) IN (0,6)
'''
queryCartoDB(task2_2_1)

Unnamed: 0,cartodb_id,the_geom,the_geom_webmercator,gender,birth_year,bikeid,end_station_id,start_station_id,tripduration,usertype,end_station_name,start_station_name,end_station_longitude,end_station_latitude,start_station_longitude,start_station_latitude,stoptime,starttime
0,5960,,,1,1965.0,19217,462,435,351,Subscriber,W 22 St & 10 Ave,W 21 St & 6 Ave,-74.004519,40.746920,-73.994156,40.741740,2015-02-01 20:42:00+00,2015-02-01 20:36:00+00
1,1,,,2,1978.0,17131,423,521,801,Subscriber,W 54 St & 9 Ave,8 Ave & W 31 St,-73.986905,40.765849,-73.994811,40.750450,2015-02-01 00:14:00+00,2015-02-01 00:00:00+00
2,2,,,1,1993.0,21289,504,497,379,Subscriber,1 Ave & E 15 St,E 17 St & Broadway,-73.981656,40.732219,-73.990093,40.737050,2015-02-01 00:07:00+00,2015-02-01 00:00:00+00
3,3,,,2,1969.0,18903,127,281,2474,Subscriber,Barrow St & Hudson St,Grand Army Plaza & Central Park S,-74.006744,40.731724,-73.973715,40.764397,2015-02-01 00:42:00+00,2015-02-01 00:01:00+00
4,4,,,2,1985.0,21044,505,2004,818,Subscriber,6 Ave & W 33 St,6 Ave & Broome St,-73.988484,40.749013,-74.004704,40.724399,2015-02-01 00:15:00+00,2015-02-01 00:01:00+00
5,5,,,1,1957.0,19868,83,323,544,Subscriber,Atlantic Ave & Fort Greene Pl,Lawrence St & Willoughby St,-73.976323,40.683826,-73.986317,40.692362,2015-02-01 00:10:00+00,2015-02-01 00:01:00+00
6,6,,,1,1979.0,15854,2002,373,717,Subscriber,Wythe Ave & Metropolitan Ave,Willoughby Ave & Walworth St,-73.963198,40.716887,-73.953820,40.693317,2015-02-01 00:14:00+00,2015-02-01 00:02:00+00
7,7,,,1,1983.0,15173,504,352,1306,Subscriber,1 Ave & E 15 St,W 56 St & 6 Ave,-73.981656,40.732219,-73.977225,40.763406,2015-02-01 00:26:00+00,2015-02-01 00:04:00+00
8,8,,,1,1955.0,17862,116,439,913,Subscriber,W 17 St & 8 Ave,E 4 St & 2 Ave,-74.001497,40.741776,-73.989780,40.726281,2015-02-01 00:19:00+00,2015-02-01 00:04:00+00
9,9,,,2,1985.0,21183,2012,335,759,Subscriber,E 27 St & 1 Ave,Washington Pl & Broadway,-73.976806,40.739445,-73.994046,40.729039,2015-02-01 00:17:00+00,2015-02-01 00:04:00+00


In [8]:
# showing average trip duration during weekends
task2_2_2 = '''
SELECT AVG(tripduration)
FROM citibike
WHERE extract(DOW from starttime) IN (0,6) 
'''
queryCartoDB(task2_2_2)

Unnamed: 0,avg,Unnamed: 1
0,662.942181,


In [9]:
# showing trips started on the weekdays
task2_3_1 = '''
SELECT *
FROM citibike
WHERE extract(DOW from starttime) IN (1, 2, 3, 4, 5)
'''
queryCartoDB(task2_3_1)

Unnamed: 0,cartodb_id,the_geom,the_geom_webmercator,gender,birth_year,bikeid,end_station_id,start_station_id,tripduration,usertype,end_station_name,start_station_name,end_station_longitude,end_station_latitude,start_station_longitude,start_station_latitude,stoptime,starttime
0,9806,,,1,1971.0,17575,488,167,643,Subscriber,W 39 St & 9 Ave,E 39 St & 3 Ave,-73.993722,40.756458,-73.976049,40.748901,2015-02-03 14:50:00+00,2015-02-03 14:39:00+00
1,13113,,,1,1973.0,20313,440,332,1379,Subscriber,E 45 St & 3 Ave,Cherry St,-73.972826,40.752554,-73.979481,40.712199,2015-02-04 07:30:00+00,2015-02-04 07:07:00+00
2,6442,,,1,1992.0,20684,489,442,199,Subscriber,10 Ave & W 28 St,W 27 St & 7 Ave,-74.001768,40.750664,-73.993915,40.746647,2015-02-02 00:05:00+00,2015-02-02 00:02:00+00
3,6443,,,2,1964.0,16094,349,326,418,Subscriber,Rivington St & Ridge St,E 11 St & 1 Ave,-73.983299,40.718502,-73.984267,40.729538,2015-02-02 00:10:00+00,2015-02-02 00:03:00+00
4,6444,,,1,1974.0,17362,415,279,276,Subscriber,Pearl St & Hanover Square,Peck Slip & Front St,-74.009260,40.704718,-74.001670,40.707873,2015-02-02 00:09:00+00,2015-02-02 00:04:00+00
5,6445,,,2,1992.0,15475,237,496,420,Subscriber,E 11 St & 2 Ave,E 16 St & 5 Ave,-73.986724,40.730473,-73.992390,40.737262,2015-02-02 00:13:00+00,2015-02-02 00:06:00+00
6,6446,,,1,1968.0,16272,450,477,304,Subscriber,W 49 St & 8 Ave,W 41 St & 8 Ave,-73.987882,40.762272,-73.990026,40.756405,2015-02-02 00:12:00+00,2015-02-02 00:07:00+00
7,6447,,,1,1979.0,16370,391,467,622,Subscriber,Clark St & Henry St,Dean St & 4 Ave,-73.993446,40.697601,-73.978951,40.683125,2015-02-02 00:18:00+00,2015-02-02 00:08:00+00
8,6448,,,1,1963.0,19437,482,434,371,Subscriber,W 15 St & 7 Ave,9 Ave & W 18 St,-73.999318,40.739355,-74.003664,40.743174,2015-02-02 00:15:00+00,2015-02-02 00:09:00+00
9,6449,,,1,1982.0,21211,2008,330,477,Subscriber,Little West St & 1 Pl,Reade St & Broadway,-74.016777,40.705693,-74.005628,40.714505,2015-02-02 00:19:00+00,2015-02-02 00:11:00+00


In [10]:
# showing average trip duration during weekdays
# Same Monday..Friday filter as above (DOW 1..5); the result is a single
# row whose only column, `avg`, is the mean tripduration.
task2_3_2 = '''
SELECT AVG(tripduration)
FROM citibike
WHERE extract(DOW from starttime) IN (1, 2, 3, 4, 5)
'''
queryCartoDB(task2_3_2)

Unnamed: 0,avg,Unnamed: 1
0,681.052292,


### Task 3 — Working with Space
1) Showing the list of start station locations (using GROUP BY)

2) Showing the number of trips started per station (only for stations within 500m of Times Square! The coordinates of Times Square are (40.7577,-73.9857))

In [11]:
# GROUP BY is used purely for de-duplication here: no aggregate is
# computed, so each distinct (latitude, longitude) pair is returned once.
task3_1 = '''
SELECT start_station_latitude, start_station_longitude 
FROM citibike
GROUP BY start_station_latitude, start_station_longitude
'''
# showing start station locations (one row per distinct coordinate pair)
queryCartoDB(task3_1)

Unnamed: 0,start_station_latitude,start_station_longitude
0,40.686919,-73.976682
1,40.753202,-73.977987
2,40.715422,-74.011220
3,40.745497,-74.001971
4,40.744751,-73.999154
5,40.742065,-74.004432
6,40.740964,-73.986022
7,40.763406,-73.977225
8,40.716059,-73.991908
9,40.714948,-74.002345


In [12]:
# Trips per start station, limited to stations near Times Square.
# - CDB_LatLng(lat, lng) builds a point from the station coordinates; the
#   ::geography cast makes ST_DWithin measure the 500 threshold in meters.
# - (40.7577, -73.9857) is the Times Square location given in the task.
# - the_geom_webmercator / MIN(cartodb_id) are presumably included so that
#   the result can also be drawn as a Carto map layer -- TODO confirm.
# - ORDER BY number_of_trips has no direction, i.e. ascending order.
task3_2 = '''
SELECT start_station_name, 
       CDB_TransformToWebmercator(CDB_LatLng(start_station_latitude, start_station_longitude)) as the_geom_webmercator,
       MIN(cartodb_id) as cartodb_id,
       COUNT(tripduration) as number_of_trips
FROM citibike
WHERE ST_DWithin(CDB_LatLng(start_station_latitude, start_station_longitude)::geography, 
                 CDB_LatLng(40.7577,-73.9857)::geography, 500)
GROUP BY start_station_latitude, start_station_longitude, start_station_name
ORDER BY number_of_trips
'''
# showing the number of trips started per station located within 500m of Times Square
queryCartoDB(task3_2)

Unnamed: 0,start_station_name,the_geom_webmercator,cartodb_id,number_of_trips
0,W 43 St & 6 Ave,0101000020110F0000F22A632FBE6A5FC1363A28CFADFB...,1115,112
1,W 45 St & 6 Ave,0101000020110F00009BB87D02B76A5FC146C144E9E5FB...,19,141
2,W 45 St & 8 Ave,0101000020110F00001607D538556B5FC119A6CA6F41FC...,124,141
3,Broadway & W 49 St,0101000020110F000020D6742CE16A5FC112F4D73D73FC...,42,213
4,W 42 St & 8 Ave,0101000020110F00003F9300B2976B5FC1B8B88F3102FC...,783,221
5,Broadway & W 41 St,0101000020110F000004AF971C1D6B5FC17A48F3C1A8FB...,54,251
6,W 41 St & 8 Ave,0101000020110F0000F55695027D6B5FC186E5BB69D7FB...,33,507


### Task 4 — Putting it all together
1) Find the station that had the longest average trip duration during weekends and is within 500m of Times Square!

2) Extra: create lines for trips that started from stations within 500m of Times Square and lasted less than 2 hours. The number of trips for each pair of stations is output as an attribute of these lines.

In [13]:
# Combines the spatial filter (start station within 500 m of Times Square,
# measured on ::geography) with the weekend filter (DOW 0 = Sunday,
# 6 = Saturday) and averages tripduration per start station.
# Rows are sorted by descending average, so the first row is the station
# with the longest average weekend trip duration.
task4_1 = '''
SELECT start_station_name,
       CDB_TransformToWebmercator(CDB_LatLng(start_station_latitude, start_station_longitude)) as the_geom_webmercator,
       MIN(cartodb_id) as cartodb_id,
       AVG(tripduration) as average_trip_duration
FROM citibike
WHERE ST_DWithin(CDB_LatLng(start_station_latitude, start_station_longitude)::geography, 
                 CDB_LatLng(40.7577,-73.9857)::geography, 500)
      AND extract(DOW from starttime) IN (0,6)
GROUP BY start_station_latitude, start_station_longitude, start_station_name
ORDER BY average_trip_duration DESC
'''
# showing all stations within 500m of Times Square ranked by average weekend
# trip duration, longest first (the query has no LIMIT; the top row answers the task)
queryCartoDB(task4_1)

Unnamed: 0,start_station_name,the_geom_webmercator,cartodb_id,average_trip_duration
0,Broadway & W 49 St,0101000020110F000020D6742CE16A5FC112F4D73D73FC...,42,1010.104167
1,W 45 St & 8 Ave,0101000020110F00001607D538556B5FC119A6CA6F41FC...,124,762.931818
2,Broadway & W 41 St,0101000020110F000004AF971C1D6B5FC17A48F3C1A8FB...,54,683.121212
3,W 45 St & 6 Ave,0101000020110F00009BB87D02B76A5FC146C144E9E5FB...,19,675.4
4,W 41 St & 8 Ave,0101000020110F0000F55695027D6B5FC186E5BB69D7FB...,33,643.260274
5,W 43 St & 6 Ave,0101000020110F0000F22A632FBE6A5FC1363A28CFADFB...,1115,629.7
6,W 42 St & 8 Ave,0101000020110F00003F9300B2976B5FC1B8B88F3102FC...,783,579.142857


In [14]:
# One row per (start station, end station) pair for trips that start within
# 500 m of Times Square and have tripduration < 7200 (2 hours in seconds).
# ST_Makeline joins the start and end station points (both transformed to
# web mercator) into a line; trip_counts is the number of trips in the pair.
# Rows are sorted by descending trip_counts (busiest pairs first).
task4_2 = '''
SELECT start_station_name, end_station_name,
       ST_Makeline(CDB_TransformToWebmercator(CDB_LatLng(start_station_latitude, start_station_longitude)),
                   CDB_TransformToWebmercator(CDB_LatLng(end_station_latitude, end_station_longitude))) as the_geom_webmercator,
       MIN(cartodb_id) as cartodb_id,
       COUNT(start_station_id) as trip_counts
FROM citibike
WHERE ST_DWithin(CDB_LatLng(start_station_latitude, start_station_longitude)::geography, 
                 CDB_LatLng(40.7577,-73.9857)::geography, 500)
      AND tripduration < 7200
GROUP BY start_station_latitude, start_station_longitude, start_station_name, end_station_latitude, end_station_longitude, end_station_name
ORDER BY trip_counts DESC
'''
# showing lines for trips that started from stations within 500m of Times Square and lasted less than 2 hours.
# the number of trips for each pair of stations is output as an attribute of these lines.
queryCartoDB(task4_2)

Unnamed: 0,start_station_name,end_station_name,the_geom_webmercator,cartodb_id,trip_counts
0,W 41 St & 8 Ave,E 43 St & Vanderbilt Ave,0102000020110F000002000000F55695027D6B5FC186E5...,13031,31
1,W 41 St & 8 Ave,E 47 St & Park Ave,0102000020110F000002000000F55695027D6B5FC186E5...,8025,21
2,W 41 St & 8 Ave,Pershing Square South,0102000020110F000002000000F55695027D6B5FC186E5...,6691,17
3,W 42 St & 8 Ave,11 Ave & W 41 St,0102000020110F0000020000003F9300B2976B5FC1B8B8...,1666,17
4,W 41 St & 8 Ave,Pershing Square North,0102000020110F000002000000F55695027D6B5FC186E5...,6527,15
5,Broadway & W 49 St,E 43 St & Vanderbilt Ave,0102000020110F00000200000020D6742CE16A5FC112F4...,10585,15
6,Broadway & W 41 St,W 33 St & 7 Ave,0102000020110F00000200000004AF971C1D6B5FC17A48...,18883,14
7,W 45 St & 6 Ave,W 33 St & 7 Ave,0102000020110F0000020000009BB87D02B76A5FC146C1...,1663,14
8,W 43 St & 6 Ave,E 43 St & Vanderbilt Ave,0102000020110F000002000000F22A632FBE6A5FC1363A...,10984,14
9,W 41 St & 8 Ave,E 40 St & 5 Ave,0102000020110F000002000000F55695027D6B5FC186E5...,7964,14
