# Download Citibike trip data from January 2016 through March 2018

Citibike trip data source: https://www.citibikenyc.com/system-data

In [1]:
import os
from os.path import join
import pandas as pd
from tqdm import tqdm

In [28]:
nyc_citibike_urls = [
    "https://s3.amazonaws.com/tripdata/201601-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201602-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201603-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201604-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201605-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201606-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201607-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201608-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201609-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201610-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201611-citibike-tripdata.zip",
    "https://s3.amazonaws.com/tripdata/201612-citibike-tripdata.zip",    
    "https://s3.amazonaws.com/tripdata/201701-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201702-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201703-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201704-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201705-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201706-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201707-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201708-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201709-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201710-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201711-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201712-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201801_citibikenyc_tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201802_citibikenyc_tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201803_citibikenyc_tripdata.csv.zip"
]

jerseycity_citibike_urls = [
    "https://s3.amazonaws.com/tripdata/JC-201601-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201602-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201603-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201604-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201605-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201606-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201607-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201608-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201609-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201610-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201611-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201612-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201701-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201702-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201703-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201704-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201705-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201706-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201707-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201708-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201709-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201710-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201711-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/JC-201712-citibike-tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201801_citibikejc_tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201802_citibikejc_tripdata.csv.zip",
    "https://s3.amazonaws.com/tripdata/201803_citibikejc_tripdata.csv.zip",
]

In [18]:
def download_citibike_trips(url, city, outfildir):
    """Download one monthly Citibike trip archive, clean it, and save it as CSV.

    Parameters
    ----------
    url : str
        Location of the zipped monthly trip CSV (anything ``pd.read_csv``
        accepts). The file name must start with the ``YYYYMM`` month,
        optionally preceded by a ``JC-`` prefix.
    city : str
        Label written to the ``city`` column and used in the output file
        name, e.g. ``"NYC"`` or ``"JC"`` (Jersey City).
    outfildir : str
        Directory the cleaned CSV is written to. The parameter spelling is
        kept as-is so existing callers keep working.

    Returns
    -------
    None
        The cleaned data is written to ``<outfildir>/<YYYYMM>_<city>.csv``.
    """

    # Read zipfile into dataframe (compression is inferred from the extension)
    df = pd.read_csv(url, compression="infer")

    # Delete a junk column that exists in some but not all months
    if 'name_localizedValue0' in df.columns:
        del df['name_localizedValue0']

    # Rename columns; this raises ValueError if the file does not have
    # exactly these 15 columns
    df.columns = ['tripduration', 'starttime', 'stoptime', 'start_station_id',
          'start_station_name', 'start_station_latitude', 'start_station_longitude',
          'end_station_id', 'end_station_name','end_station_latitude', 'end_station_longitude',
          'bike_id', 'user_type', 'user_birth_year', 'user_gender']

    # Drop any rows with zero for coordinates. The explicit copy makes the
    # column assignments below operate on an independent frame instead of a
    # slice of the original (avoids SettingWithCopyWarning).
    coord_cols = ['start_station_latitude', 'start_station_longitude',
                  'end_station_latitude', 'end_station_longitude']
    df = df[(df[coord_cols] != 0.0).all(axis=1)].copy()

    # Keep track of city: NYC or JC (Jersey City)
    df['city'] = city

    # Create a linestring geometry object
    df['linestring'] = "LINESTRING(" + df['start_station_longitude'].astype(str) + " " + \
                    df['start_station_latitude'].astype(str) + ", " + \
                    df['end_station_longitude'].astype(str) + " " + \
                    df['end_station_latitude'].astype(str) + ")"

    # Create a logical filename, i.e. 201701_NYC.csv, 201702_NYC.csv, etc.
    # The month is taken from the file name of the `url` argument itself,
    # not from any notebook-level loop variable.
    yyyymm = os.path.basename(url).replace("JC-", "")[:6]
    outfilename = yyyymm + "_" + city + ".csv"

    # Define output filepath from the directory passed in by the caller
    outfilepath = join(outfildir, outfilename)

    # Save dataframe to csv
    df.to_csv(outfilepath)

In [19]:
# Project root relative to the notebook's working directory
root = ".."

# All cleaned monthly trip CSVs are written here
outfiledir = join(root,"data", "citibike_trips")

# Make data directory. makedirs also creates a missing parent "data" folder,
# and exist_ok=True makes re-running this cell a no-op.
os.makedirs(outfiledir, exist_ok=True)

In [20]:
%%time

print("Downloading NYC Citibike trips")

# Download nyc citibike trips
# Each URL is fetched, cleaned and saved by download_citibike_trips, one file
# at a time; tqdm renders a per-file progress bar.
# NOTE(review): keep the loop variable named `i` unless download_citibike_trips
# is confirmed not to read it as a global.
for i in tqdm(nyc_citibike_urls):
    download_citibike_trips(i, "NYC", outfiledir)

  0%|          | 0/27 [00:00<?, ?it/s]

Downloading NYC Citibike trips


100%|██████████| 27/27 [15:41<00:00, 34.88s/it]

CPU times: user 13min 6s, sys: 1min 3s, total: 14min 9s
Wall time: 15min 41s





In [27]:
%%time

print("Downloading Jersey City Citibike trips")

# Download jersey city citibike trips
# Each URL is fetched, cleaned and saved by download_citibike_trips with the
# "JC" city label; tqdm renders a per-file progress bar.
# NOTE(review): keep the loop variable named `i` unless download_citibike_trips
# is confirmed not to read it as a global.
for i in tqdm(jerseycity_citibike_urls):
    download_citibike_trips(i, "JC",outfiledir)


  0%|          | 0/8 [00:00<?, ?it/s][A

Downloading Jersey City Citibike trips



Exception in thread Thread-6:
Traceback (most recent call last):
  File "/anaconda/envs/py36/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tqdm/_monitor.py", line 63, in run
    for instance in self.tqdm_cls._instances:
  File "/anaconda/envs/py36/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 8/8 [00:54<00:00,  6.86s/it]

CPU times: user 43.7 s, sys: 3.06 s, total: 46.8 s
Wall time: 54.9 s



