# Download Citibike trip data from January 2017 through March 2018

Citibike trips data source: https://www.citibikenyc.com/system-data

In [328]:
import os
from os.path import join
import pandas as pd
from tqdm import tqdm

In [329]:
# All monthly archives are served from the same public S3 bucket.
_TRIPDATA_ROOT = "https://s3.amazonaws.com/tripdata/"

# "YYYYMM" stamps for the covered months: all of 2017 plus January-March 2018.
_STAMPS_2017 = ["2017{:02d}".format(month) for month in range(1, 13)]
_STAMPS_2018 = ["2018{:02d}".format(month) for month in range(1, 4)]

# NYC archives: the file naming scheme differs between the 2017 and 2018 files.
nyc_citibike_urls = [
    _TRIPDATA_ROOT + stamp + "-citibike-tripdata.csv.zip"
    for stamp in _STAMPS_2017
] + [
    _TRIPDATA_ROOT + stamp + "_citibikenyc_tripdata.csv.zip"
    for stamp in _STAMPS_2018
]

# Jersey City archives: the 2017 files carry a "JC-" prefix, and the August 2017
# file has an encoded space ("%20") where the other months have a hyphen.
jerseycity_citibike_urls = [
    _TRIPDATA_ROOT + "JC-" + stamp
    + ("%20" if stamp == "201708" else "-")
    + "citibike-tripdata.csv.zip"
    for stamp in _STAMPS_2017
] + [
    _TRIPDATA_ROOT + stamp + "_citibikejc_tripdata.csv.zip"
    for stamp in _STAMPS_2018
]

In [351]:
def download_citibike_trips(url, city):
    """Download one monthly trip archive, clean it, and save it as a CSV.

    The zipped CSV at ``url`` is read into a dataframe, its columns are
    renamed, rows with a zero start/end coordinate are dropped, a ``city``
    column and a WKT ``linestring`` column are added, and the result is
    written to ``data/citibike_trips/<YYYYMM>_<city>.csv``.

    Parameters
    ----------
    url : str
        Location of the zipped CSV. After an optional ``JC-`` prefix, the
        archive's file name must start with the six-character ``YYYYMM``
        stamp, which is used to name the output file.
    city : str
        Label stored in the ``city`` column and appended to the output
        filename (the callers in this notebook pass ``"NYC"`` or ``"JC"``).

    Returns
    -------
    None
        The cleaned data is written to disk, not returned.
    """
    # Read zipfile into dataframe (compression is inferred from the extension)
    df = pd.read_csv(url, compression="infer")

    # Delete a junk column that exists in some but not all months
    if 'name_localizedValue0' in df.columns:
        del df['name_localizedValue0']

    # Rename columns. The assignment is positional, so the file must contain
    # exactly these fifteen columns, in this order, at this point.
    df.columns = ['tripduration', 'starttime', 'stoptime', 'start_station_id',
          'start_station_name', 'start_station_latitude', 'start_station_longitude',
          'end_station_id', 'end_station_name','end_station_latitude', 'end_station_longitude',
          'bike_id', 'user_type', 'user_birth_year', 'user_gender']

    # Drop any rows with zero for coordinates. Take an explicit copy so the
    # column assignments below write to an independent frame rather than to a
    # slice of the raw one (avoids SettingWithCopyWarning).
    coordinate_columns = ['start_station_latitude', 'start_station_longitude',
                          'end_station_latitude', 'end_station_longitude']
    has_all_coordinates = (df[coordinate_columns] != 0.0).all(axis=1)
    df = df[has_all_coordinates].copy()

    # Keep track of city: NYC or JC (Jersey City)
    df['city'] = city

    # Create a linestring geometry object (WKT order is "longitude latitude")
    df['linestring'] = "LINESTRING(" + df['start_station_longitude'].astype(str) + " " + \
                    df['start_station_latitude'].astype(str) + ", " + \
                    df['end_station_longitude'].astype(str) + " " + \
                    df['end_station_latitude'].astype(str) + ")"

    # Create a logical filename, i.e. 201701_NYC.csv, 201702_NYC.csv, etc.
    # Derive it from the `url` argument (not from a caller's loop variable):
    # keep the last path component, drop the "JC-" prefix, keep the YYYYMM stamp.
    archive_name = url.rsplit("/", 1)[-1]
    outfilename = archive_name.replace("JC-", "")[:6]
    outfilename += "_" + city + ".csv"

    # Define output filepath, creating the directory if it does not exist yet
    output_dir = join("data", "citibike_trips")
    os.makedirs(output_dir, exist_ok=True)
    outfilepath = join(output_dir, outfilename)

    # Save dataframe to csv (the dataframe index is written as the first column)
    df.to_csv(outfilepath)

In [352]:
# Make the output directory, including the parent "data" directory, if needed.
# makedirs with exist_ok=True replaces the separate exists()/mkdir() pairs and
# does not fail when a directory is already there.
output_dir = join("data", "citibike_trips")
os.makedirs(output_dir, exist_ok=True)

print("Downloading NYC Citibike trips")

# Download nyc citibike trips
for i in tqdm(nyc_citibike_urls):
    download_citibike_trips(i, "NYC")

print("Downloading Jersey City Citibike trips")

# Download jersey city citibike trips
for i in tqdm(jerseycity_citibike_urls):
    download_citibike_trips(i, "JC")


  0%|          | 0/15 [00:00<?, ?it/s][A

Downloading NYC Citibike trips



Exception in thread Thread-8:
Traceback (most recent call last):
  File "/anaconda/envs/py36/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/anaconda/envs/py36/lib/python3.6/site-packages/tqdm/_monitor.py", line 63, in run
    for instance in self.tqdm_cls._instances:
  File "/anaconda/envs/py36/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

100%|██████████| 15/15 [08:35<00:00, 34.37s/it]
  0%|          | 0/15 [00:00<?, ?it/s]

Downloading Jersey City Citibike trips


100%|██████████| 15/15 [00:21<00:00,  1.43s/it]
