In [2]:
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver
from io import BytesIO
from zipfile import ZipFile
import urllib.request
import os

In [3]:
# Launch Chrome through chromedriver. The driver location can be overridden
# with the CHROMEDRIVER_PATH environment variable; when it is not set, the
# original hard-coded path is used so existing setups keep working.
browser = webdriver.Chrome(os.environ.get('CHROMEDRIVER_PATH', '/usr/local/bin/chromedriver'))

In [4]:
# Open the trip-data bucket index page in the browser session.
TRIPDATA_INDEX_URL = 'https://s3.amazonaws.com/tripdata/index.html'
browser.get(TRIPDATA_INDEX_URL)

In [5]:
# get all links: collect the href of every anchor on the page.
# Anchors without an href are skipped: get_attribute returns None for them,
# and a None entry would break the `'JC' in link` substring filters applied
# to this list afterwards.
# find_elements('tag name', ...) is the generic locator call; the
# find_elements_by_tag_name shortcut no longer exists in recent Selenium.
anchors = browser.find_elements('tag name', 'a')
all_links = [href for href in (anchor.get_attribute('href') for anchor in anchors) if href]

In [8]:
# Partition the scraped links by whether the URL contains 'JC' and by the
# year string it contains.
linksJC_17 = [url for url in all_links if 'JC' in url and '2017' in url]
linksJC_18 = [url for url in all_links if 'JC' in url and '2018' in url]
links_17 = [url for url in all_links if 'JC' not in url and '2017' in url]
links_18 = [url for url in all_links if 'JC' not in url and '2018' in url]

In [10]:
# function for downloading, unzipping and loading the data behind a link into a DataFrame
def create_df(link):
    """Download a zipped CSV and return it as a DataFrame with normalized column names.

    The archive is read fully into memory and parsed from there, so nothing is
    written to the working directory and no file handle is left open if the
    download or the parsing fails.

    Parameters
    ----------
    link : str
        URL of a zip archive containing a single CSV file.

    Returns
    -------
    pandas.DataFrame
        The CSV content with the 15 normalized column names. Files with 16
        columns are treated as having an extra 'Localized Value' column after
        'Bike ID'; that column is dropped.

    Raises
    ------
    ValueError
        If the CSV has neither 15 nor 16 columns.
    """
    base_columns = [
        'Trip Duration (sec)', 'Start Time', 'Stop Time', 'Start Station ID',
        'Start Station Name', 'Start Station Latitude',
        'Start Station Longitude', 'End Station ID', 'End Station Name',
        'End Station Latitude', 'End Station Longitude', 'Bike ID', 'User Type',
        'Birth_Year', 'Gender',
    ]
    # 16-column layout: same names with 'Localized Value' inserted right after 'Bike ID'.
    extended_columns = base_columns[:12] + ['Localized Value'] + base_columns[12:]

    # The context manager closes the connection even when read() raises.
    with urllib.request.urlopen(link) as response:
        payload = response.read()

    # Compression cannot be inferred from an in-memory buffer, so it is explicit.
    dataframe = pd.read_csv(BytesIO(payload), compression='zip')

    column_count = len(dataframe.columns)
    if column_count == len(base_columns):
        dataframe.columns = base_columns
    elif column_count == len(extended_columns):
        dataframe.columns = extended_columns
        dataframe = dataframe.drop(columns='Localized Value')
    else:
        raise ValueError(
            'Unexpected number of columns ({}) in {}'.format(column_count, link)
        )

    # Progress trace: one line per file that was loaded.
    print(link)

    return dataframe

In [11]:
# function for cleaning and preparing df
def prepare_df(df, reference_year=2018, max_age=90):
    """Clean a raw trip frame and add an 'Age' column.

    Steps: drop every row containing a missing value, cast 'Birth_Year' to
    int, parse 'Start Time' and 'Stop Time' as datetimes, compute
    ``Age = reference_year - Birth_Year`` and keep only rows with
    ``Age < max_age``. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least 'Birth_Year', 'Start Time' and 'Stop Time' columns.
    reference_year : int, default 2018
        Year the age is computed against.
    max_age : int, default 90
        Exclusive upper bound on 'Age'; rows at or above it are removed.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The index is reset after dropping missing values
        but not after the age filter, so it may contain gaps.
    """
    # drop n/a
    cleaned = df.dropna(how='any').reset_index(drop=True)

    # change data types
    cleaned = cleaned.assign(**{
        'Birth_Year': cleaned['Birth_Year'].astype(int),
        'Start Time': pd.to_datetime(cleaned['Start Time']),
        'Stop Time': pd.to_datetime(cleaned['Stop Time']),
    })

    # add Age column
    cleaned['Age'] = reference_year - cleaned['Birth_Year']

    # keep only riders younger than max_age
    return cleaned[cleaned['Age'] < max_age]

In [12]:
# Start with one empty frame per year.
ny17, ny18 = pd.DataFrame(), pd.DataFrame()

In [13]:
# Download every 2017 file and combine the frames with a single concat.
# DataFrame.append is gone from pandas 2.0 and re-copied the accumulated
# frame on each iteration; building from a list also means re-running this
# cell no longer appends the same files a second time.
frames_17 = [create_df(link) for link in links_17]
ny17 = pd.concat(frames_17, ignore_index=True, sort=False) if frames_17 else pd.DataFrame()

https://s3.amazonaws.com/tripdata/201701-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201702-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201703-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201704-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201705-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201706-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201707-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201708-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201709-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201710-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201711-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201712-citibike-tripdata.csv.zip


In [14]:
# Download every 2018 file and combine the frames with a single concat.
# DataFrame.append is gone from pandas 2.0 and re-copied the accumulated
# frame on each iteration; building from a list also means re-running this
# cell no longer appends the same files a second time.
frames_18 = [create_df(link) for link in links_18]
ny18 = pd.concat(frames_18, ignore_index=True, sort=False) if frames_18 else pd.DataFrame()

https://s3.amazonaws.com/tripdata/201801-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201802-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201803-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201804-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201805-citibike-tripdata.csv.zip
https://s3.amazonaws.com/tripdata/201806-citibike-tripdata.csv.zip


In [15]:
# Run the cleaning step on both yearly frames.
ny17, ny18 = (prepare_df(frame) for frame in (ny17, ny18))

In [16]:
# Preview the first rows of the prepared 2017 frame.
ny17.head()

Unnamed: 0,Trip Duration (sec),Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth_Year,Gender,Age
0,680,2017-01-01 00:00:21,2017-01-01 00:11:41,3226,W 82 St & Central Park West,40.78275,-73.97137,3165,Central Park West & W 72 St,40.775794,-73.976206,25542,Subscriber,1965,2,53
1,1282,2017-01-01 00:00:45,2017-01-01 00:22:08,3263,Cooper Square & E 7 St,40.729236,-73.990868,498,Broadway & W 32 St,40.748549,-73.988084,21136,Subscriber,1987,2,31
2,666,2017-01-01 00:01:51,2017-01-01 00:12:57,3163,Central Park West & W 68 St,40.773407,-73.977825,3163,Central Park West & W 68 St,40.773407,-73.977825,16050,Subscriber,2000,1,18
3,559,2017-01-01 00:05:00,2017-01-01 00:14:20,499,Broadway & W 60 St,40.769155,-73.981918,479,9 Ave & W 45 St,40.760193,-73.991255,27294,Subscriber,1973,1,45
4,826,2017-01-01 00:05:37,2017-01-01 00:19:24,362,Broadway & W 37 St,40.751726,-73.987535,445,E 10 St & Avenue A,40.727408,-73.98142,23288,Subscriber,1977,2,41


In [17]:
# Preview the first rows of the prepared 2018 frame.
ny18.head()

Unnamed: 0,Trip Duration (sec),Start Time,Stop Time,Start Station ID,Start Station Name,Start Station Latitude,Start Station Longitude,End Station ID,End Station Name,End Station Latitude,End Station Longitude,Bike ID,User Type,Birth_Year,Gender,Age
0,196,2018-01-01 00:01:51,2018-01-01 00:05:07,315,South St & Gouverneur Ln,40.703554,-74.006702,259,South St & Whitehall St,40.701221,-74.012342,18534,Subscriber,1997,1,21
1,207,2018-01-01 00:02:44,2018-01-01 00:06:11,3224,W 13 St & Hudson St,40.739974,-74.005139,470,W 20 St & 8 Ave,40.743453,-74.00004,19651,Subscriber,1978,1,40
2,613,2018-01-01 00:03:15,2018-01-01 00:13:28,386,Centre St & Worth St,40.714948,-74.002345,2008,Little West St & 1 Pl,40.705693,-74.016777,21678,Subscriber,1982,1,36
3,375,2018-01-01 00:06:44,2018-01-01 00:12:59,466,W 25 St & 6 Ave,40.743954,-73.991449,325,E 19 St & 3 Ave,40.736245,-73.984738,29822,Subscriber,1982,1,36
4,402,2018-01-01 00:06:57,2018-01-01 00:13:40,438,St Marks Pl & 1 Ave,40.727791,-73.985649,380,W 4 St & 7 Ave S,40.734011,-74.002939,30722,Subscriber,1989,1,29


In [18]:
# Check the column dtypes of the prepared 2017 frame.
ny17.dtypes

Trip Duration (sec)                 int64
Start Time                 datetime64[ns]
Stop Time                  datetime64[ns]
Start Station ID                    int64
Start Station Name                 object
Start Station Latitude            float64
Start Station Longitude           float64
End Station ID                      int64
End Station Name                   object
End Station Latitude              float64
End Station Longitude             float64
Bike ID                             int64
User Type                          object
Birth_Year                          int64
Gender                              int64
Age                                 int64
dtype: object

In [20]:
# Check the column dtypes of the prepared 2018 frame.
ny18.dtypes

Trip Duration (sec)                 int64
Start Time                 datetime64[ns]
Stop Time                  datetime64[ns]
Start Station ID                    int64
Start Station Name                 object
Start Station Latitude            float64
Start Station Longitude           float64
End Station ID                      int64
End Station Name                   object
End Station Latitude              float64
End Station Longitude             float64
Bike ID                             int64
User Type                          object
Birth_Year                          int64
Gender                              int64
Age                                 int64
dtype: object

In [21]:
# Stack the 2017 and 2018 frames into one frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append is gone from pandas 2.0.
nydf = pd.concat([ny17, ny18], ignore_index=True, sort=False)

In [22]:
# Number of non-null values in each column of the combined frame.
nydf.count()

Trip Duration (sec)        21902634
Start Time                 21902634
Stop Time                  21902634
Start Station ID           21902634
Start Station Name         21902634
Start Station Latitude     21902634
Start Station Longitude    21902634
End Station ID             21902634
End Station Name           21902634
End Station Latitude       21902634
End Station Longitude      21902634
Bike ID                    21902634
User Type                  21902634
Birth_Year                 21902634
Gender                     21902634
Age                        21902634
dtype: int64

In [23]:
# Write the combined frame to disk without the index column.
# to_csv does not create missing folders, so make sure 'data/' exists first.
os.makedirs('data', exist_ok=True)
nydf.to_csv('data/nybike_from_url.csv', encoding='utf-8', index=False)