In [2]:
#This script reads csv files with monthly ride data and outputs the following parameters and label:
#Parameters: time bucket, date, holiday or not, start station latitude and longitude.
#Label: Number of bikes departing from station within that time bucket. 

import pandas as pd
import numpy as np
import datetime
from datetime import date
from dateutil import parser
from pandas.tseries.holiday import USFederalHolidayCalendar
import time

#rescale latitude
def lat(number):
    """Return `number` shifted down by 40.7 and then scaled by 100."""
    shifted = number - 40.7
    return shifted * 100

#rescale longitude
def long(number):
    """Return `number` shifted up by 74 and then scaled by 100."""
    shifted = number + 74
    return shifted * 100

#extract timebin from string and determine if date is workday.
#return 0 if workday
def time_day(string):
    """Split a 'date time' stamp into a time bin and a non-workday flag.

    Reads the module-level ``holidays`` collection, which ``cleancsv``
    assigns before mapping this function over the 'Start_Time' column.

    Parameters
    ----------
    string : str
        Timestamp whose date and time parts are separated by exactly one
        space, e.g. '2/1/2016 00:00:08'. The time part must begin with
        'hour:minute'; anything after the minute (seconds) is ignored and
        may be absent.

    Returns
    -------
    tuple
        ``(timebin, not_workday, date_)`` where ``timebin`` is the index of
        the ``interval``-minute bucket counted from midnight,
        ``not_workday`` is 0 for a workday and 1 for a weekend day or a
        date found in ``holidays``, and ``date_`` is the date part of the
        input, unchanged.

    Raises
    ------
    ValueError
        If the stamp does not split into exactly a date and a time, or the
        time part has no 'hour:minute' prefix with integer fields.
    """
    interval = 60  # bucket width in minutes
    date_, time_ = string.split(' ')

    # Only hour and minute feed the bin, so take the first two fields and
    # accept stamps that carry no seconds component.
    hourstr, minutestr = time_.split(':')[:2]
    timebin = (int(hourstr) * 60 + int(minutestr)) // interval

    # Holiday check against the collection prepared by cleancsv.
    is_holiday = date_ in holidays

    # weekday() numbers Monday as 0, so values above 4 are Saturday/Sunday.
    is_weekend = parser.parse(date_).weekday() > 4

    not_workday = int(is_holiday or is_weekend)

    return timebin, not_workday, date_
    

#This function reads and cleans the data; the caller saves the result.
def cleancsv(filename, inf, sup):
    """Read one trip-data CSV, drop invalid rows and derive time features.

    Columns 0, 1, 4, 5, 6, 9, 10, 13 and 14 of the file are loaded and
    labelled with the module-level ``names`` list; the first row of the
    file is skipped.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    inf, sup : number
        Exclusive lower and upper bounds on 'Trip_Duration'. Rows whose
        duration is not strictly between them are dropped.

    Returns
    -------
    pandas.DataFrame
        The surviving rows. 'Start_Time' is replaced by the time bin
        returned by ``time_day``, and the columns 'Holiday' (non-workday
        flag) and 'Date' (date string) are added. When no row survives the
        filters, an empty frame with those columns is returned.

    Side effects
    ------------
    Rebinds the module-level ``holidays`` (read by ``time_day``) and prints
    the first and last remaining timestamps plus a progress message.
    """
    global holidays
    raw = pd.read_csv(filename,usecols=[0,1,4,5,6,9,10,13,14],skiprows=1,names=names)

    # Apply every row filter in one pass and take an explicit copy, so the
    # column assignments below write to an independent frame instead of a
    # slice of the frame that was read (avoids SettingWithCopyWarning).
    keep = (
        (raw['Trip_Duration'] < sup)           # trips that are too long
        & (raw['Trip_Duration'] > inf)         # trips that are too short
        & (raw['Start_Station_Latitude'] > 1)  # rows without a usable latitude
        & (raw['Gender'] > 0)                  # rows with no recorded gender
    )
    raw = raw[keep].copy()
    # Coordinates are left unscaled (the lat()/long() helpers are not applied).

    if raw.empty:
        # No rows left: there is no date range to look holidays up for and
        # nothing to map, so return the empty frame with the extra columns.
        holidays = pd.DatetimeIndex([])
        raw['Holiday'] = pd.Series(index=raw.index, dtype='int64')
        raw['Date'] = pd.Series(index=raw.index, dtype=object)
        return raw

    # Extract the holidays that fall inside the file's date range.
    # NOTE(review): assumes rows are in chronological order, so the first
    # and last rows bound the range -- confirm for every input file.
    cal = USFederalHolidayCalendar()
    d = raw['Start_Time']
    print(d.iloc[0])
    print(d.iloc[-1])
    start_date = d.iloc[0].split(' ')[0]
    end_date = d.iloc[-1].split(' ')[0]

    holidays = cal.holidays(start=start_date, end=end_date)

    # Apply time_day to each entry in Start_Time and spread the resulting
    # (timebin, not_workday, date) triples over three columns.
    print('starting processing date and time data')
    timebins, flags, dates = zip(*raw['Start_Time'].map(time_day))
    raw['Start_Time'] = timebins
    raw['Holiday'] = flags
    raw['Date'] = dates

    return raw
    
if __name__ == '__main__':
    # Column labels that cleancsv() applies to the columns it loads.
    names = [
        'Trip_Duration',
        'Start_Time',
        'Start_Station_Name',
        'Start_Station_Latitude',
        'Start_Station_Longitude',
        'End_Station_Latitude',
        'End_Station_Longitude',
        'Birth_Year',
        'Gender',
    ]

    # Months to process, used as the input file name prefix.
    # Alternative selections kept for reference:
    #months =['201701','201702','201703','201704','201705','201706','201707','201708','201709']
    #months = ['201606']
    #months=['201607','201608']
    months = ['201602']

    for month in months:
        filename = month + '-citibike-tripdata.csv'
        cleaned_filename = 'cleaned' + month + '.csv'

        print('cleaning ', filename)
        start = time.time()

        # Keep only trips whose duration is strictly between 60 and 3600.
        cleaned = cleancsv(filename, 60, 3600)
        cleaned.to_csv(cleaned_filename)

        print(filename, ' finished cleaning')
        end = time.time()
        print('It took ', end - start, ' seconds.')

            


cleaning  201602-citibike-tripdata.csv
2/1/2016 00:00:08
2/29/2016 23:59:55
starting processing date and time data
201602-citibike-tripdata.csv  finished cleaning
It took  397.6231162548065  seconds.
