In [1]:
#This script merges the cleaned Citi Bike trip data from each month, counts the number
#of trips departing each station in each time bucket of each day, and fills in the
#(station, day, time bucket) combinations that had no observed trips with a count of 0.

import pandas as pd
import numpy as np

def read_cleaned(filename):
    """Load the columns of interest from one cleaned monthly trip CSV.

    Only the columns at positions 2-7, 10 and 11 of the file are read; their
    names come from the file's header row. No merging or time bucketing is
    done here -- the frame is returned exactly as read.

    Args:
        filename: path of the cleaned CSV file to read.

    Returns:
        A DataFrame holding the eight selected columns.
    """
    return pd.read_csv(filename, usecols=[2, 3, 4, 5, 6, 7, 10, 11])


def df_crossjoin(df1, df2):
    """Return the Cartesian product of two dataframes.

    Every row of df1 is paired with every row of df2 by merging both frames
    on a temporary constant key column, which is removed from the result.
    Adapted from
    https://mkonrad.net/2016/04/16/cross-join--cartesian-product-between-pandas-dataframes.html

    The key is added to copies of the inputs (DataFrame.assign) instead of
    to the inputs themselves, so the caller's frames are never modified and
    the same frame can safely be passed as both arguments.

    Args:
        df1: left dataframe.
        df2: right dataframe.

    Returns:
        A new DataFrame with len(df1) * len(df2) rows holding the columns of
        df1 followed by the columns of df2. Column names present in both
        inputs get pandas' default '_x' / '_y' merge suffixes.
    """
    left = df1.assign(_tmpkey=1)
    right = df2.assign(_tmpkey=1)
    return pd.merge(left, right, on='_tmpkey').drop('_tmpkey', axis=1)

def merge_month(to_add):
    """Count one month's trips per time bucket, start station and date.

    Trips are grouped by every column except the end-station coordinates and
    counted. Combinations of time bucket, start station and date that occur
    in the month's data but have no trips together are added with a count of
    0, so the result covers the full cross product of the three.

    Args:
        to_add: DataFrame of one month's trips containing the columns
            'Start_Time', 'Start_Station_Name', 'Start_Station_Latitude',
            'Start_Station_Longitude', 'End_Station_Latitude',
            'End_Station_Longitude', 'Holiday' and 'Date'.

    Returns:
        A DataFrame with one row per (Start_Time, start station, Holiday,
        Date) combination and a 'Count' column with the number of trips.
    """
    merge_index = ['Start_Time','Start_Station_Name','Start_Station_Latitude','Start_Station_Longitude','Holiday','Date']

    #count the observed trips in each time bucket on each day.
    #Series.reset_index(name=...) names the count column directly; the former
    #as_index=False / rename(columns={0:'Count'}) chain only produced a
    #'Count' column on pandas versions where size() ignored as_index=False.
    trips = to_add.drop(columns=['End_Station_Latitude', 'End_Station_Longitude'])
    observed = trips.groupby(trips.columns.tolist()).size().reset_index(name='Count')

    #distinct stations, dates and time buckets seen in this month.
    stations = to_add[['Start_Station_Name','Start_Station_Latitude','Start_Station_Longitude']].drop_duplicates()
    dates = to_add[['Holiday','Date']].drop_duplicates()
    time_buckets = pd.DataFrame(to_add['Start_Time'].drop_duplicates())

    #cross product of time buckets, stations and dates, each with 0 trips.
    zero_grid = df_crossjoin(df_crossjoin(time_buckets, stations), dates)
    zero_grid['Count'] = 0

    #stack the zero rows on the observed counts; summing per combination
    #keeps the observed count where there is one and 0 everywhere else.
    combined = pd.concat([zero_grid, observed])
    return combined.groupby(merge_index).sum().reset_index()

    
if __name__ == '__main__':
    #presumably the names of the eight columns selected by read_cleaned;
    #nothing in this script reads it -- TODO confirm before removing.
    name = ['Start_Time','Start_Station_Name','Start_Station_Latitude','Start_Station_Longitude','End_Station_Latitude','End_Station_Longitude','Holiday','Date']

    #column layout of the merged output file.
    database_name = ['Start_Time','Start_Station_Name','Start_Station_Latitude','Start_Station_Longitude','Holiday','Date','Count']

    #months (YYYYMM) whose cleaned files make up the training data.
    training_date = ['201706','201707','201708']

    #merge the rides of each month separately, then stack the months once at
    #the end rather than re-copying the growing database on every iteration.
    monthly_frames = []
    for month in training_date:
        filename = 'cleaned'+month+'.csv'
        print('merging ',filename)
        monthly_frames.append(merge_month(read_cleaned(filename)))

    training_database = pd.concat(monthly_frames, ignore_index=True)
    #pin the output to the declared column layout.
    training_database = training_database.reindex(columns=database_name)

    training_database.to_csv('demand_merged_data.csv',index=False)
    print('training data merge complete')
    
    
    
   
    


merging  cleaned201706.csv
merging  cleaned201707.csv
merging  cleaned201708.csv
training data merge complete
