## Data Source

## Dependencies

In [36]:
import glob
import os

import pandas as pd

# Calculate geo distance between two geographic points with coordinates
from geopy.distance import distance

In [37]:
# Function to get geo distance between start station and end station

def geo_distance(row):
    """Return the distance in miles between a trip's start and end stations.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row)
        Must provide the keys 'start station latitude',
        'start station longitude', 'end station latitude' and
        'end station longitude'.

    Returns
    -------
    The ``.miles`` value of geopy's ``distance`` between the two
    (latitude, longitude) points.
    """
    origin = row['start station latitude'], row['start station longitude']
    destination = row['end station latitude'], row['end station longitude']
    trip_span = distance(origin, destination)
    return trip_span.miles

## Preparing data for New York City (NYC)

In [38]:
# Collect every processed monthly frame and concatenate once after the loop;
# concatenating inside the loop re-copies the growing result on each pass.
nyc_frames = []
nyc_rows = 0

# Loop through all NYC .csv files. glob returns matches in arbitrary order,
# so sort them to make the row order of the combined frame reproducible.
for file_name in sorted(glob.glob('./Resources/20*.csv')):
    # Read .csv files
    df = pd.read_csv(file_name)
    print(50*'-')
    print(f"File Name: {file_name} \nDF shape: {df.shape}")

    # Insert column "serving area" to index 0
    df.insert(0, 'serving area', 'NYC')

    # Calculate geo distance between start and end stations and place the
    # new column at index 2
    df.insert(2, 'geodistance (miles)', df.apply(geo_distance, axis=1))

    # Calculating geo distance is time consuming --> save df to csv file in
    # "<source dir>-gd/" under the same file name. os.path handles both "/"
    # and "\\" separators, so this works on any platform.
    source_dir, base_name = os.path.split(file_name)
    gd_dir = f"{source_dir}-gd"
    os.makedirs(gd_dir, exist_ok=True)
    df.to_csv(os.path.join(gd_dir, base_name), index=False)

    # Queue the frame for the final join and report the running size
    # (column count assumes every monthly file shares the same columns)
    nyc_frames.append(df)
    nyc_rows += len(df)
    print(f"Current joined DF shape: {(nyc_rows, df.shape[1])}")

# Join all monthly dataframes; stay an empty frame if no file matched
df_nyc = pd.concat(nyc_frames, ignore_index=True) if nyc_frames else pd.DataFrame()

print(50*'=')
print(f"Final joined DF shape: {df_nyc.shape}")
# Save the dataframe to csv file
file_name = "./Data/2019-citibike-tripdata-gd.csv"
os.makedirs(os.path.dirname(file_name), exist_ok=True)
df_nyc.to_csv(file_name, index=False)

--------------------------------------------------
File Name: ./Resources\201901-citibike-tripdata.csv 
DF shape: (967287, 15)
Current joined DF shape: (967287, 17)
--------------------------------------------------
File Name: ./Resources\201902-citibike-tripdata.csv 
DF shape: (943744, 15)
Current joined DF shape: (1911031, 17)
--------------------------------------------------
File Name: ./Resources\201903-citibike-tripdata.csv 
DF shape: (1327960, 15)
Current joined DF shape: (3238991, 17)
--------------------------------------------------
File Name: ./Resources\201904-citibike-tripdata.csv 
DF shape: (1766094, 15)
Current joined DF shape: (5005085, 17)
--------------------------------------------------
File Name: ./Resources\201905-citibike-tripdata.csv 
DF shape: (1924563, 15)
Current joined DF shape: (6929648, 17)
--------------------------------------------------
File Name: ./Resources\201906-citibike-tripdata.csv 
DF shape: (2125370, 15)
Current joined DF shape: (9055018, 17)
-

In [40]:
# Preview the first five rows of the combined NYC dataframe
df_nyc.head(n=5)

Unnamed: 0,serving area,tripduration,geodistance (miles),starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,NYC,320,0.661999,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1
1,NYC,316,0.359046,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1
2,NYC,591,1.26497,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1
3,NYC,2719,0.873519,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.99643,21579,Subscriber,1990,1
4,NYC,303,0.817204,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.99379,503.0,E 20 St & Park Ave,40.738274,-73.98752,35379,Subscriber,1979,1


## Preparing data for Jersey City (JC)

In [41]:
# Collect every processed monthly frame and concatenate once after the loop;
# concatenating inside the loop re-copies the growing result on each pass.
jc_frames = []
jc_rows = 0

# Loop through all JC .csv files. glob returns matches in arbitrary order,
# so sort them to make the row order of the combined frame reproducible.
for file_name in sorted(glob.glob('./Resources/JC-20*.csv')):
    # Read .csv files
    df = pd.read_csv(file_name)
    print(50*'-')
    print(f"File Name: {file_name} \nDF shape: {df.shape}")

    # Insert column "serving area" to index 0
    df.insert(0, 'serving area', 'JC')

    # Calculate geo distance between start and end stations and place the
    # new column at index 2
    df.insert(2, 'geodistance (miles)', df.apply(geo_distance, axis=1))

    # Calculating geo distance is time consuming --> save df to csv file in
    # "<source dir>-gd/" under the same file name. os.path handles both "/"
    # and "\\" separators, so this works on any platform.
    source_dir, base_name = os.path.split(file_name)
    gd_dir = f"{source_dir}-gd"
    os.makedirs(gd_dir, exist_ok=True)
    df.to_csv(os.path.join(gd_dir, base_name), index=False)

    # Queue the frame for the final join and report the running size
    # (column count assumes every monthly file shares the same columns)
    jc_frames.append(df)
    jc_rows += len(df)
    print(f"Current joined DF shape: {(jc_rows, df.shape[1])}")

# Join all monthly dataframes; stay an empty frame if no file matched
df_jc = pd.concat(jc_frames, ignore_index=True) if jc_frames else pd.DataFrame()

print(50*'=')
print(f"Final joined DF shape: {df_jc.shape}")
# Save the dataframe to csv file
file_name = "./Data/JC-2019-citibike-tripdata-gd.csv"
os.makedirs(os.path.dirname(file_name), exist_ok=True)
df_jc.to_csv(file_name, index=False)

--------------------------------------------------
File Name: ./Resources\JC-201901-citibike-tripdata.csv 
DF shape: (19676, 15)
Current joined DF shape: (19676, 17)
--------------------------------------------------
File Name: ./Resources\JC-201902-citibike-tripdata.csv 
DF shape: (18565, 15)
Current joined DF shape: (38241, 17)
--------------------------------------------------
File Name: ./Resources\JC-201903-citibike-tripdata.csv 
DF shape: (23606, 15)
Current joined DF shape: (61847, 17)
--------------------------------------------------
File Name: ./Resources\JC-201904-citibike-tripdata.csv 
DF shape: (33056, 15)
Current joined DF shape: (94903, 17)
--------------------------------------------------
File Name: ./Resources\JC-201905-citibike-tripdata.csv 
DF shape: (36135, 15)
Current joined DF shape: (131038, 17)
--------------------------------------------------
File Name: ./Resources\JC-201906-citibike-tripdata.csv 
DF shape: (39430, 15)
Current joined DF shape: (170468, 17)
--

In [42]:
# Preview the first five rows of the combined JC dataframe
df_jc.head(n=5)

Unnamed: 0,serving area,tripduration,geodistance (miles),starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,JC,201,0.287536,2019-01-01 03:09:09.7110,2019-01-01 03:12:30.8790,3183,Exchange Place,40.716247,-74.033459,3214,Essex Light Rail,40.712774,-74.036486,29612,Subscriber,1993,1
1,JC,505,0.565349,2019-01-01 05:18:00.1060,2019-01-01 05:26:25.9050,3183,Exchange Place,40.716247,-74.033459,3638,Washington St,40.724294,-74.035483,29213,Subscriber,1972,2
2,JC,756,1.217823,2019-01-01 10:36:33.3400,2019-01-01 10:49:10.2600,3183,Exchange Place,40.716247,-74.033459,3192,Liberty Light Rail,40.711242,-74.055701,26164,Subscriber,1985,1
3,JC,1575,0.565349,2019-01-01 12:43:38.6430,2019-01-01 13:09:54.5280,3183,Exchange Place,40.716247,-74.033459,3638,Washington St,40.724294,-74.035483,29672,Customer,1969,0
4,JC,1566,0.565349,2019-01-01 12:43:39.6010,2019-01-01 13:09:46.5100,3183,Exchange Place,40.716247,-74.033459,3638,Washington St,40.724294,-74.035483,29522,Customer,1969,0
