## Data Source

[The Citi Bike Data webpage](https://www.citibikenyc.com/system-data)

Jersey City trip history files from 2018 to 2020 were downloaded (30 files).

## Dependencies

In [1]:
import glob
import os

import pandas as pd

# Calculate geo distance between two geographic points with coordinates
from geopy.distance import distance

In [2]:
# Function to get geo distance between start station and end station

def geo_distance(row):
    """Return the distance in miles between a trip's start and end stations.

    Parameters
    ----------
    row : mapping-like (e.g. one DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide the keys 'start station latitude',
        'start station longitude', 'end station latitude' and
        'end station longitude'.

    Returns
    -------
    The ``.miles`` value of geopy's ``distance`` computed between the
    (latitude, longitude) pair of the start station and that of the end
    station.
    """
    # geopy expects each point as (latitude, longitude)
    start_point = row['start station latitude'], row['start station longitude']
    end_point = row['end station latitude'], row['end station longitude']
    trip_span = distance(start_point, end_point)
    return trip_span.miles

## Preparing data for Jersey City (JC)

In [3]:
# Build df_jc: every monthly Jersey City (JC) trip file, each augmented with a
# 'geodistance (miles)' column, joined into one dataframe. Each augmented
# monthly file and the final joined dataframe are also written to csv.

# glob returns matches in arbitrary order; sort them so the row order of the
# joined dataframe is reproducible (and chronological, given the YYYYMM names).
jc_files = sorted(glob.glob('./Resources/JC-20*.csv'))

# Per-file frames are collected here and concatenated ONCE after the loop:
# calling pd.concat on the growing dataframe in every iteration re-copies all
# previously loaded rows each time.
jc_frames = []
joined_rows = 0
joined_columns = set()

# Loop through all JC .csv files
for file_name in jc_files:
    # Read .csv files
    df = pd.read_csv(file_name)
    print(50*'-')
    print(f"File Name: {file_name} \nDF shape: {df.shape}")

    # Calculate geo distance between start and end stations and place the
    # new column at index 1
    df.insert(1, 'geodistance (miles)', df.apply(geo_distance, axis=1))

    # Calculating geo distance is time consuming --> save df to csv file in
    # the sibling "<source folder>-gd" folder.
    # os.path.split uses the platform's path separators, so this works on
    # POSIX as well as on Windows (splitting on a literal backslash raised
    # IndexError wherever glob returns '/'-separated paths).
    source_dir, base_name = os.path.split(file_name)
    gd_dir = f"{source_dir}-gd"
    os.makedirs(gd_dir, exist_ok=True)  # to_csv does not create folders
    df.to_csv(f"{gd_dir}/{base_name}", index=False)

    # Queue the iterated dataframe for the final join and report the shape
    # the joined dataframe has so far (concat keeps the union of columns)
    jc_frames.append(df)
    joined_rows += len(df)
    joined_columns.update(df.columns)
    print(f"Current joined DF shape: {(joined_rows, len(joined_columns))}")

# Join all dataframes in one pass; stay an empty dataframe when no file matched
df_jc = pd.concat(jc_frames, ignore_index=True) if jc_frames else pd.DataFrame()

print(50*'=')
print(f"Final joined DF shape: {df_jc.shape}")
# Save the dataframe to csv file
file_name = "./Data/JC-2018-2020-citibike-tripdata-gd.csv"
os.makedirs(os.path.dirname(file_name), exist_ok=True)
df_jc.to_csv(file_name, index=False)

--------------------------------------------------
File Name: ./Resources\JC-201801-citibike-tripdata.csv 
DF shape: (12677, 15)
Current joined DF shape: (12677, 16)
--------------------------------------------------
File Name: ./Resources\JC-201802-citibike-tripdata.csv 
DF shape: (15104, 15)
Current joined DF shape: (27781, 16)
--------------------------------------------------
File Name: ./Resources\JC-201803-citibike-tripdata.csv 
DF shape: (17109, 15)
Current joined DF shape: (44890, 16)
--------------------------------------------------
File Name: ./Resources\JC-201804-citibike-tripdata.csv 
DF shape: (23634, 15)
Current joined DF shape: (68524, 16)
--------------------------------------------------
File Name: ./Resources\JC-201805-citibike-tripdata.csv 
DF shape: (34456, 15)
Current joined DF shape: (102980, 16)
--------------------------------------------------
File Name: ./Resources\JC-201806-citibike-tripdata.csv 
DF shape: (40937, 15)
Current joined DF shape: (143917, 16)
--

In [4]:
# Preview the first five rows of the joined JC dataframe
df_jc.head(n=5)

Unnamed: 0,tripduration,geodistance (miles),starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,932,0.865295,2018-01-01 02:06:17.5410,2018-01-01 02:21:50.0270,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,-74.032108,31929,Subscriber,1992,1
1,550,0.865295,2018-01-01 12:06:18.0390,2018-01-01 12:15:28.4430,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,-74.032108,31845,Subscriber,1969,2
2,510,0.865295,2018-01-01 12:06:56.9780,2018-01-01 12:15:27.8100,3183,Exchange Place,40.716247,-74.033459,3199,Newport Pkwy,40.728745,-74.032108,31708,Subscriber,1946,1
3,354,0.374889,2018-01-01 14:53:10.1860,2018-01-01 14:59:05.0960,3183,Exchange Place,40.716247,-74.033459,3267,Morris Canal,40.712419,-74.038526,31697,Subscriber,1994,1
4,250,0.211295,2018-01-01 17:34:30.1920,2018-01-01 17:38:40.9840,3183,Exchange Place,40.716247,-74.033459,3639,Harborside,40.719252,-74.034234,31861,Subscriber,1991,1
