In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import glob

In [3]:
# Collect the paths of every csv file in the unzipped data directory
# (nothing is extracted here; this only lists files that are already on disk).
# glob makes no promise about the order of its results, so sort them: later cells
# pick files by position, and names that start with YYYYMM then run chronologically.
unzipped = sorted(glob.glob('NYCBike_unzipped/*.csv'))

In [4]:
# The csv files do not all use the same column names. This maps the lower-case
# spellings onto the title-case ones so that every file ends up with one schema.
rename_dict = {
    # trip
    'tripduration': 'Trip Duration',
    'starttime': 'Start Time',
    'stoptime': 'Stop Time',
    # start station
    'start station id': 'Start Station ID',
    'start station name': 'Start Station Name',
    'start station latitude': 'Start Station Latitude',
    'start station longitude': 'Start Station Longitude',
    # end station
    'end station id': 'End Station ID',
    'end station name': 'End Station Name',
    'end station latitude': 'End Station Latitude',
    'end station longitude': 'End Station Longitude',
    # bike and rider
    'bikeid': 'Bike ID',
    'usertype': 'User Type',
    'birth year': 'Birth Year',
    'gender': 'Gender',
}

In [5]:
def combine_df(files, smpl_frac = 0.25, random_state = None, columns_map = None):
    '''
    Sample part of each csv file and concatenate the samples into one master dataframe.

    Parameters:
    files (iterable of strings): csv file names to be sampled and concatenated
    smpl_frac (float): a number between 0 and 1, the fraction of rows to keep from each file
        (sampled without replacement)
    random_state (int or numpy.random.RandomState, optional): seed for reproducible sampling.
        One generator is shared across all files, so files of equal length do not get the
        same row positions. The default None keeps the unseeded behaviour.
    columns_map (dict, optional): old-name -> new-name mapping applied to each file's columns.
        Defaults to the notebook-level rename_dict.

    Returns:
    df (pandas df): a dataframe combining all the sampled data from each file. Rows keep the
        labels they had in their own file, so the index is not unique across files.
        An empty dataframe is returned when files is empty.
    '''
    if columns_map is None:
        columns_map = rename_dict

    # Turn a plain seed into one generator up front; passing the same int to every
    # .sample() call would restart the stream for each file.
    rng = random_state
    if random_state is not None and not isinstance(random_state, np.random.RandomState):
        rng = np.random.RandomState(random_state)

    samples = []
    for file in files:
        sample = (pd.read_csv(file)
                    .rename(columns = columns_map)
                    .sample(frac = smpl_frac, replace = False, random_state = rng))
        print(file, sample.shape)
        samples.append(sample)

    # pd.concat raises on an empty list, so handle "no files" explicitly.
    if not samples:
        return pd.DataFrame()

    # Concatenate once: concatenating inside the loop re-copies everything
    # accumulated so far on every iteration.
    return pd.concat(samples, axis = 0)

In [6]:
# The csv files from 01/2017 to 12/2018
# NOTE(review): the 43 / -4 offsets are positional, so this selection only holds while the
# directory contents and the ordering of `unzipped` stay the same -- check it against the
# listing printed below before relying on it.
unzipped[43:-4]

['NYCBike_unzipped\\201701-citibike-tripdata.csv',
 'NYCBike_unzipped\\201702-citibike-tripdata.csv',
 'NYCBike_unzipped\\201703-citibike-tripdata.csv',
 'NYCBike_unzipped\\201704-citibike-tripdata.csv',
 'NYCBike_unzipped\\201705-citibike-tripdata.csv',
 'NYCBike_unzipped\\201706-citibike-tripdata.csv',
 'NYCBike_unzipped\\201707-citibike-tripdata.csv',
 'NYCBike_unzipped\\201708-citibike-tripdata.csv',
 'NYCBike_unzipped\\201709-citibike-tripdata.csv',
 'NYCBike_unzipped\\201710-citibike-tripdata.csv',
 'NYCBike_unzipped\\201711-citibike-tripdata.csv',
 'NYCBike_unzipped\\201712-citibike-tripdata.csv',
 'NYCBike_unzipped\\201801-citibike-tripdata.csv',
 'NYCBike_unzipped\\201802-citibike-tripdata.csv',
 'NYCBike_unzipped\\201803-citibike-tripdata.csv',
 'NYCBike_unzipped\\201804-citibike-tripdata.csv',
 'NYCBike_unzipped\\201805-citibike-tripdata.csv',
 'NYCBike_unzipped\\201806-citibike-tripdata.csv',
 'NYCBike_unzipped\\201807-citibike-tripdata.csv',
 'NYCBike_unzipped\\201808-citi

In [8]:
# Build the working dataset: a 10% sample of each file in the same [43:-4] slice previewed above.
# No seed is passed, so the exact rows drawn differ from run to run.
combined_df = combine_df(unzipped[43:-4], smpl_frac = 0.1)

NYCBike_unzipped\201701-citibike-tripdata.csv (72668, 15)
NYCBike_unzipped\201702-citibike-tripdata.csv (79165, 15)
NYCBike_unzipped\201703-citibike-tripdata.csv (72766, 15)
NYCBike_unzipped\201704-citibike-tripdata.csv (131540, 15)
NYCBike_unzipped\201705-citibike-tripdata.csv (152327, 15)
NYCBike_unzipped\201706-citibike-tripdata.csv (173159, 15)
NYCBike_unzipped\201707-citibike-tripdata.csv (173560, 15)
NYCBike_unzipped\201708-citibike-tripdata.csv (181650, 15)
NYCBike_unzipped\201709-citibike-tripdata.csv (187810, 15)
NYCBike_unzipped\201710-citibike-tripdata.csv (189759, 15)
NYCBike_unzipped\201711-citibike-tripdata.csv (133065, 15)
NYCBike_unzipped\201712-citibike-tripdata.csv (88997, 15)
NYCBike_unzipped\201801-citibike-tripdata.csv (71899, 15)
NYCBike_unzipped\201802-citibike-tripdata.csv (84311, 15)
NYCBike_unzipped\201803-citibike-tripdata.csv (97667, 15)
NYCBike_unzipped\201804-citibike-tripdata.csv (130754, 15)
NYCBike_unzipped\201805-citibike-tripdata.csv (182471, 15)
NYCB

In [9]:
# Sanity check: (rows, columns) of the combined sample.
combined_df.shape

(3391298, 15)

In [10]:
# Preview the first five rows, transposed so that each column reads as a row.
combined_df.head().T

Unnamed: 0,62650,476003,525273,212919,704070
Trip Duration,605,936,1324,1133,1966
Start Time,2017-01-04 17:29:06,2017-01-21 23:35:23,2017-01-24 22:22:37,2017-01-12 08:15:57,2017-01-31 07:34:53
Stop Time,2017-01-04 17:39:11,2017-01-21 23:50:59,2017-01-24 22:44:42,2017-01-12 08:34:50,2017-01-31 08:07:40
Start Station ID,341,466,268,120,344
Start Station Name,Stanton St & Mangin St,W 25 St & 6 Ave,Howard St & Centre St,Lexington Ave & Classon Ave,Monroe St & Bedford Ave
Start Station Latitude,40.7178,40.744,40.7191,40.6868,40.6851
Start Station Longitude,-73.9763,-73.9914,-73.9997,-73.9593,-73.9538
End Station ID,3263,3142,409,3364,312
End Station Name,Cooper Square & E 7 St,1 Ave & E 62 St,DeKalb Ave & Skillman St,Carroll St & 5 Ave,Allen St & Stanton St
End Station Latitude,40.7292,40.7612,40.6906,40.6752,40.7221


In [11]:
# Write the combined sample to disk. index = False leaves out the row labels, which are
# carried over from the individual files and therefore repeat across files.
combined_df.to_csv('bikes_10.csv', index = False)

In [12]:
# Column dtypes and memory footprint. In the output below, Start Time / Stop Time are
# object (not datetime) and the station IDs and Birth Year are float64.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3391298 entries, 62650 to 32651
Data columns (total 15 columns):
Trip Duration              int64
Start Time                 object
Stop Time                  object
Start Station ID           float64
Start Station Name         object
Start Station Latitude     float64
Start Station Longitude    float64
End Station ID             float64
End Station Name           object
End Station Latitude       float64
End Station Longitude      float64
Bike ID                    int64
User Type                  object
Birth Year                 float64
Gender                     int64
dtypes: float64(7), int64(3), object(5)
memory usage: 414.0+ MB


In [13]:
# Summary statistics for the numeric columns. The output below shows values worth a closer
# look before analysis: latitude min and longitude max of 0.0, a Birth Year min of 1885,
# and a Trip Duration max of 19510050.
combined_df.describe()

Unnamed: 0,Trip Duration,Start Station ID,Start Station Latitude,Start Station Longitude,End Station ID,End Station Latitude,End Station Longitude,Bike ID,Birth Year,Gender
count,3391298.0,3391050.0,3391298.0,3391298.0,3391050.0,3391298.0,3391298.0,3391298.0,3228245.0,3391298.0
mean,1005.06,1477.496,40.73706,-73.98357,1469.052,40.73671,-73.98372,25210.54,1978.955,1.136305
std,20495.1,1400.395,0.04407566,0.05968771,1399.095,0.05394568,0.08241892,5982.479,11.86829,0.5527479
min,61.0,72.0,0.0,-74.03423,72.0,0.0,-74.0557,14529.0,1885.0,0.0
25%,365.0,369.0,40.71775,-73.99596,368.0,40.71757,-73.99601,19364.0,1969.0,1.0
50%,613.0,497.0,40.73818,-73.98672,496.0,40.73726,-73.98692,26639.0,1982.0,1.0
75%,1068.0,3167.0,40.75669,-73.97412,3167.0,40.75641,-73.97493,30244.0,1988.0,1.0
max,19510050.0,3721.0,45.50636,0.0,3721.0,45.50636,0.0,35806.0,2002.0,2.0
