# Preprocessed Citibike Data: Feature Engineering

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from pandas.api.types import CategoricalDtype

## Import and load data

Import preprocessed data:

In [2]:
cb_raw = pd.read_csv('../../data/02_processed/citibike_final.csv', parse_dates=['starttime', 'stoptime'])

Make a copy: 

In [3]:
cb = cb_raw.copy()

## Inspect data frame

Get dimensions of data frame and datatypes of each feature:

In [4]:
cb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4015035 entries, 0 to 4015034
Data columns (total 15 columns):
bikeid                   int64
birthyear                int64
endstationid             int64
endstationlatitude       float64
endstationlongitude      float64
endstationname           object
gender                   int64
startstationid           int64
startstationlatitude     float64
startstationlongitude    float64
startstationname         object
starttime                datetime64[ns]
stoptime                 datetime64[ns]
tripduration             int64
usertype                 object
dtypes: datetime64[ns](2), float64(4), int64(6), object(3)
memory usage: 459.5+ MB


Randomly sample five observations:

In [5]:
cb.sample(n=5)

Unnamed: 0,bikeid,birthyear,endstationid,endstationlatitude,endstationlongitude,endstationname,gender,startstationid,startstationlatitude,startstationlongitude,startstationname,starttime,stoptime,tripduration,usertype
194122,17693,1989,521,40.75045,-73.994811,8 Ave & W 31 St,2,307,40.714275,-73.9899,Canal St & Rutgers St,2013-09-18 19:40:47.000,2013-09-18 20:05:05.000,1458,Subscriber
2057973,19879,1969,248,40.721854,-74.007718,Laight St & Hudson St,1,79,40.719116,-74.006667,Franklin St & W Broadway,2013-10-23 18:07:10.000,2013-10-23 18:09:47.000,157,Subscriber
2899049,23327,1900,144,40.698399,-73.980689,Nassau St & Navy St,0,532,40.710451,-73.960876,S 5 Pl & S 4 St,2015-09-27 17:27:55.000,2015-09-27 17:42:27.000,871,Customer
764721,18378,1989,483,40.732233,-73.9889,E 12 St & 3 Ave,2,495,40.762699,-73.993012,W 47 St & 10 Ave,2015-06-17 20:33:00.000,2015-06-17 20:53:00.000,1212,Subscriber
133782,32215,1985,3695,40.72687,-73.98919,E 5 St & 2 Ave,1,401,40.720196,-73.989978,Allen St & Rivington St,2018-09-18 17:54:11.716,2018-09-18 17:59:04.922,293,Subscriber


Check for missing values – there should be none:

In [6]:
cb.isnull().sum()

bikeid                   0
birthyear                0
endstationid             0
endstationlatitude       0
endstationlongitude      0
endstationname           0
gender                   0
startstationid           0
startstationlatitude     0
startstationlongitude    0
startstationname         0
starttime                0
stoptime                 0
tripduration             0
usertype                 0
dtype: int64

## Data preprocessing

Show unique values per feature:

In [7]:
pd.DataFrame.from_records([(col, cb[col].nunique()) for col in cb.columns],
                          columns=['Feature', 'Number of Unique Values']).sort_values(by=['Number of Unique Values'])

Unnamed: 0,Feature,Number of Unique Values
6,gender,3
14,usertype,3
1,birthyear,85
7,startstationid,1050
2,endstationid,1066
10,startstationname,1080
5,endstationname,1098
9,startstationlongitude,1107
4,endstationlongitude,1125
8,startstationlatitude,1183


### Define features

Define numerical features:

In [8]:
map_feat = ['endstationlatitude','endstationlongitude','startstationlatitude','startstationlongitude']
datetime_feat = ['birthyear','starttime', 'stoptime']
cont_feat = map_feat + datetime_feat + ['tripduration']

Define categorical features:

In [9]:
num_nom_feat = ['bikeid','endstationid','startstationid','gender']
cat_nom_feat = ['endstationname','startstationname','usertype']
nom_feat = num_nom_feat + cat_nom_feat

### Perform datatype conversions

Convert to datetime format:

In [10]:
cb['birthyear'] = pd.to_datetime(cb.birthyear, format='%Y')

Convert features to `float16` or `int16` to reduce memory:

In [11]:
float_to_int = ['startstationid','endstationid','tripduration','bikeid','gender']

cb[float_to_int] = cb[float_to_int].apply(lambda x: x.astype('int16'))

Convert untreated nominal features to categorical datatype to reduce memory:

In [12]:
cb[cat_nom_feat] = cb[cat_nom_feat].apply(lambda x: x.astype('category'))

Check for successful datatype conversion and reduction in memory:

In [13]:
cb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4015035 entries, 0 to 4015034
Data columns (total 15 columns):
bikeid                   int16
birthyear                datetime64[ns]
endstationid             int16
endstationlatitude       float64
endstationlongitude      float64
endstationname           category
gender                   int16
startstationid           int16
startstationlatitude     float64
startstationlongitude    float64
startstationname         category
starttime                datetime64[ns]
stoptime                 datetime64[ns]
tripduration             int16
usertype                 category
dtypes: category(3), datetime64[ns](3), float64(4), int16(5)
memory usage: 272.0 MB


## Feature engineering

### Imputation

In [14]:
cb.groupby(['startstationid', 'startstationname', 'startstationlatitude', 'startstationlongitude']).size()

startstationid  startstationname               startstationlatitude  startstationlongitude
72              W 52 St & 11 Ave               40.767272             -73.993929               10161
79              Franklin St & W Broadway       40.719116             -74.006667                7828
82              St James Pl & Pearl St         40.711174             -74.000165                3249
83              Atlantic Ave & Fort Greene Pl  40.683826             -73.976323                4079
116             W 17 St & 8 Ave                40.741776             -74.001497               12170
                                                                                              ...  
3910            Greene Ave & Grandview Ave     40.709697             -73.907856                   1
3911            Wyckoff St & Nevins St         40.683426             -73.984275                   8
3916            Pearl St & Peck Slip           40.708485             -74.002751                   7
3917     

Recompute station coordinates to correct inconsistencies:

In [15]:
end_coords = ['endstationid','endstationname','endstationlatitude','endstationlongitude']
start_coords = ['startstationid','startstationname','startstationlatitude','startstationlongitude']

In [16]:
stn3240_lat = cb[end_coords].loc[(cb['endstationid'] == 3240) & (cb['endstationlatitude'] != 40.750)].iloc[0][2]
stn3240_lon = cb[end_coords].loc[(cb['endstationid'] == 3240) & (cb['endstationlatitude'] != 74.0)].iloc[0][3]

In [17]:
cb.loc[cb['endstationlatitude'] == 40.75000, 'endstationlatitude'] = stn3240_lat
cb.loc[cb['endstationlongitude'] == -74.0, 'endstationlongitude'] = stn3240_lon
cb.loc[cb['startstationlongitude']== -74.0, 'startstationlongitude'] = stn3240_lon
cb.loc[cb['startstationlatitude'] == 40.75000, 'startstationlatitude'] = stn3240_lat

### Distance features

Define helper function to calculate distance between coordinates:

In [18]:
from pyproj import Geod

def calc_vincenty_dist(lat1, lon1, lat2, lon2): 
    az12, az21, dist = Geod(ellps='WGS84').inv(lon1, lat1, lon2, lat2)
    return dist

In [19]:
# Haversine & inclination 29 degrees to True north
# Credits: https://www.movable-type.co.uk/scripts/latlong.html,
# https://gist.github.com/jkAtGitHub/8ae7da4d5dacb9969bff43500b5efbc0#file-manhattan_dist-py
def calc_haversine_dist(lat1, lon1, lat2, lon2):
    lat1, lat2, lon1, lon2 = np.radians(lat1), np.radians(lat2), np.radians(lon1), np.radians(lon2)
    dlat, dlon = lat2 - lat1, lon2 - lon1
    r = 3963 # Earth's radius in miles
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a)) 
    
    total_dist = r * c
    return total_dist

def calc_manh_disth(lat1, lon1, lat2, lon2):
    start = np.stack([lat1, lon1], axis=1)
    end = np.stack([lat2, lon2], axis=1)
    
    theta1, theta2 = np.radians(-28.904), np.radians(28.904)
    rmat1 = np.array([[np.cos(theta1), np.sin(theta1)], 
                      [-np.sin(theta1), np.cos(theta1)]])
    rmat2 = np.array([[np.cos(theta2), np.sin(theta2)], 
                      [-np.sin(theta2), np.cos(theta2)]])
    
    start_rot = rmat1 @ start.T
    end_rot = rmat1 @ end.T
    
    hinge = np.stack((start_rot[0,:], end_rot[1,:]))
    hinge_coords = rmat2 @ hinge
    
    manh_dist = calc_haversine_dist(start.T[0], start.T[1], hinge_coords[0], hinge_coords[1]) + calc_haversine_dist(hinge_coords[0], hinge_coords[1], end.T[0], end.T[1])
    return manh_dist

# Unused functions -----------------------------------------------
# # Cartesian Coordinate system, output in latlong dist
# def calc_manh_distc(lat, lon):
#     return sum(abs(lat_i-lat_j) for lat_i, lat_j in zip(lat, lon))

Create new features, `vicentydistance` and `manhdistance` (both in miles):

In [20]:
cb['vicentydistance'] = calc_vincenty_dist(cb.startstationlatitude.tolist(), 
                                        cb.startstationlongitude.tolist(),
                                        cb.endstationlatitude.tolist(),
                                        cb.endstationlongitude.tolist())
cb['vicentydistance'] = cb['vicentydistance'].apply(lambda x: x*0.000621371) # Convert from meters to miles

In [21]:
cb['manhdistance'] = calc_manh_disth(cb.startstationlatitude,
                                     cb.startstationlongitude,
                                     cb.endstationlatitude,
                                     cb.endstationlongitude)

### Datetime features

Create new features, `startmonth` and `endmonth`:

In [22]:
cb['startmonth'] = cb['starttime'].dt.month.astype('int16')
cb['stopmonth'] = cb['stoptime'].dt.month.astype('int16')

Create new features for days of week, `startdayname` (numerical code: `startday`), `enddayname` (numerical code: `stopday`), `startdaytype`, and `startendtype`:

In [23]:
cb['startdayname'] = cb['starttime'].dt.day_name().astype('category')
cb['startday'] = cb['starttime'].dt.dayofweek.astype('int16')
cb['stopdayname'] = cb['stoptime'].dt.day_name().astype('category')
cb['stopday'] = cb['stoptime'].dt.dayofweek.astype('int16')

In [24]:
start_weekday_conds = [(cb.startday >= 0) & (cb.startday < 5), (cb.startday >= 5) & (cb.startday < 7)]
end_weekday_conds = [(cb.startday >= 0) & (cb.startday < 5), (cb.startday >= 5) & (cb.startday < 7)]
day_type_labels = ['Weekday','Weekend']

In [25]:
cb['startdaytype'] = np.select(start_weekday_conds, day_type_labels)
cb['startdaytype'] = cb['startdaytype'].astype('category')

In [26]:
cb['enddaytype'] = np.select(end_weekday_conds, day_type_labels)
cb['enddaytype'] = cb['enddaytype'].astype('category')

Create new features, `starthour` and `endhour`:

In [27]:
cb['starthour'] = cb['starttime'].dt.hour.astype('int16')
cb['endhour'] = cb['stoptime'].dt.hour.astype('int16')

Create new feature, `rushhour` (as defined by MTA):

In [28]:
start_rushhour_conds = [(cb.starthour >= 0) & (cb.starthour < 6), 
                       (cb.starthour >= 6) & (cb.starthour < 10),
                       (cb.starthour >= 10) & (cb.starthour < 15),
                       (cb.starthour >= 15) & (cb.starthour < 21),
                       (cb.starthour >= 21) & (cb.starthour <= 23)]
end_rushhour_conds = [(cb.endhour >= 0) & (cb.endhour < 6), 
                       (cb.endhour >= 6) & (cb.endhour < 10),
                       (cb.endhour >= 10) & (cb.endhour < 15),
                       (cb.endhour >= 15) & (cb.endhour < 21),
                       (cb.endhour >= 21) & (cb.endhour <= 23)]
rushhour_labels = ['Morning Off-Peak', 'Morning Rush', 'Afternoon Off-Peak', 'Afternoon Rush','Evening Off-Peak']

In [29]:
cb['startrushhour'] = np.select(start_rushhour_conds, rushhour_labels)
cb['startrushhour'] = cb['startrushhour'].astype('category')

In [30]:
cb['endrushhour'] = np.select(start_rushhour_conds, rushhour_labels)
cb['endrushhour'] = cb['startrushhour'].astype('category')

Create new features, `starttimeofday` and `endtimeofday`:

In [31]:
start_timeofday_conds = [(cb.starthour >= 6) & (cb.starthour < 12), 
                         (cb.starthour >= 12) & (cb.starthour < 17),
                         (cb.starthour >= 17) & (cb.starthour < 20),
                         ((cb.starthour >= 20) & (cb.starthour < 23)) | (cb.starthour == 0)]
end_timeofday_conds = [(cb.endhour >= 6) & (cb.endhour < 12), 
                       (cb.endhour >= 12) & (cb.endhour < 17),
                       (cb.endhour >= 17) & (cb.endhour < 20),
                       ((cb.endhour >= 20) & (cb.endhour < 23)) | (cb.endhour == 0)]
timeofday_labels = ['Morning (6AM-11:59PM)', 'Afternoon (12PM-4:59PM)', 'Evening (5-7:59PM)', 'Night (8PM-5:59AM)']

In [32]:
cb['starttimeofday'] = np.select(start_timeofday_conds, timeofday_labels)
cb['starttimeofday'] = cb['starttimeofday'].astype('category')

In [33]:
cb['endtimeofday'] = np.select(end_timeofday_conds, timeofday_labels)
cb['endtimeofday'] = cb['endtimeofday'].astype('category')

### Demographic features

Create new features, `age_at_ride` and `age_group`:

In [34]:
cb['age_at_ride'] = cb['starttime'].dt.year.astype('int16') - cb['birthyear'].dt.year.astype('int16')
cb['age_at_ride'] = cb['age_at_ride'].astype('int16')

In [35]:
cb['age_at_ride'].describe().astype('int16')

count    17339
mean        44
std         22
min         16
25%         30
50%         37
75%         50
max        120
Name: age_at_ride, dtype: int16

In [36]:
age_groups_conds = [(cb.age_at_ride >= 16) & (cb.age_at_ride < 20), 
                    (cb.age_at_ride >= 20) & (cb.age_at_ride < 30),
                    (cb.age_at_ride >= 30) & (cb.age_at_ride < 40),
                    (cb.age_at_ride >= 40) & (cb.age_at_ride < 50),
                    (cb.age_at_ride >= 50) & (cb.age_at_ride < 60),
                    (cb.age_at_ride >= 60) & (cb.age_at_ride <= 120)]
age_groups_labels = ['Teens (16-19)', '20s', '30s', '40s', '50s', '60s+']

In [37]:
cb['age_group'] = np.select(age_groups_conds, age_groups_labels)
cb['age_group'] = cb['age_group'].astype('category')

### Geographic features

Create new features, `startboro`, `endboro`, `startneighborhood`, and `endneighborhood`:

In [38]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm._tqdm_notebook import tqdm_notebook

In [39]:
cb['startcoords'] = list(zip(cb.startstationlatitude, cb.startstationlongitude))
cb['endcoords'] = list(zip(cb.endstationlatitude, cb.endstationlongitude))

In [40]:
print(cb['startcoords'].nunique())
print(cb['endcoords'].nunique())

1182
1199


Create data frame `unique_coords` consisting of unique coordinates for reverse geocoding processing:

In [41]:
end_coords = end_coords + ['endcoords']
start_coords = start_coords + ['startcoords']
coords_all = end_coords + start_coords

In [42]:
unique_startcoords = pd.DataFrame(cb[start_coords].groupby(['startstationid', 'startcoords']).size()).reset_index()
unique_startcoords.columns = ['startstationid','startcoords','todrop']
unique_startcoords.drop(columns=['todrop'], inplace=True)

In [43]:
unique_endcoords = pd.DataFrame(cb[end_coords].groupby(['endstationid', 'endcoords']).size()).reset_index()
unique_endcoords.columns = ['endstationid','endcoords','todrop']
unique_endcoords.drop(columns=['todrop'], inplace=True)

In [44]:
unique_coords = pd.concat([unique_startcoords, unique_endcoords], axis=1)

Reverse geocode station coordinates for borough and neighborhood:

In [45]:
geolocator = Nominatim(user_agent='myGeocoder', timeout=10)
rgeoloc = RateLimiter(geolocator.reverse, min_delay_seconds=5)

In [46]:
tqdm_notebook().pandas(desc='Progress Bar')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

  from pandas import Panel


In [47]:
# unique_coords['endaddress'] = unique_coords['endcoords'].progress_apply(rgeoloc)

In [48]:
# unique_coords['startaddress'] = unique_coords['startcoords'].dropna().progress_apply(rgeoloc)

Save as temporary .csv file to avoid running reverse geocoding again:

In [49]:
# unique_coords.to_csv('../../data/02_processed/unique_coords.csv')

Define helper function:

In [50]:
import re

In [51]:
def pattern_searcher(search_str:str, search_list:str):
    search_obj = re.search(search_list, search_str)
    if search_obj :
        return_str = search_str[search_obj.start(): search_obj.end()]
    else:
        return_str = 'NA'
    return return_str

Extract neighborhood and borough from `startaddress` and `endaddress`:

In [52]:
# RUN THIS INSTEAD OF REVERSE GEOCODING
unique_coords = pd.read_csv('../../data/02_processed/unique_coords.csv')
unique_coords = unique_coords.drop(columns=['Unnamed: 0'])
unique_coords['startaddress'] = unique_coords['startaddress'].astype(str)

In [53]:
m_hoods = ["Lower West Side","Financial District","Lower East Side","Tribeca",
           "East Village","Alphabet City","Stuy Town","Peter Cooper Village",
           "Hudson Square","SoHo","Nolita","Lower Manhattan","Little Italy", "NoHo",
           "Greenwich Village","West Village","Kips Bay","Murray Hill","Gramercy",
           "Hell's Kitchen","East Village","Upper East Side","Upper West Side",
           "NoMad","Chinatown","Korea Town","Midtown","Midtown West","Midtown East",
           "Midtown South","Lincoln Square","Theater District","Chelsea","Harlem",
           "East Harlem","Garment District","Meatpacking District","Battery Park City",
           "Lenox Hill","Tudor City","Five Points","Turtle Bay","Yorkville",
           "Carnegie Hill","Manhattanville","Manhattan Valley","Columbus Circle",
           "Two Bridges","Broadway","Morningside Heights"]

bk_hoods = ["Bay Ridge", "Sunset Park", "Bensonhurst", "Sheepshead Bay",
            "Borough Park", "Midwood", "Flatbush", "East Flatbush",
            "Park Slope", "East New York", "Bedford-Stuyvesant", 
            "Williamsburg", "Greenpoint", "Red Hook", "Downtown Brooklyn",
            "DUMBO", "Brownsville", "Prospect Park", "Fort Hamilton", 
            "Cypress Hills", "Bushwick", "Canarsie", "Brooklyn Heights",
            "Cobble Hill","Canarsie","Bergen Beach","Flatlands","Bronwsville",
            "Crown Heights","Brooklyn Navy Yard","Boerum Hill","Caroll Gardens",
            "Gowanus","Sunset Park","Coney Island","Gravesend","Fort Hamilton",
            "Prospect Heights","Fort Greene","Broadway Junction","Clinton Hill",
            "Ocean Hill","Kings County","Nolan Park","Columbia Street Waterfront District"]

q_hoods = ["Astoria", "Long Island City", "Steinway", "Ridgewood", "Woodside", 
           "Elmhurst", "Jackson Heights", "Corona", "Murray Hill", "Flushing", 
           "Kew Gardens", "Fresh Meadows", "Jamaica", "Bayside", "Whitestone",
           "Sunnyside","Rego Park","Bayside","Maspeth","Queens County"]

nj_hoods = ["Downtown Jersey City","Communipaw","Newark","Bergen","Hoboken",
            "Newport","Grand Street","Hudson Street","Indian Square","Colgate Center",
           "Croxton"]

all_hoods = m_hoods + bk_hoods + q_hoods + nj_hoods

boros_plus = ['Manhattan','Brooklyn','Bronx','Queens','Staten Island','Jersey City']

In [54]:
all_pattern = '|'.join(all_hoods)
boros_pattern = '|'.join(boros_plus)

unique_coords['startneighborhood'] = unique_coords['startaddress'].apply(lambda x: pattern_searcher(search_str=x, search_list=all_pattern))
unique_coords['endneighborhood'] = unique_coords['endaddress'].apply(lambda x: pattern_searcher(search_str=x, search_list=all_pattern))




In [55]:
unique_coords['startboro'] = unique_coords['startaddress'].apply(lambda x: pattern_searcher(search_str=x, search_list=boros_pattern))
unique_coords['endboro'] = unique_coords['endaddress'].apply(lambda x: pattern_searcher(search_str=x, search_list=boros_pattern))

In [56]:
unique_coords.to_csv('../../data/02_processed/unique_coords_final_v2.csv')

In [57]:
unique_coords['startstationid'] = unique_coords['startstationid'].fillna(0).astype('int16')

Create dictionaries to conditionally map values of `unique_coords` onto `cb`:

In [58]:
cb_master = cb.copy()

In [59]:
cb_master['startneighborhood'] = cb_master['startstationid']
cb_master['endneighborhood'] = cb_master['endstationid']

In [60]:
sstnid_hood_dict = dict(unique_coords[['startstationid','startneighborhood']].values)
cb_master['startneighborhood'] = cb_master.startneighborhood.map(sstnid_hood_dict)

In [61]:
estnid_hood_dict = dict(unique_coords[['endstationid','endneighborhood']].values)
cb_master['endneighborhood'] = cb_master.endneighborhood.map(estnid_hood_dict)

In [62]:
cb_master['startboro'] = cb_master['startneighborhood']
cb_master['endboro'] = cb_master['startneighborhood']

In [63]:
shood_boro_dict = dict(unique_coords[['startneighborhood','startboro']].values)
cb_master['startboro'] = cb_master.startboro.map(shood_boro_dict)

In [64]:
ehood_boro_dict = dict(unique_coords[['endneighborhood','endboro']].values)
cb_master['endboro'] = cb_master.endboro.map(ehood_boro_dict)

In [65]:
cb_master.isnull().sum()

bikeid                   0
birthyear                0
endstationid             0
endstationlatitude       0
endstationlongitude      0
endstationname           0
gender                   0
startstationid           0
startstationlatitude     0
startstationlongitude    0
startstationname         0
starttime                0
stoptime                 0
tripduration             0
usertype                 0
vicentydistance          0
manhdistance             0
startmonth               0
stopmonth                0
startdayname             0
startday                 0
stopdayname              0
stopday                  0
startdaytype             0
enddaytype               0
starthour                0
endhour                  0
startrushhour            0
endrushhour              0
starttimeofday           0
endtimeofday             0
age_at_ride              0
age_group                0
startcoords              0
endcoords                0
startneighborhood        0
endneighborhood          0
s

Perform datatype conversion to reduce memory:

In [66]:
cb_master.drop(columns=['startcoords','endcoords'], inplace=True)

In [67]:
cat_nom_feat = ['endstationname','startstationname','usertype',
               'startneighborhood','startboro','endneighborhood','endboro',
               'startdayname','stopdayname','startdaytype','enddaytype',
               'startrushhour','endrushhour','starttimeofday','endtimeofday',
               'age_group']

In [68]:
cb_master[cat_nom_feat] = cb_master[cat_nom_feat].apply(lambda x: x.astype('category'))

In [69]:
float_to_int = ['startstationid','endstationid','tripduration','bikeid','gender']

cb[float_to_int] = cb[float_to_int].apply(lambda x: x.astype('int16'))

In [70]:
cb_master['vicentydistance'] = cb_master['vicentydistance'].astype('float16')
cb_master['manhdistance'] = cb_master['manhdistance'].astype('float16')

### Final check

Inspect memory:

In [71]:
cb_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4015035 entries, 0 to 4015034
Data columns (total 37 columns):
bikeid                   int16
birthyear                datetime64[ns]
endstationid             int16
endstationlatitude       float64
endstationlongitude      float64
endstationname           category
gender                   int16
startstationid           int16
startstationlatitude     float64
startstationlongitude    float64
startstationname         category
starttime                datetime64[ns]
stoptime                 datetime64[ns]
tripduration             int16
usertype                 category
vicentydistance          float16
manhdistance             float16
startmonth               int16
stopmonth                int16
startdayname             category
startday                 int16
stopdayname              category
stopday                  int16
startdaytype             category
enddaytype               category
starthour                int16
endhour              

Inspect missing values:

In [72]:
cb_master.isnull().sum()

bikeid                   0
birthyear                0
endstationid             0
endstationlatitude       0
endstationlongitude      0
endstationname           0
gender                   0
startstationid           0
startstationlatitude     0
startstationlongitude    0
startstationname         0
starttime                0
stoptime                 0
tripduration             0
usertype                 0
vicentydistance          0
manhdistance             0
startmonth               0
stopmonth                0
startdayname             0
startday                 0
stopdayname              0
stopday                  0
startdaytype             0
enddaytype               0
starthour                0
endhour                  0
startrushhour            0
endrushhour              0
starttimeofday           0
endtimeofday             0
age_at_ride              0
age_group                0
startneighborhood        0
endneighborhood          0
startboro                0
endboro                  0
d

Create csv file of feature engineering citibike dataset:

In [None]:
cb_master.to_csv('../../data/02_processed/citibike_feature_engineered_final_v2.csv', index=False)

## Appendix

### Identifying unique trips

Create new feature, `uniquetripid`:

In [None]:
cb_filtered = cb_master[cb_master['birthyear'].dt.year.astype('int16') > 1920]
cb_filtered['birthyear'] = cb_filtered['birthyear'].dt.year.astype('int16')

In [None]:
cb_filtered.groupby(['birthyear','gender','usertype','startstationid','starttime','startneighborhood']).size()

In [None]:
cb_filtered.groupby(['startneighborhood','startstationid','starttime','usertype','birthyear','gender']).size()