# Preprocessed Citibike Data: Feature Engineering

## Import libraries

In [2]:
import numpy as np
import pandas as pd
import datetime as dt
from pandas.api.types import CategoricalDtype

## Import and load data

Import preprocessed data:

In [3]:
cb_raw = pd.read_csv('../../data/02_processed/citibike_final.csv', parse_dates=['starttime', 'stoptime'])

Make a copy: 

In [4]:
cb = cb_raw.copy()

## Inspect data frame

Get dimensions of data frame and datatypes of each feature:

In [5]:
cb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4015035 entries, 0 to 4015034
Data columns (total 15 columns):
bikeid                   int64
birthyear                int64
endstationid             int64
endstationlatitude       float64
endstationlongitude      float64
endstationname           object
gender                   int64
startstationid           int64
startstationlatitude     float64
startstationlongitude    float64
startstationname         object
starttime                datetime64[ns]
stoptime                 datetime64[ns]
tripduration             int64
usertype                 object
dtypes: datetime64[ns](2), float64(4), int64(6), object(3)
memory usage: 459.5+ MB


Randomly sample five observations:

In [6]:
cb.sample(n=5)

Unnamed: 0,bikeid,birthyear,endstationid,endstationlatitude,endstationlongitude,endstationname,gender,startstationid,startstationlatitude,startstationlongitude,startstationname,starttime,stoptime,tripduration,usertype
2420308,19893,1980,529,40.75757,-73.990985,W 42 St & 8 Ave,1,515,40.760094,-73.994618,W 43 St & 10 Ave,2015-07-26 13:04:26,2015-07-26 13:08:09,223,Subscriber
3961621,32473,1981,305,40.760958,-73.967245,E 58 St & 3 Ave,1,2022,40.759107,-73.959223,E 60 St & York Ave,2017-10-22 13:10:57,2017-10-22 13:15:05,248,Subscriber
656469,18219,1974,536,40.741444,-73.975361,1 Ave & E 30 St,1,477,40.756405,-73.990026,W 41 St & 8 Ave,2015-10-12 08:47:41,2015-10-12 09:05:32,1070,Subscriber
3149308,23490,1988,519,40.751873,-73.977706,Pershing Square North,1,3141,40.765005,-73.958185,1 Ave & E 68 St,2016-11-20 09:41:48,2016-11-20 09:51:15,566,Subscriber
2630665,26114,1957,2010,40.721655,-74.002347,Grand St & Greene St,1,312,40.722055,-73.989111,Allen St & Stanton St,2017-02-17 19:00:33,2017-02-17 19:10:03,569,Subscriber


Check for missing values – there should be none:

In [7]:
cb.isnull().sum()

bikeid                   0
birthyear                0
endstationid             0
endstationlatitude       0
endstationlongitude      0
endstationname           0
gender                   0
startstationid           0
startstationlatitude     0
startstationlongitude    0
startstationname         0
starttime                0
stoptime                 0
tripduration             0
usertype                 0
dtype: int64

## Data preprocessing

Show unique values per feature:

In [8]:
pd.DataFrame.from_records([(col, cb[col].nunique()) for col in cb.columns],
                          columns=['Feature', 'Number of Unique Values']).sort_values(by=['Number of Unique Values'])

Unnamed: 0,Feature,Number of Unique Values
6,gender,3
14,usertype,3
1,birthyear,85
7,startstationid,1050
2,endstationid,1066
10,startstationname,1080
5,endstationname,1098
9,startstationlongitude,1107
4,endstationlongitude,1125
8,startstationlatitude,1183


### Define features

Define numerical features:

In [9]:
map_feat = ['endstationlatitude','endstationlongitude','startstationlatitude','startstationlongitude']
datetime_feat = ['birthyear','starttime', 'stoptime']
cont_feat = map_feat + datetime_feat + ['tripduration']

Define categorical features:

In [10]:
num_nom_feat = ['bikeid','endstationid','startstationid','gender']
cat_nom_feat = ['endstationname','startstationname','usertype']
nom_feat = num_nom_feat + cat_nom_feat

### Perform datatype conversions

Convert to datetime format:

In [11]:
cb['birthyear'] = pd.to_datetime(cb.birthyear, format='%Y')

Convert features to `float16` or `int16` to reduce memory:

In [12]:
float_to_int = ['startstationid','endstationid','tripduration','bikeid','gender']

cb[float_to_int] = cb[float_to_int].apply(lambda x: x.astype('int16'))

Convert untreated nominal features to categorical datatype to reduce memory:

In [13]:
cb[cat_nom_feat] = cb[cat_nom_feat].apply(lambda x: x.astype('category'))

Check for successful datatype conversion and reduction in memory:

In [14]:
cb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4015035 entries, 0 to 4015034
Data columns (total 15 columns):
bikeid                   int16
birthyear                datetime64[ns]
endstationid             int16
endstationlatitude       float64
endstationlongitude      float64
endstationname           category
gender                   int16
startstationid           int16
startstationlatitude     float64
startstationlongitude    float64
startstationname         category
starttime                datetime64[ns]
stoptime                 datetime64[ns]
tripduration             int16
usertype                 category
dtypes: category(3), datetime64[ns](3), float64(4), int16(5)
memory usage: 272.0 MB


## Feature engineering

### Imputation

In [15]:
cb.groupby(['startstationid', 'startstationname', 'startstationlatitude', 'startstationlongitude']).size()

startstationid  startstationname               startstationlatitude  startstationlongitude
72              W 52 St & 11 Ave               40.767272             -73.993929               10161
79              Franklin St & W Broadway       40.719116             -74.006667                7828
82              St James Pl & Pearl St         40.711174             -74.000165                3249
83              Atlantic Ave & Fort Greene Pl  40.683826             -73.976323                4079
116             W 17 St & 8 Ave                40.741776             -74.001497               12170
                                                                                              ...  
3910            Greene Ave & Grandview Ave     40.709697             -73.907856                   1
3911            Wyckoff St & Nevins St         40.683426             -73.984275                   8
3916            Pearl St & Peck Slip           40.708485             -74.002751                   7
3917     

Recompute station coordinates to correct inconsistencies:

In [16]:
end_coords = ['endstationid','endstationname','endstationlatitude','endstationlongitude']
start_coords = ['startstationid','startstationname','startstationlatitude','startstationlongitude']

In [17]:
stn3240_lat = cb[end_coords].loc[(cb['endstationid'] == 3240) & (cb['endstationlatitude'] != 40.750)].iloc[0][2]
stn3240_lon = cb[end_coords].loc[(cb['endstationid'] == 3240) & (cb['endstationlatitude'] != 74.0)].iloc[0][3]

In [18]:
cb.loc[cb['endstationlatitude'] == 40.75000, 'endstationlatitude'] = stn3240_lat
cb.loc[cb['endstationlongitude'] == -74.0, 'endstationlongitude'] = stn3240_lon
cb.loc[cb['startstationlongitude']== -74.0, 'startstationlongitude'] = stn3240_lon
cb.loc[cb['startstationlatitude'] == 40.75000, 'startstationlatitude'] = stn3240_lat

### Distance features

Define helper function to calculate distance between coordinates:

In [19]:
from pyproj import Geod

def calc_vincenty_dist(lat1, lon1, lat2, lon2): 
    az12, az21, dist = Geod(ellps='WGS84').inv(lon1, lat1, lon2, lat2)
    return dist

In [20]:
# Haversine & inclination 29 degrees to True north
# Credits: https://www.movable-type.co.uk/scripts/latlong.html,
# https://gist.github.com/jkAtGitHub/8ae7da4d5dacb9969bff43500b5efbc0#file-manhattan_dist-py
def calc_haversine_dist(lat1, lon1, lat2, lon2):
    lat1, lat2, lon1, lon2 = np.radians(lat1), np.radians(lat2), np.radians(lon1), np.radians(lon2)
    dlat, dlon = lat2 - lat1, lon2 - lon1
    r = 3963 # Earth's radius in miles
    
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a)) 
    
    total_dist = r * c
    return total_dist

def calc_manh_disth(lat1, lon1, lat2, lon2):
    start = np.stack([lat1, lon1], axis=1)
    end = np.stack([lat2, lon2], axis=1)
    
    theta1, theta2 = np.radians(-28.904), np.radians(28.904)
    rmat1 = np.array([[np.cos(theta1), np.sin(theta1)], 
                      [-np.sin(theta1), np.cos(theta1)]])
    rmat2 = np.array([[np.cos(theta2), np.sin(theta2)], 
                      [-np.sin(theta2), np.cos(theta2)]])
    
    start_rot = rmat1 @ start.T
    end_rot = rmat1 @ end.T
    
    hinge = np.stack((start_rot[0,:], end_rot[1,:]))
    hinge_coords = rmat2 @ hinge
    
    manh_dist = calc_haversine_dist(start.T[0], start.T[1], hinge_coords[0], hinge_coords[1]) + calc_haversine_dist(hinge_coords[0], hinge_coords[1], end.T[0], end.T[1])
    return manh_dist

# Unused functions -----------------------------------------------
# # Cartesian Coordinate system, output in latlong dist
# def calc_manh_distc(lat, lon):
#     return sum(abs(lat_i-lat_j) for lat_i, lat_j in zip(lat, lon))

Create new features, `vicentydistance` and `manhdistance` (both in miles):

In [21]:
cb['vicentydistance'] = calc_vincenty_dist(cb.startstationlatitude.tolist(), 
                                        cb.startstationlongitude.tolist(),
                                        cb.endstationlatitude.tolist(),
                                        cb.endstationlongitude.tolist())
cb['vicentydistance'] = cb['vicentydistance'].apply(lambda x: x*0.000621371) # Convert from meters to miles

In [22]:
cb['manhdistance'] = calc_manh_disth(cb.startstationlatitude,
                                     cb.startstationlongitude,
                                     cb.endstationlatitude,
                                     cb.endstationlongitude)

### Datetime features

Create new features, `startmonth` and `endmonth`:

In [23]:
cb['startmonth'] = cb['starttime'].dt.month.astype('int16')
cb['stopmonth'] = cb['stoptime'].dt.month.astype('int16')

Create new features for days of week, `startdayname` (numerical code: `startday`), `enddayname` (numerical code: `stopday`), `startdaytype`, and `startendtype`:

In [24]:
cb['startdayname'] = cb['starttime'].dt.day_name().astype('category')
cb['startday'] = cb['starttime'].dt.dayofweek.astype('int16')
cb['stopdayname'] = cb['stoptime'].dt.day_name().astype('category')
cb['stopday'] = cb['stoptime'].dt.dayofweek.astype('int16')

In [25]:
start_weekday_conds = [(cb.startday >= 0) & (cb.startday < 5), (cb.startday >= 5) & (cb.startday < 7)]
end_weekday_conds = [(cb.startday >= 0) & (cb.startday < 5), (cb.startday >= 5) & (cb.startday < 7)]
day_type_labels = ['Weekday','Weekend']

In [26]:
cb['startdaytype'] = np.select(start_weekday_conds, day_type_labels)
cb['startdaytype'] = cb['startdaytype'].astype('category')

In [27]:
cb['enddaytype'] = np.select(end_weekday_conds, day_type_labels)
cb['enddaytype'] = cb['enddaytype'].astype('category')

Create new features, `starthour` and `endhour`:

In [28]:
cb['starthour'] = cb['starttime'].dt.hour.astype('int16')
cb['endhour'] = cb['stoptime'].dt.hour.astype('int16')

Create new feature, `rushhour` (as defined by MTA):

In [29]:
start_rushhour_conds = [(cb.starthour >= 0) & (cb.starthour < 6), 
                       (cb.starthour >= 6) & (cb.starthour < 10),
                       (cb.starthour >= 10) & (cb.starthour < 15),
                       (cb.starthour >= 15) & (cb.starthour < 21),
                       (cb.starthour >= 21) & (cb.starthour <= 23)]
end_rushhour_conds = [(cb.endhour >= 0) & (cb.endhour < 6), 
                       (cb.endhour >= 6) & (cb.endhour < 10),
                       (cb.endhour >= 10) & (cb.endhour < 15),
                       (cb.endhour >= 15) & (cb.endhour < 21),
                       (cb.endhour >= 21) & (cb.endhour <= 23)]
rushhour_labels = ['Morning Off-Peak', 'Morning Rush', 'Afternoon Off-Peak', 'Afternoon Rush','Evening Off-Peak']

In [30]:
cb['startrushhour'] = np.select(start_rushhour_conds, rushhour_labels)
cb['startrushhour'] = cb['startrushhour'].astype('category')

In [31]:
cb['endrushhour'] = np.select(start_rushhour_conds, rushhour_labels)
cb['endrushhour'] = cb['startrushhour'].astype('category')

Create new features, `starttimeofday` and `endtimeofday`:

In [32]:
start_timeofday_conds = [(cb.starthour >= 6) & (cb.starthour < 12), 
                         (cb.starthour >= 12) & (cb.starthour < 17),
                         (cb.starthour >= 17) & (cb.starthour < 20),
                         ((cb.starthour >= 20) & (cb.starthour < 23)) | (cb.starthour == 0)]
end_timeofday_conds = [(cb.endhour >= 6) & (cb.endhour < 12), 
                       (cb.endhour >= 12) & (cb.endhour < 17),
                       (cb.endhour >= 17) & (cb.endhour < 20),
                       ((cb.endhour >= 20) & (cb.endhour < 23)) | (cb.endhour == 0)]
timeofday_labels = ['Morning (6AM-11:59PM)', 'Afternoon (12PM-4:59PM)', 'Evening (5-7:59PM)', 'Night (8PM-5:59AM)']

In [33]:
cb['starttimeofday'] = np.select(start_timeofday_conds, timeofday_labels)
cb['starttimeofday'] = cb['starttimeofday'].astype('category')

In [34]:
cb['endtimeofday'] = np.select(end_timeofday_conds, timeofday_labels)
cb['endtimeofday'] = cb['endtimeofday'].astype('category')

### Demographic features

Create new features, `age_at_ride` and `age_group`:

In [35]:
cb['age_at_ride'] = cb['starttime'].dt.year.astype('int16') - cb['birthyear'].dt.year.astype('int16')
cb['age_at_ride'] = cb['age_at_ride'].astype('int16')

In [36]:
cb['age_at_ride'].describe().astype('int16')

count    17339
mean        44
std         22
min         16
25%         30
50%         37
75%         50
max        120
Name: age_at_ride, dtype: int16

In [37]:
age_groups_conds = [(cb.age_at_ride >= 16) & (cb.age_at_ride < 20), 
                    (cb.age_at_ride >= 20) & (cb.age_at_ride < 30),
                    (cb.age_at_ride >= 30) & (cb.age_at_ride < 40),
                    (cb.age_at_ride >= 40) & (cb.age_at_ride < 50),
                    (cb.age_at_ride >= 50) & (cb.age_at_ride < 60),
                    (cb.age_at_ride >= 60) & (cb.age_at_ride <= 120)]
age_groups_labels = ['Teens (16-19)', '20s', '30s', '40s', '50s', '60s+']

In [38]:
cb['age_group'] = np.select(age_groups_conds, age_groups_labels)
cb['age_group'] = cb['age_group'].astype('category')

### Geographic features

Create new features, `startboro`, `endboro`, `startneighborhood`, and `endneighborhood`:

In [39]:
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
from tqdm._tqdm_notebook import tqdm_notebook

In [40]:
cb['startcoords'] = list(zip(cb.startstationlatitude, cb.startstationlongitude))
cb['endcoords'] = list(zip(cb.endstationlatitude, cb.endstationlongitude))

In [41]:
print(cb['startcoords'].nunique())
print(cb['endcoords'].nunique())

1182
1199


Create data frame `unique_coords` consisting of unique coordinates for reverse geocoding processing:

In [42]:
end_coords = end_coords + ['endcoords']
start_coords = start_coords + ['startcoords']
coords_all = end_coords + start_coords

In [43]:
unique_startcoords = pd.DataFrame(cb[start_coords].groupby(['startstationid', 'startcoords']).size()).reset_index()
unique_startcoords.columns = ['startstationid','startcoords','todrop']
unique_startcoords.drop(columns=['todrop'], inplace=True)

In [44]:
unique_endcoords = pd.DataFrame(cb[end_coords].groupby(['endstationid', 'endcoords']).size()).reset_index()
unique_endcoords.columns = ['endstationid','endcoords','todrop']
unique_endcoords.drop(columns=['todrop'], inplace=True)

In [45]:
unique_coords = pd.concat([unique_startcoords, unique_endcoords], axis=1)

Reverse geocode station coordinates for borough and neighborhood:

In [46]:
geolocator = Nominatim(user_agent='myGeocoder', timeout=10)
rgeoloc = RateLimiter(geolocator.reverse, min_delay_seconds=5)

In [47]:
tqdm_notebook().pandas(desc='Progress Bar')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


The Panel class is removed from pandas. Accessing it from the top-level namespace will also be removed in the next version



In [42]:
# unique_coords['endaddress'] = unique_coords['endcoords'].progress_apply(rgeoloc)

HBox(children=(IntProgress(value=0, description='Progress Bar', max=1199, style=ProgressStyle(description_widt…




In [43]:
# unique_coords['startaddress'] = unique_coords['startcoords'].dropna().progress_apply(rgeoloc)

HBox(children=(IntProgress(value=0, description='Progress Bar', max=1182, style=ProgressStyle(description_widt…




Save as temporary .csv file to avoid running reverse geocoding again:

In [56]:
# unique_coords.to_csv('../../data/02_processed/unique_coords.csv')

Extract neighborhood and borough from `startaddress` and `endaddress`:

In [71]:
# RUN THIS INSTEAD OF REVERSE GEOCODING
unique_coords = pd.read_csv('../../data/02_processed/unique_coords.csv')
unique_coords = unique_coords.drop(columns=['Unnamed: 0'])
unique_coords['startaddress'] = unique_coords['startaddress'].astype(str)

In [72]:
m_hoods = ["Lower West Side","Financial District","Lower East Side","Tribeca",
           "East Village","Alphabet City","Stuy Town","Peter Cooper Village",
           "Hudson Square","SoHo","Nolita","Lower Manhattan","Little Italy", "NoHo",
           "Greenwich Village","West Village","Kips Bay","Murray Hill","Gramercy",
           "Hell's Kitchen","East Village","Upper East Side","Upper West Side",
           "NoMad","Chinatown","Korea Town","Midtown","Midtown West","Midtown East",
           "Midtown South","Lincoln Square","Theater District","Chelsea","Harlem",
           "East Harlem","Garment District","Meatpacking District","Battery Park City",
           "Lenox Hill","Tudor City","Five Points","Turtle Bay","Yorkville","Lenox Hill",
           "Carnegie Hill","Manhattanville","Manhattan Valley","Ansonia","Columbus Circle",
          "Two Bridges","New York County","Broadway"]

bk_hoods = ["Bay Ridge", "Sunset Park", "Bensonhurst", "Sheepshead Bay",
            "Borough Park", "Midwood", "Flatbush", "East Flatbush",
            "Park Slope", "East New York", "Bedford-Stuyvesant", 
            "Williamsburg", "Greenpoint", "Red Hook", "Downtown Brooklyn",
            "DUMBO", "Brownsville", "Prospect Park", "Fort Hamilton", 
            "Cypress Hills", "Bushwick", "Canarsie", "Brooklyn Heights",
            "Cobble Hill","Canarsie","Bergen Beach","Flatlands","Bronwsville",
            "Crown Heights","Brooklyn Navy Yard","Boerum Hill","Caroll Gardens",
            "Gowanus","Sunset Park","Coney Island","Gravesend","Fort Hamilton",
            "Prospect Heights","Fort Greene","Broadway Junction","Clinton Hill",
            "Ocean Hill","Kings County","Nolan Park","Columbia Street Waterfront District"]

q_hoods = ["Astoria", "Long Island City", "Steinway", "Ridgewood", "Woodside", 
           "Elmhurst", "Jackson Heights", "Corona", "Murray Hill", "Flushing", 
           "Kew Gardens", "Fresh Meadows", "Jamaica", "Bayside", "Whitestone",
           "Sunnyside","Rego Park","Bayside","Maspeth","Queens County"]

nj_hoods = ["Downtown Jersey City","Communipaw","Newark","Bergen","Hoboken",
            "Newport","Grand Street","Hudson Street","Indian Square","Colgate Center"]

all_hoods = m_hoods + bk_hoods + q_hoods + nj_hoods

boros_plus = ['Manhattan','Brooklyn','Bronx','Queens','Staten Island','Jersey City']

In [73]:
unique_coords['startaddress'] = unique_coords['startaddress'].apply(lambda x: x.split(',')[2:6])
unique_coords['startaddress'] = unique_coords['startaddress'].apply(lambda x: ','.join(map(str, x)))

In [74]:
unique_coords['endaddress'] = unique_coords['endaddress'].apply(lambda x: x.split(',')[2:6])
unique_coords['endaddress'] = unique_coords['endaddress'].apply(lambda x: ','.join(map(str, x)))

In [75]:
import re

In [76]:
unique_coords['startneighborhoods'] = unique_coords.startaddress.str.extract('({0})'.format('|'.join(all_hoods)), flags=re.IGNORECASE)

In [77]:
unique_coords['endneighborhoods'] = unique_coords.endaddress.str.extract('({0})'.format('|'.join(all_hoods)), flags=re.IGNORECASE)

In [78]:
unique_coords['startboro'] = unique_coords.startaddress.str.extract('({0})'.format('|'.join(boros_plus)), flags=re.IGNORECASE)

In [79]:
unique_coords['endboro'] = unique_coords.endaddress.str.extract('({0})'.format('|'.join(boros_plus)), flags=re.IGNORECASE)

Drop observations with missing start- or end- station information:

In [80]:
# unique_coords.loc[unique_coords['startstationid'].isnull()]
unique_coords.drop(unique_coords.index[1182:1199], inplace=True)
unique_coords.drop(unique_coords.index[593], inplace=True)
unique_coords.drop(unique_coords.index[582], inplace=True)

In [81]:
unique_coords.loc[unique_coords['startneighborhoods'] == 'Kings County', 'startboro'] = 'Brooklyn'
unique_coords.loc[unique_coords['endneighborhoods'] == 'Kings County', 'endboro'] = 'Brooklyn'

In [82]:
unique_coords.isnull().sum()

startstationid        0
startcoords           0
endstationid          0
endcoords             0
endaddress            0
startaddress          0
startneighborhoods    0
endneighborhoods      0
startboro             0
endboro               0
dtype: int64

In [83]:
# uniq_coords.to_csv('../../data/02_processed/unique_coords_final.csv')

In [84]:
unique_coords['startstationid'] = unique_coords['startstationid'].astype(int)

Create dictionaries to conditionally map values of `unique_coords` onto `cb`:

In [85]:
cb_master = cb.copy()

In [86]:
cb_master['startneighborhood'] = cb_master['startstationid']
cb_master['endneighborhood'] = cb_master['endstationid']

In [87]:
sstnid_hood_dict = dict(unique_coords[['startstationid','startneighborhoods']].values)
cb_master['startneighborhood'] = cb_master.startneighborhood.map(sstnid_hood_dict)

In [88]:
estnid_hood_dict = dict(unique_coords[['endstationid','endneighborhoods']].values)
cb_master['endneighborhood'] = cb_master.endneighborhood.map(estnid_hood_dict)

In [89]:
cb_master['startboro'] = cb_master['startneighborhood']
cb_master['endboro'] = cb_master['startneighborhood']

In [90]:
shood_boro_dict = dict(unique_coords[['startneighborhoods','startboro']].values)
cb_master['startboro'] = cb_master.startboro.map(shood_boro_dict)

In [91]:
ehood_boro_dict = dict(unique_coords[['endneighborhoods','endboro']].values)
cb_master['endboro'] = cb_master.endboro.map(ehood_boro_dict)

Impute unmapped or incorrect neighborhoods and boroughs:

In [92]:
start_end_only = ['startstationid','startstationname','startneighborhood', 'startboro',
                  'endstationid','endstationname','endneighborhood','endboro']

In [93]:
stn3244 = cb_master[start_end_only].loc[(cb_master['startstationid'] == 3244)].iloc[0][2]
stn3898 = cb_master[start_end_only].loc[(cb_master['startstationid'] == 3898)].iloc[0][2]
stn3903 = cb_master[start_end_only].loc[(cb_master['startstationid'] == 3903)].iloc[0][2]
stn3905 = cb_master[start_end_only].loc[(cb_master['startstationid'] == 3905)].iloc[0][2]
stn3911 = cb_master[start_end_only].loc[(cb_master['startstationid'] == 3911)].iloc[0][2]
stn3910 = cb_master[start_end_only].loc[(cb_master['startstationid'] == 3910)].iloc[0][2]
stn3916 = cb_master[start_end_only].loc[(cb_master['startstationid'] == 3916)].iloc[0][2]
stn3918 = cb_master[start_end_only].loc[(cb_master['startstationid'] == 3918)].iloc[0][2]
stn3899 = cb_master[start_end_only].loc[(cb_master['startstationid'] == 3899)].iloc[0][2]
stn3907 = cb_master[start_end_only].loc[(cb_master['startstationid'] == 3907)].iloc[0][2]

In [94]:
cb_master.loc[cb_master['endstationid'] == 3244, 'endneighborhood'] = stn3244
cb_master.loc[cb_master['endstationid'] == 3898, 'endneighborhood'] = stn3898
cb_master.loc[cb_master['endstationid'] == 3903, 'endneighborhood'] = stn3903
cb_master.loc[cb_master['endstationid'] == 3905, 'endneighborhood'] = stn3905
cb_master.loc[cb_master['endstationid'] == 3911, 'endneighborhood'] = stn3911
cb_master.loc[cb_master['endstationid'] == 3910, 'endneighborhood'] = stn3910
cb_master.loc[cb_master['endstationid'] == 3916, 'endneighborhood'] = stn3916
cb_master.loc[cb_master['endstationid'] == 3918, 'endneighborhood'] = stn3918
cb_master.loc[cb_master['endstationid'] == 3899, 'endneighborhood'] = stn3899
cb_master.loc[cb_master['endstationid'] == 3907, 'endneighborhood'] = stn3907

In [95]:
cb_master.loc[cb_master['endstationid'] == 3257, 'endneighborhood'] = 'Upper West Side'
cb_master.loc[cb_master['startstationid'] == 3257, 'endneighborhood'] = 'Upper West Side'
cb_master.loc[cb_master['endstationid'] == 3908, 'endneighborhood'] = 'Lower West Side'
cb_master.loc[cb_master['endstationid'] == 3896, 'endneighborhood'] = 'Ridgewood'
cb_master.loc[cb_master['endstationid'] == 3900, 'endneighborhood'] = 'Greenpoint'
cb_master.loc[cb_master['endstationid'] == 3897, 'endneighborhood'] = 'Ridgewood'
cb_master.loc[cb_master['endstationid'] == 3917, 'endneighborhood'] = 'Downtown Brooklyn'
cb_master.loc[cb_master['endstationid'] == 3901, 'endneighborhood'] = 'Ridgewood'
cb_master.loc[cb_master['endstationid'] == 3909, 'endneighborhood'] = 'Ridgewood'

In [96]:
stn3285 = cb_master[start_end_only].loc[(cb_master['endstationid'] == 3285)].iloc[0][-2]
stn3257 = cb_master[start_end_only].loc[(cb_master['endstationid'] == 3257)].iloc[0][-2]

In [97]:
cb_master.loc[cb_master['startstationid'] == 3285, 'startneighborhood'] = stn3285
cb_master.loc[cb_master['startstationid'] == 3257, 'startneighborhood'] = stn3257

In [98]:
mhood_to_boro_dict = dict.fromkeys(m_hoods, 'Manhattan')
bkhood_to_boro_dict = dict.fromkeys(bk_hoods, 'Brooklyn')
qhood_to_boro_dict = dict.fromkeys(q_hoods, 'Queens')
njhood_to_boro_dict = dict.fromkeys(nj_hoods, 'Jersey City')

In [99]:
from itertools import chain

start_master_dict = dict(chain(shood_boro_dict.items(), mhood_to_boro_dict.items(), 
                               bkhood_to_boro_dict.items(), qhood_to_boro_dict.items(),
                               njhood_to_boro_dict.items()))

end_master_dict = dict(chain(ehood_boro_dict.items(), mhood_to_boro_dict.items(), 
                             bkhood_to_boro_dict.items(), qhood_to_boro_dict.items(),
                             njhood_to_boro_dict.items()))

In [100]:
cb_master['startboro'] = cb_master.startneighborhood.map(start_master_dict)

In [101]:
cb_master['endboro'] = cb_master.endneighborhood.map(end_master_dict)

Perform datatype conversion to reduce memory:

In [107]:
cb_master.drop(columns=['startcoords','endcoords'], inplace=True)

In [104]:
cat_nom_feat = ['endstationname','startstationname','usertype',
               'startneighborhood','startboro','endneighborhood','endboro',
               'startdayname','stopdayname','startdaytype','enddaytype',
               'startrushhour','endrushhour','starttimeofday','endtimeofday',
               'age_group']

In [105]:
cb_master[cat_nom_feat] = cb_master[cat_nom_feat].apply(lambda x: x.astype('category'))

In [103]:
float_to_int = ['startstationid','endstationid','tripduration','bikeid','gender']

cb[float_to_int] = cb[float_to_int].apply(lambda x: x.astype('int16'))

In [111]:
cb_master['vicentydistance'] = cb_master['vicentydistance'].astype('float16')
cb_master['manhdistance'] = cb_master['manhdistance'].astype('float16')

### Final check

Inspect memory:

In [112]:
cb_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4015035 entries, 0 to 4015034
Data columns (total 37 columns):
bikeid                   int16
birthyear                datetime64[ns]
endstationid             int16
endstationlatitude       float64
endstationlongitude      float64
endstationname           category
gender                   int16
startstationid           int16
startstationlatitude     float64
startstationlongitude    float64
startstationname         category
starttime                datetime64[ns]
stoptime                 datetime64[ns]
tripduration             int16
usertype                 category
vicentydistance          float16
manhdistance             float16
startmonth               int16
stopmonth                int16
startdayname             category
startday                 int16
stopdayname              category
stopday                  int16
startdaytype             category
enddaytype               category
starthour                int16
endhour              

Inspect missing values:

In [113]:
cb_master.isnull().sum()

bikeid                   0
birthyear                0
endstationid             0
endstationlatitude       0
endstationlongitude      0
endstationname           0
gender                   0
startstationid           0
startstationlatitude     0
startstationlongitude    0
startstationname         0
starttime                0
stoptime                 0
tripduration             0
usertype                 0
vicentydistance          0
manhdistance             0
startmonth               0
stopmonth                0
startdayname             0
startday                 0
stopdayname              0
stopday                  0
startdaytype             0
enddaytype               0
starthour                0
endhour                  0
startrushhour            0
endrushhour              0
starttimeofday           0
endtimeofday             0
age_at_ride              0
age_group                0
startneighborhood        0
endneighborhood          0
startboro                0
endboro                  0
d

Create csv file of feature engineering citibike dataset:

In [114]:
cb_master.to_csv('../../data/02_processed/citibike_feature_engineered_final.csv', index=False)

## Appendix

### Identifying unique trips

Create new feature, `uniquetripid`:

In [115]:
cb_filtered = cb_master[cb_master['birthyear'].dt.year.astype('int16') > 1920]
cb_filtered['birthyear'] = cb_filtered['birthyear'].dt.year.astype('int16')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [116]:
cb_filtered.groupby(['birthyear','gender','usertype','startstationid','starttime','startneighborhood']).size()

birthyear  gender  usertype    startstationid  starttime                startneighborhood   
1921       1       Subscriber  116             2016-07-20 17:06:09.000  Chelsea                 1
                                               2016-08-24 17:10:05.000  Chelsea                 1
                               160             2014-05-18 00:48:37.000  Murray Hill             1
                               217             2013-06-25 19:56:56.000  DUMBO                   1
                               228             2014-09-02 19:25:14.000  Turtle Bay              1
                                                                                               ..
2003       2       Subscriber  3361            2020-01-06 18:25:57.369  Park Slope              1
                               3377            2019-12-31 23:46:46.006  Gowanus                 1
                               3398            2019-08-30 18:46:54.397  Kings County            1
                         

In [117]:
cb_filtered.groupby(['startneighborhood','startstationid','starttime','usertype','birthyear','gender']).size()

startneighborhood  startstationid  starttime                usertype    birthyear  gender
Alphabet City      150             2013-06-01 10:59:18.000  Subscriber  1970       1         1
                                   2013-06-01 12:27:32.000  Subscriber  1986       1         1
                                   2013-06-01 13:58:19.000  Subscriber  1982       1         1
                                   2013-06-02 13:56:50.000  Subscriber  1983       1         1
                                   2013-06-02 14:40:22.000  Subscriber  1956       1         1
                                                                                            ..
Yorkville          3747            2019-10-15 19:13:29.079  Subscriber  1989       1         1
                                   2019-10-17 22:09:40.602  Subscriber  1966       1         1
                                   2019-10-18 07:26:32.927  Subscriber  1993       2         1
                                   2019-10-18 08:13:32.