# Feature Engineering: Journey Bike Data

In [43]:
import pandas as pd
import numpy as np
import holidays

# import python modules
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path

# Make the project's `src` directory importable without a machine-specific
# absolute path. The notebook already reads its data from '../data', so the
# project root is the parent of the working directory and `src` sits next to
# `data`. The guard keeps re-runs of this cell from appending duplicates.
SRC_DIR = str(Path('..', 'src').resolve())
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
from features import journey_data_feature_engineering as features

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
# Load the interim inputs produced by earlier steps of the pipeline:
# the station/borough lookup table and the cleaned journey records.
bike_locs = pd.read_csv('../data/interim/station_locations_boroughs.csv')
journey_df = pd.read_csv(
    '../data/interim/journey_data_cleaned.csv',
    index_col=0,  # first CSV column holds the original row index
    parse_dates=['end_date', 'start_date'],
)

  mask |= (ar1 == a)


# 1. TEMPORAL FEATURE ENGINEERING

Features: 
- hour (0-23)
- part_of_day (early morning: 1, morning: 2, afternoon: 3, evening: 4, night: 5)
- day_of_week (0-6, Monday = 0)
- is_weekend (0 or 1)
- month (1-12)
- season (1-4)
- bank_holiday (0 or 1)

In [15]:
# NOTE(review): the journeys are in London. Depending on the installed
# `holidays` version, plain UK() may include holidays of other UK nations or
# omit England-specific ones - confirm whether an England subdivision is needed.
uk_holidays = holidays.UK()

start = journey_df['start_date']

# pandas conventions: dayofweek is 0 (Monday) .. 6 (Sunday), hour is 0 .. 23
journey_df['day_of_week'] = start.dt.dayofweek
journey_df['hour'] = start.dt.hour
# Saturday (5) and Sunday (6) -> 1, otherwise 0; vectorised instead of a row-wise apply
journey_df['is_weekend'] = (journey_df['day_of_week'] >= 5).astype('int64')
journey_df['part_of_day'] = journey_df['hour'].apply(features.get_part_of_day)
journey_df['month'] = start.dt.month
journey_df['season'] = journey_df['month'].apply(features.get_season)

# Bank-holiday flag: query the holiday calendar once per distinct calendar day
# rather than once per journey, then flag all journeys starting on those days.
start_days = start.dt.normalize()
holiday_days = [day for day in start_days.drop_duplicates() if day in uk_holidays]
journey_df['bank_holiday'] = start_days.isin(holiday_days).astype('int64')

In [16]:
# Preview the first rows to check the newly added temporal columns
journey_df.head()

Unnamed: 0,rental_id,end_date,end_station_name,start_date,start_station_name,day_of_week,hour,is_weekend,part_of_day,month,season,bank_holiday
16646400,50608184.0,2016-01-01 01:14:00,"Hampstead Road (Cartmel), Euston",2016-01-01 00:00:00,"Hampstead Road, Euston",4,0,0,5,1,4,1
16646401,50608186.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",4,0,0,5,1,4,1
16646402,50608187.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",4,0,0,5,1,4,1
16646403,50608188.0,2016-01-01 00:22:00,"Brushfield Street, Liverpool Street",2016-01-01 00:04:00,"Holborn Circus, Holborn",4,0,0,5,1,4,1
16646405,50608189.0,2016-01-01 00:23:00,"Brushfield Street, Liverpool Street",2016-01-01 00:05:00,"Holborn Circus, Holborn",4,0,0,5,1,4,1


# 2. WEATHER FEATURE ENGINEERING

The daily weather data are generated with the weather data builder by www.visualcrossing.com

Features:
- tempmax
- tempmin
- temp
- feelslike
- humidity
- precip
- windgust
- windspeed
- cloudcover
- visibility
- uvindex
- daylight_hours

### 2.1 Load and Enhance Weather Data

In [17]:
# Read the daily weather export; the first CSV column becomes the index
weather_df = pd.read_csv(
    '../data/external/weather_london_2016-2022.csv',
    index_col=0,
    encoding='ISO-8859-1',
)
# Swap the string index for plain `datetime.date` objects
parsed_days = pd.to_datetime(weather_df.index)
weather_df.index = parsed_days.date
weather_df.head()

Unnamed: 0_level_0,tempmax,tempmin,temp,feelslike,humidity,precip,windgust,windspeed,cloudcover,visibility,uvindex,sunrise,sunset
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-01-01,8.6,2.6,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,2016-01-01T08:06:16,2016-01-01T16:01:33
2016-01-02,10.8,8.1,10.0,10.0,89.7,1.257,46.4,0.2,90.7,14.5,1,2016-01-02T08:06:10,2016-01-02T16:02:36
2016-01-03,10.3,6.3,8.0,8.0,87.7,10.214,53.6,0.2,60.0,20.5,0,2016-01-03T08:06:01,2016-01-03T16:03:42
2016-01-04,10.8,6.0,8.0,8.0,87.9,0.201,,0.1,45.7,20.1,1,2016-01-04T08:05:49,2016-01-04T16:04:51
2016-01-05,10.6,6.8,8.4,8.4,89.5,0.218,,5.2,65.1,17.0,1,2016-01-05T08:05:33,2016-01-05T16:06:02


In [20]:
# clean and enhance weather data: add daylight_hours and replace NaN by 0
# In the displayed result, `daylight_hours` appears in place of the
# `sunrise`/`sunset` columns and the missing `windgust` values show as 0.0.
# NOTE(review): this cell overwrites `weather_df` with the helper's output, so
# re-running it feeds the already-cleaned frame back in - presumably the helper
# expects the raw columns; re-run the load cell first. TODO confirm.

weather_df = features.clean_enhance_weather_data(weather_df)
weather_df.head()

Unnamed: 0_level_0,tempmax,tempmin,temp,feelslike,humidity,precip,windgust,windspeed,cloudcover,visibility,uvindex,daylight_hours
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-01-01,8.6,2.6,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
2016-01-02,10.8,8.1,10.0,10.0,89.7,1.257,46.4,0.2,90.7,14.5,1,7.940556
2016-01-03,10.3,6.3,8.0,8.0,87.7,10.214,53.6,0.2,60.0,20.5,0,7.961389
2016-01-04,10.8,6.0,8.0,8.0,87.9,0.201,0.0,0.1,45.7,20.1,1,7.983889
2016-01-05,10.6,6.8,8.4,8.4,89.5,0.218,0.0,5.2,65.1,17.0,1,8.008056


### 2.2 Map Weather Data to Journey Data

In [23]:
# Attach the daily weather columns to the journey records via the project helper.
# NOTE(review): the join key is not visible here - presumably the calendar day
# of `start_date` matched against the weather index; confirm in the helper.
journey_df = features.merge_weather_journey_data(journey_df, weather_df)

In [24]:
# Count missing values per column to check the weather merge,
# then preview the merged frame.
print(journey_df.isna().sum())
journey_df.head()

rental_id             0
end_date              0
end_station_name      0
start_date            0
start_station_name    0
day_of_week           0
hour                  0
is_weekend            0
part_of_day           0
month                 0
season                0
bank_holiday          0
tempmax               0
tempmin               0
temp                  0
feelslike             0
humidity              0
precip                0
windgust              0
windspeed             0
cloudcover            0
visibility            0
uvindex               0
daylight_hours        0
dtype: int64


Unnamed: 0,rental_id,end_date,end_station_name,start_date,start_station_name,day_of_week,hour,is_weekend,part_of_day,month,...,temp,feelslike,humidity,precip,windgust,windspeed,cloudcover,visibility,uvindex,daylight_hours
16646400,50608184.0,2016-01-01 01:14:00,"Hampstead Road (Cartmel), Euston",2016-01-01 00:00:00,"Hampstead Road, Euston",4,0,0,5,1,...,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
16646401,50608186.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",4,0,0,5,1,...,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
16646402,50608187.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",4,0,0,5,1,...,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
16646403,50608188.0,2016-01-01 00:22:00,"Brushfield Street, Liverpool Street",2016-01-01 00:04:00,"Holborn Circus, Holborn",4,0,0,5,1,...,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
16646405,50608189.0,2016-01-01 00:23:00,"Brushfield Street, Liverpool Street",2016-01-01 00:05:00,"Holborn Circus, Holborn",4,0,0,5,1,...,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389


# 3. MAPPING BOROUGHS TO JOURNEY DATA 

### 3.1 Direct Mapping by Station Name

In [25]:
# First pass: look up boroughs for start and end stations by station name,
# using the station-location table (project helper; matching rule not visible here).
journey_df = features.direct_borough_mapping_by_stationname(bike_locs, journey_df)

# still lots of missing boroughs: 2'314'797 (start) + 2'337'714 (end)
print(journey_df.isna().sum())

rental_id                   0
end_date                    0
end_station_name            0
start_date                  0
start_station_name          0
day_of_week                 0
hour                        0
is_weekend                  0
part_of_day                 0
month                       0
season                      0
bank_holiday                0
tempmax                     0
tempmin                     0
temp                        0
feelslike                   0
humidity                    0
precip                      0
windgust                    0
windspeed                   0
cloudcover                  0
visibility                  0
uvindex                     0
daylight_hours              0
start_borough         2314797
end_borough           2337714
dtype: int64


### 3.2 Fuzzy Matching for empty Boroughs
Match station names that refer to the same station but differ slightly in their spelling. The fuzzy matching is run in parallel to improve performance.

In [28]:
# Second pass: fuzzy-match station names for the rows that still lack a borough
# (project helper; similarity measure and threshold are not visible here).
journey_df = features.fuzzy_borough_mapping_by_stationname(bike_locs, journey_df)

# reduced number of missing values to 1'744'880 (start) + 1'789'528 (end)
print(journey_df.isna().sum())

rental_id                   0
end_date                    0
end_station_name            0
start_date                  0
start_station_name          0
day_of_week                 0
hour                        0
is_weekend                  0
part_of_day                 0
month                       0
season                      0
bank_holiday                0
tempmax                     0
tempmin                     0
temp                        0
feelslike                   0
humidity                    0
precip                      0
windgust                    0
windspeed                   0
cloudcover                  0
visibility                  0
uvindex                     0
daylight_hours              0
start_borough         1744880
end_borough           1789528
dtype: int64


### 3.3 Former Station Borough Matching

An investigation of the samples that still lack a borough showed that they correspond to former stations that are no longer in use and are therefore not listed in the provided BikePoints file, where the station names are given in the format "street name, region" (e.g., "London Fields, Hackney Central").

To address this, a dictionary is created with the region as key and the borough that occurs most often for that region as value. It is then used to fill in the missing boroughs in the `journey_df` DataFrame based on the region extracted from each station name.

In [40]:
# Third pass: fill remaining boroughs from the region part of the station name,
# as described in the notes above (project helper).
journey_df = features.former_station_borough_mapping_by_region(bike_locs, journey_df)

# still some missing values: 8855 (start) + 9287 (end)
print(journey_df.isna().sum())

rental_id                0
end_date                 0
end_station_name         0
start_date               0
start_station_name       0
day_of_week              0
hour                     0
is_weekend               0
part_of_day              0
month                    0
season                   0
bank_holiday             0
tempmax                  0
tempmin                  0
temp                     0
feelslike                0
humidity                 0
precip                   0
windgust                 0
windspeed                0
cloudcover               0
visibility               0
uvindex                  0
daylight_hours           0
start_borough         8855
end_borough           9287
dtype: int64


### 3.4 Map Manually and Drop Irrelevant Stations

About 10 stations were still not mapped to boroughs, so they were manually assigned. Stations labeled as 'Test' or 'Workshop' were dropped. Now, all entries are associated with a borough.

In [44]:
# Collect the distinct station names whose borough is still missing,
# separately for journey starts and journey ends ...
start_unmapped = journey_df['start_borough'].isna()
end_unmapped = journey_df['end_borough'].isna()
unique_empty_start_boroughs = journey_df.loc[start_unmapped, 'start_station_name'].unique()
unique_empty_end_boroughs = journey_df.loc[end_unmapped, 'end_station_name'].unique()

# ... and merge both into one sorted, duplicate-free array
unique_empty_boroughs = np.union1d(
    unique_empty_start_boroughs,
    unique_empty_end_boroughs,
)

# Print one station name per line for manual inspection
for station_name in unique_empty_boroughs:
    print(station_name)

Allington street, Off Victoria Street, Westminster
Canada Water Station
Contact Centre, Southbury House
Imperial Wharf Station
Import Dock
LSP1
LSP2
Mechanical Workshop Clapham
Mechanical Workshop Penton
Monier Road, Newham
One London
PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY
Pop Up Dock 1
Pop Up Dock 2
Victoria and Albert Museum, Cromwell Road
Worship Street, Hackney
York Way, Camden


In [46]:
# Final pass: assign the leftover stations by hand. According to the notes
# above, 'Test'/'Workshop' stations are dropped - presumably inside this
# helper, which is not visible here.
journey_df = features.manual_borough_mapping(journey_df)

# no missing boroughs.. hurray!
print(journey_df.isna().sum())

rental_id             0
end_date              0
end_station_name      0
start_date            0
start_station_name    0
day_of_week           0
hour                  0
is_weekend            0
part_of_day           0
month                 0
season                0
bank_holiday          0
tempmax               0
tempmin               0
temp                  0
feelslike             0
humidity              0
precip                0
windgust              0
windspeed             0
cloudcover            0
visibility            0
uvindex               0
daylight_hours        0
start_borough         0
end_borough           0
dtype: int64


### 3.5 Save Interim Data

In [47]:
# Checkpoint: persist the frame after feature-engineering steps 1-3
# (the DataFrame index is written as the first CSV column).
journey_df.to_csv('../data/interim/journey_data_cleaned_featureeng_1_2_3.csv')

# 4. BOROUGH DEMOGRAPHIC FEATURE ENGINEERING

Features:
- 

# 5. SAVE ENGINEERED DATA

In [99]:
# all data (2016 - 2022)
# Write the fully engineered frame. This notebook builds `journey_df`
# (`merged_df` is never defined here), and the path is project-relative like
# the other data paths in this notebook.
journey_df.to_csv('../data/journey_data_cleaned_with_boroughs_no_nans.csv')

In [100]:
# by year
# Split the journeys by the calendar year of `start_date` and write one CSV per
# year that actually occurs in the data, so no year list has to be hard-coded.
# Only years present in the data produce a file.

yearly_dfs = {}
for year, group in journey_df.groupby(journey_df['start_date'].dt.year):
    year = int(year)  # plain int for dictionary keys and file names
    yearly_dfs[year] = group.reset_index(drop=True)
    yearly_dfs[year].to_csv(f'../data/journey_data_{year}_cleaned.csv')