# Feature Engineering: Journey Bike Data

In [2]:
import pandas as pd
import numpy as np
import holidays
from datetime import datetime
import seaborn as sns

# import python modules
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('/Users/tabea/Documents/UrbanMobility/src')
from features import journey_data_feature_engineering as features

In [11]:
# Read the station-location and cleaned-journey CSVs from the interim data folder.
stations_csv = '../data/interim/station_locations_boroughs.csv'
journeys_csv = '../data/interim/journey_data_cleaned.csv'

bike_locs = pd.read_csv(stations_csv)
journey_df = pd.read_csv(
    journeys_csv,
    index_col=0,  # first CSV column holds the row index
    parse_dates=['end_date', 'start_date'],
)

  mask |= (ar1 == a)


# 1. TEMPORAL FEATURE ENGINEERING

Features: 
- hour (0-23)
- part_of_day (early morning: 1, morning: 2, afternoon: 3, evening: 4, night: 5)
- day_of_week (0-6, Monday = 0)
- day_of_month (1-31)
- day_of_year (1-366)
- is_weekend (0/1)
- month (1-12)
- season (1-4)
- bank_holiday (0/1)
- year

In [None]:
# Calendar of UK public holidays used for the `bank_holiday` flag.
# NOTE(review): no subdivision is passed; depending on the installed `holidays`
# version this may include or omit region-specific days. For London journeys an
# England-specific calendar is probably intended — confirm.
uk_holidays = holidays.UK()

# All temporal features are derived from the journey's start timestamp.
start_ts = journey_df['start_date']

journey_df['day_of_week'] = start_ts.dt.dayofweek  # 0 = Monday ... 6 = Sunday
journey_df['day_of_month'] = start_ts.dt.day
journey_df['day_of_year'] = start_ts.dt.dayofyear
journey_df['hour'] = start_ts.dt.hour  # 0-23
# Saturday (5) and Sunday (6) -> 1, every other day -> 0 (vectorised, no per-row lambda).
journey_df['is_weekend'] = (journey_df['day_of_week'] >= 5).astype(int)
journey_df['part_of_day'] = journey_df['hour'].apply(features.get_part_of_day)
journey_df['month'] = start_ts.dt.month
journey_df['season'] = journey_df['month'].apply(features.get_season)

# Whether a timestamp falls on a holiday depends only on its calendar day, so
# the calendar is queried once per distinct day and the result is mapped back
# onto the rows, instead of running one Python-level lookup per journey.
start_day = start_ts.dt.normalize()
holiday_flag_by_day = {day: int(day in uk_holidays) for day in start_day.drop_duplicates()}
journey_df['bank_holiday'] = start_day.map(holiday_flag_by_day)
journey_df['year'] = start_ts.dt.year

In [16]:
# Preview the first rows to sanity-check the newly added temporal columns.
journey_df.head()

Unnamed: 0,rental_id,end_date,end_station_name,start_date,start_station_name,day_of_week,hour,is_weekend,part_of_day,month,season,bank_holiday
16646400,50608184.0,2016-01-01 01:14:00,"Hampstead Road (Cartmel), Euston",2016-01-01 00:00:00,"Hampstead Road, Euston",4,0,0,5,1,4,1
16646401,50608186.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",4,0,0,5,1,4,1
16646402,50608187.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",4,0,0,5,1,4,1
16646403,50608188.0,2016-01-01 00:22:00,"Brushfield Street, Liverpool Street",2016-01-01 00:04:00,"Holborn Circus, Holborn",4,0,0,5,1,4,1
16646405,50608189.0,2016-01-01 00:23:00,"Brushfield Street, Liverpool Street",2016-01-01 00:05:00,"Holborn Circus, Holborn",4,0,0,5,1,4,1


# 2. WEATHER FEATURE ENGINEERING


The daily weather data are generated with the weather data builder by www.visualcrossing.com

Features:
- tempmax
- tempmin
- temp
- feelslike
- humidity
- precip
- windgust
- windspeed
- cloudcover
- visibility
- uvindex
- daylight_hours

### 2.1 Load and Enhance Weather Data

In [17]:
# Load the daily London weather export; the first CSV column becomes the index.
weather_csv = '../data/external/weather_london_2016-2022.csv'
weather_df = pd.read_csv(
    weather_csv,
    encoding='ISO-8859-1',
    index_col=0,
)
# Convert the index labels to plain `datetime.date` objects.
parsed_index = pd.to_datetime(weather_df.index)
weather_df.index = parsed_index.date
weather_df.head()

Unnamed: 0_level_0,tempmax,tempmin,temp,feelslike,humidity,precip,windgust,windspeed,cloudcover,visibility,uvindex,sunrise,sunset
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-01-01,8.6,2.6,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,2016-01-01T08:06:16,2016-01-01T16:01:33
2016-01-02,10.8,8.1,10.0,10.0,89.7,1.257,46.4,0.2,90.7,14.5,1,2016-01-02T08:06:10,2016-01-02T16:02:36
2016-01-03,10.3,6.3,8.0,8.0,87.7,10.214,53.6,0.2,60.0,20.5,0,2016-01-03T08:06:01,2016-01-03T16:03:42
2016-01-04,10.8,6.0,8.0,8.0,87.9,0.201,,0.1,45.7,20.1,1,2016-01-04T08:05:49,2016-01-04T16:04:51
2016-01-05,10.6,6.8,8.4,8.4,89.5,0.218,,5.2,65.1,17.0,1,2016-01-05T08:05:33,2016-01-05T16:06:02


In [20]:
# clean and enhance weather data: add daylight_hours and replace NaN by 0
# NOTE(review): zero-filling turns missing readings into real-looking values
# (e.g. a missing windgust becomes 0.0 in the preview) — confirm this is intended.
# NOTE(review): the frame is reassigned under the same name, so re-running this
# cell passes already-cleaned data back into the helper — confirm it tolerates that.

weather_df = features.clean_enhance_weather_data(weather_df)
weather_df.head()

Unnamed: 0_level_0,tempmax,tempmin,temp,feelslike,humidity,precip,windgust,windspeed,cloudcover,visibility,uvindex,daylight_hours
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-01-01,8.6,2.6,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
2016-01-02,10.8,8.1,10.0,10.0,89.7,1.257,46.4,0.2,90.7,14.5,1,7.940556
2016-01-03,10.3,6.3,8.0,8.0,87.7,10.214,53.6,0.2,60.0,20.5,0,7.961389
2016-01-04,10.8,6.0,8.0,8.0,87.9,0.201,0.0,0.1,45.7,20.1,1,7.983889
2016-01-05,10.6,6.8,8.4,8.4,89.5,0.218,0.0,5.2,65.1,17.0,1,8.008056


### 2.2 Map Weather Data to Journey Data

In [23]:
# Attach the daily weather columns to the journeys via the project helper.
# NOTE(review): presumably joined on the calendar day of `start_date` — verify in the helper.
journey_df = features.merge_weather_journey_data(journey_df, weather_df)

In [24]:
# Count missing values per column after the weather merge, then preview the frame.
null_counts = journey_df.isna().sum()
print(null_counts)
journey_df.head()

rental_id             0
end_date              0
end_station_name      0
start_date            0
start_station_name    0
day_of_week           0
hour                  0
is_weekend            0
part_of_day           0
month                 0
season                0
bank_holiday          0
tempmax               0
tempmin               0
temp                  0
feelslike             0
humidity              0
precip                0
windgust              0
windspeed             0
cloudcover            0
visibility            0
uvindex               0
daylight_hours        0
dtype: int64


Unnamed: 0,rental_id,end_date,end_station_name,start_date,start_station_name,day_of_week,hour,is_weekend,part_of_day,month,...,temp,feelslike,humidity,precip,windgust,windspeed,cloudcover,visibility,uvindex,daylight_hours
16646400,50608184.0,2016-01-01 01:14:00,"Hampstead Road (Cartmel), Euston",2016-01-01 00:00:00,"Hampstead Road, Euston",4,0,0,5,1,...,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
16646401,50608186.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",4,0,0,5,1,...,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
16646402,50608187.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",4,0,0,5,1,...,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
16646403,50608188.0,2016-01-01 00:22:00,"Brushfield Street, Liverpool Street",2016-01-01 00:04:00,"Holborn Circus, Holborn",4,0,0,5,1,...,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
16646405,50608189.0,2016-01-01 00:23:00,"Brushfield Street, Liverpool Street",2016-01-01 00:05:00,"Holborn Circus, Holborn",4,0,0,5,1,...,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389


# 3. MAPPING BOROUGHS TO JOURNEY DATA 

### 3.1 Direct Mapping by Station Name

In [25]:
# First mapping pass: the helper adds `start_borough` / `end_borough` to the journeys
# using the station table in `bike_locs`.
# NOTE(review): presumably an exact station-name match — confirm in the helper.
journey_df = features.direct_borough_mapping_by_stationname(bike_locs, journey_df)

# still lots of missing boroughs: 2'314'797 + 2'337'714
# (null counts of start_borough + end_borough in the printed summary)
print(journey_df.isna().sum())

rental_id                   0
end_date                    0
end_station_name            0
start_date                  0
start_station_name          0
day_of_week                 0
hour                        0
is_weekend                  0
part_of_day                 0
month                       0
season                      0
bank_holiday                0
tempmax                     0
tempmin                     0
temp                        0
feelslike                   0
humidity                    0
precip                      0
windgust                    0
windspeed                   0
cloudcover                  0
visibility                  0
uvindex                     0
daylight_hours              0
start_borough         2314797
end_borough           2337714
dtype: int64


### 3.2 Fuzzy Matching for empty Boroughs
Matching names that refer to the same station but are slightly different in their naming. Parallel processing to improve the performance of fuzzy matching.

In [28]:
# Second mapping pass: fuzzy station-name matching for journeys whose borough is still missing.
# NOTE(review): the matching threshold and the parallelisation live inside the helper — not visible here.
journey_df = features.fuzzy_borough_mapping_by_stationname(bike_locs, journey_df)

# reduced number of missing values to 1'744'880 + 1'789'528
# (null counts of start_borough + end_borough in the printed summary)
print(journey_df.isna().sum())

rental_id                   0
end_date                    0
end_station_name            0
start_date                  0
start_station_name          0
day_of_week                 0
hour                        0
is_weekend                  0
part_of_day                 0
month                       0
season                      0
bank_holiday                0
tempmax                     0
tempmin                     0
temp                        0
feelslike                   0
humidity                    0
precip                      0
windgust                    0
windspeed                   0
cloudcover                  0
visibility                  0
uvindex                     0
daylight_hours              0
start_borough         1744880
end_borough           1789528
dtype: int64


### 3.3 Region Mapping for Former Station

After investigating the missing borough data in the current samples, it was discovered that these samples correspond to former stations that are no longer in use and are not listed in the provided BikePoints file, where the station names are given in the format "street name, region" (e.g., "London Fields, Hackney Central").

To address this, a dictionary is created with the region as key and the borough with the highest count for that region as value. The dictionary is then used to fill the missing boroughs in `journey_df`, based on the region extracted from each station name.

In [40]:
# Third mapping pass: fill boroughs of former stations from the region part of the station name
# (see the description above this cell).
journey_df = features.former_station_borough_mapping_by_region(bike_locs, journey_df)

# still some missing values: 8855 + 9287
# (null counts of start_borough + end_borough in the printed summary)
print(journey_df.isna().sum())

rental_id                0
end_date                 0
end_station_name         0
start_date               0
start_station_name       0
day_of_week              0
hour                     0
is_weekend               0
part_of_day              0
month                    0
season                   0
bank_holiday             0
tempmax                  0
tempmin                  0
temp                     0
feelslike                0
humidity                 0
precip                   0
windgust                 0
windspeed                0
cloudcover               0
visibility               0
uvindex                  0
daylight_hours           0
start_borough         8855
end_borough           9287
dtype: int64


### 3.4 Manual Mapping and Drop Irrelevant Stations

About 10 stations were still not mapped to boroughs, so they were manually assigned. Stations labeled as 'Test' or 'Workshop' were dropped. Now, all entries are associated with a borough.

In [44]:
# Collect the station names that still have no borough, separately for journey
# starts and ends, then merge them into one sorted, de-duplicated array.
start_unmapped = journey_df['start_borough'].isna()
end_unmapped = journey_df['end_borough'].isna()

unique_empty_start_boroughs = journey_df.loc[start_unmapped, 'start_station_name'].unique()
unique_empty_end_boroughs = journey_df.loc[end_unmapped, 'end_station_name'].unique()
unique_empty_boroughs = np.union1d(
    unique_empty_start_boroughs,
    unique_empty_end_boroughs,
)

# One station name per line for manual inspection.
for station_name in unique_empty_boroughs:
    print(station_name)

Allington street, Off Victoria Street, Westminster
Canada Water Station
Contact Centre, Southbury House
Imperial Wharf Station
Import Dock
LSP1
LSP2
Mechanical Workshop Clapham
Mechanical Workshop Penton
Monier Road, Newham
One London
PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY
Pop Up Dock 1
Pop Up Dock 2
Victoria and Albert Museum, Cromwell Road
Worship Street, Hackney
York Way, Camden


In [46]:
# Final mapping pass: the helper applies the hand-made assignments for the stations listed above.
# NOTE(review): per the section text it also drops 'Test' / 'Workshop' stations — confirm in the helper.
journey_df = features.manual_borough_mapping(journey_df)

# no missing boroughs.. hurray!
print(journey_df.isna().sum())

rental_id             0
end_date              0
end_station_name      0
start_date            0
start_station_name    0
day_of_week           0
hour                  0
is_weekend            0
part_of_day           0
month                 0
season                0
bank_holiday          0
tempmax               0
tempmin               0
temp                  0
feelslike             0
humidity              0
precip                0
windgust              0
windspeed             0
cloudcover            0
visibility            0
uvindex               0
daylight_hours        0
start_borough         0
end_borough           0
dtype: int64


rental_id             0
end_date              0
end_station_name      0
start_date            0
start_station_name    0
start_borough         0
end_borough           0
dtype: int64


### 3.5 Save Interim Data

In [22]:
# Persist the journeys after sections 1-3 (temporal, weather, borough features).
# The DataFrame index is written as the first CSV column (pandas default).
journey_df.to_csv('../data/interim/journey_data_cleaned_featureeng_1_2_3.csv')

Unnamed: 0,rental_id,end_date,end_station_name,start_date,start_station_name,day_of_week,hour,is_weekend,part_of_day,month,...,windspeed,cloudcover,visibility,uvindex,daylight_hours,start_borough,end_borough,year,day_of_month,day_of_year
16646400,50608184.0,2016-01-01 01:14:00,"Hampstead Road (Cartmel), Euston",2016-01-01 00:00:00,"Hampstead Road, Euston",4,0,0.0,5,1.0,...,0.2,61.3,18.6,1.0,7.921389,Camden,Camden,2016,1,1
16646401,50608186.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",4,0,0.0,5,1.0,...,0.2,61.3,18.6,1.0,7.921389,Westminster,Westminster,2016,1,1
16646402,50608187.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",4,0,0.0,5,1.0,...,0.2,61.3,18.6,1.0,7.921389,Westminster,Westminster,2016,1,1
16646403,50608188.0,2016-01-01 00:22:00,"Brushfield Street, Liverpool Street",2016-01-01 00:04:00,"Holborn Circus, Holborn",4,0,0.0,5,1.0,...,0.2,61.3,18.6,1.0,7.921389,Camden,City of London,2016,1,1
16646405,50608189.0,2016-01-01 00:23:00,"Brushfield Street, Liverpool Street",2016-01-01 00:05:00,"Holborn Circus, Holborn",4,0,0.0,5,1.0,...,0.2,61.3,18.6,1.0,7.921389,Camden,City of London,2016,1,1


# 4. BOROUGH DEMOGRAPHIC FEATURE ENGINEERING

### 4.1 Create DataFrame with Borough Demographic Features

Data Source
- TSXXX: Census 2021: the census is undertaken by the Office for National Statistics every 10 years and gives us a picture of all the people and households in England and Wales
- ADDXXX: Additional data

Features
- TS006 population density: population_density
- TS007 age: age_mean, age_25_percentile, age_75_percentile
- TS008 sex: female_ratio
- TS017 household size: household_size_1_ratio, household_size_2_ratio, household_size_3-5_ratio, household_size_6+_ratio
- TS021 ethnic group: ethnic_asian_ratio, ethnic_african_caribbean_ratio, ethnic_mixed_ratio, ethic_white_ratio, ethnic_arab_other_ratio
- TS030 religion: religion_no_ratio, religion_christian_ratio, religion_buddhist_ratio, religion_hindu_ratio, religion_jewish_ratio, religion_muslim_ratio, religion_sikh_ratio
- TS067 education: education_no_ratio, education_level_1_ratio, education_level_2_ratio, education_level_3_ratio, education_level_4, education_apprenticeship
- TS037 general health: health = 5 -> very good, health = 1 -> very bad
- ADD001 green and blue cover: green_cover_percentage, blue_cover_percentage
- ADD007 sports participation: sports_participation_rate
- ADD012 crime rate: crime_offences_rate
- ADD003 business density: business_density
- TS058 travel to work: distance_work_less_2km, distance_work_2km_5km, distance_work_5km_10km, distance_work_10km_20km, distance_work_20km_more, distance_work_homeoffice, distance_work_no_fix_place
- ADD008 road traffic - area ratio: road_traffic_ratio
- ADD009 healthy streets score: street_health_score
- ADD002 house price index: house_price_avg
- TS045 cars per household: cars_household_avg
- TS044 accommodation: accommodation_house_ratio, accommodation_flat_ratio, accommodation_mobile_ratio
- TS054 tenure: tenure_owned_sharedowned_ratio
- TS038 disability: disability_ratio
- TS016 length of residence: residence_lengh_uk_born, residence_lengh_10yr_plus, residence_lengh_5yr_10yr, residence_lengh_2yr_5yr, residence_lengh_2yr_less
- ADD006 personal well-being: personal_well_being_life_satisfaction, personal_well_being_worthwile, personal_well_being_happiness, personal_well_being_anxiety
- ADD011 election 2018: election_seats_percentage_con, election_seats_percentage_lab, election_seats_percentage_ld, election_seats_percentage_gre, election_seats_percentage_ind
- TS062 socio economic classification: occupation_high_level_ratio (L1 - L6), occupation_small_intermediate_ratio (L7-L9), occupation_lower_level_ratio (L10-L13), occupation_unemployed_ratio (L14.1, L14.2), occupation_student_ratio (L15)
- ADD004 earnings workplace: earnings_workplace



In [23]:
# Dataset codes (census tables TSxxx and additional sources ADDxxx) whose
# features are added per borough, in the order they are handed to the helper.
demographic_sources = [
    'TS006', 'TS007', 'TS008', 'TS017', 'TS021', 'TS030', 'TS067',
    'TS037', 'ADD001', 'ADD007', 'ADD012', 'ADD003', 'TS058', 'ADD008',
    'ADD009', 'ADD002', 'TS045', 'TS044', 'TS054', 'TS038', 'TS016',
    'ADD006', 'ADD011', 'TS062', 'ADD004',
]
borough_df = features.add_borough_demographic_features(bike_locs, demographic_sources)

borough_df.head(13)

Unnamed: 0,borough,bike_station_counts,bike_docks_counts,borough_code,population_density,age_mean,age_25_percentile,age_75_percentile,female_ratio,householdsize_1_ratio,...,election_seats_percentage_lab,election_seats_percentage_ld,election_seats_percentage_gre,election_seats_percentage_ind,occupation_high_level_ratio_ratio,occupation_small_intermediate_ratio_ratio,occupation_lower_level_ratio_ratio,occupation_unemployed_ratio_ratio,occupation_student_ratio_ratio,earnings_workplace
0,Westminster,171,3915,E09000033,9514.2,38.280112,24.0,53.0,0.515712,0.426546,...,0.32,0.0,0.0,0.0,0.443089,0.161098,0.148262,0.118178,0.129374,60046.0
1,Tower Hamlets,116,3332,E09000030,15702.9,31.984699,21.0,42.0,0.497693,0.319656,...,0.93,0.0,0.0,0.02,0.397595,0.149093,0.185841,0.136711,0.13076,69264.0
2,Kensington and Chelsea,94,2181,E09000020,11816.5,39.95147,24.0,56.0,0.532255,0.437055,...,0.26,0.02,0.0,0.0,0.460353,0.177567,0.144639,0.10478,0.112661,41723.0
3,Camden,64,1782,E09000007,9640.9,37.006034,22.0,52.0,0.526573,0.387345,...,0.8,0.06,0.02,0.0,0.448063,0.157334,0.14426,0.098225,0.152118,49239.0
4,Hammersmith and Fulham,59,1727,E09000013,11161.1,36.504294,23.0,51.0,0.530769,0.360937,...,0.76,0.0,0.0,0.0,0.457652,0.175044,0.171519,0.088828,0.106956,48558.0
5,Lambeth,59,1656,E09000022,11839.1,35.826829,24.0,49.0,0.515382,0.320404,...,0.9,0.0,0.08,0.0,0.452817,0.171754,0.203337,0.08449,0.087602,48333.0
6,Wandsworth,59,1593,E09000032,9560.0,35.724045,24.0,49.0,0.525954,0.299375,...,0.43,0.0,0.0,0.02,0.528828,0.175615,0.149007,0.069828,0.076722,40211.0
7,Southwark,56,1649,E09000028,10659.0,35.435768,22.0,49.0,0.51565,0.331361,...,0.78,0.22,0.0,0.0,0.424325,0.159489,0.211372,0.090624,0.114189,47876.0
8,Hackney,39,1167,E09000012,13593.3,34.288071,21.0,47.0,0.522111,0.329258,...,0.91,0.0,0.0,0.0,0.41677,0.17596,0.194211,0.118285,0.094773,42211.0
9,City of London,38,977,E09000001,2975.0,40.704977,26.0,55.0,0.449843,0.509972,...,,,,,0.635932,0.125062,0.084708,0.054598,0.0997,88145.0


In [24]:
# missing data: personal_wellbeing and election_seats_percentage for City of London -> imputation, fill with mean
# The column means are computed over numeric columns only: the frame also holds
# text columns (borough, borough_code), for which a mean is undefined. Without
# `numeric_only=True` older pandas emits a deprecation warning and pandas >= 2.0
# raises a TypeError. Text columns are left untouched by the fill.
borough_df = borough_df.fillna(borough_df.mean(numeric_only=True))

  borough_df = borough_df.fillna(borough_df.mean())


In [9]:
# save borough data
# The imputed borough feature table is written with its index as the first CSV column (pandas default).
borough_df.to_csv('../data/interim/borough_data_featureeng.csv')

### 4.2 Mapping Borough Features and Journey Data

In [4]:
# Run the project helper for two one-month windows (January 2019 and January 2018);
# the last argument labels each window.
# NOTE(review): what the helper does with the label (presumably an output-file suffix) is not visible here — confirm.
# NOTE(review): the end bounds are midnight timestamps (e.g. 2019-01-31 00:00); if the helper filters with
# `start_date <= end`, journeys during the final day are excluded — confirm.
# Needs `journey_df` and `borough_df` to be defined in the running kernel.
features.map_journey_borough_data(datetime(2019, 1, 1), datetime(2019, 1, 31), journey_df, borough_df, '2019_01')
features.map_journey_borough_data(datetime(2018, 1, 1), datetime(2018, 1, 31), journey_df, borough_df, '2018_01')

NameError: name 'journey_df' is not defined

In [1]:
# Same helper for the full year 2019, labelled '2019'.
# NOTE(review): the end bound is 2019-12-31 00:00; journeys later on 31 December may be excluded — confirm in the helper.
# Needs `features`, `journey_df` and `borough_df` to be defined in the running kernel.
features.map_journey_borough_data(datetime(2019, 1, 1), datetime(2019, 12, 31), journey_df, borough_df, '2019')

NameError: name 'features' is not defined

In [None]:
# Same helper for the full year 2018, labelled '2018'.
# NOTE(review): the end bound is 2018-12-31 00:00; journeys later on 31 December may be excluded — confirm in the helper.
# Needs `features`, `journey_df` and `borough_df` to be defined in the running kernel.
features.map_journey_borough_data(datetime(2018, 1, 1), datetime(2018, 12, 31), journey_df, borough_df, '2018')