# Feature Engineering: Journey Bike Data

In [1]:
import pandas as pd
import requests
import io
import json
import urllib
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import datetime
from rapidfuzz import fuzz
from concurrent import futures

# 1. MAPPING BOROUGHS TO JOURNEY DATA 

## 1.1 Acquiring and Assigning London Boroughs to Bike Station Location

### Get all bike stations and its corresponding locations data (lat and lon)
by extracting relevant information from API requests to tfl BikePoint data

In [94]:
url = "https://api.tfl.gov.uk/BikePoint/"
response = requests.get(url)
root = json.loads(response.text)

data = []
logical_id = ""
for station in root:
    for prop in station['additionalProperties']:
        if prop['key'] == 'TerminalName':
            logical_id = prop['value']
            break

    station_data = {
        "id": station['id'][11:],
        "name": station['commonName'],
        "lat": station['lat'],
        "lon": station['lon']
    }
    data.append(station_data)

bike_locs = pd.DataFrame(data)

print(bike_locs.shape)
bike_locs.head(5)

(796, 5)


Unnamed: 0,id,name,lat,lon,terminalId
0,1,"River Street , Clerkenwell",51.529163,-0.10997,1023
1,2,"Phillimore Gardens, Kensington",51.499606,-0.197574,1018
2,3,"Christopher Street, Liverpool Street",51.521283,-0.084605,1012
3,4,"St. Chad's Street, King's Cross",51.530059,-0.120973,1013
4,5,"Sedding Street, Sloane Square",51.49313,-0.156876,3420


### Get boroughs based on location and map to bike location

In [96]:
def get_borough(lat, lon):
    """
    Function to retrieve borough name using lat and lon coordinates.
    
    This function sends a GET request to the 'findthatpostcode' API, using 
    the provided lat and lon coordinates. If the request is successful, the 
    function extracts the borough name from the response data and returns it. 
    If the request is unsuccessful, the function returns 'no borough'.
    
    Parameters:
    lat (float): Latitude coordinate of the location.
    lon (float): Longitude coordinate of the location.

    Returns:
    str: Borough name or 'no borough' if the API request is unsuccessful.
    """
    url = f'https://findthatpostcode.uk/points/{lat},{lon}'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        borough = data['included'][0]['attributes']['cty_name']
        return borough
    else:
        return 'no borough'

In [3]:
# map borough to the bike locations
bike_locs['borough'] = bike_locs.apply(lambda row: get_borough(row['lat'], row['lon']), axis=1)
bike_locs.head(5)
# bike_locs.to_csv('/Users/tabea/Documents/UrbanMobility/data/bike_locations_boroughs.csv', header=True, index=None)

Unnamed: 0,id,name,lat,lon,terminalId,borough
0,1,"River Street , Clerkenwell",51.529163,-0.10997,1023,Islington
1,2,"Phillimore Gardens, Kensington",51.499606,-0.197574,1018,Kensington and Chelsea
2,3,"Christopher Street, Liverpool Street",51.521283,-0.084605,1012,Hackney
3,4,"St. Chad's Street, King's Cross",51.530059,-0.120973,1013,Camden
4,5,"Sedding Street, Sloane Square",51.49313,-0.156876,3420,Kensington and Chelsea


## 1.2. Mapping Station Names to Boroughs

### Map the StartStation and EndStation Names to boroughs

In [152]:
# Standardize by stripping white space and converting to lower case, create dictionary for mapping
bike_locs['name'] = bike_locs['name'].str.strip().str.lower()
borough_mapping = bike_locs.set_index('name')['borough'].to_dict()

In [4]:
# mapping
merged_df['start_borough'] = merged_df['startStation_name'].str.strip().str.lower().map(borough_mapping)
merged_df['end_borough'] = merged_df['endStation_name'].str.strip().str.lower().map(borough_mapping)

merged_df.head(5)

Unnamed: 0.1,Unnamed: 0,Rental Id,End Date,EndStation Name,Start Date,StartStation Name,StartBorough,EndBorough
0,29879046,40346508,2015-04-01 00:06:00,"Ebury Bridge, Pimlico",2015-04-01 00:00:00,"Harriet Street, Knightsbridge",Kensington and Chelsea,Westminster
1,29879054,40346509,2015-04-01 00:11:00,"Regent's Row , Haggerston",2015-04-01 00:00:00,"Brushfield Street, Liverpool Street",City of London,Hackney
2,29879048,40346510,2015-04-01 00:08:00,"Foley Street, Fitzrovia",2015-04-01 00:01:00,"Tavistock Place, Bloomsbury",Camden,Westminster
3,29879161,40346511,2015-04-01 00:50:00,"Bow Church Station, Bow",2015-04-01 00:01:00,"Moor Street, Soho",Westminster,Tower Hamlets
4,29879044,40346512,2015-04-01 00:03:00,"Jubilee Street, Stepney",2015-04-01 00:01:00,"Philpot Street, Whitechapel",Tower Hamlets,Tower Hamlets


In [29]:
# still lots of missing boroughs: 3'104'758 + 3'192'763
print(merged_df.isna().sum())

Unnamed: 0                 0
Rental Id                  0
End Date                   0
EndStation Name            0
Start Date                 0
StartStation Name          0
StartBorough         3104758
EndBorough           3192763
dtype: int64


### Run fuzzy matching for empty boroughs
Matching names that refer to the same station but are slightly different in their naming. Parallel processing to improve the performance of fuzzy matching.

In [None]:
def fuzzy_match(station_name, min_score=70):
    """
    Performs fuzzy matching between a given station name and a mapping of station names to boroughs.
    
    Args:
        station_name (str): The station name to be matched.
        min_score (int): The minimum similarity score required for a match (default: 70).
    
    Returns:
        str or None: The borough corresponding to the best fuzzy match for the station name, 
                     or None if no match is found above the minimum score threshold.
    """
    if station_name is None:
        return None

    best_match = None
    best_score = 0

    for name in station_to_borough.keys():
        score = fuzz.token_sort_ratio(station_name, name)
        if score > best_score:
            best_score = score
            best_match = name

    return station_to_borough[best_match] if best_match and best_score >= min_score else None

In [48]:
station_to_borough = {row['name']: row['borough'] for _, row in bike_locs.iterrows()}
empty_boroughs = merged_df[(merged_df['start_borough'].isna()) | (merged_df['end_borough'].isna())]


# function to perform fuzzy matching in parallel
def parallel_fuzzy_match(column):
    return column.apply(fuzzy_match)

# split the DataFrame into chunks for parallel processing
num_parallel_tasks = 6
chunk_size = len(empty_boroughs) // num_parallel_tasks  
chunks = [empty_boroughs[i:i+chunk_size] for i in range(0, len(empty_boroughs), chunk_size)]

# update the StartBorough column, process chunks in parallel
with futures.ThreadPoolExecutor() as executor: 
    results = list(executor.map(parallel_fuzzy_match, [chunk['startStation_name'] for chunk in chunks]))

for i, result in enumerate(results):
    chunk = chunks[i]
    chunk.loc[:, 'start_borough'] = result

# update the EndBorough column, process chunks in parallel
with futures.ThreadPoolExecutor() as executor:  # Use ThreadPoolExecutor for threads or ProcessPoolExecutor for processes
    results = list(executor.map(parallel_fuzzy_match, [chunk['endStation_name'] for chunk in chunks]))

for i, result in enumerate(results):
    chunk = chunks[i]
    chunk.loc[:, 'end_borough'] = result

# replace the rows with missing borough data in the original dataframe with the processed rows
updated_empty_boroughs = pd.concat(chunks)
merged_df.update(updated_empty_boroughs)


In [49]:
# reduced number of missing values by 3.
print(merged_df.isna().sum())

Unnamed: 0                 0
Rental Id                  0
End Date                   0
EndStation Name            0
Start Date                 0
StartStation Name          0
StartBorough         1082424
EndBorough           1145911
dtype: int64


### Adding boroughs to former stations

After investigating the missing borough data in the current samplers, it was discovered that these samplers correspond to former stations that are no longer in use and are not listed in the provided BikePoints file, where the station names are given in the format "street name, region" (e.g., "London Fields, Hackney Central").

To address this a dictionary with the region information as key and the borough with the maximum counts as value is created. This is then used to map the missing boroughs in the merged_df DataFrame based on the extracted location information.

In [72]:
# add a 'location' column to bike_locs
bike_locs['location'] = bike_locs['name'].str.split(',').str[1].str.strip()

# group by 'location' and get the borough with the maximum counts
location_borough = bike_locs.groupby('location')['borough'].agg(lambda x: x.value_counts().index[0])

# convert the Series to a dictionary
location_borough_dict = location_borough.to_dict()

In [79]:
def get_borough_from_dict(name):
    """
    Retrieves the borough from the 'location_borough_dict' dictionary based on the given station name.
    
    Args:
        name (str): Station name in the format 'street name, region'.
    
    Returns:
        str or None: The corresponding borough based on the region, or None if the borough is unavailable.
    """
    parts = name.split(',')
    if len(parts) > 1:
        return location_borough_dict.get(parts[1].strip(), None)
    else:
        return None

In [80]:
merged_df.loc[merged_df['start_borough'].isna(), 'start_borough'] = merged_df.loc[merged_df['start_borough'].isna(), 'startStation_name'].apply(get_borough_from_dict)
merged_df.loc[merged_df['end_borough'].isna(), 'end_borough'] = merged_df.loc[merged_df['end_borough'].isna(), 'endStation_name'].apply(get_borough_from_dict)

### Map manually and drop irrelevant stations

In [92]:
unique_empty_start_boroughs = merged_df.loc[merged_df['start_borough'].isna(), 'startStation_name'].unique()
unique_empty_end_boroughs = merged_df.loc[merged_df['end_borough'].isna(), 'endStation_name'].unique()

for name in unique_empty_boroughs:
    print(name)

Abingdon Green, Great College Street
Allington street, Off Victoria Street, Westminster
Columbia Road, Weavers
Contact Centre, Southbury House
Electrical Workshop PS
Hansard Mews, Shepherds Bush
Import Dock
LSP1
LSP2
Mechanical Workshop Clapham
Mechanical Workshop Penton
Monier Road
Monier Road, Newham
One London
Oval Way, Lambeth
PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY
Pop Up Dock 1
Pop Up Dock 2
Victoria and Albert Museum, Cromwell Road
Worship Street, Hackney
York Way, Camden


In [95]:
# fill values manually

merged_df.loc[merged_df['startStation_name'] == 'Hansard Mews, Shepherds Bush', 'start_borough'] = 'Hammersmith and Fulham'
merged_df.loc[merged_df['startStation_name'] == 'Columbia Road, Weavers', 'start_borough'] = 'Tower Hamlets'
merged_df.loc[merged_df['startStation_name'] == 'Abingdon Green, Great College Street', 'start_borough'] = 'Westminster'
merged_df.loc[merged_df['startStation_name'] == 'Oval Way, Lambeth', 'start_borough'] = 'Lambeth'
merged_df.loc[merged_df['startStation_name'] == 'Contact Centre, Southbury House', 'start_borough'] = 'Enfield'
merged_df.loc[merged_df['startStation_name'] == 'Monier Road', 'start_borough'] = 'Newham'
merged_df.loc[merged_df['startStation_name'] == 'Victoria and Albert Museum, Cromwell Road', 'start_borough'] = 'Kensington and Chelsea'
merged_df.loc[merged_df['startStation_name'] == 'Monier Road, Newham', 'start_borough'] = 'Newham'
merged_df.loc[merged_df['startStation_name'] == 'Allington street, Off Victoria Street, Westminster', 'start_borough'] = 'Westminster'
merged_df.loc[merged_df['startStation_name'] == 'Worship Street, Hackney', 'start_borough'] = 'Hackney'
merged_df.loc[merged_df['startStation_name'] == 'York Way, Camden', 'start_borough'] = 'Camden'
merged_df.loc[merged_df['startStation_name'] == 'Monier Road', 'start_borough'] = 'Hackney'

merged_df.loc[merged_df['endStation_name'] == 'Hansard Mews, Shepherds Bush', 'end_borough'] = 'Hammersmith and Fulham'
merged_df.loc[merged_df['endStation_name'] == 'Columbia Road, Weavers', 'end_borough'] = 'Tower Hamlets'
merged_df.loc[merged_df['endStation_name'] == 'Abingdon Green, Great College Street', 'end_borough'] = 'Westminster'
merged_df.loc[merged_df['endStation_name'] == 'Oval Way, Lambeth', 'end_borough'] = 'Lambeth'
merged_df.loc[merged_df['endStation_name'] == 'Contact Centre, Southbury House', 'end_borough'] = 'Enfield'
merged_df.loc[merged_df['endStation_name'] == 'Monier Road', 'end_borough'] = 'Newham'
merged_df.loc[merged_df['endStation_name'] == 'Victoria and Albert Museum, Cromwell Road', 'end_borough'] = 'Kensington and Chelsea'
merged_df.loc[merged_df['endStation_name'] == 'Monier Road, Newham', 'end_borough'] = 'Newham'
merged_df.loc[merged_df['endStation_name'] == 'Allington street, Off Victoria Street, Westminster', 'end_borough'] = 'Westminster'
merged_df.loc[merged_df['endStation_name'] == 'Worship Street, Hackney', 'end_borough'] = 'Hackney'
merged_df.loc[merged_df['endStation_name'] == 'York Way, Camden', 'end_borough'] = 'Camden'
merged_df.loc[merged_df['endStation_name'] == 'Monier Road', 'end_borough'] = 'Hackney'

In [98]:
# drop irrelevant stations

merged_df = merged_df.dropna(subset=['start_borough', 'end_borough'])
print(merged_df.isna().sum())

Unnamed: 0           0
Rental Id            0
End Date             0
EndStation Name      0
Start Date           0
StartStation Name    0
StartBorough         0
EndBorough           0
dtype: int64


In [19]:
# import data, if already executed
merged_df = pd.read_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_cleaned_with_boroughs_no_nans.csv', index_col=0)

In [7]:
# Rename columns
merged_df = merged_df.rename(columns={
    'Rental Id': 'rental_id',
    'End Date': 'end_date',
    'EndStation Name': 'endStation_name',
    'Start Date': 'start_date',
    'StartStation Name': 'startStation_name',
    'StartBorough': 'start_borough',
    'EndBorough': 'end_borough'
})


In [8]:
merged_df['end_date'] = pd.to_datetime(merged_df['end_date'])
mask_2015 = merged_df['end_date'].dt.year == 2015
merged_df = merged_df[~mask_2015]

merged_df['start_date'] = pd.to_datetime(merged_df['start_date'])
mask_2015 = merged_df['start_date'].dt.year == 2015
merged_df = merged_df[~mask_2015]

merged_df['end_date'] = pd.to_datetime(merged_df['end_date'])
mask_2023 = merged_df['end_date'].dt.year == 2023
merged_df = merged_df[~mask_2023]

merged_df['start_date'] = pd.to_datetime(merged_df['start_date'])
mask_2023 = merged_df['start_date'].dt.year == 2023
merged_df = merged_df[~mask_2023]

# 2. BASIC FEATURE ENGINEERING

### Temporal Feature Engineering

Features: hour, partOfDay (morning, afternoon, evening, night), day of week, weekend, weekday, , month, season

In [4]:
def get_part_of_day(hour):
    """
    Given an hour of the day (in a 24-hour format), this function 
    returns a string representing the general part of the day the 
    hour falls into. The categorizations used are:
    
    - Early Morning: 5:00 to 8:59
    - Morning: 9:00 to 12:59
    - Afternoon: 13:00 to 16:59
    - Evening: 17:00 to 20:59
    - Night: 21:00 to 4:59

    Input: 
    hour: integer (0-23)

    Returns: 
    part_of_day: string (Early Morning, Morning, Afternoon, Evening, Night)
    """
    if (hour > 4) and (hour <= 8):
        return 'Early Morning'
    elif (hour > 8) and (hour <= 12 ):
        return 'Morning'
    elif (hour > 12) and (hour <= 16):
        return'Afternoon'
    elif (hour > 16) and (hour <= 20) :
        return 'Evening'
    elif (hour > 20) or (hour <=4):
        return'Night'

In [5]:
 def get_season(month):
    """
    This function classifies a given month into its corresponding season based on meteorological reckoning. 
    Here's the classification used:

    - Spring: March (3) through May (5)
    - Summer: June (6) through August (8)
    - Fall: September (9) through November (11)
    - Winter: December (12) through February (2)

    Input: 
    month: integer (1-12) representing the month of the year

    Returns: 
    season: string (spring, summer, fall, winter)
    """
    if month >= 3 and month <= 5:
        return 'spring'
    elif month >= 6 and month <= 8:
        return 'summer'
    elif month >= 9 and month <= 11:
        return 'fall'
    else:
        return 'winter'

In [9]:
merged_df['start_date'] = pd.to_datetime(merged_df['start_date'])
merged_df['day_of_week'] = merged_df['start_date'].dt.dayofweek
merged_df['hour'] = merged_df['start_date'].dt.hour
merged_df['is_weekend'] = merged_df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0) 
merged_df['part_of_day'] = merged_df['hour'].apply(get_part_of_day)
merged_df['month'] = merged_df['start_date'].dt.month
merged_df['season'] = merged_df['month'].apply(get_season)

### Bank Holiday Feature Engineering

In [10]:
import holidays

uk_holidays = holidays.UK()
merged_df['bank_holiday'] = merged_df['start_date'].apply(lambda x: 1 if x in uk_holidays else 0)

In [11]:
merged_df.head()

Unnamed: 0,Unnamed: 0.1,rental_id,end_date,endStation_name,start_date,startStation_name,start_borough,end_borough,day_of_week,hour,is_weekend,part_of_day,month,season,bank_holiday
9768810,16646400.0,50608184.0,2016-01-01 01:14:00,"Hampstead Road (Cartmel), Euston",2016-01-01 00:00:00,"Hampstead Road, Euston",Camden,Camden,4,0,0,Night,1,winter,1
9768811,16646401.0,50608186.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",Westminster,Westminster,4,0,0,Night,1,winter,1
9768812,16646402.0,50608187.0,2016-01-01 00:24:00,"Rochester Row, Westminster",2016-01-01 00:04:00,"Chelsea Bridge, Pimlico",Westminster,Westminster,4,0,0,Night,1,winter,1
9768813,16646403.0,50608188.0,2016-01-01 00:22:00,"Brushfield Street, Liverpool Street",2016-01-01 00:04:00,"Holborn Circus, Holborn",Camden,City of London,4,0,0,Night,1,winter,1
9768814,16646405.0,50608189.0,2016-01-01 00:23:00,"Brushfield Street, Liverpool Street",2016-01-01 00:05:00,"Holborn Circus, Holborn",Camden,City of London,4,0,0,Night,1,winter,1


### Weather Feature Engineering

Features: tempmax, tempmin, temp, feelslike, humidity, precip	windgust, windspeed, cloudcover, visibility, uvindex, daylight_hours

daily weather data generated with the weather data builder by www.visualcrossing.com

#### load and clean weather data

In [12]:
# load data
weather_df = pd.read_csv('/Users/tabea/Documents/UrbanMobility/data/weather london_2016-2022.csv', index_col=0, encoding='ISO-8859-1')
weather_df.head()

Unnamed: 0_level_0,tempmax,tempmin,temp,feelslike,humidity,precip,windgust,windspeed,cloudcover,visibility,uvindex,sunrise,sunset
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-01-01,8.6,2.6,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,2016-01-01T08:06:16,2016-01-01T16:01:33
2016-01-02,10.8,8.1,10.0,10.0,89.7,1.257,46.4,0.2,90.7,14.5,1,2016-01-02T08:06:10,2016-01-02T16:02:36
2016-01-03,10.3,6.3,8.0,8.0,87.7,10.214,53.6,0.2,60.0,20.5,0,2016-01-03T08:06:01,2016-01-03T16:03:42
2016-01-04,10.8,6.0,8.0,8.0,87.9,0.201,,0.1,45.7,20.1,1,2016-01-04T08:05:49,2016-01-04T16:04:51
2016-01-05,10.6,6.8,8.4,8.4,89.5,0.218,,5.2,65.1,17.0,1,2016-01-05T08:05:33,2016-01-05T16:06:02


In [13]:
# add daylight_hours
weather_df['sunrise'] = pd.to_datetime(weather_df['sunrise'])
weather_df['sunset'] = pd.to_datetime(weather_df['sunset'])
weather_df['daylight_hours'] = (weather_df['sunset'] - weather_df['sunrise']).dt.total_seconds()/ 3600

# drop sunrise and sunset
weather_df = weather_df.drop(columns=['sunrise', 'sunset'])

# replace NaN by 0
weather_df = weather_df.fillna(0)

In [14]:
weather_df.head()

Unnamed: 0_level_0,tempmax,tempmin,temp,feelslike,humidity,precip,windgust,windspeed,cloudcover,visibility,uvindex,daylight_hours
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2016-01-01,8.6,2.6,5.9,5.9,84.7,3.8,50.0,0.2,61.3,18.6,1,7.921389
2016-01-02,10.8,8.1,10.0,10.0,89.7,1.257,46.4,0.2,90.7,14.5,1,7.940556
2016-01-03,10.3,6.3,8.0,8.0,87.7,10.214,53.6,0.2,60.0,20.5,0,7.961389
2016-01-04,10.8,6.0,8.0,8.0,87.9,0.201,0.0,0.1,45.7,20.1,1,7.983889
2016-01-05,10.6,6.8,8.4,8.4,89.5,0.218,0.0,5.2,65.1,17.0,1,8.008056


#### map weather data to journey data

In [20]:
# merge the dataframes on the date columns
merged_df['start_date_only'] = merged_df['start_date'].dt.date
weather_df.index = pd.to_datetime(weather_df.index).date

merged_df = pd.merge(merged_df, weather_df, left_on='start_date_only', right_index=True, how='left')

# drop the 'start_date_only' column
merged_df.drop(columns=['start_date_only'], inplace=True)


In [21]:
merged_df.tail()

Unnamed: 0,Unnamed: 0.1,rental_id,end_date,endStation_name,start_date,startStation_name,start_borough,end_borough,day_of_week,hour,...,temp,feelslike,humidity,precip,windgust,windspeed,cloudcover,visibility,uvindex,daylight_hours
83479851,65714332.0,127692023.0,2022-12-31 23:57:00,"Southerton Road, Hammersmith",2022-12-31 23:51:00,"Rainville Road, Hammersmith",Hammersmith and Fulham,Hammersmith and Fulham,5,23,...,12.8,12.8,85.1,3.24,65.8,34.6,95.5,11.1,0,7.908333
83479860,65714327.0,127692032.0,2022-12-31 23:55:00,"Macclesfield Rd, St Lukes",2022-12-31 23:52:00,"St. John Street, Finsbury",Islington,Islington,5,23,...,12.8,12.8,85.1,3.24,65.8,34.6,95.5,11.1,0,7.908333
83479869,65714322.0,127692041.0,2022-12-31 23:55:00,"Macclesfield Rd, St Lukes",2022-12-31 23:53:00,"St. John Street, Finsbury",Islington,Islington,5,23,...,12.8,12.8,85.1,3.24,65.8,34.6,95.5,11.1,0,7.908333
83479870,65714323.0,127692042.0,2022-12-31 23:54:00,"Millbank Tower, Pimlico",2022-12-31 23:53:00,"Millbank Tower, Pimlico",Westminster,Westminster,5,23,...,12.8,12.8,85.1,3.24,65.8,34.6,95.5,11.1,0,7.908333
83672585,63740942.0,127890291.0,2022-10-12 08:31:00,"Gwendwr Road, West Kensington",2022-10-12 08:26:00,"Hammersmith Road, Hammersmith",Hammersmith and Fulham,Hammersmith and Fulham,2,8,...,12.0,12.0,80.2,0.0,28.5,17.2,71.4,18.3,4,10.913333


In [23]:
print(merged_df.isna().sum())

Unnamed: 0.1         0
rental_id            0
end_date             0
endStation_name      0
start_date           0
startStation_name    0
start_borough        0
end_borough          0
day_of_week          0
hour                 0
is_weekend           0
part_of_day          0
month                0
season               0
bank_holiday         0
tempmax              0
tempmin              0
temp                 0
feelslike            0
humidity             0
precip               0
windgust             0
windspeed            0
cloudcover           0
visibility           0
uvindex              0
daylight_hours       0
dtype: int64


In [24]:
# save as CSV
merged_df.to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_cleaned_with_borough_basic_features.csv')

# 3. BOROUGH DEMOGRAPHIC FEATURE ENGINEERING

# X. SAVE ENGINEERED DATA

In [99]:
# all data (2016 - 20122)
merged_df.to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_cleaned_with_boroughs_no_nans.csv')

In [100]:
# by year

groups = merged_df.groupby(pd.Grouper(key='Start Date', freq='Y'))
yearly_dfs = {}
for year, group in groups:
    yearly_dfs[year.year] = group.reset_index(drop=True)
    
yearly_dfs[2015].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2015_cleaned.csv')
yearly_dfs[2016].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2016_cleaned.csv')
yearly_dfs[2017].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2017_cleaned.csv')
yearly_dfs[2018].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2018_cleaned.csv')
yearly_dfs[2019].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2019_cleaned.csv')
yearly_dfs[2020].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2020_cleaned.csv')
yearly_dfs[2021].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2021_cleaned.csv')
yearly_dfs[2022].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2022_cleaned.csv')