# Get and clean Journey, Bike location and Borough Data

In [2]:
import pandas as pd
import requests
import io
import json
import urllib
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import datetime
from rapidfuzz import fuzz
from concurrent import futures

## 1. IMPORT BIKE JOURNEY DATA

### fetch data
Due to the dynamic loading of the data, web scraping is not possible. Therefore, the name of the files are copy & pased into a CSV file which then get fetched and combined with multiple API requests.

In [None]:
def rename_columns(df):
    """
    This function renames the columns in the provided dataframe 'df' as per the 
    mapping defined in 'column_names', and also changes the datatype of some columns.
    """
    
    # define a mapping of old column names to new standardized names
    column_names = {
        'End Station Id': 'EndStation Id',
        'End station number': 'EndStation Id',
        'Start Station Id': 'StartStation Id',
        'Start station number': 'StartStation Id',
        'End Station Name': 'EndStation Name',
        'End station': 'EndStation Name',
        'Start Station Name': 'StartStation Name',
        'Start station': 'StartStation Name',
        'Start date': 'Start Date',
        'End Date': 'End Date',
        'End date': 'End Date',
        'Number': 'Rental Id',
    }
    
    for old_name, new_name in column_names.items():
            if old_name in df.columns:
                df = df.rename(columns={old_name: new_name})
                if new_name in ['EndStation Id', 'StartStation Id', 'Rental Id']:
                    df[new_name] = pd.to_numeric(df[new_name], errors='coerce', downcast='integer')
                elif new_name in ['Start Date', 'End Date']:
                    df[new_name] = pd.to_datetime(df[new_name], infer_datetime_format=True)
    
    return df

In [None]:
# load the list of file names from a CSV file
filenames = pd.read_csv('/Users/tabea/Documents/UrbanMobility/filenames-data.csv', header=None, squeeze=True)

# combine a list of URL by add the base-url and filename
base_url = 'http://cycling.data.tfl.gov.uk/usage-stats/'
url_list = (base_url + urllib.parse.quote(x) for x in filenames)
unused_cols = ['Total duration (ms)', 'Total duration', 'Duration', 'Duration_Seconds', 'Bike Id', 'Bike number', 'Bike model']

# loop through each URL to extract data
temp_dfs = []
for url in url_list:
    response = requests.get(url, verify=False, timeout=(3, 7))

    if url.endswith('.csv'):
        temp_df = pd.read_csv(io.StringIO(response.content.decode('utf-8')), usecols=lambda col: col not in unused_cols)

    elif url.endswith('.xlsx'):
        temp_df = pd.read_excel(io.BytesIO(response.content), usecols=lambda col: col not in unused_cols)

    temp_df = rename_columns(temp_df)
    temp_dfs.append(temp_df)

# concatenate all temporary dataframes into a single dataframe
merged_df = pd.concat(temp_dfs, ignore_index=True)


In [13]:
merged_df.to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_raw.csv')

In [12]:
# total amount of entries: 84'188'068
len(merged_df)

84188068

### import data from disk (if already fetched)

In [68]:
# import
merged_df = pd.read_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_raw.csv')

In [69]:
merged_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Rental Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name,Unnamed: 9,Unnamed: 10,Unnamed: 11,EndStation Logical Terminal,endStationPriority_id,StartStation Logical Terminal
0,0,0,63097899.0,2017-03-15 00:06:00,631.0,"Battersea Park Road, Nine Elms",2017-03-15 00:00:00,74.0,"Vauxhall Cross, Vauxhall",,,,,,
1,1,1,63097900.0,2017-03-15 00:05:00,397.0,"Devonshire Terrace, Bayswater",2017-03-15 00:01:00,410.0,"Edgware Road Station, Marylebone",,,,,,
2,2,2,63097901.0,2017-03-15 00:06:00,426.0,"Vincent Street, Pimlico",2017-03-15 00:01:00,177.0,"Ashley Place, Victoria",,,,,,
3,3,3,63097902.0,2017-03-15 00:12:00,462.0,"Bonny Street, Camden Town",2017-03-15 00:01:00,22.0,"Northington Street , Holborn",,,,,,
4,4,4,63097903.0,2017-03-15 00:05:00,423.0,"Eaton Square (South), Belgravia",2017-03-15 00:01:00,143.0,"Pont Street, Knightsbridge",,,,,,


## 2. CLEAN BIKE JOURNEY DATA

In [70]:
print("length before cleaning:", len(merged_df))

length before cleaning: 84188068


### drop columns starting with 'Unnamed'

In [71]:
merged_df = merged_df.filter(regex='^(?!Unnamed)')

### drop rows with nan values only

In [72]:
merged_df = merged_df.dropna(how='all')

### investigate duplicates

In [73]:
# some files have same or overlapping content, but different names. 
# e.g: 01b Journey Data Extract 24Jan16-06Feb16.csv, 01bJourneyDataExtract24Jan16-06Feb16.csv
    
duplicates_rental_id = merged_df[merged_df['Rental Id'].duplicated(keep=False)]
print("duplicated rental ID samples count: ", len(duplicates_rental_id))
# duplicates_rental_id.to_csv('/Users/tabea/Documents/UrbanMobility/data/duplicates.csv')

duplicated rental ID samples count:  585398


### drop duplicates

In [74]:
# drop all samples with duplicated rental id, sort first to keep the row with the most non-null values

merged_df['nonnull_count'] = merged_df.notnull().sum(axis=1)
merged_df = merged_df.sort_values(by=['Rental Id', 'nonnull_count'], ascending=[True, False])
merged_df = merged_df.drop_duplicates(subset='Rental Id', keep='first')
merged_df = merged_df.drop(columns='nonnull_count')

print("current length of df: ", len(merged_df))

current length of df:  83895356


### investigate all nan values

In [75]:
print(merged_df.isna().sum())

Rental Id                               0
End Date                           170358
EndStation Id                      715522
EndStation Name                    171824
Start Date                              0
StartStation Id                    234440
StartStation Name                       0
EndStation Logical Terminal      83665717
endStationPriority_id            83665717
StartStation Logical Terminal    83662856
dtype: int64


### nan values: StartStation Name & EndStation Name

In [77]:
StartStationName_NAN = merged_df[merged_df["StartStation Name"].isna()]
print("StartStation Name NaNs count: ", len(StartStationName_NAN))

EndStationName_NAN = merged_df[merged_df["EndStation Name"].isna()]
print("EndStation Name NaNs: count  ", len(EndStationName_NAN))
# EndStationName_NAN.to_csv('/Users/tabea/Documents/UrbanMobility/data/EndStationName_NAN.csv')

StartStation Name NaNs count:  0
EndStation Name NaNs: count   171824


In [78]:
# EndStation Name is only NaN if EndStation ID is also NaN -> they can't be mapped, so they must be removed.

merged_df = merged_df.dropna(subset=['EndStation Id', 'EndStation Name'], how='all')
print("current df length:", len(merged_df))

current df length: 83723532


### nan values: Start Date and End Date

In [80]:
StartDate_NAN = merged_df[merged_df["Start Date"].isna()]
print("Start Date NaNs: ", len(StartDate_NAN))

EndDate_NAN = merged_df[merged_df["End Date"].isna()]
print("End Date NaNs: ", len(EndDate_NAN))
# EndDate_NAN.to_csv('/Users/tabea/Documents/UrbanMobility/data/EndDate_NAN.csv')

Start Date NaNs:  0
End Date NaNs:  69


In [81]:
# drop 69 entries with missing data

merged_df = merged_df.dropna(subset=['End Date'])
print("current df length:", len(merged_df))

current df length: 83723463


### nan values: StartStation Id & EndStation Id
Numerous NaN values are observed in the 'StartStation Id' and 'EndStation Id' columns. The primary cause: cycling rides extending beyond a single calendar day. For these instances, stations are referred to as 'TerminalStation', each carrying a unique ID set with higher numbers (>852).

Due to the mix of stationID and terminalID and lots of NaN values, the ID's get dropped and the name of the station is used as identifier.

In [82]:
# StartStation Id: 234'440 NaN -> but most StartStation Names are known
StartStationId_NAN = merged_df[merged_df["StartStation Id"].isna()]
print("StartStation Id NaNs count: ", len(StartStationId_NAN))
# StartStationId_NAN.to_csv('/Users/tabea/Documents/UrbanMobility/data/StartStationId_NAN.csv')

# EndStation Id: 715'522 NaN -> but most EndStation Names are known
EndStationId_NAN = merged_df[merged_df["EndStation Id"].isna()]
print("EndStation Id NaNs count: ", len(EndStationId_NAN))
# EndStationId_NAN.to_csv('/Users/tabea/Documents/UrbanMobility/data/EndStationId_NAN.csv')

StartStation Id NaNs count:  231579
EndStation Id NaNs count:  543698


In [83]:
# only 852 station are present in the data. But there are also terminal station IDs that have higher values and are mixed in the data.
# they can be found here: https://api.tfl.gov.uk/BikePoint/

greater_than_852 = (merged_df['StartStation Id'] > 852) | (merged_df['EndStation Id'] > 852)
print("count of terminal station ID instaed of normal ID: ", greater_than_852.sum())

count of terminal station ID instaed of normal ID:  2788522


In [84]:
# drop ID's

merged_df = merged_df.drop(columns=['StartStation Id', 'EndStation Id', 'EndStation Logical Terminal', 'endStationPriority_id', 'StartStation Logical Terminal'])
print(merged_df.isna().sum())

Rental Id            0
End Date             0
EndStation Name      0
Start Date           0
StartStation Name    0
dtype: int64


### change dtypes

In [None]:
merged_df["Rental Id"] = merged_df["Rental Id"].astype(int)
merged_df["Start Date"] = pd.to_datetime(merged_df["Start Date"])
merged_df["End Date"] = pd.to_datetime(merged_df["End Date"])

### first cleaning: done
464'605 samples got deleted

In [89]:
print("length df after cleaning:", len(merged_df))
# merged_df.to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_cleaned.csv')

length df after cleaning: 83723463


# 4. MAP BIKE STATION LOCATIONS AND BOROUGHS

### get all bike stations and its corresponding locations data (lat and lon)
by extracting relevant information from API requests to tfl BikePoint data

In [94]:
url = "https://api.tfl.gov.uk/BikePoint/"
response = requests.get(url)
root = json.loads(response.text)

data = []
logical_id = ""
for station in root:
    for prop in station['additionalProperties']:
        if prop['key'] == 'TerminalName':
            logical_id = prop['value']
            break

    station_data = {
        "id": station['id'][11:],
        "name": station['commonName'],
        "lat": station['lat'],
        "lon": station['lon'],
        "terminalId": logical_id
    }
    data.append(station_data)

bike_locs = pd.DataFrame(data)

print(bike_locs.shape)
bike_locs.head(5)

(796, 5)


Unnamed: 0,id,name,lat,lon,terminalId
0,1,"River Street , Clerkenwell",51.529163,-0.10997,1023
1,2,"Phillimore Gardens, Kensington",51.499606,-0.197574,1018
2,3,"Christopher Street, Liverpool Street",51.521283,-0.084605,1012
3,4,"St. Chad's Street, King's Cross",51.530059,-0.120973,1013
4,5,"Sedding Street, Sloane Square",51.49313,-0.156876,3420


### get boroughs based on location and map to bike location

In [96]:
def get_borough(lat, lon):
    """
    Function to retrieve borough name using lat and lon coordinates.
    
    This function sends a GET request to the 'findthatpostcode' API, using 
    the provided lat and lon coordinates. If the request is successful, the 
    function extracts the borough name from the response data and returns it. 
    If the request is unsuccessful, the function returns 'no borough'.
    
    Parameters:
    lat (float): Latitude coordinate of the location.
    lon (float): Longitude coordinate of the location.

    Returns:
    str: Borough name or 'no borough' if the API request is unsuccessful.
    """
    url = f'https://findthatpostcode.uk/points/{lat},{lon}'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        borough = data['included'][0]['attributes']['cty_name']
        return borough
    else:
        return 'no borough'

In [3]:
# map borough to the bike locations
bike_locs['borough'] = bike_locs.apply(lambda row: get_borough(row['lat'], row['lon']), axis=1)
bike_locs.head(5)
# bike_locs.to_csv('/Users/tabea/Documents/UrbanMobility/data/bike_locations_boroughs.csv', header=True, index=None)

Unnamed: 0,id,name,lat,lon,terminalId,borough
0,1,"River Street , Clerkenwell",51.529163,-0.10997,1023,Islington
1,2,"Phillimore Gardens, Kensington",51.499606,-0.197574,1018,Kensington and Chelsea
2,3,"Christopher Street, Liverpool Street",51.521283,-0.084605,1012,Hackney
3,4,"St. Chad's Street, King's Cross",51.530059,-0.120973,1013,Camden
4,5,"Sedding Street, Sloane Square",51.49313,-0.156876,3420,Kensington and Chelsea


## 5. ADD BOROUGH DATA TO JOURNEY DATA

### Map the StartStation and EndStation Names to boroughs

In [152]:
# Standardize by stripping white space and converting to lower case, create dictionary for mapping
bike_locs['name'] = bike_locs['name'].str.strip().str.lower()
borough_mapping = bike_locs.set_index('name')['borough'].to_dict()

In [4]:
# mapping
merged_df['StartBorough'] = merged_df['StartStation Name'].str.strip().str.lower().map(borough_mapping)
merged_df['EndBorough'] = merged_df['EndStation Name'].str.strip().str.lower().map(borough_mapping)

merged_df.head(5)

Unnamed: 0.1,Unnamed: 0,Rental Id,End Date,EndStation Name,Start Date,StartStation Name,StartBorough,EndBorough
0,29879046,40346508,2015-04-01 00:06:00,"Ebury Bridge, Pimlico",2015-04-01 00:00:00,"Harriet Street, Knightsbridge",Kensington and Chelsea,Westminster
1,29879054,40346509,2015-04-01 00:11:00,"Regent's Row , Haggerston",2015-04-01 00:00:00,"Brushfield Street, Liverpool Street",City of London,Hackney
2,29879048,40346510,2015-04-01 00:08:00,"Foley Street, Fitzrovia",2015-04-01 00:01:00,"Tavistock Place, Bloomsbury",Camden,Westminster
3,29879161,40346511,2015-04-01 00:50:00,"Bow Church Station, Bow",2015-04-01 00:01:00,"Moor Street, Soho",Westminster,Tower Hamlets
4,29879044,40346512,2015-04-01 00:03:00,"Jubilee Street, Stepney",2015-04-01 00:01:00,"Philpot Street, Whitechapel",Tower Hamlets,Tower Hamlets


In [29]:
# still lots of missing boroughs: 3'104'758 + 3'192'763
print(merged_df.isna().sum())

Unnamed: 0                 0
Rental Id                  0
End Date                   0
EndStation Name            0
Start Date                 0
StartStation Name          0
StartBorough         3104758
EndBorough           3192763
dtype: int64


### Run fuzzy matching for empty boroughs
Matching names that refer to the same station but are slightly different in their naming. Parallel processing to improve the performance of fuzzy matching.

In [None]:
def fuzzy_match(station_name, min_score=70):
    """
    Performs fuzzy matching between a given station name and a mapping of station names to boroughs.
    
    Args:
        station_name (str): The station name to be matched.
        min_score (int): The minimum similarity score required for a match (default: 70).
    
    Returns:
        str or None: The borough corresponding to the best fuzzy match for the station name, 
                     or None if no match is found above the minimum score threshold.
    """
    if station_name is None:
        return None

    best_match = None
    best_score = 0

    for name in station_to_borough.keys():
        score = fuzz.token_sort_ratio(station_name, name)
        if score > best_score:
            best_score = score
            best_match = name

    return station_to_borough[best_match] if best_match and best_score >= min_score else None

In [48]:
station_to_borough = {row['name']: row['borough'] for _, row in bike_locs.iterrows()}
empty_boroughs = merged_df[(merged_df['StartBorough'].isna()) | (merged_df['EndBorough'].isna())]


# function to perform fuzzy matching in parallel
def parallel_fuzzy_match(column):
    return column.apply(fuzzy_match)

# split the DataFrame into chunks for parallel processing
num_parallel_tasks = 6
chunk_size = len(empty_boroughs) // num_parallel_tasks  
chunks = [empty_boroughs[i:i+chunk_size] for i in range(0, len(empty_boroughs), chunk_size)]

# update the StartBorough column, process chunks in parallel
with futures.ThreadPoolExecutor() as executor: 
    results = list(executor.map(parallel_fuzzy_match, [chunk['StartStation Name'] for chunk in chunks]))

for i, result in enumerate(results):
    chunk = chunks[i]
    chunk.loc[:, 'StartBorough'] = result

# update the EndBorough column, process chunks in parallel
with futures.ThreadPoolExecutor() as executor:  # Use ThreadPoolExecutor for threads or ProcessPoolExecutor for processes
    results = list(executor.map(parallel_fuzzy_match, [chunk['EndStation Name'] for chunk in chunks]))

for i, result in enumerate(results):
    chunk = chunks[i]
    chunk.loc[:, 'EndBorough'] = result

# replace the rows with missing borough data in the original dataframe with the processed rows
updated_empty_boroughs = pd.concat(chunks)
merged_df.update(updated_empty_boroughs)


In [49]:
# reduced number of missing values by 3.
print(merged_df.isna().sum())

Unnamed: 0                 0
Rental Id                  0
End Date                   0
EndStation Name            0
Start Date                 0
StartStation Name          0
StartBorough         1082424
EndBorough           1145911
dtype: int64


### Adding boroughs to former stations

After investigating the missing borough data in the current samplers, it was discovered that these samplers correspond to former stations that are no longer in use and are not listed in the provided BikePoints file, where the station names are given in the format "street name, region" (e.g., "London Fields, Hackney Central").

To address this a dictionary with the region information as key and the borough with the maximum counts as value is created. This is then used to map the missing boroughs in the merged_df DataFrame based on the extracted location information.

In [72]:
# add a 'location' column to bike_locs
bike_locs['location'] = bike_locs['name'].str.split(',').str[1].str.strip()

# group by 'location' and get the borough with the maximum counts
location_borough = bike_locs.groupby('location')['borough'].agg(lambda x: x.value_counts().index[0])

# convert the Series to a dictionary
location_borough_dict = location_borough.to_dict()

In [79]:
def get_borough_from_dict(name):
    """
    Retrieves the borough from the 'location_borough_dict' dictionary based on the given station name.
    
    Args:
        name (str): Station name in the format 'street name, region'.
    
    Returns:
        str or None: The corresponding borough based on the region, or None if the borough is unavailable.
    """
    parts = name.split(',')
    if len(parts) > 1:
        return location_borough_dict.get(parts[1].strip(), None)
    else:
        return None

In [80]:
merged_df.loc[merged_df['StartBorough'].isna(), 'StartBorough'] = merged_df.loc[merged_df['StartBorough'].isna(), 'StartStation Name'].apply(get_borough_from_dict)
merged_df.loc[merged_df['EndBorough'].isna(), 'EndBorough'] = merged_df.loc[merged_df['EndBorough'].isna(), 'EndStation Name'].apply(get_borough_from_dict)

### Map manually and drop test/workshop stations

In [92]:
unique_empty_start_boroughs = merged_df.loc[merged_df['StartBorough'].isna(), 'StartStation Name'].unique()
unique_empty_end_boroughs = merged_df.loc[merged_df['EndBorough'].isna(), 'EndStation Name'].unique()

for name in unique_empty_boroughs:
    print(name)

Abingdon Green, Great College Street
Allington street, Off Victoria Street, Westminster
Columbia Road, Weavers
Contact Centre, Southbury House
Electrical Workshop PS
Hansard Mews, Shepherds Bush
Import Dock
LSP1
LSP2
Mechanical Workshop Clapham
Mechanical Workshop Penton
Monier Road
Monier Road, Newham
One London
Oval Way, Lambeth
PENTON STREET COMMS TEST TERMINAL _ CONTACT MATT McNULTY
Pop Up Dock 1
Pop Up Dock 2
Victoria and Albert Museum, Cromwell Road
Worship Street, Hackney
York Way, Camden


In [95]:
# fill values manually

merged_df.loc[merged_df['StartStation Name'] == 'Hansard Mews, Shepherds Bush', 'StartBorough'] = 'Hammersmith and Fulham'
merged_df.loc[merged_df['StartStation Name'] == 'Columbia Road, Weavers', 'StartBorough'] = 'Tower Hamlets'
merged_df.loc[merged_df['StartStation Name'] == 'Abingdon Green, Great College Street', 'StartBorough'] = 'Westminster'
merged_df.loc[merged_df['StartStation Name'] == 'Oval Way, Lambeth', 'StartBorough'] = 'Lambeth'
merged_df.loc[merged_df['StartStation Name'] == 'Contact Centre, Southbury House', 'StartBorough'] = 'Enfield'
merged_df.loc[merged_df['StartStation Name'] == 'Monier Road', 'StartBorough'] = 'Newham'
merged_df.loc[merged_df['StartStation Name'] == 'Victoria and Albert Museum, Cromwell Road', 'StartBorough'] = 'Kensington and Chelsea'
merged_df.loc[merged_df['StartStation Name'] == 'Monier Road, Newham', 'StartBorough'] = 'Newham'
merged_df.loc[merged_df['StartStation Name'] == 'Allington street, Off Victoria Street, Westminster', 'StartBorough'] = 'Westminster'
merged_df.loc[merged_df['StartStation Name'] == 'Worship Street, Hackney', 'StartBorough'] = 'Hackney'
merged_df.loc[merged_df['StartStation Name'] == 'York Way, Camden', 'StartBorough'] = 'Camden'
merged_df.loc[merged_df['StartStation Name'] == 'Monier Road', 'StartBorough'] = 'Hackney'

merged_df.loc[merged_df['EndStation Name'] == 'Hansard Mews, Shepherds Bush', 'EndBorough'] = 'Hammersmith and Fulham'
merged_df.loc[merged_df['EndStation Name'] == 'Columbia Road, Weavers', 'EndBorough'] = 'Tower Hamlets'
merged_df.loc[merged_df['EndStation Name'] == 'Abingdon Green, Great College Street', 'EndBorough'] = 'Westminster'
merged_df.loc[merged_df['EndStation Name'] == 'Oval Way, Lambeth', 'EndBorough'] = 'Lambeth'
merged_df.loc[merged_df['EndStation Name'] == 'Contact Centre, Southbury House', 'EndBorough'] = 'Enfield'
merged_df.loc[merged_df['EndStation Name'] == 'Monier Road', 'EndBorough'] = 'Newham'
merged_df.loc[merged_df['EndStation Name'] == 'Victoria and Albert Museum, Cromwell Road', 'EndBorough'] = 'Kensington and Chelsea'
merged_df.loc[merged_df['EndStation Name'] == 'Monier Road, Newham', 'EndBorough'] = 'Newham'
merged_df.loc[merged_df['EndStation Name'] == 'Allington street, Off Victoria Street, Westminster', 'EndBorough'] = 'Westminster'
merged_df.loc[merged_df['EndStation Name'] == 'Worship Street, Hackney', 'EndBorough'] = 'Hackney'
merged_df.loc[merged_df['EndStation Name'] == 'York Way, Camden', 'EndBorough'] = 'Camden'
merged_df.loc[merged_df['EndStation Name'] == 'Monier Road', 'EndBorough'] = 'Hackney'

In [98]:
# drop test and workshop stations

merged_df = merged_df.dropna(subset=['StartBorough', 'EndBorough'])
print(merged_df.isna().sum())

Unnamed: 0           0
Rental Id            0
End Date             0
EndStation Name      0
Start Date           0
StartStation Name    0
StartBorough         0
EndBorough           0
dtype: int64


# 6. Save cleaned data

In [99]:
# all data (2015 - 20122)
merged_df.to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_cleaned_with_boroughs_no_nans.csv')

In [100]:
# by year

groups = merged_df.groupby(pd.Grouper(key='Start Date', freq='Y'))
yearly_dfs = {}
for year, group in groups:
    yearly_dfs[year.year] = group.reset_index(drop=True)
    
yearly_dfs[2015].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2015_cleaned.csv')
yearly_dfs[2016].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2016_cleaned.csv')
yearly_dfs[2017].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2017_cleaned.csv')
yearly_dfs[2018].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2018_cleaned.csv')
yearly_dfs[2019].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2019_cleaned.csv')
yearly_dfs[2020].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2020_cleaned.csv')
yearly_dfs[2021].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2021_cleaned.csv')
yearly_dfs[2022].to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_2022_cleaned.csv')