# Fetch and Clean Journey Bike Data

In [2]:
# import libraries
import pandas as pd
from datetime import datetime

# import python modules
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('/Users/tabea/Documents/UrbanMobility/src')
from data import journey_data_preprocessing as preprocess

## 1. FETCH BIKE JOURNEY DATA

### 1.1 Fetch Data

The data we need cannot be easily scraped due to its dynamic nature. Instead, we employ a different strategy. The filenames for the required journey data files have been manually compiled and stored in a CSV file, which serves as a reference for fetching data. These filenames point to the specific datasets hosted at https://cycling.data.tfl.gov.uk. By iterating over the list of filenames, we can programmatically send requests to retrieve each file's data.

In [None]:
# Load the list of journey data filenames to be fetched.
# The CSV has no header row; the single column is squeezed into a Series.
# (`read_csv(..., squeeze=True)` was removed in pandas 2.0, so the frame is
# squeezed explicitly, which gives the same result on older versions too.)
filenames = pd.read_csv('../data/raw/filenames-data.csv', header=None).squeeze('columns')

# Fetch every listed file through the project helper.
journey_data_df = preprocess.fetch_journey_data(filenames)

print("total amount of entries fetched:", len(journey_data_df))

84188068

### 1.2 Save Raw Data

In [None]:
# Persist the fetched data so later sessions can start from the file.
# index=False keeps the row index out of the CSV; written as-is it comes back
# as an additional 'Unnamed: 0' column every time the file is re-read.
journey_data_df.to_csv('../data/raw/journey_data_raw.csv', index=False)

# Preview goes last: only the final expression of a cell is rendered.
journey_data_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Rental Id,End Date,EndStation Id,EndStation Name,Start Date,StartStation Id,StartStation Name,Unnamed: 9,Unnamed: 10,Unnamed: 11,EndStation Logical Terminal,endStationPriority_id,StartStation Logical Terminal
0,0,0,63097899.0,2017-03-15 00:06:00,631.0,"Battersea Park Road, Nine Elms",2017-03-15 00:00:00,74.0,"Vauxhall Cross, Vauxhall",,,,,,
1,1,1,63097900.0,2017-03-15 00:05:00,397.0,"Devonshire Terrace, Bayswater",2017-03-15 00:01:00,410.0,"Edgware Road Station, Marylebone",,,,,,
2,2,2,63097901.0,2017-03-15 00:06:00,426.0,"Vincent Street, Pimlico",2017-03-15 00:01:00,177.0,"Ashley Place, Victoria",,,,,,
3,3,3,63097902.0,2017-03-15 00:12:00,462.0,"Bonny Street, Camden Town",2017-03-15 00:01:00,22.0,"Northington Street , Holborn",,,,,,
4,4,4,63097903.0,2017-03-15 00:05:00,423.0,"Eaton Square (South), Belgravia",2017-03-15 00:01:00,143.0,"Pont Street, Knightsbridge",,,,,,


## 2. CLEAN BIKE JOURNEY DATA

In [4]:
# Entry point for cleaning when the fetch step was already run earlier:
# read the previously saved raw journeys back from disk.
journey_data_df = pd.read_csv(
    filepath_or_buffer='../data/raw/journey_data_raw.csv',
)

In [5]:
# baseline row count, taken before any cleaning step is applied
rows_before_cleaning = len(journey_data_df)
print("length before cleaning:", rows_before_cleaning)

length before cleaning: 84188068


### 2.1 Standardise Column Names
Merge columns with different namings, change datatypes and drop columns starting with 'Unnamed'

In [6]:
# Hand the frame to the project's column-standardisation helper and keep the
# returned frame under the same name.
journey_data_df = preprocess.standardize_columns(
    journey_data_df,
)

# show the first five rows of the result
journey_data_df.head(n=5)

Unnamed: 0,rental_id,end_date,end_station_id,end_station_name,start_date,start_station_id,start_station_name,EndStation Logical Terminal,endStationPriority_id,StartStation Logical Terminal
0,63097899.0,2017-03-15 00:06:00,631.0,"Battersea Park Road, Nine Elms",2017-03-15 00:00:00,74.0,"Vauxhall Cross, Vauxhall",,,
1,63097900.0,2017-03-15 00:05:00,397.0,"Devonshire Terrace, Bayswater",2017-03-15 00:01:00,410.0,"Edgware Road Station, Marylebone",,,
2,63097901.0,2017-03-15 00:06:00,426.0,"Vincent Street, Pimlico",2017-03-15 00:01:00,177.0,"Ashley Place, Victoria",,,
3,63097902.0,2017-03-15 00:12:00,462.0,"Bonny Street, Camden Town",2017-03-15 00:01:00,22.0,"Northington Street , Holborn",,,
4,63097903.0,2017-03-15 00:05:00,423.0,"Eaton Square (South), Belgravia",2017-03-15 00:01:00,143.0,"Pont Street, Knightsbridge",,,


### 2.2 Drop Duplicates and NaN-Only
Some files have the same or overlapping content but different names, e.g. `01b Journey Data Extract 24Jan16-06Feb16.csv` == `01bJourneyDataExtract24Jan16-06Feb16.csv`.

In [7]:
# De-duplicate through the project helper (see src/data for what exactly it
# removes), then report how many rows are left.
journey_data_df = preprocess.drop_duplicates(
    journey_data_df,
)

rows_after_dedup = len(journey_data_df)
print("current length of df: ", rows_after_dedup)

current length of df:  83895356


### 2.3 Investigate and Handle NaN Values

In [8]:
# number of missing values per column
nan_counts = journey_data_df.isna().sum()
print(nan_counts)

rental_id                               0
end_date                           170358
end_station_id                     715522
end_station_name                   171824
start_date                              0
start_station_id                   234440
start_station_name                      0
EndStation Logical Terminal      83665717
endStationPriority_id            83665717
StartStation Logical Terminal    83662856
dtype: int64


#### NaN Values: start_station_name & end_station_name

start_station_name: no NaN values

end_station_name: only NaN if end_station_id is also NaN -> they can't be mapped, so they must be removed.

In [9]:
# Remove rows in which the end station id AND the end station name are both
# missing (how='all'); rows with only one of the two missing are kept.
end_station_columns = ['end_station_id', 'end_station_name']
journey_data_df = journey_data_df.dropna(
    how='all',
    subset=end_station_columns,
)

#### NaN Values: start_date & end_date

start_date: no NaN values

end_date: only a few NaN values -> these entries must be removed

In [10]:
# remove every row that has no end date
end_date_column = ['end_date']
journey_data_df = journey_data_df.dropna(
    subset=end_date_column,
)

#### NaN Values: start_station_id & end_station_id

Numerous NaN values are observed in the 'start_station_id' and 'end_station_id' columns. The primary cause: bike rides extending beyond a single calendar day. For these instances, stations are referred to as terminal stations, each carrying a unique ID set with higher numbers (>852).

Because station IDs and terminal IDs are mixed and many values are NaN, the ID columns are dropped and the station name is used as the identifier instead.

In [11]:
# Only 852 stations are present in the data, but terminal station IDs with
# higher values are mixed into the ID columns as well.
# Station reference: https://api.tfl.gov.uk/BikePoint/

# flag a row when either its start or its end ID lies above the station range
start_above_852 = journey_data_df['start_station_id'] > 852
end_above_852 = journey_data_df['end_station_id'] > 852
greater_than_852 = start_above_852 | end_above_852

print(
    "count of terminal station ID instaed of normal ID: ",
    greater_than_852.sum(),
)

count of terminal station ID instaed of normal ID:  2788522


In [12]:
# Remove every ID / terminal column from the frame.
id_columns = [
    'start_station_id',
    'end_station_id',
    'EndStation Logical Terminal',
    'endStationPriority_id',
    'StartStation Logical Terminal',
]
journey_data_df = journey_data_df.drop(columns=id_columns)

In [13]:
# re-check the missing values per column after the NaN handling above
remaining_nan_counts = journey_data_df.isna().sum()
print(remaining_nan_counts)

rental_id             0
end_date              0
end_station_name      0
start_date            0
start_station_name    0
dtype: int64


### 2.4 Filter Data by Date

In [14]:
# Bounds of the date window handed to the project's date filter.
# NOTE(review): end_date is 2019-12-31 00:00:00 — whether journeys later on
# 31 Dec are kept depends on how preprocess.filter_date compares against the
# upper bound; confirm against the helper.
start_date = datetime(year=2019, month=1, day=1)
end_date = datetime(year=2019, month=12, day=31)

journey_data_df = preprocess.filter_date(
    journey_data_df,
    start_date,
    end_date,
)

### 2.5 Save Cleaned Data

In [15]:

# Write the cleaned data to the interim folder.
# NOTE(review): the row index is written as an unnamed first column; confirm
# what downstream readers expect before switching to index=False.
journey_data_df.to_csv('../data/interim/journey_data_cleaned.csv')

# Preview goes last: only the final expression of a cell is rendered.
journey_data_df.head()