# 1. JOURNEY DATA
link: https://cycling.data.tfl.gov.uk

In [None]:
import pandas as pd
import requests
import io
import urllib
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import datetime

## a. import journey cycle data

In [None]:
# Define a function to rename columns

def rename_columns(df, *, dayfirst=True):
    """Normalise the column names and dtypes of one journey extract.

    The extracts use several header spellings for the same field (for
    example 'End Station Id' and 'End station number').  This maps every
    known variant to one canonical name and then converts the canonical id
    columns to numbers and the canonical date columns to datetimes.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw extract as returned by ``read_csv`` / ``read_excel``.  It is not
        modified; a renamed copy is returned.
    dayfirst : bool, default True
        Forwarded to ``pandas.to_datetime``.  With ``True`` an ambiguous
        value such as ``03/02/2017`` is read as 3 February.
        NOTE(review): the extracts appear to use day-first timestamps
        (dd/mm/yyyy) - confirm against the raw files.

    Returns
    -------
    pandas.DataFrame
        Frame with canonical column names.  Id values that cannot be parsed
        become NaN (``errors='coerce'``); unparseable dates raise.
    """
    column_names = {
        'End Station Id': 'EndStation Id',
        'End station number': 'EndStation Id',
        'Start Station Id': 'StartStation Id',
        'Start station number': 'StartStation Id',
        'End Station Name': 'EndStation Name',
        'End station': 'EndStation Name',
        'Start Station Name': 'StartStation Name',
        'Start station': 'StartStation Name',
        'Start date': 'Start Date',
        'End date': 'End Date',
        'Number': 'Rental Id',
    }
    id_columns = ('EndStation Id', 'StartStation Id', 'Rental Id')
    date_columns = ('Start Date', 'End Date')

    # Rename everything in a single call: renaming inside a loop copies the
    # whole frame once per matched header.
    present = {old: new for old, new in column_names.items() if old in df.columns}
    df = df.rename(columns=present)

    # Convert by canonical name so that files which already use the
    # canonical headers end up with the same dtypes as renamed ones.
    for col in id_columns:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce', downcast='integer')

    for col in date_columns:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], dayfirst=dayfirst)

    return df


# read the list of file names (no header row, names in a single column).
# The `squeeze=True` keyword of read_csv was removed in pandas 2.0, so the
# one-column frame is squeezed to a Series explicitly instead.
filenames = pd.read_csv('/Users/tabea/Documents/UrbanMobility/filenames-data.csv', header=None).squeeze('columns')

# combine base-url and (percent-encoded) filenames
base_url = 'http://cycling.data.tfl.gov.uk/usage-stats/'
url_list = [base_url + urllib.parse.quote(x) for x in filenames]
unused_cols = ['Total duration (ms)', 'Total duration', 'Duration', 'Duration_Seconds', 'Bike Id', 'Bike number', 'Bike model']

# loop over the urls and extract the data
temp_dfs = []
for url in url_list:
    # NOTE(review): verify=False disables TLS certificate checks; kept as in
    # the original - confirm it is still required.
    response = requests.get(url, verify=False, timeout=(3, 7))
    # fail loudly on 4xx/5xx instead of parsing an error page as data
    response.raise_for_status()

    lowered_url = url.lower()
    if lowered_url.endswith('.csv'):
        temp_df = pd.read_csv(io.StringIO(response.content.decode('utf-8')), usecols=lambda col: col not in unused_cols)

    elif lowered_url.endswith('.xlsx'):
        temp_df = pd.read_excel(io.BytesIO(response.content), usecols=lambda col: col not in unused_cols)

    else:
        # Without this branch the previous iteration's frame would be
        # appended a second time (or a NameError raised on the first file).
        print("skipping unsupported file type:", url)
        continue

    temp_dfs.append(rename_columns(temp_df))

# concatenate all per-file frames into one
merged_df = pd.concat(temp_dfs, ignore_index=True)


In [None]:
# total number of rows after merging all files (84'188'068 when this note was written)
len(merged_df)

In [None]:
# persist the merged, not yet cleaned journeys; no `index` argument is given, so the row index is written too
merged_df.to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_raw.csv')

In [None]:
# preview the first 50 merged rows
merged_df.head(50)

## b. clean data

In [None]:
# row count before any cleaning step, for comparison with the count printed after cleaning
print("length before cleaning:", len(merged_df))

### investigate NaN values

In [None]:
# Rental Id: no NaN values, the files only differ in header naming

nan_rows_rental = merged_df.loc[merged_df["Rental Id"].isna()]
print("count rental nans: ", nan_rows_rental.shape[0])

In [None]:
# Start Date, StartStation Id and StartStation Name: no NaN values, only differing header names

# a row is selected when at least one of the three start fields is missing
nan_rows_start = merged_df.loc[
    merged_df[["StartStation Name", "StartStation Id", "Start Date"]].isna().any(axis=1)
]
print("count start nans: ", nan_rows_start.shape[0])

In [None]:
# End Date, EndStation Id and EndStation Name: 4536 rows with NaN
# example: id:63097949, bike-id:1628, start-date:15.03.17 00:13, start-station-id:274, start-station-name: Warwick Road, Olympia

# a row is selected when at least one of the three end fields is missing
nan_rows_end = merged_df.loc[
    merged_df[["EndStation Name", "EndStation Id", "End Date"]].isna().any(axis=1)
]
print(nan_rows_end.head())
print("count: ", nan_rows_end.shape[0])

### investigate duplicates

In [None]:
# Some files are listed (and therefore read) twice under slightly different names,
# e.g. "01b Journey Data Extract 24Jan16-06Feb16.csv" and "01bJourneyDataExtract24Jan16-06Feb16.csv"

# keep=False flags every member of a duplicate group, not only the repeats
duplicates = merged_df.loc[merged_df.duplicated(keep=False)]
print(duplicates.shape[0])
duplicates.to_csv('/Users/tabea/Documents/UrbanMobility/data/duplicates.csv')

In [None]:
# investigate duplicates by date: 14 dates in 2016 with duplicates

# `duplicates` is a boolean-filtered selection of merged_df; assigning a
# column into it in place triggers SettingWithCopyWarning, so build a new
# frame with the converted column instead.
# NOTE(review): dayfirst=True assumes the string timestamps are dd/mm/yyyy
# (cf. the 15.03.17 example in the NaN investigation) - confirm.
duplicates = duplicates.assign(
    **{"Start Date": pd.to_datetime(duplicates["Start Date"], dayfirst=True)}
)
print(duplicates["Start Date"].dt.date.unique())

### drop nan

In [None]:
# drop journeys that lack any end information (station id, date or name);
# inplace=True modifies merged_df itself instead of returning a new frame

merged_df.dropna(axis=0, subset=["EndStation Id", "End Date", "EndStation Name"], inplace=True)
print(merged_df.shape)
# remaining NaN count per column
print(merged_df.isna().sum())

### drop duplicates

In [None]:
# drop rows that are exact duplicates across all columns, in place

merged_df.drop_duplicates(inplace=True)
print(merged_df.shape)

In [None]:
# sanity check: after dropping exact duplicate rows no Rental Id should occur twice (none found)

duplicates_rental_id = merged_df.loc[merged_df.duplicated(subset="Rental Id", keep=False)]
print(duplicates_rental_id.shape[0])

### drop unused cols

In [None]:
# keep the first seven columns and drop everything from position 7 onwards (by label)
merged_df.drop(columns=merged_df.columns[7:], inplace=True)
merged_df.head()

### change dtypes

In [None]:
# dtypes before conversion
print(merged_df.dtypes)

# rows with missing end information were dropped above, so the id columns can be cast to int
merged_df["EndStation Id"] = merged_df["EndStation Id"].astype(int)
merged_df["Rental Id"] = merged_df["Rental Id"].astype(int)
# Without dayfirst pandas reads an ambiguous string such as 03/02/2017 as
# 2 March; values that are already datetimes are unaffected.
# NOTE(review): assumes the string timestamps are dd/mm/yyyy (cf. the
# 15.03.17 example in the NaN investigation) - confirm.
merged_df["Start Date"] = pd.to_datetime(merged_df["Start Date"], dayfirst=True)

# dtypes after conversion
print(merged_df.dtypes)

In [None]:
# row count after dropping NaN rows and duplicates
print("length after cleaning:", len(merged_df))

In [None]:
# persist the cleaned journeys; no `index` argument is given, so the row index is written too
merged_df.to_csv('/Users/tabea/Documents/UrbanMobility/data/journey_data_cleaned.csv')

## c. split data by year

In [None]:
# Bucket the journeys into calendar years based on their start timestamp
groups = merged_df.groupby(pd.Grouper(key='Start Date', freq='Y'))

# One DataFrame per year, keyed by the year number, each with a fresh 0..n-1 index
yearly_dfs = {
    period_end.year: year_frame.reset_index(drop=True)
    for period_end, year_frame in groups
}

In [None]:
# bar chart: number of journeys in each yearly dataframe

years = [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
length_of_dfs = [len(yearly_dfs[y]) for y in years]

sns.set_style("whitegrid")
sns.set_palette("Blues")

fig, ax = plt.subplots(figsize=(8,6))
sns.barplot(x=years, y=length_of_dfs, ax=ax, color="royalblue")
ax.set_xlabel("Year")
ax.set_ylabel("Total Entries")
# thousands separators on the y axis
ax.yaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{int(x):,}'))

# write the exact count above each bar
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width()/2.,
        bar_height,
        f"{int(bar_height)}",
        fontsize=12,
        color='black',
        ha='center',
        va='bottom',
    )

plt.show()

### save data as csv

In [None]:
# write one CSV per year, 2015 through 2022
for yr in range(2015, 2023):
    yearly_dfs[yr].to_csv(f'/Users/tabea/Documents/UrbanMobility/data/journey_data_{yr}.csv')

# 2. BIKE STATION LOCATIONS

In [None]:
import requests
from xml.etree import ElementTree as ET
import pandas as pd

# download the cycle-hire station feed (XML) and extract one record per station
base = "https://tfl.gov.uk/tfl/syndication/feeds/cycle-hire/livecyclehireupdates.xml"
# same (connect, read) timeout as the journey download, so a stalled
# connection cannot hang the notebook
response = requests.get(base, timeout=(3, 7))
# fail loudly on 4xx/5xx instead of feeding an error page to the XML parser
response.raise_for_status()
root = ET.fromstring(response.content)

data = []
for station in root:
    # NOTE(review): fields are read by child position; this silently picks
    # the wrong value if the feed ever adds or reorders elements - verify
    # the positions against the current feed.
    station_data = {
        "id": int(station[0].text),
        "name": station[1].text,
        "lat": float(station[3].text),
        "lon": float(station[4].text),
        "capacity": int(station[12].text)
    }
    data.append(station_data)

bike_locs = pd.DataFrame(data)

# write without the row index
bike_locs.to_csv('/Users/tabea/Documents/UrbanMobility/data/bike_locations.csv', header=True, index=False)

print(bike_locs.shape)
bike_locs.head(10)

### visualize the bike locations

In [None]:
import folium
from folium.plugins import HeatMap

# create a map centered on London
london_coords = (51.5074, -0.1278)
m = folium.Map(location=london_coords, zoom_start=12, tiles='Stamen Toner')

# add markers for each bike station location
for index, row in bike_locs.iterrows():
    popup_text = f"{row['name']} (capacity: {row['capacity']})"
    marker = folium.Marker(location=(row['lat'], row['lon']), popup=popup_text)
    marker.add_to(m)

# add a heatmap layer.
# folium.FeatureGroup takes a layer *name* as its first argument, so passing
# the coordinates to it only creates an empty group; HeatMap is the plugin
# that actually renders [lat, lon] points as a heatmap.
heat_data = bike_locs[['lat', 'lon']].values.tolist()
heatmap = HeatMap(heat_data)
heatmap.add_to(m)

# save the map as an HTML file
m.save('map_bike_loc.html')

In [None]:
from IPython.display import IFrame

# embed the saved map HTML file in the notebook output
IFrame('map_bike_loc.html', width=900, height=500)

# 3. ADD LONDON BOROUGH DATA

In [None]:
# TODO: get london borough data