In [1]:
import sys
!{sys.executable} -m pip install haversine
import haversine 
from sklearn.neighbors import DistanceMetric
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
plt.style.use("dark_background")
plt.rcParams['figure.dpi'] = 150
from nltk import flatten



We read the santander_locations document, which contains the location information of bike stations across London.

In [4]:
station_data = pd.read_csv("../Data/santander_locations.csv")

In [5]:
station_data.head(10)

Unnamed: 0,Station.Id,StationName,longitude,latitude,Easting,Northing
0,1,"River Street, Clerkenwell",-0.109971,51.5292,531202.52,182832.02
1,2,"Phillimore Gardens, Kensington",-0.197574,51.4996,525207.07,179391.86
2,3,"Christopher Street, Liverpool Street",-0.084606,51.5213,532984.81,182001.53
3,4,"St. Chad's Street, King's Cross",-0.120974,51.5301,530436.76,182911.99
4,5,"Sedding Street, Sloane Square",-0.156876,51.4931,528051.649,178742.097
5,6,"Broadcasting House, Marylebone",-0.144229,51.5181,528857.44,181542.87
6,7,"Charlbert Street, St. John's Wood",-0.168074,51.5343,527158.01,183300.75
7,8,"Maida Vale, Maida Vale",-0.183486,51.529857,526102.0,182780.0
8,9,"New Globe Walk, Bankside",-0.096441,51.5074,532203.97,180434.55
9,10,"Park Street, Bankside",-0.092754,51.506,532463.89,180284.3


In [6]:
len(station_data)

808

The number of stations is 808.

In [7]:
station_data['Station.Id'].to_numpy()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  36,  37,  38,  39,  40,
        41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104, 105,
       106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
       119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
       132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
       146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158,
       159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171,
       172, 173, 174, 175, 176, 177, 178, 180, 181, 182, 183, 18

The leftmost column in bold gives the index of the table. It is not the same as Station ID, which is given by the column named 'Station.Id'. But we could always access entries by Station ID, or by the table ID.

In [8]:
#Access by Station ID
station_data[station_data['Station.Id'] == 268].index

Int64Index([261], dtype='int64')

In [9]:
#Access by Table ID
station_data.iloc[261]

Station.Id                         268
StationName    Belgrave Road, Victoria
longitude                    -0.144133
latitude                       51.4932
Easting                      528934.35
Northing                     178772.58
Name: 261, dtype: object

For the station Belgrave Road, Victoria, its station ID is 268, but its table ID is 261.

In [10]:
#Access by Station ID
station_data[station_data['StationName'] == "Boston Place, Marylebone"].index

Int64Index([43], dtype='int64')

In [11]:
#Access by Table ID
station_data.iloc[766]

Station.Id                                 798
StationName    Birkenhead Street, King's Cross
longitude                            -0.122299
latitude                               51.5302
Easting                               530345.0
Northing                              182925.0
Name: 766, dtype: object

Below gives a collection of bike stations near National Railway stations.

| Station names | Station ID | DF Index | 
| --- | --- | --- |
| Belgrave Road, Victoria | 268 | 261 |
| Waterloo Station 1, Waterloo | 374 | 362 |
| Waterloo Station 2, Waterloo | 261 | 350 |
| Waterloo Station 3, Waterloo | 154 | 151 |
| South Wharf Road, Paddington | 186 | 182 |
| Birkenhead Street, King's Cross | 798 | 766 |
| Snowsfields, London Bridge | 706 | 675 |
| Boston Place, Marylebone | 45 | 43 |

We now do some processing of the data.

In [13]:
max(station_data['Station.Id']), len(np.unique(station_data['Station.Id']))
station_ID = list(station_data['Station.Id'])
row_no = list(range(len(station_data['Station.Id'])))
row_ID_dict = dict(zip(station_ID, row_no))

In [14]:
class OptimizationError(RuntimeError):
    """Called when optimizer does not converge."""
    pass

class StationIdError(IndexError):
    """Called when we try and read a non-existing station id."""
    pass

We write a function to obtain station name corresponding to a given station ID number.

In [15]:
def get_station_name(in_id):
    """Get station name from bike_data for a given id."""
    try:
        return station_data[station_data["Station.Id"] == in_id].StationName.iloc[0]
    except IndexError:
        StationIdError("No station matching input ID")

In [16]:
get_station_name(830)

'Sidney Road, Stockwell'

We get the longitude and latitude data in radians from the data.

In [17]:
from math import radians
station_coord = station_data[["longitude","latitude"]]
station_coord['longitude_radians'] = station_coord['longitude'].apply(lambda x: radians(x)) 
station_coord['latitude_radians'] = station_coord['latitude'].apply(lambda x: radians(x)) 
station_coord = station_coord[['latitude_radians','longitude_radians']].to_numpy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_coord['longitude_radians'] = station_coord['longitude'].apply(lambda x: radians(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  station_coord['latitude_radians'] = station_coord['latitude'].apply(lambda x: radians(x))


In [18]:
station_coord

array([[ 8.99354201e-01, -1.91935603e-03],
       [ 8.98837583e-01, -3.44831682e-03],
       [ 8.99216320e-01, -1.47664803e-03],
       ...,
       [ 8.99038384e-01, -1.89027885e-03],
       [ 8.98821928e-01, -8.56206171e-04],
       [ 8.98811351e-01, -1.10744632e-03]])

And we find the geodesic distance between each station, using the haversine formula.

In [19]:
dist = DistanceMetric.get_metric('haversine')
geo_dist = dist.pairwise(station_coord) * 6365.079
geo_dist_df = pd.DataFrame(geo_dist)
geo_dist_df.iloc[1,2]



8.174215894780051

We save the geodesic distance between each pair of stations into an array.

In [20]:
geo_dist

array([[ 0.        ,  6.891488  ,  1.9605899 , ...,  2.01349835,
         5.4049247 ,  4.72041716],
       [ 6.891488  ,  0.        ,  8.17421589, ...,  6.30370511,
        10.271535  ,  9.27709531],
       [ 1.9605899 ,  8.17421589,  0.        , ...,  1.99173004,
         3.51324925,  2.96369879],
       ...,
       [ 2.01349835,  6.30370511,  1.99173004, ...,  0.        ,
         4.32238187,  3.42167331],
       [ 5.4049247 , 10.271535  ,  3.51324925, ...,  4.32238187,
         0.        ,  0.99781121],
       [ 4.72041716,  9.27709531,  2.96369879, ...,  3.42167331,
         0.99781121,  0.        ]])

We save the geodesic dataframe into csv.

In [21]:
geo_dist_df.to_csv('../Data/geodesic.csv')

We read the bike data folder, create a dictionary, store the file name as key and the data frame inside the file as value.

In [23]:
path = '../Data/santander_summaries'
data_files = os.listdir(path)
n_weeks = len(data_files)
bike_data = {file: pd.read_csv(path + '/' + file,
                                     names=["start_loc", "end_loc",
                                            "start_t", "duration"])
             for file in data_files}        

print(list(bike_data.values())[0].head())

   start_loc  end_loc   start_t  duration
0         47       56  55728000      1500
1        762      600  55728000       540
2        588      698  55728060       420
3        456      456  55728120     12240
4         67       67  55728120       660


Then we sort the data according to start time.

In [24]:
bike_keys = sorted(bike_data)
sorted_bike_data = {i: bike_data[i] for i in bike_keys}
 
print(sorted_bike_data)

{'221_01Jul2020-07Jul2020.csv':         start_loc  end_loc   start_t  duration
0             103       37  47260920       360
1              39      539  47260920       120
2             785      785  47260920       300
3             341      159  47260980      1800
4             708      573  47260980      1080
...           ...      ...       ...       ...
248194        270      272  47864700       480
248195        117      152  47864760       480
248196        171      155  47864760       180
248197         83      826  47864820       540
248198        154      173  47864940       300

[248199 rows x 4 columns], '222_08Jul2020-14Jul2020.csv':         start_loc  end_loc   start_t  duration
0             517      531  47865600       180
1             532      476  47865600       780
2             797      201  47865660       600
3             779      508  47865660       960
4             797      312  47865660       900
...           ...      ...       ...       ...
254142        16

We map the row number to each location index in station_data, and add the distance column for each journey.

In [25]:
bike_loc = dict(sorted_bike_data)

for names in bike_loc.keys():
    endt = np.zeros(bike_loc[names].shape[0], dtype = 'float64')
    bike_loc[names] = bike_loc[names].to_numpy()
    startt = bike_loc[names][:, 2].astype('float64')
    dura = bike_loc[names][:, 3].astype('float64')
    bike_loc[names] = bike_loc[names][:, 0:2]
        
    start_noise = np.random.uniform(0.0, 1.0, bike_loc[names].shape[0])
    startt = startt / 60 + start_noise
    duration_noise = np.random.uniform(0.0, 1.0, bike_loc[names].shape[0])
    dura = dura / 60 + duration_noise
    endt = startt + dura
    
    dist = np.zeros((bike_loc[names].shape[0], 1), dtype = geo_dist.dtype)
    for r in range(bike_loc[names].shape[0]):
            dist[r] = geo_dist[row_ID_dict[bike_loc[names][r, 0]], row_ID_dict[bike_loc[names][r, 1]]]
    
    dist = pd.DataFrame(dist)
    startt = pd.DataFrame(startt)
    endt = pd.DataFrame(endt)
    dura = pd.DataFrame(dura)
    bike_loc[names] = pd.DataFrame(bike_loc[names], columns = ["start_loc", "end_loc"])
    
    bike_loc[names] = bike_loc[names].assign(distance = dist)
    bike_loc[names] = bike_loc[names].assign(start_t = startt)
    bike_loc[names] = bike_loc[names].assign(end_t = endt)
    bike_loc[names] = bike_loc[names].assign(duration = dura)
    
print(list(bike_loc.values())[0].head(10))

   start_loc  end_loc  distance        start_t          end_t   duration
0        103       37  1.455347  787682.551083  787688.795509   6.244425
1         39      539  0.544400  787682.077241  787685.010834   2.933593
2        785      785  0.000000  787682.560881  787687.600020   5.039139
3        341      159  1.090538  787683.229835  787713.351038  30.121203
4        708      573  3.599904  787683.523229  787701.735893  18.212663
5        366      138  0.463223  787684.637816  787691.852334   7.214519
6         66      832  6.017458  787684.565727  787712.513200  27.947473
7        225      384  1.459768  787685.867103  787693.156269   7.289166
8        729      207  4.568847  787686.123903  787706.757129  20.633226
9        510      763  1.334991  787686.698751  787692.041734   5.342983


We repeat the above procedure, but normalise the data.

In [26]:
bike_loc = dict(sorted_bike_data)

for names in bike_loc.keys():
    bike_loc[names] = bike_loc[names].to_numpy()
    startt = bike_loc[names][:, 2].astype('float64')
    dura = bike_loc[names][:, 3].astype('float64')
    endt = startt + dura
    bike_loc[names] = bike_loc[names][:, 0:2]
    t_min = np.floor(47260920/(60*60*24))*60*60*24
        
    start_noise = np.random.uniform(0.0, 1.0, bike_loc[names].shape[0])
    startt = (startt - t_min) / 60 + start_noise
    end_noise = np.random.uniform(0.0, 1.0, bike_loc[names].shape[0])
    endt = (endt - t_min) / 60 + end_noise
    dura = endt - startt
    
    dist = np.zeros((bike_loc[names].shape[0], 1), dtype = geo_dist.dtype)
    for r in range(bike_loc[names].shape[0]):
            dist[r] = geo_dist[row_ID_dict[bike_loc[names][r, 0]], row_ID_dict[bike_loc[names][r, 1]]]
    
    dist = pd.DataFrame(dist)
    startt = pd.DataFrame(startt)
    endt = pd.DataFrame(endt)
    dura = pd.DataFrame(dura)
    bike_loc[names] = pd.DataFrame(bike_loc[names], columns = ["start_loc", "end_loc"])
    
    bike_loc[names] = bike_loc[names].assign(distance = dist)
    bike_loc[names] = bike_loc[names].assign(start_t = startt)
    bike_loc[names] = bike_loc[names].assign(end_t = endt)
    bike_loc[names] = bike_loc[names].assign(duration = dura)
    
print(list(bike_loc.values())[0].head(10))

   start_loc  end_loc  distance   start_t      end_t   duration
0        103       37  1.455347  2.443042   8.093304   5.650262
1         39      539  0.544400  2.260985   4.093658   1.832673
2        785      785  0.000000  2.337970   7.154192   4.816222
3        341      159  1.090538  3.543996  33.499252  29.955256
4        708      573  3.599904  3.843514  21.965196  18.121682
5        366      138  0.463223  4.876573  11.522892   6.646319
6         66      832  6.017458  4.673414  31.913564  27.240149
7        225      384  1.459768  5.186432  12.867942   7.681510
8        729      207  4.568847  6.641825  26.604163  19.962338
9        510      763  1.334991  6.489251  11.890359   5.401108


We obtain a dictionary containing all bike journeys that start from station ID 2, i.e., 'start_loc' == 2. And we also obtain a dictionary containing all bike journeys that end at station ID 2, i.e., 'end_loc' == 2.

In [27]:
data_dict_dep = {}
data_dict_arr = {}

for names in bike_loc.keys():
    data_dict_dep[names] = bike_loc[names].loc[bike_loc[names]['start_loc'] == 2]
    data_dict_arr[names] = bike_loc[names].loc[bike_loc[names]['end_loc'] == 2]
        
print(list(data_dict_dep.values())[0].head(10));
print(list(data_dict_arr.values())[0].head(10))

      start_loc  end_loc  distance     start_t       end_t    duration
423           2        2  0.000000  138.852795  167.543995   28.691200
424           2        2  0.000000  138.903072  164.805069   25.901997
428           2        2  0.000000  139.193403  166.678777   27.485374
3910          2      558  4.882449  511.119333  550.057092   38.937758
3983          2      228  4.438654  513.513071  530.892008   17.378937
4071          2      219  1.156326  516.240557  536.137712   19.897155
4089          2      219  1.156326  516.635117  536.964413   20.329295
4189          2      389  3.117940  519.750296  533.137913   13.387617
4454          2      826  3.789855  528.121062  948.139151  420.018089
5182          2      348  3.544449  553.314412  573.607162   20.292750
      start_loc  end_loc  distance     start_t       end_t   duration
423           2        2  0.000000  138.852795  167.543995  28.691200
424           2        2  0.000000  138.903072  164.805069  25.901997
428      

We save the departure times data and the arrival times data into a new folder.

In [32]:
os.makedirs('../Processed_Data/SortDepartures', exist_ok=True)  

for names in data_dict_dep.keys():
    data_dict_dep[names].to_csv('../Processed_Data/Departures/'+ str(names))

os.makedirs('../Processed_Data/SortArrivals', exist_ok=True)  

for names in data_dict_arr.keys():
    data_dict_arr[names].to_csv('../Processed_Data/Arrivals/'+ str(names))

We obtain a dictionary containing all bike journeys that start from station ID 2, 3, 4, 5, i.e., 'start_loc' == 2,3, 4, 5. 

In [33]:
loc_codes = [2, 3, 4, 5]
data_dict_0 = {}
data_dict_1 = {}
data_dict_2 = {}
data_dict_3 = {}

for names in bike_loc.keys():
    data_dict_0[names] = bike_loc[names].loc[bike_loc[names]['start_loc'] == 2]
    data_dict_1[names] = bike_loc[names].loc[bike_loc[names]['start_loc'] == 3]
    data_dict_2[names] = bike_loc[names].loc[bike_loc[names]['start_loc'] == 4]
    data_dict_3[names] = bike_loc[names].loc[bike_loc[names]['start_loc'] == 5]
        
print(list(data_dict_0.values())[0].head(10));
print(list(data_dict_1.values())[0].head(10))
print(list(data_dict_2.values())[0].head(10))
print(list(data_dict_3.values())[0].head(10))

      start_loc  end_loc  distance     start_t       end_t    duration
423           2        2  0.000000  138.852795  167.543995   28.691200
424           2        2  0.000000  138.903072  164.805069   25.901997
428           2        2  0.000000  139.193403  166.678777   27.485374
3910          2      558  4.882449  511.119333  550.057092   38.937758
3983          2      228  4.438654  513.513071  530.892008   17.378937
4071          2      219  1.156326  516.240557  536.137712   19.897155
4089          2      219  1.156326  516.635117  536.964413   20.329295
4189          2      389  3.117940  519.750296  533.137913   13.387617
4454          2      826  3.789855  528.121062  948.139151  420.018089
5182          2      348  3.544449  553.314412  573.607162   20.292750
       start_loc  end_loc  distance      start_t        end_t   duration
3999           3       57  2.470578   514.554357   527.217754  12.663397
7170           3        3  0.000000   641.242302   658.750099  17.507797


We save the departure times data at each station into a new folder.

In [34]:
os.makedirs('./Station2', exist_ok=True)  

for names in data_dict_dep.keys():
    data_dict_0[names].to_csv('./Station2/'+ str(names))

os.makedirs('./Station3', exist_ok=True)  

for names in data_dict_arr.keys():
    data_dict_1[names].to_csv('./Station3/'+ str(names))

os.makedirs('./Station4', exist_ok=True)  

for names in data_dict_arr.keys():
    data_dict_2[names].to_csv('./Station4/'+ str(names))
                                
os.makedirs('./Station5', exist_ok=True)  

for names in data_dict_arr.keys():
    data_dict_3[names].to_csv('./Station5/'+ str(names))

We then read and save departure times at all stations.

In [35]:
loc_codes = station_data['Station.Id'].to_numpy()

for i in range(808):
    
    loc = loc_codes[i]
    data_dict = {}

    for names in bike_loc.keys():
        data_dict[names] = bike_loc[names].loc[bike_loc[names]['start_loc'] == loc]
    
    path = f"../Processed_Data/sortdep/Station{loc}"
    os.makedirs(path, exist_ok=True) 
    
    filename = f"../Processed_Data/sortdep/Station{loc}/{str(names)}"
    for names in data_dict_dep.keys():
        filename = f"../Processed_Data/sortdep/Station{loc}/{str(names)}"
        data_dict[names].to_csv(filename)

In [37]:
loc_codes = station_data['Station.Id'].to_numpy()

for i in range(808):
    
    loc = loc_codes[i]
    data_dict = {}

    for names in bike_loc.keys():
        data_dict[names] = bike_loc[names].loc[bike_loc[names]['start_loc'] == loc]
    
    path = f"../Processed_Data/sortarr/Station{loc}"
    os.makedirs(path, exist_ok=True) 
    
    filename = f"../Processed_Data/sortarr/Station{loc}/{str(names)}"
    for names in data_dict_dep.keys():
        filename = f"../Processed_Data/sortarr/Station{loc}/{str(names)}"
        data_dict[names].to_csv(filename)

#Add in duration.
cols = ["start_t", "end_t","duration","start_loc", "end_loc"]
for name in bike_data.keys():
    bike_data[name] = bike_data[name].assign(end_t=lambda x:
                                             abs(x.start_t + x.duration))
    bike_data[name] = bike_data[name].reindex(columns=cols)

print(list(bike_data.values())[0].head())

#Map the row number to each location index in station_data.
for name in bike_data.keys():
    bike_data[name] = bike_data[name].to_numpy()
    startno = np.zeros((bike_data[name].shape[0], 1), dtype=int)
    endno = np.zeros((bike_data[name].shape[0], 1), dtype=int)
    
    for r in range(bike_data[name].shape[0]):
        startno[r] = row_ID_dict[bike_data[name][r, 3]]
        endno[r] = row_ID_dict[bike_data[name][r, 4]]
    
    startno = pd.DataFrame(startno)
    endno = pd.DataFrame(endno)
    bike_data[name] = pd.DataFrame(bike_data[name], columns = ["start_t", "end_t","duration","start_loc", 
                                                              "end_loc"])
    
    bike_data[name] = bike_data[name].assign(start_no = startno)
    bike_data[name] = bike_data[name].assign(end_no = endno)
    
print(list(bike_data.values())[0].head())        

And then we add the distance travelled for each journey.

In [105]:
#Adding the distance column for each journey
bike_loc = dict(bike_data)

for names in bike_loc.keys():
    bike_loc[names] = bike_loc[names].to_numpy()
     
    
    dist = np.zeros((bike_loc[names].shape[0], 1), dtype = geo_dist.dtype)
    for r in range(bike_loc[names].shape[0]):
            dist[r] = geo_dist[bike_loc[names][r, 5], bike_loc[names][r, 6]]
    
    dist = pd.DataFrame(dist)
    bike_loc[names] = pd.DataFrame(bike_loc[names], columns = ["start_t", "end_t","duration","start_loc", 
                                                              "end_loc", "start_no", "end_no"])
    del bike_loc[names]["start_no"]
    del bike_loc[names]["end_no"]
    bike_loc[names] = bike_loc[names].assign(distance = dist)
    
print(list(bike_loc.values())[0].head(10))

    start_t     end_t  duration  start_loc  end_loc  distance
0  55728000  55729500      1500         47       56  0.029266
1  55728000  55728540       540        762      600  0.019513
2  55728060  55728480       420        588      698  0.013609
3  55728120  55740360     12240        456      456  0.000000
4  55728120  55728780       660         67       67  0.000000
5  55728120  55728420       300        243       74  0.015976
6  55728180  55728480       300        715      444  0.011863
7  55728180  55740420     12240        456      456  0.000000
8  55728180  55740420     12240        456      456  0.000000
9  55728180  55728600       420        383       83  0.003483


Finally we save the data into folders.