In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import requests

### Introduction
After talking with Simona about my data preprocessing, I received a very useful piece of advice: transform the names of the cities into geographical coordinates. This way the neural network can take the distance between the cities into account - I am almost certain that it also influences the cancellation and delay probability. 
Moreover, I currently don't have that many features in my dataset, especially for such a complex problem, so more information = better.

### Geocoding tools

#### Google Geocoding API
Google Geocoding is an efficient tool for finding the coordinates of any place. The user just needs to enter the name of the place (city, street, country, etc.) and Google's algorithms do the rest. It is also possible to integrate this service into my application through the Google API; however, the service is not free and is billed per number of API calls.

##### Advantages
- Easy to use
- Up-to-date information
- Doesn't require the input to be 100% accurate
- Being able to use Google Geocoding is a nice addition to my dev portfolio
##### Disadvantages
- I have 17 mil. rows in my dataset, so geocoding all of them through the Geocoding API would be very expensive
- The geocoding process will take a while, as each geocoded value means one API call

#### Using additional dataset from Train Services
It is also possible to download an additional dataset from the same website where I found the data I currently use. This dataset contains detailed information on every station, including geographical coordinates. So I can just match the two datasets against each other and look up the geographical coordinates based on the matching values.

##### Advantages
- Is free to use
- I know that I can implement this, as I've dealt with similar tasks before
- Should execute much quicker than Google Geocoding
- The datasets are from the same source, so there's no chance that the station names would not match
##### Disadvantages
- I would learn nothing new
- The information in the dataset might not be up to date

In [2]:
# Load the preprocessed journey data. The file carries a leftover
# "Unnamed: 0" column (visible in the previews further down).
main_dataset = pd.read_csv('preprocessed_data/disruptions_and_main_dataset.csv')

In [None]:


def extract_lat_long_via_address(address_or_zipcode):
    """Geocode a free-text address with the Google Geocoding API.

    Parameters
    ----------
    address_or_zipcode : str
        Free-text place description (for example a station name) sent as the
        ``address`` query parameter.

    Returns
    -------
    tuple
        ``(lat, lng)`` taken from the first entry of the response's
        ``results`` list, or ``(None, None)`` when the HTTP status is not 2xx
        or the payload does not contain a usable result.

    Raises
    ------
    requests.RequestException
        Propagated unchanged when the request itself fails (connection error,
        timeout, ...).

    Notes
    -----
    The API key is read from the notebook-level name ``GOOGLE_API_KEY``; it is
    not defined in the visible cells and must exist before the first call.
    """
    base_url = "https://maps.googleapis.com/maps/api/geocode/json"
    # Let requests build the query string so characters such as spaces, "/",
    # "&" or "#" in the address are percent-encoded instead of corrupting the
    # URL. The key still ends up in the URL - yet another reason to restrict it.
    query = {"address": address_or_zipcode, "key": GOOGLE_API_KEY}
    # A timeout keeps a single stalled call from hanging a long geocoding loop.
    r = requests.get(base_url, params=query, timeout=10)
    # Accept every 2xx status, 299 included.
    if not 200 <= r.status_code < 300:
        return None, None
    try:
        # Invalid inputs come back as an empty or malformed payload; treating
        # lookup failures as "no coordinates" avoids writing a handler for
        # every kind of response.
        location = r.json()['results'][0]['geometry']['location']
        return location['lat'], location['lng']
    except (KeyError, IndexError, TypeError, ValueError):
        # ValueError also covers a body that is not valid JSON.
        return None, None

In [4]:
# Smoke test: geocode one station name before looping over the full list.
lat, lng = extract_lat_long_via_address("Eindhoven Centraal")

In [5]:
# Show the coordinates returned for the test station.
print(lat, lng)

51.4431348 5.4805389


In [6]:
# Gather the station-name columns, then reduce them to one sorted array of
# distinct names so that each station is only handled once later on.
all_arrival_stations, all_departure_stations = (
    main_dataset[column_name]
    for column_name in ('Arrival station name', 'Departure station name')
)
merged_list = np.unique(
    np.concatenate([all_arrival_stations, all_departure_stations])
)

In [7]:
# Print the distinct station names, then display how many there are
# (the count is the cell's last expression).
print(merged_list)
len(merged_list)

["'s-Hertogenbosch" "'s-Hertogenbosch Oost" "'t Harde" 'Aachen Hbf'
 'Aachen West' 'Aalten' 'Abcoude' 'Akkrum' 'Alkmaar' 'Alkmaar Noord'
 'Almelo' 'Almelo de Riet' 'Almere Buiten' 'Almere Centrum'
 'Almere Muziekwijk' 'Almere Oostvaarders' 'Almere Parkwijk'
 'Almere Poort' 'Alphen a/d Rijn' 'Amersfoort Centraal'
 'Amersfoort Schothorst' 'Amersfoort Vathorst' 'Amsterdam Amstel'
 'Amsterdam Bijlmer ArenA' 'Amsterdam Centraal' 'Amsterdam Holendrecht'
 'Amsterdam Lelylaan' 'Amsterdam Muiderpoort' 'Amsterdam RAI'
 'Amsterdam Science Park' 'Amsterdam Sloterdijk' 'Amsterdam Zuid'
 'Anna Paulowna' 'Apeldoorn' 'Apeldoorn De Maten' 'Apeldoorn Osseveld'
 'Appingedam' 'Arkel' 'Arnemuiden' 'Arnhem Centraal' 'Arnhem Presikhaaf'
 'Arnhem Velperpoort' 'Arnhem Zuid' 'Assen' 'Baarn' 'Bad Nieuweschans'
 'Baflo' 'Barendrecht' 'Barneveld Centrum' 'Barneveld Noord'
 'Barneveld Zuid' 'Bedum' 'Beek-Elsloo' 'Beesd' 'Beilen' 'Bergen op Zoom'
 'Best' 'Beverwijk' 'Bilthoven' 'Blerick' 'Bloemendaal' 'Bodegraven'
 

393

In [8]:
class City:
    """Plain record pairing a city/station name with a latitude and longitude."""

    def __init__(self, city_name, lat, lng):
        # Keep the three constructor arguments unchanged as attributes.
        self.city_name, self.lat, self.lng = city_name, lat, lng

# Filled by money_saving_method(): one City per entry of merged_list.
cities_and_coordinates = []

def money_saving_method():
    """Geocode every entry of merged_list once and store it as a City.

    Only the distinct station names are sent to the geocoder, so the number
    of API calls equals the number of stations rather than the number of rows.
    Results are appended to the module-level cities_and_coordinates list.
    """
    for station_name in merged_list:
        latitude, longitude = extract_lat_long_via_address(station_name)
        cities_and_coordinates.append(City(station_name, latitude, longitude))

money_saving_method()

In [9]:
# Spot-check: build a "lat lng" string from the first geocoded entry.
# NOTE(review): the name is misspelled ("coordintate") and is not used in any
# other visible cell - confirm whether this cell is still needed.
coordintate = str(cities_and_coordinates[0].lat) + " " + str(cities_and_coordinates[0].lng)

In [10]:
# Preview the dataset before the coordinate columns are added.
main_dataset.head()

Unnamed: 0,Journey id,Train type,Railroad company,Train number,Departure station code,Arrival station code,Departure station name,Arrival station name,Total journey delay,Departure time,Arrival time,Is weekend,Is holiday,Part number,Cancelled,Disruptions
0,7914217,Intercity,NS,1410,RTD,DT,Rotterdam Centraal,Delft,4.0,2022-01-01 02:00:00,2022-01-01T02:13:00+01:00,True,True,1,False,False
1,7914217,Intercity,NS,1410,DT,GV,Delft,Den Haag HS,7.0,2022-01-01 02:13:00,2022-01-01T02:21:00+01:00,True,True,2,False,True
2,7914217,Intercity,NS,1410,GV,LEDN,Den Haag HS,Leiden Centraal,3.0,2022-01-01 02:23:00,2022-01-01T02:36:00+01:00,True,True,3,False,False
3,7914217,Intercity,NS,1410,LEDN,SHL,Leiden Centraal,Schiphol Airport,0.0,2022-01-01 02:45:00,2022-01-01T03:01:00+01:00,True,True,4,False,False
4,7914217,Intercity,NS,1410,SHL,ASD,Schiphol Airport,Amsterdam Centraal,0.0,2022-01-01 03:03:00,2022-01-01T03:17:00+01:00,True,True,5,False,False


In [11]:
def get_lat_lng(city_name):
    """Look up the stored coordinates of a city.

    Scans cities_and_coordinates in order and returns ``(lat, lng)`` of the
    first entry whose ``city_name`` equals the argument, or ``(None, None)``
    when no entry matches.
    """
    first_match = next(
        (entry for entry in cities_and_coordinates if entry.city_name == city_name),
        None,
    )
    if first_match is None:
        return None, None
    return first_match.lat, first_match.lng

# Apply the function to create new features.
# Each distinct station name is resolved once with get_lat_lng and the result
# is broadcast with Series.map, so the Python-level lookup runs once per
# station instead of once per row. This also works on an empty frame, where
# unpacking zip(*...) into two names would raise.
station_coordinates = {
    station: get_lat_lng(station)
    for station_column in ("Departure station name", "Arrival station name")
    for station in main_dataset[station_column].unique()
}
latitude_by_station = {station: coords[0] for station, coords in station_coordinates.items()}
longitude_by_station = {station: coords[1] for station, coords in station_coordinates.items()}

# Same four columns, created in the same order as before.
main_dataset["Departure Latitude"] = main_dataset["Departure station name"].map(latitude_by_station)
main_dataset["Departure Longitude"] = main_dataset["Departure station name"].map(longitude_by_station)
main_dataset["Arrival Latitude"] = main_dataset["Arrival station name"].map(latitude_by_station)
main_dataset["Arrival Longitude"] = main_dataset["Arrival station name"].map(longitude_by_station)


In [12]:
# Preview the dataset with the four coordinate columns added.
main_dataset.head()

Unnamed: 0,Journey id,Train type,Railroad company,Train number,Departure station code,Arrival station code,Departure station name,Arrival station name,Total journey delay,Departure time,Arrival time,Is weekend,Is holiday,Part number,Cancelled,Disruptions,Departure Latitude,Departure Longitude,Arrival Latitude,Arrival Longitude
0,7914217,Intercity,NS,1410,RTD,DT,Rotterdam Centraal,Delft,4.0,2022-01-01 02:00:00,2022-01-01T02:13:00+01:00,True,True,1,False,False,51.924287,4.470026,52.011577,4.357068
1,7914217,Intercity,NS,1410,DT,GV,Delft,Den Haag HS,7.0,2022-01-01 02:13:00,2022-01-01T02:21:00+01:00,True,True,2,False,True,52.011577,4.357068,52.070249,4.321748
2,7914217,Intercity,NS,1410,GV,LEDN,Den Haag HS,Leiden Centraal,3.0,2022-01-01 02:23:00,2022-01-01T02:36:00+01:00,True,True,3,False,False,52.070249,4.321748,52.166359,4.482086
3,7914217,Intercity,NS,1410,LEDN,SHL,Leiden Centraal,Schiphol Airport,0.0,2022-01-01 02:45:00,2022-01-01T03:01:00+01:00,True,True,4,False,False,52.166359,4.482086,52.313025,4.772478
4,7914217,Intercity,NS,1410,SHL,ASD,Schiphol Airport,Amsterdam Centraal,0.0,2022-01-01 03:03:00,2022-01-01T03:17:00+01:00,True,True,5,False,False,52.313025,4.772478,52.379128,4.900272


In [13]:
# The coordinates now stand in for the station names, so remove both name
# columns from the frame in place, one after the other.
for station_column in ("Arrival station name", "Departure station name"):
    main_dataset.drop(columns=station_column, inplace=True)

In [14]:
# Preview the dataset after the station-name columns were dropped.
main_dataset.head()

Unnamed: 0,Journey id,Train type,Railroad company,Train number,Departure station code,Arrival station code,Total journey delay,Departure time,Arrival time,Is weekend,Is holiday,Part number,Cancelled,Disruptions,Departure Latitude,Departure Longitude,Arrival Latitude,Arrival Longitude
0,7914217,Intercity,NS,1410,RTD,DT,4.0,2022-01-01 02:00:00,2022-01-01T02:13:00+01:00,True,True,1,False,False,51.924287,4.470026,52.011577,4.357068
1,7914217,Intercity,NS,1410,DT,GV,7.0,2022-01-01 02:13:00,2022-01-01T02:21:00+01:00,True,True,2,False,True,52.011577,4.357068,52.070249,4.321748
2,7914217,Intercity,NS,1410,GV,LEDN,3.0,2022-01-01 02:23:00,2022-01-01T02:36:00+01:00,True,True,3,False,False,52.070249,4.321748,52.166359,4.482086
3,7914217,Intercity,NS,1410,LEDN,SHL,0.0,2022-01-01 02:45:00,2022-01-01T03:01:00+01:00,True,True,4,False,False,52.166359,4.482086,52.313025,4.772478
4,7914217,Intercity,NS,1410,SHL,ASD,0.0,2022-01-01 03:03:00,2022-01-01T03:17:00+01:00,True,True,5,False,False,52.313025,4.772478,52.379128,4.900272


In [16]:
# Write the dataset with the coordinate columns to disk; index=False keeps
# the DataFrame index out of the file.
main_dataset.to_csv('preprocessed_data/geocoded.csv', index=False)

In [None]:
#encode train type and railroad company as integers