# CA2 DAVI Individual Assignment
By:
Hubert Heng Kheng Yeow,
p2214494,
DAAA1B02

## Title: "Singapore public transport visualisation"

#### Objectives of the analysis:
- Objective 1: To analyse the spread and density of the various public transportation in Singapore
- Objective 2: To draw a correlation between the spread and density of the various public transportation and the variance in traffic congestion over time. Then we will implement some changes to public transport to reduce traffic congestion.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import geopandas as gpd
from shapely.geometry import Point

import json
import urllib
from urllib.parse import urlparse
import httplib2 as http # External library
import requests
from zipfile import ZipFile

## Step 1: Extracting the datasets

**NOTE:** DO NOT RUN THE TWO CELLS BELOW TOO MANY TIMES; LTA does not allow excessive API calls.  
If too many calls are made, LTA will temporarily prevent this account key from making another API call.  
The LTA online datasets should already be downloaded.

In [None]:
# Extracting online dynamic datasets from DataMall (datamall2) by LTA.
# For each passenger-volume endpoint the API answers with a JSON document whose
# first entry holds a download link to a zipped CSV; the zip is saved locally
# and then extracted into the folder the loading cell reads from.
dataset_paths = ['ltaodataservice/PV/Train', 'ltaodataservice/PV/Bus']
file_names = ['train_passenger_volume', 'bus_passenger_volume']
zipped_file_names = []

if __name__ == "__main__":
    import os

    # Authentication parameters.
    # NOTE(review): a personal account key is hard-coded as the fallback value.
    # Prefer exporting LTA_ACCOUNT_KEY and removing (and rotating) the literal.
    headers = {
        'AccountKey': os.environ.get('LTA_ACCOUNT_KEY', '3fxd63A8QKq1SQb/3xF+ow=='),
        'accept': 'application/json',
    }

    # API parameters shared by every request
    uri = 'http://datamall2.mytransport.sg/'
    method = 'GET'
    body = ''

    # Get handle to http (one handle is reused for all requests)
    h = http.Http()

    for path, base_name in zip(dataset_paths, file_names):
        # Build query string
        target = urlparse(uri + path)

        # Obtain results
        response, content = h.request(
            target.geturl(),
            method,
            body,
            headers
        )
        # Fail with a clear message (e.g. when LTA throttles the key) instead
        # of a confusing JSON/KeyError further down.
        if response.status != 200:
            raise RuntimeError(
                f"LTA DataMall request for {path} failed with HTTP status {response.status}"
            )

        # Parse JSON to get the download link
        jsonObj = json.loads(content)
        link = jsonObj['value'][0]['Link']

        # Open the link to download the zipped csv file
        req = requests.get(link, allow_redirects=True)
        req.raise_for_status()
        file_name = base_name + ".zip"
        with open(file_name, "wb") as f:  # creating a new file
            f.write(req.content)  # write file sent from request
        zipped_file_names.append(file_name)  # remember the archive for extraction

    # Extract next to the other datasets, using the same relative folder that
    # the loading cell reads the CSV files from.
    for file_name in zipped_file_names:
        with ZipFile(file_name, 'r') as zObject:
            zObject.extractall(path=os.path.join("Datasets", "passenger volume"))
else:
    print("__name__ is not equal to __main__!")

In [None]:
# Load every dataset used in the analysis from the local "Datasets" folder.

# location datasets (geodataframes), read from shapefiles
train_location = gpd.read_file('Datasets/location/TrainStation_Jan2022/MRTLRTStnPtt.shp')
bus_location = gpd.read_file('Datasets/location/BusStopLocation_Jun2022/BusStop.shp')
taxi_location = gpd.read_file('Datasets/location/TaxiStand_Jan2022/TaxiStop.shp')

# passenger volume datasets (the CSV files extracted by the download cell)
train_passenger_vol = pd.read_csv('Datasets/passenger volume/transport_node_train_202212.csv')
bus_passenger_vol = pd.read_csv('Datasets/passenger volume/transport_node_bus_202212.csv')

# ridership dataset
public_transport_ridership = pd.read_csv('Datasets/ridership/public-transport-utilisation-average-public-transport-ridership.csv')

# population dataset (bus and taxi fleet sizes)
bus_population = pd.read_csv("Datasets/population/MVP01-10_Bus_by_Pax.csv")
taxi_population = pd.read_csv("Datasets/population/monthly_taxi_fleet.csv")

## Step 2: Visualise the datasets before cleaning
Check for any NaN values or unwanted data in the dataframes

In [None]:
# Print the number of missing values per column for every loaded dataset.
datasets = [
    train_location, bus_location, taxi_location,
    train_passenger_vol, bus_passenger_vol,
    public_transport_ridership, bus_population, taxi_population,
]
datasets_title = [
    "**Train stations locations**",
    "**Bus stops locations**",
    "**Taxi stands locations**",
    "**Passenger volume by train station**",
    "**Passenger volume by bus stop**",
    "**Public transport ridership annually by type of transport**",
    "**Bus population in Singapore**",
    "**Taxi population in Singapore**",
]
for title, frame in zip(datasets_title, datasets):
    print(title)
    print(frame.isna().sum())
    print("\n")

It appears that the dataframes "bus_location" and "taxi_location" have some NaN values. Let's take a look at which rows contain this missing data.

In [None]:
# Show the rows that contain at least one missing value in the two
# location datasets that reported NaN counts.
datasets = [bus_location, taxi_location]
datasets_title = ["**Bus stops locations**", "**Taxi stands locations**"]
for title, frame in zip(datasets_title, datasets):
    print(title)
    is_na = frame.isna()
    rows_with_nan = is_na.any(axis=1)
    print(frame[rows_with_nan])
    print("\n")

## Step 3: Cleaning the datasets
Let's check the NaN values in each dataframe so that we can figure out how we are going to clean that dataframe.
Let's start with the dataframe "bus_location" first.

In [None]:
# Replace missing location descriptions with an explicit placeholder string.
bus_location["LOC_DESC"] = bus_location["LOC_DESC"].fillna("No description")
# Display the row at position 285 to inspect the result of the fill.
bus_location.iloc[285]

We will replace NaN with "No description" in the "LOC_DESC" column to indicate that there is no location description for the bus stop.

In [None]:
# Show the rows whose roof number is already recorded as "NIL". The bare
# expression was not the last statement of the cell, so its result was
# silently discarded; display() makes the table actually appear.
display(bus_location[bus_location["BUS_ROOF_N"] == "NIL"])
# Use the same "NIL" placeholder for the missing roof number.
bus_location["BUS_ROOF_N"] = bus_location["BUS_ROOF_N"].fillna("NIL")

There are numerous rows where the value in the "BUS_ROOF_N" is NIL. We will replace the NaN value in index 3305 with NIL too.

In [None]:
# Re-count the missing values per column of bus_location after the fills above.
bus_location.isna().sum()

Now, we will move onto the next dataframe "taxi_location". For this dataframe, we will remove the column "TYPE_CD" as all the values in that column are NaN.

In [None]:
# Drop the "TYPE_CD" column, then count the missing values that remain.
taxi_location = taxi_location.drop(columns = ['TYPE_CD'])
taxi_location.isna().sum()

Now, lets find the indexes of the rows that contain NaN values in the column "TYPE_CD_DE".

In [None]:
# Display the taxi rows whose "TYPE_CD_DE" value is missing.
taxi_location[taxi_location["TYPE_CD_DE"].isna()]

We will drop these rows.

We then need to change the string values in the column "taxi_fleet" to numeric values, otherwise they will break our later code.

In [None]:
# Strip the thousands separators (",") from the fleet strings and cast to int.
taxi_population['taxi_fleet'] = taxi_population['taxi_fleet'].str.replace(',', '').astype(int)

Now that we have either replaced or removed all the NaN values in our datasets, we will now rename the columns of some of the datasets to make it easier to read as well as to change the dtypes of some of the datasets. We will also change the date frequency of the dataframe "taxi_population" from month to year.

In [None]:
# Print the column labels of every dataset so we can decide what to rename.
datasets = [
    train_location, bus_location, taxi_location,
    train_passenger_vol, bus_passenger_vol,
    public_transport_ridership, bus_population, taxi_population,
]
datasets_title = [
    "**Train stations locations**",
    "**Bus stops locations**",
    "**Taxi stands locations**",
    "**Passenger volume by train station**",
    "**Passenger volume by bus stop**",
    "**Public transport ridership annually by type of transport**",
    "**Bus population in Singapore**",
    "**Taxi population in Singapore**",
]
for title, frame in zip(datasets_title, datasets):
    print(title)
    print(frame.columns)
    print("\n")

In [None]:
# Rename columns to more readable labels. Each entry pairs a dataframe with
# its {old name: new name} mapping; the rename is applied in place.
column_renames = [
    (train_location, {
        'STN_NAME': 'Station name',
        'STN_NO': 'Station number',
    }),
    (bus_location, {
        'BUS_STOP_N': 'Bus stop number',
        'BUS_ROOF_N': 'Bus roof number',
        'LOC_DESC': 'Location description',
    }),
    (taxi_location, {
        'TYPE_CD_DE': 'Taxi stand/stop',
    }),
    (train_passenger_vol, {
        'YEAR_MONTH': 'year_month',
    }),
    (bus_passenger_vol, {
        'YEAR_MONTH': 'year_month',
    }),
    (taxi_population, {
        'month': 'year_month',
        'taxi_fleet': 'number',
    }),
]
for frame_to_rename, name_mapping in column_renames:
    frame_to_rename.rename(columns=name_mapping, inplace=True)

Now, lets view the dtypes in our data.

In [None]:
# Print the dtype summary of every dataset (info() prints its report and
# returns None, which the surrounding print() then echoes as well).
datasets = [
    train_location, bus_location, taxi_location,
    train_passenger_vol, bus_passenger_vol,
    public_transport_ridership, bus_population, taxi_population,
]
datasets_title = [
    "**Train stations locations**",
    "**Bus stops locations**",
    "**Taxi stands locations**",
    "**Passenger volume by train station**",
    "**Passenger volume by bus stop**",
    "**Public transport ridership annually by type of transport**",
    "**Bus population in Singapore**",
    "**Taxi population in Singapore**",
]
for title, frame in zip(datasets_title, datasets):
    print(title)
    print(frame.info())
    print("\n")

As we can see, we have to change the "year" and "year_month" columns of all the dataframes (except for the location datasets) to "datetime64" instead of "int64" or "object".

In [None]:
# Cast the "year" column of the yearly datasets to object dtype.
datasets = [public_transport_ridership, bus_population]
for dataframe in datasets:
    year_values = dataframe["year"]
    dataframe["year"] = year_values.astype(object)

In [None]:
# Parse the date columns: yearly datasets hold "%Y" values in "year",
# monthly datasets hold "%Y-%m" values in "year_month".
datasets_1 = [public_transport_ridership, bus_population]
datasets_2 = [train_passenger_vol, bus_passenger_vol, taxi_population]

date_columns = [
    (datasets_1, "year", "%Y"),
    (datasets_2, "year_month", "%Y-%m"),
]
for frames, date_column, date_format in date_columns:
    for dataframe in frames:
        dataframe[date_column] = pd.to_datetime(dataframe[date_column], format=date_format)

In [None]:
# Print the dtype summary again to confirm the datetime conversion.
datasets = [
    train_location, bus_location, taxi_location,
    train_passenger_vol, bus_passenger_vol,
    public_transport_ridership, bus_population, taxi_population,
]
datasets_title = [
    "**Train stations locations**",
    "**Bus stops locations**",
    "**Taxi stands locations**",
    "**Passenger volume by train station**",
    "**Passenger volume by bus stop**",
    "**Public transport ridership annually by type of transport**",
    "**Bus population in Singapore**",
    "**Taxi population in Singapore**",
]
for title, frame in zip(datasets_title, datasets):
    print(title)
    print(frame.info())
    print("\n")

## Step 4: Visualizing the cleaned datasets

Let's first visualise the first few rows in our data.

In [None]:
# Display the first rows of every cleaned dataset under its title.
datasets = [
    train_location, bus_location, taxi_location,
    train_passenger_vol, bus_passenger_vol,
    public_transport_ridership, bus_population, taxi_population,
]
datasets_title = [
    "**Train stations locations**",
    "**Bus stops locations**",
    "**Taxi stands locations**",
    "**Passenger volume by train station**",
    "**Passenger volume by bus stop**",
    "**Public transport ridership annually by type of transport**",
    "**Bus population in Singapore**",
    "**Taxi population in Singapore**",
]
for title, frame in zip(datasets_title, datasets):
    print(title)
    display(frame.head())
    print("\n")

Next, let's check for some outliers in our data.

In [None]:
def detect_outliers(dataframe, column):
    """Report, plot and return the outliers of one numeric column.

    A value is treated as an outlier when it lies more than 1.5 * IQR below
    the first quartile or more than 1.5 * IQR above the third quartile
    (Tukey's fences). A message with the outlier count is printed and a
    boxplot of the column is shown.

    Args:
        dataframe: DataFrame that holds the data.
        column: Label of the numeric column to inspect.

    Returns:
        The rows of ``dataframe`` whose ``column`` value falls outside the
        fences (an empty frame when there are none).
    """
    Q1 = dataframe[column].quantile(0.25)
    Q3 = dataframe[column].quantile(0.75)
    # The interquartile range is the difference of the quartiles. The previous
    # chained assignment (IQR = Q3 = Q1) overwrote Q3 with Q1 instead.
    IQR = Q3 - Q1
    # The lower fence lies *below* Q1, so the IQR multiple is subtracted.
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    outliers_condition = (dataframe[column] < lower_fence) | (dataframe[column] > upper_fence)
    outliers = dataframe[outliers_condition]
    if len(outliers) == 0:
        print(f'No outliers in {column}')
    else:
        print(f'There are {len(outliers)} outliers in {column}')
    sns.boxplot(dataframe[column]).set_title('Boxplot of outliers')
    plt.show()
    return outliers

In [None]:
# Run the outlier check on every numeric column of interest. The passenger
# volume frames appear twice because both tap-in and tap-out are checked.
datasets = [
    train_passenger_vol, train_passenger_vol,
    bus_passenger_vol, bus_passenger_vol,
    public_transport_ridership, bus_population, taxi_population,
]
datasets_column = [
    'TOTAL_TAP_IN_VOLUME', 'TOTAL_TAP_OUT_VOLUME',
    'TOTAL_TAP_IN_VOLUME', 'TOTAL_TAP_OUT_VOLUME',
    'average_ridership', 'number', 'number',
]
for frame, column_name in zip(datasets, datasets_column):
    detect_outliers(frame, column_name)
    print("\n")

## Step 5: Data Visualisation

## Objective 1 
##### To analyse the spread and density of the various public transportation in Singapore

#### Step 5.1: How does the variety and spread of public transportation look like in Singapore?

Map of bus stops, MRT/LRT stations and taxi stands in Singapore.

In [None]:
# Static map: draw the three location layers on one shared axis, in the
# same order as the legend entries below.
fig, ax = plt.subplots(1, 1, figsize=(15, 15))
map_layers = [
    (taxi_location, '#FFCCE1', 0.8),
    (bus_location, '#dee3ea', 0.5),
    (train_location, '#a4d0cf', 0.7),
]
for layer, layer_colour, layer_alpha in map_layers:
    layer.plot(ax=ax, color=layer_colour, alpha=layer_alpha)
plt.legend(['Taxi', 'Bus', 'Train'])
plt.title("Map of Singapore")
plt.show()

- As we can see, the points on the map are quite hard to see as they are clustered up together. The map that we just plotted is static, meaning that we are unable to zoom in to see the points more clearly. Furthermore, we are unable to see both the individual stations/stops and all the stations/stops together on one figure since the plot does not allow us to switch between these two options. We would have to plot at least 4 plots in order to do that.
- Hence, we need to find another way to plot these geodataframes together.

- We decided to plot these geodataframes onto a plotly mapbox map. Plotly allows us to create an interactive map, where we can zoom into locations that we are interested in and view information about them by hovering over them. Mapbox gives us an accurate visualisation of the world's locations, for example the different roads and streets in an area. With this mapbox figure, we are able to switch seamlessly between the individual stations/stops and all the stations/stops together on one figure with the click of a button. Mapbox also allows us to have nicer map visualisations.
- However, the problems is that plotly maps only accepts coordinates based on the World Geodetic System (WGS84).
- The coordinates that we have in the geodataframes "train_location", "bus_location" and "taxi_location" are based on the plane coordinate system (SVY21). The plane coordinate system is a transverse mercator projection (with a WGS84 datum) commonly used in Singapore.
- We need to change the coordinates to the correct coordinate system (SVY21 -> WGS84). To do this, we will do some math: we apply the inverse transverse Mercator projection, which recovers the latitude and longitude from the northing and easting. It first estimates an intermediate latitude from the northing, and then corrects it with series expansions built from the sine, cosine and tangent of that latitude and from the easting offset.
- Using a class and some functions, we will convert the coordinates and store them in new geodataframes.

In [None]:
import math

class SVY21:
    """Convert between WGS84 latitude/longitude and SVY21 Northing/Easting.

    The conversion uses transverse Mercator series formulas (see the
    reference below). ``computeSVY21`` maps (lat, lon) in degrees to (N, E)
    and ``computeLatLon`` maps (N, E) back to (lat, lon) in degrees.
    """
    # Ref: http://www.linz.govt.nz/geodetic/conversion-coordinates/projection-conversions/transverse-mercator-preliminary-computations/index.aspx
    
    # WGS84 Datum
    a = 6378137                 # semi-major axis of the ellipsoid
    f = 1 / 298.257223563       # flattening of the ellipsoid

    # SVY21 Projection
    # Fundamental point: Base 7 at Pierce Resevoir.
    # Latitude: 1 22 02.9154 N, longitude: 103 49 31.9752 E (of Greenwich).

    # Known Issue: Setting (oLat, oLon) to the exact coordinates specified above
    # results in computation being slightly off. The values below give the most 
    # accurate represenation of test data.
    oLat = 1.366666     # origin's lat in degrees
    oLon = 103.833333   # origin's lon in degrees
    oN = 38744.572      # false Northing
    oE = 28001.642      # false Easting
    k = 1               # scale factor

    def __init__(self):
        """Precompute the ellipsoid constants derived from ``a`` and ``f``."""
        self.b = self.a * (1 - self.f)                  # semi-minor axis
        self.e2 = (2 * self.f) - (self.f * self.f)      # eccentricity squared
        self.e4 = self.e2 * self.e2
        self.e6 = self.e4 * self.e2
        # A0, A2, A4 and A6 are the series coefficients consumed by calcM.
        self.A0 = 1 - (self.e2 / 4) - (3 * self.e4 / 64) - (5 * self.e6 / 256);
        self.A2 = (3. / 8.) * (self.e2 + (self.e4 / 4) + (15 * self.e6 / 128));
        self.A4 = (15. / 256.) * (self.e4 + (3 * self.e6 / 4));
        self.A6 = 35 * self.e6 / 3072;

    def computeSVY21(self, lat, lon):
        """
        Returns a pair (N, E) representing Northings and Eastings in SVY21.

        ``lat`` and ``lon`` are given in degrees.
        """

        # Latitude in radians and the powers of its sine/cosine used below.
        latR = lat * math.pi / 180
        sinLat = math.sin(latR)
        sin2Lat = sinLat * sinLat
        cosLat = math.cos(latR)
        cos2Lat = cosLat * cosLat
        cos3Lat = cos2Lat * cosLat
        cos4Lat = cos3Lat * cosLat
        cos5Lat = cos4Lat * cosLat
        cos6Lat = cos5Lat * cosLat
        cos7Lat = cos6Lat * cosLat

        rho = self.calcRho(sin2Lat)
        v = self.calcV(sin2Lat)
        psi = v / rho
        t = math.tan(latR)
        # Longitude offset from the projection origin, in radians.
        w = (lon - self.oLon) * math.pi / 180

        # calcM takes degrees, so the raw latitude (not latR) is passed.
        M = self.calcM(lat)
        Mo = self.calcM(self.oLat)

        w2 = w * w
        w4 = w2 * w2
        w6 = w4 * w2
        w8 = w6 * w2

        psi2 = psi * psi
        psi3 = psi2 * psi
        psi4 = psi3 * psi

        t2 = t * t
        t4 = t2 * t2
        t6 = t4 * t2

        # Compute Northing
        nTerm1 = w2 / 2 * v * sinLat * cosLat
        nTerm2 = w4 / 24 * v * sinLat * cos3Lat * (4 * psi2 + psi - t2)
        nTerm3 = w6 / 720 * v * sinLat * cos5Lat * ((8 * psi4) * (11 - 24 * t2) - (28 * psi3) * (1 - 6 * t2) + psi2 * (1 - 32 * t2) - psi * 2 * t2 + t4)
        nTerm4 = w8 / 40320 * v * sinLat * cos7Lat * (1385 - 3111 * t2 + 543 * t4 - t6)
        N = self.oN + self.k * (M - Mo + nTerm1 + nTerm2 + nTerm3 + nTerm4)

        # Compute Easting
        eTerm1 = w2 / 6 * cos2Lat * (psi - t2)
        eTerm2 = w4 / 120 * cos4Lat * ((4 * psi3) * (1 - 6 * t2) + psi2 * (1 + 8 * t2) - psi * 2 * t2 + t4)
        eTerm3 = w6 / 5040 * cos6Lat * (61 - 479 * t2 + 179 * t4 - t6)
        E = self.oE + self.k * v * w * cosLat * (1 + eTerm1 + eTerm2 + eTerm3)

        return (N, E)

    def calcM(self, lat):
        """Evaluate the A0/A2/A4/A6 sine series, scaled by ``a``, at ``lat``.

        ``lat`` is given in degrees. Presumably this is the meridian distance
        from the equator used by the reference formulas.
        """
        latR = lat * math.pi / 180
        return self.a * ((self.A0 * latR) - (self.A2 * math.sin(2 * latR)) + (self.A4 * math.sin(4 * latR)) - (self.A6 * math.sin(6 * latR)))

    def calcRho(self, sin2Lat):
        """Return a * (1 - e2) / (1 - e2 * sin2Lat) ** 1.5.

        ``sin2Lat`` is the squared sine of the latitude. Presumably this is
        the radius of curvature in the meridian.
        """
        num = self.a * (1 - self.e2)
        denom = math.pow(1 - self.e2 * sin2Lat, 3. / 2.)
        return num / denom

    def calcV(self, sin2Lat):
        """Return a / sqrt(1 - e2 * sin2Lat).

        ``sin2Lat`` is the squared sine of the latitude. Presumably this is
        the radius of curvature in the prime vertical.
        """
        poly = 1 - self.e2 * sin2Lat
        return self.a / math.sqrt(poly)

    def computeLatLon(self, N, E):
        """
        Returns a pair (lat, lon) representing Latitude and Longitude.

        ``N`` and ``E`` are SVY21 Northing and Easting; the result is in
        degrees.
        """

        # Remove the false Northing and derive the intermediate latitude
        # (latPrime, in radians) from a sine series in sigma.
        Nprime = N - self.oN
        Mo = self.calcM(self.oLat)
        Mprime = Mo + (Nprime / self.k)
        n = (self.a - self.b) / (self.a + self.b)
        n2 = n * n
        n3 = n2 * n
        n4 = n2 * n2
        G = self.a * (1 - n) * (1 - n2) * (1 + (9 * n2 / 4) + (225 * n4 / 64)) * (math.pi / 180)
        sigma = (Mprime * math.pi) / (180. * G)
        
        latPrimeT1 = ((3 * n / 2) - (27 * n3 / 32)) * math.sin(2 * sigma)
        latPrimeT2 = ((21 * n2 / 16) - (55 * n4 / 32)) * math.sin(4 * sigma)
        latPrimeT3 = (151 * n3 / 96) * math.sin(6 * sigma)
        latPrimeT4 = (1097 * n4 / 512) * math.sin(8 * sigma)
        latPrime = sigma + latPrimeT1 + latPrimeT2 + latPrimeT3 + latPrimeT4

        sinLatPrime = math.sin(latPrime)
        sin2LatPrime = sinLatPrime * sinLatPrime

        rhoPrime = self.calcRho(sin2LatPrime)
        vPrime = self.calcV(sin2LatPrime)
        psiPrime = vPrime / rhoPrime
        psiPrime2 = psiPrime * psiPrime
        psiPrime3 = psiPrime2 * psiPrime
        psiPrime4 = psiPrime3 * psiPrime
        tPrime = math.tan(latPrime)
        tPrime2 = tPrime * tPrime
        tPrime4 = tPrime2 * tPrime2
        tPrime6 = tPrime4 * tPrime2
        # Remove the false Easting and normalise it by k * vPrime.
        Eprime = E - self.oE
        x = Eprime / (self.k * vPrime)
        x2 = x * x
        x3 = x2 * x
        x5 = x3 * x2
        x7 = x5 * x2

        # Compute Latitude
        latFactor = tPrime / (self.k * rhoPrime)
        latTerm1 = latFactor * ((Eprime * x) / 2)
        latTerm2 = latFactor * ((Eprime * x3) / 24) * ((-4 * psiPrime2) + (9 * psiPrime) * (1 - tPrime2) + (12 * tPrime2))
        latTerm3 = latFactor * ((Eprime * x5) / 720) * ((8 * psiPrime4) * (11 - 24 * tPrime2) - (12 * psiPrime3) * (21 - 71 * tPrime2) + (15 * psiPrime2) * (15 - 98 * tPrime2 + 15 * tPrime4) + (180 * psiPrime) * (5 * tPrime2 - 3 * tPrime4) + 360 * tPrime4)
        latTerm4 = latFactor * ((Eprime * x7) / 40320) * (1385 - 3633 * tPrime2 + 4095 * tPrime4 + 1575 * tPrime6)
        lat = latPrime - latTerm1 + latTerm2 - latTerm3 + latTerm4

        # Compute Longitude
        # NOTE(review): the secant is taken of the final lat rather than of
        # latPrime, despite the variable name - confirm against the reference.
        secLatPrime = 1. / math.cos(lat)
        lonTerm1 = x * secLatPrime
        lonTerm2 = ((x3 * secLatPrime) / 6) * (psiPrime + 2 * tPrime2)
        lonTerm3 = ((x5 * secLatPrime) / 120) * ((-4 * psiPrime3) * (1 - 6 * tPrime2) + psiPrime2 * (9 - 68 * tPrime2) + 72 * psiPrime * tPrime2 + 24 * tPrime4)
        lonTerm4 = ((x7 * secLatPrime) / 5040) * (61 + 662 * tPrime2 + 1320 * tPrime4 + 720 * tPrime6)
        lon = (self.oLon * math.pi / 180) + lonTerm1 - lonTerm2 + lonTerm3 - lonTerm4

        # Convert both results from radians back to degrees.
        return (lat / (math.pi / 180), lon / (math.pi / 180))

In [None]:
# Convert every point of the three location datasets from SVY21 to
# latitude/longitude and collect the converted rows per dataset.
new_train_dataframe = []
new_bus_dataframe = []
new_taxi_dataframe = []
new_dataframe_list = [new_train_dataframe, new_bus_dataframe, new_taxi_dataframe]
datasets = [train_location, bus_location, taxi_location]

convert = SVY21() # use class
for source_frame, converted_rows in zip(datasets, new_dataframe_list):
    for index, row in source_frame.iterrows():
        # computeLatLon expects (Northing, Easting), i.e. (y, x).
        lat, lon = convert.computeLatLon(row.geometry.y, row.geometry.x)
        # The new point stores latitude as x and longitude as y; the plotly
        # map cell reads the coordinates back in that same order.
        row.geometry = Point(lat, lon)
        converted_rows.append(row)

In [None]:
# Rebuild a geodataframe for each transport type from its list of
# converted rows (each list element is one row Series).

# New train geodataframe
df1 = pd.DataFrame(new_train_dataframe)
new_train_location = gpd.GeoDataFrame(df1)

# New bus geodataframe
df2 = pd.DataFrame(new_bus_dataframe)
new_bus_location = gpd.GeoDataFrame(df2)

# New taxi geodataframe
df3 = pd.DataFrame(new_taxi_dataframe)
new_taxi_location = gpd.GeoDataFrame(df3)

Let us look at the first few rows of our newly created geodataframes.

In [None]:
# Print the first rows of each converted geodataframe under its label.
datasets = [new_train_location, new_bus_location, new_taxi_location]
datasets_column = ['Train geodataframe', 'Bus geodataframe', 'Taxi geodataframe']
for label, frame in zip(datasets_column, datasets):
    print(label)
    print(frame.head())
    print('\n')

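- The code below labels the new geodataframes with epsg 4326 (the world's coordinate system, WGS84) rather than epsg 3414 (Singapore's coordinate system, SVY21). Note that `set_crs` only sets the coordinate reference system metadata; it does not transform any coordinates. The actual SVY21 -> WGS84 conversion was already done by the SVY21 class above.
- The geodataframes need to be in this coordinate system before we can plot them onto our plotly mapbox map.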

In [None]:
# Tag each converted geodataframe with EPSG:4326 and print the resulting CRS.
# set_crs only assigns the CRS metadata; it does not transform coordinates.
datasets = [new_train_location, new_bus_location, new_taxi_location]
datasets_column = ['Train geodataframe', 'Bus geodataframe', 'Taxi geodataframe']
for label, frame in zip(datasets_column, datasets):
    frame.set_crs(epsg=4326, inplace=True, allow_override=True)
    print(label)
    print(frame.crs)
    print('\n')

Map of Singapore

In [None]:
# Read the Mapbox access token from the local ".mapbox_token" file. The
# context manager closes the file handle, which the bare open().read() leaked.
with open(".mapbox_token") as token_file:
    mapbox_access_token = token_file.read()

# Figures
fig = go.Figure()

# Every layer's hover label starts with the marker's coordinates, followed
# by either two text fields or a single one.
coordinate_hover = (
    '<i>Latitude</i>: %{lat:.3f}'
    '<br><i>Longitude</i>: %{lon:.3f}<br>'
)
two_field_hover = coordinate_hover + '<b>%{text[0]}</b>' + '<br><b>%{text[1]}</b>'
single_field_hover = coordinate_hover + '<b>%{text}</b>'

# (geodataframe, legend name, marker colour, hover template, hover text)
# Order matters: the dropdown buttons address the traces by position.
trace_specs = [
    (new_train_location, 'Train stations', 'salmon', two_field_hover,
     new_train_location[['Station name', 'Station number']]),
    (new_bus_location, 'Bus stops', 'darkseagreen', two_field_hover,
     new_bus_location[['Bus stop number', 'Bus roof number']]),
    (new_taxi_location, 'Taxi stands', 'mediumslateblue', single_field_hover,
     new_taxi_location['Taxi stand/stop']),
]
for layer, layer_name, marker_colour, hover_text, hover_fields in trace_specs:
    layer_markers = go.scattermapbox.Marker(
        size=5,
        opacity=0.5,
        color=marker_colour
    )
    fig.add_trace(
        go.Scattermapbox(
            # The converted points hold latitude in x and longitude in y.
            lat=layer.geometry.x,
            lon=layer.geometry.y,
            name=layer_name,
            mode='markers',
            marker=layer_markers,
            hovertemplate=hover_text,
            text=hover_fields
        )
    )

# Create the buttons: each one toggles the visibility of the three traces
# (train, bus, taxi - in that order) and sets the figure title to its label.
button_visibility = [
    ("ALL", [True, True, True]),
    ("Train stations", [True, False, False]),
    ("Bus stops", [False, True, False]),
    ("Taxi stands", [False, False, True]),
]
dropdown_buttons = [
    {
        'label': button_label,
        'method': "update",
        'args': [{"visible": visible_mask}, {"title": button_label}],
    }
    for button_label, visible_mask in button_visibility
]

# Figure layout: fixed-size map centred on Singapore, styled hover labels,
# a horizontal legend in the top-left corner and the dropdown of layer
# buttons in the top-right corner.
fig.update_layout(
    autosize=False,
    width=1400,
    height=700,
    hovermode='closest',
    mapbox_style="mapbox://styles/tarnishedhunter/clde0tvs1003s01mk55yaku0z",
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        # Initial view: centre coordinates and zoom level of the map.
        center=go.layout.mapbox.Center(
            lat=1.3521,
            lon=103.8198
        ),
        pitch=0,
        zoom=10
    ),
    margin={"r":0,"t":0,"l":0,"b":0}, # remove the white gutter between the frame and map
    hoverlabel=dict( 
        bgcolor="white", # white background
        font_size=16, # label font size
        font_family="Rockwell",

    ),
    # Legend box anchored by its top-left corner at (0.01, 0.99).
    legend=dict(
        title="Colour Codes for Locations",
        entrywidth=0.3,
        orientation="h",
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
        font=dict(
            family="Rockwell",
            size=14,
            color="black"
        ),
        bgcolor="azure",
        bordercolor="Grey",
        borderwidth=2
    ),
    # Dropdown menu that switches between all layers and a single layer.
    updatemenus=[
        dict(
            buttons=dropdown_buttons,
            direction="down",
            pad={"r": 0, "t": 0, "l":0,"b":0},
            showactive=True,
            x=1,
            xanchor="right",
            y=0.95,
            yanchor="top",
            font=dict(
                size=14,
                color="black"
            )
        ),
    ]
)
fig.show()

- In general, with the "ALL" tab active, it seems that most of the areas in Singapore are well covered by public transport. There is no sign of any public transport on the extreme left, the extreme right and the centre of Singapore, since those areas are airport runways and forests. This makes sense since no one lives there.
- On the extreme left, there is a line of bus stops that runs all the way to the top left of Singapore. The top left of Singapore is where Lim Chu Kang is situated. Lim Chu Kang is quite isolated from the rest of Singapore, cut off by forests, rivers and Murai airfield. Buses are unfortunately the only way to reach Lim Chu Kang. There are no MRT/LRT stations or taxi stands in sight.
- The line of bus stops connects Lim Chu Kang to the southwest of Singapore, like where Jurong West, Choa Chu Kang and Bukit Batok are.
- Since we are already talking about the distribution of bus stops in Singapore, let us switch to the bus stops tab. At first glance, the distribution of bus stops seems to be very evenly distributed across towns in Singapore, except for Lim Chu Kang, which we talked about just now. 
- However, taking a closer look, there are still some towns, like Lim Chu Kang, where there are hardly to no bus stops in sight.
- Theses towns include:
    - Bukit Timah
    - Tanglin
    - Novena
- Looking at these three towns, we would have noticed that both Bukit Timah and Tanglin are from district 10 and that Novena is from district 11. These two districts are some of the more expensive towns in Singapore, where the rich families and expats live.
- Could this mean that the more expensive the town, the fewer bus stops are situated there? This hypothesis is quite reasonable, since richer people are able to afford private transport like cars. The need for public transport in these towns would hence be a lot lower as compared to the cheaper towns in Singapore.
- However, there are a few examples that oppose this statement. For example, Lim Chu Kang is not exactly an expensive town. But then again, Lim Chu Kang doesn't have many residents; most of the land there is taken up by cemeteries.
- The better examples that can oppose to this statement are the other towns in district 10 and 11. Ardmore, Holland Road, Newton and Thomson all seem to have a fair share of bus stops, though it is still lesser than most towns in Singapore.
- To prevent ourselves from straying too far from our objectives, we will move on to the MRT/LRT station and taxi stand maps. The hypothesis that we have, though, can be used in another analysis. This is one of the joys of data analysis: we never know what interesting observations we can make from our data.

- Now, when we open the train stations tab, we would have noticed at first glance that the distribution of mrt/lrt stations are not really distributed evenly.
- There is a huge concentration of MRT/LRT stations in the north east of Singapore, where Sengkang and Punggol are. There is also another huge concentration in the south of Singapore, where the central business district and shopping districts of Singapore are located.
- There is a reasonable number of MRT/LRT stations around the center of Singapore, around the towns Choa Chu Kang and Bukit Panjang.
- The rest of the areas and towns of Singapore, the North, the South West and the East, the concentration of mrt/lrt stations is very little.
- Now, moving on to the taxi stands tab, it is immediately apparent that there is an extremely large concentration of taxi stands in the south of Singapore, where the central business and shopping districts of Singapore are located. It makes sense though, since the concentration of people there is the highest anywhere in Singapore. All the offices and shopping centres are located there; if there isn't enough public transport there, huge traffic congestion will definitely result.
- There is a decent concentration of taxi stands in the upper north west, in districts 10 and 11, which are the richer towns of Singapore. This also makes sense, since taxis are a lot more expensive as compared to buses and trains. Taxis are seen as a luxury by the average Singaporean, who wouldn't ride one every day. Residents of these richer towns are able to afford them though, which explains the decent concentration of taxi stands there.
- The rest of the areas and towns of Singapore, however, the concentration of taxi stands are very low.

Line graph of public ridership of different public transport over the years.

In [None]:
# Line chart of average ridership per year, one line per transport type.
fig, ax = plt.subplots(figsize=(10, 6), tight_layout=True)

sns.set_theme(style="whitegrid", palette="pastel")
sns.set_style({"grid.color": ".6", "grid.linestyle": ":", "ticks": True})

ax = sns.lineplot(
    data=public_transport_ridership,
    x="year",
    y="average_ridership",
    hue="type_of_public_transport",
)
sns.despine()

# Title and axis labels
chart_heading = r"$\bf{More\ Singaporeans\ are\ Taking\ the\ MRT}$"
chart_subheading = "\nDifferent types of public transport average Ridership in Millions"
ax.set_title(chart_heading + chart_subheading, loc="left", fontsize=16)
ax.set_ylabel("Change in Average Daily Ridership")
ax.set_xlabel("Years")

# Create the titled legend, then move it outside the plot area.
ax.legend(loc="lower right", title="Type of Public Transport")
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

plt.show()

- From the line graph, we can observe that most people in Singapore take the bus. The bus is the main form of public transport in Singapore.
- This is followed by the MRT, taxi and lastly LRT.
- Before we start analysing, the reason why the number of Singaporeans taking taxis from 1996 to 2000 is zero is because they did not start recording the number of people taking the taxi until 2000.
- From the graph, we can see that the rate of increase of average daily ridership is the highest for MRT.
- The number of people taking the MRT is slowly catching up to the number of people taking the bus.
- This could be due to the rapid expansion of MRT lines in Singapore in the recent years, due to the increase in funding and support from the government and transport authorities.
- For a period, from around 2000 to 2004, the average daily ridership for bus dropped slightly before rising again. This could be due to the increase in bus fares from 2000 to 2004, as seen from https://landtransportguru.net/chronology-of-fare-adjustments/. However, the train fares increased during that period too, which means that there are other factors contributing to the decrease in daily ridership. The increase in bus fares is not the only cause; there could be many other factors too.
- LRT and taxis have seen hardly any difference in average daily ridership over the years. This is plausible considering the fact that they are the less popular modes of transport, due to the high taxi fares and the small number of LRT stations in Singapore.

Pie chart of the different types of public transport in Singapore.

In [None]:
# Extracting the required information from the datasets
# Total and mean of average_ridership for each type of public transport.
# The aggregations are named by string: passing np.sum / np.mean to agg() is
# deprecated in recent pandas, and the strings produce the same 'sum' and
# 'mean' column names that the pie charts read.
public_transport_ridership_by_type = public_transport_ridership.groupby('type_of_public_transport')['average_ridership'].agg(['sum', 'mean'])

In [None]:
# Two side-by-side pie charts: share of total ridership and share of mean
# ridership per type of public transport.
# NOTE(review): the labels are hard-coded and assumed to match the row order of
# public_transport_ridership_by_type -- confirm against the groupby index.
labels = ['Bus', 'LRT', 'MRT', 'Taxi']
palette_color = sns.color_palette('pastel')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 6), tight_layout=True)

pie_specs = (
    (ax1, 'sum', 'Total ridership by transport'),
    (ax2, 'mean', 'Mean ridership by transport'),
)
for pie_axis, stat_column, pie_heading in pie_specs:
    # Every slice is pulled out slightly (explode) so the wedges are separated.
    pie_axis.pie(
        public_transport_ridership_by_type[stat_column],
        labels=labels,
        colors=palette_color,
        autopct='%.0f%%',
        explode=[0.03] * 4,
    )
    pie_axis.title.set_text(pie_heading)

plt.show()

- As we can see from the pie charts, the percentages of total and mean ridership by transport are exactly the same. Hence, we do not need to distinguish between the total and the mean.
- From our earlier analysis of the line graph above, we know that the highest proportion of average daily ridership is bus, followed by MRT, taxi and lastly LRT.
- However, we did not know the actual percentage values, hence we plotted these two pie charts.
- As we can see from the pie charts, 58% of ridership is by bus, 29% by MRT, 12% by taxi and 1% by LRT. This matches our previous knowledge.
- The ridership by bus is twice as much as the ridership by MRT; however, we do know that the percentage difference is slowly decreasing due to the high rate of increase in MRT average daily ridership.
- The ridership by taxi is less than half of the ridership by MRT.
- The ridership by LRT makes up only 1% of the entire ridership. The percentages for bus, MRT and even taxi completely dwarf it.

Population of buses and taxis over the years.

In [None]:
# Extracting the required information from the datasets
# Sum the 'number' column per time key for each vehicle type.
# NOTE(review): the taxi series is keyed by 'year_month' although the variable
# is named *_by_year -- confirm what granularity that column holds.
taxi_population_by_year = taxi_population.groupby('year_month')['number'].agg('sum')
bus_population_by_year = bus_population.groupby('year')['number'].agg('sum')

In [None]:
# Line graph comparing the bus and taxi populations over the years.
# set_theme() resets every style parameter, so it must run first; the grid
# overrides follow it, and both run before the figure is created so that the
# new axes actually pick them up.
sns.set_theme(style="whitegrid")
# "ticks" is a seaborn style *name*, not an rcParams key, so it is left out of
# the override dict.
sns.set_style({"grid.color": ".8", "grid.linestyle": ":"})

fig, ax = plt.subplots(figsize=(10, 6), tight_layout = True)

# Both series are drawn explicitly on the same axes.
sns.lineplot(data=bus_population_by_year, color='lightsteelblue', label='Bus', ax=ax)
sns.lineplot(data=taxi_population_by_year, color='lightsalmon', label='Taxi', ax=ax)

sns.despine(left=True)
ax.set_title(r"$\bf{There\ are\ more\ taxis\ than\ buses\ overall}$" + "\nBus vs Taxi Population over the years", loc = "left", fontsize = 16)
ax.set_ylabel("Population")
ax.set_xlabel("Years")
ax.legend(loc="upper right", title='Type')

plt.show()

- Looking at the line graph, we immediately notice the large decrease in the taxi population from around 2016 to 2022. Let's talk about this first. What is the reason for the sudden decrease?
- According to this article, https://www.channelnewsasia.com/singapore/future-of-taxi-industry-in-singapore-in-focus-phv-242436, the rise of private-hire cars and the Covid-19 pandemic are the two main reasons for the huge decrease in the taxi population.
- This is quite saddening, given the large number of jobs and livelihoods lost. However, many new jobs for private-hire drivers were also created.
- But aren't taxis and private-hire cars the same thing? Why are people choosing private-hire cars instead of taxis?
- From online research, it seems that some reasons are as follows:
    - ease of use of private-hire app (easier to get a private-hire car)
    - ease of payment (more convenient)
    - private-hire app rewards system (exclusive discounts)
- Moving back to the analysis and the line graph, we can see that for the years before 2020, the taxi population was higher than the population of buses.
- The taxi population reached its peak at around 29000 cars in 2016, before unfortunately decreasing rapidly to around 16000 in 2022.
- Up till 2016, the taxi population had been increasing steadily from around 20000 to around 29000.
- The bus population has been increasing steadily throughout the years.

Heatmap of bus population of different bus capacities.

In [None]:
# Extracting the required information from the datasets
# Total buses per (capacity, year), reshaped into a capacity x year table
# that the heatmap can consume directly.
bus_population_by_capacity = (
    bus_population
    .groupby(['capacity', 'year'])['number']
    .sum()
    .reset_index()
    .pivot_table(index='capacity', columns='year', values='number')
)

In [None]:
# Heatmap of the bus population for every capacity band (rows) and year (columns).
fig, ax = plt.subplots(figsize=(10, 6), tight_layout = True)

sns.heatmap(
    bus_population_by_capacity,
    fmt="g",            # print the annotations as plain numbers
    cmap='BuPu',
    center=1000,
    annot=True,
    linewidths=0.05,
    cbar_kws={
        "orientation": "vertical",
        "label": "Bus population",
    },
    # Label every column with its own year taken from the table, instead of a
    # hard-coded 2005-2021 list that would mislabel the data if the years differ.
    xticklabels=True,
    ax=ax,              # draw on the axes created above, not the implicit current axes
)
ax.set_title("Bus population by capacity over the years", weight = "semibold", fontsize = 16)
ax.set_xlabel("Years")
ax.set_ylabel("Capacity")

plt.show()

- In this heatmap, we can see that in all the years from 2005 to 2021, bus capacities of 10-15 and >70 have the greatest proportion, as seen by the dark purple squares.
- Bus capacities 16-20, 41-45 and 46-50 have decent proportions, but they are still much smaller than the 10-15 and >70 capacities, around 5 to 6 times smaller.
- The populations of the remaining bus capacities are very small, ranging from 0 to around 1000, as seen by the light blue squares.

Line graph of taxi population of different companies.

In [None]:
# Extracting the required information from the datasets
# Total taxis per (company, year_month), flattened back into ordinary columns
# for plotting.
taxi_population_by_company = (
    taxi_population
    .groupby(['company', 'year_month'])['number']
    .sum()
    .reset_index()
)

In [None]:
# Line graph of the taxi population of each company over time.
fig, ax = plt.subplots(figsize=(10, 6), tight_layout = True)

# Draw on the axes created above instead of the implicit current axes.
sns.lineplot(data=taxi_population_by_company, x="year_month", y='number', hue='company', ax=ax)
ax.set_title(r"$\bf{Comfort\ has\ the\ biggest\ taxi\ population}$" + "\nTaxi Population of different companies over the years", loc = "left", fontsize = 16)
# Keep the legend outside the plotting area so it does not hide the lines.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
ax.set_xlabel("Years")
# The y axis shows the number of taxis; the companies are encoded by line colour.
ax.set_ylabel("Taxi population")

plt.show()

- Now, looking at this line graph, we can see that the taxi company Comfort has the biggest taxi population, followed by CityCab and TransCab, both with around the same population. The rest of the taxi companies follow after them.
- It seems that Comfort has taken the biggest hit from the taxi population decrease from 2016 to 2022, decreasing from around 13000 in 2016 to around 7000 in 2022, a drastic decrease of 46%, almost halving its population.
- The taxi companies TransCab, CityCab, SMRT and Premier have taken a slight hit, seeing a decrease of around 500 to 1000 taxis.
- The rest of the taxi companies do not seem to have taken a hit, though their numbers are close to zero, with some at exactly zero, possibly indicating that these taxi companies closed down.
- Over the years, it seems that only Comfort and TransCab have seen huge increases in their taxi population, with these increases occurring from around 2006 to 2016.
- For TransCab, the increase could be due to the fact that it started out in around 2006 and was investing heavily in building a large taxi fleet.
- For Comfort, the increase could be due to it being the biggest taxi company in Singapore, and hence being able to accumulate the funds to add many taxis to its fleet.

## Objective 2
##### To draw a correlation between the spread and density of the various public transportation and the variance in traffic congestion over time. Then we will implement some changes to public transport to reduce traffic congestion.

#### Step 5.2: What is traffic congestion?

- Traffic congestion is a condition in transport that is characterized by slower speeds, longer trip times, and increased vehicular queueing.
- For roads:
    - Some characteristics of traffic congestion are slow travelling speeds and vehicular queues that last more than 15 minutes.
    - During traffic congestion, the travelling speed of a vehicle may be as slow as 10 kilometres per hour in comparison to the usual speed of about 80 kilometres per hour. In some instances, traffic may be so congested that vehicles do not move at all.
- For trains:
    - Some characteristics of congestion include all the seats on a train being full and passengers being hardly able, or unable, to move around the carriage because of all the people standing about.
    - Queues at train stations that are so long that people near the end or halfway through the queue cannot even get inside the train at all.

#### Step 5.3: Traffic congestion in Singapore.

Passenger volume by different types of public transport over time.

Firstly, let's create a new column named "TOTAL_VOLUME" in each of the passenger volume datasets. As the name suggests, the values in the "TOTAL_VOLUME" column is the sum of the columns "TOTAL_TAP_IN_VOLUME" and "TOTAL_TAP_OUT_VOLUME".
Secondly, we will create yet another column named "NET_VOLUME" in each of the passenger volume datasets. The values in the "NET_VOLUME" column is the difference between the columns "TOTAL_TAP_IN_VOLUME" and "TOTAL_TAP_OUT_VOLUME".

In [None]:
# Creating the new columns
# Each passenger-volume table gets two derived columns:
#   TOTAL_VOLUME = tap-ins + tap-outs
#   NET_VOLUME   = tap-ins - tap-outs
for volume_table in (bus_passenger_vol, train_passenger_vol):
    tap_in_volume = volume_table['TOTAL_TAP_IN_VOLUME']
    tap_out_volume = volume_table['TOTAL_TAP_OUT_VOLUME']
    volume_table['TOTAL_VOLUME'] = tap_in_volume + tap_out_volume
    volume_table['NET_VOLUME'] = tap_in_volume - tap_out_volume

(Train)
Day against total, tap in and tap out volume

In [None]:
# Strip plots of train total and net passenger volume for each type of day.
# set_theme() resets every style parameter, so it runs first and the grid
# overrides follow it; both run before the figure is created so that the new
# axes pick them up.
sns.set_theme(style="whitegrid")
# "ticks" is a seaborn style *name*, not an rcParams key, so it is left out of
# the override dict.
sns.set_style({"grid.color": ".8", "grid.linestyle": ":"})

fig = plt.figure(figsize=(10, 6), tight_layout = True)
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)

# One panel per measure: (axes, column to plot, panel title).
panel_specs = (
    (ax1, 'TOTAL_VOLUME', 'Total volume'),
    (ax2, 'NET_VOLUME', 'Net volume'),
)
for panel_axis, volume_column, panel_title in panel_specs:
    sns.stripplot(
        data=train_passenger_vol,
        x='DAY_TYPE', y=volume_column, hue="DAY_TYPE",
        palette="dark", alpha=.6, ax=panel_axis
    )
    panel_axis.title.set_text(panel_title)

sns.despine(left=True)
for panel_axis, _, _ in panel_specs:
    # Slant the category labels so they do not overlap.
    panel_axis.set_xticklabels(panel_axis.get_xticklabels(), rotation=40, ha="right")
    panel_axis.set_ylabel("Volume")
    panel_axis.set_xlabel("Type of day")
    panel_axis.legend(title='Type')
    # Move the legend outside the plotting area.
    sns.move_legend(panel_axis, "upper left", bbox_to_anchor=(1, 1))

plt.show()

- As we can see, the total passenger volume on weekdays is a lot higher than the total passenger volume during the weekends or holidays, which is expected.
- Similarly, the spread of the net passenger volume on weekdays is a lot higher than the net passenger volume during the weekends or holidays, which too is also expected. There are more extreme values on weekdays.
- The two observations we made above are fairly obvious and need little analysis, so let us look for some other observations.
- For total passenger volume, on weekdays, there are a few outliers with values of around 250000.
- The total passenger volumes on weekdays are more than twice those on weekends and holidays.
- Similarly, the net passenger volume on weekdays is about 1.5 times that on weekends and holidays.
- This observation is expected: since people do not need to work or go to school on weekends and holidays, they have less need to ride public transport on those days.

(Train)
Hours against total, tap in and tap out volume

In [None]:
# Mean total and net train passenger volume for every hour of the day,
# with the hour kept as an ordinary column for plotting.
train_passenger_vol_by_hour = (
    train_passenger_vol
    .groupby('TIME_PER_HOUR')[['TOTAL_VOLUME', 'NET_VOLUME']]
    .mean()
    .reset_index()
)

In [None]:
# Hourly train passenger volume: bars with a line and shaded-area overlay,
# one panel for the total volume and one for the net volume.
# set_theme() resets every style parameter, so it runs first and the grid
# overrides follow it; both run before the figure is created so that the new
# axes pick them up.
sns.set_theme(style="whitegrid", palette="pastel")
# "ticks" is a seaborn style *name*, not an rcParams key, so it is left out of
# the override dict.
sns.set_style({"grid.color": ".6", "grid.linestyle": ":"})

fig = plt.figure(figsize=(16, 9), tight_layout = True)
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)

# The bars sit at positions 0..n-1, so the overlay uses the same positions
# rather than the hour values themselves.
bar_positions = np.arange(0, len(train_passenger_vol_by_hour))

for panel_axis, volume_column, panel_title in (
    (ax1, 'TOTAL_VOLUME', 'Total volume'),
    (ax2, 'NET_VOLUME', 'Net volume'),
):
    hourly_values = train_passenger_vol_by_hour[volume_column]
    sns.barplot(data=train_passenger_vol_by_hour, x='TIME_PER_HOUR', y=volume_column, ax=panel_axis)
    sns.lineplot(x=bar_positions, y=hourly_values, ax=panel_axis)
    panel_axis.fill_between(bar_positions, hourly_values, alpha=0.5)
    panel_axis.title.set_text(panel_title)
    panel_axis.set_ylabel("Volume")
    panel_axis.set_xlabel("Hour")

sns.despine(left=True)

plt.show()

- Let us talk about the total passenger volume first.
- We can see that there are two peaks in the graph, one is at 8am and the other at 6pm.
- This corresponds to the rush hours in the day and night, where in the day, people take the train to work or school and in the night, people take the train home.
- In the wee hours in the morning, the total passenger volume is very low, estimated to be around 1000. This is the time where most people are at home sleeping, hence the low train passenger volume.
- From 6 to 8, when everyone is going to work or school, the total passenger volume shoots up before dipping a bit from around 18000 to 14000 at 9.
- The total passenger volume remains about the same till the second rush hour at 6pm.
- From 6pm to 11pm, the total passenger volume drops at a steady rate from the peak of around 27000 to 5000. Everyone should be arriving home, hence the total passenger volume drops as the hours pass by.

- Now, let us talk about the net passenger volume.
- From 5 to 7, the net passenger volume is positive, this means that the total tap in volume is higher than the tap out volume.
- As this is the rush hour, where everyone is boarding the train to get to work or school, the total tap in volume would naturally be higher than the tap out volume.
- Then from 8 to 9, the net passenger volume changes drastically to become negative. This is when everyone would have arrived at their destination and are now alighting the train.
- From 9 to 14, the net passenger volume slowly increases to become positive as the rush hour is over by that time.
- At 15, the net passenger volume becomes positive, meaning the total tap in volume is higher than the tap out volume again. This is when the second rush hour begins, where everyone is returning home.
- From 15 to 17, the net passenger volume shoots up drastically, resembling an exponential curve, increasing from around 300 to around 1800.
- However, for some unexplained reason, from 17 to 19 the net passenger volume drops drastically to become negative before increasing again to around 1100 at 21.
- Then, it suddenly drops again to around -800 at 23.
- One possible reason for this is that the second rush hour comes in two batches, with one batch getting released earlier than the other.
- The first batch boards the train at 17, and then alights at 19.
- Then, after that, the second batch boards the train at 21, before alighting at 23.

(Bus)
Day against total, tap in and tap out volume

In [None]:
# Strip plots of bus total and net passenger volume for each type of day.
# set_theme() resets every style parameter, so it runs first and the grid
# overrides follow it; both run before the figure is created so that the new
# axes pick them up.
sns.set_theme(style="whitegrid")
# "ticks" is a seaborn style *name*, not an rcParams key, so it is left out of
# the override dict.
sns.set_style({"grid.color": ".8", "grid.linestyle": ":"})

fig = plt.figure(figsize=(10, 6), tight_layout = True)
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)

# One panel per measure: (axes, column to plot, panel title).
panel_specs = (
    (ax1, 'TOTAL_VOLUME', 'Total volume'),
    (ax2, 'NET_VOLUME', 'Net volume'),
)
for panel_axis, volume_column, panel_title in panel_specs:
    sns.stripplot(
        data=bus_passenger_vol,
        x='DAY_TYPE', y=volume_column, hue="DAY_TYPE",
        palette="dark", alpha=.6, ax=panel_axis
    )
    panel_axis.title.set_text(panel_title)

sns.despine(left=True)
for panel_axis, _, _ in panel_specs:
    # Slant the category labels so they do not overlap.
    panel_axis.set_xticklabels(panel_axis.get_xticklabels(), rotation=40, ha="right")
    panel_axis.set_ylabel("Volume")
    panel_axis.set_xlabel("Type of day")
    panel_axis.legend(title='Type')
    # Move the legend outside the plotting area.
    sns.move_legend(panel_axis, "upper left", bbox_to_anchor=(1, 1))

plt.show()

- As we can see, the total passenger volume on weekdays is a lot higher than the total passenger volume during the weekends or holidays, which is expected.
- Similarly, the spread of the net passenger volume on weekdays is a lot higher than the net passenger volume during the weekends or holidays, which too is also expected. There are more extreme values on weekdays.
- The two observations we made above are fairly obvious and need little analysis, so let us look for some other observations.
- As compared to the train, for the bus total passenger volume on weekdays, there are fewer outliers, with values of around 150000.
- The total passenger volumes on weekdays are about twice those on weekends and holidays.
- Similarly, the net passenger volume on weekdays is about 1.2 times that on weekends and holidays.
- This observation is expected, since people do not need to work or go to school on the weekends and holidays, they do not require to ride the public transport on weekends and holidays.

(Bus)
Hours against total, tap in and tap out volume

In [None]:
# Mean total and net bus passenger volume for every hour of the day,
# with the hour kept as an ordinary column for plotting.
bus_passenger_vol_by_hour = (
    bus_passenger_vol
    .groupby('TIME_PER_HOUR')[['TOTAL_VOLUME', 'NET_VOLUME']]
    .mean()
    .reset_index()
)

In [None]:
# Hourly bus passenger volume: bars with a line and shaded-area overlay,
# one panel for the total volume and one for the net volume.
# set_theme() resets every style parameter, so it runs first and the grid
# overrides follow it; both run before the figure is created so that the new
# axes pick them up.
sns.set_theme(style="whitegrid", palette="pastel")
# "ticks" is a seaborn style *name*, not an rcParams key, so it is left out of
# the override dict.
sns.set_style({"grid.color": ".6", "grid.linestyle": ":"})

fig = plt.figure(figsize=(16, 9), tight_layout = True)
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)

# The bars sit at positions 0..n-1, so the overlay uses the same positions
# rather than the hour values themselves.
bar_positions = np.arange(0, len(bus_passenger_vol_by_hour))

for panel_axis, volume_column, panel_title in (
    (ax1, 'TOTAL_VOLUME', 'Total volume'),
    (ax2, 'NET_VOLUME', 'Net volume'),
):
    hourly_values = bus_passenger_vol_by_hour[volume_column]
    sns.barplot(data=bus_passenger_vol_by_hour, x='TIME_PER_HOUR', y=volume_column, ax=panel_axis)
    sns.lineplot(x=bar_positions, y=hourly_values, ax=panel_axis)
    panel_axis.fill_between(bar_positions, hourly_values, alpha=0.5)
    panel_axis.title.set_text(panel_title)
    panel_axis.set_ylabel("Volume")
    panel_axis.set_xlabel("Hour")

sns.despine(left=True)

plt.show()

- Let us talk about the total passenger volume first.
- We can see that there are two peaks in the graph, one is at 8am and the other at 6pm, similar to the train.
- This corresponds to the rush hours in the day and night, where in the day, people take the bus to work or school and in the night, people take the bus home.
- In the wee hours of the morning, the total passenger volume is very low, even coming really close to zero at 3. This is the time when most people are at home sleeping, hence the low bus passenger volume.
- From 5 to 8, when everyone is going to work or school, the total passenger volume shoots up before dipping a bit from around 1100 to 800 at 10.
- The total passenger volume remains about the same till the second rush hour at 5pm.
- From 5pm to 11pm, the total passenger volume drops at a steady rate from the peak of around 1200 to 300. Everyone should be arriving home, hence the total passenger volume drops as the hours pass by.

- Now, let us talk about the net passenger volume.
- From 5 to 7, the net passenger volume is positive, this means that the total tap in volume is higher than the tap out volume.
- As this is the rush hour, where everyone is boarding the bus to get to work or school, the total tap in volume would naturally be higher than the tap out volume.
- Then, at around 9, the net passenger volume changes drastically to become negative. This is when everyone would have arrived at their destination and is now alighting from the bus.
- From 9 to 14, the net passenger volume slowly increases to become positive and stays around the 0 mark as the rush hour is over by that time.
- At 15, the net passenger volume becomes positive, meaning the total tap in volume is higher than the tap out volume again. This is when the second rush hour begins, where everyone is returning home.
- From 15 to 17, the net passenger volume shoots up, increasing from around 5 to around 38.
- However, for some unexplained reason, from 17 to 19 the net passenger volume drops drastically to become negative before increasing again to around -8 at 21.
- Then, it suddenly drops again to around -23 at 23.
- This is the same thing that we observed for the train, hence we will use the same reason to explain this, that is, the second rush hour comes in two batches.

Map of Passenger volume in Singapore

In [None]:
# Display the train location dataset as the cell output.
new_train_location

In [None]:
# Display the bus location dataset as the cell output.
new_bus_location

Firstly, we need to merge the location datasets with the passenger volume datasets, in order to find the passenger volume at their respective locations. 
- The inner joins for train and bus are as follows:
    - (train) Station number <=> PT_CODE
    - (bus) Bus stop number <=> PT_CODE

In [None]:
# Merging the datasets
# Step 1: give PT_CODE the same column name that the matching location table
# uses, so both sides share the join key. The rename is done in place.
rename_plan = (
    (train_passenger_vol, 'Station number'),
    (bus_passenger_vol, 'Bus stop number'),
)
for volume_table, join_key in rename_plan:
    volume_table.rename(columns={'PT_CODE': join_key}, inplace=True)

# Step 2: cast the bus stop numbers of the location table to int.
# presumably this matches the dtype of the key in bus_passenger_vol -- verify.
new_bus_location['Bus stop number'] = new_bus_location['Bus stop number'].astype(int)

# Step 3: inner joins keep only the stations / stops present in both tables.
merged_train_location = pd.merge(
    new_train_location,
    train_passenger_vol,
    on='Station number',
    how='inner',
)
merged_bus_location = pd.merge(
    new_bus_location,
    bus_passenger_vol,
    on='Bus stop number',
    how='inner',
)

In [None]:
# Interactive Mapbox bubble map of passenger volume at train stations and bus stops.

# Read the Mapbox token with a context manager so the file handle is closed,
# and strip surrounding whitespace (e.g. a trailing newline) from the token.
with open(".mapbox_token") as token_file:
    mapbox_access_token = token_file.read().strip()

# Hover text shared by both traces: coordinates plus the two `text` columns.
hover_template = (
    '<i>Latitude</i>: %{lat:.3f}'+
    '<br><i>Longitude</i>: %{lon:.3f}<br>' +
    '<b>%{text[0]}</b>' +
    '<br><b>%{text[1]}</b>'
)

# Figures
fig = go.Figure()
# One bubble trace per transport type; the marker size scales with TOTAL_VOLUME.
# NOTE(review): latitude is read from geometry.x and longitude from geometry.y,
# which assumes the points were built as Point(lat, lon) -- confirm.
for trace_name, located_volume, hover_columns in (
    ('Train stations', merged_train_location, ['Station name', 'Station number']),
    ('Bus stops', merged_bus_location, ['Bus stop number', 'Bus roof number']),
):
    fig.add_trace(
        go.Scattermapbox(
            lat=located_volume.geometry.x,
            lon=located_volume.geometry.y,
            name=trace_name,
            mode='markers',
            marker=go.scattermapbox.Marker(
                size=located_volume['TOTAL_VOLUME']/5000,
                opacity=0.5,
                color='salmon'
            ),
            hovertemplate=hover_template,
            text=located_volume[hover_columns]
        )
    )

# Create the buttons
# Each `visible` list carries exactly one flag per trace, in the order the
# traces were added: [train stations, bus stops].
dropdown_buttons_type = [
{'label': "ALL", 'method': "update", 'args': [{"visible": [True, True]}, {"title": "ALL"}]},
{'label': "Train stations", 'method': "update", 'args': [{"visible": [True, False]}, {"title": "Train stations"}]},
{'label': "Bus stops", 'method': "update", 'args': [{"visible": [False, True]}, {"title": "Bus stops"}]},
]

# Figure layout
fig.update_layout(
    autosize=False,
    width=1400,
    height=700,
    hovermode='closest',
    mapbox_style="mapbox://styles/tarnishedhunter/clde0tvs1003s01mk55yaku0z",
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=1.3521,
            lon=103.8198
        ),
        pitch=0,
        zoom=10
    ),
    margin={"r":0,"t":0,"l":0,"b":0}, # remove the white gutter between the frame and map
    hoverlabel=dict(
        bgcolor="white", # white background
        font_size=16, # label font size
        font_family="Rockwell",

    ),
    legend=dict(
        title="Colour Codes for Locations",
        entrywidth=0.3,
        orientation="h",
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
        font=dict(
            family="Rockwell",
            size=14,
            color="black"
        ),
        bgcolor="azure",
        bordercolor="Grey",
        borderwidth=2
    ),
    updatemenus=[
        dict(
            buttons=dropdown_buttons_type,
            direction="down",
            pad={"r": 0, "t": 0, "l":0,"b":0},
            showactive=True,
            x=1,
            xanchor="right",
            y=0.95,
            yanchor="top",
            font=dict(
                size=14,
                color="black"
            )
        )
    ]
)
fig.show()

- As we can see from the map here, there seems to be a huge volume of passengers in the west area of Singapore. We take a closer look at the west first.
- Looking closely, it seems that the town with the largest passenger volume in the west is Jurong East, followed by Jurong West and then Clementi, as seen by the red points. The bigger the point, the larger the passenger volume.
- In the south of Singapore, where the central business and shopping districts are located, there is a very tight cluster of red points. Let us zoom into the south area of Singapore and have a closer look.
- Zooming in, we can see that there are huge passenger volumes in Chinatown, Raffles Place and Tanjong Pagar. These areas are located in the central business district and include some popular tourist spots, so it is no surprise that we find a huge volume of passengers here.
- Raffles Place has the biggest red point among the three, though Chinatown has two red points. Another cluster of red points like Chinatown's can be seen just north east of Chinatown, in Bugis, where we can find another two red points situated in the same area.
- In the north, we can see some red points slightly bigger than those found in the south of Singapore, though they are still obviously smaller than those found in Jurong. The points can be found in Yishun, Sembawang and Woodlands.
- The red points in the north east and east are not as big as those found in the south and west.
- From the many variations in sizes of the red points, we can tell that the passenger volume is not evenly distributed in Singapore. Even the red points within each town are not evenly distributed, with some being significantly bigger than others.

- By now, we would have noticed that all of these red points are mrt stations. Even though the bus ridership is bigger than the train ridership, the carrying capacity of an mrt is a lot bigger than even the bus with the largest carrying capacity (>70). This obviously will cause the passenger volume at a given time to be a lot higher in an mrt station as compared to a bus stop.
- Hence, I have decided to create two tabs on the map, one of passenger volume of mrt/lrt stations, and another for bus stops.
- Now, let us switch to the bus stops tab.
- Instantly, we observe that the distribution of passenger volumes of bus stops is very different as compared to that of the train.
- In the south part of Singapore, in the central business and shopping districts, there was a high train passenger volume. However, for bus, there isn't.
- Another difference is that, in the west, the train passenger volume for Jurong East is higher than that for Jurong West. However, the bus passenger volume for Jurong West is a lot higher than that for Jurong East.
- Also, in the north, Woodlands is the only one out of the three (Yishun, Sembawang and Woodlands) with a high bus passenger volume.
- There seems to be a decently high bus passenger volume in the east in Tampines. This is in contrast to its low train passenger volume.
- In the north east, generally, the train passenger volume is a lot higher than the bus passenger volume.

- Now, let's investigate further into these towns with high passenger volume, or let's say, high traffic congestion.
- Let's see how their passenger volume changes over time as well as the type of day.
- The towns that we will be looking at are as follows:
- Train
    - Jurong East
    - Chinatown
    - Raffles Place
- Bus
    - Jurong West
    - Tampines
    - Woodlands

In [None]:
# Extracting the datasets
# Keep only the rows of the merged tables that belong to the stations / stops
# chosen for the closer look.

# Train
selected_train_station = ['JURONG EAST MRT STATION', 'CHINATOWN MRT STATION', 'RAFFLES PLACE MRT STATION']
is_selected_station = merged_train_location['Station name'].isin(selected_train_station)
merged_train_location_selected = merged_train_location[is_selected_station]

# Bus
selected_bus_stop = [22009, 46009, 75009]
is_selected_stop = merged_bus_location['Bus stop number'].isin(selected_bus_stop)
merged_bus_location_selected = merged_bus_location[is_selected_stop]

Jointplot of Train passenger volume tap in against tap out

In [None]:
# Joint plot of train tap-in volume against tap-out volume for the selected stations.
# set_theme() resets every style parameter, so it runs first and the grid
# overrides follow it.
sns.set_theme(style="darkgrid", palette="pastel")
# "ticks" is a seaborn style *name*, not an rcParams key, so it is left out of
# the override dict.
sns.set_style({"grid.color": ".6", "grid.linestyle": ":"})

# jointplot() is a figure-level function that builds its own figure, so no
# separate plt.figure() is created here (it would only show up as an empty figure).
joint_grid = sns.jointplot(x="TOTAL_TAP_IN_VOLUME", y="TOTAL_TAP_OUT_VOLUME", data = merged_train_location_selected, hue = "Station name")
sns.despine(left=True)
joint_grid.ax_joint.set_xlabel("Total tap in volume")
joint_grid.ax_joint.set_ylabel("Total tap out volume")
# NOTE(review): the legend labels are hard-coded and assumed to follow the
# order in which the stations appear in the hue mapping -- confirm.
joint_grid.ax_joint.legend(title="Towns", labels=['Chinatown', 'Raffles place', 'Jurong East'])

plt.show()

- Looking at the jointplot, we can immediately see that there is a positive correlation between the total tap in and tap out volume.
- From the jointplot, we can see that the dispersion of passenger volume is very big in the towns Chinatown and Jurong East. The dispersion for Raffles Place, however, is not very big. The points are more clustered together.
- This means that the net passenger volumes for the towns Chinatown and Jurong East are very widely dispersed, with extreme values for net volumes. If we plotted the volumes on a box plot, we would see many outliers.
- As for the net passenger volume for Raffles Place, the spread is a lot smaller and there aren't that many extreme values. Plotted on a box plot, few outliers would be seen.
- Looking at the kde plots for both total tap in and tap out volumes, we can see that the distribution for total tap in volume is positively skewed while the total tap out volume is negatively skewed.
- This means that in the day, the total tap in volume is a lot higher, while in the night, the total tap out volume will be a lot higher
- For both kde plots, raffles place has the highest peak, followed by jurong east, and lastly chinatown.

Jointplot of Bus passenger volume tap in against tap out

In [None]:
# Joint plot of bus tap-in volume against tap-out volume for the selected bus stops.
# set_theme() resets every style parameter, so it runs first and the grid
# overrides follow it.
sns.set_theme(style="darkgrid", palette="pastel")
# "ticks" is a seaborn style *name*, not an rcParams key, so it is left out of
# the override dict.
sns.set_style({"grid.color": ".6", "grid.linestyle": ":"})

# jointplot() is a figure-level function that builds its own figure, so no
# separate plt.figure() is created here (it would only show up as an empty figure).
joint_grid = sns.jointplot(x="TOTAL_TAP_IN_VOLUME", y="TOTAL_TAP_OUT_VOLUME", data = merged_bus_location_selected, hue = "Bus stop number")
sns.despine(left=True)
joint_grid.ax_joint.set_xlabel("Total tap in volume")
joint_grid.ax_joint.set_ylabel("Total tap out volume")
# NOTE(review): the legend labels are hard-coded and assumed to follow the
# order of the bus stop numbers in the hue mapping -- confirm.
joint_grid.ax_joint.legend(title="Towns", labels=['Jurong West', 'Woodlands', 'Tampines'])

plt.show()

- Looking at the jointplot, we can immediately see that there is a positive correlation between the total tap in and tap out volume.
- From the jointplot, we can see that, unlike the previous jointplot, the dispersion for all the towns is not very big.
- This means that for the net passenger volumes in all towns, the spread is a lot smaller and there aren't that many extreme values. Plotting them onto a box plot, few outliers would be seen.
- Looking at the kde plots for both total tap in and tap out volumes, we can see that the distribution for total tap in volume is positively skewed while the total tap out volume is negatively skewed.
- This means that in the day, the total tap in volume is a lot higher, while in the night, the total tap out volume will be a lot higher.
- For all the kde plots, the distributions of both total tap in and tap out volume are about the same.

Linegraph of net passenger volume over time of each area by each public transport type

In [None]:
# Hourly net passenger volume for the selected train stations, one line per station.
fig, ax = plt.subplots(figsize=(16, 9), tight_layout=True)

sns.lineplot(
    x='TIME_PER_HOUR',
    y='NET_VOLUME',
    hue='Station name',
    data=merged_train_location_selected,
    ax=ax,
)

ax.set_title(
    r"$\bf{Comparing\ the\ train\ passenger\ volume\ across\ towns\ over\ the\ day}$" + "\nNet Passenger Volume Per Hour",
    loc="left",
    fontsize=20,
)
ax.set_xticks(np.arange(24))
ax.set(xlabel="Hours", ylabel="Net Passenger Volume")
ax.legend(title="Towns", labels=['Chinatown', 'Raffles place', 'Jurong East'], loc="upper left")

# Box drawn around the annotation text.
label_box = dict(boxstyle="square,pad=0.3", fc="white", ec="dimgrey")

morning_note = r"$\bf{7-8am:}$" + "\nIn the morning, office workers and students \nalight the train at their offices and schools"
evening_note = r"$\bf{5-9pm:}$" + "\nIn the evening, office workers and students \nride the train to return home"

# Each entry: (text, point the arrow head touches, text / arrow tail position, text box).
# Entries with empty text only add an extra arrow fanning out from a note.
callouts = [
    (morning_note, (8, -58000), (10, -70000), label_box),
    ("", (8, -16000), (10, -70000), None),
    ("", (7, -27000), (10, -70000), None),
    (evening_note, (17, 24000), (15, -30000), label_box),
    ("", (18, 45000), (15, -22000), None),
    ("", (20, 22000), (15, -22000), None),
]
for note, arrow_head, arrow_tail, box in callouts:
    ax.annotate(note, arrow_head, arrow_tail, arrowprops={"arrowstyle": "->", "color": "black"}, bbox=box)

plt.show()

- As we can see, the train net passenger volume is negative in the morning and positive at night.
- This means that in the morning, the total tap out volume is a lot larger than the tap in volume. Remember, Net volume = Tap in volume - Tap out volume. This explains the negative value.
- In the night, the total tap in volume is a lot higher than the tap out volume. The net passenger volume will hence be positive.
- In both the day and night, Jurong East has the highest peaks, at around -55000 and around 40000 respectively, followed by Chinatown, and lastly Raffles Place.
- In the night, the peak of Chinatown comes very close to that of Jurong East; only a minuscule margin separates them.

In [None]:
# Hourly net passenger volume for the selected bus stops, one line per bus stop.
fig, ax = plt.subplots(figsize = (16, 9), tight_layout = True)

sns.lineplot(data=merged_bus_location_selected, x='TIME_PER_HOUR', y='NET_VOLUME', hue='Bus stop number', ax=ax)

ax.set_title(r"$\bf{Comparing\ the\ bus\ passenger\ volume\ across\ towns\ over\ the\ day}$" + "\nNet Passenger Volume Per Hour", loc = "left", fontsize = 20)
ax.set_xticks(np.arange(24))
ax.set_xlabel("Hours")
ax.set_ylabel("Net Passenger Volume")
ax.legend(title="Towns", labels=['Jurong West', 'Woodlands', 'Tampines'], loc="upper left")

# Box drawn around the annotation text.
label_box = {
    "boxstyle" : "square,pad=0.3",
    "fc" : "white",
    "ec" : "dimgrey"
}
arrow_style = {"arrowstyle" : "->", "color" : "black"}

# Morning callout: one text box with three arrows fanning out to the lines.
# The wording refers to the bus (this chart plots bus stops, not train stations).
ax.annotate(r"$\bf{7-8am:}$" + "\nIn the morning, office workers and students \nalight the bus at their offices and schools",
(8, -1000), (5, 30000), arrowprops = dict(arrow_style), bbox = label_box)
ax.annotate("", (8, -16000), (5, 30000), arrowprops = dict(arrow_style))
ax.annotate("", (7, -7000), (5, 30000), arrowprops = dict(arrow_style))

# Evening callout.
# NOTE(review): the label says 6pm but all three arrows point at hour 21 -- confirm which is intended.
ax.annotate(r"$\bf{6pm:}$" + "\nIn the evening, office workers and students \nride the bus to return home",
(21, 19000), (15, -10000), arrowprops = dict(arrow_style), bbox = label_box)
ax.annotate("", (21, 30000), (15, -10000), arrowprops = dict(arrow_style))
ax.annotate("", (21, 22000), (15, -10000), arrowprops = dict(arrow_style))

plt.show()

- As we can see, the bus net passenger volume is negative in the morning and positive at night.
- This means that in the morning, the total tap out volume is a lot larger than the tap in volume. Remember, Net volume = Tap in volume - Tap out volume. This explains the negative value.
- In the night, the total tap in volume is a lot higher than the tap out volume. The net passenger volume will hence be positive.
- In the day, Tampines has the highest net passenger volume peak at around -15000. This is followed by woodlands and lastly, Jurong West, which actually has a positive net passenger volume peak of around 500.
- In the night, Jurong West has the highest net passenger volume peak at around 30000. This is followed by Tampines and lastly, woodlands. Both Tampines and woodlands have almost similar net passenger volumes, at around 21000 and 20000 respectively. They are separated by a very small margin.

Boxplot of net passenger volume by type of day of each area by each public transport type

In [None]:
# Box plot of net passenger volume per train station, split by day type.
fig, ax = plt.subplots(figsize = (10, 6), tight_layout = True)
g = sns.boxplot(x="Station name", y="NET_VOLUME", hue="DAY_TYPE", data = merged_train_location_selected, ax = ax)

# Replace the raw station names on the x axis with town names.
g.set_xticks(range(len(merged_train_location_selected['Station name'].unique())))
g.set_xticklabels(['Chinatown', 'Raffles place', 'Jurong East'])

# x holds the stations and y the passenger volume, so label them accordingly.
ax.set_xlabel("Station")
ax.set_ylabel("Train passenger volume")
ax.set_title("Train passenger volume by station by day type", weight="semibold", fontsize = 20)

# Retitle/relabel the legend seaborn already drew rather than rebuilding it with
# ax.legend(labels=...): the entries keep the box colour of their hue level and
# the text is derived from the DAY_TYPE values themselves, so it cannot be
# attached to the wrong colour. capitalize() turns e.g. 'WEEKDAY' into 'Weekday'.
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Day type')
    for entry in legend.get_texts():
        entry.set_text(entry.get_text().capitalize())
sns.despine(right = True)

plt.show()

- Previously, we have already concluded that both the train and bus passenger volume are higher on the weekdays as compared to the weekends/holidays.
- It appears that this conclusion still is correct and applies now that we are looking at the individual stations themselves.
- Weekday is the boxplot on the left of each station and weekends/holidays is the one on the right.
- For all three stations, we can clearly see that the spread of passenger volume is indeed larger on the weekdays as compared to the weekends/holidays.
- The medians for all the boxplots (all the stations and days) are all almost about the same, around the zero mark.

- However, for the three stations, the spreads of the passenger volume on the weekday and weekends/holidays are all not the same.
- For the weekdays:
    - The spread of passenger volume is the highest in Jurong East, followed by raffles place and lastly chinatown
- For the weekends/holidays:
    - The spread of passenger volume is the highest in Jurong East again, followed by Chinatown and lastly raffles place.
- There aren't many outliers to be seen; there are only 6 of them.
- There are three outliers on the weekdays in raffles place, two are above the upper whisker and one below the lower whisker. There is another outlier, above the upper whisker, on the weekend/holidays in raffles place too.
- There are two outliers, both above the upper whisker, on the weekends/holidays in Jurong East.

In [None]:
# Box plot of net passenger volume per bus stop, split by day type.
fig, ax = plt.subplots(figsize = (10, 6), tight_layout = True)
g = sns.boxplot(x="Bus stop number", y="NET_VOLUME", hue="DAY_TYPE", data = merged_bus_location_selected, ax = ax)

# Replace the bus stop numbers on the x axis with town names.
g.set_xticks(range(len(merged_bus_location_selected['Bus stop number'].unique())))
g.set_xticklabels(['Jurong West', 'Woodlands', 'Tampines'])

# x holds the bus stops and y the passenger volume, so label them accordingly.
ax.set_xlabel("Bus stop")
ax.set_ylabel("Bus passenger volume")
ax.set_title("Bus passenger volume by station by day type", weight="semibold", fontsize = 20)

# Retitle/relabel the legend seaborn already drew rather than rebuilding it with
# ax.legend(labels=...): the entries keep the box colour of their hue level and
# the text is derived from the DAY_TYPE values themselves, so it cannot be
# attached to the wrong colour. capitalize() turns e.g. 'WEEKDAY' into 'Weekday'.
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Day type')
    for entry in legend.get_texts():
        entry.set_text(entry.get_text().capitalize())
sns.despine(right = True)

plt.show()

- The same conclusion applies too to the net passenger volume in each individual stop.
- Weekday is the boxplot on the left of each stop and weekends/holidays is the one on the right.
- For all three bus stops, we can clearly see that the spread of passenger volume is indeed larger on the weekdays as compared to the weekends/holidays.
- Unlike the train stations, the medians for all the bus stop box plots are not the same.
- All the medians are about the same, at the zero mark, except for the median net bus passenger volume on the weekdays at Jurong West. The median is about 5000.

- However, for the three bus stops, the spreads of the passenger volume on the weekday and weekends/holidays are all not the same.
- For the weekdays:
    - The spread of passenger volume is about the same for Jurong West and Tampines. The spread for Woodlands is lesser than the both of them.
- For the weekends/holidays:
    - The spread of passenger volume is about the same for Woodlands and Tampines. The spread for Jurong West is slightly more than that of both of them.
- There aren't many outliers to be seen; there are only 10 of them.
- There are two outliers on the weekends/holidays in Jurong West, both above the upper whisker.
- There are three outliers, all above the upper whisker, on the weekends/holidays in Woodlands.
- There are five outliers, three above the upper whisker and two below the lower whisker, on the weekends/holidays in Tampines.
- It appears that there arent any outliers for all three bus stops on the weekdays.

#### Step 5.4: Why is traffic congestion an important issue?

- Traffic congestion is an important issue that must be resolved for it can bring about many negative impacts on society itself.
- These impacts include economic, social and environmental impacts.
- Economical impacts
    - Reduced productivity
    - Productivity refers to the rate at which goods or services are produced. It involves engaging in activities that bring about economic benefit.
    - Traffic congestion can reduce productivity since people have to spend more time commuting. This wastes time that could be spent on productive work, leading to an economic loss for the country as a whole.
- Social impacts
    - Increased strain on mental health
    - Long travelling times can lead to fatigue. This fatigue can sometimes lead to stress as well as frustration. On the roads, drivers and their passengers are more prone to losing their concentration or falling asleep, causing accidents and even deaths.
- Environmental impacts
    - Air and noise pollution
    - Idling vehicles stuck in traffic congestion can produce harmful exhaust fumes such as carbon dioxide, carbon monoxide and particulate matter. Some particulates are so small that they can penetrate deep into the lung tissues, causing inflammation. Long-term exposure to these fumes has been linked to several health problems, including asthma and lung diseases.
    - The rumbling engines of the many idling vehicles stuck in traffic congestion can cause harmful and excessive noise that can disrupt human and animal life.
- These negative impacts hence prove the point that traffic congestion is an important issue that must be resolved immediately.

Traffic congestion is such a serious issue! What can we do to prevent it? We will talk about some possible solutions in the next step.

#### Step 5.5: What changes can we implement to the public transport in Singapore to prevent traffic congestion?

- Timed to precision public transport
- Stacked work and school hours
- Expansion of rail networks

In [None]:
# Read the CSV file into the world_metro DataFrame.
world_metro = pd.read_csv('Datasets/japan transportation/dataclips_ssxbtmzqfqzgdsxhlibtetfutilz.csv')

In [None]:
from shapely import wkt

# Parse every WKT string in the 'geometry' column into a shapely geometry object.
world_metro['geometry'] = [wkt.loads(wkt_text) for wkt_text in world_metro['geometry']]

In [None]:
# Wrap the table in a GeoDataFrame and mark 'geometry' as its active geometry column.
world_metro = gpd.GeoDataFrame(world_metro, geometry='geometry')

In [None]:
# Declare the coordinates as EPSG:4326 in place; allow_override=True replaces
# any CRS that is already set on the frame.
world_metro.set_crs(epsg=4326, inplace=True, allow_override=True)

In [None]:
# Show the CRS currently attached to world_metro.
world_metro.crs

In [None]:
# Display the world_metro table as the cell output.
world_metro

In [None]:
# Scatter map of every point in world_metro.

# Read the Mapbox token with a context manager so the file handle is closed,
# and strip surrounding whitespace (e.g. a trailing newline in the token file).
with open(".mapbox_token") as token_file:
    mapbox_access_token = token_file.read().strip()

# Figures
fig = go.Figure()
fig.add_trace(
    go.Scattermapbox(
        lat=world_metro.geometry.y,
        lon=world_metro.geometry.x,
        name='Metro stations',
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=5,
            opacity=0.5,
            color='darkseagreen'
        ),
        # text carries two columns per point; the template indexes them as text[0] / text[1]
        hovertemplate=
        '<i>Latitude</i>: %{lat:.3f}'+
        '<br><i>Longitude</i>: %{lon:.3f}<br>' +
        '<b>%{text[0]}</b>' +
        '<br><b>%{text[1]}</b>',
        text=world_metro[['name', 'city_id']]
    )
)

# Figure layout
fig.update_layout(
    autosize=False,
    width=1400,
    height=700,
    hovermode='closest',
    mapbox_style="mapbox://styles/tarnishedhunter/clde0tvs1003s01mk55yaku0z",
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=35.6762,
            lon=139.6503
        ),
        pitch=0,
        zoom=1
    ),
    margin={"r":0,"t":0,"l":0,"b":0}, # remove the white gutter between the frame and map
    hoverlabel=dict(
        bgcolor="white", # white background
        font_size=16, # label font size
        font_family="Rockwell",
    )
)

fig.show()

In [None]:
# Load the Tokyo datasets: the two GeoJSON files are read with geopandas,
# the JSON file with pandas.
tokyo_sections = gpd.read_file('Datasets/japan transportation/tokyo_sections.geojson')
tokyo_stations = gpd.read_file('Datasets/japan transportation/tokyo_stations.geojson')
tokyo_lines = pd.read_json("Datasets/japan transportation/tokyo_lines_systems_and_modes.json")

In [None]:
# Render each of the three Tokyo tables in turn as cell output.
for frame in (tokyo_sections, tokyo_stations, tokyo_lines):
    display(frame)

In [None]:
import shapely.geometry
import wget


def _sections_to_path(geometries, section_ids):
    """Flatten line geometries into coordinate lists for a single map trace.

    Parameters:
        geometries: iterable of shapely geometries. LineString and
            MultiLineString items are used; anything else is skipped.
        section_ids: iterable of identifiers, parallel to `geometries`.

    Returns:
        (lats, lons, ids): three lists of equal length. A None entry follows
        every linestring so that consecutive linestrings are not joined by a
        connecting line when drawn in 'lines' mode.
    """
    lats, lons, ids = [], [], []
    for geometry, section_id in zip(geometries, section_ids):
        if isinstance(geometry, shapely.geometry.LineString):
            linestrings = [geometry]
        elif isinstance(geometry, shapely.geometry.MultiLineString):
            linestrings = geometry.geoms
        else:
            continue
        for linestring in linestrings:
            x, y = linestring.xy
            # Plain list growth instead of np.append, which copies the whole
            # array on every call.
            lats.extend(y)
            lons.extend(x)
            ids.extend([section_id] * len(y))
            lats.append(None)
            lons.append(None)
            ids.append(None)
    return lats, lons, ids


lats, lons, ids = _sections_to_path(tokyo_sections.geometry, tokyo_sections['id'])

# Scatter Map
# Read the Mapbox token with a context manager so the file handle is closed,
# and strip surrounding whitespace (e.g. a trailing newline in the token file).
with open(".mapbox_token") as token_file:
    mapbox_access_token = token_file.read().strip()

# Figures
fig = go.Figure()
# Trace 0: station points
fig.add_trace(
    go.Scattermapbox(
        lat=tokyo_stations.geometry.y,
        lon=tokyo_stations.geometry.x,
        name='Tokyo metro stations',
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=5,
            opacity=0.5,
            color='darkseagreen'
        ),
        hovertemplate=
        '<i>Latitude</i>: %{lat:.3f}'+
        '<br><i>Longitude</i>: %{lon:.3f}<br>' +
        '<b>%{text[0]}</b>' +
        '<br><b>%{text[1]}</b>',
        text=tokyo_stations[['name', 'lines']]
    )
)
# Trace 1: section geometries (named after what it plots, not 'Bus stops')
fig.add_trace(
    go.Scattermapbox(
        lat=lats,
        lon=lons,
        name='Tokyo metro sections',
        mode='markers+lines',
        marker=go.scattermapbox.Marker(
            size=3,
            opacity=0.5,
            color='darkseagreen'
        ),
        hovertemplate=
        '<i>Latitude</i>: %{lat:.3f}'+
        '<br><i>Longitude</i>: %{lon:.3f}<br>' +
        '<i>ID</i>: %{text}',
        text=ids
    )
)

# Create the buttons
# Each "visible" list has one entry per trace, in the order the traces were added
# (stations, sections).
dropdown_buttons = [
{'label': "ALL", 'method': "update", 'args': [{"visible": [True, True]}, {"title": "ALL"}]},
{'label': "Tokyo metro stations", 'method': "update", 'args': [{"visible": [True, False]}, {"title": "Tokyo metro stations"}]},
{'label': "Tokyo metro sections", 'method': "update", 'args': [{"visible": [False, True]}, {"title": "Tokyo metro sections"}]}
]

# Figure layout
fig.update_layout(
    autosize=False,
    width=1400,
    height=700,
    hovermode='closest',
    mapbox_style="mapbox://styles/tarnishedhunter/clde0tvs1003s01mk55yaku0z",
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=35.6762,
            lon=139.6503
        ),
        pitch=0,
        zoom=10
    ),
    margin={"r":0,"t":0,"l":0,"b":0}, # remove the white gutter between the frame and map
    hoverlabel=dict(
        bgcolor="white", # white background
        font_size=16, # label font size
        font_family="Rockwell",

    ),
    legend=dict(
        title="Colour Codes for Locations",
        entrywidth=0.3,
        orientation="h",
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
        font=dict(
            family="Rockwell",
            size=14,
            color="black"
        ),
        bgcolor="azure",
        bordercolor="Grey",
        borderwidth=2
    ),
    updatemenus=[
        dict(
            buttons=dropdown_buttons,
            direction="down",
            pad={"r": 0, "t": 0, "l":0,"b":0},
            showactive=True,
            x=1,
            xanchor="right",
            y=0.95,
            yanchor="top",
            font=dict(
                size=14,
                color="black"
            )
        ),
    ]
)
fig.show()