In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load datasets
demand_data = pd.read_csv("D:/Ullas/MSc Data Analytics Capstone Project/Datasets/Passenger Journeys by Public Transport.csv")
demographic_data = pd.read_csv("D:/Ullas/MSc Data Analytics Capstone Project/Datasets/Demographic.csv")
weather_data = pd.read_csv("D:/Ullas/MSc Data Analytics Capstone Project/Datasets/weather.csv")

In [3]:
demand_data.head()

Unnamed: 0,STATISTIC,Statistic Label,TLIST(A1),Year,C03935V04687,Mode of Transport,C01198V01436,Weeks of the year,UNIT,VALUE
0,THA24C01,Passenger Journeys,2019,2019,10,Dublin Metro Bus,1,Week 01,Number,1987891.0
1,THA24C01,Passenger Journeys,2019,2019,10,Dublin Metro Bus,2,Week 02,Number,2709579.0
2,THA24C01,Passenger Journeys,2019,2019,10,Dublin Metro Bus,3,Week 03,Number,2784678.0
3,THA24C01,Passenger Journeys,2019,2019,10,Dublin Metro Bus,4,Week 04,Number,2858346.0
4,THA24C01,Passenger Journeys,2019,2019,10,Dublin Metro Bus,5,Week 05,Number,2924821.0


In [4]:
demographic_data.head()

Unnamed: 0,STATISTIC,Statistic Label,TLIST(A1),Year,C02199V02655,Sex,C02076V02508,Age Group,UNIT,VALUE
0,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,350,18 - 24 years,%,
1,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,415,25 - 34 years,%,19.0
2,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,465,35 - 44 years,%,10.0
3,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,500,45 - 54 years,%,9.0
4,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,535,55 - 64 years,%,7.0


In [5]:
weather_data.head()

Unnamed: 0,date,ind,rain,ind.1,maxt,ind.2,mint,gmin,soil
0,01-Jan-41,0,2.4,,,,,,
1,02-Jan-41,0,0.9,,,,,,
2,03-Jan-41,0,0.0,,,,,,
3,04-Jan-41,0,0.0,,,,,,
4,05-Jan-41,0,0.0,,,,,,


In [6]:
# Displaying basic information about the datasets
print("Demand Data Info:")
print(demand_data.info())

print("\nDemographic Data Info:")
print(demographic_data.info())

print("\nWeather Data Info:")
print(weather_data.info())

# Summary statistics
print("\nDemand Data Summary Statistics:")
print(demand_data.describe())

print("\nDemographic Data Summary Statistics:")
print(demographic_data.describe())

print("\nWeather Data Summary Statistics:")
print(weather_data.describe())

Demand Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1060 entries, 0 to 1059
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   STATISTIC          1060 non-null   object 
 1   Statistic Label    1060 non-null   object 
 2   TLIST(A1)          1060 non-null   int64  
 3   Year               1060 non-null   int64  
 4   C03935V04687       1060 non-null   int64  
 5   Mode of Transport  1060 non-null   object 
 6   C01198V01436       1060 non-null   int64  
 7   Weeks of the year  1060 non-null   object 
 8   UNIT               1060 non-null   object 
 9   VALUE              868 non-null    float64
dtypes: float64(1), int64(4), object(5)
memory usage: 82.9+ KB
None

Demographic Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STATI

In [7]:
demand_data.columns

Index(['STATISTIC', 'Statistic Label', 'TLIST(A1)', 'Year', 'C03935V04687',
       'Mode of Transport', 'C01198V01436', 'Weeks of the year', 'UNIT',
       'VALUE'],
      dtype='object')

In [8]:
demographic_data.columns

Index(['STATISTIC', 'Statistic Label', 'TLIST(A1)', 'Year', 'C02199V02655',
       'Sex', 'C02076V02508', 'Age Group', 'UNIT', 'VALUE'],
      dtype='object')

In [9]:
weather_data.columns

Index(['date', 'ind', 'rain', 'ind.1', 'maxt', 'ind.2', 'mint', 'gmin',
       'soil'],
      dtype='object')

# Preparing the first dataset: demand_data

In [10]:
# Renaming columns
demand_data.rename(columns={'Weeks of the year': 'Weeks', 'VALUE': 'Demand'}, inplace=True)

In [11]:
demand_data.drop(['STATISTIC','TLIST(A1)','UNIT'], axis = 1, inplace = True)

In [12]:
#converting year and weeks columns to date column 
demand_data['Date'] = demand_data['Year'].astype(str) + '-W' + demand_data['Weeks'].str.slice(start=5).astype(int).apply(lambda x: f'{x:02}')
demand_data['Date'] = pd.to_datetime(demand_data['Date'] + '-1', format='%Y-W%U-%w')
demand_data['Date'] = demand_data['Date'].dt.strftime('%d/%m/%Y')

In [13]:
desired_order = ['Year', 'Weeks', 'Date', 'Mode of Transport', 'Statistic Label', 'Demand', 'C03935V04687', 'C01198V01436']

demand_data = demand_data.reindex(columns=desired_order)

#Dealing with the missing values on the dataset
demand_data.interpolate(method='linear', inplace=True)

#formating the Deamnd column to make it more simpler 
demand_data['Demand'] = demand_data['Demand'].apply(lambda x: "{:,.0f}".format(x))

# remove commas on Demand
demand_data['Demand'] = demand_data['Demand'].str.replace(',', '').astype(int)

In [14]:
demand_data.drop(['C03935V04687','C01198V01436'], axis = 1, inplace = True)

In [15]:
demand_data.columns

Index(['Year', 'Weeks', 'Date', 'Mode of Transport', 'Statistic Label',
       'Demand'],
      dtype='object')

In [16]:
demand_data.head()

Unnamed: 0,Year,Weeks,Date,Mode of Transport,Statistic Label,Demand
0,2019,Week 01,07/01/2019,Dublin Metro Bus,Passenger Journeys,1987891
1,2019,Week 02,14/01/2019,Dublin Metro Bus,Passenger Journeys,2709579
2,2019,Week 03,21/01/2019,Dublin Metro Bus,Passenger Journeys,2784678
3,2019,Week 04,28/01/2019,Dublin Metro Bus,Passenger Journeys,2858346
4,2019,Week 05,04/02/2019,Dublin Metro Bus,Passenger Journeys,2924821


In [17]:
demand_data['Date'] = pd.to_datetime(demand_data['Date'], format='%d/%m/%Y')

In [18]:
demand_data['Date'] = pd.to_datetime(demand_data['Date'])

In [19]:
unique_transport_modes = demand_data["Mode of Transport"].unique()

In [20]:
print(unique_transport_modes)

['Dublin Metro Bus' 'Bus, excluding Dublin Metro' 'Rail'
 'All public transport, excluding LUAS']


In [21]:
# Mapping the names in the column 
transport_mapping = {
    "Dublin Metro Bus": "DART", "Bus, excluding Dublin Metro": "Dublin Bus", "All public transport, excluding LUAS" : "LUAS" 
}

In [22]:
# Replace the old names with new names
demand_data["Mode of Transport"] = demand_data["Mode of Transport"].replace(transport_mapping)

In [23]:
pd.set_option('display.max_rows', None)
print(demand_data)

      Year    Weeks       Date Mode of Transport     Statistic Label   Demand
0     2019  Week 01 2019-01-07              DART  Passenger Journeys  1987891
1     2019  Week 02 2019-01-14              DART  Passenger Journeys  2709579
2     2019  Week 03 2019-01-21              DART  Passenger Journeys  2784678
3     2019  Week 04 2019-01-28              DART  Passenger Journeys  2858346
4     2019  Week 05 2019-02-04              DART  Passenger Journeys  2924821
5     2019  Week 06 2019-02-11              DART  Passenger Journeys  2979960
6     2019  Week 07 2019-02-18              DART  Passenger Journeys  3053710
7     2019  Week 08 2019-02-25              DART  Passenger Journeys  2927041
8     2019  Week 09 2019-03-04              DART  Passenger Journeys  2989171
9     2019  Week 10 2019-03-11              DART  Passenger Journeys  3065148
10    2019  Week 11 2019-03-18              DART  Passenger Journeys  2998440
11    2019  Week 12 2019-03-25              DART  Passenger Jour

In [24]:
total_lines = len(demand_data)
print("Total lines of data:", total_lines)

Total lines of data: 1060


# Preparing the Second dataset: demographic_data

In [25]:
demographic_data.head()

Unnamed: 0,STATISTIC,Statistic Label,TLIST(A1),Year,C02199V02655,Sex,C02076V02508,Age Group,UNIT,VALUE
0,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,350,18 - 24 years,%,
1,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,415,25 - 34 years,%,19.0
2,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,465,35 - 44 years,%,10.0
3,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,500,45 - 54 years,%,9.0
4,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,535,55 - 64 years,%,7.0


In [26]:
total_lines = len(demographic_data)
print("Total lines of data:", total_lines)

Total lines of data: 112


In [27]:
unique_Statistic_label = demographic_data["Statistic Label"].unique()

In [28]:
print(unique_Statistic_label)

['Frequency of use of bus services - Weekly or more than weekly'
 'Frequency of use of bus services - Less than weekly'
 'Frequency of use of bus services - Never'
 'Frequency of use of bus services - of which had COVID concerns'
 'Frequency of use of rail services - Weekly or more than weekly'
 'Frequency of use of rail services - Less than weekly'
 'Frequency of use of rail services - Never'
 'Frequency of use of rail services - of which had COVID concerns']


In [29]:
# Renaming columns
demographic_data.rename(columns={'Sex': 'Gender'}, inplace=True)

In [30]:
demographic_data.drop(['STATISTIC','TLIST(A1)','C02199V02655','C02076V02508','UNIT','Statistic Label','VALUE'], axis = 1, inplace = True)

In [31]:
demographic_data.head()

Unnamed: 0,Year,Gender,Age Group
0,2021,Male,18 - 24 years
1,2021,Male,25 - 34 years
2,2021,Male,35 - 44 years
3,2021,Male,45 - 54 years
4,2021,Male,55 - 64 years


In [32]:
# Forward fill and Backward fill
demographic_data.fillna(method='ffill', inplace=True)  
demographic_data.fillna(method='bfill', inplace=True) 

In [33]:
demographic_data.interpolate(method='linear', inplace=True)

In [34]:
pd.set_option('display.max_rows', None)
print(demographic_data)

     Year  Gender          Age Group
0    2021    Male      18 - 24 years
1    2021    Male      25 - 34 years
2    2021    Male      35 - 44 years
3    2021    Male      45 - 54 years
4    2021    Male      55 - 64 years
5    2021    Male      65 - 74 years
6    2021    Male  75 years and over
7    2021  Female      18 - 24 years
8    2021  Female      25 - 34 years
9    2021  Female      35 - 44 years
10   2021  Female      45 - 54 years
11   2021  Female      55 - 64 years
12   2021  Female      65 - 74 years
13   2021  Female  75 years and over
14   2021    Male      18 - 24 years
15   2021    Male      25 - 34 years
16   2021    Male      35 - 44 years
17   2021    Male      45 - 54 years
18   2021    Male      55 - 64 years
19   2021    Male      65 - 74 years
20   2021    Male  75 years and over
21   2021  Female      18 - 24 years
22   2021  Female      25 - 34 years
23   2021  Female      35 - 44 years
24   2021  Female      45 - 54 years
25   2021  Female      55 - 64 years
2

In [35]:
demographic_data.columns

Index(['Year', 'Gender', 'Age Group'], dtype='object')

In [36]:
total_lines = len(demographic_data)
print("Total lines of data:", total_lines)

Total lines of data: 112


# Preparing the Third dataset: weather_data

In [38]:
weather_data.head()

Unnamed: 0,date,ind,rain,ind.1,maxt,ind.2,mint,gmin,soil
0,01-Jan-41,0,2.4,,,,,,
1,02-Jan-41,0,0.9,,,,,,
2,03-Jan-41,0,0.0,,,,,,
3,04-Jan-41,0,0.0,,,,,,
4,05-Jan-41,0,0.0,,,,,,


In [39]:
# Renaming columns
weather_data.rename(columns={'ind': 'Indicator', 'rain': 'Rain(mm)','maxt':'Temperature(c)','soil':'Soil'}, inplace=True)

In [40]:
weather_data.drop(['ind.1','ind.2','mint','gmin','Soil','Indicator'], axis = 1, inplace = True)

In [41]:
weather_data.columns

Index(['date', 'Rain(mm)', 'Temperature(c)'], dtype='object')

In [42]:
# Converting date column to datetime format
weather_data['date'] = pd.to_datetime(weather_data['date'], format='%d-%b-%y')

In [43]:
start_date = pd.to_datetime('1941-01-01')
end_date = pd.to_datetime('2023-05-31')

In [44]:
weather_data = weather_data[(weather_data['date'] >= start_date) & (weather_data['date'] <= end_date)]

In [45]:
total_lines = len(weather_data)
print("Total lines of data:", total_lines)

Total lines of data: 26782


In [46]:
for column in ["Rain(mm)", "Temperature(c)"]:
    weather_data[column] = weather_data.groupby('date')[column].transform(
        lambda x: x.fillna(np.random.choice(x.dropna()))
    )

In [47]:
pd.set_option('display.max_rows', None)
print(weather_data)

            date  Rain(mm) Temperature(c)
11257 1969-01-01       0.0               
11258 1969-01-01       0.0            6.6
11259 1969-01-02       0.2               
11260 1969-01-02       0.2            8.5
11261 1969-01-03       0.0               
11262 1969-01-03       0.0            7.9
11263 1969-01-04       0.2            8.4
11264 1969-01-04       0.2               
11265 1969-01-05       0.0            4.1
11266 1969-01-05       0.0               
11267 1969-01-06      10.8               
11268 1969-01-06      10.8            4.6
11269 1969-01-07      13.8               
11270 1969-01-07      13.8            6.6
11271 1969-01-08       0.0            6.8
11272 1969-01-08       0.0               
11273 1969-01-09       1.8               
11274 1969-01-09       1.8            8.5
11275 1969-01-10       1.0               
11276 1969-01-10       1.0            7.4
11277 1969-01-11       5.0               
11278 1969-01-11       5.0            7.8
11279 1969-01-12       7.4        

In [48]:
weather_data.columns

Index(['date', 'Rain(mm)', 'Temperature(c)'], dtype='object')

# Inclusive Preprocessing Methodology

In [49]:
demand_data.head()

Unnamed: 0,Year,Weeks,Date,Mode of Transport,Statistic Label,Demand
0,2019,Week 01,2019-01-07,DART,Passenger Journeys,1987891
1,2019,Week 02,2019-01-14,DART,Passenger Journeys,2709579
2,2019,Week 03,2019-01-21,DART,Passenger Journeys,2784678
3,2019,Week 04,2019-01-28,DART,Passenger Journeys,2858346
4,2019,Week 05,2019-02-04,DART,Passenger Journeys,2924821


In [50]:
demographic_data.head()

Unnamed: 0,Year,Gender,Age Group
0,2021,Male,18 - 24 years
1,2021,Male,25 - 34 years
2,2021,Male,35 - 44 years
3,2021,Male,45 - 54 years
4,2021,Male,55 - 64 years


In [51]:
weather_data.head()

Unnamed: 0,date,Rain(mm),Temperature(c)
11257,1969-01-01,0.0,
11258,1969-01-01,0.0,6.6
11259,1969-01-02,0.2,
11260,1969-01-02,0.2,8.5
11261,1969-01-03,0.0,


In [52]:
# Checking indexing
print(demand_data.index)
print(demographic_data.index)
print(weather_data.index)

RangeIndex(start=0, stop=1060, step=1)
RangeIndex(start=0, stop=112, step=1)
Int64Index([11257, 11258, 11259, 11260, 11261, 11262, 11263, 11264, 11265,
            11266,
            ...
            38029, 38030, 38031, 38032, 38033, 38034, 38035, 38036, 38037,
            38038],
           dtype='int64', length=26782)


In [53]:
# Resetting indexing
demand_data = demand_data.reset_index()
demographic_data = demographic_data.reset_index()
weather_data = weather_data.reset_index()

In [54]:
# Checking indexing
print(demand_data.index)
print(demographic_data.index)
print(weather_data.index)

RangeIndex(start=0, stop=1060, step=1)
RangeIndex(start=0, stop=112, step=1)
RangeIndex(start=0, stop=26782, step=1)
