# Starting with comparitive analysis and all main functions

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load datasets
demand_data = pd.read_csv("D:/Ullas/MSc Data Analytics Capstone Project/Datasets/Passenger Journeys by Public Transport.csv")
demographic_data = pd.read_csv("D:/Ullas/MSc Data Analytics Capstone Project/Datasets/Demographic.csv")
weather_data = pd.read_csv("D:/Ullas/MSc Data Analytics Capstone Project/Datasets/weather.csv")

In [3]:
demand_data.head()

Unnamed: 0,STATISTIC,Statistic Label,TLIST(A1),Year,C03935V04687,Mode of Transport,C01198V01436,Weeks of the year,UNIT,VALUE
0,THA24C01,Passenger Journeys,2019,2019,10,Dublin Metro Bus,1,Week 01,Number,1987891.0
1,THA24C01,Passenger Journeys,2019,2019,10,Dublin Metro Bus,2,Week 02,Number,2709579.0
2,THA24C01,Passenger Journeys,2019,2019,10,Dublin Metro Bus,3,Week 03,Number,2784678.0
3,THA24C01,Passenger Journeys,2019,2019,10,Dublin Metro Bus,4,Week 04,Number,2858346.0
4,THA24C01,Passenger Journeys,2019,2019,10,Dublin Metro Bus,5,Week 05,Number,2924821.0


In [4]:
demographic_data.head()

Unnamed: 0,STATISTIC,Statistic Label,TLIST(A1),Year,C02199V02655,Sex,C02076V02508,Age Group,UNIT,VALUE
0,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,350,18 - 24 years,%,
1,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,415,25 - 34 years,%,19.0
2,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,465,35 - 44 years,%,10.0
3,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,500,45 - 54 years,%,9.0
4,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,535,55 - 64 years,%,7.0


In [5]:
weather_data.head()

Unnamed: 0,date,ind,rain,ind.1,maxt,ind.2,mint,gmin,soil
0,01-Jan-41,0,2.4,,,,,,
1,02-Jan-41,0,0.9,,,,,,
2,03-Jan-41,0,0.0,,,,,,
3,04-Jan-41,0,0.0,,,,,,
4,05-Jan-41,0,0.0,,,,,,


In [6]:
# Displaying basic information about the datasets
print("Demand Data Info:")
print(demand_data.info())

print("\nDemographic Data Info:")
print(demographic_data.info())

print("\nWeather Data Info:")
print(weather_data.info())

# Summary statistics
print("\nDemand Data Summary Statistics:")
print(demand_data.describe())

print("\nDemographic Data Summary Statistics:")
print(demographic_data.describe())

print("\nWeather Data Summary Statistics:")
print(weather_data.describe())

Demand Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1060 entries, 0 to 1059
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   STATISTIC          1060 non-null   object 
 1   Statistic Label    1060 non-null   object 
 2   TLIST(A1)          1060 non-null   int64  
 3   Year               1060 non-null   int64  
 4   C03935V04687       1060 non-null   int64  
 5   Mode of Transport  1060 non-null   object 
 6   C01198V01436       1060 non-null   int64  
 7   Weeks of the year  1060 non-null   object 
 8   UNIT               1060 non-null   object 
 9   VALUE              868 non-null    float64
dtypes: float64(1), int64(4), object(5)
memory usage: 82.9+ KB
None

Demographic Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   STATI

In [7]:
demand_data.columns

Index(['STATISTIC', 'Statistic Label', 'TLIST(A1)', 'Year', 'C03935V04687',
       'Mode of Transport', 'C01198V01436', 'Weeks of the year', 'UNIT',
       'VALUE'],
      dtype='object')

In [8]:
demographic_data.columns

Index(['STATISTIC', 'Statistic Label', 'TLIST(A1)', 'Year', 'C02199V02655',
       'Sex', 'C02076V02508', 'Age Group', 'UNIT', 'VALUE'],
      dtype='object')

In [9]:
weather_data.columns

Index(['date', 'ind', 'rain', 'ind.1', 'maxt', 'ind.2', 'mint', 'gmin',
       'soil'],
      dtype='object')

# Preparing the first dataset: demand_data

In [10]:
# Renaming columns
demand_data.rename(columns={'Weeks of the year': 'Weeks', 'VALUE': 'Demand'}, inplace=True)

In [11]:
demand_data.drop(['STATISTIC','TLIST(A1)','UNIT'], axis = 1, inplace = True)

In [12]:
#converting year and weeks columns to date column 
demand_data['Date'] = demand_data['Year'].astype(str) + '-W' + demand_data['Weeks'].str.slice(start=5).astype(int).apply(lambda x: f'{x:02}')
demand_data['Date'] = pd.to_datetime(demand_data['Date'] + '-1', format='%Y-W%U-%w')
demand_data['Date'] = demand_data['Date'].dt.strftime('%d/%m/%Y')

In [13]:
desired_order = ['Year', 'Weeks', 'Date', 'Mode of Transport', 'Statistic Label', 'Demand', 'C03935V04687', 'C01198V01436']

demand_data = demand_data.reindex(columns=desired_order)

#Dealing with the missing values on the dataset
demand_data.interpolate(method='linear', inplace=True)

#formating the Deamnd column to make it more simpler 
demand_data['Demand'] = demand_data['Demand'].apply(lambda x: "{:,.0f}".format(x))

# remove commas on Demand
demand_data['Demand'] = demand_data['Demand'].str.replace(',', '').astype(int)
# Convert Date column to datetime
demand_data['Date'] = pd.to_datetime(demand_data['Date'])

  demand_data['Date'] = pd.to_datetime(demand_data['Date'])


In [14]:
demand_data.drop(['C03935V04687','C01198V01436'], axis = 1, inplace = True)

In [15]:
demand_data.columns

Index(['Year', 'Weeks', 'Date', 'Mode of Transport', 'Statistic Label',
       'Demand'],
      dtype='object')

In [16]:
demand_data.head()

Unnamed: 0,Year,Weeks,Date,Mode of Transport,Statistic Label,Demand
0,2019,Week 01,2019-07-01,Dublin Metro Bus,Passenger Journeys,1987891
1,2019,Week 02,2019-01-14,Dublin Metro Bus,Passenger Journeys,2709579
2,2019,Week 03,2019-01-21,Dublin Metro Bus,Passenger Journeys,2784678
3,2019,Week 04,2019-01-28,Dublin Metro Bus,Passenger Journeys,2858346
4,2019,Week 05,2019-04-02,Dublin Metro Bus,Passenger Journeys,2924821


In [17]:
demand_data = demand_data[demand_data['Year'] == 2021]

In [18]:
demand_data.head()

Unnamed: 0,Year,Weeks,Date,Mode of Transport,Statistic Label,Demand
424,2021,Week 01,2021-04-01,Dublin Metro Bus,Passenger Journeys,747912
425,2021,Week 02,2021-11-01,Dublin Metro Bus,Passenger Journeys,735000
426,2021,Week 03,2021-01-18,Dublin Metro Bus,Passenger Journeys,731273
427,2021,Week 04,2021-01-25,Dublin Metro Bus,Passenger Journeys,742813
428,2021,Week 05,2021-01-02,Dublin Metro Bus,Passenger Journeys,791189


In [19]:
total_lines = len(demand_data)
print("Total lines of data:", total_lines)

Total lines of data: 212


# Preparing the second dataset: demographic_data

In [20]:
demographic_data.head()

Unnamed: 0,STATISTIC,Statistic Label,TLIST(A1),Year,C02199V02655,Sex,C02076V02508,Age Group,UNIT,VALUE
0,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,350,18 - 24 years,%,
1,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,415,25 - 34 years,%,19.0
2,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,465,35 - 44 years,%,10.0
3,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,500,45 - 54 years,%,9.0
4,NTA105C01,Frequency of use of bus services - Weekly or m...,2021,2021,1,Male,535,55 - 64 years,%,7.0


In [21]:
# Renaming columns
demographic_data.rename(columns={'VALUE': 'Demand %'}, inplace=True)

In [22]:
demographic_data.drop(['STATISTIC','TLIST(A1)','C02199V02655','C02076V02508','UNIT'], axis = 1, inplace = True)

In [23]:
demographic_data.head()

Unnamed: 0,Statistic Label,Year,Sex,Age Group,Demand %
0,Frequency of use of bus services - Weekly or m...,2021,Male,18 - 24 years,
1,Frequency of use of bus services - Weekly or m...,2021,Male,25 - 34 years,19.0
2,Frequency of use of bus services - Weekly or m...,2021,Male,35 - 44 years,10.0
3,Frequency of use of bus services - Weekly or m...,2021,Male,45 - 54 years,9.0
4,Frequency of use of bus services - Weekly or m...,2021,Male,55 - 64 years,7.0


In [24]:
mean_value = demographic_data['Demand %'].mean()
demographic_data['Demand %'].fillna(mean_value, inplace=True)

In [25]:
# Forward fill and Backward fill
demographic_data.fillna(method='ffill', inplace=True)  
demographic_data.fillna(method='bfill', inplace=True)  

In [26]:
demographic_data.interpolate(method='linear', inplace=True)

In [27]:
demographic_data.head()

Unnamed: 0,Statistic Label,Year,Sex,Age Group,Demand %
0,Frequency of use of bus services - Weekly or m...,2021,Male,18 - 24 years,28.59375
1,Frequency of use of bus services - Weekly or m...,2021,Male,25 - 34 years,19.0
2,Frequency of use of bus services - Weekly or m...,2021,Male,35 - 44 years,10.0
3,Frequency of use of bus services - Weekly or m...,2021,Male,45 - 54 years,9.0
4,Frequency of use of bus services - Weekly or m...,2021,Male,55 - 64 years,7.0


In [28]:
demographic_data.columns

Index(['Statistic Label', 'Year', 'Sex', 'Age Group', 'Demand %'], dtype='object')

In [29]:
total_lines = len(demographic_data)
print("Total lines of data:", total_lines)

Total lines of data: 112


# Preparing the Third dataset: weather_data

In [30]:
weather_data.head()

Unnamed: 0,date,ind,rain,ind.1,maxt,ind.2,mint,gmin,soil
0,01-Jan-41,0,2.4,,,,,,
1,02-Jan-41,0,0.9,,,,,,
2,03-Jan-41,0,0.0,,,,,,
3,04-Jan-41,0,0.0,,,,,,
4,05-Jan-41,0,0.0,,,,,,


In [31]:
# Renaming columns
weather_data.rename(columns={'ind': 'Indicator', 'rain': 'Rain','maxt':'Temperature(c)','soil':'Soil'}, inplace=True)

In [32]:
weather_data.drop(['ind.1','ind.2','mint','gmin'], axis = 1, inplace = True)

In [33]:
weather_data.columns

Index(['date', 'Indicator', 'Rain', 'Temperature(c)', 'Soil'], dtype='object')

In [34]:
# Converting date column to datetime format
weather_data['date'] = pd.to_datetime(weather_data['date'], format='%d-%b-%y')
# Only taking data of year 2021
weather_data = weather_data[weather_data['date'].dt.year == 2021]

In [35]:
weather_data.head()

Unnamed: 0,date,Indicator,Rain,Temperature(c),Soil
37158,2021-01-01,0,0.4,6.1,
37159,2021-01-02,4,0.0,4.5,
37160,2021-01-03,0,4.4,7.0,
37161,2021-01-04,0,1.0,6.6,
37162,2021-01-05,0,4.9,5.6,


In [36]:
total_lines = len(weather_data)
print("Total lines of data:", total_lines)

Total lines of data: 365


In [37]:
# filling with values on Soil column 
random_soil_values = np.random.rand(len(weather_data)) * 10 
weather_data['Soil'] = random_soil_values

In [38]:
# filling missing values
weather_data['Rain'].interpolate(method='linear', inplace=True)
weather_data['Temperature(c)'].fillna(method='ffill', inplace=True)

In [39]:
weather_data.head()

Unnamed: 0,date,Indicator,Rain,Temperature(c),Soil
37158,2021-01-01,0,0.4,6.1,4.481144
37159,2021-01-02,4,0.0,4.5,6.824264
37160,2021-01-03,0,4.4,7.0,2.214276
37161,2021-01-04,0,1.0,6.6,0.191226
37162,2021-01-05,0,4.9,5.6,7.090872


In [40]:
weather_data.columns

Index(['date', 'Indicator', 'Rain', 'Temperature(c)', 'Soil'], dtype='object')

In [41]:
weather_data.head()

Unnamed: 0,date,Indicator,Rain,Temperature(c),Soil
37158,2021-01-01,0,0.4,6.1,4.481144
37159,2021-01-02,4,0.0,4.5,6.824264
37160,2021-01-03,0,4.4,7.0,2.214276
37161,2021-01-04,0,1.0,6.6,0.191226
37162,2021-01-05,0,4.9,5.6,7.090872


# Visulisation for the first dataset:demand_data

In [42]:
demand_data.head()

Unnamed: 0,Year,Weeks,Date,Mode of Transport,Statistic Label,Demand
424,2021,Week 01,2021-04-01,Dublin Metro Bus,Passenger Journeys,747912
425,2021,Week 02,2021-11-01,Dublin Metro Bus,Passenger Journeys,735000
426,2021,Week 03,2021-01-18,Dublin Metro Bus,Passenger Journeys,731273
427,2021,Week 04,2021-01-25,Dublin Metro Bus,Passenger Journeys,742813
428,2021,Week 05,2021-01-02,Dublin Metro Bus,Passenger Journeys,791189


In [43]:
demand_data['Date'] = pd.to_datetime(demand_data['Date'])

In [44]:
demand_data.set_index('Date', inplace=True)

In [45]:
# Plotting demand over time
plt.figure(figsize=(10, 6))
plt.plot(demand_data['Demand'], label='Demand', color='blue')
plt.xlabel('Date')
plt.ylabel('Demand')
plt.title('Demand Over Time')
plt.legend()
plt.grid(True)
plt.show()

NameError: name 'plt' is not defined

In [None]:
# Plotting demand distribution by mode of transport
plt.figure(figsize=(10, 6))
demand_data.groupby('Mode of Transport')['Demand'].sum().plot(kind='bar', color='green')
plt.xlabel('Mode of Transport')
plt.ylabel('Total Demand')
plt.title('Total Demand by Mode of Transport')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()

In [None]:
# Plotting demand trend by weeks
weekly_demand = demand_data.groupby('Weeks')['Demand'].sum()
plt.figure(figsize=(10, 6))
plt.plot(weekly_demand, marker='o', color='orange')
plt.xlabel('Weeks')
plt.ylabel('Total Demand')
plt.title('Weekly Demand Trend')
plt.grid(True)
plt.show()

# Visulisation for the second dataset:demographic_data

In [None]:
demographic_data.head()

In [None]:
demographic_data.head()

In [None]:
demographic_data.set_index('Year', inplace=True)

In [None]:
demographic_data.head()

In [None]:
# Plotting Age Group Distribution
plt.figure(figsize=(10, 6))
demographic_data.groupby('Age Group')['Demand %'].mean().plot(kind='bar', color='purple')
plt.xlabel('Age Group')
plt.ylabel('Average Demand %')
plt.title('Average Demand % by Age Group')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.show()

In [None]:
# Plotting Sex Distribution
plt.figure(figsize=(10, 6))
demographic_data.groupby('Sex')['Demand %'].mean().plot(kind='pie', autopct='%1.1f%%', colors=['lightcoral', 'lightblue'])
plt.title('Demand % Distribution by Sex')
plt.ylabel('')
plt.show()

In [None]:
# Plotting Statistic Label Trend over Time
statistic_labels = demographic_data['Statistic Label'].unique()
plt.figure(figsize=(12, 8))
for label in statistic_labels:
    label_data = demographic_data[demographic_data['Statistic Label'] == label]
    plt.plot(label_data.index, label_data['Demand %'], label=label)
plt.xlabel('Year')
plt.ylabel('Demand %')
plt.title('Demand % Trend for Different Statistic Labels')
plt.legend()
plt.grid(True)
plt.show()

# Visulisation for the third dataset:weather_data

In [None]:
weather_data['date'] = pd.to_datetime(weather_data['date'])

In [None]:
weather_data.set_index('date', inplace=True)

In [None]:
# Plotting Rain over Time
plt.figure(figsize=(10, 6))
plt.plot(weather_data['Rain'], label='Rain', color='blue')
plt.xlabel('Date')
plt.ylabel('Rain')
plt.title('Rain Over Time')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Plotting Temperature Trend over Time
plt.figure(figsize=(15, 22))
plt.plot(weather_data['Temperature(c)'], label='Temperature', color='red')
plt.xlabel('Date')
plt.ylabel('Temperature (°C)')
plt.title('Temperature Trend Over Time')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Plotting Soil Distribution
plt.figure(figsize=(10, 6))
weather_data['Soil'].hist(bins=20, color='green')
plt.xlabel('Soil')
plt.ylabel('Frequency')
plt.title('Soil Distribution')
plt.grid(True)
plt.show()

# Inclusive Preprocessing Methodology

In [None]:
demand_data.head()

In [None]:
demographic_data.head()

In [None]:
weather_data.head()

In [None]:
# Checking indexing
print(demand_data.index)
print(demographic_data.index)
print(weather_data.index)

In [None]:
# Resetting indexing
demand_data = demand_data.reset_index()
demographic_data = demographic_data.reset_index()
weather_data = weather_data.reset_index()

In [None]:
demand_data.head()

In [None]:
demographic_data.head()

In [None]:
weather_data.head()

In [None]:
print(demand_data.head())
print(demographic_data.head())
print(weather_data.head())

In [None]:
# Renamming the columns in demographic_data to match demand_data for merging
demographic_data.rename(columns={'Statistic Label': 'Mode of Transport'}, inplace=True)

In [None]:
# merging the datasets
merged_data = pd.concat([demand_data, demographic_data, weather_data], ignore_index=True)

In [None]:
# Sorting the concatenated_data by the Date column
merged_data.sort_values(by='Date', inplace=True)

In [None]:
# Resetting the index of the merged data
merged_data.reset_index(drop=True, inplace=True)

In [None]:
merged_data

In [None]:
# Dropping the date column to make it easier
merged_data.drop(['date'], axis = 1, inplace = True)

In [None]:
merged_data

In [None]:
# Dealing with missing values with random values from the same column 
for column in merged_data.columns:
    non_null_values = merged_data[column].dropna()
    num_missing_values = merged_data[column].isnull().sum()
    if num_missing_values > 0:
        random_fill_values = np.random.choice(non_null_values, num_missing_values)
        merged_data[column][merged_data[column].isnull()] = random_fill_values

In [None]:
# Converting the date colum to datetime
merged_data['Date'] = pd.to_datetime(merged_data['Date'])

In [None]:
# Extracting Weeks number to arrange according to Date column 
merged_data['Weeks'] = merged_data['Date'].dt.strftime('%U')

In [None]:
# Sorting the dataset according to the Date column 
merged_data.sort_values(by='Date', inplace=True)

In [None]:
# Resetting the index
merged_data.reset_index(drop=True, inplace=True)

In [None]:
merged_data.head()

# Detecting and addressing outliers that might affect the model's performance.

In [None]:
# Specifying columns to consider for outlier detection
numerical_columns = ['Demand', 'Demand %','Indicator','Rain', 'Temperature(c)', 'Soil']

In [None]:
# Calculating z-scores for each numerical column
z_scores = np.abs((merged_data[numerical_columns] - merged_data[numerical_columns].mean()) / merged_data[numerical_columns].std())

In [None]:
print("Z-Scores:")
print(z_scores)

In [None]:
# Setting a threshold for outlier detection
outlier_threshold = 3

In [None]:
# Creating a mask for outliers
outlier_mask = z_scores > outlier_threshold

In [None]:
# Dropping rows with outliers
merged_data = merged_data[~outlier_mask.any(axis=1)]

In [None]:
merged_data.head()

# Standardising numerical features to ensure consistent scaling.

In [None]:
# Specifying columns to Standadise
numerical_columns = ['Demand', 'Demand %','Indicator','Rain', 'Temperature(c)', 'Soil']

In [None]:
scaler = MinMaxScaler()

In [None]:
# Fitting the scaler on the data and transforming the selected columns
merged_data[numerical_columns] = scaler.fit_transform(merged_data[numerical_columns])

In [None]:
merged_data.head()

# Encoding categorical variables using one-hot encoding and label Encoder

In [None]:
# Specifying columns to encode
categorical_columns = ['Mode of Transport','Statistic Label', 'Sex', 'Age Group','Soil','Demand %','Temperature(c)']

In [None]:
# Perform one-hot encoding method
one_hot_encoded = pd.get_dummies(merged_data, columns=categorical_columns, drop_first=True)

In [None]:
# Encode categorical columns
label_encoder = LabelEncoder()
for col in categorical_columns:
    merged_data[col] = label_encoder.fit_transform(merged_data[col])

In [None]:
merged_data

# Perform feature selection to identify the most relevant features for the predictive models of my dataset

# Correlation Analysis:

In [None]:
# Calculating correlations
correlation_matrix = merged_data.corr()

In [None]:
# Sorting features by their correlation with the target variable
correlation_with_target = correlation_matrix['Demand'].abs().sort_values(ascending=False)

In [None]:
# Selecting top N relevant features
top_features = correlation_with_target[1:6].index.tolist()

# Feature Importance from Models:

In [None]:
# Separating features and target
X = merged_data.drop(columns=['Demand'])
y = merged_data['Demand']

In [None]:
# Encoding categorical variables using one-hot encoding
X_encoded = pd.get_dummies(X, columns=['Mode of Transport','Statistic Label', 'Sex', 'Age Group'])

In [None]:
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [None]:
print(X_train.dtypes)

In [None]:
X_train['Year'] = X_train['Date'].dt.year
X_train['Month'] = X_train['Date'].dt.month
X_train['Day'] = X_train['Date'].dt.day
X_train.drop(columns=['Date'], inplace=True)

In [None]:
# Trainning a RandomForestRegressor model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Getting the feature importance scores
feature_importances = model.feature_importances_

In [None]:
# Getting the columns from X_train after one-hot encoding
encoded_columns = X_train.columns

In [None]:
# Creating a DataFrame for feature importances using the correct columns
feature_importance_df = pd.DataFrame({'Feature': encoded_columns, 'Importance': feature_importances})

In [None]:
# Sortting the features by importance
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

In [None]:
# Plotting feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Feature Importance from Model')
plt.show()

# Exploring different ML models for our datset

In [None]:
# Converting Date column to datetime type
merged_data['Date'] = pd.to_datetime(merged_data['Date'])

In [None]:
# Calculating the number of days since one reference date
reference_date = merged_data['Date'].min()
merged_data['Days_Since_Reference'] = (merged_data['Date'] - reference_date).dt.days

In [None]:
# Defining features and target variable
X = merged_data.drop(['Demand', 'Date'], axis=1)
y = merged_data['Demand']

# Linear Regression:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
# Making predictions
predictions = model.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, predictions)
print("Linear Regression MSE:", mse)

In [None]:
# Scatter plot for actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, predictions, color='blue')
plt.xlabel('Actual Demand')
plt.ylabel('Predicted Demand')
plt.title('Actual vs. Predicted Demand')
plt.grid(True)
plt.show()

# Decision Trees:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = DecisionTreeRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, predictions)
print("Decision Tree MSE:", mse)

In [None]:
# Scatter plot for actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, predictions, color='green')
plt.xlabel('Actual Demand')
plt.ylabel('Predicted Demand')
plt.title('Actual vs. Predicted Demand (Decision Tree)')
plt.grid(True)
plt.show()

# Random Forest:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, predictions)
print("Random Forest MSE:", mse)

In [None]:
# Scatter plot for actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, predictions, color='blue')
plt.xlabel('Actual Demand')
plt.ylabel('Predicted Demand')
plt.title('Actual vs. Predicted Demand (Random Forest)')
plt.grid(True)
plt.show()

# Neural Networks (using Keras):

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(1, activation='linear'))

In [None]:
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train_scaled, y_train, epochs=100, batch_size=32)

In [None]:
predictions = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, predictions)
print("Neural Network MSE:", mse)

In [None]:
# Scatter plot for actual vs predicted values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, predictions, color='blue')
plt.xlabel('Actual Demand')
plt.ylabel('Predicted Demand')
plt.title('Actual vs. Predicted Demand (Neural Network)')
plt.grid(True)
plt.show()

# Investigating time series analysis methods, including autoregressive integrated moving average (ARIMA),seasonal decomposition of time series (STL) and long short-term memory networks (LSTM)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import LSTM, Dense
from statsmodels.tsa.arima.model import ARIMA 

In [None]:
# Converting Date column to datetime type
merged_data['Date'] = pd.to_datetime(merged_data['Date'])

In [None]:
# Setting Date as the index
merged_data.set_index('Date', inplace=True)

In [None]:
resampled_data = merged_data.resample('W').sum()

In [None]:
# Plotting the time series data
plt.figure(figsize=(12, 6))
plt.plot(resampled_data['Demand'])
plt.title('Time Series Plot of Demand')
plt.xlabel('Date')
plt.ylabel('Demand')
plt.show()

# STL

In [None]:
# Seasonal Decomposition using STL
result = seasonal_decompose(resampled_data['Demand'], model='additive', period=26)

In [None]:
# Plotting the decomposition components
plt.figure(figsize=(12, 8))
plt.subplot(4, 1, 1)
plt.plot(result.observed)
plt.title('Observed')
plt.subplot(4, 1, 2)
plt.plot(result.trend)
plt.title('Trend')
plt.subplot(4, 1, 3)
plt.plot(result.seasonal)
plt.title('Seasonal')
plt.subplot(4, 1, 4)
plt.plot(result.resid)
plt.title('Residual')
plt.tight_layout()
plt.show()

# ARIMA

In [None]:
# ARIMA Modeling using auto arima to find optimal parameters
stepwise_fit = auto_arima(resampled_data['Demand'], seasonal=True, m=52, trace=True)

In [None]:
# Fitting the ARIMA model
order = stepwise_fit.get_params()['order']
seasonal_order = stepwise_fit.get_params()['seasonal_order']
model = ARIMA(resampled_data['Demand'], order=order, seasonal_order=seasonal_order)
arima_result = model.fit()

In [None]:
# Plotting ARIMA forecast
plt.figure(figsize=(12, 6))
plt.plot(resampled_data.index, resampled_data['Demand'], label='Observed')
plt.plot(resampled_data.index, arima_result.predict(typ='levels'), color='red', label='ARIMA Forecast')
plt.title('ARIMA Forecast')
plt.xlabel('Date')
plt.ylabel('Demand')
plt.legend()
plt.show()

# LSTM

In [None]:
# Prepare data for LSTM
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(np.array(resampled_data['Demand']).reshape(-1, 1))

In [None]:
# Split data into train and test sets
train_size = int(len(scaled_data) * 0.8)
train_data, test_data = scaled_data[:train_size], scaled_data[train_size:]

In [None]:
# Creating sequences for LSTM
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X), np.array(y)

seq_length = 10
X_train, y_train = create_sequences(train_data, seq_length)
X_test, y_test = create_sequences(test_data, seq_length)

In [None]:
# Build LSTM model
model_lstm = Sequential()
model_lstm.add(LSTM(50, activation='relu', input_shape=(seq_length, 1)))
model_lstm.add(Dense(1))
model_lstm.compile(optimizer='adam', loss='mse')

In [None]:
# Trainning LSTM model
model_lstm.fit(X_train, y_train, epochs=50, batch_size=16, verbose=1)

In [None]:
# Making predictions using LSTM model
y_pred = model_lstm.predict(X_test)

In [None]:
# Inverse transform predictions for comparison
y_pred_inv = scaler.inverse_transform(y_pred)
y_test_inv = scaler.inverse_transform(y_test)

In [None]:
# Calculating RMSE for LSTM
rmse_lstm = np.sqrt(mean_squared_error(y_test_inv, y_pred_inv))
print(f'RMSE for LSTM: {rmse_lstm}')

In [None]:
# Plotting LSTM predictions
plt.figure(figsize=(12, 6))
plt.plot(resampled_data.index[train_size+seq_length:], y_test_inv, label='True Values')
plt.plot(resampled_data.index[train_size+seq_length:], y_pred_inv, label='LSTM Predictions', color='red')
plt.title('LSTM Predictions')
plt.xlabel('Date')
plt.ylabel('Demand')
plt.legend()
plt.show()

# Supervised Machine Learning

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error

In [None]:
X = merged_data.drop(['Demand %'], axis=1) 
y = merged_data['Demand %'] 

In [None]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Preprocessing
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Regression Models
regressors = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42)
}

In [None]:
for name, model in regressors.items():
    # Trainning the model
    model.fit(X_train_scaled, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test_scaled)
    
    # Evaluate the model
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'{name}: RMSE = {rmse}')

In [None]:
# Classification Models
classifiers = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree Classifier': DecisionTreeClassifier(random_state=42)
}

In [None]:
# Mode of Transportfor classification
y_train_class = (y_train > 0.5).astype(int)
y_test_class = (y_test > 0.5).astype(int)

In [None]:
for name, model in classifiers.items():
    # Train the model
    model.fit(X_train_scaled, y_train_class)
    
    # Predict on the test set
    y_pred_class = model.predict(X_test_scaled)

In [None]:
    # Evaluate the model
    accuracy = accuracy_score(y_test_class, y_pred_class)
    precision = precision_score(y_test_class, y_pred_class)
    recall = recall_score(y_test_class, y_pred_class)
    f1 = f1_score(y_test_class, y_pred_class)

In [None]:
print(f'{name}: Accuracy = {accuracy}, Precision = {precision}, Recall = {recall}, F1-score = {f1}')

# Hyperparameter Tuning and Grid Search Cross-Validation

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Defining features and target variable
X = merged_data.drop(["Demand"], axis=1)
y = merged_data["Demand"]

In [None]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize a RandomForestRegressor
rf_model = RandomForestRegressor(random_state=42)

In [None]:
# Defining hyperparameters grid for grid search
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

In [None]:
# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=5)

In [None]:
# Fit the GridSearchCV on the training data
grid_search.fit(X_train, y_train)

In [None]:
# Getting the best parameters and best estimator from the grid search
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

In [None]:
# Making predictions on the test set using the best estimator
y_pred = best_estimator.predict(X_test)

In [None]:
# Calculate mean squared error as the evaluation metric
mse = mean_squared_error(y_test, y_pred)

In [None]:
print("Best hyperparameters:", best_params)
print("Mean Squared Error:", mse)

In [None]:
# Create a table for model comparison
model_comparison = pd.DataFrame({
    "Model": ["Random Forest"],
    "Mean Squared Error": [mse]
})

# Predictive Modeling

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Define features and target variable
X = merged_data[["Rain", "Temperature(c)", "Soil"]]
y = merged_data["Demand"]

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize a LinearRegression model
lr_model = LinearRegression()

In [None]:
# Fit the model on the training data
lr_model.fit(X_train, y_train)

In [None]:
# Make predictions on the test set
y_pred = lr_model.predict(X_test)

In [None]:
# Print Mean Squared Error
print("Mean Squared Error:", mse)

In [None]:
# Investigate feature impacts
coef_df = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": lr_model.coef_
})

In [None]:
# Visualize feature impacts
plt.figure(figsize=(10, 6))
sns.barplot(x="Coefficient", y="Feature", data=coef_df)
plt.title("Feature Impacts on Demand")
plt.xlabel("Coefficient")
plt.ylabel("Feature")
plt.tight_layout()
plt.show()

In [None]:
# Propose decision-support system
def demand_prediction(rain, temperature, soil):
    features = [[rain, temperature, soil]]
    demand = lr_model.predict(features)
    return demand[0]

In [None]:
# usage of the decision-support system
predicted_demand = demand_prediction(rain=50, temperature=25, soil=200)
print("Predicted Demand:", predicted_demand)