In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
test_file_path = 'C:/Users/we19383/DataspellProjects/MLP_coursework/morebikes2022/test.csv'
test_data_path = os.path.join(test_file_path)
test_data = pd.read_csv(test_data_path)

In [3]:
test_data['station']

0       201
1       201
2       201
3       201
4       201
       ... 
2245    275
2246    275
2247    275
2248    275
2249    275
Name: station, Length: 2250, dtype: int64

In [4]:
train_file_path = 'C:/Users/we19383/DataspellProjects/MLP_coursework/morebikes2022/Models/'
train_data = os.path.join(train_file_path)
model_type = ['full', 'full_temp']


In [5]:
result_list = []

# Iterate through each station
for i in range(1,201):
    # Create an empty DataFrame to store the weights for the current station
    station_dict = {'station': i}

    # Iterate through each model for the current station
    for j in range(len(model_type)):  # Assuming models are named model1.csv, model2.csv, ..., model6.csv
        model_filename = os.path.join(train_file_path, f"model_station_{i}_rlm_{model_type[j]}.csv")
        #print(model_filename)
        # Check if the file exists
        if os.path.isfile(model_filename):
            # Load the model weights into a DataFrame
            model_weights = pd.read_csv(model_filename)
            
            bikes_3h_ago = model_weights['weight'][1]
            full_profile_bikes = model_weights['weight'][2]

            avg_bikes_3h_ago = np.mean(bikes_3h_ago)
            avg_full_profile_bikes = np.mean(full_profile_bikes)

            station_dict[f"bikes_3h_ago"] = avg_bikes_3h_ago
            station_dict[f"full_profile_bikes"] = avg_full_profile_bikes
           
    result_list.append(station_dict)

weight_df = pd.DataFrame(result_list)

In [6]:
weight_df

Unnamed: 0,station,bikes_3h_ago,full_profile_bikes
0,1,0.518931,3.438250e-01
1,2,0.626153,2.959324e-01
2,3,0.595897,2.415987e-01
3,4,0.587546,3.286884e-01
4,5,0.331868,6.738652e-01
...,...,...,...
195,196,0.999998,9.656896e-07
196,197,0.894453,2.638717e-02
197,198,0.903765,6.661564e-02
198,199,0.902956,3.322967e-04


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error

lin_reg = LinearRegression()
bayes_rid = BayesianRidge()
rid = Ridge()
hbg = HistGradientBoostingRegressor()

In [8]:
def station_models_v2(model, test_data):
    """This function will train a selected model on stations 1-200 and calculate the MAE for each station between 201-275."""

    #station_models = {}  # This dictionary will store individual models for each station
    station_maes = []
    station_rmses = []
    # Extract training data for stations 1-200
    train_df = weight_df[weight_df['station'] <= 200]
    X_train = train_df[['station', 'full_profile_bikes']]
    y_train = train_df['bikes_3h_ago']/train_df['bikes_3h_ago'].max()

    # Train the model on stations 1-200
    model.fit(X_train, y_train)

    for station in range(201, 276):
        # Extract test data for the current station
        test_data_station = test_data[test_data['station'] == station]


        X_test = test_data_station[['station', 'full_profile_bikes']]
        y_test = test_data_station['bikes_3h_ago'] / test_data_station['bikes_3h_ago'].max()

        # Make predictions on the test data for the current station
        predictions = model.predict(X_test)

        # Evaluate the model
        mae = mean_absolute_error(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions, squared=False))
        station_maes.append(mae)
        station_rmses.append(rmse)
        #print(f"This is the mean absolute error for station {station}: {mae:.3f}")

    # Calculate and print the average MAE
    average_mae = np.mean(station_maes)
    average_rmse = np.mean(station_rmses)
    print(f"\nAverage MAE across all stations between 201-275 for {model}: {average_mae:.3f}+/- {average_mae.std():.3f}\n")
    print(f"\nAverage RMSE across all stations between 201-275 for {model}: {average_rmse:.3f}+/- {average_rmse.std():.3f}\n")

from sklearn.metrics import mean_absolute_error

station_models_v2(lin_reg, test_data)
station_models_v2(rid, test_data)
station_models_v2(bayes_rid, test_data)
station_models_v2(hbg, test_data)



Average MAE across all stations between 201-275 for LinearRegression(): 4.970+/- 0.000


Average RMSE across all stations between 201-275 for LinearRegression(): 2.295+/- 0.000


Average MAE across all stations between 201-275 for Ridge(): 3.643+/- 0.000


Average RMSE across all stations between 201-275 for Ridge(): 1.967+/- 0.000

Average MAE across all stations between 201-275 for BayesianRidge(): 4.955+/- 0.000


Average RMSE across all stations between 201-275 for BayesianRidge(): 2.292+/- 0.000

Average MAE across all stations between 201-275 for HistGradientBoostingRegressor(): 0.297+/- 0.000


Average RMSE across all stations between 201-275 for HistGradientBoostingRegressor(): 0.581+/- 0.000


In [9]:
filepath = 'C:/Users/we19383/DataspellProjects/MLP_coursework/morebikes2022/Models/'

phase2_files = sorted([i for i in os.listdir(filepath) if i.endswith('.csv')])
models = []

for file in phase2_files:
    csv_filepath = os.path.join(filepath, file)
    model = pd.read_csv(csv_filepath)
    models.append(model)

all_models = pd.concat(models, ignore_index=True)
all_models

Unnamed: 0,feature,weight
0,(Intercept),0.273869
1,bikes_3h_ago,0.776697
2,full_profile_bikes,0.157565
3,full_profile_3h_diff_bikes,0.562749
4,(Intercept),0.124532
...,...,...
6195,(Intercept),0.062848
6196,bikes_3h_ago,0.487413
6197,short_profile_bikes,0.431775
6198,short_profile_3h_diff_bikes,0.423223


In [10]:
pivot_df = all_models.pivot(columns='feature', values='weight')
pivot_df = pivot_df.fillna(pivot_df.median())
print(pivot_df)

feature  (Intercept)  bikes_3h_ago  full_profile_3h_diff_bikes  \
0           0.273869      0.759984                    0.458880   
1           0.349830      0.776697                    0.458880   
2           0.349830      0.759984                    0.458880   
3           0.349830      0.759984                    0.562749   
4           0.124532      0.759984                    0.458880   
...              ...           ...                         ...   
6195        0.062848      0.759984                    0.458880   
6196        0.349830      0.487413                    0.458880   
6197        0.349830      0.759984                    0.458880   
6198        0.349830      0.759984                    0.458880   
6199        0.349830      0.759984                    0.458880   

feature  full_profile_bikes  short_profile_3h_diff_bikes  short_profile_bikes  \
0                  0.174402                     0.325995             0.083140   
1                  0.174402                  

In [11]:
from sklearn.model_selection import TimeSeriesSplit

ts_cv = TimeSeriesSplit(n_splits=5,
                        gap=48,
                        max_train_size=10000,
                        test_size=500
                        )

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate


def all_stations_v2(model, cv):
    """This function evaluates the mean absolute error and root-mean-square error for the bike rental data set, based on the cross validation split and model given"""
    stations = pivot_df['(Intercept)']
    for station in stations:
        #data for the current station
        station_data = pivot_df[pivot_df['(Intercept)'] == station]

        #Extract feature and target variable
        X = station_data[['full_profile_bikes', 'short_profile_bikes']]
        y = station_data['bikes_3h_ago'] / station_data['bikes_3h_ago'].max()

    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    print(
        f"Mean Absolute Error for {model}:     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Root Mean Squared Error for {model}: {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )
    

all_stations_v2(lin_reg, cv=ts_cv)
all_stations_v2(bayes_rid, ts_cv)
all_stations_v2(hbg, ts_cv)

Mean Absolute Error for LinearRegression():     0.031 +/- 0.006
Root Mean Squared Error for LinearRegression(): 0.072 +/- 0.013
Mean Absolute Error for BayesianRidge():     0.031 +/- 0.006
Root Mean Squared Error for BayesianRidge(): 0.072 +/- 0.013
Mean Absolute Error for HistGradientBoostingRegressor():     0.030 +/- 0.006
Root Mean Squared Error for HistGradientBoostingRegressor(): 0.072 +/- 0.013
