In [11]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import TimeSeriesSplit

#First specifying the path to the CSV files containing each station's data
trainpath = 'C:/Users/we19383/DataspellProjects/MLP_coursework/morebikes2022/Train/'
modelpath = 'C:/Users/we19383/DataspellProjects/MLP_coursework/morebikes2022/Models/'
testpath = 'C:/Users/we19383/DataspellProjects/MLP_coursework/morebikes2022/test.csv'

phase1_files = sorted([i for i in os.listdir(trainpath) if i.endswith('deploy.csv')])
dfs = []

phase3_files = sorted([i for i in os.listdir(trainpath) if i.endswith('deploy_full.csv')])

for file in phase1_files:
    csv_filepath = os.path.join(trainpath, file)
    df = pd.read_csv(csv_filepath)
    dfs.append(df)

train = pd.concat(dfs, ignore_index=True)

phase2_files = sorted([i for i in os.listdir(modelpath) if i.endswith('.csv')])
models = []

stn_list = []
for file in phase3_files:
    csv_filepath = os.path.join(trainpath, file)
    stn_df = pd.read_csv(csv_filepath)
    stn_list.append(stn_df)
    
stn_dfs = pd.concat(stn_list, ignore_index=True)

for file in phase2_files:
    csv_filepath = os.path.join(modelpath, file)
    model = pd.read_csv(csv_filepath)
    models.append(model)

all_models = pd.concat(models, ignore_index=True)
all_models

test_filepath = os.path.join(testpath)
test = pd.read_csv(test_filepath)

In [12]:
tdf = pd.concat([stn_dfs, train, test])

tdf = tdf.sort_values(by='station')
tdf = tdf.drop('weekday', axis=1)

To combine the two methods, from Phase 1 and Phase 2, it would be worth using the weights from the models trained in Phase 1, concatenated with the weights from Phase 2 for features such as 'full_profile_bikes' and 'bikes_3h_ago' to test if there's any improvement in performance.

In [13]:
from sklearn.model_selection import TimeSeriesSplit

ts_cv = TimeSeriesSplit(n_splits = 10,
                        gap = 48,
                        max_train_size = 10000,
                        test_size=1000
                        )
X_train = train.drop('weekday', axis=1)

In [14]:
X_train = X_train.fillna(X_train.mean())

In [15]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge

# Assuming X_train and test are defined

stations = X_train['station'].unique()
weights_dfs = []

for station in stations:
    # Data for the current station
    station_data = X_train[X_train['station'] == station]

    # Extract feature and target variable
    X_tr = station_data[['bikes_3h_ago', 'full_profile_bikes']]
    y_tr = station_data['bikes'] / station_data['bikes'].max()

    
    X_te = test[test['station'] == station][['station', 'full_profile_bikes']]
    y_te = test[test['station'] == station]['bikes_3h_ago'] / test['bikes_3h_ago'].max()

    # Created and fit a model
    model = LinearRegression()
    model.fit(X_tr, y_tr)

    # Get the coefficients (weights) and intercept
    coefficients = model.coef_
    intercept = model.intercept_

    
    weights_df_1 = pd.DataFrame(columns=['Feature', 'Coefficient'])

    for feature, coef in zip(X_tr.columns, coefficients):
        weights_df_1.loc[len(weights_df_1)] = [feature, coef]

    # Add a row for the intercept
    #weights_df_1.loc[len(weights_df_1)] = ['Intercept', intercept]

    # Append the DataFrame to the list
    weights_dfs.append(weights_df_1)

# Concatenate all DataFrames in the list into a single DataFrame
final_weights_df = pd.concat(weights_dfs, ignore_index=True)

final_weights_df = final_weights_df.pivot(columns='Feature', values='Coefficient')

# Reshape the DataFrame
final_weights_df = pd.concat([final_weights_df['bikes_3h_ago'].dropna().reset_index(drop=True),
                         final_weights_df['full_profile_bikes'].dropna().reset_index(drop=True)],
                        axis=1)
# Display the final DataFrame with weights
final_weights_df['station'] = range(201,276)

final_weights_df

Unnamed: 0,bikes_3h_ago,full_profile_bikes,station
0,0.015953,0.013988,201
1,0.033519,0.018670,202
2,0.008361,0.027301,203
3,0.016177,0.026723,204
4,0.022605,0.022409,205
...,...,...,...
70,0.017239,0.032161,271
71,0.052242,0.006572,272
72,0.054920,0.004055,273
73,0.052284,0.009469,274


In [16]:
train_file_path = 'C:/Users/we19383/DataspellProjects/MLP_coursework/morebikes2022/Models/'
train_data = os.path.join(train_file_path)
model_type = ['full', 'full_temp']

result_list = []

# Iterate through each station
for i in range(1, 201):
    # Create an empty DataFrame to store the weights for the current station
    station_dict = {'station': i}

    # Iterate through each model for the current station
    for j in range(len(model_type)):  # Assuming models are named model1.csv, model2.csv, ..., model6.csv
        model_filename = os.path.join(train_file_path, f"model_station_{i}_rlm_{model_type[j]}.csv")
        #print(model_filename)
        # Check if the file exists
        if os.path.isfile(model_filename):
            # Load the model weights into a DataFrame
            model_weights = pd.read_csv(model_filename)
            #print(model_weights)

            bikes_3h_ago = model_weights['weight'][1]
            full_profile_bikes = model_weights['weight'][2]

            avg_bikes_3h_ago = np.mean(bikes_3h_ago)
            avg_full_profile_bikes = np.mean(full_profile_bikes)

            station_dict[f"bikes_3h_ago"] = avg_bikes_3h_ago
            station_dict[f"full_profile_bikes"] = avg_full_profile_bikes
         
    result_list.append(station_dict)

weight_df = pd.DataFrame(result_list)


weight_df

Unnamed: 0,station,bikes_3h_ago,full_profile_bikes
0,1,0.518931,3.438250e-01
1,2,0.626153,2.959324e-01
2,3,0.595897,2.415987e-01
3,4,0.587546,3.286884e-01
4,5,0.331868,6.738652e-01
...,...,...,...
195,196,0.999998,9.656896e-07
196,197,0.894453,2.638717e-02
197,198,0.903765,6.661564e-02
198,199,0.902956,3.322967e-04


In [17]:
# Concatenate DataFrames along rows
concatenated_df = pd.concat([weight_df, final_weights_df], ignore_index=True)
print(concatenated_df)

     station  bikes_3h_ago  full_profile_bikes
0          1      0.518931            0.343825
1          2      0.626153            0.295932
2          3      0.595897            0.241599
3          4      0.587546            0.328688
4          5      0.331868            0.673865
..       ...           ...                 ...
270      271      0.017239            0.032161
271      272      0.052242            0.006572
272      273      0.054920            0.004055
273      274      0.052284            0.009469
274      275      0.046208            0.012073

[275 rows x 3 columns]


In [18]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

lin_reg = LinearRegression()
bayes_rid = BayesianRidge()
hgb = HistGradientBoostingRegressor()

def station_models_v3(model):
    """This function will iterate over each unique station ID and apply a selected model to each station. It will then store the MAE and RMSE for each station, in a dictionary."""


    station_models = {} #this dictionary will store individual models for each station
    station_maes = []
    station_rmses = []
    # Extract training data for stations 1-200
    train_df = concatenated_df[concatenated_df['station'] <= 200]
    X_train = train_df[['station', 'full_profile_bikes']]
    y_train = train_df['bikes_3h_ago']/train_df['bikes_3h_ago'].max()

    # Train the model on stations 1-200
    model.fit(X_train, y_train)

    for station in range(201, 276):
        # Extract test data for the current station
        test_data_station = concatenated_df[concatenated_df['station'] == station]


        X_test = test_data_station[['station', 'full_profile_bikes']]
        y_test = test_data_station['bikes_3h_ago'] / test_data_station['bikes_3h_ago'].max()

        # Make predictions on the test data for the current station
        predictions = model.predict(X_test)

        # Evaluate the model
        mae = mean_absolute_error(y_test, predictions)
        rmse = (mean_squared_error(y_test, predictions))
        station_maes.append(mae)
        station_rmses.append(rmse)
        #print(f"This is the mean absolute error for station {station}: {mae:.3f}")

    # Calculate and print the average MAE
    average_mae = np.mean(station_maes)
    average_rmse = np.mean(station_rmses)
    print(f"\nAverage MAE across all stations between 201-275 for {model}: {average_mae:.3f}+/- {average_mae.std():.3f}\n")
    print(f"\nAverage RMSE across all stations between 201-275 for {model}: {average_rmse:.3f}+/- {average_rmse.std():.3f}\n")

station_models_v3(lin_reg)
station_models_v3(bayes_rid)
station_models_v3(hgb)


Average MAE across all stations between 201-275 for LinearRegression(): 0.061+/- 0.000


Average RMSE across all stations between 201-275 for LinearRegression(): 0.004+/- 0.000

Average MAE across all stations between 201-275 for BayesianRidge(): 0.061+/- 0.000


Average RMSE across all stations between 201-275 for BayesianRidge(): 0.004+/- 0.000

Average MAE across all stations between 201-275 for HistGradientBoostingRegressor(): 0.092+/- 0.000


Average RMSE across all stations between 201-275 for HistGradientBoostingRegressor(): 0.008+/- 0.000


In [19]:
from sklearn.model_selection import cross_validate

tdf = tdf.fillna(tdf.mean())
X = tdf.drop('bikes', axis=1)
y = tdf['bikes']/tdf['bikes'].max()

def all_stations_v3(model, X, y, cv):
    """This function evaluates the mean absolute error and root-mean-square error for the bik rental data set, based on the cross validation split and model given"""
    cv_results = cross_validate(
        model,
        X,
        y,
        cv=cv,
        scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"],
    )
    mae = -cv_results["test_neg_mean_absolute_error"]
    rmse = -cv_results["test_neg_root_mean_squared_error"]
    print(
        f"Mean Absolute Error for {model}:     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        f"Root Mean Squared Error for {model}: {rmse.mean():.3f} +/- {rmse.std():.3f}"
    )



all_stations_v3(lin_reg,X, y, cv=ts_cv)
all_stations_v3(bayes_rid,X, y, cv=ts_cv)
all_stations_v3(hgb, X, y, cv=ts_cv)

Mean Absolute Error for LinearRegression():     0.051 +/- 0.008
Root Mean Squared Error for LinearRegression(): 0.068 +/- 0.013
Mean Absolute Error for BayesianRidge():     0.050 +/- 0.008
Root Mean Squared Error for BayesianRidge(): 0.067 +/- 0.013
Mean Absolute Error for HistGradientBoostingRegressor():     0.046 +/- 0.009
Root Mean Squared Error for HistGradientBoostingRegressor(): 0.064 +/- 0.016


In [22]:

ts_cv = TimeSeriesSplit(n_splits = 3,
                        gap = 48,
                        max_train_size = 10000,
                        test_size=10
                        )

def station_models_v4(model):
    """This function will iterate over each unique station ID and apply a selected model to each station. It will then store the MAE and RMSE for each station, in a dictionary."""

    station_models = {} #this dictionary will store individual models for each station

    station_maes = []
    station_rmses = []

    stations = tdf['station']
    for station in test['station'].unique():
        #data for the current station
        station_data = tdf[tdf['station'] == station]


        #Extract feature and target variable
        feat = station_data[['full_profile_bikes', 'latitude']]
        targ = station_data['bikes_3h_ago']/station_data['bikes_3h_ago'].max()

        cv_scores = cross_validate(model, feat, targ, cv=ts_cv, scoring=["neg_mean_absolute_error", "neg_root_mean_squared_error"])

        # Convert scores to positive values and calculate the mean
        mae = -cv_scores["test_neg_mean_absolute_error"]
        rmse = -cv_scores["test_neg_root_mean_squared_error"]

        # print(
        #     f"Average Mean Absolute Error for station {station}:     {mae.mean():.3f} +/- {mae.std():.3f}\n"
        #     f"Average Root Mean Squared Error for station {station}: {rmse.mean():.3f} +/- {rmse.std():.3f}"
        # )
        station_maes.append(mae)
        station_rmses.append(rmse)
    # Calculate and print the average MAE
    average_mae = np.mean(station_maes)
    average_rmse = np.mean(station_rmses)
    print(f"\nAverage MAE across all stations between 201-275 for {model}: {average_mae:.3f}+/- {average_mae.std():.3f}\n")
    print(f"\nAverage RMSE across all stations between 201-275 for {model}: {average_rmse:.3f}+/- {average_rmse.std():.3f}\n")

station_models_v4(lin_reg)
station_models_v4(bayes_rid)
#station_models_v4(hgb)


Average MAE across all stations between 201-275 for LinearRegression(): 0.214+/- 0.000


Average RMSE across all stations between 201-275 for LinearRegression(): 0.241+/- 0.000

Average MAE across all stations between 201-275 for BayesianRidge(): 0.215+/- 0.000


Average RMSE across all stations between 201-275 for BayesianRidge(): 0.241+/- 0.000
