In [1]:
# importing packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings

In [2]:
# configuring the notebook
pd.set_option("display.max_columns", None)
sns.set_theme(style = "whitegrid")
warnings.filterwarnings("ignore")

In [3]:
# importing the dataset

# content = requests.get("https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip").content
# with zipfile.ZipFile(io.BytesIO(content)) as arc:
#     raw_data = pd.read_csv(arc.open("hour.csv"), header=0, sep=',', parse_dates=['dteday'])

raw_data = pd.read_csv("./bike_sharing_dataset/hour.csv", header = 0, sep = ",", parse_dates = ["dteday"])
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [4]:
# set datetime column as the index of the dataset
raw_data.index = raw_data.apply(lambda row: datetime.datetime.combine(row.dteday.date(), datetime.time(row.hr)), axis = 1)

In [5]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 17379 entries, 2011-01-01 00:00:00 to 2012-12-31 23:00:00
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   instant     17379 non-null  int64         
 1   dteday      17379 non-null  datetime64[ns]
 2   season      17379 non-null  int64         
 3   yr          17379 non-null  int64         
 4   mnth        17379 non-null  int64         
 5   hr          17379 non-null  int64         
 6   holiday     17379 non-null  int64         
 7   weekday     17379 non-null  int64         
 8   workingday  17379 non-null  int64         
 9   weathersit  17379 non-null  int64         
 10  temp        17379 non-null  float64       
 11  atemp       17379 non-null  float64       
 12  hum         17379 non-null  float64       
 13  windspeed   17379 non-null  float64       
 14  casual      17379 non-null  int64         
 15  registered  17379 non-null  int64  

In [6]:
# check for the presence of missing values in the entire dataset
raw_data.isna().sum().sum()

np.int64(0)

In [7]:
raw_data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
2011-01-01 00:00:00,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2011-01-01 01:00:00,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2011-01-01 02:00:00,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
2011-01-01 03:00:00,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
2011-01-01 04:00:00,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


# Data Dictionary
- `instant` = record index
- `dteday` = data
- `season` = 1:winter, 2:spring, 3:summer, 4:fall
- `yr` = year (0: 2011, 1: 2012)
- `mnth` = month (1 to 12)
- `hr` = hour (0 to 23)
- `holiday` = weather day is holiday or not (extracted from http://dchr.dc.gov/page/holiday-schedule)
- `weekday` = day of the week
- `workingday` = if day is neither weekend nor holiday is 1, otherwise is 0
- `weathersit` = - 1: Clear, Few clouds, Partly cloudy, Partly cloudy
- `temp` = Normalized temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-8, t_max=+39 (only in hourly scale)
- `atemp` = Normalized feeling temperature in Celsius. The values are derived via (t-t_min)/(t_max-t_min), t_min=-16, t_max=+50 (only in hourly scale)
- `hum` = Normalized humidity. The values are divided to 100 (max)
- `windspeed` = Normalized wind speed. The values are divided to 67 (max)
- `casual` = count of casual users
- `registered` = count of registered users
- `cnt` = count of total rental bikes including both casual and registered

The cnt column can be a good target variable. The objective is to predict the count of number of bikes rented given a specific data.

In [8]:
# dividing the dataset into 2 parts, "reference" (historic) and "current" (present)
reference = raw_data.loc["2011-01-01 00:00:00": "2011-02-01 00:00:00"]
current = raw_data.loc["2011-02-02 00:00:00": "2011-03-01 00:00:00"]

In [9]:
# identify a few numerical and categorical features and perform statistical tests on them
numerical_features = ["cnt", "temp", "atemp", "windspeed", "yr", "mnth"]
categorical_fetures = ["season", "holiday", "weekday", "workingday"]

In [10]:
# set the p_value to 0.05
p_value = 0.05

# initializing a variable to keep a count of the number of columns who's distribution has changed
num_changed = 0
cat_changed = 0

In [11]:
from scipy import stats

# perform Kolmogorov-Smirnov test on numerical features
for col in numerical_features:
    test = stats.ks_2samp(reference[col], current[col])

    if test[1] <= p_value:
        print(f"The data distribution for {col} has changed.")
        num_changed += 1
print(f"The number of numerical features that have changed = {num_changed}")


# caution: year, if used as a feature, can be tricky to deal with.
# a change in year from, say 2011 to 2012, can cause the p_value to be as low as 0.0

The data distribution for cnt has changed.
The data distribution for temp has changed.
The data distribution for atemp has changed.
The data distribution for windspeed has changed.
The data distribution for mnth has changed.
The number of numerical features that have changed = 5


In [12]:
# perform chi-square test on categorical features
for col in categorical_fetures:
    test = stats.chi2_contingency([reference[col].value_counts(), current[col].value_counts()])

    if test[1] <= p_value:
        print(f"The categories in {col} have changed")
        cat_changed += 1
print(f"The number of categorical columns that have changed = {cat_changed}")

The number of categorical columns that have changed = 0


# Training A Model On Reference (Historical) Dataset

In [13]:
target = "cnt"
prediction = "prediction"
numerical_features = ["cnt", "temp", "atemp", "windspeed", "yr", "mnth"]
categorical_features = ["season", "holiday", "workingday"]

In [14]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(reference[numerical_features + categorical_features], reference[target], test_size = 0.3)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((482, 9), (207, 9), (482,), (207,))

In [15]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(random_state = 0)
model.fit(x_train, y_train)
pred_reference_test = model.predict(x_test)
pred_reference_test

array([  1.  ,  31.93,   5.  ,  54.78, 217.56, 227.77,  15.08,  10.86,
         1.  ,  21.  ,   8.  ,  52.  ,  43.87,  85.92, 137.38,  74.91,
         3.  , 145.36,   3.  ,  48.93,  16.01,  32.97,  77.99,  82.95,
        38.97, 117.32, 157.13,   1.  ,   8.  ,  74.04,  19.96,  94.97,
        89.12,  63.95,   1.  ,  52.  ,   8.  , 160.9 ,   2.  ,  28.02,
         1.  , 144.92,  67.91,  38.97,  43.87,   1.  ,  94.91, 105.77,
         1.  ,  23.01,  41.01,  28.89,   2.  , 156.12,  27.  ,   2.  ,
        43.8 ,   1.  ,  29.96,  52.98,  82.95,  85.8 ,  54.01,  38.  ,
        79.55,  34.99,   6.  ,  99.11,  59.17,  40.14,   7.  ,  63.01,
        28.08,   3.  ,  70.02,  49.47,  28.2 ,  30.97,  53.03, 108.33,
         2.  ,  54.84,  18.94,  27.04, 168.06,   3.  ,  75.97,   5.  ,
        20.07,  94.01,   7.  ,  63.98, 222.17,  41.58,  32.99, 121.53,
        20.01,   1.  ,  64.92,  56.97,  44.86,  61.98, 138.04,   2.  ,
        43.  ,  11.93,   8.  ,  88.09,  59.09,  36.  , 119.79,  43.05,
      

In [16]:
# computing MAE, MSE and R2 score for reference dataset
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

reference_mae = mean_absolute_error(y_test, pred_reference_test)
reference_mse = mean_squared_error(y_test, pred_reference_test)
reference_r2_score = r2_score(y_test, pred_reference_test)

reference_mae, reference_mse, reference_r2_score

(np.float64(0.3199033816425118),
 np.float64(2.4565381642512056),
 0.9990323872950326)

# Predicting On The Current (Present) Dataset

In [17]:
pred_current_test = model.predict(current[numerical_features + categorical_features])
pred_current_test

array([  2.  ,   3.  ,   4.  ,   1.  ,   1.  ,   3.  ,  17.89,  48.95,
       155.3 , 122.28,  60.98,  52.01,  64.  ,  74.96,  62.92,  75.91,
       103.73, 189.17, 180.19,  90.64,  74.85,  62.97,  40.12,  31.76,
        11.94,   5.  ,   2.  ,   1.  ,   2.  ,  38.97,  87.15, 187.75,
       134.31,  52.03,  63.97,  68.85,  50.89,  47.01,  60.03,  77.98,
       175.91, 146.2 ,  95.04, 109.75,  54.01,  41.06,  37.99,  13.05,
         7.  ,   1.  ,   1.  ,   7.  ,  28.23,  87.33, 219.83, 128.5 ,
        50.96,  63.98,  86.04,  81.78,  90.74,  89.79,  99.  , 206.33,
       155.41, 103.66,  70.9 ,  43.07,  45.64,  30.98,  38.98,  17.95,
        17.1 ,  10.85,   8.  ,   9.02,   3.99,   3.99,   9.83,  20.04,
        33.98,  46.97,  52.01,  72.03,  54.83,  59.9 ,  70.9 ,  77.98,
        82.92,  83.97,  68.81,  56.  ,  44.78,  59.01,  38.95,  43.8 ,
        20.07,  12.99,   2.  ,   1.  ,   1.  ,   8.  ,  22.98,  44.74,
        89.11, 117.35, 173.74, 179.48, 161.11, 179.62, 157.23, 121.67,
      

In [18]:
y_current = current[target]
y_current

2011-02-02 00:00:00     2
2011-02-02 01:00:00     3
2011-02-02 02:00:00     4
2011-02-02 03:00:00     1
2011-02-02 04:00:00     1
                       ..
2011-02-28 20:00:00    45
2011-02-28 21:00:00    80
2011-02-28 22:00:00    76
2011-02-28 23:00:00    45
2011-03-01 00:00:00     7
Name: cnt, Length: 627, dtype: int64

In [19]:
# computing MAE, MSE and R2 score for reference dataset
current_mae = mean_absolute_error(y_current, pred_current_test)
current_mse = mean_squared_error(y_current, pred_current_test)
current_r2_score = r2_score(y_current, pred_current_test)

current_mae, current_mse, current_r2_score

(np.float64(1.2288835725677834),
 np.float64(46.99890446570973),
 0.988514104290669)

# Using MLFlow To Track Experiments

In [20]:
import mlflow
import psutil
import random

In [21]:
mlflow.set_experiment("bike_sharing")

<Experiment: artifact_location='file:///Users/vidishsirdesai/Desktop/dev/dsml_end_to_end_reference/mlops/model_monitoring/mlruns/934353440333462081', creation_time=1731982569436, experiment_id='934353440333462081', last_update_time=1731982569436, lifecycle_stage='active', name='bike_sharing', tags={}>

In [22]:
experiment_batches = [
    ("2011-02-02 00:00:00", "2011-02-10 23:00:00"),
    ("2011-02-10 00:00:00", "2011-02-17 23:00:00"),
    ("2011-02-17 00:00:00", "2011-02-25 23:00:00"),
    ("2011-02-25 00:00:00", "2011-03-01 23:00:00")
]

In [23]:
for date in experiment_batches:
    with mlflow.start_run() as run:
        
        mlflow.set_tag("mlflow.runName", "experiment_"+str(date[0])+" : "+str(date[1]))

        # logging the date parameters
        mlflow.log_param("start_date", date[0])
        mlflow.log_param("end_date", date[1])

        # computing the prediction on the date range
        data = current.loc[date[0]: date[1]]
        features = data[numerical_features + categorical_features]
        target = data["cnt"]
        prediction = model.predict(features)

        # computing the model metrics
        mae = mean_absolute_error(target, prediction)
        mse = mean_squared_error(target, prediction)
        r2 = r2_score(target, prediction)

        # logging the model metrics
        mlflow.log_metric('MAE', round(mae, 3))
        mlflow.log_metric('MSE', round(mse, 3))
        mlflow.log_metric('R2', round(r2, 3))

        # computing and logging system metrics
        cpu_usage = psutil.cpu_percent()
        memory_usage = psutil.virtual_memory().percent
        mlflow.log_metric("cpu_usage", cpu_usage)
        mlflow.log_metric("memory_usage", memory_usage)