In [1]:
import mlflow
import pandas as pd
import xgboost
import mlflow.sklearn
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [2]:
#read the hourly PJME csv and show which columns it provides
df = pd.read_csv("PJME_hourly.csv")
print(df.columns.tolist())

['Datetime', 'PJME_MW']


In [3]:
#preview the first rows of the frame exactly as it was read from the csv
df.head()

Unnamed: 0,Datetime,PJME_MW
0,2002-12-31 01:00:00,26498.0
1,2002-12-31 02:00:00,25147.0
2,2002-12-31 03:00:00,24574.0
3,2002-12-31 04:00:00,24393.0
4,2002-12-31 05:00:00,24860.0


In [4]:
#the "Datetime" column arrives as strings: parse it to timestamps, promote it
#to the index and order the rows chronologically, all in one chain
df = (
    df.assign(Datetime=pd.to_datetime(df["Datetime"]))
    .set_index("Datetime")
    .sort_index()
)

In [5]:
#treat readings of 19000 or below as outliers: keep only rows whose PJME_MW is
#strictly greater than 19000 (copy so later column assignments hit a real frame)
df = df.loc[df["PJME_MW"].gt(19000)].copy()

In [8]:
#preview the first rows after indexing, sorting and filtering
#NOTE(review): the stored output already lists the calendar and lag columns that
#are only created by create_features/add_lags further down, so this cell was run
#out of order; on a fresh top-to-bottom run it shows only PJME_MW
df.head()

Unnamed: 0_level_0,PJME_MW,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,first_lag,second_lag,third_lag
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2002-01-01 01:00:00,30393.0,1,1,1,1,2002,1,1,1,,,
2002-01-01 02:00:00,29265.0,2,1,1,1,2002,1,1,1,,,
2002-01-01 03:00:00,28357.0,3,1,1,1,2002,1,1,1,,,
2002-01-01 04:00:00,27899.0,4,1,1,1,2002,1,1,1,,,
2002-01-01 05:00:00,28057.0,5,1,1,1,2002,1,1,1,,,


In [9]:
#create features for data
def create_features(df):
    """Return a copy of ``df`` with calendar features derived from its index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is a ``DatetimeIndex``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the added columns ``hour``, ``dayofweek``,
        ``quarter``, ``month``, ``year``, ``dayofyear``, ``dayofmonth`` and
        ``weekofyear`` (ISO week number).
    """
    #work on a copy so the caller's frame stays untouched
    df = df.copy()
    stamps = df.index
    #hour of the day
    df["hour"] = stamps.hour
    #day of the week as numbered by pandas (Monday is 0)
    df["dayofweek"] = stamps.dayofweek
    #calendar quarter
    df["quarter"] = stamps.quarter
    #month number
    df["month"] = stamps.month
    #calendar year
    df["year"] = stamps.year
    #ordinal day within the year
    df["dayofyear"] = stamps.dayofyear
    #day within the month
    df["dayofmonth"] = stamps.day
    #isocalendar() hands back a nullable UInt32 column, unlike every other
    #feature here; store it as a plain numpy dtype so all features share the
    #same kind of dtype (float only if the index holds missing timestamps)
    iso_week = stamps.isocalendar().week
    week_dtype = "float64" if iso_week.isna().any() else "int32"
    df["weekofyear"] = iso_week.astype(week_dtype).to_numpy()
    return df
#add the calendar feature columns to the working frame
df = df.pipe(create_features)

In [10]:
def add_lags(df):
    """Return a copy of ``df`` with three lagged versions of ``PJME_MW``.

    Each lag looks up the ``PJME_MW`` value recorded a fixed number of days
    before the row's own timestamp. The offsets are multiples of 364 days
    (52 whole weeks), so a lagged value falls on the same weekday and hour
    as the row it is attached to.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex`` and a ``PJME_MW`` column. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``first_lag`` (364 days back), ``second_lag``
        (728 days back) and ``third_lag`` (1092 days back). A lag is NaN
        where the frame has no row at the shifted timestamp.
    """
    #copy first so the caller's frame is not changed as a side effect,
    #matching what create_features does
    df = df.copy()
    #timestamp -> PJME_MW lookup table built from the frame itself
    target_map = df["PJME_MW"].to_dict()
    #column name -> how many days back the value is taken from
    lag_days = {"first_lag": 364, "second_lag": 728, "third_lag": 1092}
    for column, days in lag_days.items():
        df[column] = (df.index - pd.Timedelta(days=days)).map(target_map)
    return df
#add the lag columns to the working frame
df = df.pipe(add_lags)

In [11]:
#chronological split: rows stamped before the break point train the model,
#rows at or after it are held out for testing
break_point = "01-01-2015"
train, test = df[df.index < break_point], df[df.index >= break_point]

In [12]:
#preview the first rows of the training frame (rows before the break point)
train.head()

Unnamed: 0_level_0,PJME_MW,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,first_lag,second_lag,third_lag
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2002-01-01 01:00:00,30393.0,1,1,1,1,2002,1,1,1,,,
2002-01-01 02:00:00,29265.0,2,1,1,1,2002,1,1,1,,,
2002-01-01 03:00:00,28357.0,3,1,1,1,2002,1,1,1,,,
2002-01-01 04:00:00,27899.0,4,1,1,1,2002,1,1,1,,,
2002-01-01 05:00:00,28057.0,5,1,1,1,2002,1,1,1,,,


In [18]:
#preview the first rows of the held-out frame (rows at or after the break point)
#NOTE(review): the stored output is not in chronological order even though df is
#sorted by its index in an earlier cell - it looks stale; re-run to refresh it
test.head()

Unnamed: 0_level_0,PJME_MW,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear,first_lag,second_lag,third_lag
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2015-01-01 00:00:00,32802.0,0,3,1,1,2015,1,1,1,30159.0,32866.0,33624.0
2015-12-31 01:00:00,24305.0,1,3,4,12,2015,365,31,53,31647.0,28786.0,31112.0
2015-12-31 02:00:00,23156.0,2,3,4,12,2015,365,31,53,30755.0,28049.0,30207.0
2015-12-31 03:00:00,22514.0,3,3,4,12,2015,365,31,53,30189.0,27785.0,29879.0
2015-12-31 04:00:00,22330.0,4,3,4,12,2015,365,31,53,29890.0,27984.0,29915.0


In [None]:
#define target and features for input to model
#column the model predicts
target = ["PJME_MW"]
#every other column is a model input; walk df.columns instead of going through
#a set so the feature order follows the frame and is the same on every run
#(iteration order of a set of strings can change between interpreter sessions)
features = [column for column in df.columns if column not in target]

In [17]:
#separate model inputs (X) from the target (y) for the train and test frames
X_train, y_train = train[features], train[target]
X_test, y_test = test[features], test[target]

In [18]:
import dagshub
#tracking uri to dagshub: send MLflow runs to the tracking server of the
#DagsHub repository named in the url
mlflow.set_tracking_uri("https://dagshub.com/vsairaamresearch/mlops_code_learning.mlflow")
#initialise the DagsHub client for the same repository with its MLflow
#integration switched on; the stored output shows this step can ask for
#browser-based OAuth authorisation
dagshub.init(repo_owner='vsairaamresearch', repo_name='mlops_code_learning', mlflow=True)
#NOTE(review): the saved output of this cell contains an OAuth authorisation
#link - clear the cell output before sharing or committing the notebook



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=e4f4df76-d115-4897-bdae-5ef56a7921fd&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=a79406b14c438451bdbf3bea5e4e7709b55436f67d2b379197f59f5cbc42938c




gio: https://dagshub.com/login/oauth/authorize?state=e4f4df76-d115-4897-bdae-5ef56a7921fd&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=a79406b14c438451bdbf3bea5e4e7709b55436f67d2b379197f59f5cbc42938c: Operation not supported


In [19]:
#create and set an experiment: make the named experiment the active one for the
#runs that follow (the stored log output shows it is created when it does not
#exist yet) and keep the returned experiment object for inspection
e_50_est = mlflow.set_experiment("XGBOOST n_estimators = 50")

2025/06/06 11:10:02 INFO mlflow.tracking.fluent: Experiment with name 'XGBOOST n_estimators = 50' does not exist. Creating a new experiment.


In [20]:
#show the experiment's id, artifact location, tags and lifecycle stage, one per line
print(
    e_50_est.experiment_id,
    e_50_est.artifact_location,
    e_50_est.tags,
    e_50_est.lifecycle_stage,
    sep="\n",
)

0
mlflow-artifacts:/558a0f7930714f39b144f5473d42992e
{}
active


In [None]:
import os
import logging
import time

#configure logging
#%-style log formats name record fields as %(name)s; the previous %{name}s
#spelling is not a valid %-style format and is rejected by logging.Formatter
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
#log INFO and above as "<timestamp> - <level> - <message>"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
