# Master pipeline notebook

This notebook is mainly meant to run the experimentation. 

The experimentation is divided into separate parts that follow the machine learning process.

The process involves: 
1. Data gathering and initial transformation
2. Data analysis
3. Feature engineering
4. Modelling 
5. Evaluation

In [1]:
import numpy as np
import pandas as pd
from data import gatherData
import plotly.graph_objects as go
from pre_process_data import processData
from feature import engineerFeaturesForTraining, splitData
from transform import transformData
from utils import load_config_file
from train import train
import torch
from LSTM_model import LSTM
from inference import makeInference

In [2]:
# Read the experiment settings from the YAML config file.
config = load_config_file("config.yml")

# Prefer the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cpu'

## 1. Data gathering and transformation

This stage gathers the data, combines the sources where necessary, and performs the initial data wrangling so that the data is ready for
data analysis.

In [3]:
# Build the project data loader, configured from the notebook-level `config`.
gather_data_obj = gatherData(config=config)

In [4]:
# load_data() returns seven objects, unpacked here in the order it returns them.
# NOTE(review): only lap_times_df, pit_stops_df, races_df and results_df are used
# in the cells visible in this notebook — confirm whether the others are needed.
(
    lap_times_df,
    pit_stops_df,
    qualifying_df,
    races_df,
    results_df,
    sprint_results_df,
    status_df,
) = gather_data_obj.load_data()

In [5]:
# Pre-processing helper, configured from the notebook-level `config`.
pre_process_obj = processData(config=config)

# Build the master lap-time table from the lap-time, race and result tables.
master_lap_times_df = pre_process_obj.create_initial_dataset(
    lap_times_df=lap_times_df, races_df=races_df, results_df=results_df
)
# Display the resulting frame.
master_lap_times_df

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId
0,841,20,1,1,98109,2011,1,1,Australian Grand Prix,2011-03-27,1,1
1,841,20,2,1,93006,2011,1,1,Australian Grand Prix,2011-03-27,1,1
2,841,20,3,1,92713,2011,1,1,Australian Grand Prix,2011-03-27,1,1
3,841,20,4,1,92803,2011,1,1,Australian Grand Prix,2011-03-27,1,1
4,841,20,5,1,92342,2011,1,1,Australian Grand Prix,2011-03-27,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
551737,1110,817,40,17,114361,2023,12,13,Belgian Grand Prix,2023-07-30,3,1
551738,1110,817,41,17,113367,2023,12,13,Belgian Grand Prix,2023-07-30,3,1
551739,1110,817,42,16,115247,2023,12,13,Belgian Grand Prix,2023-07-30,3,1
551740,1110,817,43,16,112115,2023,12,13,Belgian Grand Prix,2023-07-30,3,1


In [6]:
# Check for NaNs in the data: one boolean per column, True if that column
# contains at least one missing value.
master_lap_times_df.isna().any()

raceId          False
driverId        False
lap             False
position        False
milliseconds    False
year            False
round           False
circuitId       False
name            False
date            False
number          False
statusId        False
dtype: bool

In [7]:
# Driver / race pair inspected in the lap-time plot of the next cell.
driver_id, race_id = 20, 841

# Keep only the laps of the selected driver in the selected race.
lap_times_selected_driver_race = master_lap_times_df.query(
    "raceId == @race_id and driverId == @driver_id"
)

In [8]:
## Lap times and pit stops

# Single line+marker trace: lap time (milliseconds column) against lap number
# for the driver/race selected in the previous cell.
fig = go.Figure(
    data=[
        go.Scatter(
            x=lap_times_selected_driver_race["lap"],
            y=lap_times_selected_driver_race["milliseconds"],
            mode="lines+markers",
            name="Lap times in milliseconds",
        )
    ]
)

# update_layout returns the figure, so ending the cell with it renders the plot.
fig.update_layout(
    title="Laptimes by each lap",
    xaxis_title="Lap number",
    yaxis_title="Total lap time",
)

- There are two outliers in the data.
- These are mainly caused by the pit stops that happen during the race.

In [9]:
# Combine the master lap-time table with the pit-stop table. Per the displayed
# output, the result carries two extra columns: pitStopMilliseconds and isPitStop.
master_laptimes_pitstop_df = pre_process_obj.add_pitstop_data(
    master_laptime_data=master_lap_times_df, pit_stop_data=pit_stops_df
)
# Display the resulting frame.
master_laptimes_pitstop_df

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
0,841,20,1,1,98109,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
1,841,20,2,1,93006,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
2,841,20,3,1,92713,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
3,841,20,4,1,92803,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
4,841,20,5,1,92342,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551737,1110,817,40,17,114361,2023,12,13,Belgian Grand Prix,2023-07-30,3,1,0.0,False
551738,1110,817,41,17,113367,2023,12,13,Belgian Grand Prix,2023-07-30,3,1,0.0,False
551739,1110,817,42,16,115247,2023,12,13,Belgian Grand Prix,2023-07-30,3,1,0.0,False
551740,1110,817,43,16,112115,2023,12,13,Belgian Grand Prix,2023-07-30,3,1,0.0,False


In [10]:
# Driver / circuit pair analysed in the rest of the notebook.
driver_id = 20
circuit_id = 1

# Keep only the laps of the selected driver on the selected circuit (all races).
# .copy() makes this an independent frame: a later cell overwrites lap times in
# it with .loc, which must not write into (or warn about) a slice of
# master_laptimes_pitstop_df.
lap_times_selected_driver_circuit = master_laptimes_pitstop_df[
    (master_laptimes_pitstop_df["circuitId"] == circuit_id)
    & (master_laptimes_pitstop_df["driverId"] == driver_id)
].copy()

In [11]:
# Display the driver/circuit selection built in the previous cell.
lap_times_selected_driver_circuit

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
0,841,20,1,1,98109,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
1,841,20,2,1,93006,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
2,841,20,3,1,92713,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
3,841,20,4,1,92803,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
4,841,20,5,1,92342,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517395,1076,20,18,15,85367,2022,3,1,Australian Grand Prix,2022-04-10,5,3,0.0,False
517396,1076,20,19,15,85503,2022,3,1,Australian Grand Prix,2022-04-10,5,3,0.0,False
517397,1076,20,20,15,85398,2022,3,1,Australian Grand Prix,2022-04-10,5,3,0.0,False
517398,1076,20,21,13,85505,2022,3,1,Australian Grand Prix,2022-04-10,5,3,0.0,False


In [12]:
# Distinct raceId values in the selection, in order of first appearance; the
# plotting cells below iterate over this list to draw one trace per race.
list_race_id = lap_times_selected_driver_circuit["raceId"].unique().tolist()

In [13]:
## Lap times and pit stops

fig = go.Figure()

# One line+marker trace per race: lap time against lap number.
for race_id in list_race_id:

    lap_times_race = lap_times_selected_driver_circuit[
        lap_times_selected_driver_circuit["raceId"] == race_id
    ]

    fig.add_trace(
        go.Scatter(
            x=lap_times_race["lap"],
            y=lap_times_race["milliseconds"],
            mode="lines+markers",
            name=f"race id: {race_id}",
        )
    )

# The layout does not depend on the race, so it is set once after the loop
# instead of being re-applied on every iteration.
fig.update_layout(
    title="Laptimes by each lap",
    xaxis_title="Lap number",
    yaxis_title="Total lap time",
)
fig.show()

In [14]:
# Box plot of every lap time (milliseconds column) in the driver/circuit
# selection, to make outlying laps visible.
fig = go.Figure(
    data=[
        go.Box(
            y=lap_times_selected_driver_circuit["milliseconds"],
            name="Lap times for Aus GP",
        )
    ]
)

# Title and y-axis label.
fig.update_layout(title="Box Plot with plotly.graph_objects", yaxis_title="Values")

# Render the figure.
fig.show()

In [15]:
# Inspect the selected driver's laps in raceId 948.
lap_times_selected_driver_circuit.query("raceId == 948")

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
111775,948,20,1,1,96916,2016,1,1,Australian Grand Prix,2016-03-20,5,1,0.0,False
111776,948,20,2,1,91664,2016,1,1,Australian Grand Prix,2016-03-20,5,1,0.0,False
111777,948,20,3,1,92167,2016,1,1,Australian Grand Prix,2016-03-20,5,1,0.0,False
111778,948,20,4,1,92014,2016,1,1,Australian Grand Prix,2016-03-20,5,1,0.0,False
111779,948,20,5,1,92273,2016,1,1,Australian Grand Prix,2016-03-20,5,1,0.0,False
111780,948,20,6,1,92204,2016,1,1,Australian Grand Prix,2016-03-20,5,1,0.0,False
111781,948,20,7,1,92080,2016,1,1,Australian Grand Prix,2016-03-20,5,1,0.0,False
111782,948,20,8,1,92289,2016,1,1,Australian Grand Prix,2016-03-20,5,1,0.0,False
111783,948,20,9,1,92578,2016,1,1,Australian Grand Prix,2016-03-20,5,1,0.0,False
111784,948,20,10,1,92571,2016,1,1,Australian Grand Prix,2016-03-20,5,1,0.0,False


In [16]:
# Inspect the selected driver's laps in raceId 1.
lap_times_selected_driver_circuit.query("raceId == 1")

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
343523,1,20,1,2,99647,2009,1,1,Australian Grand Prix,2009-03-29,15,4,0.0,False
343524,1,20,2,2,91173,2009,1,1,Australian Grand Prix,2009-03-29,15,4,0.0,False
343525,1,20,3,2,89752,2009,1,1,Australian Grand Prix,2009-03-29,15,4,0.0,False
343526,1,20,4,2,88999,2009,1,1,Australian Grand Prix,2009-03-29,15,4,0.0,False
343527,1,20,5,2,88849,2009,1,1,Australian Grand Prix,2009-03-29,15,4,0.0,False
343528,1,20,6,2,88424,2009,1,1,Australian Grand Prix,2009-03-29,15,4,0.0,False
343529,1,20,7,2,88235,2009,1,1,Australian Grand Prix,2009-03-29,15,4,0.0,False
343530,1,20,8,2,88140,2009,1,1,Australian Grand Prix,2009-03-29,15,4,0.0,False
343531,1,20,9,2,88331,2009,1,1,Australian Grand Prix,2009-03-29,15,4,0.0,False
343532,1,20,10,2,88970,2009,1,1,Australian Grand Prix,2009-03-29,15,4,0.0,False


In [17]:
# Lap 18 of raceId 948 for every driver (not only the selected one), to see
# whether the slow lap is specific to one driver or shared across the field.
master_laptimes_pitstop_df.query("raceId == 948 and lap == 18")

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
111792,948,20,18,1,201551,2016,1,1,Australian Grand Prix,2016-03-20,5,1,1089312.0,True
111849,948,8,18,3,188276,2016,1,1,Australian Grand Prix,2016-03-20,7,5,1089365.0,True
111870,948,3,18,2,199132,2016,1,1,Australian Grand Prix,2016-03-20,6,1,1089303.0,True
111927,948,830,18,5,189398,2016,1,1,Australian Grand Prix,2016-03-20,33,1,1089372.0,True
111984,948,13,18,8,1243269,2016,1,1,Australian Grand Prix,2016-03-20,19,1,1089805.0,True
112041,948,1,18,7,1243047,2016,1,1,Australian Grand Prix,2016-03-20,44,1,1090087.0,True
112098,948,832,18,6,1242495,2016,1,1,Australian Grand Prix,2016-03-20,55,1,1089971.0,True
112155,948,807,18,10,1229799,2016,1,1,Australian Grand Prix,2016-03-20,27,1,1089155.0,True
112212,948,817,18,4,190570,2016,1,1,Australian Grand Prix,2016-03-20,3,1,1089822.0,True
112285,948,815,18,13,1221249,2016,1,1,Australian Grand Prix,2016-03-20,11,1,1087311.0,True


In [18]:
# All laps of driverId 825 in raceId 948.
master_laptimes_pitstop_df.query("raceId == 948 and driverId == 825")

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
112735,948,825,1,21,180423,2016,1,1,Australian Grand Prix,2016-03-20,20,1,27831.0,True
112736,948,825,2,21,106207,2016,1,1,Australian Grand Prix,2016-03-20,20,1,0.0,False
112737,948,825,3,21,97229,2016,1,1,Australian Grand Prix,2016-03-20,20,1,0.0,False
112738,948,825,4,21,94720,2016,1,1,Australian Grand Prix,2016-03-20,20,1,0.0,False
112739,948,825,5,21,95346,2016,1,1,Australian Grand Prix,2016-03-20,20,1,0.0,False
112740,948,825,6,21,103453,2016,1,1,Australian Grand Prix,2016-03-20,20,1,0.0,False
112741,948,825,7,21,94917,2016,1,1,Australian Grand Prix,2016-03-20,20,1,0.0,False
112742,948,825,8,21,95742,2016,1,1,Australian Grand Prix,2016-03-20,20,1,0.0,False
112743,948,825,9,21,95633,2016,1,1,Australian Grand Prix,2016-03-20,20,1,0.0,False
112744,948,825,10,21,97304,2016,1,1,Australian Grand Prix,2016-03-20,20,1,0.0,False


In [19]:
# Inspect the selected driver's laps in raceId 900.
lap_times_selected_driver_circuit.query("raceId == 900")

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
72699,900,20,1,15,120977,2014,1,1,Australian Grand Prix,2014-03-16,1,5,0.0,False
72700,900,20,2,16,109947,2014,1,1,Australian Grand Prix,2014-03-16,1,5,0.0,False
72701,900,20,3,16,111460,2014,1,1,Australian Grand Prix,2014-03-16,1,5,0.0,False


In [20]:
# The selected driver's lap 18 in raceId 948.
lap_times_selected_driver_circuit.query("raceId == 948 and lap == 18")

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
111792,948,20,18,1,201551,2016,1,1,Australian Grand Prix,2016-03-20,5,1,1089312.0,True


In [21]:
## work on the outlier
# Laps 18 and 19 of raceId 948 are replaced by the mean of the lap 17 and
# lap 20 times of the same race.

# Work on an explicit copy so the assignment below never writes into a slice of
# the master frame (avoids SettingWithCopyWarning and an ambiguous write target).
lap_times_selected_driver_circuit = lap_times_selected_driver_circuit.copy()

# The replacement value is a float; cast the column up front instead of relying
# on pandas silently upcasting an integer column on assignment.
lap_times_selected_driver_circuit["milliseconds"] = lap_times_selected_driver_circuit[
    "milliseconds"
].astype("float64")

# Build the row selectors once and reuse them.
in_outlier_race = lap_times_selected_driver_circuit["raceId"] == 948
lap_number = lap_times_selected_driver_circuit["lap"]

lap17_time = lap_times_selected_driver_circuit.loc[
    in_outlier_race & (lap_number == 17), "milliseconds"
].values[0]

lap20_time = lap_times_selected_driver_circuit.loc[
    in_outlier_race & (lap_number == 20), "milliseconds"
].values[0]

# Both outlier laps receive the same value: the average of their neighbours.
lap_times_selected_driver_circuit.loc[
    in_outlier_race & lap_number.isin([18, 19]), "milliseconds"
] = (lap17_time + lap20_time) / 2

In [22]:
fig = go.Figure()

# One line+marker trace per race: lap time against lap number, drawn again
# after the outlier laps were overwritten.
for race_id in list_race_id:

    lap_times_race = lap_times_selected_driver_circuit[
        lap_times_selected_driver_circuit["raceId"] == race_id
    ]

    fig.add_trace(
        go.Scatter(
            x=lap_times_race["lap"],
            y=lap_times_race["milliseconds"],
            mode="lines+markers",
            name=f"race id: {race_id}",
        )
    )

# The layout does not depend on the race, so it is set once after the loop
# instead of being re-applied on every iteration.
fig.update_layout(
    title="Laptimes by each lap",
    xaxis_title="Lap number",
    yaxis_title="Total lap time",
)
fig.show()

In [23]:
# Box plot of every lap time (milliseconds column) in the driver/circuit
# selection, drawn again after the outlier laps were overwritten.
fig = go.Figure(
    data=[
        go.Box(
            y=lap_times_selected_driver_circuit["milliseconds"],
            name="Lap times for Aus GP",
        )
    ]
)

# Title and y-axis label.
fig.update_layout(title="Box Plot with plotly.graph_objects", yaxis_title="Values")

# Render the figure.
fig.show()

In [24]:
# Display the selection after the outlier fix; in the output the milliseconds
# column is now shown as float.
lap_times_selected_driver_circuit

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
0,841,20,1,1,98109.0,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
1,841,20,2,1,93006.0,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
2,841,20,3,1,92713.0,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
3,841,20,4,1,92803.0,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
4,841,20,5,1,92342.0,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517395,1076,20,18,15,85367.0,2022,3,1,Australian Grand Prix,2022-04-10,5,3,0.0,False
517396,1076,20,19,15,85503.0,2022,3,1,Australian Grand Prix,2022-04-10,5,3,0.0,False
517397,1076,20,20,15,85398.0,2022,3,1,Australian Grand Prix,2022-04-10,5,3,0.0,False
517398,1076,20,21,13,85505.0,2022,3,1,Australian Grand Prix,2022-04-10,5,3,0.0,False


# Feature engineering

In [25]:
# Feature-engineering helper, configured from the notebook-level `config`.
engineer_data_obj = engineerFeaturesForTraining(config=config)


# engineer_data returns three objects: the engineered frame plus an encoder and
# its column list. Per the displayed output, the engineered frame includes
# prior-lap columns, month/day, and one-hot isPitStop_* / statusId_* columns.
engineered_lap_times_df, encoder, encoder_columns = engineer_data_obj.engineer_data(
    lap_times_data=lap_times_selected_driver_circuit
)

In [26]:
# Display the engineered feature frame.
engineered_lap_times_df

Unnamed: 0,raceId,lap,position,milliseconds,year,milliseconds_1_prior,lap_number_1_prior,position_1_prior_lap,month,day,isPitStop_False,isPitStop_True,statusId_1,statusId_3,statusId_4,statusId_36
1,1,2,2,91173.0,2009,99647.0,1.0,2.0,3,29,1.0,0.0,0.0,0.0,1.0,0.0
2,1,3,2,89752.0,2009,91173.0,2.0,2.0,3,29,1.0,0.0,0.0,0.0,1.0,0.0
3,1,4,2,88999.0,2009,89752.0,3.0,2.0,3,29,1.0,0.0,0.0,0.0,1.0,0.0
4,1,5,2,88849.0,2009,88999.0,4.0,2.0,3,29,1.0,0.0,0.0,0.0,1.0,0.0
5,1,6,2,88424.0,2009,88849.0,5.0,2.0,3,29,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
560,1076,18,15,85367.0,2022,85189.0,17.0,16.0,4,10,1.0,0.0,0.0,1.0,0.0,0.0
561,1076,19,15,85503.0,2022,85367.0,18.0,15.0,4,10,1.0,0.0,0.0,1.0,0.0,0.0
562,1076,20,15,85398.0,2022,85503.0,19.0,15.0,4,10,1.0,0.0,0.0,1.0,0.0,0.0
563,1076,21,13,85505.0,2022,85398.0,20.0,15.0,4,10,1.0,0.0,0.0,1.0,0.0,0.0


In [27]:
fig = go.Figure()

# Two traces per race: the lap time itself and the engineered
# milliseconds_1_prior column, both plotted against the lap number.
for race_id in list_race_id:

    lap_times_race = engineered_lap_times_df[
        engineered_lap_times_df["raceId"] == race_id
    ]

    fig.add_trace(
        go.Scatter(
            x=lap_times_race["lap"],
            y=lap_times_race["milliseconds"],
            mode="lines+markers",
            name=f"race id: {race_id}",
        )
    )

    fig.add_trace(
        go.Scatter(
            x=lap_times_race["lap"],
            y=lap_times_race["milliseconds_1_prior"],
            mode="lines+markers",
            name=f"race id: {race_id} 1 prior",
        )
    )

# The layout does not depend on the race, so it is set once after the loop
# instead of being re-applied on every iteration.
fig.update_layout(
    title="Laptimes by each lap and prior lap",
    xaxis_title="Lap number",
    yaxis_title="Total lap time",
)
fig.show()

# Splitting into train, test and validation

In [28]:
# Helper that performs the train/test and train/validation splits.
split_obj = splitData()

# NOTE(review): race_id=338 is a magic number; presumably that race is held out
# as the test set — confirm against splitData.train_test_split and consider
# moving the value into the config.
train_data, test_data = split_obj.train_test_split(
    data=engineered_lap_times_df, race_id=338
)

# Split the remaining training data again into training and validation sets.
training_set, validation_set = split_obj.train_validation_split(train_data=train_data)

# Transform data into modelling data
- scaling 
- sequencing

In [29]:
## training_set transformation

In [30]:
# Transformer configured with the input/output window lengths from the config.
data_transform_obj = transformData(
    n_steps_input=config.get("N_STEP_INPUT"),
    n_steps_output=config.get("N_STEP_OUTPUT"),
    config=config,
)

# Scale the training set with train=True.
# NOTE(review): the log output shows scaler_dict.pkl being saved, presumably by
# this call — confirm that the scalers are fitted here and only reused below.
scaled_input_df_train, scaled_output_df_train = (
    data_transform_obj.create_scaled_input_output_data(data=training_set, train=True)
)


# Scale the validation set with train=False.
scaled_input_df_val, scaled_output_df_val = (
    data_transform_obj.create_scaled_input_output_data(data=validation_set, train=False)
)

INFO:root:scaler_dict.pkl saved to ../artifacts/


In [41]:
# Display the scaled validation inputs.
# NOTE(review): this cell's execution count (41) is out of sequence with its
# neighbours — re-run the notebook top to bottom before sharing.
scaled_input_df_val

Unnamed: 0,year_scaled,day_scaled,lap_scaled,position_scaled,milliseconds_1_prior_scaled,lap_number_1_prior_scaled,position_1_prior_lap_scaled,month_sin,month_cos,isPitStop_False,raceId,statusId_4,statusId_1,isPitStop_True,statusId_3,statusId_36
0,-1.660550,1.495531,1.000000,0.055556,0.273641,1.000000,0.055556,1.000000,6.123234e-17,1.0,1,1.0,0.0,0.0,0.0,0.0
1,-1.660550,1.495531,1.022727,0.055556,0.112934,1.022727,0.055556,1.000000,6.123234e-17,1.0,1,1.0,0.0,0.0,0.0,0.0
2,-1.660550,1.495531,1.045455,0.055556,0.055577,1.045455,0.055556,1.000000,6.123234e-17,1.0,1,1.0,0.0,0.0,0.0,0.0
3,-1.660550,1.495531,1.068182,0.055556,0.051616,1.068182,0.055556,1.000000,6.123234e-17,1.0,1,1.0,0.0,0.0,0.0,0.0
4,-1.660550,1.495531,1.090909,0.055556,0.052558,1.090909,0.055556,1.000000,6.123234e-17,1.0,1,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,2.102348,-2.096095,0.363636,0.777778,-0.004709,0.363636,0.833333,0.866025,-5.000000e-01,1.0,1076,0.0,0.0,0.0,1.0,0.0
108,2.102348,-2.096095,0.386364,0.777778,-0.002412,0.386364,0.777778,0.866025,-5.000000e-01,1.0,1076,0.0,0.0,0.0,1.0,0.0
109,2.102348,-2.096095,0.409091,0.777778,-0.000658,0.409091,0.777778,0.866025,-5.000000e-01,1.0,1076,0.0,0.0,0.0,1.0,0.0
110,2.102348,-2.096095,0.431818,0.666667,-0.002013,0.431818,0.777778,0.866025,-5.000000e-01,1.0,1076,0.0,0.0,0.0,1.0,0.0


In [32]:
# Turn the scaled training frames into input/output sequences.
x_sequential_train, y_sequential_train = data_transform_obj.create_sequence(
    scaled_input_df=scaled_input_df_train, scaled_output_df=scaled_output_df_train
)

# Same for the scaled validation frames.
x_sequential_val, y_sequential_val = data_transform_obj.create_sequence(
    scaled_input_df=scaled_input_df_val, scaled_output_df=scaled_output_df_val
)

In [33]:
# Convert the training sequences to float tensors.
x_train = torch.Tensor(x_sequential_train)
y_train = torch.Tensor(y_sequential_train)

# Keep the inputs as a 3-D tensor. The middle dimension is inferred with -1
# instead of being hard-coded to 2, so the cell keeps working when the input
# window length changes.
x_train = torch.reshape(
    x_train,
    (
        x_train.shape[0],
        -1,
        x_train.shape[2],
    ),
)

# Same conversion for the validation sequences.
x_validation = torch.Tensor(x_sequential_val)
y_validation = torch.Tensor(y_sequential_val)

x_validation = torch.reshape(
    x_validation,
    (
        x_validation.shape[0],
        -1,
        x_validation.shape[2],
    ),
)

In [34]:
# Training helper, configured from the notebook-level `config`.
train_obj = train(config=config)

# Wrap the training and validation tensors in data loaders.
# NOTE(review): batch_size=8 is hard-coded here; consider moving it to the config.
train_data_loader, validation_data_loader = train_obj.create_data_loader(
    batch_size=8,
    x_train=x_train,
    y_train=y_train,
    x_validation=x_validation,
    y_validation=y_validation,
)

In [35]:
# Size of the last dimension of x_train; the model cell below passes this value
# as input_size.
x_train.shape[2]

15

In [36]:
# Hyper-parameters of this run, named so they are easy to find and tune.
SEED = 42
HIDDEN_SIZE = 8
DROPOUT_RATE = 0.1
LEARNING_RATE = 0.001
N_EPOCHS = 10

# Seed PyTorch before the model is created so weight initialisation and any
# torch-driven shuffling during training are repeatable from run to run.
torch.manual_seed(SEED)

# Output size and number of layers come from the config; the input size is the
# last dimension of x_train.
lstm_model = LSTM(
    num_classes=config.get("N_STEP_OUTPUT"),
    input_size=x_train.shape[2],
    hidden_size_layer=HIDDEN_SIZE,
    num_layers=config.get("NUM_LAYERS"),
    dropout_rate=DROPOUT_RATE,
)
# Move the model to the selected device.
# NOTE(review): assumes LSTM is a torch.nn.Module, so .to() acts in place — confirm.
lstm_model.to(device)

# Mean squared error between predictions and targets.
loss_fn = torch.nn.MSELoss()

optimizer = torch.optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE)

# Run the training/validation loop; its return value is kept as validation_loss.
validation_loss = train_obj.train_validation_loop(
    n_epochs=N_EPOCHS,
    lstm_model=lstm_model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    train_loader=train_data_loader,
    validation_loader=validation_data_loader,
    device=device,
)

Epoch num: 0
Batch 1, train loss: 0.0017917284276336432
Batch 2, train loss: 0.0018878477858379483
Batch 3, train loss: 0.0015220972709357738
Batch 4, train loss: 0.0023793629370629787
Batch 5, train loss: 0.0014914677012711763
Batch 6, train loss: 0.001568135223351419
Batch 7, train loss: 0.0013983328826725483
Batch 8, train loss: 0.001572939450852573
Batch 9, train loss: 0.0020486374851316214
Batch 10, train loss: 0.001302922610193491
Batch 11, train loss: 0.0023481135722249746
Batch 12, train loss: 0.0011498351814225316
Batch 13, train loss: 0.0012275032931938767
Batch 14, train loss: 0.001193415024317801
Batch 15, train loss: 0.0012467049527913332
Batch 16, train loss: 0.001421367283910513
Batch 17, train loss: 0.0009192790021188557
Batch 18, train loss: 0.0009815662633627653
Batch 19, train loss: 0.0013636728981509805
Batch 20, train loss: 0.0008283961797133088
Batch 21, train loss: 0.0014847315615043044
Batch 22, train loss: 0.0009862756123766303
Batch 23, train loss: 0.000952734

INFO:root:model.pkl saved to ../artifacts/


Batch 10, train loss: 6.820795533712953e-05
Batch 11, train loss: 0.00020760226470883936
Batch 12, train loss: 0.00016096467152237892
Batch 13, train loss: 0.0004527407872956246
Batch 14, train loss: 0.0002731763815972954
Batch 15, train loss: 4.907527909381315e-05
Batch 16, train loss: 0.00021205983648542315
Batch 17, train loss: 0.0003569457621779293
Batch 18, train loss: 0.0005265909712761641
Batch 19, train loss: 8.239853195846081e-05
Batch 20, train loss: 0.0006219689385034144
Batch 21, train loss: 0.0007536853663623333
Batch 22, train loss: 0.00030931062065064907
Batch 23, train loss: 0.001092849182896316
Batch 24, train loss: 0.0006307278526946902
Batch 25, train loss: 0.0002553887024987489
Batch 26, train loss: 0.00011588749475777149
Batch 27, train loss: 6.157558527775109e-05
Batch 28, train loss: 0.00010390221723355353
Batch 29, train loss: 9.617996693123132e-05
Batch 30, train loss: 0.00012965429050382227
Batch 31, train loss: 0.00010703235602704808
Batch 32, train loss: 0.0

In [37]:
# Inference helper, configured from the notebook-level `config`.
infer_obj = makeInference(config=config)

# Run inference on the held-out test data; returns the predictions and the
# matching actual sequences.
# NOTE(review): the tensor printed below this cell presumably comes from inside
# perform_inference — confirm and consider removing that print.
prediction_tensor, actual_sequential = infer_obj.perform_inference(
    test_data=test_data, device=device
)

tensor([[0.1477, 0.2981, 0.0000, 0.0000, 0.0000],
        [0.1114, 0.2649, 0.0000, 0.0000, 0.0000],
        [0.1275, 0.2189, 0.0000, 0.0000, 0.0000],
        [0.1385, 0.0000, 0.0000, 0.0000, 0.0029],
        [0.2136, 0.0000, 0.0000, 0.0000, 0.0308],
        [0.2246, 0.0000, 0.0000, 0.0000, 0.0312],
        [0.2503, 0.0000, 0.0000, 0.0000, 0.0510],
        [0.2591, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2302, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2616, 0.0000, 0.0000, 0.0000, 0.0740],
        [0.2849, 0.0000, 0.0000, 0.0000, 0.0816],
        [0.2936, 0.0000, 0.0000, 0.0000, 0.0819],
        [0.3017, 0.0000, 0.0000, 0.0000, 0.0761],
        [0.3076, 0.0000, 0.0000, 0.0000, 0.0623],
        [0.3133, 0.0000, 0.0000, 0.0000, 0.0664],
        [0.3229, 0.0000, 0.0000, 0.0000, 0.0669],
        [0.3310, 0.0000, 0.0000, 0.0000, 0.0636],
        [0.3382, 0.0000, 0.0000, 0.0000, 0.0601],
        [0.3454, 0.0000, 0.0000, 0.0000, 0.0567]])


In [38]:
# Display the predictions. Despite the name, the output below is rendered as a
# NumPy array, not a torch tensor.
prediction_tensor

array([[ 96690.39001092, 108404.16316617,  85189.        ,
         85189.        ,  85189.        ],
       [ 93864.57443706, 105820.15895972,  85189.        ,
         85189.        ,  85189.        ],
       [ 95119.02374397, 102236.41331172,  85189.        ,
         85189.        ,  85189.        ],
       [ 95977.93654197,  85189.        ,  85189.        ,
         85189.        ,  85414.36435786],
       [101825.88968657,  85189.        ,  85189.        ,
         85189.        ,  87588.84876087],
       [102682.14961007,  85189.        ,  85189.        ,
         85189.        ,  87622.71062659],
       [104679.72233069,  85189.        ,  85189.        ,
         85189.        ,  89162.71355748],
       [105371.15937042,  85189.        ,  85189.        ,
         85189.        ,  85189.        ],
       [103113.59334397,  85189.        ,  85189.        ,
         85189.        ,  85189.        ],
       [105563.49161115,  85189.        ,  85189.        ,
         85189.        

In [39]:
# Display the actual sequences (a list of arrays in the output below) for
# comparison with the predictions.
actual_sequential

[array([156229., 150999., 104978., 103250., 102775.]),
 array([150999., 104978., 103250., 102775., 101476.]),
 array([104978., 103250., 102775., 101476., 121768.]),
 array([103250., 102775., 101476., 121768., 103572.]),
 array([102775., 101476., 121768., 103572.,  93057.]),
 array([101476., 121768., 103572.,  93057.,  92439.]),
 array([121768., 103572.,  93057.,  92439.,  91856.]),
 array([103572.,  93057.,  92439.,  91856.,  92287.]),
 array([93057., 92439., 91856., 92287., 93925.]),
 array([92439., 91856., 92287., 93925., 92447.]),
 array([91856., 92287., 93925., 92447., 91866.]),
 array([92287., 93925., 92447., 91866., 91812.]),
 array([93925., 92447., 91866., 91812., 91744.]),
 array([92447., 91866., 91812., 91744., 91626.]),
 array([91866., 91812., 91744., 91626., 91558.]),
 array([91812., 91744., 91626., 91558., 91556.]),
 array([91744., 91626., 91558., 91556., 92064.]),
 array([91626., 91558., 91556., 92064., 92939.]),
 array([91558., 91556., 92064., 92939., 92391.])]