# Master pipeline notebook


### This notebook follows the ML pipeline process of gathering data, performing analysis, modelling and evaluation


The process involves: 
1. Data gathering and pre-processing
2. Data analysis
3. Feature engineering 
4. Data transformation
5. LSTM Modelling 
6. Evaluation


_Note: Run the notebook from start to end._


In [45]:
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import torch

from data import gatherData
from evaluation import evaluateModellingResults
from feature import engineerFeaturesForTraining, splitData
from inference import makeInference
from LSTM_model import LSTM
from pre_process_data import processData
from train import train
from transform import transformData
from utils import load_config_file

In [46]:
# Read the pipeline settings; later cells look values up with config.get(...).
config = load_config_file("config.yml")

# Use the first CUDA device when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cpu'

## Data gathering and pre-processing

- Gather the data 
- Pre process it:
    - Merge columns from races_df and results_df
    - Add columns from pit_stop_df
    - Work on the outliers in the data
    - Remove race_ids with fewer than 8 data points


In [47]:
# Build the data-gathering helper from the shared config; the next cell uses it to load the raw tables.
gather_data_obj = gatherData(config=config)

In [4]:
# load_data() hands back all raw tables at once; they are unpacked positionally,
# so the names below must stay in the order the loader returns them.
raw_tables = gather_data_obj.load_data()
(
    lap_times_df,
    pit_stops_df,
    qualifying_df,
    races_df,
    results_df,
    sprint_results_df,
    status_df,
) = raw_tables

In [5]:
# Pre-processing helper, configured from the shared config.
pre_process_obj = processData(config=config)

# Build the initial per-lap dataset from the lap times, races and results tables.
master_lap_times_df = pre_process_obj.create_initial_dataset(
    lap_times_df=lap_times_df, races_df=races_df, results_df=results_df
)
# Last expression of the cell: renders the (truncated) frame for inspection.
master_lap_times_df

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId
0,841,20,1,1,98109,2011,1,1,Australian Grand Prix,2011-03-27,1,1
1,841,20,2,1,93006,2011,1,1,Australian Grand Prix,2011-03-27,1,1
2,841,20,3,1,92713,2011,1,1,Australian Grand Prix,2011-03-27,1,1
3,841,20,4,1,92803,2011,1,1,Australian Grand Prix,2011-03-27,1,1
4,841,20,5,1,92342,2011,1,1,Australian Grand Prix,2011-03-27,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
551737,1110,817,40,17,114361,2023,12,13,Belgian Grand Prix,2023-07-30,3,1
551738,1110,817,41,17,113367,2023,12,13,Belgian Grand Prix,2023-07-30,3,1
551739,1110,817,42,16,115247,2023,12,13,Belgian Grand Prix,2023-07-30,3,1
551740,1110,817,43,16,112115,2023,12,13,Belgian Grand Prix,2023-07-30,3,1


In [6]:
# Check each column for missing values: True means the column contains at least one NaN.
master_lap_times_df.isna().any()

raceId          False
driverId        False
lap             False
position        False
milliseconds    False
year            False
round           False
circuitId       False
name            False
date            False
number          False
statusId        False
dtype: bool

In [7]:
# Example driver/race pair used for the exploratory plot in the next cell.
driver_id = 20
race_id = 841

# Build the two row masks separately, then keep only laps matching both.
is_selected_race = master_lap_times_df["raceId"] == race_id
is_selected_driver = master_lap_times_df["driverId"] == driver_id
lap_times_selected_driver_race = master_lap_times_df[
    is_selected_race & is_selected_driver
]

In [8]:
## Lap times and pit stops

# Single trace: lap number on the x-axis, lap time in milliseconds on the y-axis.
lap_time_trace = go.Scatter(
    x=lap_times_selected_driver_race["lap"],
    y=lap_times_selected_driver_race["milliseconds"],
    mode="lines+markers",
    name="Lap times in milliseconds",
)

fig = go.Figure()
fig.add_trace(lap_time_trace)

# update_layout returns the figure, so leaving it as the final expression renders the plot.
fig.update_layout(
    title="Laptimes by each lap",
    xaxis_title="Lap number",
    yaxis_title="Total lap time",
)

- There are two outliers in the data.
- These are mainly caused by the pit stops that happen during the race.

In [9]:
# Enrich the master lap-time frame with pit-stop information.
# The displayed result carries two extra columns: pitStopMilliseconds and isPitStop.
master_laptimes_pitstop_df = pre_process_obj.add_pitstop_data(
    master_laptime_data=master_lap_times_df, pit_stop_data=pit_stops_df
)
# Last expression of the cell: renders the (truncated) frame for inspection.
master_laptimes_pitstop_df

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
0,841,20,1,1,98109,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
1,841,20,2,1,93006,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
2,841,20,3,1,92713,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
3,841,20,4,1,92803,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
4,841,20,5,1,92342,2011,1,1,Australian Grand Prix,2011-03-27,1,1,0.0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551737,1110,817,40,17,114361,2023,12,13,Belgian Grand Prix,2023-07-30,3,1,0.0,False
551738,1110,817,41,17,113367,2023,12,13,Belgian Grand Prix,2023-07-30,3,1,0.0,False
551739,1110,817,42,16,115247,2023,12,13,Belgian Grand Prix,2023-07-30,3,1,0.0,False
551740,1110,817,43,16,112115,2023,12,13,Belgian Grand Prix,2023-07-30,3,1,0.0,False


In [10]:
driver_id = 20

# Keep every lap driven by the selected driver, across all races.
is_selected_driver = master_laptimes_pitstop_df["driverId"] == driver_id
lap_times_selected_driver = master_laptimes_pitstop_df[is_selected_driver]

In [11]:
# Box plot of the selected driver's lap times before any outlier handling.
# lap_times_selected_driver is filtered by driverId only, so it covers every race
# for that driver; the labels below say so instead of naming a single Grand Prix.
fig = go.Figure()

# Add a box trace
fig.add_trace(
    go.Box(
        y=lap_times_selected_driver["milliseconds"],
        name=f"Lap times for driver {driver_id} (all GPs)",
    )
)

# Customize the layout
fig.update_layout(
    title="Box plot of lap times before outlier handling",
    yaxis_title="Lap times (milliseconds)",
)

# Show the figure
fig.show()

In [12]:
# Run the outlier handling on the selected driver's laps.
# NOTE(review): despite the "master" name, the input is the single-driver frame
# (lap_times_selected_driver), not master_laptimes_pitstop_df — confirm this is intended.
master_lap_time_data = pre_process_obj.work_on_outliers(master_lap_time_data=lap_times_selected_driver)

In [13]:
# Box plot of lap times after the outlier-handling step.
lap_time_box = go.Box(
    y=master_lap_time_data["milliseconds"],
    name="Lap times for all GPs",
)

fig = go.Figure()
fig.add_trace(lap_time_box)

# Title and axis label for the figure.
fig.update_layout(
    title="Box Plot for millisecond for all GPs",
    yaxis_title="Lap times (milliseconds)",
)

# Render the figure.
fig.show()

In [15]:
# Lap-time curves for every race held at one circuit, one trace per race.
circuit_id = 1

# Restrict the outlier-handled data to the chosen circuit and list its races
# in order of first appearance.
lap_times_selected_driver_circuit = master_lap_time_data[master_lap_time_data["circuitId"]==circuit_id]
list_race_id = lap_times_selected_driver_circuit["raceId"].unique().tolist()

fig = go.Figure()

for race_id in list_race_id:

    lap_times_race = lap_times_selected_driver_circuit[
        lap_times_selected_driver_circuit["raceId"] == race_id
    ]

    fig.add_trace(
        go.Scatter(
            x=lap_times_race["lap"],
            y=lap_times_race["milliseconds"],
            mode="lines+markers",
            name=f"race id: {race_id}",
        )
    )

# The layout does not depend on the race, so it is set once after the loop
# instead of being re-applied on every iteration.
fig.update_layout(
    title=f"Laptimes by each lap for circuit: {circuit_id}",
    xaxis_title="Lap number",
    yaxis_title="Total lap time",
)
fig.show()

In [16]:
# Inspect the rows that remain for raceId 900 after outlier handling.
master_lap_time_data[master_lap_time_data["raceId"] == 900]

Unnamed: 0,raceId,driverId,lap,position,milliseconds,year,round,circuitId,name,date,number,statusId,pitStopMilliseconds,isPitStop
72699,900,20,1,15,120977.0,2014,1,1,Australian Grand Prix,2014-03-16,1,5,0.0,False
72700,900,20,2,16,109947.0,2014,1,1,Australian Grand Prix,2014-03-16,1,5,0.0,False
72701,900,20,3,16,111460.0,2014,1,1,Australian Grand Prix,2014-03-16,1,5,0.0,False


In [17]:
# Lap-time curves for every race held at one circuit, one trace per race.
circuit_id = 12

# Restrict the outlier-handled data to the chosen circuit and list its races
# in order of first appearance. list_race_id and circuit_id are reused by the
# actual-vs-prior-lap plot in the Feature Engineering section.
lap_times_selected_driver_circuit = master_lap_time_data[master_lap_time_data["circuitId"]==circuit_id]
list_race_id = lap_times_selected_driver_circuit["raceId"].unique().tolist()

fig = go.Figure()

for race_id in list_race_id:

    lap_times_race = lap_times_selected_driver_circuit[
        lap_times_selected_driver_circuit["raceId"] == race_id
    ]

    fig.add_trace(
        go.Scatter(
            x=lap_times_race["lap"],
            y=lap_times_race["milliseconds"],
            mode="lines+markers",
            name=f"race id: {race_id}",
        )
    )

# The layout does not depend on the race, so it is set once after the loop
# instead of being re-applied on every iteration.
fig.update_layout(
    title=f"Laptimes by each lap for circuit: {circuit_id}",
    xaxis_title="Lap number",
    yaxis_title="Total lap time",
)
fig.show()

## Feature Engineering

- Add columns that can help in modelling
    - add date feature columns
    - add lagged features: previous_lap_time, previous_lap_position, previous_lap_number
    - one hot encode the categorical features

In [18]:
# Feature-engineering helper, configured from the shared config.
engineer_data_obj = engineerFeaturesForTraining(config=config)


# engineer_data returns three values: the engineered frame, plus an encoder object
# and its column names (presumably from the one-hot encoding step — confirm in feature.py).
engineered_lap_times_df, encoder, encoder_columns = engineer_data_obj.engineer_data(
    lap_times_data=master_lap_time_data
)

In [19]:
# Render the (truncated) engineered frame to inspect the new feature columns.
engineered_lap_times_df

Unnamed: 0,raceId,lap,position,milliseconds,year,milliseconds_1_prior,lap_number_1_prior,position_1_prior_lap,month,day,...,circuitId_68,circuitId_69,circuitId_70,circuitId_71,circuitId_73,circuitId_75,circuitId_76,circuitId_77,circuitId_78,circuitId_79
1,1,2,2,91173.0,2009,99647.0,1.0,2.0,3,29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,3,2,89752.0,2009,91173.0,2.0,2.0,3,29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,4,2,88999.0,2009,89752.0,3.0,2.0,3,29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,5,2,88849.0,2009,88999.0,4.0,2.0,3,29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,6,2,88424.0,2009,88849.0,5.0,2.0,3,29,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16394,1096,54,11,90931.0,2022,91056.0,53.0,11.0,11,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16395,1096,55,11,91304.0,2022,90931.0,54.0,11.0,11,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16396,1096,56,10,91485.0,2022,91304.0,55.0,11.0,11,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16397,1096,57,10,91227.0,2022,91485.0,56.0,10.0,11,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Sanity check of the lagged feature: for each race, plot the lap time next to the
# "milliseconds_1_prior" column so the one-lap shift is visible.
# NOTE: list_race_id and circuit_id are not defined in this cell; they come from the
# most recently executed circuit plot cell above.
fig = go.Figure()

for race_id in list_race_id:

    lap_times_race = engineered_lap_times_df[
        engineered_lap_times_df["raceId"] == race_id
    ]

    # Lap time of the current lap.
    fig.add_trace(
        go.Scatter(
            x=lap_times_race["lap"],
            y=lap_times_race["milliseconds"],
            mode="lines+markers",
            name=f"race id: {race_id}",
        )
    )

    # Lagged lap-time feature for the same laps.
    fig.add_trace(
        go.Scatter(
            x=lap_times_race["lap"],
            y=lap_times_race["milliseconds_1_prior"],
            mode="lines+markers",
            name=f"race id: {race_id} 1 prior",
        )
    )

# The layout does not depend on the race, so it is set once after the loop.
# The title labels the interpolated value as a circuit id instead of printing a bare number.
fig.update_layout(
    title=f"Laptimes by each lap and prior lap for circuit: {circuit_id}",
    xaxis_title="Lap number",
    yaxis_title="Total lap time",
)
fig.show()

## Splitting into train, test and validation sets

The data is split in two stages:

 - First, a race_id is selected to act as an unseen test set; the model will make predictions on it
 - Train and validation sets: 80% of the particular race is used for training and the remainder as the validation set

In [21]:
# Helper that performs both splits below.
split_obj = splitData()

# Split into train and test data using the race id configured under RACE_ID_TEST_SET.
train_data, test_data = split_obj.train_test_split(
    data=engineered_lap_times_df, race_id=config.get("RACE_ID_TEST_SET")
)

# Split the training data further into a training set and a validation set.
training_set, validation_set = split_obj.train_validation_split(train_data=train_data)

## Transform data for LSTM model
- scaling 
    - Scale each numerical column using appropriate scaler
- sequencing
    - all scaled and categorical columns are combined 
    - the dataframe is then converted to a sequential tensor

In [23]:
# Window sizes for the input and output sequences are read from the config.
n_steps_input = config.get("NUMBER_OF_HISTORICAL_LAP")
n_steps_output = config.get("N_STEP_OUTPUT")

data_transform_obj = transformData(
    n_steps_input=n_steps_input,
    n_steps_output=n_steps_output,
    config=config,
)

# Training set is scaled with train=True, validation set with train=False.
# NOTE(review): presumably scalers are fitted only when train=True — confirm in transform.py.
scaled_train = data_transform_obj.create_scaled_input_output_data(
    data=training_set, train=True
)
scaled_input_df_train, scaled_output_df_train = scaled_train

scaled_val = data_transform_obj.create_scaled_input_output_data(
    data=validation_set, train=False
)
scaled_input_df_val, scaled_output_df_val = scaled_val

INFO:root:scaler_dict.pkl saved to ../artifacts/


In [25]:
# Turn the scaled training frames into sequential input/output arrays.
x_sequential_train, y_sequential_train = data_transform_obj.create_sequence(
    scaled_input_df=scaled_input_df_train, scaled_output_df=scaled_output_df_train
)

# Same conversion for the validation frames.
x_sequential_val, y_sequential_val = data_transform_obj.create_sequence(
    scaled_input_df=scaled_input_df_val, scaled_output_df=scaled_output_df_val
)

In [26]:
# Length of the middle (sequence) dimension enforced on the input tensors.
n_history = config.get("NUMBER_OF_HISTORICAL_LAP")

# Training tensors: inputs are reshaped to (samples, n_history, features).
x_train = torch.Tensor(x_sequential_train)
y_train = torch.Tensor(y_sequential_train)
x_train = x_train.reshape(x_train.shape[0], n_history, x_train.shape[2])

# Validation tensors, shaped the same way.
x_validation = torch.Tensor(x_sequential_val)
y_validation = torch.Tensor(y_sequential_val)
x_validation = x_validation.reshape(
    x_validation.shape[0], n_history, x_validation.shape[2]
)

## Model training
- Create data loader for the LSTM model
- define LSTM model architecture
- define training loop
- train and save the model 

In [27]:
# Training helper, configured from the shared config.
train_obj = train(config=config)

# Wrap the train/validation tensors in data loaders using the configured batch size.
train_data_loader, validation_data_loader = train_obj.create_data_loader(
    batch_size=config.get("BATCH_SIZE"),
    x_train=x_train,
    y_train=y_train,
    x_validation=x_validation,
    y_validation=y_validation,
)

In [28]:
# Size of the last dimension of the input tensor; the next cell passes it to the model as input_size.
x_train.shape[2]

68

In [29]:
# Training settings that are hard-coded in this notebook rather than read from the
# config; they are named here so they are easy to find and tune.
SEED = 42
DROPOUT_RATE = 0.1
LEARNING_RATE = 0.01
N_EPOCHS = 20

# Seed torch's global random number generator before the model is built, so that
# weight initialisation (and any other torch-driven randomness during training)
# is repeatable across Restart & Run All executions.
torch.manual_seed(SEED)

# Model dimensions come from the config; input_size is the feature dimension of x_train.
lstm_model = LSTM(
    num_classes=config.get("N_STEP_OUTPUT"),
    input_size=x_train.shape[2],
    hidden_size_layer_1=config.get("HIDDEN_SIZE_1"),
    hidden_size_layer_2=config.get("HIDDEN_SIZE_2"),
    num_layers=config.get("NUM_LAYERS"),
    dense_layer_size=config.get("DENSE_LAYER"),
    dropout_rate=DROPOUT_RATE,
)
lstm_model.to(device)

# Mean squared error loss, optimised with Adam.
loss_fn = torch.nn.MSELoss()

optimizer = torch.optim.Adam(lstm_model.parameters(), lr=LEARNING_RATE)

# Run the combined train/validation loop; it returns the validation loss.
validation_loss = train_obj.train_validation_loop(
    n_epochs=N_EPOCHS,
    lstm_model=lstm_model,
    optimizer=optimizer,
    loss_fn=loss_fn,
    train_loader=train_data_loader,
    validation_loader=validation_data_loader,
    device=device,
)

Epoch num: 0
Batch 1, train loss: 0.004526348318904638
Batch 2, train loss: 0.005203499924391508
Batch 3, train loss: 0.006711249239742756
Batch 4, train loss: 0.002407653722912073
Batch 5, train loss: 0.003762411419302225
Batch 6, train loss: 0.003183706197887659
Batch 7, train loss: 0.007202202454209328
Batch 8, train loss: 0.0032595335505902767
Batch 9, train loss: 0.004308083560317755
Batch 10, train loss: 0.004272842779755592
Batch 11, train loss: 0.003994180355221033
Batch 12, train loss: 0.0073915040120482445
Batch 13, train loss: 0.005659695714712143
Batch 14, train loss: 0.004405379761010408
Batch 15, train loss: 0.0063493456691503525
Batch 16, train loss: 0.007529687136411667
Batch 17, train loss: 0.005278805270791054
Batch 18, train loss: 0.006199687719345093
Batch 19, train loss: 0.0059022074565291405
Batch 20, train loss: 0.004281233996152878
Batch 21, train loss: 0.005711867939680815
Batch 22, train loss: 0.0017804462695494294
Batch 23, train loss: 0.002643103012815118
Ba

INFO:root:model.pkl saved to ../artifacts/


Validation loss: 0.3536624610424042


## Evaluation

- Check model performance in a real-life setting
- Compare with laptimes from the unseen race id selected earlier
- show the model performance throughout the race

In [30]:
# Inference helper, configured from the shared config.
infer_obj = makeInference(config=config)

# Run the model on the held-out test race. Three values come back: the predictions,
# the matching actual values, and the lap sequence they belong to.
prediction_tensor, actual_sequential, lap_sequential = infer_obj.perform_inference(
    test_data=test_data, device=device
)

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.7501, 0.4073, 0.4189, 0.4518, 0.3150],
        [1.9005, 2.0088, 1.8678, 1.6509, 1.5464],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000],


In [34]:
# Evaluation helper.
evaluate = evaluateModellingResults()

# Compute the evaluation metrics from predictions vs. actuals.
# The prediction tensor is converted to a plain nested list before being passed in.
rmse_mae_mape_dict = evaluate.get_model_evaluation_metrics(
    ls_prediction=prediction_tensor.tolist(),
    ls_actual=actual_sequential,
    lap_sequential=lap_sequential,
)

[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[109409.8964471519, 101964.48328310251, 102216.3289963603, 102931.67885796726, 99960.8888002634]
[134392.30228865147, 136744.05836367607, 133682.7134385109, 128973.45386886597, 126702.44645023346]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93119.0, 93119.0, 93119.0]
[93119.0, 93119.0, 93

In [35]:
# Put the metrics dictionary into a DataFrame; the next cell plots its
# "current_lap", "RMSE" and "MAE" columns.
rmse_mae_mape_df = pd.DataFrame(rmse_mae_mape_dict)

In [36]:
# Plot each error metric against the lap it was computed at, one trace per metric.
fig = go.Figure()

for metric_name in ("RMSE", "MAE"):
    metric_trace = go.Scatter(
        x=rmse_mae_mape_df["current_lap"],
        y=rmse_mae_mape_df[metric_name],
        mode="lines+markers",
        name=metric_name,
    )
    fig.add_trace(metric_trace)

# update_layout returns the figure, so leaving it as the final expression renders the plot.
fig.update_layout(
    title="error rate in prediction at each lap",
    xaxis_title="Lap number",
    yaxis_title="Error",
)

## Data Analysis

In [42]:
# Distribution of the engineered lap times: histogram normalised to probabilities,
# with a KDE curve and no rug plot. bin_size is in the units of the "milliseconds" column.
# plotly.figure_factory (ff) is imported in the import cell at the top of the notebook.
histvalues = [engineered_lap_times_df["milliseconds"]]
names = ["lap times"]

fig = ff.create_distplot(
    histvalues,
    names,
    bin_size=10000,
    show_rug=False,
    curve_type="kde",
    histnorm="probability",
)

fig.show()

In [43]:
# Re-render whichever figure `fig` currently refers to (the lap-time distribution
# plot when the notebook is run top to bottom).
fig.show()