# Krafthack, 7–8 February 2022

In [1]:
%load_ext autoreload

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error as mape
from tqdm.notebook import tqdm

from xgboost import XGBRegressor

import tensorflow as tf
from keras import optimizers, Sequential
from keras.models import Model
from keras.layers import Dense, Input, Activation

import matplotlib.pyplot as plt

from utils.preprocessing import get_timeslots, get_temporal_lookback_features, get_temporal_lookback_df, add_hour_feature, add_seconds_operational

2022-03-08 00:07:57.280171: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-03-08 00:07:57.280360: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Import data

In [None]:
# Read the two parquet inputs from the local data/ directory: df_train is the
# frame that later keeps the Bolt_*_Tensile target columns, df_test is the
# prediction input whose columns define the feature set.
df_train = pd.read_parquet('data/input_dataset-2.parquet')
df_test = pd.read_parquet('data/prediction_input.parquet')

## Clean dataset

In [None]:
# Feature set = every column of the prediction input, followed by the six
# Bolt_<n>_Tensile target columns.
cols_keep = [*df_test.columns, *(f'Bolt_{i}_Tensile' for i in range(1, 7))]

# Restrict the training frame to those columns and discard every row that has
# at least one missing value.
df_train = df_train.loc[:, cols_keep].dropna(how='any')

In [None]:
# Combine both datasets before doing feature engineering
# Rows are stacked (axis=0); columns that exist only in df_train come out as
# NaN for the rows that originate from df_test.
df_full = pd.concat([df_train, df_test], axis=0)

In [None]:
# Peek at the last rows, transposed so that each column is listed on its own line
df_full.tail().T

## Feature Engineering
- Log-transform
- Signal-analysis (derivatives, Fourier transform, power, etc)
- Temporal features (day, month, holiday, etc)
- Sequencing
- One-hot encoding of categorical features

### Add temporal features

In [None]:
# add_hour_feature / add_seconds_operational are imported from
# utils.preprocessing and are not shown in this notebook.
# NOTE(review): presumably they append an hour-of-day column and an
# "operational time" column — confirm against that module.
df_full = add_hour_feature(df_full)
df_full = add_seconds_operational(df_full)
# Reads .dayofweek from the index, so the index must be datetime-like
df_full['time_weekday'] = df_full.index.dayofweek

In [None]:
# Inspect the last rows again (transposed) now that the temporal columns exist
df_full.tail().T

### Handle categorical feature

In [None]:
def get_mode_as_dummy(df):
    """One-hot encode the ``mode`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``mode`` column. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        New frame without ``mode``; one ``Mode_<value>`` indicator column per
        distinct value is appended after the remaining columns.

    Raises
    ------
    KeyError
        If ``df`` has no ``mode`` column.
    """
    # Encode in a single get_dummies call rather than get_dummies + join:
    # joining on the index multiplies rows whenever index labels repeat,
    # which silently inflates the frame. This form keeps one output row per
    # input row and needs no in-place drop.
    return pd.get_dummies(df, columns=["mode"], prefix="Mode")

# Swap the "mode" column for its dummy columns. Not re-runnable as-is: on a
# second execution "mode" is already gone from df_full and the lookup fails.
df_full = get_mode_as_dummy(df_full)

In [None]:
# Check that the dummy columns were appended
df_full.tail()

## Get look-back features

In [None]:
# TODO: Maybe change this for some aggregated features instead

In [None]:
# Signal columns passed to the look-back feature builder below
columns = [
    'Unit_4_Power',
    'Unit_4_Reactive Power',
    'Turbine_Guide Vane Opening',
    'Turbine_Pressure Drafttube',
    'Turbine_Pressure Spiral Casing',
    'Turbine_Rotational Speed'
    ]


# get_timeslots / get_temporal_lookback_df come from utils.preprocessing and are
# not shown in this notebook.
# NOTE(review): presumably the frame is split into contiguous time slots and
# lagged copies of `columns` are added per slot, with window_size / steps
# controlling the look-back span and stride — confirm against that module.
df_timeslots_list = get_timeslots(df_full)
df_full_with_lookback = get_temporal_lookback_df(df_timeslots_list, cols=columns, window_size=30, steps=5)

## Split data into train-validate-test

In [None]:
# Cut the engineered frame back into the labelled and the prediction period,
# using the first/last index label of the original frames as slice bounds.
# Train rows with any missing value are dropped.
# Both results are assigned into by the following cell, so take explicit copies:
# this keeps them independent of df_full_with_lookback and avoids
# SettingWithCopyWarning on those assignments.
df_train_new = df_full_with_lookback[df_train.index[0]:df_train.index[-1]].dropna().copy()
df_test_new  = df_full_with_lookback[df_test.index[0]:df_test.index[-1]].copy()

In [None]:
# Insert previous y_value
labels = [f"Bolt_{i}_Tensile" for i in range(1,7)]
labels_prev = [f"Bolt_{i}_Tensile_prev" for i in range(1,7)]

# Train: each row receives the targets of the row before it. The first row has
# no predecessor, so it is back-filled from the second row's *_prev values
# (which are the first row's own targets).
df_train_new[labels_prev] = df_train_new[labels].shift(1)
df_train_new.loc[df_train_new.index[0], labels_prev] = df_train_new.loc[df_train_new.index[1], labels_prev]

# Test: the *_prev values are unknown, except for the first row, whose
# predecessor is the last training row. Seed it with that row's actual targets
# (`labels`), not its `labels_prev` — those hold the targets from one step
# earlier still. to_numpy() makes the assignment positional; a Series indexed
# by `labels` would not align with the `labels_prev` columns and would write NaN.
df_test_new[labels_prev] = np.nan
df_test_new.loc[df_test_new.index[0], labels_prev] = df_train_new.loc[df_train_new.index[-1], labels].to_numpy()

In [None]:
# Last two training rows, to check the *_prev columns against the targets
df_train_new.tail(2)

In [None]:
# First two prediction rows: only the first one carries seeded *_prev values
df_test_new.head(2)

In [None]:
# Compare (rows, columns) of the original frames with their engineered
# counterparts produced by the look-back / previous-target steps.
print(f"df_train: {df_train.shape}")
print(f"df_train_new: {df_train_new.shape}")
print(f"df_test: {df_test.shape}")
print(f"df_test_new: {df_test_new.shape}")

In [None]:
PCT_SPLIT = 0.7

# Labelled period: everything except the target columns is a feature
X_train = df_train_new.drop(columns=labels)
y_train = df_train_new[labels]

# Ordered split without shuffling: the leading PCT_SPLIT share of rows is used
# for fitting, the remaining rows for validation.
X_train_train = X_train.iloc[:int(PCT_SPLIT * len(X_train))]
X_train_val = X_train.iloc[int(PCT_SPLIT * len(X_train)):]
y_train_train = y_train.iloc[:int(PCT_SPLIT * len(y_train))]
y_train_val = y_train.iloc[int(PCT_SPLIT * len(y_train)):]

# Prediction period: same feature/target column partition
X_test = df_test_new.drop(columns=labels)
y_test = df_test_new[labels]

In [5]:
# Report (rows, columns) of every feature split ...
print(f"X_train:        {X_train.shape}")
print(f"X_train_train:  {X_train_train.shape}")
print(f"X_train_val:    {X_train_val.shape}")
print(f"X_test:         {X_test.shape}")

print()

# ... and of the matching target splits; row counts should agree pairwise
print(f"y_train:        {y_train.shape}")
print(f"y_train_train:  {y_train_train.shape}")
print(f"y_train_val:    {y_train_val.shape}")
print(f"y_test:         {y_test.shape}")

X_train:        (1748596, 53)
X_train_train:  (1224017, 53)
X_train_val:    (524579, 53)
X_test:         (226364, 53)

y_train:        (1748596, 6)
y_train_train:  (1224017, 6)
y_train_val:    (524579, 6)
y_test:         (226364, 6)


### Save datasets

In [None]:
# Persist every split as a pickle under data/ so a later session can start from
# the "Load datasets" cell instead of re-running the feature engineering.
X_train.to_pickle('data/X_train.pkl')
X_train_train.to_pickle('data/X_train_train.pkl')
X_train_val.to_pickle('data/X_train_val.pkl')
X_test.to_pickle('data/X_test.pkl')
y_train.to_pickle('data/y_train.pkl')
y_train_train.to_pickle('data/y_train_train.pkl')
y_train_val.to_pickle('data/y_train_val.pkl')
y_test.to_pickle('data/y_test.pkl')

### Load datasets

In [4]:
# Restore the splits written by the "Save datasets" cell.
# Only unpickle files produced by this notebook: loading a pickle from an
# untrusted source can execute arbitrary code.
X_train = pd.read_pickle('data/X_train.pkl')
X_train_train = pd.read_pickle('data/X_train_train.pkl')
X_train_val = pd.read_pickle('data/X_train_val.pkl')
X_test = pd.read_pickle('data/X_test.pkl')
y_train = pd.read_pickle('data/y_train.pkl')
y_train_train = pd.read_pickle('data/y_train_train.pkl')
y_train_val = pd.read_pickle('data/y_train_val.pkl')
y_test = pd.read_pickle('data/y_test.pkl')

## Scaling


In [6]:
scaler_X = StandardScaler()
scaler_y = StandardScaler()


def _as_frame(values, template):
    """Wrap a scaled array in a DataFrame that reuses ``template``'s index and columns."""
    return pd.DataFrame(values, index=template.index, columns=template.columns)


# The scalers are fitted on the fitting split only; the validation split is
# transformed with those already-fitted scalers.
# Note the naming: X_train_scaled / y_train_scaled are the scaled
# X_train_train / y_train_train splits.
X_train_scaled = _as_frame(scaler_X.fit_transform(X_train_train), X_train_train)
X_train_val_scaled = _as_frame(scaler_X.transform(X_train_val), X_train_val)
y_train_scaled = _as_frame(scaler_y.fit_transform(y_train_train), y_train_train)
y_train_val_scaled = _as_frame(scaler_y.transform(y_train_val), y_train_val)

## Train Model

In [7]:
# Registry of estimators: model-type name -> list of estimators, one per target column
models = {}

### Linear Regression

In [8]:
# Only two of the six targets are fitted for now; the full variant is:
# models['linreg'] = [LinearRegression() for i in range(6)]
models['linreg'] = [LinearRegression() for _ in range(2)]

# Estimator number k is fitted on target column Bolt_<k>_Tensile
for bolt_no, estimator in tqdm(enumerate(models['linreg'], start=1), total=len(models['linreg'])):
    estimator.fit(X_train_scaled, y_train_scaled[f"Bolt_{bolt_no}_Tensile"])

  0%|          | 0/2 [00:00<?, ?it/s]

### Random Forest

In [None]:
# models['rndforest'] = [RandomForestRegressor(
#     max_depth=4,
#     n_estimators=100,
#     criterion="absolute_error",
#     max_features=4)
#     for i in range(6)]

# for i, model in tqdm(enumerate(models['rndforest']), total=6):
#     model.fit(X_train_train, y_train_train[f"Bolt_{i+1}_Tensile"])

### XGBoost

In [None]:
# One gradient-boosted regressor per target column, all with the same settings
models['xgboost'] = []
for _ in range(6):
    models['xgboost'].append(
        XGBRegressor(
            booster="gbtree",
            learning_rate=0.2,
            gamma=0.1,
            max_depth=6,
            eval_metric="mae",
        )
    )

# Estimator number k is fitted on target column Bolt_<k>_Tensile
for bolt_no, estimator in tqdm(enumerate(models['xgboost'], start=1), total=6):
    estimator.fit(X_train_scaled, y_train_scaled[f"Bolt_{bolt_no}_Tensile"])

### Multilayer perceptron

## Cross Validation
- [Special methods for time-series data](https://medium.com/@soumyachess1496/cross-validation-in-time-series-566ae4981ce4)

## Hyperparameter tuning
- [Sklearn](https://scikit-learn.org/stable/modules/grid_search.html)
- [Nevergrad](https://facebookresearch.github.io/nevergrad/)
- [Keras Tuner](https://www.tensorflow.org/tutorials/keras/keras_tuner)

## Predict and score
- Good metrics for temporal data?
- Depends on competition metric

In [None]:
# Row positions of the scaled validation features that contain NaN or +/-inf in
# at least one column. Iterating the DataFrame directly would yield column
# labels rather than rows, so the check is done on the underlying 2-D array
# (np.asarray also accepts a plain ndarray).
idx_inf = np.flatnonzero(
    ~np.isfinite(np.asarray(X_train_val_scaled, dtype=float)).all(axis=1)
).tolist()

In [None]:
# Show the row positions that were flagged as containing non-finite values
idx_inf

In [None]:
# Unscaled values around the first flagged row, last six columns only.
# The start is clamped at 0: a negative start would be read as "from the end"
# and give an empty or wrong window when the flagged row is at position 0 or 1.
# Requires idx_inf to be non-empty.
window_start = max(idx_inf[0] - 2, 0)
X_train_val.iloc[window_start:idx_inf[0] + 2, -6:]

In [None]:
# Scaled values around the first flagged row, last six columns only.
# X_train_val_scaled is a DataFrame, so positional slicing goes through .iloc,
# and the flagged positions live in idx_inf (there is no variable named idx).
# The start is clamped at 0 so a flagged row at position 0 or 1 still gives a
# valid window. Requires idx_inf to be non-empty.
window_start = max(idx_inf[0] - 2, 0)
X_train_val_scaled.iloc[window_start:idx_inf[0] + 2, -6:]

In [2]:
# Targets evaluated in this run; restricted to the first bolt for now.
labels = [f"Bolt_{i}_Tensile" for i in range(1,2)]
# labels = [f"Bolt_{i}_Tensile" for i in range(1,7)]
y_preds = {}

for model_name, model in tqdm(models.items(), total=len(models)):
    # For each model type. `model` is the list of per-target estimators;
    # position k holds the estimator fitted on Bolt_<k+1>_Tensile.
    # Work on a private copy so forward-filled predictions of one model type
    # never leak into the features seen by the next one (or by later cells).
    X_rollout = X_train_val_scaled.copy()
    n_rows = len(X_rollout)
    y_preds[model_name] = {}
    for model_i, label in tqdm(enumerate(labels), total=len(labels)):
        # For each sub-model specialized for a unique label column
        prev_col = X_rollout.columns.get_loc(label + '_prev')  # .iat needs a position
        y_preds[model_name][label] = []
        for idx in range(n_rows):
            # Read the row by position on every step (rather than via
            # iterrows()) so the value written on the previous step is used.
            y_hat = model[model_i].predict(X_rollout.iloc[[idx]])[0]
            y_preds[model_name][label].append(y_hat)
            # Forward-fill the prediction into the next row; the last row has
            # no successor.
            # NOTE(review): y_hat is in scaler_y units while the *_prev feature
            # was scaled by scaler_X — confirm the two scalings are close
            # enough, or convert before writing.
            if idx < n_rows - 1:
                X_rollout.iat[idx + 1, prev_col] = y_hat

    # Keep the validation index so predictions line up with y_train_val_scaled
    y_preds[model_name] = pd.DataFrame(y_preds[model_name], index=X_rollout.index)

NameError: name 'tqdm' is not defined

## Model explanation
- Explainable model ([interpretml](https://github.com/interpretml/interpret))
- Certainty score
- [LIME](https://github.com/marcotcr/lime)
- [SHAP](https://github.com/slundberg/shap)

## Deployment
- Pipeline for deploying model
- Host model in e.g. Azure