In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.display import display, HTML
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import KFold, cross_val_score

plt.style.use('fivethirtyeight')
display(HTML("<style>.container { width:80% !important; }</style>"))

# Read input files

In [2]:
# Decided to use both input dataset 1 and 2 to get more data to train on
#df = pd.read_parquet("../container-data/input_dataset-1.parquet")
df = pd.read_parquet("../container-data/input_dataset-2.parquet")
pred_df = pd.read_parquet("../container-data/prediction_input.parquet")

In [3]:
pred_cols = ["Bolt_"+str(x)+"_Tensile" for x in range(1,7)]+list(pred_df.columns)
pred_cols

['Bolt_1_Tensile',
 'Bolt_2_Tensile',
 'Bolt_3_Tensile',
 'Bolt_4_Tensile',
 'Bolt_5_Tensile',
 'Bolt_6_Tensile',
 'Unit_4_Power',
 'Unit_4_Reactive Power',
 'Turbine_Guide Vane Opening',
 'Turbine_Pressure Drafttube',
 'Turbine_Pressure Spiral Casing',
 'Turbine_Rotational Speed',
 'mode']

In [4]:
#df = pd.concat([df1.loc[:,pred_cols], df2.loc[:,pred_cols]], axis=0)

In [5]:
#df.to_pickle("../container-data/df.pkl")

In [6]:
#df = pd.read_pickle("../container-data/df.pkl")
df = df.loc[:,pred_cols]

In [7]:
df.columns

Index(['Bolt_1_Tensile', 'Bolt_2_Tensile', 'Bolt_3_Tensile', 'Bolt_4_Tensile',
       'Bolt_5_Tensile', 'Bolt_6_Tensile', 'Unit_4_Power',
       'Unit_4_Reactive Power', 'Turbine_Guide Vane Opening',
       'Turbine_Pressure Drafttube', 'Turbine_Pressure Spiral Casing',
       'Turbine_Rotational Speed', 'mode'],
      dtype='object')

# Common preprocessing

In [8]:
# Want to extract the time of the earliest datapoint in the dataset
start_time = df.index.min()
start_time

Timestamp('1970-12-19 09:51:44')

In [9]:
df.head()

Unnamed: 0_level_0,Bolt_1_Tensile,Bolt_2_Tensile,Bolt_3_Tensile,Bolt_4_Tensile,Bolt_5_Tensile,Bolt_6_Tensile,Unit_4_Power,Unit_4_Reactive Power,Turbine_Guide Vane Opening,Turbine_Pressure Drafttube,Turbine_Pressure Spiral Casing,Turbine_Rotational Speed,mode
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1970-12-19 09:51:44,1598.48139,1480.989917,1684.247079,1601.359944,1635.5857,1674.848803,262.204308,2.899036,82.279976,173.955216,5310.799181,107.964278,operation
1970-12-19 09:51:45,1598.477449,1480.989528,1684.261611,1601.366508,1635.588478,1674.823883,262.104319,3.34463,82.277248,173.989815,5311.219755,107.964273,operation
1970-12-19 09:51:46,1598.479316,1481.003188,1684.270504,1601.374254,1635.583464,1674.841318,262.00433,3.790223,82.27452,174.024413,5311.640329,107.964269,operation
1970-12-19 09:51:47,1598.490184,1481.028827,1684.270683,1601.383179,1635.581384,1674.843245,261.90434,4.235817,82.271792,174.059012,5312.060902,107.964264,operation
1970-12-19 09:51:48,1598.494073,1481.059017,1684.271062,1601.378391,1635.591746,1674.8723,261.804351,4.064759,82.269064,174.153819,5312.405938,107.964259,operation


In [10]:
def common_prep(df):
    df["datetime"] = df.index.values
    df.index = np.arange(len(df))
    # We drop rows with missing. No use in trying to impute. Creates more uncertainty.
    df = df.dropna()
    return df

In [11]:
df = common_prep(df)

Want to make a holdout dataset for validation. This is held separate from crossvalidation.

The holdout-set is chosen as 20% of the data. (Not chunked in time.)

Argue that we do not get data leakage from the future, but get better validation that we are able to model dependencies across the whole dataset. 

In [12]:
def split_trn_val(df, frac=0.2):
    df = df.copy()
    val_inds = np.random.choice(df.index.values, size=int(len(df)*frac), replace=False)
    val_df = df.iloc[df.index.isin(val_inds)]
    trn_df = df.iloc[~df.index.isin(val_inds)]
    return trn_df, val_df

In [13]:
trn_df, val_df = split_trn_val(df)

In [14]:
len(trn_df), len(val_df)

(1400000, 350000)

In [15]:
val_df_dt = val_df["datetime"]

In [16]:
#trn_df.to_pickle("../container-data/trn_df.pkl")

In [17]:
#val_df.to_pickle("../container-data/val_df.pkl")

In [18]:
y_cols = ["Bolt_"+str(count)+"_Tensile" for count in range(1,7)]
y_cols

['Bolt_1_Tensile',
 'Bolt_2_Tensile',
 'Bolt_3_Tensile',
 'Bolt_4_Tensile',
 'Bolt_5_Tensile',
 'Bolt_6_Tensile']

In [19]:
def add_diff_cols(df):
    return df

In [20]:
def diff_bolt_pretension(df):
    pret = pd.read_csv("../container-data/bolt_pretension.csv", header=None)
    pret_dict = pd.Series(pret.iloc[:,1].values, index=pret.iloc[:,0]).to_dict()
    for y in y_cols:
        df[y] = df[y]-pret_dict[y.strip("_Tensile").replace("_", " ")]
    return df

def binarize_mode(df):
    df["mode"] = df["mode"].apply(lambda x: 1 if x=="start" else 0)
    return df

def categorize_mode(df):
    df["mode"] = df["mode"].apply(lambda x: "start" if x==1 else "operation")
    return df

def add_mode_switch(df):
    df["mode_switch"] = df["mode"]-(df["mode"].shift(fill_value=False))
    return df

def add_sec_since_switch(df):
    df["sec_since_switch"] = (df.groupby('mode')['datetime'].transform(lambda x: (x-x.min()).dt.seconds))
    return df

def add_days_since_start(df):
    df["days_since_start"] = (df["datetime"]-start_time).dt.days
    return df

def add_rolling_sum_speed(df, periods=[60,60*5, 60*60]):
    for period in periods:
        df["Turbine_Rotational Speed_sum"+str(period)] = df["Turbine_Rotational Speed"].rolling(period, min_periods=1).sum()
    return df

def remove_cols(df):
    remove_cols = ["datetime"]
    df = df.drop(remove_cols, axis=1)
    return df

def preprocessing(df):
    # Those that are commented out was left out as no improvement in cv score was shown
    #df = diff_bolt_pretension(df)
    df = binarize_mode(df)
    df = add_mode_switch(df)
    df = add_sec_since_switch(df)
    df = add_days_since_start(df)
    #df = add_rolling_sum_speed(df)
    df = remove_cols(df)
    df = categorize_mode(df)
    return df

def split_x_y(df):
    X = df.drop(y_cols, axis=1)
    y = df.loc[:, y_cols]
    return X, y

In [21]:
trn_df = preprocessing(trn_df)
df_X_train, df_y_train = split_x_y(trn_df)

In [22]:
val_df = preprocessing(val_df)
df_X_val, df_y_val = split_x_y(val_df)

In [23]:
x_cols = set([x for x in df_X_train.columns])-set(y_cols)
x_cols

{'Turbine_Guide Vane Opening',
 'Turbine_Pressure Drafttube',
 'Turbine_Pressure Spiral Casing',
 'Turbine_Rotational Speed',
 'Unit_4_Power',
 'Unit_4_Reactive Power',
 'days_since_start',
 'mode',
 'mode_switch',
 'sec_since_switch'}

# Make column selectors

In [24]:
X = df_X_train
y = df_y_train

In [25]:
from sklearn.compose import make_column_selector

In [26]:
df.dtypes

Bolt_1_Tensile                           float64
Bolt_2_Tensile                           float64
Bolt_3_Tensile                           float64
Bolt_4_Tensile                           float64
Bolt_5_Tensile                           float64
Bolt_6_Tensile                           float64
Unit_4_Power                             float64
Unit_4_Reactive Power                    float64
Turbine_Guide Vane Opening               float64
Turbine_Pressure Drafttube               float64
Turbine_Pressure Spiral Casing           float64
Turbine_Rotational Speed                 float64
mode                                      object
datetime                          datetime64[ns]
dtype: object

In [27]:
cat_selector = make_column_selector(dtype_include=object)
num_selector = make_column_selector(dtype_include=np.number)
cat_selector(X), num_selector(X)

(['mode', 'mode_switch'],
 ['Unit_4_Power',
  'Unit_4_Reactive Power',
  'Turbine_Guide Vane Opening',
  'Turbine_Pressure Drafttube',
  'Turbine_Pressure Spiral Casing',
  'Turbine_Rotational Speed',
  'sec_since_switch',
  'days_since_start'])

# Make column transformers

## Tree preprocessor

In [28]:
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

cat_tree_processor = OrdinalEncoder(
    handle_unknown="use_encoded_value", unknown_value=-1
)
num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)

tree_preprocessor = make_column_transformer(
    (num_tree_processor, num_selector), (cat_tree_processor, cat_selector)
)
tree_preprocessor

ColumnTransformer(transformers=[('simpleimputer',
                                 SimpleImputer(add_indicator=True),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x0000023AACDDA250>),
                                ('ordinalencoder',
                                 OrdinalEncoder(handle_unknown='use_encoded_value',
                                                unknown_value=-1),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x0000023AACDDA730>)])

## Linear preprocessor

In [29]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
num_linear_processor = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)

linear_preprocessor = make_column_transformer(
    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector)
)
linear_preprocessor

ColumnTransformer(transformers=[('pipeline',
                                 Pipeline(steps=[('standardscaler',
                                                  StandardScaler()),
                                                 ('simpleimputer',
                                                  SimpleImputer(add_indicator=True))]),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x0000023AACDDA250>),
                                ('onehotencoder',
                                 OneHotEncoder(handle_unknown='ignore'),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x0000023AACDDA730>)])

# Make pipelines

## Lasso pipeline

In [35]:
from sklearn.linear_model import LassoCV, RidgeCV, MultiTaskLassoCV

lasso_pipeline = make_pipeline(linear_preprocessor, LassoCV(n_jobs=4, cv=3, verbose=True, selection="random", max_iter=1000))
lasso_pipeline


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler()),
                                                                  ('simpleimputer',
                                                                   SimpleImputer(add_indicator=True))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000023AACDDA250>),
                                                 ('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000023AACDDA730>)])),
                ('lassocv',
                 LassoCV(cv=3, n_jobs=4, selection='random', verbose=Tru

## Random forest pipeline

In [31]:
from sklearn.ensemble import RandomForestRegressor

rf_pipeline = make_pipeline(tree_preprocessor, RandomForestRegressor(n_estimators=3, n_jobs=6, random_state=42))
rf_pipeline

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('simpleimputer',
                                                  SimpleImputer(add_indicator=True),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000023AACDDA250>),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                 unknown_value=-1),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x0000023AACDDA730>)])),
                ('randomforestregressor',
                 RandomForestRegressor(n_estimators=3, n_jobs=6,
                                       random_state=42))])

# Stacking regressor  

In [37]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV

estimators = [
    ("Lasso", lasso_pipeline),
    ("Random Forest", rf_pipeline),
]

stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV(cv=3))
stacking_regressor

StackingRegressor(estimators=[('Lasso',
                               Pipeline(steps=[('columntransformer',
                                                ColumnTransformer(transformers=[('pipeline',
                                                                                 Pipeline(steps=[('standardscaler',
                                                                                                  StandardScaler()),
                                                                                                 ('simpleimputer',
                                                                                                  SimpleImputer(add_indicator=True))]),
                                                                                 <sklearn.compose._column_transformer.make_column_selector object at 0x0000023AACDDA250>),
                                                                                ('onehotencoder',
                                             

# Cross-validation

In [None]:
kf = KFold(n_splits=3, random_state=42, shuffle=True)
fold = 1
scores = {}
preds = {}
residuals = {}
trained_models = {}
for train_index, test_index in kf.split(df_X_train):
    scores[fold] = {}
    preds[fold] = {}
    residuals[fold] = {}
    trained_models[fold] = {}
    print(fold)
    X_train, X_test = df_X_train.iloc[train_index], df_X_train.iloc[test_index]
    for y in y_cols:
        print(y)
        scores[fold][y] = {}
        preds[fold][y] = {}
        residuals[fold][y] = {}
        trained_models[fold][y] = {}
        drop_cols = [x for x in y_cols if x!=y]
        print(drop_cols)
        y_train, y_test = df_y_train.iloc[train_index].drop(drop_cols, axis=1).values.flatten(), df_y_train.iloc[test_index].drop(drop_cols, axis=1)
        for (name, model) in estimators + [("Stacking Regressor", stacking_regressor)]:
            print(name)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            score = mean_absolute_percentage_error(y_test, y_pred, multioutput="raw_values")
            print(score)
            scores[fold][y][name] = score
            preds[fold][y][name] = y_pred
            residuals[fold][y][name] = y_pred-y_test.values.flatten()
            trained_models[fold][y][name] = model
            break
    fold = fold+1

1
Bolt_1_Tensile
['Bolt_2_Tensile', 'Bolt_3_Tensile', 'Bolt_4_Tensile', 'Bolt_5_Tensile', 'Bolt_6_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
............................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    6.5s finished


[0.0004125]
Bolt_2_Tensile
['Bolt_1_Tensile', 'Bolt_3_Tensile', 'Bolt_4_Tensile', 'Bolt_5_Tensile', 'Bolt_6_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
............................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    6.8s finished


[0.00047392]
Bolt_3_Tensile
['Bolt_1_Tensile', 'Bolt_2_Tensile', 'Bolt_4_Tensile', 'Bolt_5_Tensile', 'Bolt_6_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
............................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    6.9s finished


[0.00035049]
Bolt_4_Tensile
['Bolt_1_Tensile', 'Bolt_2_Tensile', 'Bolt_3_Tensile', 'Bolt_5_Tensile', 'Bolt_6_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
......................................................................................................................[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    6.9s finished


[0.0004908]
Bolt_5_Tensile
['Bolt_1_Tensile', 'Bolt_2_Tensile', 'Bolt_3_Tensile', 'Bolt_4_Tensile', 'Bolt_6_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model 

.[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.1s finished


[0.00049964]
Bolt_6_Tensile
['Bolt_1_Tensile', 'Bolt_2_Tensile', 'Bolt_3_Tensile', 'Bolt_4_Tensile', 'Bolt_5_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
............................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.3s finished


[0.00042319]
2
Bolt_1_Tensile
['Bolt_2_Tensile', 'Bolt_3_Tensile', 'Bolt_4_Tensile', 'Bolt_5_Tensile', 'Bolt_6_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
............................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.2s finished


[0.00041253]
Bolt_2_Tensile
['Bolt_1_Tensile', 'Bolt_3_Tensile', 'Bolt_4_Tensile', 'Bolt_5_Tensile', 'Bolt_6_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
............................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.6s finished


[0.00047372]
Bolt_3_Tensile
['Bolt_1_Tensile', 'Bolt_2_Tensile', 'Bolt_4_Tensile', 'Bolt_5_Tensile', 'Bolt_6_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
............................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.2s finished


[0.0003485]
Bolt_4_Tensile
['Bolt_1_Tensile', 'Bolt_2_Tensile', 'Bolt_3_Tensile', 'Bolt_5_Tensile', 'Bolt_6_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
.....................................................................................................[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.3s finished


[0.00048929]
Bolt_5_Tensile
['Bolt_1_Tensile', 'Bolt_2_Tensile', 'Bolt_3_Tensile', 'Bolt_4_Tensile', 'Bolt_6_Tensile']
Lasso


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model 

.[Parallel(n_jobs=4)]: Done   3 out of   3 | elapsed:    7.1s finished


# Make predictions on validation set

In [None]:
scores = {}
preds = {}
residuals = {}
models = {}
for y in y_cols:
    print(y)
    drop_cols = [x for x in y_cols if x!=y]
    y_train = df_y_train.drop(drop_cols, axis=1).values.flatten()
    y_val = df_y_val.drop(drop_cols, axis=1).values.flatten()
    model = stackingregressor
    model.fit(df_X_train, y_train)
    y_pred = model.predict(df_X_val)
    score = mean_absolute_percentage_error(y_val, y_pred)
    print(score)
    scores[y] = score
    preds[y] = y_pred
    residuals[y] = y_pred-y_val
    models[y] = model

In [None]:
len(df_y_val), len(df_y_train), len(df)

# Investigate results

In [None]:
scores

In [None]:
score_df = pd.DataFrame.from_dict(scores, orient="index")
score_df.plot(kind="bar", figsize=(24,18))

In [None]:
score_df.mean()

# Plot feature importances

In [None]:
models

In [None]:
nrow= 3
ncol=2

# make a list of all dataframes 
fig, axes = plt.subplots(nrow, ncol, figsize=(30,24))

# plot counter
count=1
for r in range(nrow):
    for c in range(ncol):
        name = "Bolt_"+str(count)+"_Tensile"
        model = models[name]
        imps = pd.Series(model.feature_importances_, index=df_X_val.columns)
        imps.plot(kind="bar", ax=axes[r,c], title=name)
        count+=1

It is interesting to see that days_since_start matters more for the bolts that are more difficult to predict.

# Plot residuals

Where do we miss?

In [None]:
res_df = pd.DataFrame(data=np.transpose([residuals[y] for y in y_cols]), index=val_df_dt, columns=y_cols)

In [None]:
nrow= 3
ncol=2

# make a list of all dataframes 
fig, axes = plt.subplots(nrow, ncol, figsize=(30,24))

# plot counter
count=1
for r in range(nrow):
    for c in range(ncol):
        name = "Bolt_"+str(count)+"_Tensile"
        res_df[name].plot(style=".", ax=axes[r,c], title=name)
        count+=1

# Train final model with whole dataset

In [None]:
df.columns

In [None]:
final_df = preprocessing(df)

In [None]:
df_X, df_y = split_x_y(final_df) 

In [None]:
df_X.columns, df_y.columns

In [None]:
model = RandomForestRegressor(n_estimators=10, n_jobs=4)
model.fit(df_X, df_y)

In [None]:
pred_df = common_prep(pred_df)
pred_df_prep = preprocessing(pred_df)

## Make predictions with final model for submission

In [None]:
preds = model.predict(pred_df.loc[:,df_X.columns])

In [None]:
df_preds = pd.DataFrame(preds, columns=df_y.columns)
df_preds

In [None]:
out_df = pd.concat([df_preds, pred_df["datetime"]], axis=1).set_index("datetime")
out_df

In [None]:
orig = pd.read_parquet("../container-data/prediction_input.parquet")

In [None]:
assert (orig.index == out_df.index).all()

In [None]:
pred_df.to_csv("../container-data/prediction_output.csv")