In [11]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns

## Assumptions
* `no_of_days` is the number of days the unit has been up (online) so far, not the number of days remaining until an observed failure.

## First we will do some feature engineering. We are given `no_of_days`, which means uptime. We are trying to predict days until failure, which for each run is `max(no_of_days) - no_of_days + 1` (the `+ 1` keeps the target from reaching 0), so we will add that column to our datasets

In [18]:
# Load both datasets; every row is one day of one run (identified by "hfm_runs").
dataset_10col = pd.read_csv("./hfm_10cols.csv")
dataset_14col = pd.read_csv("./hfm_14cols.csv")

dataset_10col_groups = dataset_10col.groupby("hfm_runs")
dataset_14col_groups = dataset_14col.groupby("hfm_runs")

# Largest no_of_days seen in each run, indexed by hfm_runs. Under the notebook's
# assumption that no_of_days is uptime, this is the day the run ended.
max_days_of_groups_10col = dataset_10col_groups["no_of_days"].max()
max_days_of_groups_14col = dataset_14col_groups["no_of_days"].max()

# days_until_failure = (last day of the run) - (current day) + 1.
# The "+ 1" keeps the target strictly positive, so the final row of a run is 1, not 0.
# transform("max") broadcasts each run's maximum back onto its rows, which replaces
# the row-by-row iterrows() lookup and the NaN placeholder column (np.NaN no longer
# exists in NumPy 2.0). The float cast keeps the dtype the placeholder column produced.
dataset_10col["days_until_failure"] = (
    dataset_10col_groups["no_of_days"].transform("max") - dataset_10col["no_of_days"] + 1
).astype(float)

dataset_14col["days_until_failure"] = (
    dataset_14col_groups["no_of_days"].transform("max") - dataset_14col["no_of_days"] + 1
).astype(float)


## Next we will look at graphs of the sensors over all the runs. I have always found it very useful to start with some data viz, especially with sensor data

### Data viz for the 10 column dataset

In [None]:
# Sensors plotted for the 10-column dataset: one subplot row per sensor.
# A single list drives both the subplot titles and the loop so they cannot drift apart.
sensors_10col = ["sensor_2", "sensor_3", "sensor_10", "sensor_12", "sensor_13", "sensor_14", "sensor_16", "sensor_17"]

fig = make_subplots(rows=len(sensors_10col), cols=1, subplot_titles=sensors_10col)
sensor_plots = []
for row, sensor in enumerate(sensors_10col, start=1):
    # plotly express splits the frame into one line trace per run.
    sensor_plot = px.line(dataset_10col, x="no_of_days", y=sensor, line_group="hfm_runs", color="hfm_runs")
    sensor_plots.append(sensor_plot)
    for trace in sensor_plot.data:
        fig.add_trace(
            go.Scatter(
                x=trace.x,
                y=trace.y,
                name=trace.name,
                # Group each run's traces across subplots: one legend entry per run
                # (taken from the first row) toggles that run in every subplot.
                legendgroup=trace.name,
                showlegend=(row == 1),
                # Reuse the colour plotly express picked so a run keeps the same
                # colour in every subplot.
                line=dict(color=trace.line.color),
            ),
            row=row,
            col=1,
        )

fig.update_layout(height=4000)
fig.show()

## We see here that sensor 17 does not provide any information (it is constant at 2388, as the check below shows), so we will drop it. It could be a setpoint, which would explain why it doesn't change

In [28]:
# Print the observed range first so the reason for dropping the column is visible in the output.
sensor_17_values = dataset_10col["sensor_17"]
print(f'The min and max for sensor 17 respectively are: {sensor_17_values.min()} {sensor_17_values.max()}')

# Rebind the name to a copy of the frame without the sensor_17 column.
dataset_10col = dataset_10col.drop(columns=["sensor_17"])

The min and max for sensor 17 respectively are: 2388 2388


### Data viz for the 14 column dataset

In [None]:
# First half of the 14-column dataset's sensors: one subplot row per sensor.
# A single list drives both the subplot titles and the loop so they cannot drift apart.
sensors_14col_first_half = ["sensor_1", "sensor_4", "sensor_5", "sensor_6", "sensor_7", "sensor_8"]

fig = make_subplots(rows=len(sensors_14col_first_half), cols=1, subplot_titles=sensors_14col_first_half)
sensor_plots = []
for row, sensor in enumerate(sensors_14col_first_half, start=1):
    # plotly express splits the frame into one line trace per run.
    sensor_plot = px.line(dataset_14col, x="no_of_days", y=sensor, line_group="hfm_runs", color="hfm_runs")
    sensor_plots.append(sensor_plot)
    for trace in sensor_plot.data:
        fig.add_trace(
            go.Scatter(
                x=trace.x,
                y=trace.y,
                name=trace.name,
                # Group each run's traces across subplots: one legend entry per run
                # (taken from the first row) toggles that run in every subplot.
                legendgroup=trace.name,
                showlegend=(row == 1),
                # Reuse the colour plotly express picked so a run keeps the same
                # colour in every subplot.
                line=dict(color=trace.line.color),
            ),
            row=row,
            col=1,
        )

fig.update_layout(height=4000)
fig.show()


In [17]:
# Second half of the 14-column dataset's sensors: one subplot row per sensor.
# A single list drives both the subplot titles and the loop so they cannot drift apart.
sensors_14col_second_half = ["sensor_9", "sensor_11", "sensor_15", "sensor_18", "sensor_19", "sensor_20"]

fig = make_subplots(rows=len(sensors_14col_second_half), cols=1, subplot_titles=sensors_14col_second_half)
sensor_plots = []
for row, sensor in enumerate(sensors_14col_second_half, start=1):
    # plotly express splits the frame into one line trace per run.
    sensor_plot = px.line(dataset_14col, x="no_of_days", y=sensor, line_group="hfm_runs", color="hfm_runs")
    sensor_plots.append(sensor_plot)
    for trace in sensor_plot.data:
        fig.add_trace(
            go.Scatter(
                x=trace.x,
                y=trace.y,
                name=trace.name,
                # Group each run's traces across subplots: one legend entry per run
                # (taken from the first row) toggles that run in every subplot.
                legendgroup=trace.name,
                showlegend=(row == 1),
                # Reuse the colour plotly express picked so a run keeps the same
                # colour in every subplot.
                line=dict(color=trace.line.color),
            ),
            row=row,
            col=1,
        )

fig.update_layout(height=4000)
fig.show()


## In the 14 column dataset, sensors 4, 5, 9, 15, and 18 also appear to be nonfunctional or setpoints: they are constant (sensor 5 only moves between 21.6 and 21.61) and provide little or no information, so we will drop these

In [29]:
# Sensors of the 14-column dataset that looked flat in the plots.
flat_sensor_numbers_14col = (4, 5, 9, 15, 18)
flat_sensor_columns_14col = [f"sensor_{number}" for number in flat_sensor_numbers_14col]

# Print each sensor's observed range so the reason for dropping it is visible in the output.
for sensor_number, sensor_column in zip(flat_sensor_numbers_14col, flat_sensor_columns_14col):
    sensor_values = dataset_14col[sensor_column]
    print(f'The min and max for sensor {sensor_number} respectively are: {sensor_values.min()} {sensor_values.max()}')

# Drop all of them in one call rather than copying the frame once per column.
dataset_14col = dataset_14col.drop(columns=flat_sensor_columns_14col)



The min and max for sensor 4 respectively are: 14.62 14.62
The min and max for sensor 5 respectively are: 21.6 21.61
The min and max for sensor 9 respectively are: 1.3 1.3
The min and max for sensor 15 respectively are: 0.03 0.03
The min and max for sensor 18 respectively are: 100.0 100.0
