In [1]:
import pandas as pd
import plotly.express as px
from matplotlib import pyplot as plt
import os
import numpy as np
from data_helpers import (
    get_running_state, 
    load_training_data, 
    get_tensile_sensor_names, 
    get_torsion_sensor_names, 
    get_transient_state
)

In [2]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Krafthack 2022 Data Prep
## Dividing the data into different modes
After exploring the raw data, we decided to split the data into three categories, with the intention of first identifying which operating state the machine is in and then using a separate model for each mode. The modes are:
1. Starting mode
2. Transient mode (from one load to another)
3. Steady state (running on a stable load)

In [3]:
# Masking window forwarded to the loader; the same value is reused for the
# unsteady load further down.
# NOTE(review): exact masking semantics live in data_helpers — confirm there.
minutes_of_masking = 60

# Training data requested with steady=True (presumably the steady-state rows).
full_steady_data = load_training_data(
    full=True,
    steady=True,
    minutes_of_masking=minutes_of_masking,
)

In [5]:
# Same loader call as for the steady frame, but with steady=False
# (presumably the complement: rows not in steady state).
full_unsteady_data = load_training_data(
    full=True,
    steady=False,
    minutes_of_masking=minutes_of_masking,
)

In [6]:
# Plot a random subsample of the unsteady power signal in index order.
# A fixed random_state makes the figure identical on every re-run, and the
# sample size is capped at the frame length so a short frame does not raise.
power_sample = (
    full_unsteady_data.loc[:, 'Unit_4_Power']
    .sample(n=min(10000, len(full_unsteady_data)), random_state=0)
    .sort_index()
)
px.line(power_sample)

In [7]:
# Select the tensile sensor columns from each frame and tag every row with the
# frame it came from, then stack both into one long frame.
tensile_columns = get_tensile_sensor_names()

steady_tensile_df = (
    full_steady_data
    .loc[:, tensile_columns]
    .assign(operation_mode='steady')
)
unsteady_tensile_df = (
    full_unsteady_data
    .loc[:, tensile_columns]
    .assign(operation_mode='unsteady')
)

# Unsteady rows first, steady rows second (same order as before).
tensile_df = pd.concat([unsteady_tensile_df, steady_tensile_df])


In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh-kernel run shows all dependencies up front.
from plot_helpers import (
    plot_tensile_values,
    plot_torsion_values,
)

# n_samples presumably limits how many rows are drawn — confirm in plot_helpers.
plot_tensile_values(tensile_df, n_samples=10000)

In [89]:
# Read the prediction input and rename its 'mode' column to 'operation_mode',
# the same label column name used for the tensile frames above.
prediction_data = (
    pd.read_parquet('../data/prediction_input.parquet')
    .rename(columns={'mode': 'operation_mode'})
)

# Show the resulting column index, then preview the first rows.
print(prediction_data.columns)
prediction_data.head()

Index(['Unit_4_Power', 'Unit_4_Reactive Power', 'Turbine_Guide Vane Opening',
       'Turbine_Pressure Drafttube', 'Turbine_Pressure Spiral Casing',
       'Turbine_Rotational Speed', 'operation_mode'],
      dtype='object')


Unnamed: 0_level_0,Unit_4_Power,Unit_4_Reactive Power,Turbine_Guide Vane Opening,Turbine_Pressure Drafttube,Turbine_Pressure Spiral Casing,Turbine_Rotational Speed,operation_mode
timepoints,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1971-01-25 11:06:49,308.867868,5.592261,94.442351,158.159044,5279.876581,108.057467,operation
1971-01-25 11:06:50,308.898237,6.251969,94.445687,158.202829,5279.930843,108.05746,operation
1971-01-25 11:06:51,308.928605,7.037091,94.449024,158.246614,5279.985105,108.057454,operation
1971-01-25 11:06:52,308.958974,7.822213,94.452361,158.290399,5280.039368,108.057448,operation
1971-01-25 11:06:53,308.989343,8.607335,94.455698,158.302931,5280.058748,108.057442,operation


In [9]:
# Draw one histogram per column of the steady data.
# The rows are sampled once, with a fixed seed, so every column is plotted from
# the same subset and the figures are reproducible across re-runs. The sample
# size is capped at the frame length so a short frame does not raise.
histogram_sample = full_steady_data.sample(
    n=min(1000, len(full_steady_data)),
    random_state=0,
)
for col in histogram_sample:
    fig = px.histogram(histogram_sample[col])
    fig.show()

In [19]:
def clean_outliers(
    df,
    *,
    min_spiral_casing_pressure=4850,
    min_power=160,
    min_rotational_speed=107.5,
):
    """Blank out rows whose operating signals fall below the given lower bounds.

    A row is replaced entirely by NaN (every column, not just the offending
    one) when any of the following holds:

    * ``'Turbine_Pressure Spiral Casing'`` < ``min_spiral_casing_pressure``
    * ``'Unit_4_Power'`` < ``min_power``
    * ``'Turbine_Rotational Speed'`` < ``min_rotational_speed``

    Rows are kept in place rather than dropped, so the index of the result is
    identical to the index of the input.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the three columns listed above.
    min_spiral_casing_pressure, min_power, min_rotational_speed : float, optional
        Lower bounds for the respective columns. The defaults reproduce the
        thresholds originally hard-coded in this notebook.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``; the input frame is not modified.

    Raises
    ------
    KeyError
        If one of the three required columns is missing from ``df``.
    """
    lower_bounds = {
        'Turbine_Pressure Spiral Casing': min_spiral_casing_pressure,
        'Unit_4_Power': min_power,
        'Turbine_Rotational Speed': min_rotational_speed,
    }

    cleaned = df.copy()
    for column, lower_bound in lower_bounds.items():
        # Rows already blanked by an earlier bound hold NaN here; NaN < x is
        # False, so they are simply skipped and the result is the union of
        # all three conditions.
        cleaned.loc[cleaned[column] < lower_bound, :] = np.nan
    return cleaned

    


In [20]:
# Replace both frames with their outlier-cleaned versions (steady first, then
# unsteady); clean_outliers returns a new frame for each input.
full_steady_data, full_unsteady_data = (
    clean_outliers(frame) for frame in (full_steady_data, full_unsteady_data)
)

In [14]:
# Persist both frames as parquet files, steady first and unsteady second.
for frame, path in (
    (full_steady_data, '../data/steady_data.parquet'),
    (full_unsteady_data, '../data/unsteady_data.parquet'),
):
    frame.to_parquet(path)