In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the raw predictive-maintenance CSV into a DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../../Datasets/Machine Predictive Maintenance Classification/predictive_maintenance.csv",
)

### Feature Columns

- UDI: unique identifier ranging from 1 to 10000
- Product ID: consists of a letter L, M, or H, denoting the low (50% of all products), medium (30%), and high (20%) product quality variants, followed by a variant-specific serial number
- air temperature [K]: generated using a random walk process later normalized to a standard deviation of 2 K around 300 K
- process temperature [K]: generated using a random walk process normalized to a standard deviation of 1 K, added to the air temperature plus 10 K.
- rotational speed [rpm]: calculated from a power of 2860 W, overlaid with normally distributed noise
- torque [Nm]: torque values are normally distributed around 40 Nm with σ = 10 Nm and no negative values.
- tool wear [min]: the quality variants H/M/L add 5/3/2 minutes of tool wear to the used tool in the process.
- 'machine failure' label: indicates whether the machine has failed at this particular data point because any of the failure modes occurred.

### Target Columns

There are two targets. Do not use either of them as a feature when predicting the other, as that would lead to leakage.
- Target : Failure or Not
- Failure Type : Type of Failure

In [24]:
# Show the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Target                   10000 non-null  int64  
 9   Failure Type             10000 non-null  object 
dtypes: float64(3), int64(4), object(3)
memory usage: 781.4+ KB


Type
H    1003
L    6000
M    2997
dtype: int64

In [36]:
# Build a class-balanced subset: draw 1003 rows (the size of the smallest
# Type group, H) from each product Type with the same fixed seed, then stack
# the three draws in H, L, M order.
df_sampled = pd.concat([
    df[df['Type'] == product_type].sample(n=1003, random_state=42)
    for product_type in ('H', 'L', 'M')
])

In [37]:
# Count rows per product Type in the sampled frame to check the balance.
df_sampled.groupby('Type').size()

Type
H    1003
L    1003
M    1003
dtype: int64

In [3]:
# Max-scale the numeric features of the full frame: each column is divided by
# its own maximum, so the largest value in every scaled column becomes 1.0.
# StandardScaler is not applied here, and `df` (not `df_sampled`) is modified.
scaling_features = ["Air temperature [K]", "Process temperature [K]", "Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]"]
column_maxima = df[scaling_features].max()
df[scaling_features] = df[scaling_features] / column_maxima


In [4]:
# Summary statistics of the full frame (not df_sampled); the max-scaled
# feature columns are expected to show a maximum of 1.0.
df.describe()

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,0.985238,0.987908,0.533186,0.522022,0.426684,0.0339
std,2886.89568,0.006569,0.004728,0.062122,0.130143,0.251597,0.180981
min,1.0,0.969787,0.974187,0.404712,0.049608,0.0,0.0
25%,2500.75,0.979639,0.984066,0.49307,0.43342,0.209486,0.0
50%,5000.5,0.98555,0.988209,0.52079,0.523499,0.426877,0.0
75%,7500.25,0.990148,0.991396,0.558559,0.610966,0.640316,0.0
max,10000.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
# Remove the UDI, Product ID and Type columns so that only the numeric
# features and the two label columns remain.
df = df.drop(columns=['UDI', 'Product ID', 'Type'])

In [15]:
# Preview the first rows after UDI, Product ID and Type were dropped.
df.head()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,0.978982,0.983429,0.537422,0.558747,0.0,0,No Failure
1,0.97931,0.983748,0.487872,0.604439,0.011858,0,No Failure
2,0.978982,0.98311,0.519058,0.644909,0.019763,0,No Failure
3,0.97931,0.983429,0.496535,0.515666,0.027668,0,No Failure
4,0.97931,0.983748,0.487872,0.522193,0.035573,0,No Failure


In [17]:
# Derive one dataset per task from the prepared frame. Each drops the other
# task's label column so that it is not exported alongside the features.
binary_df = df.drop(columns=["Failure Type"])
multi_class_df = df.drop(columns=["Target"])

# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed leading column in both files. Confirm that
# whatever reads these files expects that column.
binary_df.to_csv(
    path_or_buf="../../Datasets/Machine Predictive Maintenance Classification/binary_classification.csv",
)
multi_class_df.to_csv(
    path_or_buf="../../Datasets/Machine Predictive Maintenance Classification/multi_class_classification.csv",
)