## 1. Load Required Libraries

In [1]:
import os

# Project root used as the working directory for the rest of the notebook.
# A raw string keeps every backslash literal, so sequences such as "\P" or "\T"
# are not parsed as (invalid) escape sequences by the Python lexer.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
file_location = r'D:\Project\Course\Pacmann\Tugas\Machine Learning Process\Predictive Maintenance\Coding'

# Switch to the project root; the later `src.util` import and the relative
# paths read from the config presumably depend on this -- confirm.
os.chdir(os.path.expanduser(file_location))

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import src.util as utils
import joblib

## 2. Load Configuration File

In [3]:
# Load the project configuration through the project's util helper. The result
# is indexed like a nested mapping (config['dataset'][...], config['train_test'][...])
# throughout the rest of the notebook.
config = utils.load_config()

## 3. Load Dataset

In [4]:
# Read the raw CSV. The path is the plain string concatenation of the configured
# directory and file name, so `data_directory` presumably has to end with a path
# separator -- verify against the config file.
dataset = pd.read_csv(config['dataset']['data_directory']+ config['dataset']['file_name'])

In [5]:
dataset

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure
...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,No Failure
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,No Failure
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,No Failure
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,No Failure


In [6]:
# Remove the columns listed under config['dataset']['drop_columns'] (axis=1 selects columns).
# NOTE(review): this rebinds `dataset`, so re-running the cell without reloading
# the CSV will presumably raise KeyError because the columns are already gone -- confirm.
dataset = dataset.drop(config['dataset']['drop_columns'], axis=1)

In [7]:
dataset.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,M,298.1,308.6,1551,42.8,0,0
1,L,298.2,308.7,1408,46.3,3,0
2,L,298.1,308.5,1498,49.4,5,0
3,L,298.2,308.6,1433,39.5,7,0
4,L,298.2,308.7,1408,40.0,9,0


## 4. Data Validation

In [8]:
dataset.isnull().sum()

Type                       0
Air temperature [K]        0
Process temperature [K]    0
Rotational speed [rpm]     0
Torque [Nm]                0
Tool wear [min]            0
Target                     0
dtype: int64

In [9]:
# Row-wise boolean mask: True for every row that has at least one missing value.
missing_data = dataset.isnull().any(axis=1)
missing_data

0       False
1       False
2       False
3       False
4       False
        ...  
9995    False
9996    False
9997    False
9998    False
9999    False
Length: 10000, dtype: bool

In [10]:
def drop_rows_with_missing_data(dataframe):
    """Return a copy of ``dataframe`` without the rows that contain missing values.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the rows with no missing value in any column;
        the surviving rows keep their original index labels.
    """
    # dropna() is not in-place: it returns a new frame, so that result is what
    # has to be handed back to the caller.
    return dataframe.dropna()

In [11]:
# Bind the helper's return value back to `dataset` so the cleaning result is
# what the following cells work with, then display it.
dataset = drop_rows_with_missing_data(dataset)
dataset

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,M,298.1,308.6,1551,42.8,0,0
1,L,298.2,308.7,1408,46.3,3,0
2,L,298.1,308.5,1498,49.4,5,0
3,L,298.2,308.6,1433,39.5,7,0
4,L,298.2,308.7,1408,40.0,9,0
...,...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14,0
9996,H,298.9,308.4,1632,31.8,17,0
9997,M,299.0,308.6,1645,33.4,22,0
9998,H,299.0,308.7,1408,48.5,25,0


In [12]:
dataset.dtypes

Type                        object
Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
Target                       int64
dtype: object

In [13]:
dataset.describe()

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,300.00493,310.00556,1538.7761,39.98691,107.951,0.0339
std,2.000259,1.483734,179.284096,9.968934,63.654147,0.180981
min,295.3,305.7,1168.0,3.8,0.0,0.0
25%,298.3,308.8,1423.0,33.2,53.0,0.0
50%,300.1,310.1,1503.0,40.1,108.0,0.0
75%,301.5,311.1,1612.0,46.8,162.0,0.0
max,304.5,313.8,2886.0,76.6,253.0,1.0


In [14]:
dataset['Target'].nunique()

2

## 5. Data Defense

In [15]:
def check_data(input_data, config):
    """Validate ``input_data`` against the schema and value limits in ``config``.

    The checks run in this order and stop at the first failure:

    1. The int, float and object columns of ``input_data`` (in frame order)
       equal ``config['dataset']`` ``int_columns`` / ``float_columns`` /
       ``category_columns``.
    2. Every value of the three float columns and of the first two int columns
       lies inside the inclusive ``min_value``..``max_value`` range configured
       under ``config['data_defense']``.
    3. Every value of the third int column is listed in
       ``config['data_defense']['target']['value']`` and every value of the
       first category column is listed in
       ``config['data_defense']['type']['value']``.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Data to validate. Any index is accepted; no row label is assumed.
    config : dict
        Nested mapping with the ``dataset`` and ``data_defense`` sections
        described above.

    Returns
    -------
    None

    Raises
    ------
    AssertionError
        When a check fails; the message names the failing check.
    """
    dataset_cfg = config['dataset']
    defense_cfg = config['data_defense']

    # Check data types
    assert input_data.select_dtypes("int").columns.to_list() == dataset_cfg["int_columns"], "an error occurs in int column(s)."
    assert input_data.select_dtypes("float").columns.to_list() == dataset_cfg["float_columns"], "an error occurs in float column(s)."
    assert input_data.select_dtypes("object").columns.to_list() == dataset_cfg["category_columns"], "an error occurs in category column(s)."

    # Check range of data.
    # Each entry: (column-list key, position in that list, range key, failure message).
    # Columns are resolved inside the loop so a malformed config only fails when
    # its check is actually reached.
    range_checks = (
        ("float_columns", 0, "range_air_temperature", "an error occurs in air temperature range."),
        ("float_columns", 1, "range_process_temperature", "an error occurs in process temperature range."),
        ("float_columns", 2, "range_torque", "an error occurs in torque range."),
        ("int_columns", 0, "range_rotational_speed", "an error occurs in rotational range."),
        ("int_columns", 1, "range_tool_wear", "an error occurs in tool wear range."),
    )
    for columns_key, position, range_key, message in range_checks:
        column = dataset_cfg[columns_key][position]
        bounds = defense_cfg[range_key]
        # between() is inclusive on both ends; a missing value counts as out of range.
        assert input_data[column].between(bounds['min_value'], bounds['max_value']).all(), message

    # Check allowed values on EVERY row. Looking only at the row labelled 0 would
    # let invalid values in other rows through and would raise KeyError on a frame
    # whose index has no label 0 (for example a train/test subset).
    target_column = dataset_cfg["int_columns"][2]
    assert input_data[target_column].isin(list(defense_cfg['target']['value'])).all(), "an error occurs in target range."
    type_column = dataset_cfg['category_columns'][0]
    assert input_data[type_column].isin(list(defense_cfg['type']['value'])).all(), "an error occurs in type."

In [16]:
# Run the defensive checks on the dataset; an AssertionError here means the data
# does not match the schema or the value limits defined in the config.
check_data(dataset, config)

## 6. Data Splitting

In [17]:
# Separate predictors and label using the column names from the config.
# .copy() gives X and y their own data, independent of `dataset`.
X = dataset[config['dataset']["predictors"]].copy()
y = dataset[config['dataset']["label"]].copy()

In [18]:
X

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,M,298.1,308.6,1551,42.8,0
1,L,298.2,308.7,1408,46.3,3
2,L,298.1,308.5,1498,49.4,5
3,L,298.2,308.6,1433,39.5,7
4,L,298.2,308.7,1408,40.0,9
...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14
9996,H,298.9,308.4,1632,31.8,17
9997,M,299.0,308.6,1645,33.4,22
9998,H,299.0,308.7,1408,48.5,25


In [19]:
y

0       0
1       0
2       0
3       0
4       0
       ..
9995    0
9996    0
9997    0
9998    0
9999    0
Name: Target, Length: 10000, dtype: int64

In [20]:
# Stratified hold-out split. stratify=y keeps the label's class proportions the
# same in both partitions, which matters because failures are rare here
# (the mean of Target shown by describe() is about 0.034).
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = config['dataset']["test_size"], random_state = 42, stratify = y)

In [21]:
# Hand the cleaned dataset and the four split parts to the project's pickle helper
# (presumably it serializes each object to the given path -- see src/util).
# The target directory and every file name come from config['train_test'].
train_test_cfg = config['train_test']
artifacts = (
    (dataset, 'clean_data'),
    (X_train, 'X_train'),
    (y_train, 'y_train'),
    (X_test, 'X_test'),
    (y_test, 'y_test'),
)
for artifact, file_key in artifacts:
    utils.pickle_dump(artifact, train_test_cfg['directory'] + train_test_cfg[file_key])