## 1. Load Required Libraries

In [1]:
import os

# Project root directory. The working directory is switched to it before the
# project-local `src` package is imported (presumably so that `src` and any
# relative config/data paths resolve -- confirm).
# A raw string is used because the Windows path contains backslashes; in a
# normal string literal sequences such as `\P` or `\T` are invalid escapes and
# trigger a SyntaxWarning/DeprecationWarning on recent Python versions.
# NOTE(review): this is a machine-specific absolute path; consider reading it
# from an environment variable or the config file.
file_location = r'D:\Project\Course\Pacmann\Tugas\Machine Learning Process\Predictive Maintenance\Coding'

os.chdir(os.path.expanduser(file_location))

In [2]:
import src.util as utils
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
import joblib
import pickle5

## 2. Load Configuration File

In [3]:
config = utils.load_config()

## 3. Load Dataset

In [4]:
def load_dataset(config_data: dict):
    """Load the pickled train/test splits and join each X with its y.

    Parameters
    ----------
    config_data : dict
        Configuration mapping. The function reads
        ``config_data['train_test']`` and expects the keys ``directory``,
        ``X_train``, ``y_train``, ``X_test`` and ``y_test``; each file path is
        built as ``directory + <file name>``.

    Returns
    -------
    tuple
        ``(train_set, test_set)``: the column-wise concatenation of the
        predictors and the label for the train and the test split.
    """
    # Use the configuration that was passed in rather than the notebook-level
    # global, so the function works with any configuration object.
    train_test_cfg = config_data['train_test']
    directory = train_test_cfg['directory']

    # Load every set of data
    X_train = utils.pickle_load(directory + train_test_cfg["X_train"])
    y_train = utils.pickle_load(directory + train_test_cfg["y_train"])
    X_test = utils.pickle_load(directory + train_test_cfg["X_test"])
    y_test = utils.pickle_load(directory + train_test_cfg["y_test"])

    # Concatenate x and y of each set
    train_set = pd.concat([X_train, y_train], axis = 1)
    test_set = pd.concat([X_test, y_test], axis = 1)

    # Return the 2 sets of data
    return train_set, test_set

In [5]:
train_set, test_set = load_dataset(config)

## 4. Balancing Train Label

In [6]:
# Undersample the training data with a fixed seed, then glue the resampled
# predictors and label back into a single frame.
label_name = config['dataset']["label"]
undersampler = RandomUnderSampler(random_state=42)

X_rus, y_rus = undersampler.fit_resample(
    train_set.drop(columns=label_name),
    train_set[label_name],
)

train_set_bal = pd.concat([X_rus, y_rus], axis=1)

In [7]:
train_set_bal.shape

(542, 7)

## 5. Split Data Type

In [8]:
def splittypedata(data, config_data=None):
    """Split a frame into its float, integer and categorical column groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in the configuration.
    config_data : dict, optional
        Configuration mapping; ``config_data['dataset']`` must provide the
        keys ``float_columns``, ``int_columns`` and ``category_columns``.
        When omitted, the notebook-level ``config`` is used.

    Returns
    -------
    tuple
        ``(data_float, data_int, data_category)`` -- three column subsets of
        ``data`` in that order.
    """
    # Fall back to the notebook-level configuration for existing callers.
    if config_data is None:
        config_data = config

    dataset_cfg = config_data['dataset']
    data_float = data[dataset_cfg["float_columns"]]
    data_int = data[dataset_cfg["int_columns"]]
    data_category = data[dataset_cfg["category_columns"]]

    return data_float, data_int, data_category

In [9]:
train_set_bal_float, train_set_bal_int, train_set_bal_category = splittypedata(train_set_bal)

In [10]:
train_set_bal_float.shape

(542, 3)

In [11]:
train_set_bal_int.shape

(542, 3)

In [12]:
train_set_bal_category.shape

(542, 1)

## 6. Removing Outliers

In [13]:
def remove_outliers(set_data, columns=None):
    """Drop rows holding an IQR outlier in any of the screened columns.

    A value is an outlier when it lies below ``q1 - 1.5 * iqr`` or above
    ``q3 + 1.5 * iqr`` of its own column. A row is kept only if none of the
    screened columns flags it. Missing values never compare as outliers, so
    rows containing NaN are kept.

    Unlike an implementation based on concatenating per-column results and
    de-duplicating, this version keeps the original row order, keeps distinct
    samples that happen to share identical values, tolerates repeated index
    labels and works on frames with no column to screen.

    Parameters
    ----------
    set_data : pandas.DataFrame
        Numeric frame to clean. It is not modified.
    columns : iterable of column labels, optional
        Columns to screen. Defaults to every column except the last one
        (presumably because the label is expected to be last -- callers whose
        last column is a feature should pass ``columns`` explicitly).

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns as ``set_data`` and only the rows
        that passed every column check.
    """
    set_data = set_data.copy()

    if columns is None:
        # Historical default: the last column is never screened.
        columns = set_data.columns[:-1]

    # Start by keeping everything, then knock out rows column by column.
    keep_mask = pd.Series(True, index=set_data.index)

    for col_name in columns:
        q1 = set_data[col_name].quantile(0.25)
        q3 = set_data[col_name].quantile(0.75)
        iqr = q3 - q1
        is_outlier = (set_data[col_name] < (q1 - 1.5 * iqr)) | (set_data[col_name] > (q3 + 1.5 * iqr))
        keep_mask &= ~is_outlier

    return set_data[keep_mask]

In [14]:
# NOTE(review): remove_outliers() skips the last column of the frame it is
# given. The float frame carries no label column, so its last float feature
# (apparently `Torque [Nm]`, judging by the column order displayed later) is
# never screened for outliers -- confirm this is intended.
train_set_bal_cleaned_float = remove_outliers(train_set_bal_float)

In [15]:
# The int frame appears to end with the label column (`Target`), which
# remove_outliers() leaves unscreened because it skips the last column --
# TODO confirm the column order in the config.
train_set_bal_cleaned_int = remove_outliers(train_set_bal_int)

In [16]:
train_set_bal_cleaned_float.shape

(541, 3)

In [17]:
train_set_bal_cleaned_int.shape

(504, 3)

## 7. Label Encoder

In [18]:
def encoding_category(data, config_data=None, mapping=None):
    """Replace category codes with integers in the configured columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the categorical columns. It is not modified.
    config_data : dict, optional
        Configuration mapping; ``config_data['dataset']['category_columns']``
        lists the columns to encode. When omitted, the notebook-level
        ``config`` is used.
    mapping : dict, optional
        Value-to-code mapping handed to ``DataFrame.replace``. Defaults to
        ``{'L': 1, 'M': 2, 'H': 3}``. Values missing from the mapping are
        left unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the categorical columns encoded.
    """
    # Fall back to the notebook-level configuration for existing callers.
    if config_data is None:
        config_data = config
    if mapping is None:
        mapping = {'L': 1, 'M': 2, 'H': 3}

    # Make a copy of the DataFrame to avoid inplace modification
    encoded_data = data.copy()

    # Replace values in the configured category columns
    category_columns = config_data['dataset']['category_columns']
    encoded_data[category_columns] = encoded_data[category_columns].replace(mapping)

    return encoded_data

In [19]:
train_set_bal_cat_encoder = encoding_category(train_set_bal_category)

In [20]:
train_set_bal_cat_encoder.shape

(542, 1)

In [21]:
train_set_bal_cat_encoder.head()

Unnamed: 0,Type
0,1
1,1
2,1
3,2
4,2


## 8. Concat Label Encoder

In [22]:
# Re-assemble the training frame column-wise. The pieces are aligned on their
# index, so a row missing from one piece (e.g. removed as an outlier) shows up
# with NaN in that piece's columns rather than disappearing.
train_parts = [
    train_set_bal_cat_encoder,
    train_set_bal_cleaned_float,
    train_set_bal_cleaned_int,
]
train_set_bal_cleaned_concat = pd.concat(train_parts, axis=1)

In [23]:
train_set_bal_cleaned_concat.shape

(542, 7)

In [24]:
def concatcleandata(set_data_cleaned):
    """Keep only complete, unique rows whose index label occurs exactly once.

    Steps, in order:
    1. select the rows whose index label appears a single time,
    2. drop rows that are exact value duplicates of an earlier row,
    3. drop rows containing any missing value.

    The rows come back in the order produced by ``Index.value_counts``, not
    necessarily the input order.
    """
    label_counts = set_data_cleaned.index.value_counts()
    singleton_labels = label_counts.loc[label_counts.eq(1)].index

    without_duplicates = set_data_cleaned.loc[singleton_labels].drop_duplicates()

    return without_duplicates.dropna()

In [25]:
train_set_bal_cleaned_concat = concatcleandata(train_set_bal_cleaned_concat)

In [26]:
train_set_bal_cleaned_concat

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Torque [Nm],Rotational speed [rpm],Tool wear [min],Target
0,1,302.5,310.3,36.0,1523.0,72.0,0.0
370,1,304.4,313.7,35.0,1509.0,205.0,1.0
368,1,301.7,310.9,46.4,1405.0,207.0,1.0
367,1,303.2,311.4,53.8,1351.0,8.0,1.0
366,1,301.2,311.6,52.9,1461.0,208.0,1.0
...,...,...,...,...,...,...,...
176,1,300.3,310.3,29.5,1704.0,0.0,0.0
175,1,300.9,310.9,52.5,1370.0,107.0,0.0
174,1,300.8,311.4,57.6,1316.0,43.0,0.0
173,2,295.9,306.6,38.6,1532.0,104.0,0.0


## 9. Data Test

In [27]:
test_float, test_int, test_category = splittypedata(test_set)

In [29]:
test_category_encoder = encoding_category(test_category)

In [30]:
# Re-assemble the test frame column-wise in the same column-group order used
# for the training frame: encoded category, float columns, int columns.
test_parts = [test_category_encoder, test_float, test_int]
test_set_encoder_concat = pd.concat(test_parts, axis=1)

In [31]:
test_set_encoder_concat = concatcleandata(test_set_encoder_concat)

In [32]:
test_set_encoder_concat.shape

(2000, 7)

In [33]:
test_set_encoder_concat.head()

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Torque [Nm],Rotational speed [rpm],Tool wear [min],Target
2997,1,300.5,309.8,62.7,1345,153,0
6302,1,301.2,310.6,51.2,1437,112,0
6053,1,300.8,310.9,29.9,1748,140,0
6014,3,300.8,310.8,32.8,1618,40,0
9538,1,299.3,310.6,45.1,1480,107,0


## 10. Dump Train and Test Sets

In [34]:
# Persist the engineered splits: predictors and label are written to separate
# files, first for the training set, then for the test set.
dump_dir = config['train_test']['directory']
predictor_cols = config['dataset']["predictors"]
label_col = config['dataset']["label"]

utils.pickle_dump(train_set_bal_cleaned_concat[predictor_cols], dump_dir + config['train_test']['X_train_feng'])
utils.pickle_dump(train_set_bal_cleaned_concat[label_col], dump_dir + config['train_test']['y_train_feng'])

utils.pickle_dump(test_set_encoder_concat[predictor_cols], dump_dir + config['train_test']['X_test_feng'])
utils.pickle_dump(test_set_encoder_concat[label_col], dump_dir + config['train_test']['y_test_feng'])