# **MLProcess - Air Quality Prediction**
---
**Data Pipeline**

In [1]:
# Import the required libraries.
import os

# Need to be installed.
import yaml
import joblib
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.model_selection import train_test_split

# **1 - Configuration File**
---

- Create two functions: `load_config()` and `update_config()`.


In [2]:
def load_config(config_path):
    """
    Load the configuration file (config.yaml).

    Parameters:
    ----------
    config_path : str
        Configuration file location.

    Returns:
    -------
    params : dict
        The configuration parameters, as parsed by `yaml.safe_load`.

    Raises:
    ------
    RuntimeError
        If no file exists at `config_path`. The original
        `FileNotFoundError` is attached as the exception's cause.
    """

    # Try to load config.yaml file.
    try:
        with open(config_path, 'r') as file:
            params = yaml.safe_load(file)
    except FileNotFoundError as err:
        # Chain explicitly so the traceback reports the OS-level error as the
        # direct cause instead of "during handling ... another exception".
        raise RuntimeError(f"Configuration file not found in {config_path}") from err

    return params

In [3]:
def update_config(key, value, params, config_path):
    """
    Update the configuration parameter values.

    The updated parameters are written to `config_path` and then read back
    with `load_config`, so the returned dict reflects what is on disk.

    Parameters:
    ----------
    key : str
        The key to be updated.

    value : any type that `yaml.safe_dump` can represent
        The updated value.

    params : dict
        Loaded configuration parameters.

    config_path : str
        Configuration file location.

    Returns:
    -------
    config : dict
        Updated configuration parameters.

    Raises:
    ------
    yaml.representer.RepresenterError
        If `value` cannot be represented by `yaml.safe_dump`. In that case
        the configuration file is left untouched.
    """

    # To maintain the raw config immutable (shallow copy: only the top-level
    # key is reassigned below).
    params = params.copy()

    # Update the configuration parameters.
    params[key] = value

    # Serialize BEFORE opening the file: opening with 'w' truncates it, so a
    # failure during dumping would otherwise leave an empty config behind.
    # safe_dump is the counterpart of the safe_load used by load_config; the
    # plain dump can emit Python-specific tags that safe_load refuses to read.
    serialized = yaml.safe_dump(params)

    with open(config_path, 'w') as file:
        file.write(serialized)

    print(f"Params Updated! \nKey: {key} \nValue: {value}\n")

    # Reload the updated configuration parameters.
    config = load_config(config_path)

    return config

In [4]:
# Load the configuration file.
# The path is relative, so it resolves against the notebook's working directory.
PATH_CONFIG = "../config/config.yaml"
config = load_config(PATH_CONFIG)

In [5]:
# Inspect the raw-data location stored in the configuration.
config["path_raw_data"]

'../data/raw/'

# **2 - Data Collection**
---
- Create `load_data()` function.
- It receives one argument: `data_path`
- This function loads all raw CSV files and returns the joined dataframe.

In [6]:
def load_data(data_path):
    """
    Load csv files and join into one dataframe.

    Only regular files whose name ends with ".csv" (case-insensitive) are
    read; sub-directories and other files inside `data_path` are skipped.

    Parameters:
    ----------
    data_path : str
        Raw dataset location. A trailing path separator is optional.

    Returns:
    -------
    raw_dataset : pd.DataFrame
        Loaded and joined data. Each file keeps its own row index, so index
        labels repeat across files. An empty dataframe is returned when no
        csv file is found.
    """

    # Read every csv file first; they are joined once at the end instead of
    # re-copying the growing dataframe on every iteration.
    frames = []

    for file_name in tqdm(os.listdir(data_path)):
        file_path = os.path.join(data_path, file_name)

        # Skip folders (e.g. ".ipynb_checkpoints") and non-csv files.
        if not os.path.isfile(file_path) or not file_name.lower().endswith(".csv"):
            continue

        frames.append(pd.read_csv(file_path))

    # Nothing to join: pd.concat would raise on an empty list.
    if not frames:
        return pd.DataFrame()

    # Each new file used to be placed in front of the rows loaded so far, so
    # the files end up in reverse listing order. Keep that order so positions
    # obtained after reset_index stay the same.
    raw_dataset = pd.concat(frames[::-1])

    return raw_dataset

In [7]:
# Load the raw dataset.
# The folder to read from comes from the "path_raw_data" configuration entry.
PATH_RAW_DATA = config["path_raw_data"]
raw_dataset = load_data(PATH_RAW_DATA)

100%|████████████████████████████████████████████████| 12/12 [00:00<00:00, 512.71it/s]


In [8]:
# Check the raw dataset (the bare expression on the last line is displayed by the notebook).
raw_dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG
1,2021-08-02,DKI1 (Bunderan HI),47,63,24,10,25,28,63,PM25,SEDANG
2,2021-08-03,DKI1 (Bunderan HI),50,68,26,11,19,35,68,PM25,SEDANG
3,2021-08-04,DKI1 (Bunderan HI),52,70,29,8,24,26,70,PM25,SEDANG
4,2021-08-05,DKI1 (Bunderan HI),52,66,29,9,21,27,66,PM25,SEDANG
...,...,...,...,...,...,...,...,...,...,...,...
135,2021-02-24,DKI5 (Kebon Jeruk) Jakarta Barat,24,40,28,4,11,7,40,PM25,BAIK
136,2021-02-25,DKI5 (Kebon Jeruk) Jakarta Barat,28,52,31,7,13,23,52,PM25,SEDANG
137,2021-02-26,DKI5 (Kebon Jeruk) Jakarta Barat,24,49,21,7,22,18,49,PM25,BAIK
138,2021-02-27,DKI5 (Kebon Jeruk) Jakarta Barat,39,64,27,10,25,24,64,PM25,SEDANG


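- We found that:
  1. The index only ranges from 0 to 139 even though there are 1830 rows, because every CSV file keeps its own index after the join.
  2. The dates run from month 8 to month 2 even though all 12 months are present, because the files are joined in directory-listing order rather than chronological order.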

In [9]:
# Try to reset the index to solve the first problem.
# drop=True discards the old index instead of keeping it as a new column.
raw_dataset = raw_dataset.reset_index(drop=True)

In [10]:
# Check the updated index; the labels should no longer repeat.
raw_dataset

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
0,2021-08-01,DKI1 (Bunderan HI),51,68,25,8,29,22,68,PM25,SEDANG
1,2021-08-02,DKI1 (Bunderan HI),47,63,24,10,25,28,63,PM25,SEDANG
2,2021-08-03,DKI1 (Bunderan HI),50,68,26,11,19,35,68,PM25,SEDANG
3,2021-08-04,DKI1 (Bunderan HI),52,70,29,8,24,26,70,PM25,SEDANG
4,2021-08-05,DKI1 (Bunderan HI),52,66,29,9,21,27,66,PM25,SEDANG
...,...,...,...,...,...,...,...,...,...,...,...
1825,2021-02-24,DKI5 (Kebon Jeruk) Jakarta Barat,24,40,28,4,11,7,40,PM25,BAIK
1826,2021-02-25,DKI5 (Kebon Jeruk) Jakarta Barat,28,52,31,7,13,23,52,PM25,SEDANG
1827,2021-02-26,DKI5 (Kebon Jeruk) Jakarta Barat,24,49,21,7,22,18,49,PM25,BAIK
1828,2021-02-27,DKI5 (Kebon Jeruk) Jakarta Barat,39,64,27,10,25,24,64,PM25,SEDANG


In [11]:
# Serialize the joined dataset.
PATH_JOINED_DATA = "../data/interim/joined_dataset.pkl"

# Make sure the target folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(PATH_JOINED_DATA), exist_ok=True)
joblib.dump(raw_dataset, PATH_JOINED_DATA)

['../data/interim/joined_dataset.pkl']

In [12]:
# Update the configuration parameter.
# Store the location of the joined dataset under "path_joined_data" and keep
# working with the configuration returned by update_config.
config = update_config(
    key = "path_joined_data",
    value = PATH_JOINED_DATA,
    params = config,
    config_path = PATH_CONFIG
)

Params Updated! 
Key: path_joined_data 
Value: ../data/interim/joined_dataset.pkl



# **3 - Data Validation**
---

In [13]:
# Check the data type for each feature.
# The columns still reported as "object" are the ones cast in the sections below.
raw_dataset.dtypes

tanggal        str
stasiun        str
pm10        object
pm25        object
so2         object
co          object
o3          object
no2         object
max         object
critical       str
categori       str
dtype: object

- Several features don't have the data types defined in the configuration.
- We need to handle those problematic columns.

## 3.1. Handling Column `tanggal`

In [14]:
# Try to cast the "tanggal" column to datetime type.
# pd.to_datetime parses the date strings; the result replaces the original column.
raw_dataset["tanggal"] = pd.to_datetime(raw_dataset["tanggal"])

## 3.2. Handling Column `pm10`

In [15]:
# Try to cast the "pm10" column to int type.
# The cast fails while the column still holds non-numeric entries, so the
# error is caught and displayed instead of aborting "Restart Kernel -> Run All".
try:
    raw_dataset["pm10"] = raw_dataset["pm10"].astype(int)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: invalid literal for int() with base 10: '---'

- A `ValueError` occurs; it tells us that there is data that isn't an integer (`"---"`).
- We will replace those `"---"` with a value that doesn't exist in the column.
- Based on the data definition, we know that we can use `-1`.

In [16]:
# Ensure no single data that is -1.
# Both the string "-1" and the integer -1 are checked, column by column, so
# that -1 can safely be used as the placeholder for unavailable values.
raw_dataset.eq("-1").any() | raw_dataset.eq(-1).any()

tanggal     False
stasiun     False
pm10        False
pm25        False
so2         False
co          False
o3          False
no2         False
max         False
critical    False
categori    False
dtype: bool

In [17]:
# Replace the "---" with -1 and cast the column into int.
# -1 is the placeholder chosen above because it does not occur in the data.
raw_dataset["pm10"] = raw_dataset["pm10"].replace("---", -1).astype(int)

## 3.3. Handling Column `pm25`

In [18]:
# Try to cast the "pm25" column to int type.
# The cast fails while the column still holds non-numeric entries, so the
# error is caught and displayed instead of aborting "Restart Kernel -> Run All".
try:
    raw_dataset["pm25"] = raw_dataset["pm25"].astype(int)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: invalid literal for int() with base 10: '---'

In [19]:
# Replace the "---" with -1 and try the cast again.
# The cast still fails while the column contains NaN, so the error is caught
# and displayed instead of aborting "Restart Kernel -> Run All".
try:
    raw_dataset["pm25"] = raw_dataset["pm25"].replace("---", -1).astype(int)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: cannot convert float NaN to integer

- There are `NaN` values, thus we can't convert.
- We need to handle this problem first.

In [20]:
# Sanity check the missing values.
# Count how many "pm25" entries are NaN.
raw_dataset["pm25"].isna().sum()

np.int64(62)

In [21]:
# Replace the NaN values with -1.
# This is the same placeholder that replaces the "---" entries.
raw_dataset["pm25"] = raw_dataset["pm25"].fillna(-1)

# Sanity check the missing values.
# The count should now be 0.
raw_dataset["pm25"].isna().sum()

np.int64(0)

In [22]:
# Replace the "---" with -1 and cast the column into int.
# The NaN values were filled in the previous step, so the cast can succeed now.
raw_dataset["pm25"] = raw_dataset["pm25"].replace("---", -1).astype(int)

## 3.4. Handling Column `so2`

In [23]:
# Try to cast the "so2" column to int type.
# The cast fails while the column still holds non-numeric entries, so the
# error is caught and displayed instead of aborting "Restart Kernel -> Run All".
try:
    raw_dataset["so2"] = raw_dataset["so2"].astype(int)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: invalid literal for int() with base 10: '---'

In [24]:
# Replace the "---" with -1 and cast the column into int.
# Same treatment as the "pm10" column.
raw_dataset["so2"] = raw_dataset["so2"].replace("---", -1).astype(int)

## 3.5. Handling Column `co`

In [25]:
# Try to cast the "co" column to int type.
# The cast fails while the column still holds non-numeric entries, so the
# error is caught and displayed instead of aborting "Restart Kernel -> Run All".
try:
    raw_dataset["co"] = raw_dataset["co"].astype(int)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: invalid literal for int() with base 10: '---'

In [26]:
# Replace the "---" with -1 and cast the column into int.
# Same treatment as the "pm10" column.
raw_dataset["co"] = raw_dataset["co"].replace("---", -1).astype(int)

## 3.6. Handling Column `o3`

In [27]:
# Try to cast the "o3" column to int type.
# The cast fails while the column still holds non-numeric entries, so the
# error is caught and displayed instead of aborting "Restart Kernel -> Run All".
try:
    raw_dataset["o3"] = raw_dataset["o3"].astype(int)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: invalid literal for int() with base 10: '---'

In [28]:
# Replace the "---" with -1 and cast the column into int.
# Same treatment as the "pm10" column.
raw_dataset["o3"] = raw_dataset["o3"].replace("---", -1).astype(int)

## 3.7. Handling Column `no2`

In [29]:
# Try to cast the "no2" column to int type.
# The cast fails while the column still holds non-numeric entries, so the
# error is caught and displayed instead of aborting "Restart Kernel -> Run All".
try:
    raw_dataset["no2"] = raw_dataset["no2"].astype(int)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: invalid literal for int() with base 10: '---'

In [30]:
# Replace the "---" with -1 and cast the column into int.
# Same treatment as the "pm10" column.
raw_dataset["no2"] = raw_dataset["no2"].replace("---", -1).astype(int)

## 3.8. Handling Column `max`

In [31]:
# Try to cast the "max" column to int type.
# The cast fails while the column still holds non-numeric entries, so the
# error is caught and displayed instead of aborting "Restart Kernel -> Run All".
try:
    raw_dataset["max"] = raw_dataset["max"].astype(int)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: invalid literal for int() with base 10: 'PM25'

- Seems like the error is different.
- There is data with value `"PM25"` in the `max` column.
- Investigate the data.

In [32]:
# Show the row(s) whose "max" entry is the string "PM25" instead of a number.
raw_dataset[raw_dataset["max"] == "PM25"]

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
467,2021-12-03,DKI1 (Bunderan HI),49,31,9,19,7,49,PM25,BAIK,


- Looks like the values on row index 467 were entered in the wrong columns.
    - The `"BAIK"` value must be on the `categori` column.
    - We need to investigate what the `max` and `critical` columns represent.
- Let's randomly sample 5 rows.

In [33]:
# Randomly sample 5 rows; the fixed seed makes the same rows appear on every run.
raw_dataset.sample(5, random_state=123)

Unnamed: 0,tanggal,stasiun,pm10,pm25,so2,co,o3,no2,max,critical,categori
1472,2021-10-26,DKI3 (Jagakarsa),51,71,51,13,30,15,71,PM25,SEDANG
1438,2021-10-23,DKI2 (Kelapa Gading),67,95,81,10,53,28,95,PM25,SEDANG
1469,2021-10-23,DKI3 (Jagakarsa),59,87,52,13,39,21,87,PM25,SEDANG
1332,2021-03-10,DKI4 (Lubang Buaya),56,89,40,21,21,12,89,PM25,SEDANG
228,2021-05-12,DKI3 (Jagakarsa),-1,96,21,10,29,15,96,PM25,SEDANG


- Looks like the `max` column represents the maximum value among the other integer columns.
- And looks like the `critical` column represents the name of the column that holds the `max` value.

- Thus, let's fix the error on the row index 467:
    - Replace the `max` column with `pm10` or `no2` value, let's take from `pm10`
    - Replace the `critical` column with `"PM10"`
    - Replace the `categori` column with `"BAIK"`

In [34]:
# Fix the error.
# Select the broken row by its defect ("max" holds the text "PM25") rather
# than by a hard-coded index label: the label depends on the order in which
# the raw files were joined. Re-running this cell is a no-op once fixed.
broken_rows = raw_dataset["max"] == "PM25"

raw_dataset.loc[broken_rows, "max"] = raw_dataset.loc[broken_rows, "pm10"]
raw_dataset.loc[broken_rows, "critical"] = "PM10"
raw_dataset.loc[broken_rows, "categori"] = "BAIK"

In [35]:
# Sanity check the result.
# Display the repaired row (index label 467).
raw_dataset.loc[467]

tanggal     2021-12-03 00:00:00
stasiun      DKI1 (Bunderan HI)
pm10                         49
pm25                         31
so2                           9
co                           19
o3                            7
no2                          49
max                          49
critical                   PM10
categori                   BAIK
Name: 467, dtype: object

In [36]:
# Cast the column to int.
# The non-numeric "PM25" entry has been replaced, so the cast can succeed now.
raw_dataset["max"] = raw_dataset["max"].astype(int)

## 3.9. Handling Column `critical`

In [37]:
# Check the unique value.
# value_counts() lists each distinct "critical" value with its number of rows.
raw_dataset["critical"].value_counts()

critical
PM25    1631
PM10      65
O3        57
CO        34
SO2       26
Name: count, dtype: int64

- Seems like no action needed.

## 3.10. Handling Column `categori`

In [38]:
# Check the unique value.
# value_counts() lists each distinct "categori" value with its number of rows.
raw_dataset["categori"].value_counts()

categori
SEDANG            1305
TIDAK SEHAT        319
BAIK               189
TIDAK ADA DATA      17
Name: count, dtype: int64

- There are 17 `"TIDAK ADA DATA"` values, which indicate a missing label.
- We can drop those rows.

In [39]:
# Remove every row labelled "TIDAK ADA DATA".
missing_labels = raw_dataset.loc[raw_dataset["categori"].eq("TIDAK ADA DATA")]
keep_row = ~raw_dataset.index.isin(missing_labels.index)
raw_dataset = raw_dataset.loc[keep_row]

# Confirm that only the real categories remain.
raw_dataset["categori"].value_counts()

categori
SEDANG         1305
TIDAK SEHAT     319
BAIK            189
Name: count, dtype: int64

In [40]:
# Sanity check the data types.
# info() also reports the non-null count of every column.
raw_dataset.info()

<class 'pandas.DataFrame'>
Index: 1813 entries, 0 to 1829
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1813 non-null   datetime64[us]
 1   stasiun   1813 non-null   str           
 2   pm10      1813 non-null   int64         
 3   pm25      1813 non-null   int64         
 4   so2       1813 non-null   int64         
 5   co        1813 non-null   int64         
 6   o3        1813 non-null   int64         
 7   no2       1813 non-null   int64         
 8   max       1813 non-null   int64         
 9   critical  1813 non-null   str           
 10  categori  1813 non-null   str           
dtypes: datetime64[us](1), int64(7), str(3)
memory usage: 170.0 KB


In [42]:
# Rename "categori" into "category".
# rename() returns a new DataFrame, so the result is assigned back.
raw_dataset = raw_dataset.rename(columns={"categori" : "category"})
raw_dataset.info()

<class 'pandas.DataFrame'>
Index: 1813 entries, 0 to 1829
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   tanggal   1813 non-null   datetime64[us]
 1   stasiun   1813 non-null   str           
 2   pm10      1813 non-null   int64         
 3   pm25      1813 non-null   int64         
 4   so2       1813 non-null   int64         
 5   co        1813 non-null   int64         
 6   o3        1813 non-null   int64         
 7   no2       1813 non-null   int64         
 8   max       1813 non-null   int64         
 9   critical  1813 non-null   str           
 10  category  1813 non-null   str           
dtypes: datetime64[us](1), int64(7), str(3)
memory usage: 170.0 KB


In [44]:
# Update the configuration parameters.
# The label entry must use the renamed column ("category").
config = update_config(
    key = "label",
    value = "category",
    params = config,
    config_path = PATH_CONFIG
)

Params Updated! 
Key: label 
Value: category



In [51]:
# Fix the typo 'categori' -> 'category' in the configured column list.
# Whole names are compared instead of using str.replace: 'category' itself
# contains the substring 'categori', so a substring replace would turn it into
# 'categoryy' when this cell runs again on the already-updated config file.
updated_list = [
    "category" if col == "categori" else col
    for col in config['object_columns']
]

config = update_config(
    key = "object_columns",
    value = updated_list,
    params = config,
    config_path = PATH_CONFIG
)

Params Updated! 
Key: object_columns 
Value: ['stasiun', 'critical', 'category']



- All columns now match the data definition.
- Now serialize the validated data.

In [52]:
# Serialize the validated dataset.
PATH_VALIDATED_DATA = "../data/interim/validated_data.pkl"

# Make sure the target folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(os.path.dirname(PATH_VALIDATED_DATA), exist_ok=True)
joblib.dump(raw_dataset, PATH_VALIDATED_DATA)

['../data/interim/validated_data.pkl']

In [53]:
# Update the configuration parameter.
# Store the location of the validated dataset under "path_validated_data".
config = update_config(
    key = "path_validated_data",
    value = PATH_VALIDATED_DATA,
    params = config,
    config_path = PATH_CONFIG
)

Params Updated! 
Key: path_validated_data 
Value: ../data/interim/validated_data.pkl



# **4 - Update the Range of Data in Configuration File**
---

In [54]:
# Store the observed [min, max] of every pollutant column in the config.
cols = ["pm10", "pm25", "so2", "co", "o3", "no2"]
param_keys = [f"range_{name}" for name in cols]

for key, col in zip(param_keys, cols):
    observed = raw_dataset[col]
    # Plain Python ints keep the YAML file free of numpy-specific values.
    bounds = [int(observed.min()), int(observed.max())]
    config = update_config(key=key, value=bounds, params=config, config_path=PATH_CONFIG)

Params Updated! 
Key: range_pm10 
Value: [-1, 179]

Params Updated! 
Key: range_pm25 
Value: [-1, 174]

Params Updated! 
Key: range_so2 
Value: [-1, 82]

Params Updated! 
Key: range_co 
Value: [-1, 47]

Params Updated! 
Key: range_o3 
Value: [-1, 151]

Params Updated! 
Key: range_no2 
Value: [-1, 65]



In [55]:
# Check the configuration parameters.
# The "range_*" entries written by the previous cell should now be present.
config

{'datetime_columns': ['tanggal'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'int32_columns': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'label': 'category',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'object_columns': ['stasiun', 'critical', 'category'],
 'path_joined_data': '../data/interim/joined_dataset.pkl',
 'path_raw_data': '../data/raw/',
 'path_validated_data': '../data/interim/validated_data.pkl',
 'range_co': [-1, 47],
 'range_no2': [-1, 65],
 'range_o3': [-1, 151],
 'range_pm10': [-1, 179],
 'range_pm25': [-1, 174],
 'range_so2': [-1, 82],
 'range_stasiun': ['DKI1 (Bunderan HI)',
  'DKI2 (Kelapa Gading)',
  'DKI3 (Jagakarsa)',
  'DKI4 (Lubang Buaya)',
  'DKI5 (Kebon Jeruk) Jakarta Barat']}

# **5 - Data Defense**
---

- Create the `check_data()` function.
- It receives 2 arguments: `input_data` and `params`
    - `input_data` is the raw dataset
    - `params` is the configuration parameters
- It is a void function (no return value).
- If an `AssertionError` is raised, some of the data doesn't match the configuration.

In [56]:
def check_data(input_data, params):
    """
    Data defense: verify column data types and value ranges against the
    configuration.

    Parameters:
    ----------
    input_data : pd.DataFrame
        The data to be checked.

    params : dict
        Loaded configuration parameters.

    Returns:
    -------
    None. An AssertionError is raised by the first check that fails.
    """

    # Data types: (dtype selector, configuration key, failure message).
    dtype_checks = (
        ("datetime", "datetime_columns", "an error occurs in datetime column(s)."),
        ("str", "object_columns", "an error occurs in object column(s)."),
        ("number", "int32_columns", "an error occurs in int32 column(s)."),
    )
    for selector, config_key, message in dtype_checks:
        found_columns = input_data.select_dtypes(selector).columns.to_list()
        assert found_columns == params[config_key], message

    # Station names must all be known to the configuration.
    assert set(input_data['stasiun']) <= set(params['range_stasiun']), "an error occurs in stasiun range."

    # Every pollutant value must fall inside its configured [min, max] (inclusive).
    for pollutant in ("pm10", "pm25", "so2", "co", "o3", "no2"):
        values = input_data[pollutant]
        bounds = params[f"range_{pollutant}"]
        within = values.between(bounds[0], bounds[1])
        assert within.all(), f"an error occurs in {pollutant} range."

In [57]:
# Do data defense.
# check_data returns nothing; no output means every assertion passed.
check_data(raw_dataset, config)

- Seems like our data are in good condition!

# **6 - Data Split**
---

In [58]:
# Input-Output Split.
def split_input_output(input_data, params):
    """
    Split the input(X) and output (y).

    Parameters:
    ----------
    input_data : pd.DataFrame
        The processed dataset.

    params : dict
        Loaded configuration parameters. The "features" entry lists the
        input columns and the "label" entry names the output column.

    Returns:
    -------
    X : pd.DataFrame
        The input data (a copy of the feature columns).

    y : pd.Series
        The output data (a copy of the label column).
    """

    features = params["features"]

    # Copies, so later changes to X or y never touch input_data.
    X = input_data[features].copy()
    y = input_data[params["label"]].copy()

    # The feature list is held in a local variable: reusing double quotes
    # inside a double-quoted f-string is a SyntaxError before Python 3.12.
    print(f"Original data shape : {input_data.shape}")
    print(f"Selected Features   : {features}")
    print(f"X data shape        : {X.shape}")
    print(f"y data shape        : {y.shape}")

    return X, y

In [59]:
# Separate the feature columns (X) from the label column (y) as configured.
X, y = split_input_output(
    input_data = raw_dataset,
    params = config
)

Original data shape : (1813, 11)
Selected Features   : ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2']
X data shape        : (1813, 7)
y data shape        : (1813,)


In [60]:
# Sanity check the input (X).
# head() shows the first five rows.
X.head()

Unnamed: 0,stasiun,pm10,pm25,so2,co,o3,no2
0,DKI1 (Bunderan HI),51,68,25,8,29,22
1,DKI1 (Bunderan HI),47,63,24,10,25,28
2,DKI1 (Bunderan HI),50,68,26,11,19,35
3,DKI1 (Bunderan HI),52,70,29,8,24,26
4,DKI1 (Bunderan HI),52,66,29,9,21,27


In [61]:
# Sanity check the output (y).
# head() shows the first five labels.
y.head()

0    SEDANG
1    SEDANG
2    SEDANG
3    SEDANG
4    SEDANG
Name: category, dtype: str

In [62]:
# Train-Test Split.
def split_train_test(X, y, test_size, random_state):
    """
    Divide the data into a train part and a test part, stratified on `y`.

    Parameters:
    ----------
    X : pd.DataFrame
        The input data.

    y : pd.Series
        The output data; also used as the stratification target.

    test_size : float
        The proportion of test set.

    random_state : int
        Seed for reproducibility.

    Returns:
    -------
    X_train, X_test : pd.DataFrame
        The train and test input.

    y_train, y_test : pd.Series
        The train and test output.
    """

    pieces = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    X_train, X_test, y_train, y_test = pieces

    # Report the shapes: both train parts first, then both test parts.
    report = (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_test", X_test),
        ("y_test", y_test),
    )
    for label, part in report:
        print(f"{label + ' shape':<13} : {part.shape}")

    return X_train, X_test, y_train, y_test

In [63]:
# Target proportions -> train:valid:test = 80:10:10
RANDOM_STATE = 123

# First cut: 80% for training, 20% held out.
X_train, X_not_train, y_train, y_not_train = split_train_test(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print()
# Second cut: halve the held-out part into validation and test sets.
X_valid, X_test, y_valid, y_test = split_train_test(
    X_not_train, y_not_train, test_size=0.5, random_state=RANDOM_STATE
)

X_train shape : (1450, 7)
y_train shape : (1450,)
X_test shape  : (363, 7)
y_test shape  : (363,)

X_train shape : (181, 7)
y_train shape : (181,)
X_test shape  : (182, 7)
y_test shape  : (182,)


In [64]:
# Serialize the splitted data.
PATH_SPLITTED_DATA = "../data/interim/"

# Make sure the target folder exists (e.g. on a fresh checkout) before writing.
os.makedirs(PATH_SPLITTED_DATA, exist_ok=True)

joblib.dump(X_train, f"{PATH_SPLITTED_DATA}X_train.pkl")
joblib.dump(y_train, f"{PATH_SPLITTED_DATA}y_train.pkl")
joblib.dump(X_valid, f"{PATH_SPLITTED_DATA}X_valid.pkl")
joblib.dump(y_valid, f"{PATH_SPLITTED_DATA}y_valid.pkl")
joblib.dump(X_test, f"{PATH_SPLITTED_DATA}X_test.pkl")
joblib.dump(y_test, f"{PATH_SPLITTED_DATA}y_test.pkl")

['../data/interim/y_test.pkl']

In [65]:
# Record where each split was saved: one [X path, y path] pair per split,
# written in the order train, valid, test.
for split_name in ("train", "valid", "test"):
    split_paths = [
        f"{PATH_SPLITTED_DATA}X_{split_name}.pkl",
        f"{PATH_SPLITTED_DATA}y_{split_name}.pkl",
    ]
    config = update_config(
        key=f"path_{split_name}_set",
        value=split_paths,
        params=config,
        config_path=PATH_CONFIG,
    )

Params Updated! 
Key: path_train_set 
Value: ['../data/interim/X_train.pkl', '../data/interim/y_train.pkl']

Params Updated! 
Key: path_valid_set 
Value: ['../data/interim/X_valid.pkl', '../data/interim/y_valid.pkl']

Params Updated! 
Key: path_test_set 
Value: ['../data/interim/X_test.pkl', '../data/interim/y_test.pkl']



In [66]:
# Check the configuration parameters.
# The "path_*_set" entries written by the previous cell should now be present.
config

{'datetime_columns': ['tanggal'],
 'features': ['stasiun', 'pm10', 'pm25', 'so2', 'co', 'o3', 'no2'],
 'int32_columns': ['pm10', 'pm25', 'so2', 'co', 'o3', 'no2', 'max'],
 'label': 'category',
 'label_categories': ['BAIK', 'SEDANG', 'TIDAK SEHAT'],
 'label_categories_new': ['BAIK', 'TIDAK BAIK'],
 'object_columns': ['stasiun', 'critical', 'category'],
 'path_joined_data': '../data/interim/joined_dataset.pkl',
 'path_raw_data': '../data/raw/',
 'path_test_set': ['../data/interim/X_test.pkl', '../data/interim/y_test.pkl'],
 'path_train_set': ['../data/interim/X_train.pkl',
  '../data/interim/y_train.pkl'],
 'path_valid_set': ['../data/interim/X_valid.pkl',
  '../data/interim/y_valid.pkl'],
 'path_validated_data': '../data/interim/validated_data.pkl',
 'range_co': [-1, 47],
 'range_no2': [-1, 65],
 'range_o3': [-1, 151],
 'range_pm10': [-1, 179],
 'range_pm25': [-1, 174],
 'range_so2': [-1, 82],
 'range_stasiun': ['DKI1 (Bunderan HI)',
  'DKI2 (Kelapa Gading)',
  'DKI3 (Jagakarsa)',
  'DK