# **Data Preprocessing**
---
Before modeling, the data must be preprocessed: the categorical features are one-hot encoded and all features are standardized.

### **Import Package and Data**

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [2]:
# Directory holding the serialised train/test splits; change it here only.
DATA_DIR = "D:/ML/INTRO ML/Project/joblib_train_test"

# NOTE: despite the ".csv" extension these files are read with joblib.load,
# i.e. they are joblib pickles, not CSV text. Unpickling can execute arbitrary
# code, so only load files from a trusted source.
X_train = joblib.load(f"{DATA_DIR}/X_train.csv")
y_train = joblib.load(f"{DATA_DIR}/y_train.csv")
X_test = joblib.load(f"{DATA_DIR}/X_test.csv")
y_test = joblib.load(f"{DATA_DIR}/y_test.csv")

### **Split data into categorical and numeric**

In [3]:
# Split the features by dtype: numeric columns are scaled later,
# object (string) columns are one-hot encoded.
# "number" matches every numeric dtype; ['int', 'float'] only matches the
# 32/64-bit variants, so other numeric columns would be silently dropped.

X_train_numeric = X_train.select_dtypes(include=['number'])
X_train_category = X_train.select_dtypes(include=['object'])
X_test_numeric = X_test.select_dtypes(include=['number'])
X_test_category = X_test.select_dtypes(include=['object'])

In [4]:
# Gather the four dtype-specific frames so their shapes can be checked together.
X_all = [
    X_train_numeric,
    X_train_category,
    X_test_numeric,
    X_test_category,
]

def check_dataframe_shapes(dataframes_list):
    """Print the number of rows and columns of each DataFrame.

    Parameters
    ----------
    dataframes_list : iterable of DataFrame, or dict of label -> DataFrame
        Frames to report on. When a dict is given, its keys are used as the
        labels in the printed lines. Otherwise a frame's string ``name``
        attribute is used if it has one, falling back to ``'DataFrame'``.

    Returns
    -------
    None
        The report is written to standard output, one line per frame.
    """
    if isinstance(dataframes_list, dict):
        pairs = dataframes_list.items()
    else:
        pairs = ((None, df) for df in dataframes_list)

    for label, df in pairs:
        if label is None:
            # A DataFrame with a column called "name" exposes it as `df.name`
            # (a Series); only accept a real string label.
            attr = getattr(df, 'name', None)
            label = attr if isinstance(attr, str) else 'DataFrame'
        print(f"{label}: {df.shape[0]} rows, {df.shape[1]} columns")

In [5]:
check_dataframe_shapes(X_all)

DataFrame: 28032 rows, 6 columns
DataFrame: 28032 rows, 3 columns
DataFrame: 7008 rows, 6 columns
DataFrame: 7008 rows, 3 columns


In [6]:
#check null again
X_train_numeric.isna().sum()

LagRP     0
LeadRP    0
CO2       0
LagPF     0
LeadPF    0
NSM       0
dtype: int64

In [7]:
X_train_category.isna().sum()

WStatus      0
Dweek        0
Load_Type    0
dtype: int64

In [8]:
#check null again
X_test_numeric.isna().sum()

LagRP     0
LeadRP    0
CO2       0
LagPF     0
LeadPF    0
NSM       0
dtype: int64

In [9]:
X_test_category.isna().sum()

WStatus      0
Dweek        0
Load_Type    0
dtype: int64

### **One-hot encode the categorical data for use in the modeling step**

In [10]:
from sklearn.preprocessing import OneHotEncoder

def encoderCat(data, encoder_col = None, encoder = None):
    """One-hot encode a DataFrame of categorical columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Categorical columns to encode.
    encoder_col : sequence of str, optional
        Column names for the encoded frame. Ignored when a new encoder is
        fitted; derived from ``encoder`` when an encoder is supplied without
        names.
    encoder : fitted encoder, optional
        Encoder to reuse (pass the one fitted on the training data when
        encoding test data). When omitted, a new ``OneHotEncoder`` is fitted
        on ``data``.

    Returns
    -------
    data_encoded : pandas.DataFrame
        Encoded values, indexed like ``data``.
    encoder_col : sequence of str
        Names of the encoded columns.
    encoder : encoder
        The encoder that was used (newly fitted or the one passed in).
    """
    if encoder is None:
        # Create and fit a new encoder on this data.
        encoder = OneHotEncoder(handle_unknown = "ignore",
                                drop = "if_binary")
        encoder.fit(data)
        encoder_col = encoder.get_feature_names_out(data.columns)
    elif encoder_col is None:
        # A fitted encoder was supplied without names: recover them from the
        # encoder instead of falling back to integer column labels.
        encoder_col = encoder.get_feature_names_out(data.columns)

    # Transform data
    data_encoded = encoder.transform(data)
    if hasattr(data_encoded, "toarray"):
        # Sparse output needs densifying; dense output is used as is.
        data_encoded = data_encoded.toarray()
    data_encoded = pd.DataFrame(data_encoded,
                                index = data.index,
                                columns = encoder_col)

    return data_encoded, encoder_col, encoder

In [11]:
# Fit a new encoder on the training categoricals; keep the encoder and its
# column names.
X_train_cat_encoded, encoder_col, encoder_OHE = encoderCat(X_train_category)

In [12]:
X_train_cat_encoded.head(2)

Unnamed: 0,WStatus_Weekend,Dweek_Friday,Dweek_Monday,Dweek_Saturday,Dweek_Sunday,Dweek_Thursday,Dweek_Tuesday,Dweek_Wednesday,Load_Type_Light_Load,Load_Type_Maximum_Load,Load_Type_Medium_Load
32530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
17905,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
# Encode the test set with the encoder fitted on the training data. Refitting
# on the test set could yield different columns from the training frame and
# would learn the categories from test data.
X_test_cat_encoded, encoder_col_test, encoder_OHE_test = encoderCat(data = X_test_category,
                                                                    encoder_col = encoder_col,
                                                                    encoder = encoder_OHE)

In [14]:
X_test_cat_encoded.head(2)

Unnamed: 0,WStatus_Weekend,Dweek_Friday,Dweek_Monday,Dweek_Saturday,Dweek_Sunday,Dweek_Thursday,Dweek_Tuesday,Dweek_Wednesday,Load_Type_Light_Load,Load_Type_Maximum_Load,Load_Type_Medium_Load
16785,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
17693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [15]:
# Place the numeric and one-hot encoded training columns side by side, then
# replace the original row labels with a fresh 0..n-1 index.
X_train_concat = pd.concat(
    objs = [X_train_numeric, X_train_cat_encoded],
    axis = "columns",
)
X_train_concat = X_train_concat.reset_index(drop = True)
X_train_concat.shape

(28032, 17)

In [16]:
# Place the numeric and one-hot encoded test columns side by side, then
# replace the original row labels with a fresh 0..n-1 index.
X_test_concat = pd.concat(
    objs = [X_test_numeric, X_test_cat_encoded],
    axis = "columns",
)
X_test_concat = X_test_concat.reset_index(drop = True)
X_test_concat.shape

(7008, 17)

### **Scaling**

In [17]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [18]:
# Wrap scaling in a function so the same fitted scaler can be reused.

def scaler_transform(X, scaler =  None):
    """Scale the columns of ``X`` and return the result as a DataFrame.

    Parameters
    ----------
    X : pandas.DataFrame
        Features to scale.
    scaler : fitted scaler, optional
        Scaler to reuse (pass the one fitted on the training data when
        scaling test data). When omitted, a new ``StandardScaler`` is fitted
        on ``X``.

    Returns
    -------
    X_scaled : pandas.DataFrame
        Scaled values with the same columns and index as ``X``.
    scaler : scaler
        The scaler that was used (newly fitted or the one passed in).
    """
    # Identity check: `!= None` would call the object's own comparison method.
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(X)

    #transform data
    X_scaled = scaler.transform(X)
    X_scaled = pd.DataFrame(X_scaled,
                           columns = X.columns,
                           index = X.index)
    return X_scaled, scaler

In [19]:
# Fit a new scaler on the training features and scale them with it.
X_train_clean, scaler = scaler_transform(X_train_concat)
X_train_clean.shape

(28032, 17)

In [20]:
# Scale the test set with the scaler fitted on the training data. Refitting on
# the test set would use test statistics and overwrite the training scaler.
X_test_clean , scaler = scaler_transform(X =  X_test_concat, scaler = scaler)
X_test_clean.shape

(7008, 17)

In [25]:
y_test.isna().sum()

0

In [26]:
y_test.shape

(7008,)

In [27]:
y_train.isna().sum()

0

## **Save the preprocessed data**

In [24]:
# Persist the preprocessed feature frames with joblib. The targets are saved
# under a ".csv" name, but the content is joblib's format, not CSV text.
joblib.dump(value = X_train_clean,
            filename = "D:/ML/INTRO ML/Project/joblib_train_test/X_train_clean.csv")
joblib.dump(value = X_test_clean,
            filename = "D:/ML/INTRO ML/Project/joblib_train_test/X_test_clean.csv")

['D:/ML/INTRO ML/Project/joblib_train_test/X_test_clean.csv']