In [1]:
import os

In [2]:
# Show the kernel's working directory before it is changed below.
%pwd

'/root/pacmann/loan-default-project/notebooks'

In [3]:
# Change to the main (project root) directory so that relative paths such as
# config/config.yaml resolve from there.
# The guard keeps this cell idempotent: re-running it, or starting the kernel
# in the project root, no longer climbs one directory too far.
if not os.path.isfile(os.path.join("config", "config.yaml")):
    os.chdir("../")

In [4]:
# Confirm the working directory is now the project root.
%pwd

'/root/pacmann/loan-default-project'

In [5]:
from dataclasses import dataclass
from pathlib import Path
  
@dataclass(frozen=True)
class DataDumpConfig:
    """Immutable settings for the data-dump (train/test split) stage.

    Field order is part of the interface because instances may be built
    positionally.
    """

    # Directory that the stage creates for its outputs.
    root_dir: Path
    # CSV file with the full loan-default dataset (read by DumpData).
    loan_default_path: Path
    # Pickle destinations for the feature frames (X_train / X_test).
    input_train_path: Path
    input_test_path: Path
    # Pickle destinations for the target series (y_train / y_test).
    output_train_path: Path
    output_test_path: Path
    # Value forwarded as ``test_size`` to train_test_split.
    params_test_size: float

@dataclass(frozen=True)
class DataPreprocessingConfig:
    """Immutable settings for the encoding / scaling stage.

    Field order is part of the interface because instances may be built
    positionally.
    """

    # Directory that the stage creates for its outputs.
    root_dir: Path
    # Pickled feature frames produced by the data-dump stage.
    input_train_path: Path
    input_test_path: Path
    # Pickled target series produced by the data-dump stage.
    output_train_path: Path
    output_test_path: Path
    # Destinations for the joblib-dumped (features, target) tuples.
    encoded_train_path: Path
    encoded_test_path: Path
    # Directory created by Preprocessing.encode_data before saving the models.
    model_dir: Path
    # Destinations for the fitted OneHotEncoder and StandardScaler.
    encoder_model_path: Path
    scaler_model_path: Path

In [6]:
from LoanDefault.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from LoanDefault.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Read the project YAML files and build the per-stage config entities.

    Args:
        config_filepath: YAML file describing artifact locations per stage.
        params_filepath: YAML file holding tunable parameters (e.g. TEST_SIZE).
    """

    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        # Ensure the top-level artifacts directory exists before any stage
        # tries to write into it.
        create_directories([self.config.artifacts_root])
    
    def get_dump_data_config(self) -> DataDumpConfig:
        """Build the config entity for the data-dump (train/test split) stage.

        Combines the ``ingest_from_sql`` and ``dump_data`` sections of the
        config file with ``TEST_SIZE`` from the params file, and creates the
        stage's root directory.

        Returns:
            DataDumpConfig: paths wrapped in ``Path`` plus the test-split size.
        """
        ingest_section = self.config.ingest_from_sql
        dump_section = self.config.dump_data

        create_directories([dump_section.root_dir])

        # Wrap every location in Path so the entity matches its annotations
        # and behaves like the one built by get_preprocessing_data_config.
        return DataDumpConfig(
            root_dir=Path(dump_section.root_dir),
            loan_default_path=Path(ingest_section.loan_default_path),
            input_train_path=Path(dump_section.input_train_path),
            input_test_path=Path(dump_section.input_test_path),
            output_train_path=Path(dump_section.output_train_path),
            output_test_path=Path(dump_section.output_test_path),
            params_test_size=self.params.TEST_SIZE,
        )
    
    def get_preprocessing_data_config(self) -> DataPreprocessingConfig:
        """Build the config entity for the encoding / scaling stage.

        Combines the ``dump_data``, ``encoded_data`` and ``train_model``
        sections of the config file, and creates the stage's root directory.

        Returns:
            DataPreprocessingConfig: every location wrapped in ``Path``.
        """
        dump_section = self.config.dump_data
        encoded_section = self.config.encoded_data
        train_section = self.config.train_model

        create_directories([encoded_section.root_dir])

        return DataPreprocessingConfig(
            root_dir=Path(encoded_section.root_dir),
            input_train_path=Path(dump_section.input_train_path),
            input_test_path=Path(dump_section.input_test_path),
            output_train_path=Path(dump_section.output_train_path),
            output_test_path=Path(dump_section.output_test_path),
            encoded_train_path=Path(encoded_section.encoded_train_path),
            encoded_test_path=Path(encoded_section.encoded_test_path),
            model_dir=Path(train_section.root_dir),
            encoder_model_path=Path(encoded_section.encoder_model_path),
            scaler_model_path=Path(encoded_section.scaler_model_path),
        )

In [8]:
import joblib
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from LoanDefault import logger

class DumpData:
    """Split the loan-default CSV into train/test sets and pickle the parts."""

    def __init__(self, config: DataDumpConfig):
        self.config = config

    def dump_data(self) -> None:
        """Read the CSV, split it, and write four pickles.

        The features and the ``Default`` target are split with
        ``train_test_split`` using ``config.params_test_size`` and a fixed
        seed, then written to the input/output train/test paths in the config.

        Raises:
            KeyError: if the CSV has no ``Default`` column.
        """
        logger.info("Read loan default file.")
        dataset = pd.read_csv(self.config.loan_default_path)

        # Drop LoanID (presumably a row identifier) when present.
        # errors='ignore' makes this a no-op if the column is absent, and the
        # frame is not mutated in place.
        dataset = dataset.drop(columns=['LoanID'], errors='ignore')

        # Fail early with a readable message instead of pandas' generic
        # "not found in axis" error.
        if 'Default' not in dataset.columns:
            raise KeyError(
                f"Target column 'Default' not found in {self.config.loan_default_path}"
            )

        logger.info("Split loan default data to train and test.")
        X = dataset.drop(columns=['Default'])  # Features
        y = dataset['Default']                 # Target

        # A fixed random_state keeps the split reproducible across runs.
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=self.config.params_test_size,
            random_state=42
        )

        logger.info(f"Dump training data into {self.config.input_train_path}.")
        X_train.to_pickle(self.config.input_train_path)
        y_train.to_pickle(self.config.output_train_path)

        logger.info(f"Dump testing data into {self.config.input_test_path}.")
        X_test.to_pickle(self.config.input_test_path)
        y_test.to_pickle(self.config.output_test_path)

class Preprocessing:
    """One-hot encode and standardise the dumped train/test features."""

    def __init__(self, config: DataPreprocessingConfig):
        self.config = config

    @staticmethod
    def _ensure_parent_dir(file_path) -> None:
        """Create the directory that will hold ``file_path`` if it is missing."""
        Path(file_path).parent.mkdir(parents=True, exist_ok=True)

    def encode_data(self) -> None:
        """Fit the encoder and scaler on the training split and persist everything.

        Steps:
            1. Load the pickled X/y train and test parts.
            2. Fit ``OneHotEncoder`` on the categorical training columns and
               ``StandardScaler`` on the numerical training columns, then
               transform the test split with the fitted objects.
            3. Stack scaled numerical columns followed by the one-hot columns.
            4. Dump ``(X, y)`` tuples for train and test, then the fitted
               encoder and scaler, with joblib.

        The parent directory of every output file is created on demand, so
        the method does not depend on an earlier stage having created it.
        """
        logger.info(f"Load training data from {self.config.input_train_path}.")
        X_train = pd.read_pickle(self.config.input_train_path)
        y_train = pd.read_pickle(self.config.output_train_path)

        # Load testing data
        logger.info(f"Load testing data from {self.config.input_test_path}.")
        X_test = pd.read_pickle(self.config.input_test_path)
        y_test = pd.read_pickle(self.config.output_test_path)

        # Separate numerical and categorical features
        numerical_features = [
            "Age", "Income", "LoanAmount", "CreditScore", "MonthsEmployed",
            "NumCreditLines", "InterestRate", "LoanTerm", "DTIRatio",
        ]
        categorical_features = [
            "Education", "EmploymentType", "MaritalStatus", "HasMortgage",
            "HasDependents", "LoanPurpose", "HasCoSigner",
        ]

        # Encode categorical features (fit on train only, reuse on test)
        logger.info("Encode categorical features.")
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        X_train_encoded = encoder.fit_transform(X_train[categorical_features])
        X_test_encoded = encoder.transform(X_test[categorical_features])

        # Scale numerical features (fit on train only, reuse on test)
        logger.info("Scale numerical features.")
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train[numerical_features])
        X_test_scaled = scaler.transform(X_test[numerical_features])

        # Combine encoded and scaled features: numerical columns first
        logger.info("Combine encoded and scaled features.")
        X_train_combined = np.hstack([X_train_scaled, X_train_encoded])
        X_test_combined = np.hstack([X_test_scaled, X_test_encoded])

        # Dump the encoded and scaled data
        logger.info(f"Dump the encoded training data into {self.config.encoded_train_path}.")
        self._ensure_parent_dir(self.config.encoded_train_path)
        joblib.dump((X_train_combined, y_train), self.config.encoded_train_path)

        logger.info(f"Dump the encoded testing data into {self.config.encoded_test_path}.")
        self._ensure_parent_dir(self.config.encoded_test_path)
        joblib.dump((X_test_combined, y_test), self.config.encoded_test_path)

        logger.info(f"Creating {self.config.model_dir} directory.")
        model_dir = str(self.config.model_dir)
        os.makedirs(model_dir, exist_ok=True)

        # Save the encoder and scaler models. Their paths come from a
        # different config section than model_dir, so make sure their own
        # parent directories exist as well.
        logger.info("Save the encoder model.")
        self._ensure_parent_dir(self.config.encoder_model_path)
        joblib.dump(encoder, self.config.encoder_model_path)

        logger.info("Save the scaler model.")
        self._ensure_parent_dir(self.config.scaler_model_path)
        joblib.dump(scaler, self.config.scaler_model_path)

In [9]:
# Run the data-dump stage: build its config, split the CSV, write the pickles.
try:
    config = ConfigurationManager()
    dump_data_config = config.get_dump_data_config()
    data_ingestion = DumpData(config=dump_data_config)
    data_ingestion.dump_data()
except Exception as e:
    # logger.exception logs at ERROR level like before, but also records the
    # traceback. The bare raise re-raises the active exception unchanged.
    logger.exception(e)
    raise

[2024-07-13 08:50:33,935: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-13 08:50:33,942: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-13 08:50:33,945: INFO: common: created directory at: artifacts]
[2024-07-13 08:50:33,959: INFO: common: created directory at: artifacts/data]
[2024-07-13 08:50:33,961: INFO: 1105889720: Read loan default file.]
[2024-07-13 08:50:36,539: INFO: 1105889720: Split loan default data to train and test.]
[2024-07-13 08:50:36,739: INFO: 1105889720: Dump training data into artifacts/data/X_train.pkl.]
[2024-07-13 08:50:36,791: INFO: 1105889720: Dump testing data into artifacts/data/X_test.pkl.]


In [10]:
# This file is written with DataFrame.to_pickle, so read it back with the
# matching pandas reader (the same call Preprocessing.encode_data uses).
X_train = pd.read_pickle(dump_data_config.input_train_path)
X_train

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
186723,36,32710,103395,661,51,2,22.46,60,0.25,PhD,Self-employed,Single,Yes,Yes,Other,No
112233,18,144701,78213,735,116,4,19.50,12,0.50,Master's,Part-time,Divorced,No,No,Education,Yes
1145,39,25070,82922,488,98,3,7.26,12,0.55,Master's,Unemployed,Divorced,Yes,No,Education,No
90811,28,27502,130369,613,23,3,16.59,60,0.14,PhD,Self-employed,Single,Yes,Yes,Education,No
221324,47,75122,189424,487,50,1,16.57,12,0.46,Bachelor's,Self-employed,Divorced,No,No,Home,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,40,116623,161673,651,79,2,23.44,12,0.87,Bachelor's,Part-time,Divorced,No,No,Home,Yes
103694,50,144334,216065,488,106,2,2.04,60,0.38,Master's,Part-time,Married,No,Yes,Education,No
131932,68,19918,63722,353,17,4,2.85,36,0.12,Master's,Unemployed,Divorced,Yes,No,Business,Yes
146867,26,27061,51707,466,78,1,12.89,24,0.52,Master's,Full-time,Divorced,No,No,Business,No


In [11]:
# Count missing values per training feature column.
X_train.isna().sum()

Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
dtype: int64

In [12]:
# This file is written with Series.to_pickle, so read it back with the
# matching pandas reader (the same call Preprocessing.encode_data uses).
y_train = pd.read_pickle(dump_data_config.output_train_path)
y_train

186723    0
112233    0
1145      0
90811     0
221324    0
         ..
119879    0
103694    0
131932    0
146867    0
121958    0
Name: Default, Length: 51069, dtype: int64

In [13]:
# This file is written with DataFrame.to_pickle, so read it back with the
# matching pandas reader (the same call Preprocessing.encode_data uses).
X_test = pd.read_pickle(dump_data_config.input_test_path)
X_test

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner
51139,49,122237,87790,741,91,2,20.88,24,0.34,High School,Full-time,Divorced,No,No,Home,No
71005,56,91569,131575,641,54,1,15.19,12,0.43,High School,Part-time,Divorced,Yes,Yes,Education,Yes
35684,23,106978,230993,453,73,1,18.67,12,0.78,Master's,Part-time,Divorced,No,Yes,Other,Yes
174087,26,63033,10804,326,118,1,14.71,24,0.41,High School,Part-time,Single,No,No,Business,Yes
137952,24,29665,21182,662,102,3,15.02,60,0.69,PhD,Unemployed,Single,No,Yes,Business,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79821,66,116497,65020,569,85,3,10.81,24,0.16,High School,Full-time,Divorced,Yes,No,Business,No
59328,19,97798,180407,520,83,1,13.52,12,0.65,Master's,Part-time,Single,No,No,Other,Yes
195077,32,58817,34741,838,16,4,2.21,48,0.34,High School,Full-time,Married,Yes,Yes,Auto,Yes
144904,28,60125,144685,687,89,4,10.46,24,0.64,Bachelor's,Self-employed,Married,No,Yes,Education,Yes


In [14]:
# Count missing values per test feature column.
X_test.isna().sum()

Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
NumCreditLines    0
InterestRate      0
LoanTerm          0
DTIRatio          0
Education         0
EmploymentType    0
MaritalStatus     0
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
dtype: int64

In [15]:
# This file is written with Series.to_pickle, so read it back with the
# matching pandas reader (the same call Preprocessing.encode_data uses).
y_test = pd.read_pickle(dump_data_config.output_test_path)
y_test

51139     0
71005     0
35684     0
174087    0
137952    0
         ..
79821     0
59328     0
195077    0
144904    0
137314    0
Name: Default, Length: 204278, dtype: int64

In [16]:
# Run the preprocessing stage: build its config, then encode, scale and persist.
try:
    config = ConfigurationManager()
    preprocessing_config = config.get_preprocessing_data_config()
    preprocessing = Preprocessing(config=preprocessing_config)
    preprocessing.encode_data()
except Exception as e:
    # logger.exception logs at ERROR level like before, but also records the
    # traceback. The bare raise re-raises the active exception unchanged.
    logger.exception(e)
    raise

[2024-07-13 08:50:42,573: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-13 08:50:42,579: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-13 08:50:42,584: INFO: common: created directory at: artifacts]
[2024-07-13 08:50:42,588: INFO: common: created directory at: artifacts/preprocessing]
[2024-07-13 08:50:42,591: INFO: 1105889720: Load training data from artifacts/data/X_train.pkl.]
[2024-07-13 08:50:42,654: INFO: 1105889720: Load testing data from artifacts/data/X_test.pkl.]
[2024-07-13 08:50:42,951: INFO: 1105889720: Encode categorical features.]
[2024-07-13 08:50:44,032: INFO: 1105889720: Scale numerical features.]
[2024-07-13 08:50:44,109: INFO: 1105889720: Combine encoded and scaled features.]
[2024-07-13 08:50:44,170: INFO: 1105889720: Dump the encoded training data into artifacts/preprocessing/X_train_encoded.pkl.]
[2024-07-13 08:50:44,210: INFO: 1105889720: Dump the encoded testing data into artifacts/preprocessing/X_test_en

In [17]:
# Preprocessing.encode_data dumps a (features, target) tuple to this path, so
# despite its name X_train_enc is a 2-tuple: the combined feature array
# followed by the y_train Series.
X_train_enc = joblib.load(preprocessing_config.encoded_train_path)
X_train_enc

(array([[-0.49614949, -1.27925561, -0.34566167, ...,  1.        ,
          1.        ,  0.        ],
        [-1.6988325 ,  1.59211763, -0.70186312, ...,  0.        ,
          0.        ,  1.        ],
        [-0.29570232, -1.47514003, -0.63525393, ...,  0.        ,
          1.        ,  0.        ],
        ...,
        [ 1.64195363, -1.60723381, -0.90683951, ...,  0.        ,
          0.        ,  1.        ],
        [-1.16430672, -1.42409214, -1.07679267, ...,  0.        ,
          1.        ,  0.        ],
        [-1.56520105, -0.12235815, -0.78158481, ...,  0.        ,
          0.        ,  1.        ]]),
 186723    0
 112233    0
 1145      0
 90811     0
 221324    0
          ..
 119879    0
 103694    0
 131932    0
 146867    0
 121958    0
 Name: Default, Length: 51069, dtype: int64)

In [18]:
# Preprocessing.encode_data dumps a (features, target) tuple to this path, so
# despite its name X_test_enc is a 2-tuple: the combined feature array
# followed by the y_test Series.
X_test_enc = joblib.load(preprocessing_config.encoded_test_path)
X_test_enc

(array([[ 0.3724549 ,  1.01615592, -0.56639567, ...,  0.        ,
          1.        ,  0.        ],
        [ 0.84016496,  0.22984921,  0.05294674, ...,  0.        ,
          0.        ,  1.        ],
        [-1.36475389,  0.62492551,  1.45922251, ...,  1.        ,
          0.        ,  1.        ],
        ...,
        [-0.76341238, -0.60988985, -1.31677813, ...,  0.        ,
          0.        ,  1.        ],
        [-1.03067527, -0.57635361,  0.23838877, ...,  0.        ,
          0.        ,  1.        ],
        [-0.02843943,  0.61941306,  1.59826867, ...,  0.        ,
          1.        ,  0.        ]]),
 51139     0
 71005     0
 35684     0
 174087    0
 137952    0
          ..
 79821     0
 59328     0
 195077    0
 144904    0
 137314    0
 Name: Default, Length: 204278, dtype: int64)