In [1]:
import os

In [2]:
%pwd

'/root/pacmann/loan-default-project/notebooks'

In [3]:
# Move from notebooks/ up to the project root so that relative paths
# (e.g. config/config.yaml) resolve from there.
# Guarded on the current folder name so re-running this cell is a no-op
# instead of climbing one more directory on every execution.
if os.path.basename(os.getcwd()) == "notebooks":
    os.chdir("../")

In [4]:
%pwd

'/root/pacmann/loan-default-project'

In [5]:
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union

@dataclass(frozen=True)
class TrainingConfig:
    """Immutable settings consumed by the model-training stage.

    Path fields locate the artifacts read and written by training; the
    ``params_*`` fields are forwarded to the estimator constructor.
    """
    # Directory that holds the training-stage outputs.
    root_dir: Path
    # Dumped training inputs / targets.
    input_train_path: Path
    output_train_path: Path
    # Encoded train / test artifacts.
    encoded_train_path: Path
    encoded_test_path: Path
    # Destination of the fitted model.
    model_path: Path
    # Estimator hyper-parameters.
    params_max_iter: int
    params_solver: str
    params_c: float
    # A class-weight spec (string or mapping) or None for no weighting.
    # Previously annotated with the builtin function ``any``, which is not a type.
    params_class_weight: Optional[Union[str, dict]]

In [6]:
from LoanDefault.constants import CONFIG_FILE_PATH, PARAMS_FILE_PATH
from LoanDefault.utils.common import read_yaml, create_directories

In [7]:
class ConfigurationManager:
    """Load the project config and params YAML files and build config entities.

    On construction both YAML files are read and the artifacts root
    directory named in the config is created.
    """

    def __init__(self, 
                 config_filepath = CONFIG_FILE_PATH,
                 params_filepath = PARAMS_FILE_PATH):
        """Read both YAML files and create the artifacts root directory.

        Args:
            config_filepath: location of the configuration YAML.
            params_filepath: location of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])
    
    def get_training_config(self) -> TrainingConfig:
        """Assemble the training-stage settings from the loaded YAML content.

        Creates the training root directory as a side effect.

        Returns:
            config: TrainingConfig type
        """
        data_dump_config = self.config.dump_data
        encoded_data_config = self.config.encoded_data
        train_config = self.config.train_model
        train_params = self.params

        create_directories([train_config.root_dir])

        # The string "None" is mapped to Python's None; any other value is
        # passed through untouched.
        # NOTE(review): presumably this is how the params file spells
        # "no class weighting" - confirm against the params YAML.
        class_weight = None if train_params.CLASS_WEIGHT == "None" else train_params.CLASS_WEIGHT

        config = TrainingConfig(
            # Wrapped in Path like every other path field, matching the
            # type declared on TrainingConfig.root_dir.
            root_dir=Path(train_config.root_dir),
            input_train_path=Path(data_dump_config.input_train_path),
            output_train_path=Path(data_dump_config.output_train_path),
            encoded_train_path=Path(encoded_data_config.encoded_train_path),
            encoded_test_path=Path(encoded_data_config.encoded_test_path),
            model_path=Path(train_config.model_path),
            params_max_iter=train_params.MAX_ITER,
            params_solver=train_params.SOLVER,
            params_c=train_params.C,
            params_class_weight=class_weight
        )

        return config

In [8]:
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from LoanDefault import logger

class Training:
    """Training stage: fit a logistic regression and persist it to disk."""

    def __init__(self, config: TrainingConfig):
        """Keep the training settings for later use."""
        self.config = config

    def logistic_regression(self) -> None:
        """Fit a LogisticRegression on the encoded training data and dump it.

        The artifact at ``config.encoded_train_path`` is loaded with joblib
        and unpacked into two objects (features, labels). The fitted model
        is written to ``config.model_path``.
        """
        cfg = self.config

        logger.info(f"Load encoded training data from {cfg.encoded_train_path}.")
        features, labels = joblib.load(cfg.encoded_train_path)

        logger.info("Train the logistic regression model.")
        hyperparams = {
            "solver": cfg.params_solver,
            "max_iter": cfg.params_max_iter,
            "C": cfg.params_c,
            "class_weight": cfg.params_class_weight,
        }
        classifier = LogisticRegression(**hyperparams)
        classifier.fit(features, labels)

        logger.info(f"Dump the trained model into {cfg.model_path}.")
        joblib.dump(classifier, cfg.model_path)


In [9]:
# Run the training stage end to end: load the configuration, build the
# stage object, then fit and persist the model.
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    training = Training(config=training_config)
    training.logistic_regression()
except Exception as e:
    # logger.exception logs at ERROR level and attaches the traceback;
    # the bare raise re-raises the active exception unchanged so the cell
    # still fails visibly.
    logger.exception(e)
    raise

[2024-07-13 08:50:54,179: INFO: common: yaml file: config/config.yaml loaded successfully]
[2024-07-13 08:50:54,187: INFO: common: yaml file: metrics/params.yaml loaded successfully]
[2024-07-13 08:50:54,200: INFO: common: created directory at: artifacts]
[2024-07-13 08:50:54,203: INFO: common: created directory at: artifacts/models]
[2024-07-13 08:50:54,205: INFO: 1296080503: Load encoded training data from artifacts/preprocessing/X_train_encoded.pkl.]
[2024-07-13 08:50:55,172: INFO: 1296080503: Train the logistic regression model.]
[2024-07-13 08:50:55,467: INFO: 1296080503: Dump the trained model into artifacts/models/logistic_regression.pkl.]


In [10]:
import pandas as pd
import joblib

X_train_data = joblib.load(training_config.encoded_train_path)
y_train = joblib.load(training_config.output_train_path)
model = joblib.load(training_config.model_path)

# Training.logistic_regression unpacks this same artifact as
# (features, labels), so the second tuple element is the label vector,
# not a feature frame.
if isinstance(X_train_data, tuple):
    X_train_enc = X_train_data[0]
    y_train_enc = X_train_data[1] if len(X_train_data) > 1 else None
else:
    X_train_enc = X_train_data
    y_train_enc = None

# Reuse the row index stored with the artifact when one exists. Array or
# sparse inputs have no ``.index``; in that case pandas falls back to a
# default RangeIndex instead of raising AttributeError.
index_source = X_train_enc if y_train_enc is None else y_train_enc
y_pred = pd.Series(model.predict(X_train_enc), index=getattr(index_source, "index", None))

# The report in the next cell compares y_train with y_pred element-wise.
assert len(y_pred) == len(y_train), "prediction and target lengths differ"

# Print the predictions
print(y_pred)


186723    0
112233    0
1145      0
90811     0
221324    0
         ..
119879    0
103694    0
131932    0
146867    0
121958    0
Length: 51069, dtype: int64


In [11]:
# Per-class precision / recall / F1 of the model on the training set.
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94     45232
           1       0.61      0.03      0.05      5837

    accuracy                           0.89     51069
   macro avg       0.75      0.51      0.49     51069
weighted avg       0.86      0.89      0.84     51069

