Import libraries and load the data file

import pandas as pd
import numpy as np

In [2]:
import pandas as pd
import numpy as np

In [3]:
def load_data(fname: str) -> pd.DataFrame:
    """
    Read a CSV file into a DataFrame and report its dimensions.

    Parameters:
    fname (str): Path of the CSV file to read.

    Returns:
    pd.DataFrame: The parsed contents of the file.

    Side effects:
    Prints the (rows, columns) shape of the loaded frame to stdout.
    """
    frame = pd.read_csv(fname)
    # Echo the shape so the reader can sanity-check the load at a glance.
    print(f"Data Shape: {frame.shape}")
    return frame

In [4]:
# Relative path to the raw CSV; it is resolved against the kernel's working directory.
FNAME = 'data/raw/credit_risk_dataset.csv'
# load_data prints the frame's shape as a side effect (see the output below).
data = load_data(FNAME)

Data Shape: (32581, 12)


In [5]:
# Plain-text preview of the first rows of the raw frame.
print(data.head())

   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_status  \
0    PERSONAL          D      35000          16.02            1   
1   EDUCATION          B       1000          11.14            0   
2     MEDICAL          C       5500          12.87            1   
3     MEDICAL          C      35000          15.23            1   
4     MEDICAL          C      35000          14.27            1   

   loan_percent_income cb_person_default_on_file  cb_person_cred_hist_length  
0                 0.59                         Y                           3  


screenshot

![Screenshot data head](screenshot_for_data_head.png)

In [6]:
def split_input_output(data: pd.DataFrame, target_col: str) -> tuple:
    """
    Separate a dataset into its feature columns and its target column.

    Parameters:
    data (pd.DataFrame): The full dataset, including the target column.
    target_col (str): Name of the column holding the target variable.

    Returns:
    tuple: A pair ``(X, y)`` where
        - X (pd.DataFrame): ``data`` without the target column.
        - y (pd.Series): The target column taken from ``data``.

    Side effects:
    Prints the shapes of the input frame, of X and of y to stdout.
    """
    print(f"Original data shape: {data.shape}")

    # drop() returns a new frame, so the caller's `data` is left untouched.
    features = data.drop(columns=[target_col])
    target = data[target_col]

    print(f"X data shape: {features.shape}")
    print(f"y data shape: {target.shape}")

    return features, target


In [7]:
# Name of the label column that is separated from the features.
TARGET_COL = 'loan_status'  

# X keeps every column except TARGET_COL; y is that single column.
X, y = split_input_output(data, TARGET_COL)

# Quick look at both pieces to confirm the target column left X.
print(X.head())
print(y.head())

Original data shape: (32581, 12)
X data shape: (32581, 11)
y data shape: (32581,)
   person_age  person_income person_home_ownership  person_emp_length  \
0          22          59000                  RENT              123.0   
1          21           9600                   OWN                5.0   
2          25           9600              MORTGAGE                1.0   
3          23          65500                  RENT                4.0   
4          24          54400                  RENT                8.0   

  loan_intent loan_grade  loan_amnt  loan_int_rate  loan_percent_income  \
0    PERSONAL          D      35000          16.02                 0.59   
1   EDUCATION          B       1000          11.14                 0.10   
2     MEDICAL          C       5500          12.87                 0.57   
3     MEDICAL          C      35000          15.23                 0.53   
4     MEDICAL          C      35000          14.27                 0.55   

  cb_person_default_on_file 

screenshot

![screenshot splitting x and y](screenshot_for_split_x_y.png)

In [8]:
from sklearn.model_selection import train_test_split

def split_train_test(X: pd.DataFrame, y: pd.Series, test_size: float, random_state: int = None) -> tuple:
    """
    Partition features and target into two subsets, stratified on the target.

    Parameters:
    X (DataFrame): The input features.
    y (Series): The target variable; also passed as the ``stratify`` argument.
    test_size (float): Forwarded to ``train_test_split`` as the size of the second (test) subset.
    random_state (int, optional): Forwarded to ``train_test_split`` to control the shuffle. Default is None.

    Returns:
    Tuple[DataFrame, DataFrame, Series, Series]:
        - First-subset ("train") features.
        - Second-subset ("test") features.
        - First-subset ("train") target.
        - Second-subset ("test") target.

    Side effects:
    Prints the shape of each of the four returned objects to stdout.
    """
    # Passing stratify=y asks train_test_split to stratify the split on the target.
    pieces = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )
    features_fit, features_holdout, target_fit, target_holdout = pieces

    print(f"X train shape: {features_fit.shape}")
    print(f"X test shape: {features_holdout.shape}")
    print(f"y train shape: {target_fit.shape}")
    print(f"y test shape: {target_holdout.shape}")

    return features_fit, features_holdout, target_fit, target_holdout


In [9]:
# Step 1: hold out test_size=0.2 of the rows as a "non-train" pool; the rest is the training set.
X_train, X_non_train, y_train, y_non_train = split_train_test(X, y, test_size=0.2, random_state=42)

# Step 2: split the held-out pool with test_size=0.5 into validation and test sets.
# Note: split_train_test always labels its printed shapes "train"/"test", so in the
# second group of printed lines "train" is the validation set and "test" is the test set.
X_valid, X_test, y_valid, y_test = split_train_test(X_non_train, y_non_train, test_size=0.5, random_state=42)


X train shape: (26064, 11)
X test shape: (6517, 11)
y train shape: (26064,)
y test shape: (6517,)
X train shape: (3258, 11)
X test shape: (3259, 11)
y train shape: (3258,)
y test shape: (3259,)


In [10]:
import joblib
import os

def serialize_data(data: object, path: str) -> None:
    """
    Serializes the given data and saves it to the specified path.

    The target directory must already exist. If it does not, a warning is
    printed and nothing is written (no exception is raised). A path with no
    directory component (e.g. ``"X_train.pkl"``) is written to the current
    working directory.

    Parameters:
    data (object): The instance or object that you want to serialize.
    path (str): The file path where the serialized data will be saved.

    Returns:
    None: This function does not return a value.
    """
    # os.path.dirname() returns "" for a bare filename, and os.path.exists("")
    # is False, so without the fallback a bare filename would be reported as a
    # missing directory and silently skipped. "" means the current directory.
    dir_name = os.path.dirname(path) or os.curdir
    if not os.path.exists(dir_name):
        # Best-effort by design: warn and skip instead of raising.
        print(f"Warning: The directory '{dir_name}' does not exist. Please create it before serialization.")
        return
    joblib.dump(data, path)

In [11]:
# Persist every split under data/interim/. serialize_data only prints a warning
# (it does not raise) when the target directory is missing, so make sure
# data/interim/ exists, otherwise nothing is written.
serialize_data(X_train, "data/interim/X_train.pkl")
serialize_data(y_train, "data/interim/y_train.pkl")
serialize_data(X_test, "data/interim/X_test.pkl")
serialize_data(y_test, "data/interim/y_test.pkl")
serialize_data(X_valid, "data/interim/X_valid.pkl")
serialize_data(y_valid, "data/interim/y_valid.pkl")

screenshot

![screenshot serialize working](screenshot_serialize_working.png)

In [12]:
import joblib

def deserialize_data(path: str) -> object:
    """
    Load a previously serialized object from disk.

    Parameters:
    path (str): Location of the serialized file.

    Returns:
    object: Whatever ``joblib.load`` reconstructs from that file.
    """
    return joblib.load(path)


In [13]:
# Reload each split from data/interim/ and rebind the in-memory variables to the
# deserialized copies (a round-trip check of the files written above).
X_train = deserialize_data("data/interim/X_train.pkl")
y_train = deserialize_data("data/interim/y_train.pkl")
X_test = deserialize_data("data/interim/X_test.pkl")
y_test = deserialize_data("data/interim/y_test.pkl")
X_valid = deserialize_data("data/interim/X_valid.pkl")
y_valid = deserialize_data("data/interim/y_valid.pkl")