# Credit Risk Analysis

## Import packages

1. `sys`: System-specific parameters and functions.
2. `reload` (from `imp`): Reload previously imported modules.
3. `matplotlib.pyplot`: Data visualization.
4. `numpy`: Numerical computing.
5. `pandas`: Data manipulation and analysis.
6. `seaborn`: Statistical data visualization.
7. `SimpleImputer` (from `sklearn.impute`): Handling missing data.
8. `LogisticRegression` (from `sklearn.linear_model`): Logistic regression for classification.

In [30]:
import sys

sys.path.append("..")

from imp import reload

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

from helper_functions import config, data_utils, evaluation, plot, preprocessing

In [31]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore', category = FutureWarning)

## Load normalized data set


In this notebook, we are going to encode a previously normalized `dataset` followed by the creation of the `ML` model.

In [32]:
app_normalized = data_utils.get_normalized_model()
app_normalized['TARGET_LABEL_BAD=1'] = app_normalized.pop('TARGET_LABEL_BAD=1')

In [33]:
app_normalized = preprocessing.categorical_columns(app_normalized)
app_normalized.head()

Unnamed: 0,PAYMENT_DAY,APPLICATION_SUBMISSION_TYPE,SEX,MARITAL_STATUS,QUANT_DEPENDANTS,RESIDENCIAL_STATE,FLAG_RESIDENCIAL_PHONE,MONTHS_IN_RESIDENCE,FLAG_EMAIL,COMPANY,...,PRODUCT,AGE,HAS_DEPENDANTS,HAS_RESIDENCE,MONTHLY_INCOMES_TOT,HAS_CARDS,HAS_BANKING_ACCOUNTS,HAS_PERSONAL_ASSETS,HAS_CARS,TARGET_LABEL_BAD=1
0,1 - 14,Web,F,other,1,RN,Y,+ 1 year,1,N,...,1,26 - 35,True,True,[650 - 1320],True,False,False,False,1
1,15 - 30,Carga,F,married,0,RJ,Y,0 - 6 months,1,Y,...,1,26 - 35,False,True,[650 - 1320],False,False,False,False,1
2,1 - 14,Web,F,married,0,RN,Y,+ 1 year,1,N,...,1,26 - 35,False,True,[0 - 650],False,False,False,False,0
3,15 - 30,Web,F,married,0,PE,N,+ 1 year,1,N,...,1,> 60,False,False,[0 - 650],False,False,False,False,0
4,1 - 14,Web,M,married,0,RJ,Y,6 months - 1 year,1,N,...,1,46 - 60,False,True,[650 - 1320],False,False,False,False,1


In [36]:
print(app_normalized.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49935 entries, 0 to 49934
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   PAYMENT_DAY                  49935 non-null  category
 1   APPLICATION_SUBMISSION_TYPE  49935 non-null  category
 2   SEX                          49935 non-null  category
 3   MARITAL_STATUS               49935 non-null  category
 4   QUANT_DEPENDANTS             49935 non-null  category
 5   RESIDENCIAL_STATE            49935 non-null  category
 6   FLAG_RESIDENCIAL_PHONE       49935 non-null  category
 7   MONTHS_IN_RESIDENCE          49935 non-null  category
 8   FLAG_EMAIL                   49935 non-null  category
 9   COMPANY                      49935 non-null  category
 10  FLAG_PROFESSIONAL_PHONE      49935 non-null  category
 11  PRODUCT                      49935 non-null  category
 12  AGE                          49935 non-null  category
 13  H

In [85]:
reload(preprocessing)
print(f'x1: {a1}')
print(f'x2: {a2}')

 # x2.remove('RESIDENCIAL_STATE') convert to 


x1: ['PAYMENT_DAY', 'APPLICATION_SUBMISSION_TYPE', 'SEX', 'FLAG_RESIDENCIAL_PHONE', 'FLAG_EMAIL', 'COMPANY', 'FLAG_PROFESSIONAL_PHONE', 'HAS_DEPENDANTS', 'HAS_RESIDENCE', 'HAS_CARDS', 'HAS_BANKING_ACCOUNTS', 'HAS_PERSONAL_ASSETS', 'HAS_CARS', 'TARGET_LABEL_BAD=1']
x2: ['MARITAL_STATUS', 'QUANT_DEPENDANTS', 'RESIDENCIAL_STATE', 'MONTHS_IN_RESIDENCE', 'PRODUCT', 'AGE', 'MONTHLY_INCOMES_TOT']


In [84]:
print(x2)

['MARITAL_STATUS', 'QUANT_DEPENDANTS', 'MONTHS_IN_RESIDENCE', 'PRODUCT', 'AGE', 'MONTHLY_INCOMES_TOT']


### Encoding

In [74]:
import pandas as pd
from sklearn.preprocessing import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder

def encoding(df, get_dummies=False, target='TARGET_LABEL_BAD=1_1'):
    
    if get_dummies:
        cols_to_encode = [col for col in df.columns if col != target]
        df_concat_encoded = pd.get_dummies(data=df, columns=cols_to_encode, drop_first=True)
        return(df_concat_encoded)
    else:
        binary_columns, non_binary_columns = preprocessing.separate_binary_columns(app_normalized)
        non_binary_columns.remove('RESIDENCIAL_STATE') # this column will have category encoder due to amount of unic values

        
        oh_encoder = OneHotEncoder(drop='first', sparse=False) 
        encoded_data = oh_encoder.fit_transform(df[non_binary_columns])

        # Convertir el array numpy resultante en un DataFrame y asignar nombres a las columnas.
        encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names(cols_to_encode))

        
        
        df1 = pd.get_dummies(data=df, columns=non_binary_columns, drop_first=True)
        encoder = ce.BinaryEncoder(cols=['column_to_binary_encode'])
        
        df_encoded_binary = encoder.fit_transform(df[['column_to_binary_encode']])

        










In [None]:
import pandas as pd
import category_encoders as ce

# Supongamos que df es tu DataFrame y 'column_to_binary_encode' es el nombre de la columna con 27 valores únicos.

# Paso 1: Aplicar One-Hot Encoding a todas las columnas categóricas, excepto 'column_to_binary_encode'.
cols_to_one_hot_encode = [col for col in df.columns if col != 'column_to_binary_encode']
df_encoded_one_hot = pd.get_dummies(data=df, columns=cols_to_one_hot_encode, drop_first=True)

# Paso 2: Aplicar Binary Encoding solo a la columna 'column_to_binary_encode'.
encoder = ce.BinaryEncoder(cols=['column_to_binary_encode'])
df_encoded_binary = encoder.fit_transform(df[['column_to_binary_encode']])

# Combinar los DataFrames resultantes.
df_final = pd.concat([df_encoded_one_hot, df_encoded_binary], axis=1)


In [75]:
x = encoding(app_normalized, True)

KeyError: 'TARGET_LABEL_BAD=1_1'

In [71]:
x 


Unnamed: 0,PAYMENT_DAY_15 - 30,APPLICATION_SUBMISSION_TYPE_Web,SEX_M,MARITAL_STATUS_other,MARITAL_STATUS_single,QUANT_DEPENDANTS_1,QUANT_DEPENDANTS_2,QUANT_DEPENDANTS_3,RESIDENCIAL_STATE_AL,RESIDENCIAL_STATE_AM,...,HAS_RESIDENCE_True,MONTHLY_INCOMES_TOT_[1320 - 3323],MONTHLY_INCOMES_TOT_[3323 - 8560],MONTHLY_INCOMES_TOT_[650 - 1320],MONTHLY_INCOMES_TOT_[> 8560],HAS_CARDS_True,HAS_BANKING_ACCOUNTS_True,HAS_PERSONAL_ASSETS_True,HAS_CARS_True,TARGET_LABEL_BAD=1_1
0,0,1,0,1,0,1,0,0,0,0,...,1,0,0,1,0,1,0,0,0,1
1,1,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49930,0,0,0,0,1,0,1,0,0,0,...,1,1,0,0,0,0,1,0,1,1
49931,1,0,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
49932,0,1,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
49933,0,1,0,0,1,1,0,0,0,0,...,1,1,0,0,0,0,0,0,0,1


In [65]:
print(x)

None


In [40]:
# split in sets
reload(data_utils)
train_set, test_set, val_set = data_utils.get_feature_in_set(app_normalized)

In [45]:
# Print shape of input data
print("Input train data shape: ", train_set.shape)
print("Input val data shape: ", test_set.shape)
print("Input test data shape: ", val_set.shape, "\n")

Input train data shape:  (33955, 21)
Input val data shape:  (9987, 21)
Input test data shape:  (5993, 21) 



In [52]:
x = pd.get_dummies(train_set)
print(len(x.columns))

73


In [53]:
x = pd.get_dummies(train_set, drop_first = True)
len(x.columns)

58

In [54]:
x.head()

Unnamed: 0,HAS_DEPENDANTS,HAS_RESIDENCE,HAS_CARDS,HAS_BANKING_ACCOUNTS,HAS_PERSONAL_ASSETS,HAS_CARS,PAYMENT_DAY_15 - 30,APPLICATION_SUBMISSION_TYPE_Web,SEX_M,MARITAL_STATUS_other,...,AGE_26 - 35,AGE_36 - 45,AGE_46 - 60,AGE_< 18,AGE_> 60,MONTHLY_INCOMES_TOT_[1320 - 3323],MONTHLY_INCOMES_TOT_[3323 - 8560],MONTHLY_INCOMES_TOT_[650 - 1320],MONTHLY_INCOMES_TOT_[> 8560],TARGET_LABEL_BAD=1_1
15631,False,True,True,False,False,False,0,1,1,0,...,0,0,1,0,0,0,0,0,0,0
1480,False,True,False,True,False,True,1,0,0,0,...,0,0,1,0,0,0,0,1,0,1
22852,True,True,False,False,False,False,0,1,1,0,...,0,1,0,0,0,0,0,1,0,1
24418,True,False,False,True,False,False,1,0,1,0,...,0,1,0,0,0,0,0,1,0,0
7103,False,True,False,True,False,True,1,0,1,0,...,1,0,0,0,0,0,0,1,0,0


In [None]:
X_train, y_train, X_test, y_test, X_val, y_val = xxx(train, test, val)

In [48]:
def preprocess_data(train_df, test_df, val_df, get_dummies='False', target='TARGET_LABEL_BAD=1_1'):

    # Print shape of input data
    print("Input train data shape: ", train_df.shape)
    print("Input test data shape: ", test_df.shape)
    print("Input val data shape: ", val_df.shape, "\n")
    
    # Utilizando una función lambda para extraer características y etiquetas
    extract_features_labels = lambda df, target: (df.drop(columns=target), df[target])

    # Aplicar la función lambda a los DataFrames
    X_train, y_train = extract_features_labels(train_df, target)
    X_test, y_test = extract_features_labels(test_df, target)
    X_val, y_val = extract_features_labels(val_df, target)


    if get_dummies:
        pd.get_dummies(X_train)
        pd.get_dummies(X_test)
        pd.get_dummies(X_val)
    else: 

    # Make a copy of the dataframes
    working_train_df = train_df.copy()
    working_test_df = test_df.copy()
    working_val_df = val_df.copy()

    # Taking the columns that contain objects.
    category_columns = working_train_df.select_dtypes(exclude="number").columns.to_list()
    print("cat_cols: ", working_train_df.select_dtypes(exclude="number").columns)
    numeric_columns = working_train_df.select_dtypes(include="number").columns.to_list()
    numeric_columns.remove("TARGET_LABEL_BAD=1")
    print(numeric_columns)
    
    # Filtering the dataset.
    aux_dataframe = working_train_df[category_columns].copy()
    mask_2 = (aux_dataframe.nunique() == 2).values
    cat_2 = aux_dataframe.loc[:, mask_2].columns
    print(cat_2)
    mask_gt_2 = (aux_dataframe.nunique() > 2).values
    cat_gt_2 = aux_dataframe.loc[:, mask_gt_2].columns
    print(cat_gt_2)

    numeric_transformer = Pipeline(
        steps=[
            # ("imputer", SimpleImputer(strategy='median')), 
            ("scaler", RobustScaler())
        ]
    )
    categorical_transformer = Pipeline(
        steps=[
            # ("imputer", SimpleImputer(strategy='most_frequent')),
            ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]
    )
    bincategorical_transformer = Pipeline(
        steps=[
            # ("imputer", SimpleImputer(strategy='most_frequent')),
            ("encoder", OrdinalEncoder()),
        ]
    )
    
    ct_preprocessing = ColumnTransformer(transformers=[
        ('transform_cat', categorical_transformer, cat_gt_2),
        ('transform_bin', bincategorical_transformer, cat_2),
        ('transform_num', numeric_transformer, numeric_columns),
        
    ], remainder='passthrough')

    ct_preprocessing.fit(working_train_df)
    # get columns of new dataframe
    columns_name = ct_preprocessing.get_feature_names_out()

    working_train_df = ct_preprocessing.transform(working_train_df)
    working_val_df = ct_preprocessing.transform(working_val_df)
    working_test_df = ct_preprocessing.transform(working_test_df)

    return working_train_df, working_val_df, working_test_df, columns_name

SyntaxError: invalid syntax (2652394240.py, line 9)

In [None]:
def get_feature_target(
    app_train: pd.DataFrame, app_val: pd.DataFrame, app_test: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series,pd.DataFrame, pd.Series]:
    """
    Separates our train and test datasets columns between Features
    (the input to the model) and Targets (what the model has to predict with the
    given features).

    Arguments:
        app_train : pd.DataFrame
            Training datasets
        app_test : pd.DataFrame
            Test datasets

    Returns:
        X_train : pd.DataFrame
            Training features
        y_train : pd.Series
            Training target
        X_test : pd.DataFrame
            Test features
        y_test : pd.Series
            Test target
    """
    X_train, y_train, X_val, y_val,X_test, y_test = None, None, None, None,None,None

    # training
    X_train = app_train[:,:-1]
    y_train = app_train[:,-1:]

    # validation
    X_val = app_val[:,:-1]
    y_val = app_val[:,-1:]

    
    # testing
    X_test = app_test[:,:-1]
    y_test = app_test[:,-1:]

    return X_train, y_train, X_val, y_val, X_test,y_test