In [206]:
# Cargar el dataset
import pandas as pd
from sklearn.preprocessing import StandardScaler

df = pd.read_csv('credit_risk_dataset.csv')

In [207]:
# Eliminar Outliers de las columnas person_age y person_emp_length
df2 = pd.DataFrame(df[(df['person_age'] != 144) & (df['person_age'] != 123)])
df3 = pd.DataFrame(df2[df2['person_emp_length'] != 123])

In [209]:
# Detectar valores nulos de la columna person_emp_length
column = 'person_emp_length'
null_values = df3[df3[column].isnull()]
display(null_values)

null_values_in_column = df3[column].isnull()
display(null_values_in_column.sum())

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length


0

In [208]:
# Completar los valores nulos de la columna person_emp_length utilizando la mediana

median_emp_length = df3['person_emp_length'].median()
display(median_emp_length)
df3['person_emp_length'].fillna(median_emp_length, inplace=True)


4.0

In [None]:
# Completar valores vacios de person_emp_length utilizando informacion de loan_grade

# Calcular la mediana de la columna person_emp_length para cada categoría de loan_grade
median_emp_length_by_grade = df.groupby('loan_grade')['person_emp_length'].median()

# Función de imputación basada en la mediana por categoría
def impute_emp_length(row):
    if pd.isnull(row['person_emp_length']):
        return median_emp_length_by_grade.get(row['loan_grade'], row['person_emp_length'])
    return row['person_emp_length']

# Aplicar la función de imputación a los valores faltantes
df['person_emp_length'] = df.apply(impute_emp_length, axis=1)


In [211]:
# Detectar valores nulos de la columna loan_int_rate
column = 'loan_int_rate'
null_values = df3[df3[column].isnull()]
display(null_values)

null_values_in_column = df3[column].isnull()
display(null_values_in_column.sum())

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length


0

In [13]:
# Completar los valores nulos de la columna loan_int_rate utilizando la mediana

median_loan_int_rate = df['loan_int_rate'].median()
print(median_loan_int_rate)
df['loan_int_rate'].fillna(median_loan_int_rate, inplace=True)

10.99


In [210]:
# Completar valores vacios de loan_int_rate utilizando informacion de loan_grade

# Calcular la mediana de la columna person_emp_length para cada categoría de loan_grade
median_loan_int_rate_by_grade = df.groupby('loan_grade')['loan_int_rate'].median()

display(median_loan_int_rate_by_grade)

# Función de imputación basada en la mediana por categoría
def impute_int_rate(row):
    if pd.isnull(row['loan_int_rate']):
        return median_loan_int_rate_by_grade.get(row['loan_grade'], row['loan_int_rate'])
    return row['loan_int_rate']

# Aplicar la función de imputación a los valores faltantes
df3['loan_int_rate'] = df.apply(impute_int_rate, axis=1)

loan_grade
A     7.490
B    10.990
C    13.480
D    15.310
E    16.820
F    18.535
G    20.160
Name: loan_int_rate, dtype: float64

In [18]:
loan_intent_groups= {
    'DEBTCONSOLIDATION': 1,
    'EDUCATION': 2,
    'HOMEIMPROVEMENT': 3,
    'MEDICAL': 4,
    'PERSONAL': 5,
    'VENTURE': 6
}

person_home_ownership_groups = {
    'MORTGAGE': 1,
    'OTHER': 2,
    'OWN': 3,
    'RENT': 4
}

loan_grade_groups = {
    'A': 1,
    'B': 2,
    'C': 3,
    'D': 4,
    'E': 5,
    'F': 6,
    'G': 7
}

cb_person_default_on_file_groups = {
    'N': 1,
    'Y': 0
}

df['person_home_ownership'] = df['person_home_ownership'].map(person_home_ownership_groups)
df['loan_intent'] = df['loan_intent'].map(loan_intent_groups)
df['loan_grade'] = df['loan_grade'].map(loan_grade_groups)
df['cb_person_default_on_file'] = df['cb_person_default_on_file'].map(cb_person_default_on_file_groups)

In [19]:
display(df.groupby('person_home_ownership').size().reset_index(name='counts'))
# display(df)

Unnamed: 0,person_home_ownership,counts
0,1,13441
1,2,107
2,3,2584
3,4,16442


In [212]:
# En esta parte se hace el proceso de estandarizar y hacer One Hot Endoding en el dataset

# Eliminar columnas categoricas del dataset
person_home_ownership = df.pop('person_home_ownership')
loan_intent = df.pop('loan_intent')
loan_grade = df.pop('loan_grade')
cb_person_default_on_file = df.pop('cb_person_default_on_file')

numeric_columns = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_status',
    'loan_percent_income',
    'cb_person_cred_hist_length'
]

# Estandarizar las columnas numéricas
scaler = StandardScaler()
numeric_df = pd.DataFrame(scaler.fit_transform(df), columns=numeric_columns)

# One Hot Encoding sobre las columnas categóricas
person_home_ownership_encoded = pd.get_dummies(person_home_ownership, prefix='person_home_ownership', dtype=int)
loan_intent_encoded = pd.get_dummies(loan_intent, prefix='loan_intent', dtype=int)
loan_grade_encoded = pd.get_dummies(loan_grade, prefix='loan_grade', dtype=int)
cb_person_default_on_file_encoded = pd.get_dummies(cb_person_default_on_file, prefix='cb_person_default_on_file', dtype=int)

# Concatenar las columnas estandarizadas y codificadas al DataFrame original
final_df = pd.concat([numeric_df, person_home_ownership_encoded, loan_intent_encoded, loan_grade_encoded, cb_person_default_on_file_encoded], axis=1)
display(final_df)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,-0.903374,-0.114143,28.535538,4.019404,1.545580,1.893069,3.931411,-0.691554,0,0,...,0,0,0,0,1,0,0,0,0,1
1,-1.060904,-0.911147,0.050769,-1.358650,0.039595,-0.528243,-0.657458,-0.938167,0,0,...,0,0,1,0,0,0,0,0,1,0
2,-0.430783,-0.911147,-0.914816,-0.646849,0.573479,1.893069,3.744110,-0.691554,1,0,...,0,0,0,1,0,0,0,0,1,0
3,-0.745843,-0.009274,-0.190627,4.019404,1.301784,1.893069,3.369508,-0.938167,0,0,...,0,0,0,1,0,0,0,0,1,0
4,-0.588313,-0.188358,0.774958,4.019404,1.005524,1.893069,3.556809,-0.444942,0,0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,4.610190,-0.210945,-0.914816,-0.599395,0.662974,-0.528243,-0.563808,5.966992,1,0,...,0,0,0,1,0,0,0,0,1,0
32577,4.137599,0.870011,-0.190627,1.271060,-1.086807,-0.528243,-0.189207,3.254251,1,0,...,0,1,0,0,0,0,0,0,1,0
32578,5.870433,0.160129,-0.432024,4.019404,-0.006695,1.893069,2.713956,5.473767,0,0,...,0,0,1,0,0,0,0,0,1,0
32579,4.452660,1.354021,0.050769,0.855843,0.144521,-0.528243,-0.657458,4.980541,1,0,...,0,0,1,0,0,0,0,0,1,0
