In [2]:
# Cargar el dataset
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Read the raw credit-risk records from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='credit_risk_dataset.csv')

In [3]:
# Drop outlier rows: person_age recorded as 123 or 144, or person_emp_length
# recorded as 123. The index is renumbered from 0 afterwards.
implausible_age = df['person_age'].isin([123, 144])
implausible_emp_length = df['person_emp_length'].eq(123)
df = df[~(implausible_age | implausible_emp_length)].reset_index(drop=True)

In [6]:
# Locate the rows where person_emp_length is missing and report how many there are.
column = 'person_emp_length'

# Boolean mask: True where the value is missing.
null_values_in_column = df[column].isna()
# The full rows selected by that mask.
null_values = df.loc[null_values_in_column]

display(null_values)
display(null_values_in_column.sum())

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length


0

In [5]:
# Fill missing person_emp_length values with the column's overall median.

median_emp_length = df['person_emp_length'].median()
display(median_emp_length)
# Assign the filled column back rather than calling fillna(inplace=True) on the
# df['person_emp_length'] selection: that chained in-place call warns in recent
# pandas versions and does not modify df when Copy-on-Write is enabled.
df['person_emp_length'] = df['person_emp_length'].fillna(median_emp_length)


4.0

In [None]:
# Fill missing person_emp_length values using loan_grade information.
#
# NOTE(review): in file order, the previous cell already fills every missing
# person_emp_length with the overall median. If that cell has run, no nulls
# remain and this step changes nothing - confirm which strategy is intended.

# Median person_emp_length for each loan_grade category.
median_emp_length_by_grade = df.groupby('loan_grade')['person_emp_length'].median()


def impute_emp_length(row):
    """Return the person_emp_length value to use for a single row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the 'person_emp_length' and 'loan_grade' keys.

    Returns
    -------
    The row's own person_emp_length when it is not null; otherwise the median
    for the row's loan_grade. If the grade has no entry in
    median_emp_length_by_grade, the original (null) value is returned.
    """
    if pd.isnull(row['person_emp_length']):
        return median_emp_length_by_grade.get(row['loan_grade'], row['person_emp_length'])
    return row['person_emp_length']


# Vectorized equivalent of df.apply(impute_emp_length, axis=1): look up each
# row's grade median and use it only where person_emp_length is missing.
# Grades without a median map to NaN, so those rows stay missing, as before.
df['person_emp_length'] = df['person_emp_length'].fillna(
    df['loan_grade'].map(median_emp_length_by_grade)
)


In [9]:
# Locate the rows where loan_int_rate is missing and report how many there are.
column = 'loan_int_rate'

# Boolean mask: True where the value is missing.
null_values_in_column = df[column].isna()
# The full rows selected by that mask.
null_values = df.loc[null_values_in_column]

display(null_values)
display(null_values_in_column.sum())

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length


0

In [13]:
# Fill missing loan_int_rate values with the column's overall median.

median_loan_int_rate = df['loan_int_rate'].median()
print(median_loan_int_rate)
# Assign the filled column back rather than calling fillna(inplace=True) on the
# df['loan_int_rate'] selection: that chained in-place call warns in recent
# pandas versions and does not modify df when Copy-on-Write is enabled.
df['loan_int_rate'] = df['loan_int_rate'].fillna(median_loan_int_rate)

10.99


In [8]:
# Fill missing loan_int_rate values using loan_grade information.
#
# NOTE(review): in file order, the previous cell already fills every missing
# loan_int_rate with the overall median. If that cell has run, no nulls remain
# and this step changes nothing - confirm which strategy is intended.

# Median loan_int_rate for each loan_grade category.
median_loan_int_rate_by_grade = df.groupby('loan_grade')['loan_int_rate'].median()

display(median_loan_int_rate_by_grade)


def impute_int_rate(row):
    """Return the loan_int_rate value to use for a single row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the 'loan_int_rate' and 'loan_grade' keys.

    Returns
    -------
    The row's own loan_int_rate when it is not null; otherwise the median for
    the row's loan_grade. If the grade has no entry in
    median_loan_int_rate_by_grade, the original (null) value is returned.
    """
    if pd.isnull(row['loan_int_rate']):
        return median_loan_int_rate_by_grade.get(row['loan_grade'], row['loan_int_rate'])
    return row['loan_int_rate']


# Vectorized equivalent of df.apply(impute_int_rate, axis=1): look up each
# row's grade median and use it only where loan_int_rate is missing.
# Grades without a median map to NaN, so those rows stay missing, as before.
df['loan_int_rate'] = df['loan_int_rate'].fillna(
    df['loan_grade'].map(median_loan_int_rate_by_grade)
)

loan_grade
A     7.490
B    10.990
C    13.480
D    15.310
E    16.820
F    18.535
G    20.160
Name: loan_int_rate, dtype: float64

In [10]:
# Standardize the numeric columns and one-hot encode the categorical columns.

# Pull the categorical columns out of df. pop() removes each column from df in
# place, so re-running this cell without rebuilding df raises KeyError.
person_home_ownership = df.pop('person_home_ownership')
loan_intent = df.pop('loan_intent')
loan_grade = df.pop('loan_grade')
cb_person_default_on_file = df.pop('cb_person_default_on_file')

numeric_columns = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_status',
    'loan_percent_income',
    'cb_person_cred_hist_length'
]

# Standardize the numeric columns.
# Select the columns by name so the scaled values are labelled correctly even
# if df's column order differs from numeric_columns, and keep df's index so the
# column-wise concat below lines up row-for-row with the encoded frames.
# NOTE(review): loan_status is scaled together with the other columns; if it is
# the prediction target it probably should be left unscaled - confirm.
scaler = StandardScaler()
numeric_df = pd.DataFrame(
    scaler.fit_transform(df[numeric_columns]),
    columns=numeric_columns,
    index=df.index,
)

# One-hot encode each categorical column as 0/1 integer indicator columns.
person_home_ownership_encoded = pd.get_dummies(person_home_ownership, prefix='person_home_ownership', dtype=int)
loan_intent_encoded = pd.get_dummies(loan_intent, prefix='loan_intent', dtype=int)
loan_grade_encoded = pd.get_dummies(loan_grade, prefix='loan_grade', dtype=int)
cb_person_default_on_file_encoded = pd.get_dummies(cb_person_default_on_file, prefix='cb_person_default_on_file', dtype=int)

# Join the standardized and the encoded columns side by side into one DataFrame.
final_df = pd.concat(
    [numeric_df, person_home_ownership_encoded, loan_intent_encoded, loan_grade_encoded, cb_person_default_on_file_encoded],
    axis=1
)
# to_csv writes the row index as the first, unnamed column by default.
final_df.to_csv('estandarizado.csv')
display(final_df)


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,-1.082763,-1.071336,0.060140,-1.358831,0.039302,-0.528268,-0.657607,-0.938403,0,0,...,0,0,1,0,0,0,0,0,1,0
1,-0.438110,-1.071336,-0.944602,-0.646823,0.577860,1.892979,3.745070,-0.691721,1,0,...,0,0,0,1,0,0,0,0,1,0
2,-0.760436,-0.007205,-0.191046,4.020786,1.312541,1.892979,3.370374,-0.938403,0,0,...,0,0,0,1,0,0,0,0,1,0
3,-0.599273,-0.218508,0.813697,4.020786,1.013688,1.892979,3.557722,-0.445040,0,0,...,0,0,0,1,0,0,0,0,0,1
4,-1.082763,-1.065625,-0.693417,-1.121495,-1.205920,1.892979,0.747502,-0.938403,0,0,...,1,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32569,4.719111,-0.245159,-0.944602,-0.599355,0.668139,-0.528268,-0.563933,5.968678,1,0,...,0,0,0,1,0,0,0,0,1,0
32570,4.235621,1.030275,-0.191046,1.271644,-1.096963,-0.528268,-0.189237,3.255182,1,0,...,0,1,0,0,0,0,0,0,1,0
32571,6.008416,0.192677,-0.442231,4.020786,-0.007394,1.892979,2.714656,5.475315,0,0,...,0,0,1,0,0,0,0,0,1,0
32572,4.557948,1.601365,0.060140,0.856306,0.145145,-0.528268,-0.657607,4.981952,1,0,...,0,0,1,0,0,0,0,0,1,0
