In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw training and test tables from the working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Keep the test-set IDs in their own Series. They are copied from the frame
# that was just loaded instead of parsing "test.csv" a second time; the copy
# keeps test_ids independent of any later change to test_data.
test_ids = test_data["ID"].copy()

# Remove these columns from the training frame only; test_data is left as loaded.
train_data = train_data.drop(columns=["ID", "Customer_ID", "Month", "Name", "Number"])

print(train_data.head())

In [None]:
# Fill missing values using the median of each column that is numeric at this
# point; columns of other dtypes have no median entry and are left untouched.
numeric_medians = train_data.median(numeric_only=True)
train_data = train_data.fillna(numeric_medians)
print(train_data.head())

In [None]:
# Columns to be parsed as numbers in both the training and the test frame.
underscore_numeric_cols = [
    "Total_Current_Loans",
    "Current_Debt_Outstanding",
    "Income_Annual",
    "Credit_Limit",
    "Age",
    "Total_Credit_Cards",
    "Total_Bank_Accounts",
    "Delay_from_due_date",
]

# For each column: render the values as text, delete every "_" character,
# then parse as numeric. Anything that still cannot be parsed becomes NaN.
for col in underscore_numeric_cols:
    for frame in (train_data, test_data):
        as_text = frame[col].astype(str)
        without_underscores = as_text.str.replace("_", "", regex=False)
        frame[col] = pd.to_numeric(without_underscores, errors="coerce")
train_data

In [None]:
# Reduce Credit_History_Age to the first run of digits found in each entry and
# store it as a float (entries without any digits become NaN).
#
# - The pattern is a raw string: "\d" inside a normal string literal is an
#   invalid escape sequence and triggers a warning on current Python versions.
# - expand=False makes str.extract return a Series rather than a one-column
#   DataFrame, which is the natural shape for a single-column assignment.
# - astype(str) lets the cell be re-run after the column has already been
#   converted to float (the .str accessor rejects a float column).
for frame in (train_data, test_data):
    frame["Credit_History_Age"] = (
        frame["Credit_History_Age"]
        .astype(str)
        .str.extract(r"(\d+)", expand=False)
        .astype(float)
    )
train_data["Credit_History_Age"]

In [None]:
# Treat +/- infinity as missing, then fill the gaps in numeric columns with
# each column's median.
train_data = train_data.replace([np.inf, -np.inf], np.nan)
numeric_medians = train_data.median(numeric_only=True)
train_data = train_data.fillna(numeric_medians)

# Replace the Credit_Score labels with integer class codes. The fitted encoder
# stays bound to `label_encoder`.
label_encoder = LabelEncoder()
encoded_scores = label_encoder.fit_transform(train_data["Credit_Score"])
train_data["Credit_Score"] = encoded_scores

In [None]:
# Preview the first five rows of train_data.
train_data.head(n=5)

In [None]:
# Relabel every Profession value that begins with an underscore as 'Unemployed'
# (missing values do not match and are left as they are).
underscore_mask = train_data['Profession'].str.startswith('_', na=False)
train_data.loc[underscore_mask, 'Profession'] = 'Unemployed'

# Count how often each profession occurs, keeping missing values as their own entry.
profession_count = train_data['Profession'].value_counts(dropna=False)

# Draw the counts as a wide bar chart.
sns.set(rc={'figure.figsize': (20, 10)})
profession_ax = sns.barplot(x=profession_count.index, y=profession_count.values)
profession_ax.set_title('Bar graph showing the value counts of the column - Profession')
profession_ax.set_ylabel('Count', fontsize=12)
profession_ax.set_xlabel('Profession', fontsize=12)
plt.show()

In [None]:
# Count plot of Credit_Score, drawn as one panel per Profession value with
# four panels per row.
sns.catplot(
    data=train_data,
    x='Credit_Score',
    col='Profession',
    kind='count',
    col_wrap=4,
)

In [None]:
# Tally how often each individual loan type appears across all non-missing
# Loan_Type entries. Each entry is treated as a comma-separated list of types.
index_values = train_data['Loan_Type'].notna().values
loan_type_data = list(train_data['Loan_Type'][index_values])

loan_type_dict = dict()
for value in loan_type_data:
    for each_value in value.split(','):
        loan_type = each_value.strip(' ')
        # Drop only a leading "and " conjunction (as in "..., and X"). A plain
        # substring test for 'and' would also chop the first four characters
        # off any loan type that merely contains those letters.
        if loan_type.startswith('and '):
            loan_type = loan_type[4:]
        loan_type_dict[loan_type] = loan_type_dict.get(loan_type, 0) + 1

loan_type_dict

In [None]:
# Bar chart of the per-loan-type tallies held in loan_type_dict.
sns.set(rc={'figure.figsize': (20, 10)})
sns.barplot(x=list(loan_type_dict.keys()), y=list(loan_type_dict.values()))
plt.title('Bar graph showing the counts of the column - Loan_Type')
plt.ylabel('Count', fontsize=12)
plt.xlabel('Loan_Type', fontsize=12)
# Render the figure explicitly, as the other plotting cells do, so the cell
# does not echo the Text(...) repr of the last call.
plt.show()

In [None]:
# Frequency of each Credit_Mix value; missing values are counted as their own entry.
credit_mix_count = (
    train_data['Credit_Mix']
    .value_counts(dropna=False)
)
credit_mix_count

In [None]:
# Bar chart of the Credit_Mix frequencies on a small square figure.
sns.set(rc={'figure.figsize': (6, 6)})
credit_mix_ax = sns.barplot(
    x=credit_mix_count.index,
    y=credit_mix_count.values,
    alpha=0.8,
)
credit_mix_ax.set_title('Bar graph showing the value counts of the column - Credit_Mix')
credit_mix_ax.set_ylabel('Number of Occurrences', fontsize=12)
credit_mix_ax.set_xlabel('Credit Mix', fontsize=12)
plt.show()

In [None]:
# Count plot of Credit_Score, drawn as one panel per Credit_Mix value with
# two panels per row.
sns.catplot(
    data=train_data,
    x='Credit_Score',
    col='Credit_Mix',
    kind='count',
    col_wrap=2,
)