In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("C:\\Users\\gajendra singh\\OneDrive\\Desktop\\pandas\\credit_scoring.csv")

In [3]:
df.head()

Unnamed: 0,Age,Gender,Marital Status,Education Level,Employment Status,Credit Utilization Ratio,Payment History,Number of Credit Accounts,Loan Amount,Interest Rate,Loan Term,Type of Loan
0,60,Male,Married,Master,Employed,0.22,2685.0,2,4675000,2.65,48,Personal Loan
1,25,Male,Married,High School,Unemployed,0.2,2371.0,9,3619000,5.19,60,Auto Loan
2,30,Female,Single,Master,Employed,0.22,2771.0,6,957000,2.76,12,Auto Loan
3,58,Female,Married,PhD,Unemployed,0.12,1371.0,2,4731000,6.57,60,Auto Loan
4,32,Male,Married,Bachelor,Self-Employed,0.99,828.0,2,3289000,6.28,36,Personal Loan


In [4]:
df.shape

(1000, 12)

In [7]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        1000 non-null   int64  
 1   Gender                     1000 non-null   object 
 2   Marital Status             1000 non-null   object 
 3   Education Level            1000 non-null   object 
 4   Employment Status          1000 non-null   object 
 5   Credit Utilization Ratio   1000 non-null   float64
 6   Payment History            1000 non-null   float64
 7   Number of Credit Accounts  1000 non-null   int64  
 8   Loan Amount                1000 non-null   int64  
 9   Interest Rate              1000 non-null   float64
 10  Loan Term                  1000 non-null   int64  
 11  Type of Loan               1000 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


In [11]:
credit_utilization_fig = px.box(df, y = "Credit Utilization Ratio",
                               title = 'Credit Utilization Ratio Distribution')

credit_utilization_fig.show()

In [12]:
# now let have  a look on the distribution of the lpoan amount in the data

loan_amount_fig = px.histogram(df, x = 'Loan Amount',
                              nbins=20,
                              title='Loan Amount Distribution')

loan_amount_fig.show()

In [13]:
# now let's have a look at the correaltion in the data

numeric_df = df[['Credit Utilization Ratio',
                'Payment History',
                'Number of Credit Accounts',
                'Loan Amount', 'Interest Rate',
                'Loan Term']]

correlation_fig = px.imshow(numeric_df.corr(),
                           title = "Correlation Heatmap")

correlation_fig.show()

In [14]:
education_level_mapping = {'High School': 1, 'Bachelor': 2, 'Master':3, 'Phd': 4}
employment_status_mapping = {'Unemployed': 0, 'Employed': 1, 'Self-Employed': 2}

In [15]:
# apply to the categorical featurers
df['Education Level'] = df['Education Level'].map(education_level_mapping)
df['Employment Status'] = df['Employment Status'].map(employment_status_mapping)

In [21]:
# calculate credit scores using the complete FICO formula

credit_scores = []

for index, row in df.iterrows():
    payment_history = row['Payment History']
    credit_utilization_ratio = row['Credit Utilization Ratio']
    number_of_credit_accounts = row['Number of Credit Accounts']
    education_level = row['Education Level']
    employment_status = row['Employment Status']
    
    # apply the FICO formula to calculate the credit score
    credit_score = (payment_history * 0.35) + (credit_utilization_ratio *0.30) + (number_of_credit_accounts *0.15) + (education_level * 0.10) + (employment_status * 0.10)
    credit_scores.append(credit_score)
    
df['Credit Score'] = credit_scores

print(df.head())

   Age  Gender Marital Status  Education Level  Employment Status  \
0   60    Male        Married              3.0                  1   
1   25    Male        Married              1.0                  0   
2   30  Female         Single              3.0                  1   
3   58  Female        Married              NaN                  0   
4   32    Male        Married              2.0                  2   

   Credit Utilization Ratio  Payment History  Number of Credit Accounts  \
0                      0.22           2685.0                          2   
1                      0.20           2371.0                          9   
2                      0.22           2771.0                          6   
3                      0.12           1371.0                          2   
4                      0.99            828.0                          2   

   Loan Amount  Interest Rate  Loan Term   Type of Loan  Credit Score  
0      4675000           2.65         48  Personal Loan       

In [23]:
df = df.dropna()

In [24]:
from sklearn.cluster import KMeans

X = df[["Credit Score"]]

kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)

kmeans.fit(X)
df['Segment'] = kmeans.labels_


KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3.



In [26]:
df

Unnamed: 0,Age,Gender,Marital Status,Education Level,Employment Status,Credit Utilization Ratio,Payment History,Number of Credit Accounts,Loan Amount,Interest Rate,Loan Term,Type of Loan,Credit Score,Segment
0,60,Male,Married,3.0,1,0.22,2685.0,2,4675000,2.65,48,Personal Loan,940.516,2
1,25,Male,Married,1.0,0,0.20,2371.0,9,3619000,5.19,60,Auto Loan,831.360,2
2,30,Female,Single,3.0,1,0.22,2771.0,6,957000,2.76,12,Auto Loan,971.216,2
4,32,Male,Married,2.0,2,0.99,828.0,2,3289000,6.28,36,Personal Loan,290.797,1
5,42,Male,Divorced,3.0,0,0.94,2342.0,2,1536000,11.15,24,Personal Loan,820.582,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,53,Female,Single,3.0,1,0.40,2028.0,4,2561000,3.26,36,Home Loan,710.920,0
995,59,Male,Divorced,1.0,1,0.74,1285.0,8,3530000,12.99,48,Auto Loan,451.372,1
996,64,Male,Divorced,2.0,0,0.77,1857.0,2,1377000,18.02,60,Home Loan,650.681,0
997,63,Female,Single,3.0,2,0.18,2628.0,10,2443000,18.95,12,Personal Loan,921.854,2


In [28]:
df['Segment'] = df['Segment'].astype('category')

fig = px.scatter(df,x=df.index, y="Credit Score", color='Segment',
                color_discrete_sequence=['green', 'blue', 'yellow', 'red'])

fig.update_layout(xaxis_title = "Customer Index",
                 yaxis_title = "Credit Score",
                 title = "Customer Segmentation based on Credit Scores"
                 )
fig.show()