In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the csv file into pandas Dataframe
credit_data = pd.read_csv('Resources/credit_data.csv')
credit_data.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,0x1602,CUS_0xd40,January,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,0x1603,CUS_0xd40,February,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,0x1604,CUS_0xd40,March,Aaron Maashoh,-500,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,0x1605,CUS_0xd40,April,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,,3,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,0x1606,CUS_0xd40,May,Aaron Maashoh,23,821-00-0265,Scientist,19114.12,1824.843333,3,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


In [3]:
## Basic formatting for ease of use/prevent errors

# Lowercase for all headings
credit_data.columns = credit_data.columns.str.lower()
credit_data.columns

Index(['id', 'customer_id', 'month', 'name', 'age', 'ssn', 'occupation',
       'annual_income', 'monthly_inhand_salary', 'num_bank_accounts',
       'num_credit_card', 'interest_rate', 'num_of_loan', 'type_of_loan',
       'delay_from_due_date', 'num_of_delayed_payment', 'changed_credit_limit',
       'num_credit_inquiries', 'credit_mix', 'outstanding_debt',
       'credit_utilization_ratio', 'credit_history_age',
       'payment_of_min_amount', 'total_emi_per_month',
       'amount_invested_monthly', 'payment_behaviour', 'monthly_balance',
       'credit_score'],
      dtype='object')

In [4]:
## Get an understanding the data
credit_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   id                        100000 non-null  object 
 1   customer_id               100000 non-null  object 
 2   month                     100000 non-null  object 
 3   name                      90015 non-null   object 
 4   age                       100000 non-null  object 
 5   ssn                       100000 non-null  object 
 6   occupation                100000 non-null  object 
 7   annual_income             100000 non-null  object 
 8   monthly_inhand_salary     84998 non-null   float64
 9   num_bank_accounts         100000 non-null  int64  
 10  num_credit_card           100000 non-null  int64  
 11  interest_rate             100000 non-null  int64  
 12  num_of_loan               100000 non-null  object 
 13  type_of_loan              88592 non-null   ob

In [5]:
## Drop columns that aren't necessary

# 'ID' is persons ID, arbitrary, not predictive in credit scoring, not required
# 'Customer_ID' as above
# 'Name' as above
# 'SSN' as above

credit_data_1 = credit_data.drop(columns=["id", "customer_id", "name", "ssn"])
credit_data_1.head()

Unnamed: 0,month,age,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,...,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,January,23,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",...,_,809.98,26.82262,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,February,23,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",...,Good,809.98,31.94496,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
2,March,-500,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",...,Good,809.98,28.609352,22 Years and 3 Months,No,49.574949,81.699521264648,Low_spent_Medium_value_payments,331.2098628537912,Good
3,April,23,Scientist,19114.12,,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736783,Good
4,May,23,Scientist,19114.12,1824.843333,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good


### Preprocessing

##### Working through each columns to understand anomalous values, and alter or delete as appropriate

In [6]:
## AGE

# Understand the range of ages, and if age values make sense
credit_data_1["age"].value_counts().sort_index().head(20)

# From perusal of column data, can see many values include'_'.
# Count how many rows with an underscore
US_credit_data_1 = credit_data_1["age"].str.contains('_').sum()
print(f"{US_credit_data_1} out of 100k rows, have a string, 4%")

# Remove underscore '_'
# credit_data_1.loc[credit_data_1["age"] == "_", "age"] = ""
    # code for where value is '_' only, not part of value is '_'
# Code to replace '_' with ""
credit_data_1["age"] = credit_data_1["age"].str.replace("_", "")

# Try changing all values to numeric. Will prove no more anomalous values
credit_data_1["age"] = pd.to_numeric(credit_data_1["age"])
f"Able to convert all values to numeric, suggesting, no more anomalous values"


4939 out of 100k rows, have a string, 4%


'Able to convert all values to numeric, suggesting, no more anomalous values'

In [7]:
## Many ages > 85 and <14 (assuming 14 is universal age ppl are allowed to be officially employed)
# Count how many values are <18 and >90, as unlikely to be seeking loans.
# If insignificant, can delete rows
credit_data_2 = credit_data_1.loc[(credit_data_1["age"] < 14) | (credit_data_1["age"] > 85)]
print(credit_data_2["age"].describe())
length_18_85 = len(credit_data_2["age"])
f"{length_18_85} of 100k values are <14 & >85. Will be deleted"

# DF updated to exclude <14 and >85
credit_data_3 = credit_data_1.loc[(credit_data_1["age"] > 14) & (credit_data_1["age"] < 86)]


count    2781.000000
mean     2813.963323
std      3068.568546
min      -500.000000
25%      -500.000000
50%      2318.000000
75%      5587.000000
max      8698.000000
Name: age, dtype: float64


In [8]:
# credit_data_3["age"].describe()

# credit_data_3 = credit_data_1[(credit_data_1["age"] > 14) & (credit_data_1["age"] < 86)]
# credit_data_3["age"].describe()

In [9]:
## MONTH

credit_data_3["month"].value_counts()
# Data included from Jan - July.
# No anomalous entries.
# Data OK

August      12094
July        12059
June        12036
May         12018
March       12002
April       11978
February    11935
January     11922
Name: month, dtype: int64

In [10]:
## OCCUPATION

credit_data_3["occupation"].value_counts()

# credit_data_1["occupation"].value_counts().sum()
# 7% of ppl don't have a recorded occupation.

# Will either convert "_______" (as copied from results below) Other.
credit_data_3.loc[credit_data_3["occupation"] == "_______", "occupation"] = "Other"
# credit_data_3

In [11]:
## ANNUAL INCOME

credit_data_3["annual_income"].value_counts()
# There are strings of '_' in income. Need to delete, use code as above.
credit_data_3["annual_income"] = credit_data_3["annual_income"].str.replace("_", "")

# Convert to numeric
credit_data_3["annual_income"] = pd.to_numeric(credit_data_3["annual_income"])

# credit_data_3["annual_income"].describe()
# Count matches DF length, data OK.


In [12]:
## MONTHLY INHAND SALARY

credit_data_3["monthly_inhand_salary"].value_counts()

# Convert to numeric
credit_data_3["monthly_inhand_salary"] = pd.to_numeric(credit_data_3["monthly_inhand_salary"])

# Ignore data point, as too many blanks, and can make similar inference from 'Annual_income'
credit_data_3a = credit_data_3.drop(columns=["monthly_inhand_salary"])


In [13]:
credit_data_3a

Unnamed: 0,month,age,occupation,annual_income,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,...,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,January,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,...,_,809.98,26.822620,22 Years and 1 Months,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good
1,February,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,...,Good,809.98,31.944960,,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good
3,April,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,...,Good,809.98,31.377862,22 Years and 4 Months,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736786,Good
4,May,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,...,Good,809.98,24.797347,22 Years and 5 Months,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good
5,June,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",8,...,Good,809.98,27.262259,22 Years and 6 Months,No,49.574949,62.430172331195294,!@9#%8,340.4792117872438,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,April,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",23,...,_,502.38,34.663572,31 Years and 6 Months,No,35.104023,60.97133255718485,High_spent_Large_value_payments,479.866,Poor
99996,May,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",18,...,_,502.38,40.565631,31 Years and 7 Months,No,35.104023,54.18595028760385,High_spent_Medium_value_payments,496.652,Poor
99997,June,25,Mechanic,39628.99,4,6,5729,2,"Auto Loan, and Student Loan",27,...,Good,502.38,41.255522,31 Years and 8 Months,No,35.104023,24.02847744864441,High_spent_Large_value_payments,516.809,Poor
99998,July,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",20,...,Good,502.38,33.638208,31 Years and 9 Months,No,35.104023,251.67258219721603,Low_spent_Large_value_payments,319.165,Standard


In [14]:
## NUM OF BANK ACCS

credit_data_3a["num_bank_accounts"].value_counts()
# Shows values in the 1000s, intuitively wrong, unlikely for a person to have 1000 bank accounts.

# Count number of values with banks accounuts > than 20
bank_acc_20plus = credit_data_3a.loc[(credit_data_3a["num_bank_accounts"] > 20)]
print(bank_acc_20plus["num_bank_accounts"].describe())
f"1274 of 100k values, will be deleted"

credit_data_4 = credit_data_3a.loc[(credit_data_3a["num_bank_accounts"] < 20)]

# Convert to numeric
credit_data_4["num_bank_accounts"] = pd.to_numeric(credit_data_4["num_bank_accounts"])


count    1274.000000
mean      899.966248
std       515.214421
min        26.000000
25%       464.000000
50%       889.000000
75%      1343.500000
max      1798.000000
Name: num_bank_accounts, dtype: float64


In [15]:
## NUM OF CREDIT CARDS

credit_data_4["num_credit_card"].value_counts()

# Shows values in the 100s, intuitively wrong, unlikely for a person to have 100s of credit cards.

# Count number of values with banks accounuts > than 10
cc_10plus = credit_data_4.loc[(credit_data_4["num_credit_card"] > 10)]
print(cc_10plus["num_credit_card"].describe())
f"2194 of 100k values, will be deleted"

credit_data_5 = credit_data_4.loc[(credit_data_4["num_credit_card"] <= 10)]

# Convert to numeric
credit_data_5["num_credit_card"] = pd.to_numeric(credit_data_5["num_credit_card"])

count    2194.000000
mean      737.804011
std       440.830082
min        11.000000
25%       336.250000
50%       740.500000
75%      1123.000000
max      1499.000000
Name: num_credit_card, dtype: float64


In [16]:
## INTEREST_RATE

credit_data_5["interest_rate"].value_counts()

# Shows values in the 100s, unlikely that interest rates would be >50% on credit cards.

# Count number of values with banks accounuts > than 10
ir_30plus = credit_data_5.loc[(credit_data_5["interest_rate"] > 50)]
print(ir_30plus["interest_rate"].describe())
f"1899 of 100k values, will be deleted"

credit_data_6 = credit_data_5.loc[(credit_data_5["interest_rate"] <= 50)]

# Convert to numeric
credit_data_6["interest_rate"] = pd.to_numeric(credit_data_6["interest_rate"])

count    1899.000000
mean     2872.620853
std      1661.728430
min        59.000000
25%      1387.000000
50%      2851.000000
75%      4301.000000
max      5797.000000
Name: interest_rate, dtype: float64


In [17]:
## NUM_OF_LOAN

credit_data_6["num_of_loan"].value_counts()
credit_data_6["num_of_loan"].describe()
credit_data_6["num_of_loan"].min()
# Count number of values < 0
# loan_less0 = credit_data_6.loc[(credit_data_6["num_of_loan"] < 0)]
# Above code has strings in it. Assume its same string as before '_', delete this string.
credit_data_6["num_of_loan"] = credit_data_6["num_of_loan"].str.replace("_", "")

# Convert data to numeric
credit_data_6["num_of_loan"] = pd.to_numeric(credit_data_6["num_of_loan"])


In [18]:
loan_less0 = credit_data_6.loc[(credit_data_6["num_of_loan"] < 0)]
# loan_less0["num_of_loan"].describe()
print(f"3512 of 100k values, will be deleted")

credit_data_7 = credit_data_6.loc[(credit_data_6["interest_rate"] >= 0)]

3512 of 100k values, will be deleted


In [19]:
## TYPE OF LOAN

credit_data_7["type_of_loan"].value_counts()
# Shows there are many values with nil, due to person having 0 loans.
# Replace blank values, with 'None'
# latest = credit_data_7["type_of_loan"].fillna("None", inplace = True)



Not Specified                                                                                                 1304
Personal Loan                                                                                                 1160
Credit-Builder Loan                                                                                           1153
Student Loan                                                                                                  1148
Debt Consolidation Loan                                                                                       1142
                                                                                                              ... 
Student Loan, Personal Loan, Payday Loan, and Credit-Builder Loan                                                1
Credit-Builder Loan, Not Specified, Personal Loan, Mortgage Loan, Not Specified, and Mortgage Loan               1
Debt Consolidation Loan, Payday Loan, Home Equity Loan, Credit-Builder Loan, Stu

In [20]:
## DELAY FROM DUE DATE

credit_data_7["delay_from_due_date"].value_counts()
# no anomalous figures. -ve values represent payments in advance.

 15    3265
 13    3147
 8     3058
 14    3009
 10    3006
       ... 
 64      55
 65      52
-5       29
 66      29
 67      17
Name: delay_from_due_date, Length: 73, dtype: int64

In [21]:
## NUM OF DELAYED PAYMENT

credit_data_7["num_of_delayed_payment"].value_counts()
credit_data_7["num_of_delayed_payment"].describe()

# From perusal, there are erroneous strings, remove
credit_data_7["num_of_delayed_payment"] = credit_data_7["num_of_delayed_payment"].str.replace("_", "")

# Replace blank values with 0 integer, to ability to calculate as a factor later
credit_data_7["num_of_delayed_payment"] = credit_data_7["num_of_delayed_payment"].fillna(0)

# Convert data to numeric
credit_data_7["num_of_delayed_payment"] = pd.to_numeric(credit_data_7["num_of_delayed_payment"])


In [22]:
## CHANGED CREDIT LIMIT

credit_data_7["changed_credit_limit"].value_counts()
credit_data_7["changed_credit_limit"].describe()

# From perusal, there are erroneous strings, remove
credit_data_7["changed_credit_limit"] = credit_data_7["changed_credit_limit"].str.replace("_", "")

# Replace blank values with 0 integer, to ability to calculate as a factor later
credit_data_7["changed_credit_limit"] = credit_data_7["changed_credit_limit"].fillna(0)

In [23]:
## NUM OF CREDIT INQUIRIES

credit_data_7["num_credit_inquiries"].value_counts()
credit_data_7["num_credit_inquiries"].describe()

# Replace blank values with 0 integer, to ability to calculate as a factor later
credit_data_7["num_credit_inquiries"] = credit_data_7["num_credit_inquiries"].fillna(0)

# Len same as DF whole, suggesting, all records (rows) have a value.

# Convert data to numeric
credit_data_7["num_credit_inquiries"] = pd.to_numeric(credit_data_7["num_credit_inquiries"])

In [24]:
## CREDTI MIX

credit_data_7["credit_mix"].value_counts()
# credit_data_7["credit_mix"].describe()

# Replace blank values with 0 integer, to ability to calculate as a factor later
# credit_data_7["credit_mix"] = credit_data_7["credit_mix"].fillna(0)

# From value_counts, 1 category is '_', replace with 'None'
credit_data_7["credit_mix"] = credit_data_7["credit_mix"].str.replace("_", "None")

In [25]:
## OUTSTANDING DEBT

credit_data_7["outstanding_debt"].value_counts()
# credit_data_7["credit_mix"].describe()

# From perusal, there are erroneous strings, remove
credit_data_7["outstanding_debt"] = credit_data_7["outstanding_debt"].str.replace("_", "")

# Convert column to numeric
credit_data_7["outstanding_debt"] = pd.to_numeric(credit_data_7["outstanding_debt"])


In [26]:
## CREDIT UTILIZATION RATIO

credit_data_7["credit_utilization_ratio"].value_counts()

# Convert column to numeric
credit_data_7["credit_utilization_ratio"] = pd.to_numeric(credit_data_7["credit_utilization_ratio"])


In [27]:
## CREDIT HISTORY AGE

credit_data_7["credit_history_age"].value_counts()

# Replace blank values with 0 integer, to ability to calculate as a factor later
credit_data_7["credit_history_age"] = credit_data_7["credit_history_age"].fillna("0")

In [28]:
# Split credit_history_age into seperate columns, to perform: numeric, calcs and amalgamation, for later ML calcs
credit_data_7[["credit_history_age_year", "ystr", "andstr", "credit_history_age_month", "mstr"]] = credit_data_7.credit_history_age.str.split(" ", expand = True)

In [29]:
# Drop redundant columns, 
credit_data_8 = credit_data_7.drop(columns=["credit_history_age", "ystr", "andstr", "mstr"], axis = 1)

In [30]:
# Convert year and month to numeric
credit_data_8["credit_history_age_year"] = pd.to_numeric(credit_data_8["credit_history_age_year"])
credit_data_8["credit_history_age_month"] = pd.to_numeric(credit_data_8["credit_history_age_month"])


In [31]:
# Convert month nominal to decimal (i.e /12)
months = credit_data_8["credit_history_age_month"]/12
credit_data_8["credit_history_age_mnthdec"] = months

In [32]:
# Concactenate
history_age = credit_data_8["credit_history_age_year"] + credit_data_8["credit_history_age_mnthdec"]
credit_data_8["credit_history_age_new"] = history_age
# credit_data_8["credit_history_age_new"]


In [33]:
# Convert NaN to 'None'
credit_data_8["credit_history_age_new"].fillna('None')

0        22.0833
1           None
3        22.3333
4        22.4167
5           22.5
          ...   
99994    31.4167
99995       31.5
99996    31.5833
99998      31.75
99999    31.8333
Name: credit_history_age_new, Length: 90677, dtype: object

In [34]:
# Drop redundant columns (split and decimals calcs)
credit_data_9 = credit_data_8.drop(columns=["credit_history_age_year", "credit_history_age_month", "credit_history_age_mnthdec"], axis = 1)
credit_data_9

Unnamed: 0,month,age,occupation,annual_income,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,...,credit_mix,outstanding_debt,credit_utilization_ratio,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score,credit_history_age_new
0,January,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,...,,809.98,26.822620,No,49.574949,80.41529543900253,High_spent_Small_value_payments,312.49408867943663,Good,22.083333
1,February,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,...,Good,809.98,31.944960,No,49.574949,118.28022162236736,Low_spent_Large_value_payments,284.62916249607184,Good,
3,April,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,...,Good,809.98,31.377862,No,49.574949,199.4580743910713,Low_spent_Small_value_payments,223.45130972736786,Good,22.333333
4,May,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,...,Good,809.98,24.797347,No,49.574949,41.420153086217326,High_spent_Medium_value_payments,341.48923103222177,Good,22.416667
5,June,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",8,...,Good,809.98,27.262259,No,49.574949,62.430172331195294,!@9#%8,340.4792117872438,Good,22.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99994,March,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",20,...,,502.38,39.323569,No,35.104023,140.58140274528395,High_spent_Medium_value_payments,410.256,Poor,31.416667
99995,April,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",23,...,,502.38,34.663572,No,35.104023,60.97133255718485,High_spent_Large_value_payments,479.866,Poor,31.500000
99996,May,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",18,...,,502.38,40.565631,No,35.104023,54.18595028760385,High_spent_Medium_value_payments,496.652,Poor,31.583333
99998,July,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",20,...,Good,502.38,33.638208,No,35.104023,251.67258219721603,Low_spent_Large_value_payments,319.165,Standard,31.750000


In [35]:
## PAYMENT OF MIN AMOUNT

credit_data_9["payment_of_min_amount"].value_counts()


# Data OK

Yes    47076
No     32732
NM     10869
Name: payment_of_min_amount, dtype: int64

In [36]:
## TOTAL EMI PER MONTH

credit_data_9["total_emi_per_month"].value_counts()

# Convert to numeric
credit_data_9["total_emi_per_month"] = pd.to_numeric(credit_data_9["total_emi_per_month"])

# Data OK

In [37]:
## AMOUNT INVESTED MONTHLY

credit_data_9["amount_invested_monthly"].value_counts()

# Remove strings
credit_data_9["amount_invested_monthly"] = credit_data_9["amount_invested_monthly"].str.replace("_", "")

# Replace blank values with 0 integer, to ability to calculate as a factor later
# credit_data_9["credit_hamount_invested_monthlyistory_age"] = credit_data_9["amount_invested_monthly"].fillna("0")



# Pandas doesn't recognise empty string (i.e. blank cells). 
# Pandas will recognise a value as null if it is a np.nan.
# Covnert empty strings to np.nan. Then convert to 0
credit_data_9["amount_invested_monthly"] = credit_data_9["amount_invested_monthly"].replace('', np.nan)

# Replace NaN with 0
credit_data_9["amount_invested_monthly"] = credit_data_9["amount_invested_monthly"].fillna(0)

# Convert to numeric
credit_data_9["amount_invested_monthly"] = pd.to_numeric(credit_data_9["amount_invested_monthly"])
credit_data_9["amount_invested_monthly"]
# Data OK

0         80.415295
1        118.280222
3        199.458074
4         41.420153
5         62.430172
            ...    
99994    140.581403
99995     60.971333
99996     54.185950
99998    251.672582
99999    167.163865
Name: amount_invested_monthly, Length: 90677, dtype: float64

In [38]:
## PAYMENT BEHAVIOUR

credit_data_9["payment_behaviour"].value_counts()

# Replace erroneous entry
credit_data_9["payment_behaviour"] = credit_data_9["payment_behaviour"].str.replace("!@9#%8", "Other")


In [39]:
credit_data_9

Unnamed: 0,month,age,occupation,annual_income,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,type_of_loan,delay_from_due_date,...,credit_mix,outstanding_debt,credit_utilization_ratio,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score,credit_history_age_new
0,January,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",3,...,,809.98,26.822620,No,49.574949,80.415295,High_spent_Small_value_payments,312.49408867943663,Good,22.083333
1,February,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",-1,...,Good,809.98,31.944960,No,49.574949,118.280222,Low_spent_Large_value_payments,284.62916249607184,Good,
3,April,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",5,...,Good,809.98,31.377862,No,49.574949,199.458074,Low_spent_Small_value_payments,223.45130972736786,Good,22.333333
4,May,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",6,...,Good,809.98,24.797347,No,49.574949,41.420153,High_spent_Medium_value_payments,341.48923103222177,Good,22.416667
5,June,23,Scientist,19114.12,3,4,3,4,"Auto Loan, Credit-Builder Loan, Personal Loan,...",8,...,Good,809.98,27.262259,No,49.574949,62.430172,Other,340.4792117872438,Good,22.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99994,March,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",20,...,,502.38,39.323569,No,35.104023,140.581403,High_spent_Medium_value_payments,410.256,Poor,31.416667
99995,April,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",23,...,,502.38,34.663572,No,35.104023,60.971333,High_spent_Large_value_payments,479.866,Poor,31.500000
99996,May,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",18,...,,502.38,40.565631,No,35.104023,54.185950,High_spent_Medium_value_payments,496.652,Poor,31.583333
99998,July,25,Mechanic,39628.99,4,6,7,2,"Auto Loan, and Student Loan",20,...,Good,502.38,33.638208,No,35.104023,251.672582,Low_spent_Large_value_payments,319.165,Standard,31.750000


In [40]:
## MONTHLY BALANCE

credit_data_9["monthly_balance"].value_counts()

credit_data_9["monthly_balance"] = credit_data_9["monthly_balance"].str.replace("-333333333333333333333333333", "")
credit_data_9["monthly_balance"] = credit_data_9["monthly_balance"].str.replace("_", "")

# Pandas doesn't recognise empty string (i.e. blank cells). 
# Pandas will recognise a value as null if it is a np.nan.
# Covnert empty strings to np.nan. Then convert to 0
credit_data_9["monthly_balance"] = credit_data_9["monthly_balance"].replace('', np.nan)

# Replace NaN with 0
credit_data_9["monthly_balance"] = credit_data_9["monthly_balance"].fillna(0)

# Convert to numeric
credit_data_9["monthly_balance"] = pd.to_numeric(credit_data_9["monthly_balance"])


In [41]:
## CREDIT SCORE

credit_data_9["credit_score"].value_counts()

Standard    48317
Poor        26078
Good        16282
Name: credit_score, dtype: int64

In [42]:
## Get an understanding the data
credit_data_9.info()

# Convert int64 columns to float
int64_columns = ["age", "num_bank_accounts", "num", "", ]



<class 'pandas.core.frame.DataFrame'>
Int64Index: 90677 entries, 0 to 99999
Data columns (total 23 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   month                     90677 non-null  object 
 1   age                       90677 non-null  int64  
 2   occupation                90677 non-null  object 
 3   annual_income             90677 non-null  float64
 4   num_bank_accounts         90677 non-null  int64  
 5   num_credit_card           90677 non-null  int64  
 6   interest_rate             90677 non-null  int64  
 7   num_of_loan               90677 non-null  int64  
 8   type_of_loan              80254 non-null  object 
 9   delay_from_due_date       90677 non-null  int64  
 10  num_of_delayed_payment    90677 non-null  int64  
 11  changed_credit_limit      90677 non-null  object 
 12  num_credit_inquiries      90677 non-null  float64
 13  credit_mix                90677 non-null  object 
 14  outsta

In [43]:
history_age

0        22.083333
1              NaN
3        22.333333
4        22.416667
5        22.500000
           ...    
99994    31.416667
99995    31.500000
99996    31.583333
99998    31.750000
99999    31.833333
Length: 90677, dtype: float64

In [44]:
# Remove values < 0
loan_less0_1 = credit_data_9.loc[(credit_data_9["monthly_balance"] < 0)]
loan_less0_1["monthly_balance"].describe()
# f"3512 of 100k values, will be deleted"

# credit_data_7 = credit_data_6.loc[(credit_data_6["interest_rate"] >= 0)]

count    0.0
mean     NaN
std      NaN
min      NaN
25%      NaN
50%      NaN
75%      NaN
max      NaN
Name: monthly_balance, dtype: float64