In [1]:
# Silence every Python warning for the rest of the session. The filter is
# installed before the library imports below, so warnings emitted while those
# modules are being imported are hidden as well.
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
# Display limits for rendered frames: up to 999 columns and 100 rows.
pd.set_option("display.max_columns", 999)
pd.set_option("display.max_rows", 100)

# Not referenced by name in the visible cells. Presumably imported for its side
# effect of making DataFrame.profile_report() available (used when the
# profiling report is built) — TODO confirm against the installed version.
import pandas_profiling

In [2]:
# Location of the Excel workbook on the UCI archive; it is downloaded over the
# network each time this cell runs.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"

# skiprows=1 discards the first spreadsheet row, so the row after it supplies
# the column names.
df = pd.read_excel(DATA_URL, skiprows=1)

# Preview the first rows to confirm the load.
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
0,1,20000,2,2,1,24,2,2,-1,-1,-2,-2,3913,3102,689,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,0,2,2682,1725,2682,3272,3455,3261,0,1000,1000,1000,0,2000,1
2,3,90000,2,2,2,34,0,0,0,0,0,0,29239,14027,13559,14331,14948,15549,1518,1500,1000,1000,1000,5000,0
3,4,50000,2,2,1,37,0,0,0,0,0,0,46990,48233,49291,28314,28959,29547,2000,2019,1200,1100,1069,1000,0
4,5,50000,1,2,1,57,-1,0,-1,0,0,0,8617,5670,35835,20940,19146,19131,2000,36681,10000,9000,689,679,0


In [3]:
# Normalise the column headers in place: two columns get explicit new names
# (PAY_0 becomes pay_1, MARRIAGE becomes marital_status) and every label is
# lower-cased.
header_fixes = {"PAY_0": "pay_1", "MARRIAGE": "marital_status"}
df.columns = [header_fixes.get(label, label).lower() for label in df.columns]

# Descriptive statistics for all columns, transposed to one row per column.
df.describe(include="all").transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,30000.0,15000.5,8660.398374,1.0,7500.75,15000.5,22500.25,30000.0
limit_bal,30000.0,167484.322667,129747.661567,10000.0,50000.0,140000.0,240000.0,1000000.0
sex,30000.0,1.603733,0.489129,1.0,1.0,2.0,2.0,2.0
education,30000.0,1.853133,0.790349,0.0,1.0,2.0,2.0,6.0
marital_status,30000.0,1.551867,0.52197,0.0,1.0,2.0,2.0,3.0
age,30000.0,35.4855,9.217904,21.0,28.0,34.0,41.0,79.0
pay_1,30000.0,-0.0167,1.123802,-2.0,-1.0,0.0,0.0,8.0
pay_2,30000.0,-0.133767,1.197186,-2.0,-1.0,0.0,0.0,8.0
pay_3,30000.0,-0.1662,1.196868,-2.0,-1.0,0.0,0.0,8.0
pay_4,30000.0,-0.220667,1.169139,-2.0,-1.0,0.0,0.0,8.0


In [4]:
# Build a profiling report of the frame for the initial data checks.
# NOTE(review): the exact meaning of `correlation_threshold_pearson` and `sort`
# depends on the installed pandas-profiling version — confirm against its docs.
profile_options = {
    "title": "Credit Card Clients Detailed Profiling Report",
    "correlation_threshold_pearson": 1,
    "sort": "None",
}
creditcard_report = df.profile_report(**profile_options)

# Write the report to creditcard.html.
creditcard_report.to_file(output_file="creditcard.html")

# Trailing expression so the notebook displays the report as the cell output.
creditcard_report



In [5]:
# education: codes 0, 5 and 6 are unknowns, so fold them into 4 (= others).
df["education"] = df["education"].replace([0, 5, 6], 4)
# marital_status: code 0 is changed to 3 (= others).
df["marital_status"] = df["marital_status"].replace(0, 3)

# Print the category counts of both recoded columns.
columns = ["education", "marital_status"]
for feature in columns:
    print("feature:", feature)
    print(df[feature].value_counts())

feature: education
2    14030
1    10585
3     4917
4      468
Name: education, dtype: int64
feature: marital_status
2    15964
1    13659
3      377
Name: marital_status, dtype: int64
