In [72]:
import numpy as np
import pandas as pd

In [77]:
# Location of the German credit dataset; kept in one named constant so the
# path is easy to spot and change when the notebook runs elsewhere.
DATA_PATH = "/home/jovyan/work/german_credit_data.csv"
Loans = pd.read_csv(DATA_PATH)

In [78]:
# Preview the first five rows to check that the file parsed as expected
Loans.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [79]:
# Column dtypes and non-null counts: shows which columns contain missing values
Loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   817 non-null    object
 5   Checking account  606 non-null    object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.3+ KB


- **Age**: Numeric variable, has all 1000 rows non null
- **Sex**: Categorical variable, has all 1000 rows non null  
- **Job**: Categorical variable encoded as integers (stored as `int64`, so it is listed with the numerical columns below), has all 1000 rows non null  
- **Housing**: Categorical variable, has all 1000 rows non null  
- **Saving accounts**: Categorical variable, has 183 rows null  
- **Checking account**: Categorical variable, has 394 rows null 
- **Credit amount**: Numeric variable, has all 1000 rows non null
- **Duration**: Numeric variable, has all 1000 rows non null    
- **Purpose**: Categorical variable, has all 1000 rows non null    
- **Risk**: Categorical variable, has all 1000 rows non null  

In [80]:
# Cast every text (object) column to pandas' category dtype
for column in ("Sex", "Housing", "Saving accounts", "Checking account", "Purpose", "Risk"):
    Loans[column] = Loans[column].astype("category")

# Split the column names by dtype: numeric first, then categorical
num_vars = list(Loans.select_dtypes(include=["number"]).columns)
cat_vars = list(Loans.select_dtypes(include=["category"]).columns)

print("Numerical columns:", num_vars)
print("Categorical columns:", cat_vars)

# Re-inspect dtypes and memory usage after the conversion
Loans.info()

Numerical columns: ['Age', 'Job', 'Credit amount', 'Duration']
Categorical columns: ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose', 'Risk']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Age               1000 non-null   int64   
 1   Sex               1000 non-null   category
 2   Job               1000 non-null   int64   
 3   Housing           1000 non-null   category
 4   Saving accounts   817 non-null    category
 5   Checking account  606 non-null    category
 6   Credit amount     1000 non-null   int64   
 7   Duration          1000 non-null   int64   
 8   Purpose           1000 non-null   category
 9   Risk              1000 non-null   category
dtypes: category(6), int64(4)
memory usage: 38.3 KB


> **Memory Optimization with Typecasting**

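Before typecasting columns to their appropriate data types, `Loans.info()` reported **78.3+ KB** of memory (the `+` marks a lower bound, because the contents of `object` columns are not counted unless `memory_usage="deep"` is requested).  
After typecasting, memory usage dropped to **38.3 KB**—a reduction of more than 50%, even by this conservative measure.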

> For large datasets, this simple optimization can result in substantial storage savings and improved computational efficiency.  
>  
> **Tip:** Always review and optimize your data types for better performance!

In [81]:
# Name of the column treated as the target; presumably the label for later
# modelling (no use is visible in this part of the notebook, TODO confirm)
target = "Risk"

In [82]:
from sklearn.impute import SimpleImputer

In [83]:
# "most_frequent" fills each missing entry with its column's most common value
imputer = SimpleImputer(strategy = "most_frequent")

In [84]:
# Columns that currently contain at least one missing value
missing_cols = Loans.columns[Loans.isnull().any()]

# Guard: once the gaps are filled (e.g. when this cell is re-run) no column
# qualifies, and SimpleImputer raises on a frame with zero columns.
if len(missing_cols) > 0:
    # Remember the dtypes so they can be restored afterwards: fit_transform
    # returns a plain NumPy array, which drops the category dtype.
    original_dtypes = Loans[missing_cols].dtypes

    # Fill each gap with that column's most frequent value
    Loans[missing_cols] = imputer.fit_transform(Loans[missing_cols])

    # Restore the dtype of every imputed column rather than hard-coding names
    for col_name, dtype in original_dtypes.items():
        Loans[col_name] = Loans[col_name].astype(dtype)

# Confirm that every column now reports 1000 non-null rows
Loans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   Age               1000 non-null   int64   
 1   Sex               1000 non-null   category
 2   Job               1000 non-null   int64   
 3   Housing           1000 non-null   category
 4   Saving accounts   1000 non-null   category
 5   Checking account  1000 non-null   category
 6   Credit amount     1000 non-null   int64   
 7   Duration          1000 non-null   int64   
 8   Purpose           1000 non-null   category
 9   Risk              1000 non-null   category
dtypes: category(6), int64(4)
memory usage: 38.3 KB


In [85]:
# Number of rows in each category of Sex
Loans["Sex"].value_counts()

Sex
male      690
female    310
Name: count, dtype: int64

In [86]:
# Number of rows in each category of Housing
Loans["Housing"].value_counts()

Housing
own     713
rent    179
free    108
Name: count, dtype: int64

In [87]:
# Number of rows in each category of Saving accounts (counts include imputed values)
Loans["Saving accounts"].value_counts()

Saving accounts
little        786
moderate      103
quite rich     63
rich           48
Name: count, dtype: int64

In [88]:
# Number of rows in each category of Checking account (counts include imputed values)
Loans["Checking account"].value_counts()

Checking account
little      668
moderate    269
rich         63
Name: count, dtype: int64

In [89]:
# Number of rows for each loan Purpose
Loans["Purpose"].value_counts()

Purpose
car                    337
radio/TV               280
furniture/equipment    181
business                97
education               59
repairs                 22
domestic appliances     12
vacation/others         12
Name: count, dtype: int64

In [90]:
# Class balance of the Risk column
Loans["Risk"].value_counts()

Risk
good    700
bad     300
Name: count, dtype: int64

In [91]:
# Summary statistics of the numerical columns, rounded to two decimal places
num_summary = Loans.loc[:, num_vars].describe().round(decimals=2)
display(num_summary)

Unnamed: 0,Age,Job,Credit amount,Duration
count,1000.0,1000.0,1000.0,1000.0
mean,35.55,1.9,3271.26,20.9
std,11.38,0.65,2822.74,12.06
min,19.0,0.0,250.0,4.0
25%,27.0,2.0,1365.5,12.0
50%,33.0,2.0,2319.5,18.0
75%,42.0,2.0,3972.25,24.0
max,75.0,3.0,18424.0,72.0


- The **minimum duration** of a loan is **4 months**.
- The **maximum duration** of a loan is **72 months**.