First We Clean the Dataset

In [77]:
import pandas as pd

df = pd.read_csv('CreditPrediction.csv')


# all the data in 'Unnamed: 19' column is zero, so we delete this column
df.drop('Unnamed: 19', axis=1, inplace=True) 
df

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,45.0,M,3,High School,Married,$60K - $80K,Blue,39.0,5.0,1,3,12691.0,777,1.335,1144,42,1.625,0.061
1,818770008,49.0,F,5,Graduate,,Less than $40K,Blue,44.0,6.0,1,2,8256.0,864,1.541,1291,33,3.714,0.105
2,713982108,51.0,M,3,Graduate,Married,$80K - $120K,Blue,36.0,4.0,1,0,3418.0,0,2.594,1887,20,2.333,0.000
3,769911858,40.0,F,4,High School,,Less than $40K,Blue,34.0,3.0,4,1,3313.0,2517,1.405,1171,20,2.333,0.760
4,709106358,40.0,M,3,Uneducated,Married,$60K - $80K,,21.0,5.0,1,0,4716.0,0,2.175,816,28,2.500,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10162,718673358,35.0,M,3,Doctorate,,$80K - $120K,Blue,30.0,5.0,3,4,13590.0,1528,0.728,2137,52,0.486,0.112
10163,715207458,46.0,F,1,Unknown,Single,Less than $40K,Blue,39.0,2.0,2,1,2029.0,1074,0.514,4802,90,0.800,0.529
10164,803665983,52.0,M,0,Unknown,,$60K - $80K,Blue,46.0,3.0,2,4,2742.0,2184,0.592,3829,72,0.532,0.796
10165,713183508,39.0,F,1,High School,,Unknown,,36.0,1.0,3,2,2751.0,1158,0.821,4861,82,0.822,0.421


- We convert Non-Numerical data to Numerical data

In [75]:
# Encoding Sequential Features Using Mapping
df['Gender'] = df['Gender'].map({'F': 1, 'M': 2})

df['Education_Level'] = df['Education_Level'].map({'Unknown': 1, 'Uneducated': 2,'High School': 3,
                                                   'College': 4,'Graduate': 5, 'Post-Graduate': 6,
                                                   'Doctorate': 7})

df['Income_Category'] = df['Income_Category'].map({'Unknown': 1, 'Less than $40K': 2,
                                                   '$40K - $60K': 3, '$60K - $80K': 4,
                                                   '$80K - $120K': 5, '$120K +': 6})

# Encoding Non-Sequential Features Using One-Hot Encoding
df_encoded = pd.get_dummies(df['Marital_Status'], prefix='Marital_Status')
df_encoded = df_encoded.astype(int)
df = pd.concat([df, df_encoded], axis=1)
df.drop('Marital_Status', axis=1, inplace=True)

df_encoded = pd.get_dummies(df['Card_Category'], prefix='Card_Category')
df_encoded = df_encoded.astype(int)
df = pd.concat([df, df_encoded], axis=1)
df.drop('Card_Category', axis=1, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10167 entries, 0 to 10166
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CLIENTNUM                 10167 non-null  int64  
 1   Customer_Age              10167 non-null  float64
 2   Gender                    9968 non-null   float64
 3   Dependent_count           10167 non-null  int64  
 4   Education_Level           10167 non-null  int64  
 5   Income_Category           10167 non-null  int64  
 6   Months_on_book            9944 non-null   float64
 7   Total_Relationship_Count  10147 non-null  float64
 8   Months_Inactive_12_mon    10167 non-null  int64  
 9   Contacts_Count_12_mon     10167 non-null  int64  
 10  Credit_Limit              10167 non-null  float64
 11  Total_Revolving_Bal       10167 non-null  int64  
 12  Total_Amt_Chng_Q4_Q1      10167 non-null  float64
 13  Total_Trans_Amt           10167 non-null  int64  
 14  Total_

- Now we want to fill the NaN cells with Expectation-Maximization 

In [76]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.mixture import GaussianMixture

# Initial Imputation
initial_imputer = SimpleImputer(strategy='mean')
df_imputed = initial_imputer.fit_transform(df)

# Apply EM Algorithm
gmm = GaussianMixture(n_components=2, max_iter=100, random_state=0)
gmm.fit(df_imputed)

for _ in range(10):  # Number of EM iterations
    imputed_values = gmm.sample(n_samples=df.shape[0])[0]
    df_imputed[np.isnan(df.values)] = imputed_values[np.isnan(df.values)]
    gmm.fit(df_imputed)

df_em_imputed = pd.DataFrame(df_imputed, columns=df.columns)

df_em_imputed

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Income_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,...,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_Single,Marital_Status_Unknown,Card_Category_Blue,Card_Category_Gold,Card_Category_Platinum,Card_Category_Silver
0,768805383.0,45.0,2.0,3.0,3.0,4.0,39.0,5.0,1.0,3.0,...,1.625,0.061,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,818770008.0,49.0,1.0,5.0,5.0,2.0,44.0,6.0,1.0,2.0,...,3.714,0.105,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,713982108.0,51.0,2.0,3.0,5.0,5.0,36.0,4.0,1.0,0.0,...,2.333,0.000,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,769911858.0,40.0,1.0,4.0,3.0,2.0,34.0,3.0,4.0,1.0,...,2.333,0.760,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,709106358.0,40.0,2.0,3.0,2.0,4.0,21.0,5.0,1.0,0.0,...,2.500,0.000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10162,718673358.0,35.0,2.0,3.0,7.0,5.0,30.0,5.0,3.0,4.0,...,0.486,0.112,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10163,715207458.0,46.0,1.0,1.0,1.0,2.0,39.0,2.0,2.0,1.0,...,0.800,0.529,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
10164,803665983.0,52.0,2.0,0.0,1.0,4.0,46.0,3.0,2.0,4.0,...,0.532,0.796,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10165,713183508.0,39.0,1.0,1.0,3.0,1.0,36.0,1.0,3.0,2.0,...,0.822,0.421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
