First We Clean the Dataset

In [28]:
import pandas as pd

df = pd.read_csv('CreditPrediction.csv')

df.info()

# all the data in 'Unnamed: 19' column is zero, so we delete this column
df.drop('Unnamed: 19', axis=1, inplace=True) 
df.drop('CLIENTNUM', axis=1, inplace=True) 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10167 entries, 0 to 10166
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CLIENTNUM                 10167 non-null  int64  
 1   Customer_Age              10167 non-null  float64
 2   Gender                    9968 non-null   object 
 3   Dependent_count           10167 non-null  int64  
 4   Education_Level           10167 non-null  object 
 5   Marital_Status            8217 non-null   object 
 6   Income_Category           10167 non-null  object 
 7   Card_Category             8243 non-null   object 
 8   Months_on_book            9944 non-null   float64
 9   Total_Relationship_Count  10147 non-null  float64
 10  Months_Inactive_12_mon    10167 non-null  int64  
 11  Contacts_Count_12_mon     10167 non-null  int64  
 12  Credit_Limit              10167 non-null  float64
 13  Total_Revolving_Bal       10167 non-null  int64  
 14  Total_

- Now we want to fill the NaN cells with Column-Mean 
- And delete duplicated data

In [29]:
df['Gender'] = df['Gender'].fillna(df['Gender'].mode()[0])
df['Marital_Status'] = df['Marital_Status'].fillna(df['Marital_Status'].mode()[0])
df['Card_Category'] = df['Card_Category'].fillna(df['Card_Category'].mode()[0])
df['Months_on_book'] = df['Months_on_book'].fillna(df['Months_on_book'].mean().round())
df['Total_Relationship_Count'] = df['Total_Relationship_Count'].fillna(df['Total_Relationship_Count'].mean().round())

df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10128 entries, 0 to 10128
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer_Age              10128 non-null  float64
 1   Gender                    10128 non-null  object 
 2   Dependent_count           10128 non-null  int64  
 3   Education_Level           10128 non-null  object 
 4   Marital_Status            10128 non-null  object 
 5   Income_Category           10128 non-null  object 
 6   Card_Category             10128 non-null  object 
 7   Months_on_book            10128 non-null  float64
 8   Total_Relationship_Count  10128 non-null  float64
 9   Months_Inactive_12_mon    10128 non-null  int64  
 10  Contacts_Count_12_mon     10128 non-null  int64  
 11  Credit_Limit              10128 non-null  float64
 12  Total_Revolving_Bal       10128 non-null  int64  
 13  Total_Amt_Chng_Q4_Q1      10128 non-null  float64
 14  Total_Trans

- We convert Non-Numerical data to Numerical data

In [30]:
# Encoding Features Using Mapping
df['Gender'] = df['Gender'].map({'F': 0, 'M': 1})

# Encoding Non-Sequential Features Using One-Hot Encoding
df['Education_Level'] = df['Education_Level'].replace("Unknown", df['Education_Level'].mode()[0])
df_encoded = pd.get_dummies(df['Education_Level'], prefix='Education_Level')
df_encoded = df_encoded.astype(int)
df = pd.concat([df, df_encoded], axis=1)
df.drop('Education_Level', axis=1, inplace=True)

df['Income_Category'] = df['Income_Category'].replace("Unknown", df['Income_Category'].mode()[0])
df_encoded = pd.get_dummies(df['Income_Category'], prefix='Income_Category')
df_encoded = df_encoded.astype(int)
df = pd.concat([df, df_encoded], axis=1)
df.drop('Income_Category', axis=1, inplace=True)


df['Marital_Status'] = df['Marital_Status'].replace("Unknown", df['Marital_Status'].mode()[0])
df_encoded = pd.get_dummies(df['Marital_Status'], prefix='Marital_Status')
df_encoded = df_encoded.astype(int)
df = pd.concat([df, df_encoded], axis=1)
df.drop('Marital_Status', axis=1, inplace=True)

df_encoded = pd.get_dummies(df['Card_Category'], prefix='Card_Category')
df_encoded = df_encoded.astype(int)
df = pd.concat([df, df_encoded], axis=1)
df.drop('Card_Category', axis=1, inplace=True)

df['Customer_Age'].describe()

count    10128.000000
mean        46.759188
std         13.540138
min         26.000000
25%         41.000000
50%         46.000000
75%         52.000000
max        352.330517
Name: Customer_Age, dtype: float64

- Handling Outlier Data

In [37]:
# Define a function to replace outliers with the mode of each column using IQR
def replace_outliers_with_mode_iqr(data):
    # Initialize an empty DataFrame to store the results
    result = pd.DataFrame(index=data.index, columns=data.columns)

    # Loop through each column in the DataFrame
    for col in data.columns:
        Q1 = data[col].quantile(0.25)  # 25th percentile (Q1)
        Q3 = data[col].quantile(0.75)  # 75th percentile (Q3)
        IQR = Q3 - Q1  # Interquartile Range (IQR)
        lower_bound = Q1 - 1.5 * IQR  # Lower bound for outliers
        upper_bound = Q3 + 1.5 * IQR  # Upper bound for outliers

        # Replace outliers with mode for the current column
        mode_val = data.loc[(data[col] >= lower_bound) & (data[col] <= upper_bound), col].mean()
        result[col] = data[col].apply(lambda x: mode_val if x < lower_bound or x > upper_bound else x)

    return result

# Apply the function to replace outliers with mode for each column separately
df = replace_outliers_with_mode_iqr(df)
df['Customer_Age'].describe()

count    10128.000000
mean        46.307859
std          8.004431
min         26.000000
25%         41.000000
50%         46.000000
75%         52.000000
max         68.000000
Name: Customer_Age, dtype: float64

- Normalizing Data

In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Initialize StandardScaler
scaler = StandardScaler()

Y = df['Credit_Limit']
df.drop('Credit_Limit', axis=1, inplace=True)

X = df

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, shuffle=True)

# Apply Standard scaling to the DataFrame
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)

X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

X_train['Customer_Age'].describe()

count    7.596000e+03
mean    -3.680866e-16
std      1.000066e+00
min     -2.540762e+00
25%     -6.615627e-01
50%     -3.516284e-02
75%      7.165170e-01
max      2.720997e+00
Name: Customer_Age, dtype: float64

Regression Models

- Linear Regression

In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

linear_regression = LinearRegression()

linear_regression.fit(X_train, Y_train)

Y_pred_LR = linear_regression.predict(X_test)

mse_LR = mean_squared_error(Y_test, Y_pred_LR)
r2_LR = r2_score(Y_test, Y_pred_LR)

print("MSE : ", mse_LR)
print("R2 : ", r2_LR)

MSE :  19117455.042649165
R2 :  0.27390875457790964


- Polynomial Regression with Ridge

In [34]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge


poly = PolynomialFeatures(degree=2)

X_train_poly = poly.fit_transform(X_train)

X_test_poly = poly.transform(X_test)

ridge = Ridge()
ridge.fit(X_train_poly, Y_train)

Y_pred_poly = ridge.predict(X_test_poly)


mse_poly = mean_squared_error(Y_test, Y_pred_poly)
r2_poly = r2_score(Y_test, Y_pred_poly)

print(mse_poly)
print(r2_poly)

18491116.56936586
0.29769742734358273


- Gradient Boosting

In [35]:
from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor()

gb.fit(X_train, Y_train)

Y_pred_gb = gb.predict(X_test)

mse_gb = mean_squared_error(Y_test, Y_pred_gb)
r2_gb = r2_score(Y_test, Y_pred_gb)

print("MSE : ", mse_gb)
print("R2 : ", r2_gb)

MSE :  10951911.764001306
R2 :  0.5840404889282478


- Random Forest

In [36]:
from sklearn.ensemble import RandomForestRegressor

random_forest = RandomForestRegressor(n_estimators=100, max_depth=15, random_state=10)

random_forest.fit(X_train, Y_train)

Y_pred_rf = random_forest.predict(X_test)

mse_rf = mean_squared_error(Y_test, Y_pred_rf)
r2_rf = r2_score(Y_test, Y_pred_rf)

print("MSE : ", mse_rf)
print("R2 : ", r2_rf)

MSE :  8087351.885429399
R2 :  0.6928380169035093
