## Packages importing 

In [102]:
!pip install matplotlib seaborn pandas numpy scikit-learn datasist imbalanced-learn category-encoders xgboost joblib



In [103]:
# Packages for EDA 
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd 
import numpy as np 

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from datasist.structdata import detect_outliers
from sklearn.metrics import mean_squared_error
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import category_encoders as ce
import re 

# Modeling and evaluation 
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import (
    BaggingClassifier,
    ExtraTreesClassifier,
    RandomForestClassifier,
    StackingClassifier,
    HistGradientBoostingClassifier
)
from xgboost import XGBClassifier
from sklearn.metrics import classification_report 
import joblib

# Packages options 
sns.set(rc={'figure.figsize': [14, 7]}, font_scale=1.2) # Standard figure size for all 
np.seterr(divide='ignore', invalid='ignore', over='ignore') ;

import warnings 
warnings.filterwarnings("ignore")

## Reading Data 

In [104]:
df = pd.read_csv("train.csv",low_memory=False)

In [105]:
df.sample(10)

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
22427,0x9969,CUS_0x440a,April,Krisztinao,53,854-64-6856,Musician,44789.64,,2,...,Good,122.27,23.283668,16 Years and 4 Months,No,0.0,207.0462525884097,Low_spent_Large_value_payments,426.8007474115904,Good
97484,0x25132,CUS_0x6671,May,Alistair Barri,1394,097-58-4488,Manager,9898.815,781.90125,8,...,Standard,2381.42,25.095378,10 Years and 1 Months,NM,21.362569,23.92390850144709,Low_spent_Medium_value_payments,312.9036473696598,Poor
62992,0x1871a,CUS_0x5a92,January,Gillams,41,486-47-1751,Doctor,124534.38,10621.865,424,...,Good,757.38,28.938734,17 Years and 5 Months,No,174.338527,208.70454186073584,High_spent_Large_value_payments,919.1434311654392,Good
95646,0x2466c,CUS_0x2c21,July,Ann Saphira,25,617-73-1215,Architect,32090.48,2712.206667,7,...,_,1266.79,33.568674,9 Years and 4 Months,Yes,169.32787,102.9648642105888,High_spent_Small_value_payments,258.9279323083153,Poor
6030,0x3954,CUS_0x5300,July,Schnurri,20,857-13-4456,Musician,7985.155_,870.429583,6,...,Bad,1954.62,32.559435,13 Years and 2 Months,Yes,46.177189,65.26698916781885,Low_spent_Medium_value_payments,255.5987802052671,Standard
92532,0x2342e,CUS_0x32a1,May,Noeln,55,542-03-2848,Engineer,11249.15,984.429167,3,...,Good,612.08,36.938423,26 Years and 0 Months,No,8.092135,51.12844161692379,Low_spent_Small_value_payments,329.2223404255254,Good
85502,0x20afc,CUS_0x1628,July,Frostl,33,340-56-9506,Musician,17446.07,1245.839167,6,...,Standard,2239.2,34.483426,15 Years and 2 Months,Yes,73.099417,120.00589129138174,Low_spent_Small_value_payments,221.47860799137305,Poor
16304,0x758a,CUS_0x4a77,January,imarte Danielm,33,187-66-6476,Engineer,169293.52,14365.793333,2,...,_,931.52,40.282802,30 Years and 5 Months,No,92.04008,389.1494947426903,Low_spent_Medium_value_payments,1235.3897584593035,Good
91336,0x22d2e,CUS_0x5627,January,Stempelt,50,953-46-1751,Teacher,34366,2831.833333,2,...,Good,701.41,23.298538,31 Years and 10 Months,No,44.931902,44.09079214865529,High_spent_Medium_value_payments,444.1606388281364,Standard
77762,0x1dda4,CUS_0x7e50,March,Bend,48,882-21-7332,Writer,170986.72,14137.893333,0,...,Good,396.58,31.498194,32 Years and 3 Months,No,460.204587,204.47850891367864,!@9#%8,989.1062374326516,Good


### Issues with this dataset: 
1. The following headings are supposed to be numerical but appear as categorical datatypes: Age, Annual_Income, Num_of_Loan, Num_of_Delayed_Payment, Changed_Credit_Limit, Amount_invested_monthly, Outstanding_Debt Credit_Mix, Monthly_Balance
2. Remove these headings: ID, Name and SSN (Not useful)
3. Remove missing data
4. Credit_Mix has value a value "-" which needs to be removed/fixed
5. Num_Credit_Card has __zeros__
6. Type_of_Loan nees to be rewritten
7. Negative values exist in the header: Num_Bank_Accounts
8. Outliers need to be removed
9. Missing data need to be filled/removed
10. Target column does not have even balance of outputs
11. Following headers need more fixing to balance out dataset: Credit_History_Age,Payment_of_Min_Amount,Payment_Behaviour,'Credit_Mix'

## Data cleaning

In [106]:
del df['ID'] # Identification 
del df['Name'] # Name of client 
del df['SSN'] # SSN (social security number of a person)

#### Fixing Numerical Columns to be of float data-type and removing "-" columns

1. replace _  
2. convert into float 

In [107]:
to_be_fixed = ['Age', 'Annual_Income', 'Num_of_Loan', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Amount_invested_monthly', 'Outstanding_Debt' , 'Monthly_Balance']

In [108]:
def fix_nums(num):
    try : 
        return float(num.replace("_",""))
    except :
        return np.nan

In [109]:
for col in to_be_fixed :
    df[col] = df[col].apply(fix_nums)

### Transform Type_of_Loan header

In [110]:
## transform the Type_of_Loan column, which contains categorical data, into multiple binary columns.
for i in df['Type_of_Loan'].value_counts().head(9).index[1:] : 
    df[i] = df['Type_of_Loan'].str.contains(i)

del df['Type_of_Loan']

#### Make Num_Bank_Accounts to be positive always

In [111]:
# number of bank accounts must be positive
df['Num_Bank_Accounts'] = df['Num_Bank_Accounts'].apply(lambda x :abs (x))

#### Non-Zero for Num_Credit_Card

In [112]:
# number of credit cards must be atleast 1
df['Num_Credit_Card'].replace(0,1,inplace=True)

#### Convert credit history ages under Credit_History_Age to all integers

In [113]:
def history_age(age):
    try : 
        years = int("".join(re.findall('[0-9]',''.join(age.split("and")[0]))))
        month = int("".join(re.findall('[0-9]',''.join(age.split("and")[1]))))
        return years*12 + month
    except :
        return np.nan

In [114]:
df['Credit_History_Age'] = df['Credit_History_Age'].apply(history_age)

#### Payment_of_Min_Amount

In [115]:
df['Payment_of_Min_Amount'].replace("NM","No",inplace=True)

In [116]:
df['Payment_of_Min_Amount'].value_counts()

Payment_of_Min_Amount
Yes    52326
No     47674
Name: count, dtype: int64

#### Payment_Behaviour

In [117]:
df['Payment_Behaviour']= df['Payment_Behaviour'].replace("!@9#%8",'Medium_spent_Medium_value_payments')

In [118]:
df['Payment_Behaviour'].value_counts()

Payment_Behaviour
Low_spent_Small_value_payments        25513
High_spent_Medium_value_payments      17540
Low_spent_Medium_value_payments       13861
High_spent_Large_value_payments       13721
High_spent_Small_value_payments       11340
Low_spent_Large_value_payments        10425
Medium_spent_Medium_value_payments     7600
Name: count, dtype: int64

### Occupation

In [119]:
df['Occupation'].value_counts()

Occupation
_______          7062
Lawyer           6575
Architect        6355
Engineer         6350
Scientist        6299
Mechanic         6283
Accountant       6271
Developer        6235
Media_Manager    6224
Teacher          6215
Entrepreneur     6174
Doctor           6087
Journalist       6085
Manager          5973
Musician         5911
Writer           5885
Others             16
Name: count, dtype: int64

In [120]:
occs = df['Occupation'].value_counts().index[1:]
occs

Index(['Lawyer', 'Architect', 'Engineer', 'Scientist', 'Mechanic',
       'Accountant', 'Developer', 'Media_Manager', 'Teacher', 'Entrepreneur',
       'Doctor', 'Journalist', 'Manager', 'Musician', 'Writer', 'Others'],
      dtype='object', name='Occupation')

In [121]:
# for the specified customer, identify the most common occupation in their records
# replace missing values of "_______" with the most common occupation for that customer
id_ = "CUS_0xb891"
oc = df[df['Customer_ID'] == id_]['Occupation'].mode()[0]
df[df['Customer_ID'] == id_].replace("_______",oc)

Unnamed: 0,Customer_ID,Month,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,...,Monthly_Balance,Credit_Score,Credit-Builder Loan,Personal Loan,Debt Consolidation Loan,Student Loan,Payday Loan,Mortgage Loan,Auto Loan,Home Equity Loan
24,CUS_0xb891,January,54.0,Entrepreneur,30689.89,2612.490833,2,5,4,1.0,...,433.604773,Standard,False,False,False,False,False,False,False,False
25,CUS_0xb891,September,54.0,Entrepreneur,30689.89,2612.490833,2,5,4,1.0,...,409.951812,Standard,False,False,False,False,False,False,False,False
26,CUS_0xb891,March,55.0,Entrepreneur,30689.89,2612.490833,2,5,4,1.0,...,411.427123,Standard,False,False,False,False,False,False,False,False
27,CUS_0xb891,April,55.0,Entrepreneur,30689.89,2612.490833,2,5,4,1.0,...,262.499594,Standard,False,False,False,False,False,False,False,False
28,CUS_0xb891,May,55.0,Entrepreneur,30689.89,2612.490833,2,5,4,1.0,...,359.374916,Standard,False,False,False,False,False,False,False,False
29,CUS_0xb891,August,55.0,Entrepreneur,30689.89,2612.490833,2,5,4,1.0,...,419.880784,Standard,False,False,False,False,False,False,False,False
30,CUS_0xb891,July,55.0,Entrepreneur,30689.89,2612.490833,2,5,4,1.0,...,443.549957,Standard,False,False,False,False,False,False,False,False
31,CUS_0xb891,,55.0,Entrepreneur,30689.89,2612.490833,2,5,4,-100.0,...,379.216381,Standard,False,False,False,False,False,False,False,False


In [122]:
# for _id_ in df[df['Occupation'] == "_______"]['Customer_ID'] : 
#     oc = df[df['Customer_ID'] == _id_]['Occupation'].mode()[0]
#     df[df['Customer_ID'] == _id_] = df[df['Customer_ID'] == _id_].replace("_______",oc)

In [123]:
missing_occupation = df['Occupation'] == '_______'
# Find the most frequent occupation for each Customer_ID
most_frequent_occupation = df[~missing_occupation].groupby('Customer_ID')['Occupation'].agg(lambda x: x.mode()[0])
# Replace missing Occupation values
df.loc[missing_occupation, 'Occupation'] = df.loc[missing_occupation, 'Customer_ID'].map(most_frequent_occupation)


In [124]:
df['Occupation'].value_counts()

Occupation
Lawyer           7096
Engineer         6864
Architect        6824
Mechanic         6768
Scientist        6744
Accountant       6744
Developer        6720
Media_Manager    6712
Teacher          6672
Entrepreneur     6648
Doctor           6568
Journalist       6536
Manager          6432
Musician         6352
Writer           6304
Others             16
Name: count, dtype: int64

In [125]:
df['Occupation'] = df['Occupation'].replace("_______",df['Occupation'].mode()[0])

In [126]:
df['Occupation'].value_counts()

Occupation
Lawyer           7096
Engineer         6864
Architect        6824
Mechanic         6768
Scientist        6744
Accountant       6744
Developer        6720
Media_Manager    6712
Teacher          6672
Entrepreneur     6648
Doctor           6568
Journalist       6536
Manager          6432
Musician         6352
Writer           6304
Others             16
Name: count, dtype: int64

In [127]:
df['Credit_Mix'].value_counts()

Credit_Mix
Standard    36479
Good        24337
_           20195
Bad         18989
Name: count, dtype: int64

In [128]:
m = {
    "Bad":0,
    "Standard":1,
    "Good":2,
    "_":np.nan
}

In [129]:
df['Credit_Mix'] = df['Credit_Mix'].map(m)

In [130]:
df['Credit_Mix'].value_counts()

Credit_Mix
1.0    36479
2.0    24337
0.0    18989
Name: count, dtype: int64

### Advanced Handling Missing Data 

In [131]:
# edit Columns from bool to int 
for col in list(df.columns[-8:]):
    df[col] = df[col].astype(float)

In [132]:
# IDs = 1 
# for ID in df['Customer_ID'].unique() :
#     df['Customer_ID'] = df['Customer_ID'].replace(ID,IDs)
#     IDs += 1 

In [133]:
# Use pd.factorize to assign unique integers to each unique Customer_ID
df['Customer_ID'] = pd.factorize(df['Customer_ID'])[0] + 1

In [134]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=1)

In [135]:
Numericals = df.select_dtypes(exclude='object').columns[1:]
Numericals

Index(['Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
       'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan',
       'Delay_from_due_date', 'Num_of_Delayed_Payment', 'Changed_Credit_Limit',
       'Num_Credit_Inquiries', 'Credit_Mix', 'Outstanding_Debt',
       'Credit_Utilization_Ratio', 'Credit_History_Age', 'Total_EMI_per_month',
       'Amount_invested_monthly', 'Monthly_Balance', 'Credit-Builder Loan',
       'Personal Loan', 'Debt Consolidation Loan', 'Student Loan',
       'Payday Loan', 'Mortgage Loan', 'Auto Loan', 'Home Equity Loan'],
      dtype='object')

In [136]:
# Replace infinity values with NaN
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Check for NaN values and handle them (e.g., by replacing them with the mean of the column)
for col in Numericals:
    if df[col].isnull().any():
        df[col].fillna(df[col].mean(), inplace=True)

In [137]:
from sklearn.impute import SimpleImputer

# Initialize the imputer
imputer = SimpleImputer(strategy='mean')

# Apply the imputer to each numerical column
for col in Numericals:
    df[[col]] = imputer.fit_transform(df[[col]])


In [138]:
for col in Numericals[1:]:
    imputer.fit(df[['Customer_ID',col]])
    df[['Customer_ID',col]] = imputer.transform(df[['Customer_ID',col]])

In [139]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 32 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Customer_ID               100000 non-null  float64
 1   Month                     87500 non-null   object 
 2   Age                       100000 non-null  float64
 3   Occupation                100000 non-null  object 
 4   Annual_Income             100000 non-null  float64
 5   Monthly_Inhand_Salary     100000 non-null  float64
 6   Num_Bank_Accounts         100000 non-null  float64
 7   Num_Credit_Card           100000 non-null  float64
 8   Interest_Rate             100000 non-null  float64
 9   Num_of_Loan               100000 non-null  float64
 10  Delay_from_due_date       100000 non-null  float64
 11  Num_of_Delayed_Payment    100000 non-null  float64
 12  Changed_Credit_Limit      100000 non-null  float64
 13  Num_Credit_Inquiries      100000 non-null  fl

In [140]:
imputer = SimpleImputer(strategy="most_frequent")
imputer.fit(df[['Payment_Behaviour']])
df[['Payment_Behaviour']] = imputer.transform(df[['Payment_Behaviour']])

### Handling Outliers 

In [141]:
## replace Outliers with median 
for col in Numericals :
    outliers_indecies = detect_outliers(df,0,[col])
    median = df[col].median()
    df[col].iloc[outliers_indecies] = median

In [142]:
df.to_csv("processed2.csv")

# Data Preprocessing

### Handling Catogerical 

In [143]:
df.select_dtypes(include="object")

Unnamed: 0,Month,Occupation,Payment_of_Min_Amount,Payment_Behaviour,Credit_Score
0,January,Scientist,No,High_spent_Small_value_payments,Good
1,February,Scientist,No,Low_spent_Large_value_payments,Good
2,March,Scientist,No,Low_spent_Medium_value_payments,Good
3,April,Scientist,No,Low_spent_Small_value_payments,Good
4,May,Scientist,No,High_spent_Medium_value_payments,Good
...,...,...,...,...,...
99995,April,Others,No,High_spent_Large_value_payments,Poor
99996,May,Others,No,High_spent_Medium_value_payments,Poor
99997,June,Others,No,High_spent_Large_value_payments,Poor
99998,July,Others,No,Low_spent_Large_value_payments,Standard


In [144]:
df['Month'].value_counts()

Month
January      12500
April        12500
May          12500
February     12456
July         12437
March        12419
June         12401
October         81
December        63
August          52
November        47
September       44
Name: count, dtype: int64

In [145]:
df['Credit_Score'].value_counts()

Credit_Score
Standard    53174
Poor        28998
Good        17828
Name: count, dtype: int64

In [146]:
m = {
    "Poor":0,
    "Standard":1,
    "Good":2
}

In [147]:
df['Credit_Score'] = df['Credit_Score'].map(m)

In [148]:
df['Credit_Score'].value_counts()

Credit_Score
1    53174
0    28998
2    17828
Name: count, dtype: int64

In [149]:
del df['Customer_ID']

In [150]:
df = pd.get_dummies(df,drop_first=True)

In [151]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 60 columns):
 #   Column                                                Non-Null Count   Dtype  
---  ------                                                --------------   -----  
 0   Age                                                   100000 non-null  float64
 1   Annual_Income                                         100000 non-null  float64
 2   Monthly_Inhand_Salary                                 100000 non-null  float64
 3   Num_Bank_Accounts                                     100000 non-null  float64
 4   Num_Credit_Card                                       100000 non-null  float64
 5   Interest_Rate                                         100000 non-null  float64
 6   Num_of_Loan                                           100000 non-null  float64
 7   Delay_from_due_date                                   100000 non-null  float64
 8   Num_of_Delayed_Payment                       

In [152]:
df.to_csv("final_processed_data.csv")

In [153]:
# df = pd.read_csv("final_processed_data.csv",low_memory=False)
# df = df.iloc[:, 1:]

In [154]:
df.shape

(100000, 60)

### Data Spliting 
- Try Resampling 

In [155]:
# define dataset
X, y = df.drop("Credit_Score",axis=1).values , df["Credit_Score"] 

In [156]:
X_shape = X.shape
y_shape = y.shape

X_shape, y_shape

((100000, 59), (100000,))

## Apply oversampling
- As our data is not fully balanced, we intend to use oversampling to balance it out as much as possible using SMOTE
- SMOTE (Synthetic Minority Over-sampling Technique) is a popular method used to address class imbalance in datasets, particularly in the context of binary classification problems.

In [157]:
y.value_counts(normalize=True)

Credit_Score
1    0.53174
0    0.28998
2    0.17828
Name: proportion, dtype: float64

In [158]:
from imblearn.over_sampling import SMOTE
rus = SMOTE(sampling_strategy='auto')
X_data_rus, y_data_rus = rus.fit_resample(X, y)

In [159]:
y_data_rus.value_counts(normalize=True)

Credit_Score
2    0.333333
1    0.333333
0    0.333333
Name: proportion, dtype: float64

In [160]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_data_rus, y_data_rus, test_size=0.3, random_state=42,stratify=y_data_rus)

### Handling Numerical 
- Using Power transformer to avoid Data Skewness
- The PowerTransformer is a tool in scikit-learn used to apply power transformations to numerical data to stabilize variance and make the data more Gaussian-like (i.e., normally distributed)

In [161]:
scalar = PowerTransformer(method='yeo-johnson', standardize=True).fit(X_train)

In [162]:
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)

In [163]:
print("Number of features used for fitting the scaler:", scalar.n_features_in_)


Number of features used for fitting the scaler: 59


# Modeling and Evaluation


#### Model Building

In [164]:
bagging = BaggingClassifier(n_jobs=-1)
extraTrees = ExtraTreesClassifier(max_depth=10, n_jobs=-1)
randomForest = RandomForestClassifier(n_jobs=-1)
histGradientBoosting = HistGradientBoostingClassifier()
XGB = XGBClassifier(n_jobs=-1)

model = StackingClassifier([
    ('bagging', bagging),
    ('extraTress', extraTrees),
    ('randomforest', randomForest),
    ('histGradientBoosting', histGradientBoosting),
    ('XGB', XGB)
], n_jobs=-1)


#### Model fitting

In [165]:
model.fit(X_train, y_train)

#### model evaluation

In [166]:
print("Train Score: ",model.score(X_train, y_train))

Train Score:  0.9998746249944029


In [167]:
print("Test Score: ",model.score(X_test, y_test))

Test Score:  0.8484652192991621


In [168]:
y_pred = model.predict(X_test)

In [169]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86     15489
           1       0.81      0.79      0.80     16281
           2       0.89      0.88      0.89     16087

    accuracy                           0.85     47857
   macro avg       0.85      0.85      0.85     47857
weighted avg       0.85      0.85      0.85     47857



#### Saving the model

In [170]:
import joblib

# Save the StackingClassifier model
joblib.dump(model, 'stacking_model_2.pkl')

# Save the scaler
joblib.dump(scalar, 'stacking_model_scaler_2.pkl')

['stacking_model_scaler_2.pkl']

### Test with examples ###

In [3]:
import pandas as pd
import joblib
case_1 = {
    'Age': 45,
    'Annual_Income': 120000,
    'Monthly_Inhand_Salary': 10000,
    'Num_Bank_Accounts': 5,
    'Num_Credit_Card': 8,
    'Interest_Rate': 3,
    'Num_of_Loan': 2,
    'Delay_from_due_date': 0,
    'Num_of_Delayed_Payment': 1,
    'Changed_Credit_Limit': 5000,
    'Num_Credit_Inquiries': 2,
    'Credit_Mix': 1,
    'Outstanding_Debt': 10000,
    'Credit_Utilization_Ratio': 20,
    'Credit_History_Age': 20,
    'Total_EMI_per_month': 1500,
    'Amount_invested_monthly': 2000,
    'Monthly_Balance': 5000,
    'Credit-Builder Loan': 0.0,
    'Personal Loan': 1.0,
    'Debt Consolidation Loan': 0.0,
    'Student Loan': 0.0,
    'Payday Loan': 0.0,
    'Mortgage Loan': 1.0,
    'Auto Loan': 1.0,
    'Home Equity Loan': 0.0,
    'Month_August': 0,
    'Month_December': 0,
    'Month_February': 1,
    'Month_January': 0,
    'Month_July': 0,
    'Month_June': 0,
    'Month_March': 0,
    'Month_May': 0,
    'Month_November': 0,
    'Month_October': 0,
    'Month_September': 0,
    'Occupation_Architect': 0,
    'Occupation_Developer': 0,
    'Occupation_Doctor': 0,
    'Occupation_Engineer': 1,
    'Occupation_Entrepreneur': 0,
    'Occupation_Journalist': 0,
    'Occupation_Lawyer': 0,
    'Occupation_Manager': 1,
    'Occupation_Mechanic': 0,
    'Occupation_Media_Manager': 0,
    'Occupation_Musician': 0,
    'Occupation_Others': 0,
    'Occupation_Scientist': 0,
    'Occupation_Teacher': 0,
    'Occupation_Writer': 0,
    'Payment_of_Min_Amount_Yes': 1,
    'Payment_Behaviour_High_spent_Medium_value_payments': 1,
    'Payment_Behaviour_High_spent_Small_value_payments': 0,
    'Payment_Behaviour_Low_spent_Large_value_payments': 0,
    'Payment_Behaviour_Low_spent_Medium_value_payments': 0,
    'Payment_Behaviour_Low_spent_Small_value_payments': 0,
    'Payment_Behaviour_Medium_spent_Medium_value_payments': 0
}

case_poor = {
    'Age': 25,
    'Annual_Income': 20000,
    'Monthly_Inhand_Salary': 1500,
    'Num_Bank_Accounts': 2,
    'Num_Credit_Card': 1,
    'Interest_Rate': 20,
    'Num_of_Loan': 4,
    'Delay_from_due_date': 30,
    'Num_of_Delayed_Payment': 5,
    'Changed_Credit_Limit': 200,
    'Num_Credit_Inquiries': 10,
    'Credit_Mix': 0,
    'Outstanding_Debt': 50000,
    'Credit_Utilization_Ratio': 90,
    'Credit_History_Age': 1,
    'Total_EMI_per_month': 3000,
    'Amount_invested_monthly': 100,
    'Monthly_Balance': -500,
    'Credit-Builder Loan': 0.0,
    'Personal Loan': 1.0,
    'Debt Consolidation Loan': 0.0,
    'Student Loan': 1.0,
    'Payday Loan': 1.0,
    'Mortgage Loan': 0.0,
    'Auto Loan': 0.0,
    'Home Equity Loan': 0.0,
    'Month_August': 0,
    'Month_December': 0,
    'Month_February': 0,
    'Month_January': 1,
    'Month_July': 0,
    'Month_June': 0,
    'Month_March': 0,
    'Month_May': 0,
    'Month_November': 0,
    'Month_October': 0,
    'Month_September': 0,
    'Occupation_Architect': 0,
    'Occupation_Developer': 0,
    'Occupation_Doctor': 0,
    'Occupation_Engineer': 0,
    'Occupation_Entrepreneur': 0,
    'Occupation_Journalist': 0,
    'Occupation_Lawyer': 0,
    'Occupation_Manager': 0,
    'Occupation_Mechanic': 1,
    'Occupation_Media_Manager': 0,
    'Occupation_Musician': 0,
    'Occupation_Others': 0,
    'Occupation_Scientist': 0,
    'Occupation_Teacher': 0,
    'Occupation_Writer': 0,
    'Payment_of_Min_Amount_Yes': 0,
    'Payment_Behaviour_High_spent_Medium_value_payments': 0,
    'Payment_Behaviour_High_spent_Small_value_payments': 1,
    'Payment_Behaviour_Low_spent_Large_value_payments': 0,
    'Payment_Behaviour_Low_spent_Medium_value_payments': 0,
    'Payment_Behaviour_Low_spent_Small_value_payments': 0,
    'Payment_Behaviour_Medium_spent_Medium_value_payments': 0
}


# Convert the case to DataFrame
case_df = pd.DataFrame([case_poor])

# Ensure all expected columns are present
expected_features = [
    'Age', 'Annual_Income', 'Monthly_Inhand_Salary', 'Num_Bank_Accounts',
    'Num_Credit_Card', 'Interest_Rate', 'Num_of_Loan', 'Delay_from_due_date',
    'Num_of_Delayed_Payment', 'Changed_Credit_Limit', 'Num_Credit_Inquiries',
    'Credit_Mix', 'Outstanding_Debt', 'Credit_Utilization_Ratio',
    'Credit_History_Age', 'Total_EMI_per_month', 'Amount_invested_monthly',
    'Monthly_Balance', 'Credit-Builder Loan', 'Personal Loan',
    'Debt Consolidation Loan', 'Student Loan', 'Payday Loan', 'Mortgage Loan',
    'Auto Loan', 'Home Equity Loan', 'Month_August', 'Month_December',
    'Month_February', 'Month_January', 'Month_July', 'Month_June', 'Month_March',
    'Month_May', 'Month_November', 'Month_October', 'Month_September',
    'Occupation_Architect', 'Occupation_Developer', 'Occupation_Doctor',
    'Occupation_Engineer', 'Occupation_Entrepreneur', 'Occupation_Journalist',
    'Occupation_Lawyer', 'Occupation_Manager', 'Occupation_Mechanic',
    'Occupation_Media_Manager', 'Occupation_Musician', 'Occupation_Others',
    'Occupation_Scientist', 'Occupation_Teacher', 'Occupation_Writer',
    'Payment_of_Min_Amount_Yes',
    'Payment_Behaviour_High_spent_Medium_value_payments',
    'Payment_Behaviour_High_spent_Small_value_payments',
    'Payment_Behaviour_Low_spent_Large_value_payments',
    'Payment_Behaviour_Low_spent_Medium_value_payments',
    'Payment_Behaviour_Low_spent_Small_value_payments',
    'Payment_Behaviour_Medium_spent_Medium_value_payments'
]

# Ensure the columns are in the correct order
case_df = case_df[expected_features]

scalar = joblib.load('stacking_model_scaler_2.pkl')
model = joblib.load('stacking_model_2.pkl')
# Transform the input data using the saved scaler
case_transformed = scalar.transform(case_df)

# Make a prediction
prediction = model.predict(case_transformed)
m = {
    0: "Poor",
    1: "Standard",
    2: "Good"
}
print(f"The model predicts: {m[prediction[0]]}")



The model predicts: Standard


In [105]:
len(expected_features)

58

In [None]:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 100000 entries, 0 to 99999
# Data columns (total 60 columns):
#  #   Column                                                Non-Null Count   Dtype  
# ---  ------                                                --------------   -----  
#  0   Age                                                   100000 non-null  float64
#  1   Annual_Income                                         100000 non-null  float64
#  2   Monthly_Inhand_Salary                                 100000 non-null  float64
#  3   Num_Bank_Accounts                                     100000 non-null  float64
#  4   Num_Credit_Card                                       100000 non-null  float64
#  5   Interest_Rate                                         100000 non-null  float64
#  6   Num_of_Loan                                           100000 non-null  float64
#  7   Delay_from_due_date                                   100000 non-null  float64
#  8   Num_of_Delayed_Payment                                100000 non-null  float64
#  9   Changed_Credit_Limit                                  100000 non-null  float64
#  10  Num_Credit_Inquiries                                  100000 non-null  float64
#  11  Credit_Mix                                            100000 non-null  float64
#  12  Outstanding_Debt                                      100000 non-null  float64
#  13  Credit_Utilization_Ratio                              100000 non-null  float64
#  14  Credit_History_Age                                    100000 non-null  float64
#  15  Total_EMI_per_month                                   100000 non-null  float64
#  16  Amount_invested_monthly                               100000 non-null  float64
#  17  Monthly_Balance                                       100000 non-null  float64
#  18  Credit_Score                                          100000 non-null  int64  
#  19  Credit-Builder Loan                                   100000 non-null  float64
#  20  Personal Loan                                         100000 non-null  float64
#  21  Debt Consolidation Loan                               100000 non-null  float64
#  22  Student Loan                                          100000 non-null  float64
#  23  Payday Loan                                           100000 non-null  float64
#  24  Mortgage Loan                                         100000 non-null  float64
#  25  Auto Loan                                             100000 non-null  float64
#  26  Home Equity Loan                                      100000 non-null  float64
#  27  Month_August                                          100000 non-null  bool   
#  28  Month_December                                        100000 non-null  bool   
#  29  Month_February                                        100000 non-null  bool   
#  30  Month_January                                         100000 non-null  bool   
#  31  Month_July                                            100000 non-null  bool   
#  32  Month_June                                            100000 non-null  bool   
#  33  Month_March                                           100000 non-null  bool   
#  34  Month_May                                             100000 non-null  bool   
#  35  Month_November                                        100000 non-null  bool   
#  36  Month_October                                         100000 non-null  bool   
#  37  Month_September                                       100000 non-null  bool   
#  38  Occupation_Architect                                  100000 non-null  bool   
#  39  Occupation_Developer                                  100000 non-null  bool   
#  40  Occupation_Doctor                                     100000 non-null  bool   
#  41  Occupation_Engineer                                   100000 non-null  bool   
#  42  Occupation_Entrepreneur                               100000 non-null  bool   
#  43  Occupation_Journalist                                 100000 non-null  bool   
#  44  Occupation_Lawyer                                     100000 non-null  bool   
#  45  Occupation_Manager                                    100000 non-null  bool   
#  46  Occupation_Mechanic                                   100000 non-null  bool   
#  47  Occupation_Media_Manager                              100000 non-null  bool   
#  48  Occupation_Musician                                   100000 non-null  bool   
#  49  Occupation_Others                                     100000 non-null  bool   
#  50  Occupation_Scientist                                  100000 non-null  bool   
#  51  Occupation_Teacher                                    100000 non-null  bool   
#  52  Occupation_Writer                                     100000 non-null  bool   
#  53  Payment_of_Min_Amount_Yes                             100000 non-null  bool   
#  54  Payment_Behaviour_High_spent_Medium_value_payments    100000 non-null  bool   
#  55  Payment_Behaviour_High_spent_Small_value_payments     100000 non-null  bool   
#  56  Payment_Behaviour_Low_spent_Large_value_payments      100000 non-null  bool   
#  57  Payment_Behaviour_Low_spent_Medium_value_payments     100000 non-null  bool   
#  58  Payment_Behaviour_Low_spent_Small_value_payments      100000 non-null  bool   
#  59  Payment_Behaviour_Medium_spent_Medium_value_payments  100000 non-null  bool   
# dtypes: bool(33), float64(26), int64(1)